vd/vendor/overtrue/pinyin/src/Pinyin.php
2018-12-18 16:46:51 +08:00

310 lines
7.8 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/*
* This file is part of the overtrue/pinyin.
*
* (c) overtrue <i@overtrue.me>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Overtrue\Pinyin;
use InvalidArgumentException;
/*
* Chinese to pinyin translator.
*
* @author overtrue <i@overtrue.me>
* @copyright 2015 overtrue <i@overtrue.me>
*
* @link https://github.com/overtrue/pinyin
* @link http://overtrue.me
*/
define('PINYIN_NONE', 'none');
define('PINYIN_ASCII', 'ascii');
define('PINYIN_UNICODE', 'unicode');
class Pinyin
{
const NONE = 'none';
const ASCII = 'ascii';
const UNICODE = 'unicode';
/**
* Dict loader.
*
* @var \Overtrue\Pinyin\DictLoaderInterface
*/
protected $loader;
/**
* Punctuations map.
*
* @var array
*/
protected $punctuations = array(
'' => ',',
'。' => '.',
'' => '!',
'' => '?',
'' => ':',
'“' => '"',
'”' => '"',
'' => "'",
'' => "'",
);
/**
* Constructor.
*
* @param string $loaderName
*/
public function __construct($loaderName = null)
{
$this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
}
/**
* Convert string to pinyin.
*
* @param string $string
* @param string $option
*
* @return array
*/
public function convert($string, $option = self::NONE)
{
$pinyin = $this->romanize($string);
return $this->splitWords($pinyin, $option);
}
/**
* Convert string (person name) to pinyin.
*
* @param string $stringName
* @param string $option
*
* @return array
*/
public function name($stringName, $option = self::NONE)
{
$pinyin = $this->romanize($stringName, true);
return $this->splitWords($pinyin, $option);
}
/**
* Return a pinyin permalink from string.
*
* @param string $string
* @param string $delimiter
*
* @return string
*/
public function permalink($string, $delimiter = '-')
{
if (!in_array($delimiter, array('_', '-', '.', ''), true)) {
throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
}
return implode($delimiter, $this->convert($string, false));
}
/**
* Return first letters.
*
* @param string $string
* @param string $delimiter
*
* @return string
*/
public function abbr($string, $delimiter = '')
{
return implode($delimiter, array_map(function ($pinyin) {
return $pinyin[0];
}, $this->convert($string, false)));
}
/**
* Chinese phrase to pinyin.
*
* @param string $string
* @param string $delimiter
* @param string $option
*
* @return string
*/
public function phrase($string, $delimiter = ' ', $option = self::NONE)
{
return implode($delimiter, $this->convert($string, $option));
}
/**
* Chinese to pinyin sentense.
*
* @param string $sentence
* @param bool $withTone
*
* @return string
*/
public function sentence($sentence, $withTone = false)
{
$marks = array_keys($this->punctuations);
$punctuationsRegex = preg_quote(implode(array_merge($marks, $this->punctuations)), '/');
$regex = '/[^üāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜa-z0-9'.$punctuationsRegex.'\s_]+/iu';
$pinyin = preg_replace($regex, '', $this->romanize($sentence));
$punctuations = array_merge($this->punctuations, array("\t" => ' ', ' ' => ' '));
$pinyin = trim(str_replace(array_keys($punctuations), $punctuations, $pinyin));
return $withTone ? $pinyin : $this->format($pinyin, false);
}
/**
* Loader setter.
*
* @param \Overtrue\Pinyin\DictLoaderInterface $loader
*
* @return $this
*/
public function setLoader(DictLoaderInterface $loader)
{
$this->loader = $loader;
return $this;
}
/**
* Return dict loader,.
*
* @return \Overtrue\Pinyin\DictLoaderInterface
*/
public function getLoader()
{
if (!($this->loader instanceof DictLoaderInterface)) {
$dataDir = dirname(__DIR__).'/data/';
$loaderName = $this->loader;
$this->loader = new $loaderName($dataDir);
}
return $this->loader;
}
/**
* Preprocess.
*
* @param string $string
*
* @return string
*/
protected function prepare($string)
{
$string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
return "\t".$matches[0];
}, $string);
return preg_replace("~[^\p{Han}\p{P}\p{Z}\p{M}\p{N}\p{L}\t]~u", '', $string);
}
/**
* Convert Chinese to pinyin.
*
* @param string $string
* @param bool $isName
*
* @return string
*/
protected function romanize($string, $isName = false)
{
$string = $this->prepare($string);
$dictLoader = $this->getLoader();
if ($isName) {
$string = $this->convertSurname($string, $dictLoader);
}
$dictLoader->map(function ($dictionary) use (&$string) {
$string = strtr($string, $dictionary);
});
return $string;
}
/**
* Convert Chinese Surname to pinyin.
*
* @param string $string
* @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
*
* @return string
*/
protected function convertSurname($string, $dictLoader)
{
$dictLoader->mapSurname(function ($dictionary) use (&$string) {
foreach ($dictionary as $surname => $pinyin) {
if (strpos($string, $surname) === 0) {
$string = $pinyin.mb_substr($string, mb_strlen($surname, 'UTF-8'), mb_strlen($string, 'UTF-8') - 1, 'UTF-8');
break;
}
}
});
return $string;
}
/**
* Split pinyin string to words.
*
* @param string $pinyin
* @param string $option
*
* @return array
*/
public function splitWords($pinyin, $option)
{
$split = array_filter(preg_split('/[^üāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜa-z\d]+/iu', $pinyin));
if ($option !== self::UNICODE) {
foreach ($split as $index => $pinyin) {
$split[$index] = $this->format($pinyin, $option === self::ASCII);
}
}
return array_values($split);
}
/**
* Format.
*
* @param string $pinyin
* @param bool $tone
*
* @return string
*/
protected function format($pinyin, $tone = false)
{
$replacements = array(
'üē' => array('ue', 1), 'üé' => array('ue', 2), 'üě' => array('ue', 3), 'üè' => array('ue', 4),
'ā' => array('a', 1), 'ē' => array('e', 1), 'ī' => array('i', 1), 'ō' => array('o', 1), 'ū' => array('u', 1), 'ǖ' => array('v', 1),
'á' => array('a', 2), 'é' => array('e', 2), 'í' => array('i', 2), 'ó' => array('o', 2), 'ú' => array('u', 2), 'ǘ' => array('v', 2),
'ǎ' => array('a', 3), 'ě' => array('e', 3), 'ǐ' => array('i', 3), 'ǒ' => array('o', 3), 'ǔ' => array('u', 3), 'ǚ' => array('v', 3),
'à' => array('a', 4), 'è' => array('e', 4), 'ì' => array('i', 4), 'ò' => array('o', 4), 'ù' => array('u', 4), 'ǜ' => array('v', 4),
);
foreach ($replacements as $unicde => $replacement) {
if (false !== strpos($pinyin, $unicde)) {
$pinyin = str_replace($unicde, $replacement[0], $pinyin).($tone ? $replacement[1] : '');
}
}
return $pinyin;
}
}