<?php

namespace Drupal\Component\Transliteration;
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = array();

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "banks": the Unicode character code shifted right by
   * 8 bits. Each value is an array of transliterations indexed by the lowest
   * byte of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = array();

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = isset($data_directory) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // Two Unicode ranges hold the accented US-ASCII letters, but each also
    // contains a few characters that are not accented letters. Those are
    // listed here once, outside the loop, because they never change.
    $exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);
    $exclusions_range2 = array(0x01dd, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);

    $result = '';
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);
      $in_range1 = $code > 0x00bf && $code < 0x017f && !in_array($code, $exclusions_range1, TRUE);
      $in_range2 = $code > 0x01cc && $code < 0x0250 && !in_array($code, $exclusions_range2, TRUE);

      // By default the character is kept untouched. This includes illegal
      // byte sequences, for which ordUTF8() returns -1.
      $replacement = $character;
      if ($in_range1 || $in_range2) {
        // The multi-character placeholder 'xyz' guarantees that a missing
        // mapping fails the single-character test below.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }
      $result .= $replacement;
    }
    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;

    // Split into Unicode characters and transliterate each one.
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        // Illegal byte sequence: substitute rather than drop.
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space; never emit a partial replacement.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Splits a string into UTF-8 characters, tolerating invalid byte sequences.
   *
   * preg_split() with the /u modifier returns FALSE when the subject is not
   * valid UTF-8. In that case the string is split with a byte-oriented
   * pattern instead: well-formed multi-byte sequences stay together and every
   * other byte becomes an element of its own, so that callers can detect it
   * through PhpTransliteration::ordUTF8() returning -1.
   *
   * @param string $string
   *   The string to split.
   *
   * @return string[]
   *   The pieces of the string, in order. Empty for an empty string.
   */
  protected static function splitCharacters($string) {
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters !== FALSE) {
      return $characters;
    }

    // Byte mode (no /u): try well-formed sequences first, then fall back to a
    // single arbitrary byte. The /s modifier lets "." match newlines too.
    $pattern = '/[\x00-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf4][\x80-\xbf]{3}|./s';
    $matches = array();
    preg_match_all($pattern, $string, $matches);
    return isset($matches[0]) ? $matches[0] : array();
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, an invalid lead byte, a truncated sequence and a malformed
   *   continuation byte are all treated as illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      return -1;
    }

    $first_byte = ord($character[0]);
    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }

    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      $code = $first_byte & 0x1f;
      $continuation_bytes = 1;
    }
    elseif (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      $code = $first_byte & 0x0f;
      $continuation_bytes = 2;
    }
    elseif (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      $code = $first_byte & 0x07;
      $continuation_bytes = 3;
    }
    else {
      // Other forms are not legal.
      return -1;
    }

    // A truncated sequence must not be read past its end.
    if ($available <= $continuation_bytes) {
      return -1;
    }

    for ($i = 1; $i <= $continuation_bytes; $i++) {
      $byte = ord($character[$i]);
      if (($byte & 0xc0) != 0x80) {
        // Not of the form 10xxxxxx.
        return -1;
      }
      // Each continuation byte contributes its low six bits.
      $code = ($code << 6) + ($byte & 0x3f);
    }
    return $code;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // Load the language-specific table the first time the language is seen.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }

    // A language-specific override wins over the generic tables.
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Looks up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters; the bank is
    // the code without its lowest byte.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }

    // Within a bank, entries are indexed by the lowest byte of the code.
    $index = $code & 0xff;
    if (isset($this->genericMap[$bank][$index])) {
      return $this->genericMap[$bank][$index];
    }
    return $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not define an array under
   * $overrides[$langcode], an empty table is stored so that the file is not
   * read again for this language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // Only accept a well-formed table. Storing NULL would defeat the isset()
    // check in replace() and cause the file to be re-read for each character.
    $table = array();
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $table = $overrides[$langcode];
    }
    $this->languageOverrides[$langcode] = $table;
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest byte of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file is missing or does not define an array $base, an empty table
   * is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = array();
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}