drupalcampbristol/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php

<?php

namespace Drupal\Component\Transliteration;

/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If the generic table has no usable
 * entry for the character, or an illegal character (including a malformed
 * UTF-8 byte sequence) is encountered, then a substitute character is
 * returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = array();

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the character code shifted right by
   * eight bits), and whose values are arrays of transliterations indexed by
   * the lowest eight bits of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = array();

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // Two Unicode ranges hold the accented US-ASCII letters, with a few
    // characters that are not accented letters mixed in. The code points below
    // are those exceptions; they are keyed by code point so the check inside
    // the loop is a plain isset(). The table is built once, not per character.
    $excluded = array(
      // Exceptions inside the range 0x00c0 - 0x017e.
      0x00d0 => TRUE, 0x00d7 => TRUE, 0x00f0 => TRUE, 0x00f7 => TRUE,
      0x0138 => TRUE, 0x014a => TRUE, 0x014b => TRUE,
      // Exceptions inside the range 0x01cd - 0x024f.
      0x01dd => TRUE, 0x01f7 => TRUE, 0x021c => TRUE, 0x021d => TRUE,
      0x0220 => TRUE, 0x0221 => TRUE, 0x0241 => TRUE, 0x0242 => TRUE,
      0x0245 => TRUE,
    );

    $result = '';
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);

      // Illegal characters yield -1, which is outside both ranges, so they
      // are copied to the output untouched.
      $in_range = ($code > 0x00bf && $code < 0x017f) || ($code > 0x01cc && $code < 0x0250);
      if ($in_range && !isset($excluded[$code])) {
        // The three-character fallback can never pass the length check below,
        // so a character without a table entry is left as it is.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $character = $to_add;
        }
      }

      $result .= $character;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;

    // Split into Unicode characters and transliterate each one. Bytes that are
    // not part of a well-formed UTF-8 sequence come back as single-byte
    // elements, for which ordUTF8() reports -1.
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. A replacement is
      // either appended completely or not at all.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Splits a string into UTF-8 characters, tolerating malformed input.
   *
   * preg_split() with the /u modifier returns FALSE when the subject is not
   * valid UTF-8. In that case the string is scanned byte-wise instead: every
   * well-formed UTF-8 sequence becomes one element, and every byte that is not
   * part of such a sequence becomes an element of its own.
   *
   * @param string $string
   *   The string to split.
   *
   * @return string[]
   *   The characters of $string, in order. Empty for an empty string.
   */
  private static function splitCharacters($string) {
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters !== FALSE) {
      return $characters;
    }

    // Byte-oriented pattern (no /u): the alternatives before the final "."
    // match well-formed UTF-8 sequences only, excluding overlong forms,
    // surrogates and values above U+10FFFF. The trailing "." (with /s)
    // consumes any other single byte.
    $pattern = '/[\x00-\x7f]'
      . '|[\xc2-\xdf][\x80-\xbf]'
      . '|\xe0[\xa0-\xbf][\x80-\xbf]'
      . '|[\xe1-\xec\xee\xef][\x80-\xbf]{2}'
      . '|\xed[\x80-\x9f][\x80-\xbf]'
      . '|\xf0[\x90-\xbf][\x80-\xbf]{2}'
      . '|[\xf1-\xf3][\x80-\xbf]{3}'
      . '|\xf4[\x80-\x8f][\x80-\xbf]{2}'
      . '|./s';
    preg_match_all($pattern, $string, $matches);

    return isset($matches[0]) ? $matches[0] : array();
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. Empty
   *   input, a truncated multi-byte sequence, and a sequence whose trailing
   *   bytes are not of the form 10xxxxxx are all reported as illegal.
   */
  protected static function ordUTF8($character) {
    $character = (string) $character;
    $size = strlen($character);
    if ($size === 0) {
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) === 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }

    if (($first_byte & 0xe0) === 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      $trailing = 1;
      $code = $first_byte & 0x1f;
    }
    elseif (($first_byte & 0xf0) === 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      $trailing = 2;
      $code = $first_byte & 0x0f;
    }
    elseif (($first_byte & 0xf8) === 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      $trailing = 3;
      $code = $first_byte & 0x07;
    }
    else {
      // Other forms are not legal.
      return -1;
    }

    // The lead byte announces more bytes than the string contains.
    if ($size <= $trailing) {
      return -1;
    }

    // Append the six payload bits of every trailing byte.
    for ($i = 1; $i <= $trailing; $i++) {
      $byte = ord($character[$i]);
      if (($byte & 0xc0) !== 0x80) {
        // Not a continuation byte.
        return -1;
      }
      $code = ($code << 6) | ($byte & 0x3f);
    }

    return $code;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // Load the overrides for this language on first use, then see if there is
    // a language-specific override for this character.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Looks up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters: the bank
    // number is the code without its lowest eight bits, and those eight bits
    // are the index inside the bank.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file does not exist, or does not provide an array for $langcode,
   * an empty table is stored so that the file is not looked for again.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case: only letters and hyphens reach the file system.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // The sanitized file name can differ from $langcode, and a data file may
    // lack the expected key, so check before reading the element. Always store
    // an array: a NULL entry would fail the isset() test in replace() and make
    // the file be included again for every character.
    $table = array();
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $table = $overrides[$langcode];
    }
    $this->languageOverrides[$langcode] = $table;
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest eight
   * bits of the character code, and whose values are the transliterations of
   * these characters into US-ASCII.
   *
   * If the file does not exist, or does not set up an array, an empty table
   * is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by eight bits, or 0 for the range that
   *   contains US-ASCII.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = array();
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}
Drupal 8.0.0 beta 12. More info: https://www.drupal.org/node/2514176 2015-08-17 17:00:26 -07:00			`<?php`

			`namespace Drupal\Component\Transliteration;`

			`/**`
			`* Implements transliteration without using the PECL extensions.`
			`*`
			`* Transliterations are done character-by-character, by looking up non-US-ASCII`
			`* characters in a transliteration database.`
			`*`
			`* The database comes from two types of files, both of which are searched for in`
			`* the PhpTransliteration::$dataDirectory directory. First, language-specific`
			`* overrides are searched (see PhpTransliteration::readLanguageOverrides()). If`
			`* there is no language-specific override for a character, the generic`
			`* transliteration character tables are searched (see`
			`* PhpTransliteration::readGenericData()). If looking up the character in the`
			`* generic table results in a NULL value, or an illegal character is`
			`* encountered, then a substitute character is returned.`
Update to Drupal 8.1.0. For more information, see https://www.drupal.org/drupal-8.1.0-release-notes 2016-04-20 09:56:34 -07:00			`*`
			`* Some parts of this code were derived from the MediaWiki project's UtfNormal`
			`* class, Copyright © 2004 Brion Vibber <brion@pobox.com>,`
			`* http://www.mediawiki.org/`
Drupal 8.0.0 beta 12. More info: https://www.drupal.org/node/2514176 2015-08-17 17:00:26 -07:00			`*/`
			`class PhpTransliteration implements TransliterationInterface {`

			`/**`
			`* Directory where data for transliteration resides.`
			`*`
			`* The constructor sets this (by default) to subdirectory 'data' underneath`
			`* the directory where the class's PHP file resides.`
			`*`
			`* @var string`
			`*/`
			`protected $dataDirectory;`

			`/**`
			`* Associative array of language-specific character transliteration tables.`
			`*`
			`* The outermost array keys are language codes. For each language code key,`
			`* the value is an array whose keys are Unicode character codes, and whose`
			`* values are the transliterations of those characters to US-ASCII. This is`
			`* set up as needed in PhpTransliteration::replace() by calling`
			`* PhpTransliteration::readLanguageOverrides().`
			`*`
			`* @var array`
			`*/`
			`protected $languageOverrides = array();`

			`/**`
			`* Non-language-specific transliteration tables.`
			`*`
			`* Array whose keys are the upper two bytes of the Unicode character, and`
			`* whose values are an array of transliterations for each lower-two bytes`
			`* character code. This is set up as needed in PhpTransliteration::replace()`
			`* by calling PhpTransliteration::readGenericData().`
			`*`
			`* @var array`
			`*/`
			`protected $genericMap = array();`

			`/**`
			`* Constructs a transliteration object.`
			`*`
			`* @param string $data_directory`
			`* (optional) The directory where data files reside. If omitted, defaults`
			`* to subdirectory 'data' underneath the directory where the class's PHP`
			`* file resides.`
			`*/`
			`public function __construct($data_directory = NULL) {`
			`$this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';`
			`}`

			`/**`
			`* {@inheritdoc}`
			`*/`
			`public function removeDiacritics($string) {`
			`$result = '';`

			`foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {`
			`$code = self::ordUTF8($character);`

			`// These two Unicode ranges include the accented US-ASCII letters, with a`
			`// few characters that aren't accented letters mixed in. So define the`
			`// ranges and the excluded characters.`
			`$range1 = $code > 0x00bf && $code < 0x017f;`
			`$exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);`
			`$range2 = $code > 0x01cc && $code < 0x0250;`
			`$exclusions_range2 = array(0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);`

			`$replacement = $character;`
			`if (($range1 && !in_array($code, $exclusions_range1)) \|\| ($range2 && !in_array($code, $exclusions_range2))) {`
			`$to_add = $this->lookupReplacement($code, 'xyz');`
Update to Drupal 8.1.2. For more information, see https://www.drupal.org/project/drupal/releases/8.1.2 2016-06-02 15:56:09 -07:00			`if (strlen($to_add) === 1) {`
Drupal 8.0.0 beta 12. More info: https://www.drupal.org/node/2514176 2015-08-17 17:00:26 -07:00			`$replacement = $to_add;`
			`}`
			`}`

			`$result .= $replacement;`
			`}`

			`return $result;`
			`}`

			`/**`
			`* {@inheritdoc}`
			`*/`
			`public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {`
			`$result = '';`
			`$length = 0;`
			`// Split into Unicode characters and transliterate each one.`
			`foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {`
			`$code = self::ordUTF8($character);`
			`if ($code == -1) {`
			`$to_add = $unknown_character;`
			`}`
			`else {`
			`$to_add = $this->replace($code, $langcode, $unknown_character);`
			`}`

			`// Check if this exceeds the maximum allowed length.`
			`if (isset($max_length)) {`
			`$length += strlen($to_add);`
			`if ($length > $max_length) {`
			`// There is no more space.`
			`return $result;`
			`}`
			`}`

			`$result .= $to_add;`
			`}`

			`return $result;`
			`}`

			`/**`
			`* Finds the character code for a UTF-8 character: like ord() but for UTF-8.`
			`*`
			`* @param string $character`
			`* A single UTF-8 character.`
			`*`
			`* @return int`
			`* The character code, or -1 if an illegal character is found.`
			`*/`
			`protected static function ordUTF8($character) {`
			`$first_byte = ord($character[0]);`

			`if (($first_byte & 0x80) == 0) {`
			`// Single-byte form: 0xxxxxxx.`
			`return $first_byte;`
			`}`
			`if (($first_byte & 0xe0) == 0xc0) {`
			`// Two-byte form: 110xxxxx 10xxxxxx.`
			`return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);`
			`}`
			`if (($first_byte & 0xf0) == 0xe0) {`
			`// Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.`
			`return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);`
			`}`
			`if (($first_byte & 0xf8) == 0xf0) {`
			`// Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.`
			`return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);`
			`}`

			`// Other forms are not legal.`
			`return -1;`
			`}`

			`/**`
			`* Replaces a single Unicode character using the transliteration database.`
			`*`
			`* @param int $code`
			`* The character code of a Unicode character.`
			`* @param string $langcode`
			`* The language code of the language the character is in.`
			`* @param string $unknown_character`
			`* The character to substitute for characters without transliterated`
			`* equivalents.`
			`*`
			`* @return string`
			`* US-ASCII replacement character. If it has a mapping, it is returned;`
			`* otherwise, $unknown_character is returned. The replacement can contain`
			`* multiple characters.`
			`*/`
			`protected function replace($code, $langcode, $unknown_character) {`
			`if ($code < 0x80) {`
			`// Already lower ASCII.`
			`return chr($code);`
			`}`

			`// See if there is a language-specific override for this character.`
			`if (!isset($this->languageOverrides[$langcode])) {`
			`$this->readLanguageOverrides($langcode);`
			`}`
			`if (isset($this->languageOverrides[$langcode][$code])) {`
			`return $this->languageOverrides[$langcode][$code];`
			`}`

			`return $this->lookupReplacement($code, $unknown_character);`
			`}`

			`/**`
			`* Look up the generic replacement for a UTF-8 character code.`
			`*`
			`* @param $code`
			`* The UTF-8 character code.`
			`* @param string $unknown_character`
			`* (optional) The character to substitute for characters without entries in`
			`* the replacement tables.`
			`*`
			`* @return string`
			`* US-ASCII replacement characters. If it has a mapping, it is returned;`
			`* otherwise, $unknown_character is returned. The replacement can contain`
			`* multiple characters.`
			`*/`
			`protected function lookupReplacement($code, $unknown_character = '?') {`
			`// See if there is a generic mapping for this character.`
			`$bank = $code >> 8;`
			`if (!isset($this->genericMap[$bank])) {`
			`$this->readGenericData($bank);`
			`}`
			`$code = $code & 0xff;`
			`return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;`
			`}`

			`/**`
			`* Reads in language overrides for a language code.`
			`*`
			`* The data is read from files named "$langcode.php" in`
			`* PhpTransliteration::$dataDirectory. These files should set up an array`
			`* variable $overrides with an element whose key is $langcode and whose value`
			`* is an array whose keys are character codes, and whose values are their`
			`* transliterations in this language. The character codes can be for any valid`
			`* Unicode character, independent of the number of bytes.`
			`*`
			`* @param $langcode`
			`* Code for the language to read.`
			`*/`
			`protected function readLanguageOverrides($langcode) {`
			`// Figure out the file name to use by sanitizing the language code,`
			`// just in case.`
			`$file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';`

			`// Read in this file, which should set up a variable called $overrides,`
			`// which will be local to this function.`
			`if (is_file($file)) {`
			`include $file;`
			`}`
			`if (!isset($overrides) \|\| !is_array($overrides)) {`
			`$overrides = array($langcode => array());`
			`}`
			`$this->languageOverrides[$langcode] = $overrides[$langcode];`
			`}`

			`/**`
			`* Reads in generic transliteration data for a bank of characters.`
			`*`
			`* The data is read in from a file named "x$bank.php" (with $bank in`
			`* hexadecimal notation) in PhpTransliteration::$dataDirectory. These files`
			`* should set up a variable $base containing an array whose numerical indices`
			`* are the remaining two bytes of the character code, and whose values are the`
			`* transliterations of these characters into US-ASCII. Note that the maximum`
			`* Unicode character that can be encoded in this way is 4 bytes.`
			`*`
			`* @param $bank`
			`* First two bytes of the Unicode character, or 0 for the ASCII range.`
			`*/`
			`protected function readGenericData($bank) {`
			`// Figure out the file name.`
			`$file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';`

			`// Read in this file, which should set up a variable called $base, which`
			`// will be local to this function.`
			`if (is_file($file)) {`
			`include $file;`
			`}`
			`if (!isset($base) \|\| !is_array($base)) {`
			`$base = array();`
			`}`

			`// Save this data.`
			`$this->genericMap[$bank] = $base;`
			`}`
Update to Drupal 8.1.2. For more information, see https://www.drupal.org/project/drupal/releases/8.1.2 2016-06-02 15:56:09 -07:00
Drupal 8.0.0 beta 12. More info: https://www.drupal.org/node/2514176 2015-08-17 17:00:26 -07:00			`}`
No results found.