[ Index ]

PHP Cross Reference of phpBB-3.2.11-deutsch

title

Body

[close]

/vendor/patchwork/utf8/src/Patchwork/PHP/Shim/ -> Normalizer.php (source)

   1  <?php
   2  
   3  /*
   4   * Copyright (C) 2016 Nicolas Grekas - p@tchwork.com
   5   *
   6   * This library is free software; you can redistribute it and/or modify it
   7   * under the terms of the (at your option):
   8   * Apache License v2.0 (http://apache.org/licenses/LICENSE-2.0.txt), or
   9   * GNU General Public License v2.0 (http://gnu.org/licenses/gpl-2.0.txt).
  10   */
  11  
  12  namespace Patchwork\PHP\Shim;
  13  
  14  /**
  15   * Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension.
  16   *
  17   * It has been validated with Unicode 6.3 Normalization Conformance Test.
  18   * See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations.
  19   *
  20   * @internal
  21   */
  22  class Normalizer
  23  {
  24      const NONE = 1;
  25      const FORM_D = 2;
  26      const FORM_KD = 3;
  27      const FORM_C = 4;
  28      const FORM_KC = 5;
  29      const NFD = 2;
  30      const NFKD = 3;
  31      const NFC = 4;
  32      const NFKC = 5;
  33  
  34      private static $C;
  35      private static $D;
  36      private static $KD;
  37      private static $cC;
  38      private static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);
  39      private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
  40  
  41      public static function isNormalized($s, $form = self::NFC)
  42      {
  43          if ($form <= self::NONE || self::NFKC < $form) {
  44              return false;
  45          }
  46          if (!isset($s[strspn($s .= '', self::$ASCII)])) {
  47              return true;
  48          }
  49          if (self::NFC === $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) {
  50              return true;
  51          }
  52  
  53          return false; // Pretend false as quick checks implementented in PHP won't be so quick
  54      }
  55  
  56      public static function normalize($s, $form = self::NFC)
  57      {
  58          if (!preg_match('//u', $s .= '')) {
  59              return false;
  60          }
  61  
  62          switch ($form) {
  63              case self::NONE: return $s;
  64              case self::NFC: $C = true; $K = false; break;
  65              case self::NFD: $C = false; $K = false; break;
  66              case self::NFKC: $C = true; $K = true; break;
  67              case self::NFKD: $C = false; $K = true; break;
  68              default: return false;
  69          }
  70  
  71          if ('' === $s) {
  72              return '';
  73          }
  74  
  75          if ($K && null === self::$KD) {
  76              self::$KD = self::getData('compatibilityDecomposition');
  77          }
  78  
  79          if (null === self::$D) {
  80              self::$D = self::getData('canonicalDecomposition');
  81              self::$cC = self::getData('combiningClass');
  82          }
  83  
  84          if (null !== $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null) {
  85              mb_internal_encoding('8bit');
  86          }
  87  
  88          $r = self::decompose($s, $K);
  89  
  90          if ($C) {
  91              if (null === self::$C) {
  92                  self::$C = self::getData('canonicalComposition');
  93              }
  94  
  95              $r = self::recompose($r);
  96          }
  97          if (null !== $mbEncoding) {
  98              mb_internal_encoding($mbEncoding);
  99          }
 100  
 101          return $r;
 102      }
 103  
 104      private static function recompose($s)
 105      {
 106          $ASCII = self::$ASCII;
 107          $compMap = self::$C;
 108          $combClass = self::$cC;
 109          $ulenMask = self::$ulenMask;
 110  
 111          $result = $tail = '';
 112  
 113          $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"];
 114          $len = strlen($s);
 115  
 116          $lastUchr = substr($s, 0, $i);
 117          $lastUcls = isset($combClass[$lastUchr]) ? 256 : 0;
 118  
 119          while ($i < $len) {
 120              if ($s[$i] < "\x80") {
 121                  // ASCII chars
 122  
 123                  if ($tail) {
 124                      $lastUchr .= $tail;
 125                      $tail = '';
 126                  }
 127  
 128                  if ($j = strspn($s, $ASCII, $i + 1)) {
 129                      $lastUchr .= substr($s, $i, $j);
 130                      $i += $j;
 131                  }
 132  
 133                  $result .= $lastUchr;
 134                  $lastUchr = $s[$i];
 135                  $lastUcls = 0;
 136                  ++$i;
 137                  continue;
 138              }
 139  
 140              $ulen = $ulenMask[$s[$i] & "\xF0"];
 141              $uchr = substr($s, $i, $ulen);
 142  
 143              if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr
 144                  ||   $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr
 145                  || $lastUcls) {
 146                  // Table lookup and combining chars composition
 147  
 148                  $ucls = isset($combClass[$uchr]) ? $combClass[$uchr] : 0;
 149  
 150                  if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) {
 151                      $lastUchr = $compMap[$lastUchr.$uchr];
 152                  } elseif ($lastUcls = $ucls) {
 153                      $tail .= $uchr;
 154                  } else {
 155                      if ($tail) {
 156                          $lastUchr .= $tail;
 157                          $tail = '';
 158                      }
 159  
 160                      $result .= $lastUchr;
 161                      $lastUchr = $uchr;
 162                  }
 163              } else {
 164                  // Hangul chars
 165  
 166                  $L = ord($lastUchr[2]) - 0x80;
 167                  $V = ord($uchr[2]) - 0xA1;
 168                  $T = 0;
 169  
 170                  $uchr = substr($s, $i + $ulen, 3);
 171  
 172                  if ("\xE1\x86\xA7" <= $uchr && $uchr <= "\xE1\x87\x82") {
 173                      $T = ord($uchr[2]) - 0xA7;
 174                      0 > $T && $T += 0x40;
 175                      $ulen += 3;
 176                  }
 177  
 178                  $L = 0xAC00 + ($L * 21 + $V) * 28 + $T;
 179                  $lastUchr = chr(0xE0 | $L >> 12).chr(0x80 | $L >> 6 & 0x3F).chr(0x80 | $L & 0x3F);
 180              }
 181  
 182              $i += $ulen;
 183          }
 184  
 185          return $result.$lastUchr.$tail;
 186      }
 187  
 188      private static function decompose($s, $c)
 189      {
 190          $result = '';
 191  
 192          $ASCII = self::$ASCII;
 193          $decompMap = self::$D;
 194          $combClass = self::$cC;
 195          $ulenMask = self::$ulenMask;
 196          if ($c) {
 197              $compatMap = self::$KD;
 198          }
 199  
 200          $c = array();
 201          $i = 0;
 202          $len = strlen($s);
 203  
 204          while ($i < $len) {
 205              if ($s[$i] < "\x80") {
 206                  // ASCII chars
 207  
 208                  if ($c) {
 209                      ksort($c);
 210                      $result .= implode('', $c);
 211                      $c = array();
 212                  }
 213  
 214                  $j = 1 + strspn($s, $ASCII, $i + 1);
 215                  $result .= substr($s, $i, $j);
 216                  $i += $j;
 217                  continue;
 218              }
 219  
 220              $ulen = $ulenMask[$s[$i] & "\xF0"];
 221              $uchr = substr($s, $i, $ulen);
 222              $i += $ulen;
 223  
 224              if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) {
 225                  // Table lookup
 226  
 227                  if ($uchr !== $j = isset($compatMap[$uchr]) ? $compatMap[$uchr] : (isset($decompMap[$uchr]) ? $decompMap[$uchr] : $uchr)) {
 228                      $uchr = $j;
 229  
 230                      $j = strlen($uchr);
 231                      $ulen = $uchr[0] < "\x80" ? 1 : $ulenMask[$uchr[0] & "\xF0"];
 232  
 233                      if ($ulen != $j) {
 234                          // Put trailing chars in $s
 235  
 236                          $j -= $ulen;
 237                          $i -= $j;
 238  
 239                          if (0 > $i) {
 240                              $s = str_repeat(' ', -$i).$s;
 241                              $len -= $i;
 242                              $i = 0;
 243                          }
 244  
 245                          while ($j--) {
 246                              $s[$i + $j] = $uchr[$ulen + $j];
 247                          }
 248  
 249                          $uchr = substr($uchr, 0, $ulen);
 250                      }
 251                  }
 252                  if (isset($combClass[$uchr])) {
 253                      // Combining chars, for sorting
 254  
 255                      if (!isset($c[$combClass[$uchr]])) {
 256                          $c[$combClass[$uchr]] = '';
 257                      }
 258                      $c[$combClass[$uchr]] .= $uchr;
 259                      continue;
 260                  }
 261              } else {
 262                  // Hangul chars
 263  
 264                  $uchr = unpack('C*', $uchr);
 265                  $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80;
 266  
 267                  $uchr = "\xE1\x84".chr(0x80 + (int) ($j / 588))
 268                         ."\xE1\x85".chr(0xA1 + (int) (($j % 588) / 28));
 269  
 270                  if ($j %= 28) {
 271                      $uchr .= $j < 25
 272                          ? ("\xE1\x86".chr(0xA7 + $j))
 273                          : ("\xE1\x87".chr(0x67 + $j));
 274                  }
 275              }
 276              if ($c) {
 277                  ksort($c);
 278                  $result .= implode('', $c);
 279                  $c = array();
 280              }
 281  
 282              $result .= $uchr;
 283          }
 284  
 285          if ($c) {
 286              ksort($c);
 287              $result .= implode('', $c);
 288          }
 289  
 290          return $result;
 291      }
 292  
 293      private static function getData($file)
 294      {
 295          if (file_exists($file = __DIR__.'/unidata/'.$file.'.ser')) {
 296              return unserialize(file_get_contents($file));
 297          }
 298  
 299          return false;
 300      }
 301  }


Generated: Wed Nov 11 20:33:01 2020 Cross-referenced by PHPXref 0.7.1