PHPXRef 0.7.1 : phpBB-3.1.12-deutsch : /includes/utf/utf

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3  *
   4  * This file is part of the phpBB Forum Software package.
   5  *
   6  * @copyright (c) phpBB Limited <https://www.phpbb.com>
   7  * @license GNU General Public License, version 2 (GPL-2.0)
   8  *
   9  * For full copyright and license information, please see
  10  * the docs/CREDITS.txt file.
  11  *
  12  */
  13  
  /**
  * Stop right away unless the IN_PHPBB constant has already been defined
  * by whatever script is loading this file.
  */
  if (defined('IN_PHPBB') === false)
  {
      exit;
  }
  20  
  /**
  * UTF-8 encodings of a few notable Unicode code points
  *
  * Preserved for compatibility
  */
  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");        // U+FFFD
  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");            // U+10FFFF
  define('UTF8_FFFE', "\xEF\xBF\xBE");               // U+FFFE
  define('UTF8_FFFF', "\xEF\xBF\xBF");               // U+FFFF

  // First and last code point of the surrogate range (U+D800..U+DFFF)
  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");
  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");

  // First and last Hangul syllable (U+AC00..U+D7A3)
  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

  // CJK range boundaries: U+4E00..U+9FBB and U+20000..U+2A6D6
  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

  // Values found in the NFC_QC and NFKC_QC quick-check tables
  define('UNICODE_QC_MAYBE', 0);
  define('UNICODE_QC_NO', 1);

  // All 128 ASCII bytes, ordered by how frequently they show up in UTF-8 text
  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

  // Every byte (0x80..0xBF) that can be a tail byte of a multibyte UTF-8 char
  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

  // Parameters of the Hangul [de]composition algorithms.
  // Base code points first ...
  define('UNICODE_HANGUL_SBASE', 0xAC00);
  define('UNICODE_HANGUL_LBASE', 0x1100);
  define('UNICODE_HANGUL_VBASE', 0x1161);
  define('UNICODE_HANGUL_TBASE', 0x11A7);
  // ... then the counts: NCOUNT = VCOUNT * TCOUNT and SCOUNT = LCOUNT * NCOUNT ...
  define('UNICODE_HANGUL_LCOUNT', 19);
  define('UNICODE_HANGUL_VCOUNT', 21);
  define('UNICODE_HANGUL_TCOUNT', 28);
  define('UNICODE_HANGUL_NCOUNT', 588);
  define('UNICODE_HANGUL_SCOUNT', 11172);
  // ... and finally the jamo type codes
  define('UNICODE_JAMO_L', 0);
  define('UNICODE_JAMO_V', 1);
  define('UNICODE_JAMO_T', 2);

  // Drop any global that shares its name with one of the lookup tables. The normalizer
  // decides with isset() whether a table still has to be loaded, so presumably this
  // keeps stale or foreign values from being mistaken for loaded tables.
  unset(
      $GLOBALS['utf_jamo_index'],
      $GLOBALS['utf_jamo_type'],
      $GLOBALS['utf_nfc_qc'],
      $GLOBALS['utf_combining_class'],
      $GLOBALS['utf_canonical_comp'],
      $GLOBALS['utf_canonical_decomp'],
      $GLOBALS['utf_nfkc_qc'],
      $GLOBALS['utf_compatibility_decomp']
  );
  66  
  67  /**
  68  * Unicode normalization routines
  69  */
  70  class utf_normalizer
  71  {
  /**
  * Validate, clean up and normalize a string, in place
  *
  * A string consisting only of printable ASCII, DEL, tab, line feed and
  * carriage return is left exactly as it is. Any other string has its
  * remaining control bytes (0x00..0x1F) swapped for 0xFF and is then passed
  * to recompose() with the NFC quick-check and canonical decomposition
  * tables, i.e. converted to Normal Form C, canonical composition.
  *
  * @param    string    &$str    The dirty string; overwritten with the cleaned-up version
  * @return    void             Nothing is returned, the result is written back to $str
  */
  static function cleanup(&$str)
  {
      // The list below holds every authorized byte, sorted by frequency in latin text
      $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
      $len = strlen($str);

      // Only strings holding at least one byte outside that list need any work
      if ($pos != $len)
      {
          // Each lookup table is loaded on first use, and each one is checked on its own.
          // $phpbb_root_path and $phpEx are bound only when a file really has to be included.
          if (!isset($GLOBALS['utf_nfc_qc']))
          {
              global $phpbb_root_path, $phpEx;
              include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
          }

          if (!isset($GLOBALS['utf_canonical_decomp']))
          {
              global $phpbb_root_path, $phpEx;
              include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
          }

          // Every byte in 0x00..0x1F apart from \t, \n and \r ...
          $control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
          // ... becomes 0xFF, a byte that is illegal in UTF-8 and that recompose()
          // turns into a UTF replacement char
          $illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
          $str = strtr($str, $control_bytes, $illegal_bytes);

          $str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
      }
  }
 116  
 /**
 * Validate a UTF string and normalize it to NFC, in place
 *
 * Pure ASCII input is left untouched. Everything else is passed to
 * recompose() with the NFC quick-check and canonical decomposition tables.
 *
 * @param    string    &$str    Unchecked UTF string; overwritten with the validated, normalized string
 * @return    void             Nothing is returned, the result is written back to $str
 */
 static function nfc(&$str)
 {
     // Length of the leading run of ASCII bytes versus the total byte length
     $pos = strspn($str, UTF8_ASCII_RANGE);
     $len = strlen($str);

     // ASCII-only strings need no processing at all
     if ($pos != $len)
     {
         // Load each table the first time it is needed
         if (!isset($GLOBALS['utf_nfc_qc']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
         }

         if (!isset($GLOBALS['utf_canonical_decomp']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
         }

         $str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
     }
 }
 148  
 /**
 * Validate a UTF string and normalize it to NFKC, in place
 *
 * Pure ASCII input is left untouched. Everything else is passed to
 * recompose() with the NFKC quick-check and compatibility decomposition tables.
 *
 * @param    string    &$str    Unchecked UTF string; overwritten with the validated, normalized string
 * @return    void             Nothing is returned, the result is written back to $str
 */
 static function nfkc(&$str)
 {
     // Length of the leading run of ASCII bytes versus the total byte length
     $pos = strspn($str, UTF8_ASCII_RANGE);
     $len = strlen($str);

     // ASCII-only strings need no processing at all
     if ($pos != $len)
     {
         // Load each table the first time it is needed
         if (!isset($GLOBALS['utf_nfkc_qc']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx;
         }

         if (!isset($GLOBALS['utf_compatibility_decomp']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
         }

         $str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
     }
 }
 180  
 /**
 * Validate a UTF string and normalize it to NFD, in place
 *
 * Pure ASCII input is left untouched. Everything else is passed to
 * decompose() with the canonical decomposition table.
 *
 * @param    string    &$str    Unchecked UTF string; overwritten with the validated, normalized string
 * @return    void             Nothing is returned, the result is written back to $str
 */
 static function nfd(&$str)
 {
     // Length of the leading run of ASCII bytes versus the total byte length
     $pos = strspn($str, UTF8_ASCII_RANGE);
     $len = strlen($str);

     // ASCII-only strings need no processing at all
     if ($pos != $len)
     {
         // Load the table the first time it is needed
         if (!isset($GLOBALS['utf_canonical_decomp']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
         }

         $str = self::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
     }
 }
 206  
 /**
 * Validate a UTF string and normalize it to NFKD, in place
 *
 * Pure ASCII input is left untouched. Everything else is passed to
 * decompose() with the compatibility decomposition table.
 *
 * @param    string    &$str    Unchecked UTF string; overwritten with the validated, normalized string
 * @return    void             Nothing is returned, the result is written back to $str
 */
 static function nfkd(&$str)
 {
     // Length of the leading run of ASCII bytes versus the total byte length
     $pos = strspn($str, UTF8_ASCII_RANGE);
     $len = strlen($str);

     // ASCII-only strings need no processing at all
     if ($pos != $len)
     {
         // Load the table the first time it is needed
         if (!isset($GLOBALS['utf_compatibility_decomp']))
         {
             global $phpbb_root_path, $phpEx;
             include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
         }

         $str = self::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
     }
 }
 232  
 233  
 234      /**
 235      * Recompose a UTF string
 236      *
 237      * @param    string    $str            Unchecked UTF string
 238      * @param    integer    $pos            Position of the first UTF char (in bytes)
 239      * @param    integer    $len            Length of the string (in bytes)
 240      * @param    array    &$qc            Quick-check array, passed by reference but never modified
 241      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
 242      * @return    string                    The string, validated and recomposed
 243      *
 244      * @access    private
 245      */
 246  	static function recompose($str, $pos, $len, &$qc, &$decomp_map)
 247      {
 248          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
 249  
 250          // Load some commonly-used tables
 251          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
 252          {
 253              global $phpbb_root_path, $phpEx;
 254              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 255          }
 256  
 257          // Load the canonical composition table
 258          if (!isset($utf_canonical_comp))
 259          {
 260              global $phpbb_root_path, $phpEx;
 261              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
 262          }
 263  
 264          // Buffer the last ASCII char before the UTF-8 stuff if applicable
 265          $tmp = '';
 266          $i = $tmp_pos = $last_cc = 0;
 267  
 268          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
 269  
 270          // UTF char length array
 271          // This array is used to determine the length of a UTF character.
 272          // Let $c be the result of ($str[$pos] & "\xF0"), where $str is the string we're operating on and $pos
 273          // the position of the cursor. If $utf_len_mask[$c] does not exist, the byte is an ASCII char.
 274          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
 275          // whose length is $utf_len_mask[$c]; if it is equal to 0, the byte is a trailing byte.
 276          $utf_len_mask = array(
 277              // Leading bytes masks
 278              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 279              // Trailing bytes masks
 280              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 281          );
 282  
 283          $extra_check = array(
 284              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 285              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 286              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 287          );
 288  
 289          $utf_validation_mask = array(
 290              2    => "\xE0\xC0",
 291              3    => "\xF0\xC0\xC0",
 292              4    => "\xF8\xC0\xC0\xC0"
 293          );
 294  
 295          $utf_validation_check = array(
 296              2    => "\xC0\x80",
 297              3    => "\xE0\x80\x80",
 298              4    => "\xF0\x80\x80\x80"
 299          );
 300  
 301          // Main loop
 302          do
 303          {
 304              // STEP 0: Capture the current char and buffer it
 305              $c = $str[$pos];
 306              $c_mask = $c & "\xF0";
 307  
 308              if (isset($utf_len_mask[$c_mask]))
 309              {
 310                  // Byte at $pos is either a leading byte or a misplaced trailing byte
 311                  if ($utf_len = $utf_len_mask[$c_mask])
 312                  {
 313                      // Capture the char
 314                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
 315  
 316                      // Let's find out if a thorough check is needed
 317                      if (isset($qc[$utf_char]))
 318                      {
 319                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
 320                      }
 321                      else if (isset($utf_combining_class[$utf_char]))
 322                      {
 323                          if ($utf_combining_class[$utf_char] < $last_cc)
 324                          {
 325                              // A combining character that is NOT canonically ordered
 326                          }
 327                          else
 328                          {
 329                              // A combining character that IS canonically ordered, skip to the next char
 330                              $last_cc = $utf_combining_class[$utf_char];
 331  
 332                              $pos += $utf_len;
 333                              continue;
 334                          }
 335                      }
 336                      else
 337                      {
 338                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
 339                          // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
 340                          $last_cc = 0;
 341  
 342                          // Check that we have the correct number of trailing bytes
 343                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
 344                          {
 345                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
 346                              // has been encoded in a five- or six- byte sequence
 347                              if ($utf_char[0] >= "\xF8")
 348                              {
 349                                  if ($utf_char[0] < "\xFC")
 350                                  {
 351                                      $trailing_bytes = 4;
 352                                  }
 353                                  else if ($utf_char[0] > "\xFD")
 354                                  {
 355                                      $trailing_bytes = 0;
 356                                  }
 357                                  else
 358                                  {
 359                                      $trailing_bytes = 5;
 360                                  }
 361                              }
 362                              else
 363                              {
 364                                  $trailing_bytes = $utf_len - 1;
 365                              }
 366  
 367                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 368                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 369                              $tmp_pos = $pos;
 370  
 371                              continue;
 372                          }
 373  
 374                          if (isset($extra_check[$c]))
 375                          {
 376                              switch ($c)
 377                              {
 378                                  // Note: 0xED is quite common in Korean
 379                                  case "\xED":
 380                                      if ($utf_char >= "\xED\xA0\x80")
 381                                      {
 382                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
 383                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 384                                          $pos += $utf_len;
 385                                          $tmp_pos = $pos;
 386                                          continue 2;
 387                                      }
 388                                  break;
 389  
 390                                  // Note: 0xEF is quite common in Japanese
 391                                  case "\xEF":
 392                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
 393                                      {
 394                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
 395                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 396                                          $pos += $utf_len;
 397                                          $tmp_pos = $pos;
 398                                          continue 2;
 399                                      }
 400                                  break;
 401  
 402                                  case "\xC0":
 403                                  case "\xC1":
 404                                      if ($utf_char <= "\xC1\xBF")
 405                                      {
 406                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
 407                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 408                                          $pos += $utf_len;
 409                                          $tmp_pos = $pos;
 410                                          continue 2;
 411                                      }
 412                                  break;
 413  
 414                                  case "\xE0":
 415                                      if ($utf_char <= "\xE0\x9F\xBF")
 416                                      {
 417                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
 418                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 419                                          $pos += $utf_len;
 420                                          $tmp_pos = $pos;
 421                                          continue 2;
 422                                      }
 423                                  break;
 424  
 425                                  case "\xF0":
 426                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
 427                                      {
 428                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
 429                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 430                                          $pos += $utf_len;
 431                                          $tmp_pos = $pos;
 432                                          continue 2;
 433                                      }
 434                                  break;
 435  
 436                                  default:
 437                                      // Five- and six- byte sequences do not need being checked for here anymore
 438                                      if ($utf_char > UTF8_MAX)
 439                                      {
 440                                          // Out of the Unicode range
 441                                          if ($utf_char[0] < "\xF8")
 442                                          {
 443                                              $trailing_bytes = 3;
 444                                          }
 445                                          else if ($utf_char[0] < "\xFC")
 446                                          {
 447                                              $trailing_bytes = 4;
 448                                          }
 449                                          else if ($utf_char[0] > "\xFD")
 450                                          {
 451                                              $trailing_bytes = 0;
 452                                          }
 453                                          else
 454                                          {
 455                                              $trailing_bytes = 5;
 456                                          }
 457  
 458                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 459                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 460                                          $tmp_pos = $pos;
 461                                          continue 2;
 462                                      }
 463                                  break;
 464                              }
 465                          }
 466  
 467                          // The char is a valid starter, move the cursor and go on
 468                          $pos += $utf_len;
 469                          continue;
 470                      }
 471                  }
 472                  else
 473                  {
 474                      // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
 475                      // each of them was a Unicode replacement char
 476                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
 477                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
 478  
 479                      $pos += $spn;
 480                      $tmp_pos = $pos;
 481                      continue;
 482                  }
 483  
 484                  // STEP 1: Decompose current char
 485  
 486                  // We have found a character that is either:
 487                  //  - in the NFC_QC/NFKC_QC list
 488                  //  - a non-starter char that is not canonically ordered
 489                  //
 490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:
 491                  //
 492                  //  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
 493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
 494                  //
 495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
 496                  // immediately followed by a starter that is not on the QC list
 497                  //
 498                  $utf_seq = array();
 499                  $last_cc = 0;
 500                  $lpos = $pos;
 501                  $pos += $utf_len;
 502  
 503                  if (isset($decomp_map[$utf_char]))
 504                  {
 505                      $_pos = 0;
 506                      $_len = strlen($decomp_map[$utf_char]);
 507  
 508                      do
 509                      {
 510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
 511  
 512                          if (isset($_utf_len))
 513                          {
 514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 515                              $_pos += $_utf_len;
 516                          }
 517                          else
 518                          {
 519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];
 520                              ++$_pos;
 521                          }
 522                      }
 523                      while ($_pos < $_len);
 524                  }
 525                  else
 526                  {
 527                      // The char is not decomposable
 528                      $utf_seq = array($utf_char);
 529                  }
 530  
 531                  // STEP 2: Capture the starter
 532  
 533                  // Check out the combining class of the first character of the UTF sequence
 534                  $k = 0;
 535                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
 536                  {
 537                      // Not a starter, inspect previous characters
 538                      // The last 8 characters are kept in a buffer so that we don't have to capture them every time.
 539                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
 540                      // although it is slower than this method.
 541                      //
 542                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
 543                      // at offset $i) and process them in backward mode until we find a starter.
 544                      //
 545                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
 546                      // characters numbered 0 to n. $k starts at 0 and is decremented for every char we prepend, so prepended chars are numbered -1, -2, ...
 547                      $starter_found = 0;
 548                      $j_min = max(1, $i - 7);
 549  
 550                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
 551                      {
 552                          $utf_char = $buffer[$j & 7];
 553                          $lpos -= strlen($utf_char);
 554  
 555                          if (isset($decomp_map[$utf_char]))
 556                          {
 557                              // The char is a composite, decompose for storage
 558                              $decomp_seq = array();
 559                              $_pos = 0;
 560                              $_len = strlen($decomp_map[$utf_char]);
 561  
 562                              do
 563                              {
 564                                  $c = $decomp_map[$utf_char][$_pos];
 565                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];
 566  
 567                                  if (isset($_utf_len))
 568                                  {
 569                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 570                                      $_pos += $_utf_len;
 571                                  }
 572                                  else
 573                                  {
 574                                      $decomp_seq[] = $c;
 575                                      ++$_pos;
 576                                  }
 577                              }
 578                              while ($_pos < $_len);
 579  
 580                              // Prepend the UTF sequence with our decomposed sequence
 581                              if (isset($decomp_seq[1]))
 582                              {
 583                                  // The char expanded into several chars
 584                                  $decomp_cnt = sizeof($decomp_seq);
 585  
 586                                  foreach ($decomp_seq as $decomp_i => $decomp_char)
 587                                  {
 588                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
 589                                  }
 590                                  $k -= $decomp_cnt;
 591                              }
 592                              else
 593                              {
 594                                  // Decomposed to a single char, easier to prepend
 595                                  $utf_seq[--$k] = $decomp_seq[0];
 596                              }
 597                          }
 598                          else
 599                          {
 600                              $utf_seq[--$k] = $utf_char;
 601                          }
 602  
 603                          if (!isset($utf_combining_class[$utf_seq[$k]]))
 604                          {
 605                              // We have found our starter
 606                              $starter_found = 1;
 607                              break;
 608                          }
 609                      }
 610  
 611                      if (!$starter_found && $lpos > $tmp_pos)
 612                      {
 613                          // The starter was not found in the buffer, let's rewind some more
 614                          do
 615                          {
 616                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
 617                              $c = $str[--$lpos];
 618                              $c_mask = $c & "\xF0";
 619  
 620                              if (isset($utf_len_mask[$c_mask]))
 621                              {
 622                                  // UTF byte
 623                                  if ($utf_len = $utf_len_mask[$c_mask])
 624                                  {
 625                                      // UTF *leading* byte
 626                                      $utf_char = substr($str, $lpos, $utf_len);
 627  
 628                                      if (isset($decomp_map[$utf_char]))
 629                                      {
 630                                          // Decompose the character
 631                                          $decomp_seq = array();
 632                                          $_pos = 0;
 633                                          $_len = strlen($decomp_map[$utf_char]);
 634  
 635                                          do
 636                                          {
 637                                              $c = $decomp_map[$utf_char][$_pos];
 638                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];
 639  
 640                                              if (isset($_utf_len))
 641                                              {
 642                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 643                                                  $_pos += $_utf_len;
 644                                              }
 645                                              else
 646                                              {
 647                                                  $decomp_seq[] = $c;
 648                                                  ++$_pos;
 649                                              }
 650                                          }
 651                                          while ($_pos < $_len);
 652  
 653                                          // Prepend the UTF sequence with our decomposed sequence
 654                                          if (isset($decomp_seq[1]))
 655                                          {
 656                                              // The char expanded into several chars
 657                                              $decomp_cnt = sizeof($decomp_seq);
 658                                              foreach ($decomp_seq as $decomp_i => $utf_char)
 659                                              {
 660                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
 661                                              }
 662                                              $k -= $decomp_cnt;
 663                                          }
 664                                          else
 665                                          {
 666                                              // Decomposed to a single char, easier to prepend
 667                                              $utf_seq[--$k] = $decomp_seq[0];
 668                                          }
 669                                      }
 670                                      else
 671                                      {
 672                                          $utf_seq[--$k] = $utf_char;
 673                                      }
 674                                  }
 675                              }
 676                              else
 677                              {
 678                                  // ASCII char
 679                                  $utf_seq[--$k] = $c;
 680                              }
 681                          }
 682                          while ($lpos > $tmp_pos);
 683                      }
 684                  }
 685  
 686                  // STEP 3: Capture following combining modifiers
 687  
 688                  while ($pos < $len)
 689                  {
 690                      $c_mask = $str[$pos] & "\xF0";
 691  
 692                      if (isset($utf_len_mask[$c_mask]))
 693                      {
 694                          if ($utf_len = $utf_len_mask[$c_mask])
 695                          {
 696                              $utf_char = substr($str, $pos, $utf_len);
 697                          }
 698                          else
 699                          {
 700                              // A trailing byte came out of nowhere
 701                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
 702                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
 703                              break;
 704                          }
 705  
 706                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
 707                          {
 708                              // Combining character, add it to the sequence and move the cursor
 709                              if (isset($decomp_map[$utf_char]))
 710                              {
 711                                  // Decompose the character
 712                                  $_pos = 0;
 713                                  $_len = strlen($decomp_map[$utf_char]);
 714  
 715                                  do
 716                                  {
 717                                      $c = $decomp_map[$utf_char][$_pos];
 718                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];
 719  
 720                                      if (isset($_utf_len))
 721                                      {
 722                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 723                                          $_pos += $_utf_len;
 724                                      }
 725                                      else
 726                                      {
 727                                          $utf_seq[] = $c;
 728                                          ++$_pos;
 729                                      }
 730                                  }
 731                                  while ($_pos < $_len);
 732                              }
 733                              else
 734                              {
 735                                  $utf_seq[] = $utf_char;
 736                              }
 737  
 738                              $pos += $utf_len;
 739                          }
 740                          else
 741                          {
 742                              // Combining class 0 and no QC, break out of the loop
 743                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
 744                              break;
 745                          }
 746                      }
 747                      else
 748                      {
 749                          // ASCII chars are starters
 750                          break;
 751                      }
 752                  }
 753  
 754                  // STEP 4: Sort and combine
 755  
 756                  // Here we sort...
 757                  $k_max = $k + sizeof($utf_seq);
 758  
 759                  if (!$k && $k_max == 1)
 760                  {
 761                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
 762                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
 763  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
 764  //                        {
 765                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
 766                          $tmp_pos = $pos;
 767  //                        }
 768  
 769                      continue;
 770                  }
 771  
 772                  // ...there we combine
 773                  if (isset($utf_combining_class[$utf_seq[$k]]))
 774                  {
 775                      $starter = $nf_seq = '';
 776                  }
 777                  else
 778                  {
 779                      $starter = $utf_seq[$k++];
 780                      $nf_seq = '';
 781                  }
 782                  $utf_sort = array();
 783  
 784                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
 785                  // at the end of the string without altering it
 786                  $utf_seq[] = '';
 787  
 788                  do
 789                  {
 790                      $utf_char = $utf_seq[$k++];
 791  
 792                      if (isset($utf_combining_class[$utf_char]))
 793                      {
 794                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
 795                      }
 796                      else
 797                      {
 798                          if (empty($utf_sort))
 799                          {
 800                              // No combining characters... check for a composite of the two starters
 801                              if (isset($utf_canonical_comp[$starter . $utf_char]))
 802                              {
 803                                  // Good ol' composite character
 804                                  $starter = $utf_canonical_comp[$starter . $utf_char];
 805                              }
 806                              else if (isset($utf_jamo_type[$utf_char]))
 807                              {
 808                                  // Current char is a composable jamo
 809                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
 810                                  {
 811                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
 812                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
 813                                      {
 814                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
 815                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
 816                                          ++$k;
 817                                      }
 818                                      else
 819                                      {
 820                                          // L+V jamos, combine to a LV Hangul syllable
 821                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
 822                                      }
 823  
 824                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 825                                  }
 826                                  else
 827                                  {
 828                                      // Non-composable jamo, just add it to the sequence
 829                                      $nf_seq .= $starter;
 830                                      $starter = $utf_char;
 831                                  }
 832                              }
 833                              else
 834                              {
 835                                  // No composite, just add the first starter to the sequence then continue with the other one
 836                                  $nf_seq .= $starter;
 837                                  $starter = $utf_char;
 838                              }
 839                          }
 840                          else
 841                          {
 842                              ksort($utf_sort);
 843  
 844                              // For each class of combining characters
 845                              foreach ($utf_sort as $cc => $utf_chars)
 846                              {
 847                                  $j = 0;
 848  
 849                                  do
 850                                  {
 851                                      // Look for a composite
 852                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
 853                                      {
 854                                          // Found a composite, replace the starter
 855                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
 856                                          unset($utf_sort[$cc][$j]);
 857                                      }
 858                                      else
 859                                      {
 860                                          // No composite, all following characters in that class are blocked
 861                                          break;
 862                                      }
 863                                  }
 864                                  while (isset($utf_sort[$cc][++$j]));
 865                              }
 866  
 867                              // Add the starter to the normalized sequence, followed by non-starters in canonical order
 868                              $nf_seq .= $starter;
 869  
 870                              foreach ($utf_sort as $utf_chars)
 871                              {
 872                                  if (!empty($utf_chars))
 873                                  {
 874                                      $nf_seq .= implode('', $utf_chars);
 875                                  }
 876                              }
 877  
 878                              // Reset the array and go on
 879                              $utf_sort = array();
 880                              $starter = $utf_char;
 881                          }
 882                      }
 883                  }
 884                  while ($k <= $k_max);
 885  
 886                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
 887                  $tmp_pos = $pos;
 888              }
 889              else
 890              {
 891                  // Only an ASCII char can make the program get here
 892                  //
 893                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
 894                  //
 895                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
 896                  // multi-byte text (where the only ASCII chars are spaces and punctuation)
 897                  if (++$pos != $len)
 898                  {
 899                      if ($str[$pos] < "\x80")
 900                      {
 901                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
 902                          $buffer[++$i & 7] = $str[$pos - 1];
 903                      }
 904                      else
 905                      {
 906                          $buffer[++$i & 7] = $c;
 907                      }
 908                  }
 909              }
 910          }
 911          while ($pos < $len);
 912  
 913          // Now is time to return the string
 914          if ($tmp_pos)
 915          {
 916              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
 917              if ($tmp_pos == $len)
 918              {
 919                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
 920                  return $tmp;
 921              }
 922              else
 923              {
 924                  // The rightmost chunk of $str has not been appended to $tmp yet
 925                  return $tmp . substr($str, $tmp_pos);
 926              }
 927          }
 928  
 929          // The string was already in normal form
 930          return $str;
 931      }
 932  
 933      /**
 934      * Decompose a UTF string
 935      *
 936      * @param    string    $str            UTF string
 937      * @param    integer    $pos            Position of the first UTF char (in bytes)
 938      * @param    integer    $len            Length of the string (in bytes)
 939      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
 940      * @return    string                    The string, decomposed and sorted canonically
 941      *
 942      * @access    private
 943      */
 944  	static function decompose($str, $pos, $len, &$decomp_map)
 945      {
 946          global $utf_combining_class;
 947  
 948          // Load some commonly-used tables
 949          if (!isset($utf_combining_class))
 950          {
 951              global $phpbb_root_path, $phpEx;
 952              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 953          }
 954  
 955          // UTF char length array
 956          $utf_len_mask = array(
 957              // Leading bytes masks
 958              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 959              // Trailing bytes masks
 960              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 961          );
 962  
 963          // Some extra checks are triggered on the first byte of a UTF sequence
 964          $extra_check = array(
 965              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 966              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 967              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 968          );
 969  
 970          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
 971          //   - 2-byte: 110? ???? 10?? ????
 972          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
 973          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
 974          // Note that 5- and 6- byte sequences are automatically discarded
 975          $utf_validation_mask = array(
 976              2    => "\xE0\xC0",
 977              3    => "\xF0\xC0\xC0",
 978              4    => "\xF8\xC0\xC0\xC0"
 979          );
 980  
 981          $utf_validation_check = array(
 982              2    => "\xC0\x80",
 983              3    => "\xE0\x80\x80",
 984              4    => "\xF0\x80\x80\x80"
 985          );
 986  
 987          $tmp = '';
 988          $starter_pos = $pos;
 989          $tmp_pos = $last_cc = $sort = $dump = 0;
 990          $utf_sort = array();
 991  
 992          // Main loop
 993          do
 994          {
 995              // STEP 0: Capture the current char
 996  
 997              $cur_mask = $str[$pos] & "\xF0";
 998              if (isset($utf_len_mask[$cur_mask]))
 999              {
1000                  if ($utf_len = $utf_len_mask[$cur_mask])
1001                  {
1002                      // Multibyte char
1003                      $utf_char = substr($str, $pos, $utf_len);
1004                      $pos += $utf_len;
1005                  }
1006                  else
1007                  {
1008                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1009                      // replacement char and we will advance the cursor
1010                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1011  
1012                      if ($dump)
1013                      {
1014                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1015  
1016                          // Dump combiners
1017                          if (!empty($utf_sort))
1018                          {
1019                              if ($sort)
1020                              {
1021                                  ksort($utf_sort);
1022                              }
1023  
1024                              foreach ($utf_sort as $utf_chars)
1025                              {
1026                                  $tmp .= implode('', $utf_chars);
1027                              }
1028                          }
1029  
1030                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1031                          $dump = $sort = 0;
1032                      }
1033                      else
1034                      {
1035                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1036                      }
1037  
1038                      $pos += $spn;
1039                      $tmp_pos = $starter_pos = $pos;
1040  
1041                      $utf_sort = array();
1042                      $last_cc = 0;
1043  
1044                      continue;
1045                  }
1046  
1047                  // STEP 1: Decide what to do with current char
1048  
1049                  // Now, in that order:
1050                  //  - check if that character is decomposable
1051                  //  - check if that character is a non-starter
1052                  //  - check if that character requires extra checks to be performed
1053                  if (isset($decomp_map[$utf_char]))
1054                  {
1055                      // Decompose the char
1056                      $_pos = 0;
1057                      $_len = strlen($decomp_map[$utf_char]);
1058  
1059                      do
1060                      {
1061                          $c = $decomp_map[$utf_char][$_pos];
1062                          $_utf_len =& $utf_len_mask[$c & "\xF0"];
1063  
1064                          if (isset($_utf_len))
1065                          {
1066                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1067                              $_pos += $_utf_len;
1068  
1069                              if (isset($utf_combining_class[$_utf_char]))
1070                              {
1071                                  // The character decomposed to a non-starter, buffer it for sorting
1072                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1073  
1074                                  if ($utf_combining_class[$_utf_char] < $last_cc)
1075                                  {
1076                                      // Not canonically ordered, will require sorting
1077                                      $sort = $dump = 1;
1078                                  }
1079                                  else
1080                                  {
1081                                      $dump = 1;
1082                                      $last_cc = $utf_combining_class[$_utf_char];
1083                                  }
1084                              }
1085                              else
1086                              {
1087                                  // This character decomposition contains a starter, dump the buffer and continue
1088                                  if ($dump)
1089                                  {
1090                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1091  
1092                                      // Dump combiners
1093                                      if (!empty($utf_sort))
1094                                      {
1095                                          if ($sort)
1096                                          {
1097                                              ksort($utf_sort);
1098                                          }
1099  
1100                                          foreach ($utf_sort as $utf_chars)
1101                                          {
1102                                              $tmp .= implode('', $utf_chars);
1103                                          }
1104                                      }
1105  
1106                                      $tmp .= $_utf_char;
1107                                      $dump = $sort = 0;
1108                                  }
1109                                  else
1110                                  {
1111                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1112                                  }
1113  
1114                                  $tmp_pos = $starter_pos = $pos;
1115                                  $utf_sort = array();
1116                                  $last_cc = 0;
1117                              }
1118                          }
1119                          else
1120                          {
1121                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1122                              ++$_pos;
1123  
1124                              if ($dump)
1125                              {
1126                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1127  
1128                                  // Dump combiners
1129                                  if (!empty($utf_sort))
1130                                  {
1131                                      if ($sort)
1132                                      {
1133                                          ksort($utf_sort);
1134                                      }
1135  
1136                                      foreach ($utf_sort as $utf_chars)
1137                                      {
1138                                          $tmp .= implode('', $utf_chars);
1139                                      }
1140                                  }
1141  
1142                                  $tmp .= $c;
1143                                  $dump = $sort = 0;
1144                              }
1145                              else
1146                              {
1147                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1148                              }
1149  
1150                              $tmp_pos = $starter_pos = $pos;
1151                              $utf_sort = array();
1152                              $last_cc = 0;
1153                          }
1154                      }
1155                      while ($_pos < $_len);
1156                  }
1157                  else if (isset($utf_combining_class[$utf_char]))
1158                  {
1159                      // Combining character
1160                      if ($utf_combining_class[$utf_char] < $last_cc)
1161                      {
1162                          // Not in canonical order
1163                          $sort = $dump = 1;
1164                      }
1165                      else
1166                      {
1167                          $last_cc = $utf_combining_class[$utf_char];
1168                      }
1169  
1170                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1171                  }
1172                  else
1173                  {
1174                      // Non-decomposable starter, check out if it's a Hangul syllable
1175                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1176                      {
1177                          // Nope, regular UTF char, check that we have the correct number of trailing bytes
1178                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1179                          {
1180                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1181                              // has been encoded in a five- or six- byte sequence.
1182                              // Move the cursor back to its original position then advance it to the position it should really be at
1183                              $pos -= $utf_len;
1184                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1185  
1186                              if (!empty($utf_sort))
1187                              {
1188                                  ksort($utf_sort);
1189  
1190                                  foreach ($utf_sort as $utf_chars)
1191                                  {
1192                                      $tmp .= implode('', $utf_chars);
1193                                  }
1194                                  $utf_sort = array();
1195                              }
1196  
1197                              // Add a replacement char then another replacement char for every trailing byte.
1198                              //
1199                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1200                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1201                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1202  
1203                              $dump = $sort = 0;
1204  
1205                              $pos += $spn;
1206                              $tmp_pos = $pos;
1207                              continue;
1208                          }
1209  
1210                          if (isset($extra_check[$utf_char[0]]))
1211                          {
1212                              switch ($utf_char[0])
1213                              {
1214                                  // Note: 0xED is quite common in Korean
1215                                  case "\xED":
1216                                      if ($utf_char >= "\xED\xA0\x80")
1217                                      {
1218                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1219                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1220  
1221                                          if (!empty($utf_sort))
1222                                          {
1223                                              ksort($utf_sort);
1224  
1225                                              foreach ($utf_sort as $utf_chars)
1226                                              {
1227                                                  $tmp .= implode('', $utf_chars);
1228                                              }
1229                                              $utf_sort = array();
1230                                          }
1231  
1232                                          $tmp .= UTF8_REPLACEMENT;
1233                                          $dump = $sort = 0;
1234  
1235                                          $tmp_pos = $starter_pos = $pos;
1236                                          continue 2;
1237                                      }
1238                                  break;
1239  
1240                                  // Note: 0xEF is quite common in Japanese
1241                                  case "\xEF":
1242                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1243                                      {
1244                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1245                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1246  
1247                                          if (!empty($utf_sort))
1248                                          {
1249                                              ksort($utf_sort);
1250  
1251                                              foreach ($utf_sort as $utf_chars)
1252                                              {
1253                                                  $tmp .= implode('', $utf_chars);
1254                                              }
1255                                              $utf_sort = array();
1256                                          }
1257  
1258                                          $tmp .= UTF8_REPLACEMENT;
1259                                          $dump = $sort = 0;
1260  
1261                                          $tmp_pos = $starter_pos = $pos;
1262                                          continue 2;
1263                                      }
1264                                  break;
1265  
1266                                  case "\xC0":
1267                                  case "\xC1":
1268                                      if ($utf_char <= "\xC1\xBF")
1269                                      {
1270                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1271                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1272  
1273                                          if (!empty($utf_sort))
1274                                          {
1275                                              ksort($utf_sort);
1276  
1277                                              foreach ($utf_sort as $utf_chars)
1278                                              {
1279                                                  $tmp .= implode('', $utf_chars);
1280                                              }
1281                                              $utf_sort = array();
1282                                          }
1283  
1284                                          $tmp .= UTF8_REPLACEMENT;
1285                                          $dump = $sort = 0;
1286  
1287                                          $tmp_pos = $starter_pos = $pos;
1288                                          continue 2;
1289                                      }
1290                                  break;
1291  
1292                                  case "\xE0":
1293                                      if ($utf_char <= "\xE0\x9F\xBF")
1294                                      {
1295                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
1296                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1297  
1298                                          if (!empty($utf_sort))
1299                                          {
1300                                              ksort($utf_sort);
1301  
1302                                              foreach ($utf_sort as $utf_chars)
1303                                              {
1304                                                  $tmp .= implode('', $utf_chars);
1305                                              }
1306                                              $utf_sort = array();
1307                                          }
1308  
1309                                          $tmp .= UTF8_REPLACEMENT;
1310                                          $dump = $sort = 0;
1311  
1312                                          $tmp_pos = $starter_pos = $pos;
1313                                          continue 2;
1314                                      }
1315                                  break;
1316  
1317                                  case "\xF0":
1318                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
1319                                      {
1320                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
1321                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1322  
1323                                          if (!empty($utf_sort))
1324                                          {
1325                                              ksort($utf_sort);
1326  
1327                                              foreach ($utf_sort as $utf_chars)
1328                                              {
1329                                                  $tmp .= implode('', $utf_chars);
1330                                              }
1331                                              $utf_sort = array();
1332                                          }
1333  
1334                                          $tmp .= UTF8_REPLACEMENT;
1335                                          $dump = $sort = 0;
1336  
1337                                          $tmp_pos = $starter_pos = $pos;
1338                                          continue 2;
1339                                      }
1340                                  break;
1341  
1342                                  default:
1343                                      if ($utf_char > UTF8_MAX)
1344                                      {
1345                                          // Out of the Unicode range
1346                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1347  
1348                                          if (!empty($utf_sort))
1349                                          {
1350                                              ksort($utf_sort);
1351  
1352                                              foreach ($utf_sort as $utf_chars)
1353                                              {
1354                                                  $tmp .= implode('', $utf_chars);
1355                                              }
1356                                              $utf_sort = array();
1357                                          }
1358  
1359                                          $tmp .= UTF8_REPLACEMENT;
1360                                          $dump = $sort = 0;
1361  
1362                                          $tmp_pos = $starter_pos = $pos;
1363                                          continue 2;
1364                                      }
1365                                  break;
1366                              }
1367                          }
1368                      }
1369                      else
1370                      {
1371                          // Hangul syllable
1372                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1373  
1374                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1375                          //
1376                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1377                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1378                          {
1379                              if ($t_index < 25)
1380                              {
1381                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1382                                  $utf_char[8] = chr(0xA7 + $t_index);
1383                              }
1384                              else
1385                              {
1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1387                                  $utf_char[8] = chr(0x67 + $t_index);
1388                              }
1389                          }
1390                          else
1391                          {
1392                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1393                          }
1394  
1395                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1396                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1397  
1398                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1399                          $dump = 1;
1400                      }
1401  
1402                      // Do we need to dump stuff to the tmp string?
1403                      if ($dump)
1404                      {
1405                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1406  
1407                          // Dump combiners
1408                          if (!empty($utf_sort))
1409                          {
1410                              if ($sort)
1411                              {
1412                                  ksort($utf_sort);
1413                              }
1414  
1415                              foreach ($utf_sort as $utf_chars)
1416                              {
1417                                  $tmp .= implode('', $utf_chars);
1418                              }
1419                          }
1420  
1421                          $tmp .= $utf_char;
1422                          $dump = $sort = 0;
1423                          $tmp_pos = $pos;
1424                      }
1425  
1426                      $last_cc = 0;
1427                      $utf_sort = array();
1428                      $starter_pos = $pos;
1429                  }
1430              }
1431              else
1432              {
1433                  // ASCII char, which is always a starter (like every other ASCII char)
1434                  if ($dump)
1435                  {
1436                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1437  
1438                      // Dump combiners
1439                      if (!empty($utf_sort))
1440                      {
1441                          if ($sort)
1442                          {
1443                              ksort($utf_sort);
1444                          }
1445  
1446                          foreach ($utf_sort as $utf_chars)
1447                          {
1448                              $tmp .= implode('', $utf_chars);
1449                          }
1450                      }
1451  
1452                      $tmp .= $str[$pos];
1453                      $dump = $sort = 0;
1454                      $tmp_pos = ++$pos;
1455  
1456                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1457                  }
1458                  else
1459                  {
1460                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1461                  }
1462  
1463                  $last_cc = 0;
1464                  $utf_sort = array();
1465                  $starter_pos = $pos;
1466              }
1467          }
1468          while ($pos < $len);
1469  
1470          // Now it is time to return the string
1471          if ($dump)
1472          {
1473              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1474  
1475              // Dump combiners
1476              if (!empty($utf_sort))
1477              {
1478                  if ($sort)
1479                  {
1480                      ksort($utf_sort);
1481                  }
1482  
1483                  foreach ($utf_sort as $utf_chars)
1484                  {
1485                      $tmp .= implode('', $utf_chars);
1486                  }
1487              }
1488  
1489              return $tmp;
1490          }
1491          else if ($tmp_pos)
1492          {
1493              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1494              if ($tmp_pos == $len)
1495              {
1496                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1497                  return $tmp;
1498              }
1499              else
1500              {
1501                  // The rightmost chunk of $str has not been appended to $tmp yet
1502                  return $tmp . substr($str, $tmp_pos);
1503              }
1504          }
1505  
1506          // The string was already in normal form
1507          return $str;
1508      }
1509  }
PHP Cross Reference of phpBB-3.1.12-deutsch

/includes/utf/ -> utf_normalizer.php (source)