[ Index ]

PHP Cross Reference of phpBB-3.3.14-deutsch

title

Body

[close]

/vendor/s9e/text-formatter/src/Configurator/Helpers/ -> RegexpParser.php (source)

   1  <?php
   2  
   3  /**
   4  * @package   s9e\TextFormatter
   5  * @copyright Copyright (c) 2010-2022 The s9e authors
   6  * @license   http://www.opensource.org/licenses/mit-license.php The MIT License
   7  */
   8  namespace s9e\TextFormatter\Configurator\Helpers;
   9  
  10  use RuntimeException;
  11  
  12  abstract class RegexpParser
  13  {
  14      /**
  15      * Generate a regexp that matches any single character allowed in a regexp
  16      *
  17      * This method will generate a regexp that can be used to determine whether a given character
  18      * could in theory be allowed in a string that matches the source regexp. For example, the source
  19      * regexp /^a+$/D would generate /a/ while /^foo\d+$/D would generate /[fo\d]/ whereas the regexp
  20      * /foo/ would generate // because it's not anchored so any characters could be found before or
  21      * after the literal "foo".
  22      *
  23      * @param  string $regexp Source regexp
  24      * @return string         Regexp that matches any single character allowed in the source regexp
  25      */
  26  	public static function getAllowedCharacterRegexp($regexp)
  27      {
  28          $def = self::parse($regexp);
  29  
  30          // If the regexp is uses the multiline modifier, this regexp can't match the whole string if
  31          // it contains newlines, so in effect it could allow any content
  32          if (strpos($def['modifiers'], 'm') !== false)
  33          {
  34              return '//';
  35          }
  36  
  37          if (substr($def['regexp'], 0, 1) !== '^'
  38           || substr($def['regexp'], -1)   !== '$')
  39          {
  40              return '//';
  41          }
  42  
  43          // Append a token to mark the end of the regexp
  44          $def['tokens'][] = [
  45              'pos'  => strlen($def['regexp']),
  46              'len'  => 0,
  47              'type' => 'end'
  48          ];
  49  
  50          $patterns = [];
  51  
  52          // Collect the literal portions of the source regexp while testing for alternations
  53          $literal = '';
  54          $pos     = 0;
  55          $skipPos = 0;
  56          $depth   = 0;
  57          foreach ($def['tokens'] as $token)
  58          {
  59              // Skip options
  60              if ($token['type'] === 'option')
  61              {
  62                  $skipPos = max($skipPos, $token['pos'] + $token['len']);
  63              }
  64  
  65              // Skip assertions
  66              if (strpos($token['type'], 'AssertionStart') !== false)
  67              {
  68                  $endToken = $def['tokens'][$token['endToken']];
  69                  $skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
  70              }
  71  
  72              if ($token['pos'] >= $skipPos)
  73              {
  74                  if ($token['type'] === 'characterClass')
  75                  {
  76                      $patterns[] = '[' . $token['content'] . ']';
  77                  }
  78  
  79                  if ($token['pos'] > $pos)
  80                  {
  81                      // Capture the content between last position and current position
  82                      $tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);
  83  
  84                      // Append the content to the literal portion
  85                      $literal .= $tmp;
  86  
  87                      // Test for alternations if it's the root of the regexp
  88                      if (!$depth)
  89                      {
  90                          // Remove literal backslashes for convenience
  91                          $tmp = str_replace('\\\\', '', $tmp);
  92  
  93                          // Look for an unescaped | that is not followed by ^
  94                          if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
  95                          {
  96                              return '//';
  97                          }
  98  
  99                          // Look for an unescaped | that is not preceded by $
 100                          if (preg_match('/(?<![$\\\\])\\|/', $tmp))
 101                          {
 102                              return '//';
 103                          }
 104                      }
 105                  }
 106              }
 107  
 108              if (substr($token['type'], -5) === 'Start')
 109              {
 110                  ++$depth;
 111              }
 112              elseif (substr($token['type'], -3) === 'End')
 113              {
 114                  --$depth;
 115              }
 116  
 117              $pos = max($skipPos, $token['pos'] + $token['len']);
 118          }
 119  
 120          // Test for the presence of an unescaped dot
 121          if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
 122          {
 123              if (strpos($def['modifiers'], 's') !== false
 124               || strpos($literal, "\n") !== false)
 125              {
 126                  return '//';
 127              }
 128  
 129              $patterns[] = '.';
 130  
 131              // Remove unescaped dots
 132              $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
 133          }
 134  
 135          // Remove unescaped quantifiers *, + and ?
 136          $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);
 137  
 138          // Remove unescaped quantifiers {}
 139          $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);
 140  
 141          // Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
 142          $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);
 143  
 144          // Remove unescaped ^, | and $
 145          $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);
 146  
 147          // Escape unescaped - and ] so they are safe to use in a character class
 148          $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);
 149  
 150          // If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
 151          if (strpos($def['modifiers'], 'D') === false)
 152          {
 153              $literal .= "\n";
 154          }
 155  
 156          // Add the literal portion of the regexp to the patterns, as a character class
 157          if ($literal !== '')
 158          {
 159              $patterns[] = '[' . $literal . ']';
 160          }
 161  
 162          // Test whether this regexp actually matches anything
 163          if (empty($patterns))
 164          {
 165              return '/^$/D';
 166          }
 167  
 168          // Build the allowed characters regexp
 169          $regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];
 170  
 171          // Add the modifiers
 172          if (strpos($def['modifiers'], 'i') !== false)
 173          {
 174              $regexp .= 'i';
 175          }
 176          if (strpos($def['modifiers'], 'u') !== false)
 177          {
 178              $regexp .= 'u';
 179          }
 180  
 181          return $regexp;
 182      }
 183  
 184      /**
 185      * Return the name of each capture in given regexp
 186      *
 187      * Will return an empty string for unnamed captures
 188      *
 189      * @param  string   $regexp
 190      * @return string[]
 191      */
 192  	public static function getCaptureNames($regexp)
 193      {
 194          $map        = [''];
 195          $regexpInfo = self::parse($regexp);
 196          foreach ($regexpInfo['tokens'] as $tok)
 197          {
 198              if ($tok['type'] === 'capturingSubpatternStart')
 199              {
 200                  $map[] = $tok['name'] ?? '';
 201              }
 202          }
 203  
 204          return $map;
 205      }
 206  
 207      /**
 208      * @param  string $regexp
 209      * @return array
 210      */
 211  	public static function parse($regexp)
 212      {
 213          if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
 214          {
 215              throw new RuntimeException('Could not parse regexp delimiters');
 216          }
 217  
 218          $ret = [
 219              'delimiter' => $m[1],
 220              'modifiers' => $m[3],
 221              'regexp'    => $m[2],
 222              'tokens'    => []
 223          ];
 224  
 225          $regexp = $m[2];
 226  
 227          $openSubpatterns = [];
 228  
 229          $pos = 0;
 230          $regexpLen = strlen($regexp);
 231  
 232          while ($pos < $regexpLen)
 233          {
 234              switch ($regexp[$pos])
 235              {
 236                  case '\\':
 237                      // skip next character
 238                      $pos += 2;
 239                      break;
 240  
 241                  case '[':
 242                      if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
 243                      {
 244                          throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
 245                      }
 246  
 247                      $ret['tokens'][] = [
 248                          'pos'         => $pos,
 249                          'len'         => strlen($m[0]),
 250                          'type'        => 'characterClass',
 251                          'content'     => $m[1],
 252                          'quantifiers' => $m[2]
 253                      ];
 254  
 255                      $pos += strlen($m[0]);
 256                      break;
 257  
 258                  case '(':
 259                      if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
 260                      {
 261                          // This is an option (?i) so we skip past the right parenthesis
 262                          $ret['tokens'][] = [
 263                              'pos'     => $pos,
 264                              'len'     => strlen($m[0]),
 265                              'type'    => 'option',
 266                              'options' => $m[1]
 267                          ];
 268  
 269                          $pos += strlen($m[0]);
 270                          break;
 271                      }
 272  
 273                      // This should be a subpattern, we just have to sniff which kind
 274                      if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[a-z_0-9]+)>|'(?<name>[a-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
 275                      {
 276                          // This is a named capture
 277                          $tok = [
 278                              'pos'  => $pos,
 279                              'len'  => strlen($m[0][0]),
 280                              'type' => 'capturingSubpatternStart',
 281                              'name' => $m['name'][0]
 282                          ];
 283  
 284                          $pos += strlen($m[0][0]);
 285                      }
 286                      elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
 287                      {
 288                          // This is a non-capturing subpattern (?:xxx)
 289                          $tok = [
 290                              'pos'     => $pos,
 291                              'len'     => strlen($m[0]),
 292                              'type'    => 'nonCapturingSubpatternStart',
 293                              'options' => $m[1]
 294                          ];
 295  
 296                          $pos += strlen($m[0]);
 297                      }
 298                      elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
 299                      {
 300                          /* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
 301                          $tok = [
 302                              'pos'     => $pos,
 303                              'len'     => strlen($m[0]),
 304                              'type'    => 'nonCapturingSubpatternStart',
 305                              'subtype' => 'atomic'
 306                          ];
 307  
 308                          $pos += strlen($m[0]);
 309                      }
 310                      elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
 311                      {
 312                          // This is an assertion
 313                          $assertions = [
 314                              '='  => 'lookahead',
 315                              '<=' => 'lookbehind',
 316                              '!'  => 'negativeLookahead',
 317                              '<!' => 'negativeLookbehind'
 318                          ];
 319  
 320                          $tok = [
 321                              'pos'     => $pos,
 322                              'len'     => strlen($m[0]),
 323                              'type'    => $assertions[$m[1]] . 'AssertionStart'
 324                          ];
 325  
 326                          $pos += strlen($m[0]);
 327                      }
 328                      elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
 329                      {
 330                          throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
 331                      }
 332                      else
 333                      {
 334                          // This should be a normal capture
 335                          $tok = [
 336                              'pos'  => $pos,
 337                              'len'  => 1,
 338                              'type' => 'capturingSubpatternStart'
 339                          ];
 340  
 341                          ++$pos;
 342                      }
 343  
 344                      $openSubpatterns[] = count($ret['tokens']);
 345                      $ret['tokens'][] = $tok;
 346                      break;
 347  
 348                  case ')':
 349                      if (empty($openSubpatterns))
 350                      {
 351                          throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
 352                      }
 353  
 354                      // Add the key to this token to its matching token and capture this subpattern's
 355                      // content
 356                      $k = array_pop($openSubpatterns);
 357                      $startToken =& $ret['tokens'][$k];
 358                      $startToken['endToken'] = count($ret['tokens']);
 359                      $startToken['content']  = substr(
 360                          $regexp,
 361                          $startToken['pos'] + $startToken['len'],
 362                          $pos - ($startToken['pos'] + $startToken['len'])
 363                      );
 364  
 365                      // Look for quantifiers after the subpattern, e.g. (?:ab)++
 366                      $spn = strspn($regexp, '+*?', 1 + $pos);
 367                      $quantifiers = substr($regexp, 1 + $pos, $spn);
 368  
 369                      $ret['tokens'][] = [
 370                          'pos'  => $pos,
 371                          'len'  => 1 + $spn,
 372                          'type' => substr($startToken['type'], 0, -5) . 'End',
 373                          'quantifiers' => $quantifiers
 374                      ];
 375  
 376                      unset($startToken);
 377  
 378                      $pos += 1 + $spn;
 379                      break;
 380  
 381                  default:
 382                      ++$pos;
 383              }
 384          }
 385  
 386          if (!empty($openSubpatterns))
 387          {
 388              throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
 389          }
 390  
 391          return $ret;
 392      }
 393  }


Generated: Mon Nov 25 19:05:08 2024 Cross-referenced by PHPXref 0.7.1