[ Index ] |
PHP Cross Reference of phpBB-3.3.14-deutsch |
[Summary view] [Print] [Text view]
<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2022 The s9e authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Configurator\Helpers;

use RuntimeException;

/**
* Lightweight tokenizer for PCRE regexps, plus helpers built on top of the token list
*
* The parser only identifies character classes, inline options, subpatterns (capturing,
* non-capturing, atomic) and lookaround assertions. Everything else is left as raw text between
* tokens.
*/
abstract class RegexpParser
{
	/**
	* Generate a regexp that matches any single character allowed in a regexp
	*
	* This method will generate a regexp that can be used to determine whether a given character
	* could in theory be allowed in a string that matches the source regexp. For example, the source
	* regexp /^a+$/D would generate /[a]/ while /^foo\d+$/D would generate /[foo\d]/ whereas the
	* regexp /foo/ would generate // because it's not anchored so any characters could be found
	* before or after the literal "foo".
	*
	* @param  string $regexp Source regexp
	* @return string         Regexp that matches any single character allowed in the source regexp
	*
	* @throws RuntimeException if the source regexp cannot be parsed
	*/
	public static function getAllowedCharacterRegexp($regexp)
	{
		$def = self::parse($regexp);

		// If the regexp uses the multiline modifier, this regexp can't match the whole string if
		// it contains newlines, so in effect it could allow any content
		if (strpos($def['modifiers'], 'm') !== false)
		{
			return '//';
		}

		// An unanchored regexp can be surrounded by any content
		if (substr($def['regexp'], 0, 1) !== '^'
		 || substr($def['regexp'], -1) !== '$')
		{
			return '//';
		}

		// Append a token to mark the end of the regexp
		$def['tokens'][] = [
			'pos'  => strlen($def['regexp']),
			'len'  => 0,
			'type' => 'end'
		];

		$patterns = [];

		// Collect the literal portions of the source regexp while testing for alternations
		$literal = '';
		$pos     = 0;
		$skipPos = 0;
		$depth   = 0;
		foreach ($def['tokens'] as $token)
		{
			// Skip options
			if ($token['type'] === 'option')
			{
				$skipPos = max($skipPos, $token['pos'] + $token['len']);
			}

			// Skip assertions
			if (strpos($token['type'], 'AssertionStart') !== false)
			{
				$endToken = $def['tokens'][$token['endToken']];
				$skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
			}

			if ($token['pos'] >= $skipPos)
			{
				if ($token['type'] === 'characterClass')
				{
					$patterns[] = '[' . $token['content'] . ']';
				}

				if ($token['pos'] > $pos)
				{
					// Capture the content between last position and current position
					$tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

					// Append the content to the literal portion
					$literal .= $tmp;

					// Test for alternations if it's the root of the regexp
					if (!$depth)
					{
						// Remove literal backslashes for convenience
						$tmp = str_replace('\\\\', '', $tmp);

						// Look for an unescaped | that is not followed by ^
						if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
						{
							return '//';
						}

						// Look for an unescaped | that is not preceded by $
						if (preg_match('/(?<![$\\\\])\\|/', $tmp))
						{
							return '//';
						}
					}
				}
			}

			// Track the nesting level so that alternations are only tested at the root
			if (substr($token['type'], -5) === 'Start')
			{
				++$depth;
			}
			elseif (substr($token['type'], -3) === 'End')
			{
				--$depth;
			}

			$pos = max($skipPos, $token['pos'] + $token['len']);
		}

		// Test for the presence of an unescaped dot
		if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
		{
			// With the "s" modifier the dot matches anything. The same conclusion is used if the
			// literal portion contains a newline
			if (strpos($def['modifiers'], 's') !== false
			 || strpos($literal, "\n") !== false)
			{
				return '//';
			}

			$patterns[] = '.';

			// Remove unescaped dots
			$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
		}

		// Remove unescaped quantifiers *, + and ?
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

		// Remove unescaped quantifiers {}
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

		// Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

		// Remove unescaped ^, | and $
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

		// Escape unescaped -, ^ and ] so they are safe to use in a character class
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

		// If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
		if (strpos($def['modifiers'], 'D') === false)
		{
			$literal .= "\n";
		}

		// Add the literal portion of the regexp to the patterns, as a character class
		if ($literal !== '')
		{
			$patterns[] = '[' . $literal . ']';
		}

		// Test whether this regexp actually matches anything
		if (empty($patterns))
		{
			return '/^$/D';
		}

		// Build the allowed characters regexp
		$regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

		// Add the modifiers that change which characters are matched
		if (strpos($def['modifiers'], 'i') !== false)
		{
			$regexp .= 'i';
		}
		if (strpos($def['modifiers'], 'u') !== false)
		{
			$regexp .= 'u';
		}

		return $regexp;
	}

	/**
	* Return the name of each capture in given regexp
	*
	* Will return an empty string for unnamed captures. Index 0 stands for the whole match and is
	* always an empty string.
	*
	* @param  string   $regexp
	* @return string[]
	*
	* @throws RuntimeException if the regexp cannot be parsed
	*/
	public static function getCaptureNames($regexp)
	{
		$map        = [''];
		$regexpInfo = self::parse($regexp);
		foreach ($regexpInfo['tokens'] as $tok)
		{
			if ($tok['type'] === 'capturingSubpatternStart')
			{
				$map[] = $tok['name'] ?? '';
			}
		}

		return $map;
	}

	/**
	* Split a regexp into its delimiter, modifiers, body and a list of structural tokens
	*
	* The returned array has the keys "delimiter", "modifiers", "regexp" (the body without
	* delimiters) and "tokens". Each token has at least "pos" (byte offset in the body), "len" and
	* "type". Types are "characterClass", "option", and for subpatterns and assertions a pair of
	* "...Start" / "...End" tokens. A Start token gets "endToken" (index of its End token) and
	* "content" once its closing parenthesis is found.
	*
	* @param  string $regexp
	* @return array
	*
	* @throws RuntimeException if the delimiters cannot be identified, if brackets or parentheses
	*                          are unbalanced, or if an unsupported "(?" construct is found
	*/
	public static function parse($regexp)
	{
		if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
		{
			throw new RuntimeException('Could not parse regexp delimiters');
		}

		$ret = [
			'delimiter' => $m[1],
			'modifiers' => $m[3],
			'regexp'    => $m[2],
			'tokens'    => []
		];

		$regexp = $m[2];

		// Indexes (in $ret['tokens']) of the Start tokens that have not been closed yet
		$openSubpatterns = [];

		$pos = 0;
		$regexpLen = strlen($regexp);

		while ($pos < $regexpLen)
		{
			switch ($regexp[$pos])
			{
				case '\\':
					// skip next character
					$pos += 2;
					break;

				case '[':
					if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
					}

					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => strlen($m[0]),
						'type'        => 'characterClass',
						'content'     => $m[1],
						'quantifiers' => $m[2]
					];

					$pos += strlen($m[0]);
					break;

				case '(':
					if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
					{
						// This is an option (?i) so we skip past the right parenthesis
						$ret['tokens'][] = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'option',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
						break;
					}

					// This should be a subpattern, we just have to sniff which kind.
					// Group names may contain uppercase letters, so both cases are accepted
					if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[A-Za-z_0-9]+)>|'(?<name>[A-Za-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
					{
						// This is a named capture
						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0][0]),
							'type' => 'capturingSubpatternStart',
							'name' => $m['name'][0]
						];

						$pos += strlen($m[0][0]);
					}
					elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
					{
						// This is a non-capturing subpattern (?:xxx)
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
					{
						/* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'subtype' => 'atomic'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
					{
						// This is an assertion
						$assertions = [
							'='  => 'lookahead',
							'<=' => 'lookbehind',
							'!'  => 'negativeLookahead',
							'<!' => 'negativeLookbehind'
						];

						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0]),
							'type' => $assertions[$m[1]] . 'AssertionStart'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
					}
					else
					{
						// This should be a normal capture
						$tok = [
							'pos'  => $pos,
							'len'  => 1,
							'type' => 'capturingSubpatternStart'
						];

						++$pos;
					}

					// Remember the index of this Start token so it can be paired with its End
					$openSubpatterns[] = count($ret['tokens']);
					$ret['tokens'][]   = $tok;
					break;

				case ')':
					if (empty($openSubpatterns))
					{
						throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
					}

					// Add the key to this token to its matching token and capture this subpattern's
					// content
					$k = array_pop($openSubpatterns);
					$startToken =& $ret['tokens'][$k];
					$startToken['endToken'] = count($ret['tokens']);
					$startToken['content']  = substr(
						$regexp,
						$startToken['pos'] + $startToken['len'],
						$pos - ($startToken['pos'] + $startToken['len'])
					);

					// Look for quantifiers after the subpattern, e.g. (?:ab)++
					$spn = strspn($regexp, '+*?', 1 + $pos);
					$quantifiers = substr($regexp, 1 + $pos, $spn);

					// The End token's type mirrors its Start token: "xxxStart" becomes "xxxEnd"
					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => 1 + $spn,
						'type'        => substr($startToken['type'], 0, -5) . 'End',
						'quantifiers' => $quantifiers
					];

					// Break the reference so later assignments don't alter the token
					unset($startToken);

					$pos += 1 + $spn;
					break;

				default:
					++$pos;
			}
		}

		if (!empty($openSubpatterns))
		{
			throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
		}

		return $ret;
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Mon Nov 25 19:05:08 2024 | Cross-referenced by PHPXref 0.7.1 |