strlen($def['regexp']), 'len' => 0, 'type' => 'end' ]; $patterns = []; // Collect the literal portions of the source regexp while testing for alternations $literal = ''; $pos = 0; $skipPos = 0; $depth = 0; foreach ($def['tokens'] as $token) { // Skip options if ($token['type'] === 'option') { $skipPos = max($skipPos, $token['pos'] + $token['len']); } // Skip assertions if (strpos($token['type'], 'AssertionStart') !== false) { $endToken = $def['tokens'][$token['endToken']]; $skipPos = max($skipPos, $endToken['pos'] + $endToken['len']); } if ($token['pos'] >= $skipPos) { if ($token['type'] === 'characterClass') { $patterns[] = '[' . $token['content'] . ']'; } if ($token['pos'] > $pos) { // Capture the content between last position and current position $tmp = substr($def['regexp'], $pos, $token['pos'] - $pos); // Append the content to the literal portion $literal .= $tmp; // Test for alternations if it's the root of the regexp if (!$depth) { // Remove literal backslashes for convenience $tmp = str_replace('\\\\', '', $tmp); // Look for an unescaped | that is not followed by ^ if (preg_match('/(? $m[1], 'modifiers' => $m[3], 'regexp' => $m[2], 'tokens' => [] ]; $regexp = $m[2]; $openSubpatterns = []; $pos = 0; $regexpLen = strlen($regexp); while ($pos < $regexpLen) { switch ($regexp[$pos]) { case '\\': // skip next character $pos += 2; break; case '[': if (!preg_match('#\\[(.*?(? $pos, 'len' => strlen($m[0]), 'type' => 'characterClass', 'content' => $m[1], 'quantifiers' => $m[2] ]; $pos += strlen($m[0]); break; case '(': if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos)) { // This is an option (?i) so we skip past the right parenthesis $ret['tokens'][] = [ 'pos' => $pos, 'len' => strlen($m[0]), 'type' => 'option', 'options' => $m[1] ]; $pos += strlen($m[0]); break; } // This should be a subpattern, we just have to sniff which kind if (preg_match("#(?J)\\(\\?(?:P?<(?[a-z_0-9]+)>|'(?[a-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos)) { // This is a named capture $tok = [ 'pos' => $pos, 'len' => strlen($m[0][0]), 'type' => 'capturingSubpatternStart', 'name' => $m['name'][0] ]; $pos += strlen($m[0][0]); } elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos)) { // This is a non-capturing subpattern (?:xxx) $tok = [ 'pos' => $pos, 'len' => strlen($m[0]), 'type' => 'nonCapturingSubpatternStart', 'options' => $m[1] ]; $pos += strlen($m[0]); } elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos)) { /* This is a non-capturing subpattern with atomic grouping "(?>x+)" */ $tok = [ 'pos' => $pos, 'len' => strlen($m[0]), 'type' => 'nonCapturingSubpatternStart', 'subtype' => 'atomic' ]; $pos += strlen($m[0]); } elseif (preg_match('#\\(\\?( 'lookahead', '<=' => 'lookbehind', '!' => 'negativeLookahead', ' 'negativeLookbehind' ]; $tok = [ 'pos' => $pos, 'len' => strlen($m[0]), 'type' => $assertions[$m[1]] . 'AssertionStart' ]; $pos += strlen($m[0]); } elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos)) { throw new RuntimeException('Unsupported subpattern type at pos ' . $pos); } else { // This should be a normal capture $tok = [ 'pos' => $pos, 'len' => 1, 'type' => 'capturingSubpatternStart' ]; ++$pos; } $openSubpatterns[] = count($ret['tokens']); $ret['tokens'][] = $tok; break; case ')': if (empty($openSubpatterns)) { throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos); } // Add the key to this token to its matching token and capture this subpattern's // content $k = array_pop($openSubpatterns); $startToken =& $ret['tokens'][$k]; $startToken['endToken'] = count($ret['tokens']); $startToken['content'] = substr( $regexp, $startToken['pos'] + $startToken['len'], $pos - ($startToken['pos'] + $startToken['len']) ); // Look for quantifiers after the subpattern, e.g. (?:ab)++ $spn = strspn($regexp, '+*?', 1 + $pos); $quantifiers = substr($regexp, 1 + $pos, $spn); $ret['tokens'][] = [ 'pos' => $pos, 'len' => 1 + $spn, 'type' => substr($startToken['type'], 0, -5) . 'End', 'quantifiers' => $quantifiers ]; unset($startToken); $pos += 1 + $spn; break; default: ++$pos; } } if (!empty($openSubpatterns)) { throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']); } return $ret; } }