From 16308a30fbac786ffc8d6e604a87ceb5ce7d0631 Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sat, 8 Dec 2018 19:17:02 +0100 Subject: [PATCH 1/6] improved parser regex to report single tokens for series of non-token characters, added opcode optimized function aliases --- src/Parser/RegularParser.php | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index e1ea646..1d47706 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -42,7 +42,7 @@ public function parse($text) $this->tokens = $this->tokenize($text); $this->backtracks = array(); $this->position = 0; - $this->tokensCount = count($this->tokens); + $this->tokensCount = \count($this->tokens); $shortcodes = array(); while($this->position < $this->tokensCount) { @@ -52,7 +52,7 @@ public function parse($text) $names = array(); $this->beginBacktrack(); $matches = $this->shortcode($names); - if(is_array($matches)) { + if(\is_array($matches)) { foreach($matches as $shortcode) { $shortcodes[] = $shortcode; } @@ -130,11 +130,11 @@ private function content(array &$names) $this->beginBacktrack(); $matchedShortcodes = $this->shortcode($names); - if(is_string($matchedShortcodes)) { + if(\is_string($matchedShortcodes)) { $closingName = $matchedShortcodes; break; } - if(is_array($matchedShortcodes)) { + if(\is_array($matchedShortcodes)) { foreach($matchedShortcodes as $matchedShortcode) { $shortcodes[] = $matchedShortcode; } @@ -168,7 +168,7 @@ private function close(array &$names) if(!$this->match(self::TOKEN_STRING, $setName, true)) { return false; } if(!$this->match(self::TOKEN_CLOSE)) { return false; } - return in_array($closingName, $names, true) ? $closingName : false; + return \in_array($closingName, $names, true) ? 
$closingName : false; } private function bbCode() @@ -237,7 +237,7 @@ private function getBacktrack() private function backtrack($modifyPosition = true) { $tokens = array_pop($this->backtracks); - $count = count($tokens); + $count = \count($tokens); if($modifyPosition) { $this->position -= $count; } @@ -273,6 +273,7 @@ private function match($type, $callback = null, $ws = false) } unset($backtrack); + /** @var callable $callback */ $callback && $callback($token); $this->position++; @@ -321,6 +322,13 @@ private function getTokenizerRegex(SyntaxInterface $syntax) return preg_replace('/(.)/us', '\\\\$0', $text); }; + $symbols = array_map($quote, [ + $syntax->getOpeningTag(), + $syntax->getClosingTag(), + $syntax->getClosingTagMarker(), + $syntax->getParameterValueSeparator(), + $syntax->getParameterValueDelimiter(), + ]); $rules = array( $group($syntax->getOpeningTag(), 'open'), $group($syntax->getClosingTag(), 'close'), @@ -328,14 +336,7 @@ private function getTokenizerRegex(SyntaxInterface $syntax) $group($syntax->getParameterValueSeparator(), 'separator'), $group($syntax->getParameterValueDelimiter(), 'delimiter'), '(?<ws>\s+)', - '(?<string>\\\\.|(?:(?!'.implode('|', array( - $quote($syntax->getOpeningTag()), - $quote($syntax->getClosingTag()), - $quote($syntax->getClosingTagMarker()), - $quote($syntax->getParameterValueSeparator()), - $quote($syntax->getParameterValueDelimiter()), - '\s+', - )).').)+)', + '(?<string>(?:(?!'.implode('|', $symbols).'|\s)(?:\\\\.|.))+)', ); return '~('.implode('|', $rules).')~us'; From 495c2114743eb27a35aac6ecd3e58d2c4279d8e0 Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sat, 8 Dec 2018 20:26:30 +0100 Subject: [PATCH 2/6] backtracks now rely on their offsets only, removed backtrack token storage, profiler reports over 143x performance improvement for RegularParser, a well spent Saturday afternoon --- src/Parser/RegularParser.php | 41 +++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git 
a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index 1d47706..d5e03c9 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -16,8 +16,9 @@ final class RegularParser implements ParserInterface private $tokens; private $tokensCount; private $position; - /** @var array[] */ + /** @var int[] */ private $backtracks; + private $lastBacktrack; const TOKEN_OPEN = 1; const TOKEN_CLOSE = 2; @@ -41,6 +42,7 @@ public function parse($text) { $this->tokens = $this->tokenize($text); $this->backtracks = array(); + $this->lastBacktrack = 0; $this->position = 0; $this->tokensCount = \count($this->tokens); @@ -225,32 +227,35 @@ private function value() private function beginBacktrack() { - $this->backtracks[] = array(); + $this->backtracks[] = $this->position; + $this->lastBacktrack = $this->position; } private function getBacktrack() { - // switch from array_map() to array_column() when dropping support for PHP <5.5 - return implode('', array_map(function(array $token) { return $token[1]; }, array_pop($this->backtracks))); + $position = array_pop($this->backtracks); + $backtrack = ''; + for($i = $position; $i < $this->position; $i++) { + $backtrack .= $this->tokens[$i][1]; + } + + return $backtrack; } private function backtrack($modifyPosition = true) { - $tokens = array_pop($this->backtracks); - $count = \count($tokens); + $position = array_pop($this->backtracks); if($modifyPosition) { - $this->position -= $count; + $this->position = $position; } - foreach($this->backtracks as &$backtrack) { - // array_pop() in loop is much faster than array_slice() because - // it operates directly on the passed array - for($i = 0; $i < $count; $i++) { - array_pop($backtrack); - } + $backtrack = ''; + for($i = $position; $i < $this->lastBacktrack; $i++) { + $backtrack .= $this->tokens[$i][1]; } + $this->lastBacktrack = $position; - return implode('', array_map(function(array $token) { return $token[1]; }, $tokens)); + return $backtrack; } private function 
lookahead($type) @@ -268,21 +273,13 @@ private function match($type, $callback = null, $ws = false) if(!empty($type) && $token[0] !== $type) { return false; } - foreach($this->backtracks as &$backtrack) { - $backtrack[] = $token; - } - unset($backtrack); /** @var callable $callback */ $callback && $callback($token); $this->position++; if($ws && $this->position < $this->tokensCount && $this->tokens[$this->position][0] === self::TOKEN_WS) { - $token = $this->tokens[$this->position]; $this->position++; - foreach($this->backtracks as &$backtrack) { - $backtrack[] = $token; - } } return true; From 2f52e54d13916a6d5a3b77a6427c212b9240bf5e Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sun, 9 Dec 2018 00:37:37 +0100 Subject: [PATCH 3/6] fixed issue #70, preg_match_all silently failed and returned only the small part of expected results, named captures based tokenizer was replaced with string matching which eats significant portion of previous performance gains, minor RegularParser and Processor improvements --- src/Parser/RegularParser.php | 149 ++++++++++++++--------------------- src/Processor/Processor.php | 12 +-- 2 files changed, 68 insertions(+), 93 deletions(-) diff --git a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index d5e03c9..116ed20 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -13,12 +13,14 @@ final class RegularParser implements ParserInterface { private $lexerRegex; + private $nameRegex; private $tokens; private $tokensCount; private $position; /** @var int[] */ private $backtracks; private $lastBacktrack; + private $tokenMap; const TOKEN_OPEN = 1; const TOKEN_CLOSE = 2; @@ -30,7 +32,8 @@ final class RegularParser implements ParserInterface public function __construct(SyntaxInterface $syntax = null) { - $this->lexerRegex = $this->getTokenizerRegex($syntax ?: new CommonSyntax()); + $this->lexerRegex = $this->prepareLexer($syntax ?: new CommonSyntax()); + $this->nameRegex = 
'~^'.RegexBuilderUtility::buildNameRegex().'$~us'; } /** @@ -73,29 +76,28 @@ private function getObject($name, $parameters, $bbCode, $offset, $content, $text private function shortcode(array &$names) { - $name = null; - $offset = null; - - $setName = function(array $token) use(&$name) { $name = $token[1]; }; - $setOffset = function(array $token) use(&$offset) { $offset = $token[2]; }; - - if(!$this->match(self::TOKEN_OPEN, $setOffset, true)) { return false; } - if(!$this->match(self::TOKEN_STRING, $setName, false)) { return false; } + if(!$this->match(self::TOKEN_OPEN, false)) { return false; } + $offset = $this->tokens[$this->position - 1][2]; + $this->match(self::TOKEN_WS, false); + if('' === $name = $this->match(self::TOKEN_STRING, false)) { return false; } if($this->lookahead(self::TOKEN_STRING)) { return false; } - if(!preg_match_all('~^'.RegexBuilderUtility::buildNameRegex().'$~us', $name, $matches)) { return false; } - $this->match(self::TOKEN_WS); - if(false === ($bbCode = $this->bbCode())) { return false; } + if(1 !== preg_match($this->nameRegex, $name, $matches)) { return false; } + $this->match(self::TOKEN_WS, false); + // bbCode + $bbCode = $this->match(self::TOKEN_SEPARATOR, true) ? 
$this->value() : null; + if(false === $bbCode) { return false; } + // parameters if(false === ($parameters = $this->parameters())) { return false; } // self-closing - if($this->match(self::TOKEN_MARKER, null, true)) { - if(!$this->match(self::TOKEN_CLOSE)) { return false; } + if($this->match(self::TOKEN_MARKER, true)) { + if(!$this->match(self::TOKEN_CLOSE, false)) { return false; } return array($this->getObject($name, $parameters, $bbCode, $offset, null, $this->getBacktrack())); } // just-closed or with-content - if(!$this->match(self::TOKEN_CLOSE)) { return false; } + if(!$this->match(self::TOKEN_CLOSE, false)) { return false; } $this->beginBacktrack(); $names[] = $name; list($content, $shortcodes, $closingName) = $this->content($names); @@ -120,14 +122,13 @@ private function shortcode(array &$names) private function content(array &$names) { - $content = null; + $content = ''; $shortcodes = array(); $closingName = null; - $appendContent = function(array $token) use(&$content) { $content .= $token[1]; }; while($this->position < $this->tokensCount) { while($this->position < $this->tokensCount && false === $this->lookahead(self::TOKEN_OPEN)) { - $this->match(null, $appendContent, true); + $content .= $this->match(null, true); } $this->beginBacktrack(); @@ -154,7 +155,7 @@ private function content(array &$names) $closingName = null; $this->backtrack(); - $this->match(null, $appendContent); + $content .= $this->match(null, false); } return array($this->position < $this->tokensCount ? 
$content : false, $shortcodes, $closingName); @@ -162,36 +163,25 @@ private function content(array &$names) private function close(array &$names) { - $closingName = null; - $setName = function(array $token) use(&$closingName) { $closingName = $token[1]; }; - - if(!$this->match(self::TOKEN_OPEN, null, true)) { return false; } - if(!$this->match(self::TOKEN_MARKER, null, true)) { return false; } - if(!$this->match(self::TOKEN_STRING, $setName, true)) { return false; } - if(!$this->match(self::TOKEN_CLOSE)) { return false; } + if(!$this->match(self::TOKEN_OPEN, true)) { return false; } + if(!$this->match(self::TOKEN_MARKER, true)) { return false; } + if(!$closingName = $this->match(self::TOKEN_STRING, true)) { return false; } + if(!$this->match(self::TOKEN_CLOSE, false)) { return false; } return \in_array($closingName, $names, true) ? $closingName : false; } - private function bbCode() - { - return $this->match(self::TOKEN_SEPARATOR, null, true) ? $this->value() : null; - } - private function parameters() { $parameters = array(); - $setName = function(array $token) use(&$name) { $name = $token[1]; }; while(true) { - $name = null; - - $this->match(self::TOKEN_WS); + $this->match(self::TOKEN_WS, false); if($this->lookahead(self::TOKEN_MARKER) || $this->lookahead(self::TOKEN_CLOSE)) { break; } - if(!$this->match(self::TOKEN_STRING, $setName, true)) { return false; } - if(!$this->match(self::TOKEN_SEPARATOR, null, true)) { $parameters[$name] = null; continue; } + if(!$name = $this->match(self::TOKEN_STRING, true)) { return false; } + if(!$this->match(self::TOKEN_SEPARATOR, true)) { $parameters[$name] = null; continue; } if(false === ($value = $this->value())) { return false; } - $this->match(self::TOKEN_WS); + $this->match(self::TOKEN_WS, false); $parameters[$name] = $value; } @@ -202,19 +192,19 @@ private function parameters() private function value() { $value = ''; - $appendValue = function(array $token) use(&$value) { $value .= $token[1]; }; - 
if($this->match(self::TOKEN_DELIMITER)) { + if($this->match(self::TOKEN_DELIMITER, false)) { while($this->position < $this->tokensCount && false === $this->lookahead(self::TOKEN_DELIMITER)) { - $this->match(null, $appendValue); + $value .= $this->match(null, false); } - return $this->match(self::TOKEN_DELIMITER) ? $value : false; + return $this->match(self::TOKEN_DELIMITER, false) ? $value : false; } - if($this->match(self::TOKEN_STRING, $appendValue)) { - while($this->match(self::TOKEN_STRING, $appendValue)) { - continue; + if($tmp = $this->match(self::TOKEN_STRING, false)) { + $value .= $tmp; + while($tmp = $this->match(self::TOKEN_STRING, false)) { + $value .= $tmp; } return $value; @@ -263,79 +253,62 @@ private function lookahead($type) return $this->position < $this->tokensCount && (empty($type) || $this->tokens[$this->position][0] === $type); } - private function match($type, $callback = null, $ws = false) + private function match($type, $ws) { if($this->position >= $this->tokensCount) { - return false; + return ''; } $token = $this->tokens[$this->position]; if(!empty($type) && $token[0] !== $type) { - return false; + return ''; } - /** @var callable $callback */ - $callback && $callback($token); $this->position++; - if($ws && $this->position < $this->tokensCount && $this->tokens[$this->position][0] === self::TOKEN_WS) { $this->position++; } - return true; + return $token[1]; } /* --- LEXER ----------------------------------------------------------- */ private function tokenize($text) { - preg_match_all($this->lexerRegex, $text, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + preg_match_all($this->lexerRegex, $text, $matches, PREG_OFFSET_CAPTURE); + if(preg_last_error() !== PREG_NO_ERROR) { + throw new \RuntimeException(sprintf('PCRE failure `%s`.', preg_last_error())); + } + $tokens = array(); $position = 0; - - foreach($matches as $match) { - switch(true) { - case -1 !== $match['open'][1]: { $token = $match['open'][0]; $type = self::TOKEN_OPEN; break; } 
- case -1 !== $match['close'][1]: { $token = $match['close'][0]; $type = self::TOKEN_CLOSE; break; } - case -1 !== $match['marker'][1]: { $token = $match['marker'][0]; $type = self::TOKEN_MARKER; break; } - case -1 !== $match['separator'][1]: { $token = $match['separator'][0]; $type = self::TOKEN_SEPARATOR; break; } - case -1 !== $match['delimiter'][1]: { $token = $match['delimiter'][0]; $type = self::TOKEN_DELIMITER; break; } - case -1 !== $match['ws'][1]: { $token = $match['ws'][0]; $type = self::TOKEN_WS; break; } - default: { $token = $match['string'][0]; $type = self::TOKEN_STRING; } - } - $tokens[] = array($type, $token, $position); - $position += mb_strlen($token, 'utf-8'); + foreach($matches[0] as $match) { + $type = isset($this->tokenMap[$match[0]]) + ? $this->tokenMap[$match[0]] + : (ctype_space($match[0]) ? self::TOKEN_WS : self::TOKEN_STRING); + $tokens[] = array($type, $match[0], $position); + $position += mb_strlen($match[0], 'utf-8'); } return $tokens; } - private function getTokenizerRegex(SyntaxInterface $syntax) + private function prepareLexer(SyntaxInterface $syntax) { - $group = function($text, $group) { - return '(?<'.$group.'>'.preg_replace('/(.)/us', '\\\\$0', $text).')'; - }; + $this->tokenMap = array( + $syntax->getOpeningTag() => self::TOKEN_OPEN, + $syntax->getClosingTag() => self::TOKEN_CLOSE, + $syntax->getClosingTagMarker() => self::TOKEN_MARKER, + $syntax->getParameterValueSeparator() => self::TOKEN_SEPARATOR, + $syntax->getParameterValueDelimiter() => self::TOKEN_DELIMITER, + ); + $quote = function($text) { return preg_replace('/(.)/us', '\\\\$0', $text); }; + $symbols = array_map($quote, array_keys($this->tokenMap)); - $symbols = array_map($quote, [ - $syntax->getOpeningTag(), - $syntax->getClosingTag(), - $syntax->getClosingTagMarker(), - $syntax->getParameterValueSeparator(), - $syntax->getParameterValueDelimiter(), - ]); - $rules = array( - $group($syntax->getOpeningTag(), 'open'), - $group($syntax->getClosingTag(), 'close'), - 
$group($syntax->getClosingTagMarker(), 'marker'), - $group($syntax->getParameterValueSeparator(), 'separator'), - $group($syntax->getParameterValueDelimiter(), 'delimiter'), - '(?<ws>\s+)', - '(?<string>(?:(?!'.implode('|', $symbols).'|\s)(?:\\\\.|.))+)', - ); - - return '~('.implode('|', $rules).')~us'; + return '~('.implode('|', $symbols).'|\s+|\\\\.|[\w-]+|.)~us'; } } diff --git a/src/Processor/Processor.php b/src/Processor/Processor.php index 86df24d..8f25abe 100644 --- a/src/Processor/Processor.php +++ b/src/Processor/Processor.php @@ -106,7 +106,6 @@ private function processIteration($text, ProcessorContext $context, ProcessedSho $replaces[] = new ReplacedShortcode($shortcode, $replace); } - $replaces = array_filter($replaces); $applyEvent = new ReplaceShortcodesEvent($text, $replaces, $parent); $this->dispatchEvent(Events::REPLACE_SHORTCODES, $applyEvent); @@ -116,13 +115,16 @@ private function processIteration($text, ProcessorContext $context, ProcessedSho private function applyReplaces($text, array $replaces) { - return array_reduce(array_reverse($replaces), function($state, ReplacedShortcode $s) { + /** @var ReplacedShortcode $s */ + foreach(array_reverse($replaces) as $s) { $offset = $s->getOffset(); $length = mb_strlen($s->getText(), 'utf-8'); - $textLength = mb_strlen($state, 'utf-8'); + $textLength = mb_strlen($text, 'utf-8'); - return mb_substr($state, 0, $offset, 'utf-8').$s->getReplacement().mb_substr($state, $offset + $length, $textLength, 'utf-8'); - }, $text); + $text = mb_substr($text, 0, $offset, 'utf-8').$s->getReplacement().mb_substr($text, $offset + $length, $textLength, 'utf-8'); + } + + return $text; } private function processHandler(ParsedShortcodeInterface $parsed, ProcessorContext $context, $handler) From 590dfbb301c9e61e9d5dee7065deedbe870663c9 Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sun, 9 Dec 2018 12:29:27 +0100 Subject: [PATCH 4/6] inlined content() private method in RegularParser, nesting level is halved, simplified lookahead() 
method body as type is always specified --- src/Parser/RegularParser.php | 54 +++++++++++++++++------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index 116ed20..7b842ce 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -100,28 +100,8 @@ private function shortcode(array &$names) if(!$this->match(self::TOKEN_CLOSE, false)) { return false; } $this->beginBacktrack(); $names[] = $name; - list($content, $shortcodes, $closingName) = $this->content($names); - if(null !== $closingName && $closingName !== $name) { - array_pop($names); - array_pop($this->backtracks); - array_pop($this->backtracks); - - return $closingName; - } - if(false === $content || $closingName !== $name) { - $this->backtrack(false); - $text = $this->backtrack(false); - - return array_merge(array($this->getObject($name, $parameters, $bbCode, $offset, null, $text)), $shortcodes); - } - $content = $this->getBacktrack(); - if(!$this->close($names)) { return false; } - - return array($this->getObject($name, $parameters, $bbCode, $offset, $content, $this->getBacktrack())); - } - private function content(array &$names) - { + // begin inlined content() $content = ''; $shortcodes = array(); $closingName = null; @@ -132,13 +112,13 @@ private function content(array &$names) } $this->beginBacktrack(); - $matchedShortcodes = $this->shortcode($names); - if(\is_string($matchedShortcodes)) { - $closingName = $matchedShortcodes; + $contentMatchedShortcodes = $this->shortcode($names); + if(\is_string($contentMatchedShortcodes)) { + $closingName = $contentMatchedShortcodes; break; } - if(\is_array($matchedShortcodes)) { - foreach($matchedShortcodes as $matchedShortcode) { + if(\is_array($contentMatchedShortcodes)) { + foreach($contentMatchedShortcodes as $matchedShortcode) { $shortcodes[] = $matchedShortcode; } continue; @@ -157,8 +137,26 @@ private function content(array &$names) $content .= 
$this->match(null, false); } + $content = $this->position < $this->tokensCount ? $content : false; + // end inlined content() + + if(null !== $closingName && $closingName !== $name) { + array_pop($names); + array_pop($this->backtracks); + array_pop($this->backtracks); + + return $closingName; + } + if(false === $content || $closingName !== $name) { + $this->backtrack(false); + $text = $this->backtrack(false); - return array($this->position < $this->tokensCount ? $content : false, $shortcodes, $closingName); + return array_merge(array($this->getObject($name, $parameters, $bbCode, $offset, null, $text)), $shortcodes); + } + $content = $this->getBacktrack(); + if(!$this->close($names)) { return false; } + + return array($this->getObject($name, $parameters, $bbCode, $offset, $content, $this->getBacktrack())); } private function close(array &$names) @@ -250,7 +248,7 @@ private function backtrack($modifyPosition = true) private function lookahead($type) { - return $this->position < $this->tokensCount && (empty($type) || $this->tokens[$this->position][0] === $type); + return $this->position < $this->tokensCount && $this->tokens[$this->position][0] === $type; } private function match($type, $ws) From df9330a3cc202b9d921ef0a1d482259961ee6354 Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sun, 9 Dec 2018 12:58:32 +0100 Subject: [PATCH 5/6] disable xdebug.max_nesting_level during RegularParser parsing process and restore it afterwards: there is no reliable way of detecting the current nesting level and safely manipulating its value in runtime to avoid process-breaking Error being thrown --- src/Parser/RegularParser.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index 7b842ce..99308bf 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -43,6 +43,7 @@ public function __construct(SyntaxInterface $syntax = null) */ public function parse($text) { + $nestingLevel = 
ini_set('xdebug.max_nesting_level', -1); $this->tokens = $this->tokenize($text); $this->backtracks = array(); $this->lastBacktrack = 0; @@ -63,6 +64,7 @@ public function parse($text) } } } + ini_set('xdebug.max_nesting_level', $nestingLevel); return $shortcodes; } From 39440f19b2d1a35601561a85d24020f615905cc9 Mon Sep 17 00:00:00 2001 From: Tomasz Kowalczyk Date: Sun, 9 Dec 2018 15:06:35 +0100 Subject: [PATCH 6/6] reverted token lexer to named capture groups, for some reason I'm no longer able to reproduce PCRE_JIT_STACKLIMIT_ERROR --- src/Parser/RegularParser.php | 56 ++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/Parser/RegularParser.php b/src/Parser/RegularParser.php index 99308bf..8d799e0 100644 --- a/src/Parser/RegularParser.php +++ b/src/Parser/RegularParser.php @@ -20,7 +20,6 @@ final class RegularParser implements ParserInterface /** @var int[] */ private $backtracks; private $lastBacktrack; - private $tokenMap; const TOKEN_OPEN = 1; const TOKEN_CLOSE = 2; @@ -276,19 +275,27 @@ private function match($type, $ws) private function tokenize($text) { - preg_match_all($this->lexerRegex, $text, $matches, PREG_OFFSET_CAPTURE); - if(preg_last_error() !== PREG_NO_ERROR) { + $count = preg_match_all($this->lexerRegex, $text, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + if(false === $count || preg_last_error() !== PREG_NO_ERROR) { throw new \RuntimeException(sprintf('PCRE failure `%s`.', preg_last_error())); } $tokens = array(); $position = 0; - foreach($matches[0] as $match) { - $type = isset($this->tokenMap[$match[0]]) - ? $this->tokenMap[$match[0]] - : (ctype_space($match[0]) ? 
self::TOKEN_WS : self::TOKEN_STRING); - $tokens[] = array($type, $match[0], $position); - $position += mb_strlen($match[0], 'utf-8'); + + foreach($matches as $match) { + switch(true) { + case -1 !== $match['string'][1]: { $token = $match['string'][0]; $type = self::TOKEN_STRING; break; } + case -1 !== $match['ws'][1]: { $token = $match['ws'][0]; $type = self::TOKEN_WS; break; } + case -1 !== $match['marker'][1]: { $token = $match['marker'][0]; $type = self::TOKEN_MARKER; break; } + case -1 !== $match['delimiter'][1]: { $token = $match['delimiter'][0]; $type = self::TOKEN_DELIMITER; break; } + case -1 !== $match['separator'][1]: { $token = $match['separator'][0]; $type = self::TOKEN_SEPARATOR; break; } + case -1 !== $match['open'][1]: { $token = $match['open'][0]; $type = self::TOKEN_OPEN; break; } + case -1 !== $match['close'][1]: { $token = $match['close'][0]; $type = self::TOKEN_CLOSE; break; } + default: { throw new \RuntimeException(sprintf('Invalid token.')); } + } + $tokens[] = array($type, $token, $position); + $position += mb_strlen($token, 'utf-8'); } return $tokens; @@ -296,19 +303,30 @@ private function tokenize($text) private function prepareLexer(SyntaxInterface $syntax) { - $this->tokenMap = array( - $syntax->getOpeningTag() => self::TOKEN_OPEN, - $syntax->getClosingTag() => self::TOKEN_CLOSE, - $syntax->getClosingTagMarker() => self::TOKEN_MARKER, - $syntax->getParameterValueSeparator() => self::TOKEN_SEPARATOR, - $syntax->getParameterValueDelimiter() => self::TOKEN_DELIMITER, - ); - + $group = function($text, $group) { + return '(?<'.$group.'>'.preg_replace('/(.)/us', '\\\\$0', $text).')'; + }; $quote = function($text) { return preg_replace('/(.)/us', '\\\\$0', $text); }; - $symbols = array_map($quote, array_keys($this->tokenMap)); - return '~('.implode('|', $symbols).'|\s+|\\\\.|[\w-]+|.)~us'; + $rules = array( + '(?<string>\\\\.|(?:(?!'.implode('|', array( + $quote($syntax->getOpeningTag()), + $quote($syntax->getClosingTag()), + 
$quote($syntax->getClosingTagMarker()), + $quote($syntax->getParameterValueSeparator()), + $quote($syntax->getParameterValueDelimiter()), + '\s+', + )).').)+)', + '(?<ws>\s+)', + $group($syntax->getClosingTagMarker(), 'marker'), + $group($syntax->getParameterValueDelimiter(), 'delimiter'), + $group($syntax->getParameterValueSeparator(), 'separator'), + $group($syntax->getOpeningTag(), 'open'), + $group($syntax->getClosingTag(), 'close'), + ); + + return '~('.implode('|', $rules).')~us'; } }