Skip to content

Commit 801a389

Browse files
committed
Compile the pattern for PatternFormatter
The verification that was made upon the pattern was almost like a parser/tokenizer in itself. This change leverages that behavior to introduce full pattern compilation. When the PatternFormatter encounters a novel pattern that it has never seen before, it compiles that pattern into a 3-step CompiledPattern instance that has a search regex, a replacement pattern and instructions for the callback. Upon seeing an already compiled pattern, all the PatternFormatter has to do is go through the motions (one preg_replace_callback) of the existing compiled pattern. Further steps for pattern canonicalization could be taken, such as normalizing equivalent patterns into a single form, so they could share the same cached space. However, that micro-optimization was too expensive and counter-productive. This change also opens up possibilities for in-file warmup, as CompiledPattern instances are simple objects. A user could pre-compile their hot-path patterns beforehand to share the cache even across different processes.
1 parent e101c99 commit 801a389

File tree

3 files changed

+230
-213
lines changed

3 files changed

+230
-213
lines changed

src/Internal/CompiledPattern.php

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
<?php
2+
3+
/*
4+
* SPDX-FileCopyrightText: (c) Respect Project Contributors
5+
* SPDX-License-Identifier: ISC
6+
* SPDX-FileContributor: Alexandre Gomes Gaigalas <alganet@gmail.com>
7+
*/
8+
9+
declare(strict_types=1);
10+
11+
namespace Respect\StringFormatter\Internal;
12+
13+
use Respect\StringFormatter\InvalidFormatterException;
14+
15+
use function array_keys;
16+
use function count;
17+
use function implode;
18+
use function mb_strtolower;
19+
use function mb_strtoupper;
20+
use function mb_substr;
21+
use function preg_match;
22+
use function preg_match_all;
23+
use function sprintf;
24+
use function str_starts_with;
25+
use function strtolower;
26+
use function substr;
27+
28+
use const PREG_OFFSET_CAPTURE;
29+
30+
final class CompiledPattern
31+
{
32+
private const array FILTERS = [
33+
'#' => '.',
34+
'0' => '\p{N}',
35+
'A' => '\p{Lu}',
36+
'a' => '\p{Ll}',
37+
'C' => '\p{L}',
38+
'W' => '\p{L}|\p{N}',
39+
];
40+
41+
private const array TRANSFORM_MAP = ['l' => 'lower', 'u' => 'upper', 'i' => 'invert'];
42+
43+
/** @var array<string, CompiledPattern> */
44+
private static array $compiledPatterns = [];
45+
46+
/** @var array<string, string> */
47+
private static array $compiledQualifiers = [];
48+
49+
/** @param array<int, array{filter: string, transform: string|null}> $instructions */
50+
private function __construct(
51+
private(set) readonly string $pattern,
52+
private(set) readonly string $search,
53+
private(set) readonly string $replacement,
54+
private(set) readonly array $instructions,
55+
) {
56+
}
57+
58+
public static function compile(string $pattern): self
59+
{
60+
if (isset(self::$compiledPatterns[$pattern])) {
61+
return self::$compiledPatterns[$pattern];
62+
}
63+
64+
if ($pattern === '') {
65+
throw new InvalidFormatterException('Pattern cannot be empty');
66+
}
67+
68+
$search = '';
69+
$replacement = '';
70+
$instructions = [];
71+
$groupIndex = 1;
72+
73+
$transformState = null;
74+
$nextTransform = null;
75+
76+
preg_match_all(sprintf(
77+
'/(?:\\\\.|[%1$s]|(?:\{[^}]*\}|[*+?])|[^\\\%1$s{}+*?]+|.)/u',
78+
implode('', array_keys(self::FILTERS)),
79+
), $pattern, $tokens, PREG_OFFSET_CAPTURE);
80+
81+
$tokenList = $tokens[0];
82+
$count = count($tokenList);
83+
84+
for ($i = 0; $i < $count; $i++) {
85+
[$tokenText, $offset] = $tokenList[$i];
86+
87+
if (str_starts_with($tokenText, '\\')) {
88+
if ($tokenText === '\\') {
89+
throw new InvalidFormatterException('Incomplete escape sequence at end of pattern');
90+
}
91+
92+
$char = mb_substr($tokenText, 1);
93+
94+
if ($char === 'd') {
95+
$inner = '.';
96+
$search .= sprintf('((?:.*?%s){0,1})', $inner);
97+
$replacement .= sprintf('%%%d$', $groupIndex);
98+
$instructions[$groupIndex] = ['filter' => sprintf('/%s/u', $inner), 'transform' => 'delete'];
99+
$groupIndex++;
100+
continue;
101+
}
102+
103+
if ($char === 'E') {
104+
$transformState = null;
105+
continue;
106+
}
107+
108+
if (isset(self::TRANSFORM_MAP[$char])) {
109+
$nextTransform = self::TRANSFORM_MAP[$char];
110+
continue;
111+
}
112+
113+
$lowerChar = strtolower($char);
114+
if (isset(self::TRANSFORM_MAP[$lowerChar]) && $char !== $lowerChar) {
115+
$transformState = self::TRANSFORM_MAP[$lowerChar];
116+
continue;
117+
}
118+
119+
$replacement .= $char;
120+
continue;
121+
}
122+
123+
if (isset(self::FILTERS[$tokenText])) {
124+
$filterChar = $tokenText;
125+
$regexQuantifier = '{0,1}';
126+
127+
if (isset($tokenList[$i + 1]) && preg_match('/^(?:\{[^}]*\}|[*+?])$/u', $tokenList[$i + 1][0])) {
128+
$i++;
129+
$regexQuantifier = self::compileQualifier($tokenList[$i][0], $tokenList[$i][1]);
130+
}
131+
132+
$inner = self::FILTERS[$filterChar];
133+
$search .= sprintf('((?:.*?%s)%s)', $inner, $regexQuantifier);
134+
135+
$replacement .= sprintf('%%%d$', $groupIndex);
136+
$instructions[$groupIndex] = [
137+
'filter' => sprintf('/%s/u', $inner),
138+
'transform' => $nextTransform ?? $transformState,
139+
];
140+
141+
$groupIndex++;
142+
$nextTransform = null;
143+
continue;
144+
}
145+
146+
if (preg_match('/^(?:\{[^}]*\}|[*+?])$/u', $tokenText)) {
147+
throw new InvalidFormatterException(
148+
sprintf('Quantifier "%s" must follow a filter pattern at position %d', $tokenText[0], $offset),
149+
);
150+
}
151+
152+
if (str_starts_with($tokenText, '{')) {
153+
throw new InvalidFormatterException(
154+
sprintf('Invalid or malformed quantifier at position %d', $offset),
155+
);
156+
}
157+
158+
$replacement .= $tokenText;
159+
}
160+
161+
return self::$compiledPatterns[$pattern] = new self(
162+
$pattern,
163+
'/^' . $search . '/us',
164+
$replacement,
165+
$instructions,
166+
);
167+
}
168+
169+
public static function transform(string $val, string|null $transform): string
170+
{
171+
return match ($transform) {
172+
'delete' => '',
173+
'lower' => mb_strtolower($val),
174+
'upper' => mb_strtoupper($val),
175+
'invert' => mb_strtolower($val) ^ mb_strtoupper($val) ^ $val,
176+
default => $val,
177+
};
178+
}
179+
180+
private static function compileQualifier(string $token, int $offset): string
181+
{
182+
if (isset(self::$compiledQualifiers[$token])) {
183+
return self::$compiledQualifiers[$token];
184+
}
185+
186+
if ($token === '*') {
187+
return '*';
188+
}
189+
190+
if ($token === '+') {
191+
return '{1,}';
192+
}
193+
194+
$content = substr($token, 1, -1);
195+
if ($content === '' || $content === ',' || !preg_match('/^(\d+(?:,\d*)?|,\d+)$/', $content)) {
196+
throw new InvalidFormatterException(sprintf('Invalid or malformed quantifier at position %d', $offset));
197+
}
198+
199+
preg_match('/^\{(\d*)(?:,(\d*))?\}$/', $token, $m);
200+
$max = $m[2] ?? $m[1];
201+
202+
return self::$compiledQualifiers[$token] = $max === '' ? '*' : sprintf('{0,%s}', $max);
203+
}
204+
}

0 commit comments

Comments
 (0)