Skip to content
1 change: 1 addition & 0 deletions packages/engine-javascript/src/engine-compile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export function defaultJavaScriptRegexConstructor(pattern: string, options?: ToR
return toRegExp(
pattern,
{
target: 'ES2018',
global: true,
hasIndices: true,
// This has no benefit for the standard JS engine, but it avoids a perf penalty for
Expand Down
325 changes: 268 additions & 57 deletions packages/engine-javascript/src/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ export interface JavaScriptRegexScannerOptions {
}

export class JavaScriptScanner implements PatternScanner {
regexps: (RegExp | null)[]
regexps: (RegExp | null)[][]
patternGroupCounts: number[] = []

constructor(
public patterns: (string | RegExp)[],
Expand All @@ -46,30 +47,37 @@ export class JavaScriptScanner implements PatternScanner {

this.regexps = patterns.map((p) => {
if (typeof p !== 'string') {
return p
this.patternGroupCounts.push(0)
return [p]
}
// Cache
const cached = cache?.get(p)
if (cached) {
if (cached instanceof RegExp) {
return cached

const groups = countCapturingGroups(p)
this.patternGroupCounts.push(groups)

const chunks = splitPattern(p)
return chunks.map((chunk) => {
// Cache
const cached = cache?.get(chunk)
if (cached) {
if (cached instanceof RegExp) {
return cached
}
if (forgiving)
return null
throw cached
}
if (forgiving)
return null
throw cached
}
try {
const regex = regexConstructor(p)
cache?.set(p, regex)
return regex
}
catch (e) {
cache?.set(p, e as Error)
if (forgiving)
return null
// console.error({ ...e })
throw e
}
try {
const regex = regexConstructor(chunk)
cache?.set(chunk, regex)
return regex
}
catch (e) {
cache?.set(chunk, e as Error)
if (forgiving)
return null
throw e
}
})
})
}

Expand All @@ -79,48 +87,59 @@ export class JavaScriptScanner implements PatternScanner {
: string.content
const pending: [index: number, match: RegExpExecArray, offset: number][] = []

function toResult(index: number, match: RegExpExecArray, offset = 0): IOnigMatch {
return {
index,
captureIndices: match.indices!.map((indice) => {
if (indice == null) {
return {
start: MAX,
end: MAX,
length: 0,
}
}
function toResult(index: number, match: RegExpExecArray, expectedGroupCount: number, offset = 0): IOnigMatch {
const indices = match.indices! as ([number, number] | null)[]
// Pad indices to match expected group count
while (indices.length < expectedGroupCount + 1) {
indices.push(null)
}

const captureIndices = indices.map((indice) => {
if (indice == null) {
return {
start: indice[0] + offset,
end: indice[1] + offset,
length: indice[1] - indice[0],
start: MAX,
end: MAX,
length: 0,
}
}),
}
return {
start: indice[0] + offset,
end: indice[1] + offset,
length: indice[1] - indice[0],
}
})

return {
index,
captureIndices,
}
}

for (let i = 0; i < this.regexps.length; i++) {
const regexp = this.regexps[i]
if (!regexp)
continue
try {
regexp.lastIndex = startPosition
const match = regexp.exec(str)

if (!match)
const regexpList = this.regexps[i]
for (let j = 0; j < regexpList.length; j++) {
const regexp = regexpList[j]
if (!regexp || !(regexp instanceof RegExp))
continue
try {
regexp.lastIndex = startPosition
const match = regexp.exec(str)

if (!match)
continue

// If the match is at the start position, return it immediately
if (match.index === startPosition) {
return toResult(i, match, 0)
// If the match is at the start position, return it immediately
if (match.index === startPosition) {
return toResult(i, match, this.patternGroupCounts[i], 0)
}
// Otherwise, store it for later
pending.push([i, match, 0])
}
catch (e) {
if (this.options.forgiving)
continue
throw e
}
// Otherwise, store it for later
pending.push([i, match, 0])
}
catch (e) {
if (this.options.forgiving)
continue
throw e
}
}

Expand All @@ -129,11 +148,203 @@ export class JavaScriptScanner implements PatternScanner {
const minIndex = Math.min(...pending.map(m => m[1].index))
for (const [i, match, offset] of pending) {
if (match.index === minIndex) {
return toResult(i, match, offset)
return toResult(i, match, this.patternGroupCounts[i], offset)
}
}
}

return null
}
}

/**
 * Splits a long pattern that uses case-insensitive matching into several
 * shorter patterns, by distributing a top-level alternation (optionally wrapped
 * in `(?i)`, `\b...\b`, `(?i:...)`, `(?:...)` or `(...)`) over separate chunks.
 *
 * The split is deliberately conservative:
 * - Patterns shorter than 350 characters are returned unchanged.
 * - On the outermost call, patterns that do not contain `(?i` are returned
 *   unchanged.
 * - A leading `(?i)` is re-applied to every chunk, because the isolated flag
 *   covers all alternatives that follow it, not only the first one.
 * - A `|` is only used as a split point when no capturing group precedes it,
 *   so that capture group numbers inside every chunk stay the same as in the
 *   unsplit pattern.
 * - Only plain capturing groups are handled by the `( ... )` wrapper; other
 *   `(?...)` constructs (lookarounds, named groups, ...) are never rewritten.
 *
 * @param pattern - The regex source to split.
 * @param checkOptimization - When `true` (the default), skip patterns that do
 *   not contain `(?i`. Recursive calls pass `false` because the enclosing
 *   pattern already passed this check.
 * @returns One or more pattern sources; `[pattern]` when no safe split exists.
 */
function splitPattern(pattern: string, checkOptimization = true): string[] {
  // A conservative limit for the regex pattern length.
  if (pattern.length < 350)
    return [pattern]

  // Optimization: only target patterns that are likely to hit the specific issue,
  // which is related to case-insensitivity (?i).
  if (checkOptimization && !pattern.includes('(?i'))
    return [pattern]

  // (?i) prefix: must be handled BEFORE looking for a top-level pipe, otherwise
  // `(?i)a|b` would be cut into `(?i)a` and a case-sensitive `b`.
  if (pattern.startsWith('(?i)'))
    return splitPattern(pattern.slice(4), false).map(chunk => `(?i)${chunk}`)

  const splitIndex = findTopLevelSplitIndex(pattern)
  if (splitIndex !== -1) {
    // Pass checkOptimization=false because we are inside a pattern that already passed the check
    return [
      ...splitPattern(pattern.slice(0, splitIndex), false),
      ...splitPattern(pattern.slice(splitIndex + 1), false),
    ]
  }

  // \b ... \b
  if (pattern.startsWith('\\b') && pattern.endsWith('\\b')) {
    const inner = pattern.slice(2, -2)
    if (isBalanced(inner)) {
      const chunks = splitPattern(inner, false)
      if (chunks.length > 1)
        return chunks.map(c => `\\b${c}\\b`)
    }
  }

  // (?i: ... ), (?: ... ) and plain ( ... ). Any other `(?` construct is left
  // alone: re-wrapping its pieces in `(`...`)` would change its meaning.
  let opener: string | undefined
  if (pattern.startsWith('(?i:'))
    opener = '(?i:'
  else if (pattern.startsWith('(?:'))
    opener = '(?:'
  else if (pattern.startsWith('(') && !pattern.startsWith('(?'))
    opener = '('

  if (opener !== undefined && pattern.endsWith(')') && isSingleGroup(pattern)) {
    const prefix = opener
    const chunks = splitPattern(pattern.slice(prefix.length, -1), false)
    if (chunks.length > 1)
      return chunks.map(c => `${prefix}${c})`)
  }

  return [pattern]
}

/**
 * Finds the top-level `|` closest to the middle of `pattern` that is safe to
 * split at, or `-1` when there is none.
 *
 * Escaped characters and everything inside `[...]` are ignored. Scanning stops
 * at the first capturing group (`(` not followed by `?`, or `(?<name>`), since
 * every later `|` would have a capturing group to its left and splitting there
 * would renumber the groups of the right-hand chunk.
 */
function findTopLevelSplitIndex(pattern: string): number {
  const target = pattern.length / 2
  let parenBalance = 0
  let bracketBalance = 0
  let splitIndex = -1
  let bestDist = Infinity

  for (let i = 0; i < pattern.length; i++) {
    const char = pattern[i]
    if (char === '\\') {
      // Skip the escaped character
      i++
      continue
    }
    if (char === '[') {
      bracketBalance++
    }
    else if (char === ']') {
      bracketBalance--
    }
    else if (bracketBalance !== 0) {
      // Inside a character class nothing else is structural
      continue
    }
    else if (char === '(') {
      const capturing = pattern[i + 1] !== '?'
        || (pattern[i + 2] === '<' && pattern[i + 3] !== '=' && pattern[i + 3] !== '!')
      if (capturing)
        break
      parenBalance++
    }
    else if (char === ')') {
      parenBalance--
    }
    else if (char === '|' && parenBalance === 0) {
      const dist = Math.abs(i - target)
      if (dist < bestDist) {
        bestDist = dist
        splitIndex = i
      }
    }
  }

  return splitIndex
}

/**
 * Counts the capturing groups in a regex source string.
 *
 * A `(` counts when it is outside a `[...]` character class, is not escaped,
 * and is either not followed by `?` or opens a `(?<name>` group (`(?<=` and
 * `(?<!` lookbehinds are excluded).
 *
 * @param pattern - The regex source to inspect.
 * @returns The number of capturing groups found.
 */
function countCapturingGroups(pattern: string): number {
  let total = 0
  let classDepth = 0
  let pos = 0

  while (pos < pattern.length) {
    switch (pattern[pos]) {
      case '\\':
        // Jump over the backslash and the character it escapes
        pos += 2
        continue
      case '[':
        classDepth++
        break
      case ']':
        classDepth--
        break
      case '(':
        if (classDepth === 0) {
          const isPlainGroup = pattern[pos + 1] !== '?'
          const afterAngle = pattern[pos + 3]
          const isNamedGroup = pattern[pos + 2] === '<' && afterAngle !== '=' && afterAngle !== '!'
          if (isPlainGroup || isNamedGroup)
            total++
        }
        break
    }
    pos++
  }

  return total
}

/**
 * Reports whether every `(`/`)` and `[`/`]` in `s` is paired up by the end of
 * the string.
 *
 * A character following a backslash is skipped, and parentheses are only
 * counted while the bracket counter is zero. Only the final counters are
 * checked, not the order in which the characters appear.
 *
 * @param s - The regex source fragment to check.
 * @returns `true` when both counters end at zero.
 */
function isBalanced(s: string): boolean {
  let openParens = 0
  let openClasses = 0
  let escaped = false

  for (const ch of s) {
    if (escaped) {
      // This character was escaped by the previous backslash
      escaped = false
      continue
    }
    if (ch === '\\') {
      escaped = true
      continue
    }
    if (ch === '[') {
      openClasses++
      continue
    }
    if (ch === ']') {
      openClasses--
      continue
    }
    if (openClasses !== 0)
      continue
    if (ch === '(')
      openParens++
    else if (ch === ')')
      openParens--
  }

  return openParens === 0 && openClasses === 0
}

/**
 * Reports whether `s` is exactly one parenthesised group, i.e. the `(` at the
 * start is closed only by the `)` at the very end.
 *
 * Escaped characters are skipped and parentheses are only counted while the
 * bracket counter is zero.
 *
 * @param s - The regex source fragment to check.
 * @returns `true` when `s` starts with `(`, ends with `)`, the parenthesis
 *   depth never returns to zero before the last character, and both counters
 *   end at zero.
 */
function isSingleGroup(s: string): boolean {
  if (!(s.startsWith('(') && s.endsWith(')')))
    return false

  const lastIndex = s.length - 1
  let depth = 0
  let classDepth = 0
  let pos = 0

  while (pos < s.length) {
    switch (s[pos]) {
      case '\\':
        // Jump over the backslash and the character it escapes
        pos += 2
        continue
      case '[':
        classDepth++
        break
      case ']':
        classDepth--
        break
      case '(':
        if (classDepth === 0)
          depth++
        break
      case ')':
        if (classDepth === 0) {
          depth--
          // The outer group closed before the end of the string
          if (depth === 0 && pos < lastIndex)
            return false
        }
        break
    }
    pos++
  }

  return depth === 0 && classDepth === 0
}
Loading