diff --git a/packages/engine-javascript/src/engine-compile.ts b/packages/engine-javascript/src/engine-compile.ts
index 42c213836..31d2b299e 100644
--- a/packages/engine-javascript/src/engine-compile.ts
+++ b/packages/engine-javascript/src/engine-compile.ts
@@ -29,6 +29,7 @@ export function defaultJavaScriptRegexConstructor(pattern: string, options?: ToR
   return toRegExp(
     pattern,
     {
+      target: 'ES2018',
       global: true,
       hasIndices: true,
       // This has no benefit for the standard JS engine, but it avoids a perf penalty for
diff --git a/packages/engine-javascript/src/scanner.ts b/packages/engine-javascript/src/scanner.ts
index ad5b58290..df052a900 100644
--- a/packages/engine-javascript/src/scanner.ts
+++ b/packages/engine-javascript/src/scanner.ts
@@ -28,7 +28,8 @@ export interface JavaScriptRegexScannerOptions {
 }
 
 export class JavaScriptScanner implements PatternScanner {
-  regexps: (RegExp | null)[]
+  regexps: (RegExp | null)[][]
+  patternGroupCounts: number[] = []
 
   constructor(
     public patterns: (string | RegExp)[],
@@ -46,30 +47,37 @@ export class JavaScriptScanner implements PatternScanner {
 
     this.regexps = patterns.map((p) => {
       if (typeof p !== 'string') {
-        return p
+        this.patternGroupCounts.push(0)
+        return [p]
       }
-      // Cache
-      const cached = cache?.get(p)
-      if (cached) {
-        if (cached instanceof RegExp) {
-          return cached
+
+      const groups = countCapturingGroups(p)
+      this.patternGroupCounts.push(groups)
+
+      const chunks = splitPattern(p)
+      return chunks.map((chunk) => {
+        // Cache
+        const cached = cache?.get(chunk)
+        if (cached) {
+          if (cached instanceof RegExp) {
+            return cached
+          }
+          if (forgiving)
+            return null
+          throw cached
         }
-        if (forgiving)
-          return null
-        throw cached
-      }
-      try {
-        const regex = regexConstructor(p)
-        cache?.set(p, regex)
-        return regex
-      }
-      catch (e) {
-        cache?.set(p, e as Error)
-        if (forgiving)
-          return null
-        // console.error({ ...e })
-        throw e
-      }
+        try {
+          const regex = regexConstructor(chunk)
+          cache?.set(chunk, regex)
+          return regex
+        }
+        catch (e) {
+          cache?.set(chunk, e as Error)
+          if (forgiving)
+            return null
+          throw e
+        }
+      })
     })
   }
 
@@ -79,48 +87,59 @@ export class JavaScriptScanner implements PatternScanner {
       : string.content
     const pending: [index: number, match: RegExpExecArray, offset: number][] = []
 
-    function toResult(index: number, match: RegExpExecArray, offset = 0): IOnigMatch {
-      return {
-        index,
-        captureIndices: match.indices!.map((indice) => {
-          if (indice == null) {
-            return {
-              start: MAX,
-              end: MAX,
-              length: 0,
-            }
-          }
+    function toResult(index: number, match: RegExpExecArray, expectedGroupCount: number, offset = 0): IOnigMatch {
+      const indices = match.indices! as ([number, number] | null)[]
+      // Pad indices to match expected group count
+      while (indices.length < expectedGroupCount + 1) {
+        indices.push(null)
+      }
+
+      const captureIndices = indices.map((indice) => {
+        if (indice == null) {
           return {
-            start: indice[0] + offset,
-            end: indice[1] + offset,
-            length: indice[1] - indice[0],
+            start: MAX,
+            end: MAX,
+            length: 0,
           }
-        }),
+        }
+        return {
+          start: indice[0] + offset,
+          end: indice[1] + offset,
+          length: indice[1] - indice[0],
+        }
+      })
+
+      return {
+        index,
+        captureIndices,
       }
     }
 
     for (let i = 0; i < this.regexps.length; i++) {
-      const regexp = this.regexps[i]
-      if (!regexp)
-        continue
-      try {
-        regexp.lastIndex = startPosition
-        const match = regexp.exec(str)
-
-        if (!match)
+      const regexpList = this.regexps[i]
+      for (let j = 0; j < regexpList.length; j++) {
+        const regexp = regexpList[j]
+        if (!regexp || !(regexp instanceof RegExp))
           continue
+        try {
+          regexp.lastIndex = startPosition
+          const match = regexp.exec(str)
+
+          if (!match)
+            continue
 
-        // If the match is at the start position, return it immediately
-        if (match.index === startPosition) {
-          return toResult(i, match, 0)
+          // If the match is at the start position, return it immediately
+          if (match.index === startPosition) {
+            return toResult(i, match, this.patternGroupCounts[i], 0)
+          }
+          // Otherwise, store it for later
+          pending.push([i, match, 0])
+        }
+        catch (e) {
+          if (this.options.forgiving)
+            continue
+          throw e
         }
-        // Otherwise, store it for later
-        pending.push([i, match, 0])
-      }
-      catch (e) {
-        if (this.options.forgiving)
-          continue
-        throw e
       }
     }
 
@@ -129,7 +148,7 @@ export class JavaScriptScanner implements PatternScanner {
       const minIndex = Math.min(...pending.map(m => m[1].index))
       for (const [i, match, offset] of pending) {
         if (match.index === minIndex) {
-          return toResult(i, match, offset)
+          return toResult(i, match, this.patternGroupCounts[i], offset)
         }
       }
     }
@@ -137,3 +156,267 @@ export class JavaScriptScanner implements PatternScanner {
     return null
   }
 }
+
+/**
+ * Break one very long, case-insensitive alternation into several shorter
+ * patterns that, tried in order, match the same input as the original.
+ *
+ * A split is only made where it cannot change the meaning of the pattern:
+ * - at a `|` outside every group and character class,
+ * - with no capturing group to its left (groups to the right would otherwise
+ *   be renumbered inside their own chunk), and
+ * - with no mid-pattern flag group such as `(?i)` to its left (the flag would
+ *   be dropped from the alternatives to the right).
+ *
+ * A leading `(?i)` and a single enclosing `\b...\b`, `(?i:...)`, `(?:...)` or
+ * `(...)` are peeled off, the inside is split, and the wrapper is restored
+ * around every chunk.
+ *
+ * @param pattern Pattern source to split.
+ * @param checkOptimization When true, patterns that do not contain `(?i` are
+ *   returned untouched. Recursive calls pass false because the outer pattern
+ *   already passed this check.
+ * @returns The chunks in alternative order, or `[pattern]` if nothing was split.
+ */
+function splitPattern(pattern: string, checkOptimization = true): string[] {
+  // A conservative limit for the regex pattern length.
+  if (pattern.length < 350)
+    return [pattern]
+
+  // Only target patterns that are likely to hit the specific issue, which is
+  // related to case-insensitive matching of very long patterns.
+  if (checkOptimization && !pattern.includes('(?i'))
+    return [pattern]
+
+  // A leading (?i) covers every alternative that follows it, so it has to be
+  // removed before looking for `|` and re-applied to each chunk. Splitting
+  // first would leave every chunk but the first case-sensitive.
+  if (pattern.startsWith('(?i)')) {
+    const chunks = splitPattern(pattern.slice(4), false)
+    return chunks.map(c => `(?i)${c}`)
+  }
+
+  let parenBalance = 0
+  let bracketBalance = 0
+  let splitIndex = -1
+  let bestDist = Infinity
+  // Becomes false once a `|` further right is no longer safe to split at.
+  let canSplit = true
+  let hasTopLevelPipe = false
+  const target = pattern.length / 2
+
+  // Scan for the splittable top-level pipe | closest to the middle
+  for (let i = 0; i < pattern.length; i++) {
+    const char = pattern[i]
+    if (char === '\\') {
+      i++
+      continue
+    }
+    if (char === '[') {
+      bracketBalance++
+    }
+    else if (char === ']') {
+      bracketBalance--
+    }
+    else if (bracketBalance !== 0) {
+      // Inside a character class ( ) | are literals
+      continue
+    }
+    else if (char === '(') {
+      if (isCapturingGroupStart(pattern, i) || (parenBalance === 0 && isIsolatedFlagGroup(pattern, i)))
+        canSplit = false
+      parenBalance++
+    }
+    else if (char === ')') {
+      parenBalance--
+    }
+    else if (char === '|' && parenBalance === 0) {
+      hasTopLevelPipe = true
+      if (!canSplit)
+        continue
+      const dist = Math.abs(i - target)
+      if (dist < bestDist) {
+        bestDist = dist
+        splitIndex = i
+      }
+    }
+  }
+
+  if (splitIndex !== -1) {
+    const left = pattern.slice(0, splitIndex)
+    const right = pattern.slice(splitIndex + 1)
+    // Pass checkOptimization=false because we are inside a pattern that already passed the check
+    return [...splitPattern(left, false), ...splitPattern(right, false)]
+  }
+
+  // The wrapper checks below treat the pattern as ONE alternative; that is
+  // not the case when a top-level `|` exists that could not be split at.
+  if (hasTopLevelPipe)
+    return [pattern]
+
+  // \b ... \b
+  if (pattern.startsWith('\\b') && pattern.endsWith('\\b')) {
+    const inner = pattern.slice(2, -2)
+    if (isBalanced(inner)) {
+      const chunks = splitPattern(inner, false)
+      if (chunks.length > 1) {
+        return chunks.map(c => `\\b${c}\\b`)
+      }
+    }
+  }
+
+  // (?i: ... )
+  if (pattern.startsWith('(?i:') && pattern.endsWith(')')) {
+    if (isSingleGroup(pattern)) {
+      const inner = pattern.slice(4, -1)
+      const chunks = splitPattern(inner, false)
+      if (chunks.length > 1) {
+        return chunks.map(c => `(?i:${c})`)
+      }
+    }
+  }
+
+  // (?: ... )
+  if (pattern.startsWith('(?:') && pattern.endsWith(')')) {
+    if (isSingleGroup(pattern)) {
+      const inner = pattern.slice(3, -1)
+      const chunks = splitPattern(inner, false)
+      if (chunks.length > 1) {
+        return chunks.map(c => `(?:${c})`)
+      }
+    }
+  }
+
+  // ( ... ) - plain capturing group only. Any other `(?...)` construct
+  // (lookaround, named group, atomic group, ...) must not be rewrapped as `(...)`.
+  if (pattern.startsWith('(') && !pattern.startsWith('(?') && pattern.endsWith(')')) {
+    if (isSingleGroup(pattern)) {
+      const inner = pattern.slice(1, -1)
+      const chunks = splitPattern(inner, false)
+      if (chunks.length > 1) {
+        return chunks.map(c => `(${c})`)
+      }
+    }
+  }
+
+  return [pattern]
+}
+
+/**
+ * Tell whether the `(` at `index` opens a group that receives a capture
+ * number: a plain `(...)` or a named `(?<name>...)` / `(?'name'...)`. The
+ * lookbehinds `(?<=...)` and `(?<!...)` share the `(?<` prefix but do not capture.
+ */
+function isCapturingGroupStart(pattern: string, index: number): boolean {
+  if (pattern[index + 1] !== '?')
+    return true
+  const kind = pattern[index + 2]
+  if (kind === '<')
+    return pattern[index + 3] !== '=' && pattern[index + 3] !== '!'
+  return kind === '\''
+}
+
+/** Characters that may appear between `(?` and `)` in a bare flag group. */
+const FLAG_GROUP_CHAR = /[a-z-]/i
+
+/**
+ * Tell whether the `(` at `index` opens a bare flag group such as `(?i)`,
+ * `(?im)` or `(?-i)`, i.e. flags followed directly by `)` rather than `:`.
+ */
+function isIsolatedFlagGroup(pattern: string, index: number): boolean {
+  if (pattern[index + 1] !== '?')
+    return false
+  let end = index + 2
+  while (end < pattern.length && FLAG_GROUP_CHAR.test(pattern[end]))
+    end++
+  return pattern[end] === ')'
+}
+
+/**
+ * Count the `(` in `pattern` that open a capturing group, as decided by
+ * `isCapturingGroupStart`. Escaped characters and parentheses inside
+ * character classes are skipped.
+ */
+function countCapturingGroups(pattern: string): number {
+  let groups = 0
+  let bracketBalance = 0
+  for (let i = 0; i < pattern.length; i++) {
+    const char = pattern[i]
+    if (char === '\\') {
+      i++
+      continue
+    }
+    if (char === '[')
+      bracketBalance++
+    else if (char === ']')
+      bracketBalance--
+    else if (char === '(' && bracketBalance === 0 && isCapturingGroupStart(pattern, i))
+      groups++
+  }
+  return groups
+}
+
+/**
+ * Tell whether every `(` and `[` in `s` has a matching closer, ignoring
+ * escaped characters and parentheses inside character classes.
+ */
+function isBalanced(s: string): boolean {
+  let parenBalance = 0
+  let bracketBalance = 0
+  for (let i = 0; i < s.length; i++) {
+    const char = s[i]
+    if (char === '\\') {
+      i++
+      continue
+    }
+    if (char === '[')
+      bracketBalance++
+    else if (char === ']')
+      bracketBalance--
+    else if (char === '(' && bracketBalance === 0)
+      parenBalance++
+    else if (char === ')' && bracketBalance === 0)
+      parenBalance--
+  }
+  return parenBalance === 0 && bracketBalance === 0
+}
+
+/**
+ * Tell whether `s` is exactly one parenthesised group: it starts with `(`
+ * and that opening parenthesis is closed by the final character of `s`.
+ */
+function isSingleGroup(s: string): boolean {
+  if (!s.startsWith('(')) {
+    return false
+  }
+  if (!s.endsWith(')')) {
+    return false
+  }
+
+  let parenBalance = 0
+  let bracketBalance = 0
+  // We expect the first ( to be closed ONLY at the very end.
+  for (let i = 0; i < s.length; i++) {
+    const char = s[i]
+    if (char === '\\') {
+      i++
+      continue
+    }
+    if (char === '[') {
+      bracketBalance++
+    }
+    else if (char === ']') {
+      bracketBalance--
+    }
+    else if (char === '(' && bracketBalance === 0) {
+      parenBalance++
+    }
+    else if (char === ')' && bracketBalance === 0) {
+      parenBalance--
+      if (parenBalance === 0 && i < s.length - 1) {
+        return false // Closed before end
+      }
+    }
+  }
+  return parenBalance === 0 && bracketBalance === 0
+}
diff --git a/packages/engine-javascript/test/coverage.test.ts b/packages/engine-javascript/test/coverage.test.ts
new file mode 100644
index 000000000..0bfd0dd21
--- /dev/null
+++ b/packages/engine-javascript/test/coverage.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it } from 'vitest'
+import { defaultJavaScriptRegexConstructor } from '../src/engine-compile'
+import { JavaScriptScanner } from '../src/scanner'
+
+describe('coverage', () => {
+  it('handles nested word boundaries in splitPattern', () => {
+    const inner = `${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const pattern = `\\b(?i:${inner})\\b`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBeGreaterThan(1)
+  })
+
+  it('handles (?i) prefix splitting', () => {
+    const inner = `${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const pattern = `(?i)${inner}`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBeGreaterThan(1)
+  })
+
+  it('keeps (?i) on every chunk after a top-level split', () => {
+    const pattern = `(?i)${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    const [first, second] = scanner.regexps[0]
+    expect(first?.test('A'.repeat(200))).toBe(true)
+    expect(second?.test('B'.repeat(200))).toBe(true)
+  })
+
+  it('handles (?i:...) group splitting', () => {
+    const inner = `${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const pattern = `(?i:${inner})`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBeGreaterThan(1)
+  })
+
+  it('handles (?:...) non-capturing group splitting', () => {
+    // Must include (?i) to trigger split
+    const inner = `(?i)${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const pattern = `(?:${inner})`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBeGreaterThan(1)
+  })
+
+  it('handles (...) capturing group splitting', () => {
+    const inner = `(?i)${'a'.repeat(200)}|${'b'.repeat(200)}`
+    const pattern = `(${inner})`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBeGreaterThan(1)
+  })
+
+  it('does not split where capture groups would be renumbered', () => {
+    const pattern = `(?i)(${'a'.repeat(200)})|(${'b'.repeat(200)})`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBe(1)
+  })
+
+  it('does not rewrap lookarounds as capturing groups', () => {
+    const pattern = `(?=(?i)${'a'.repeat(200)}|${'b'.repeat(200)})`
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBe(1)
+  })
+
+  it('isSingleGroup returns false for unbalanced or multi-part', () => {
+    const part1 = `(${'a'.repeat(180)})`
+    const part2 = `(${'b'.repeat(180)})`
+    const pattern = part1 + part2
+    const scanner = new JavaScriptScanner([pattern], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0].length).toBe(1)
+  })
+
+  it('handles forgiving mode errors', () => {
+    const scanner = new JavaScriptScanner(['('], {
+      regexConstructor: defaultJavaScriptRegexConstructor,
+      forgiving: true,
+    })
+    expect(scanner.regexps[0][0]).toBeNull()
+  })
+
+  it('handles non-string patterns', () => {
+    // @ts-expect-error testing invalid input
+    const scanner = new JavaScriptScanner([123], { regexConstructor: defaultJavaScriptRegexConstructor })
+    expect(scanner.regexps[0][0]).toBe(123)
+    expect(scanner.patternGroupCounts[0]).toBe(0)
+  })
+
+  it('handles regex constructor errors', () => {
+    expect(() => {
+      const _ = new JavaScriptScanner(['('], { regexConstructor: defaultJavaScriptRegexConstructor })
+    }).toThrow()
+
+    const scanner = new JavaScriptScanner(['('], {
+      regexConstructor: defaultJavaScriptRegexConstructor,
+      forgiving: true,
+    })
+    expect(scanner.regexps[0][0]).toBeNull()
+  })
+
+  it('handles cached errors', () => {
+    const cache = new Map()
+    const pattern = '('
+    cache.set(pattern, new Error('Cached error'))
+
+    expect(() => {
+      const _ = new JavaScriptScanner([pattern], {
+        regexConstructor: defaultJavaScriptRegexConstructor,
+        cache,
+      })
+    }).toThrow('Cached error')
+
+    const scanner = new JavaScriptScanner([pattern], {
+      regexConstructor: defaultJavaScriptRegexConstructor,
+      cache,
+      forgiving: true,
+    })
+    expect(scanner.regexps[0][0]).toBeNull()
+  })
+
+  it('handles cached regex', () => {
+    const cache = new Map()
+    const pattern = 'abc'
+    const regex = /abc/
+    cache.set(pattern, regex)
+
+    const scanner = new JavaScriptScanner([pattern], {
+      regexConstructor: defaultJavaScriptRegexConstructor,
+      cache,
+    })
+    expect(scanner.regexps[0][0]).toBeInstanceOf(RegExp)
+    expect((scanner.regexps[0][0] as RegExp).source).toBe('abc')
+  })
+})