diff --git a/grok.go b/grok.go index e593222..cc29126 100644 --- a/grok.go +++ b/grok.go @@ -128,7 +128,10 @@ func (h Host) compileExternal(expr string) (*PatternLegacy, error) { if err != nil { return nil, err } - return &PatternLegacy{Regexp: r}, nil + return &PatternLegacy{ + Regexp: r, + requiredLiterals: extractRequiredLiterals(expr), + }, nil } // split spl := patternRegexp.Split(expr, -1) @@ -172,6 +175,7 @@ func (h Host) compileExternal(expr string) (*PatternLegacy, error) { } p := &PatternLegacy{Regexp: r} p.s = msi + p.requiredLiterals = extractRequiredLiterals(res) return p, nil } @@ -195,7 +199,10 @@ func (h Host) compileExternalRe2(expr string) (*PatternRe2, error) { if err != nil { return nil, err } - return &PatternRe2{Regexp: r}, nil + return &PatternRe2{ + Regexp: r, + requiredLiterals: extractRequiredLiterals(expr), + }, nil } // split spl := patternRegexp.Split(expr, -1) @@ -239,6 +246,7 @@ func (h Host) compileExternalRe2(expr string) (*PatternRe2, error) { } p := &PatternRe2{Regexp: r} p.s = msi + p.requiredLiterals = extractRequiredLiterals(res) return p, nil } diff --git a/literals.go b/literals.go new file mode 100644 index 0000000..0bc76e3 --- /dev/null +++ b/literals.go @@ -0,0 +1,143 @@ +package grokky + +import ( + "regexp/syntax" + "strings" +) + +// minLiteralLength is the minimum length of a literal to be considered for fast-reject. +// Shorter literals have higher false-positive rates, diminishing their value as pre-filters. +const minLiteralLength = 3 + +// extractRequiredLiterals parses a regex pattern string and extracts literal substrings +// that MUST appear in any matching input. This enables fast rejection of non-matching +// inputs with cheap strings.Contains() checks before running the full regex engine. +// +// The function walks the regexp/syntax AST and collects literals from OpConcat (sequence) +// nodes. Literals inside OpAlternate (alternation) nodes are excluded since only one +// branch needs to match. Literals inside optional repetitions (*, ?) are excluded since +// they may not appear in a valid match. +func extractRequiredLiterals(pattern string) []string { + re, err := syntax.Parse(pattern, syntax.Perl) + if err != nil { + return nil + } + re = re.Simplify() + + literals := collectLiterals(re) + + // Deduplicate and filter short literals + seen := make(map[string]struct{}) + var result []string + for _, lit := range literals { + if len(lit) < minLiteralLength { + continue + } + if _, ok := seen[lit]; ok { + continue + } + seen[lit] = struct{}{} + result = append(result, lit) + } + + // Remove literals that are substrings of other literals in the set + // (checking the longer one is sufficient) + result = removeRedundantLiterals(result) + + return result +} + +// collectLiterals recursively walks the regex AST and returns literal strings +// that must appear in any matching input. +func collectLiterals(re *syntax.Regexp) []string { + switch re.Op { + case syntax.OpLiteral: + lit := string(re.Rune) + // Case-insensitive literals are not reliable for Contains() checks + if re.Flags&syntax.FoldCase != 0 { + return nil + } + return []string{lit} + + case syntax.OpConcat: + // In a concatenation, ALL children must match. Collect literals from each. + // Also merge adjacent literals into longer strings. + var allLiterals []string + var adjacentLiteral strings.Builder + + for _, sub := range re.Sub { + if sub.Op == syntax.OpLiteral && sub.Flags&syntax.FoldCase == 0 { + adjacentLiteral.WriteString(string(sub.Rune)) + } else { + if adjacentLiteral.Len() > 0 { + allLiterals = append(allLiterals, adjacentLiteral.String()) + adjacentLiteral.Reset() + } + allLiterals = append(allLiterals, collectLiterals(sub)...) + } + } + if adjacentLiteral.Len() > 0 { + allLiterals = append(allLiterals, adjacentLiteral.String()) + } + return allLiterals + + case syntax.OpCapture: + // A capture group is transparent — recurse into the child. + if len(re.Sub) > 0 { + return collectLiterals(re.Sub[0]) + } + return nil + + case syntax.OpRepeat: + // Only recurse if the minimum repeat count is >= 1 (the literal must appear) + if re.Min >= 1 && len(re.Sub) > 0 { + return collectLiterals(re.Sub[0]) + } + return nil + + case syntax.OpPlus: + // x+ requires at least one occurrence + if len(re.Sub) > 0 { + return collectLiterals(re.Sub[0]) + } + return nil + + case syntax.OpQuest, syntax.OpStar: + // x? and x* don't require the literal to appear + return nil + + case syntax.OpAlternate: + // In an alternation (a|b|c), we cannot require any single branch's literals + // because only one branch needs to match. + return nil + + default: + // OpAnyChar, OpAnyCharNotNL, OpCharClass, OpEmptyMatch, + // OpBeginLine, OpEndLine, OpBeginText, OpEndText, + // OpWordBoundary, OpNoWordBoundary, OpNoMatch + return nil + } +} + +// removeRedundantLiterals removes literals that are substrings of other literals in the set. +// If "Failed password" and "password" are both required, checking "Failed password" is sufficient. +func removeRedundantLiterals(literals []string) []string { + if len(literals) <= 1 { + return literals + } + + var result []string + for i, a := range literals { + redundant := false + for j, b := range literals { + if i != j && len(b) > len(a) && strings.Contains(b, a) { + redundant = true + break + } + } + if !redundant { + result = append(result, a) + } + } + return result +} diff --git a/literals_test.go b/literals_test.go new file mode 100644 index 0000000..4c59683 --- /dev/null +++ b/literals_test.go @@ -0,0 +1,334 @@ +package grokky + +import ( + "sort" + "testing" +) + +func TestExtractRequiredLiterals(t *testing.T) { + tests := []struct { + name string + pattern string + expected []string + }{ + { + name: "simple literal", + pattern: `Failed password`, + expected: []string{"Failed password"}, + }, + { + name: "literal with regex parts", + pattern: `Failed \w+ for \w+ from \d+\.\d+\.\d+\.\d+`, + expected: []string{"Failed ", " for ", " from "}, + }, + { + name: "alternation excludes branch literals", + pattern: `(?:GET|POST|PUT) /api/`, + expected: []string{" /api/"}, + }, + { + name: "no literals (pure regex)", + pattern: `\d+\.\d+\.\d+`, + expected: nil, + }, + { + name: "short literals filtered out", + pattern: `a\d+b\d+c`, + expected: nil, // all literals < 3 chars + }, + { + name: "capture group transparency", + pattern: `Failed (\w+) for (\w+)`, + expected: []string{"Failed ", " for "}, + }, + { + name: "optional parts excluded", + pattern: `Error(?:\s+fatal)?\s+in module`, + expected: []string{"Error", "in module"}, + }, + { + name: "star repetition excluded", + pattern: `prefix\w*suffix`, + expected: []string{"prefix", "suffix"}, + }, + { + name: "plus repetition with literal", + pattern: `begin\w+end`, + expected: []string{"begin", "end"}, + }, + { + name: "redundant literals removed", + pattern: `Failed password for invalid user`, + expected: []string{"Failed password for invalid user"}, + }, + { + name: "case insensitive literal skipped", + pattern: `(?i)failed`, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := extractRequiredLiterals(tt.pattern) + + // Sort both for stable comparison + sort.Strings(got) + sort.Strings(tt.expected) + + if len(got) != len(tt.expected) { + t.Errorf("extractRequiredLiterals(%q) = %v (len %d), want %v (len %d)", + tt.pattern, got, len(got), tt.expected, len(tt.expected)) + return + } + for i := range got { + if got[i] != tt.expected[i] { + t.Errorf("extractRequiredLiterals(%q)[%d] = %q, want %q", + tt.pattern, i, got[i], tt.expected[i]) + } + } + }) + } +} + +func TestFastReject_Parse(t *testing.T) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + t.Fatal(err) + } + + // Should match + result := p.Parse("Failed password for root from 192.168.1.1") + if len(result) == 0 { + t.Error("expected match, got empty result") + } + if result["method"] != "password" { + t.Errorf("method = %q, want %q", result["method"], "password") + } + if result["user"] != "root" { + t.Errorf("user = %q, want %q", result["user"], "root") + } + + // Should be fast-rejected (no "Failed" literal) + result = p.Parse("Accepted password for root from 192.168.1.1") + if len(result) != 0 { + t.Error("expected fast-reject (no match), got:", result) + } + + // Should be fast-rejected (no "from" literal) + result = p.Parse("Failed password for root via 192.168.1.1") + if len(result) != 0 { + t.Error("expected no match, got:", result) + } +} + +func TestFastReject_ParseInto(t *testing.T) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + t.Fatal(err) + } + + // Should match + dest := make(map[string]string) + ok := p.ParseInto("Failed password for root from 192.168.1.1", dest) + if !ok { + t.Error("expected match, got false") + } + if dest["method"] != "password" { + t.Errorf("method = %q, want %q", dest["method"], "password") + } + if dest["user"] != "root" { + t.Errorf("user = %q, want %q", dest["user"], "root") + } + if dest["ip"] != "192.168.1.1" { + t.Errorf("ip = %q, want %q", dest["ip"], "192.168.1.1") + } + + // Should be fast-rejected + dest2 := make(map[string]string) + ok = p.ParseInto("Accepted password for root from 192.168.1.1", dest2) + if ok { + t.Error("expected fast-reject, got true") + } + if len(dest2) != 0 { + t.Error("expected empty dest on fast-reject, got:", dest2) + } +} + +func TestParseInto_equivalence(t *testing.T) { + // Verify ParseInto produces identical results to Parse + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("NS", `[^\s]+`) + h.Must("NQ", `[^"]+`) + h.Must("NLB", `[^\]]+`) + h.Must("A", `.*`) + h.Must("NSS", `[^\s]*`) + h.Must("nginx", `%{NS:clientip}\s%{NSS:ident}\s%{NSS:auth}`+ + `\s\[`+ + `%{NLB:timestamp}\]\s\"`+ + `%{NS:verb}\s`+ + `%{NSS:request}\s`+ + `HTTP/%{NS:httpversion}\"\s`+ + `%{NS:response}\s`+ + `%{NS:bytes}\s\"`+ + `%{NQ:referrer}\"\s\"`+ + `%{NQ:agent}\"`+ + `%{A:blob}`) + + p, err := h.Get("nginx") + if err != nil { + t.Fatal(err) + } + + input := `66.249.65.159 - - [06/Nov/2014:19:10:38 +0600] ` + + `"GET /news/53f8d72920ba2744fe873ebc.html HTTP/1.1" ` + + `404 177 "-" ` + + `"Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) ` + + `AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 ` + + `Mobile/10A5376e Safari/8536.25"` + + parseResult := p.Parse(input) + dest := make(map[string]string) + ok := p.ParseInto(input, dest) + + if !ok { + t.Fatal("ParseInto returned false for matching input") + } + + if len(parseResult) != len(dest) { + t.Errorf("Parse returned %d entries, ParseInto wrote %d entries", + len(parseResult), len(dest)) + } + + for k, v := range parseResult { + if dest[k] != v { + t.Errorf("key %q: Parse=%q, ParseInto=%q", k, v, dest[k]) + } + } +} + +func TestFastReject_noFalseNegatives(t *testing.T) { + // Ensure the fast-reject never produces false negatives (rejecting a valid match) + h := NewBase() + h.Must("SYSLOGBASE2", `(?:%{SYSLOGTIMESTAMP:timestamp}|%{TIMESTAMP_ISO8601:timestamp8601}) (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:`) + + p, err := h.Get("SYSLOGBASE2") + if err != nil { + t.Fatal(err) + } + + inputs := []string{ + `Jan 14 06:35:01 hostname CRON[12345]: pam_unix(cron:session): session opened`, + `2024-01-14T06:35:01+00:00 hostname sshd[1234]: Accepted publickey`, + } + + for _, input := range inputs { + result := p.Parse(input) + if len(result) == 0 { + t.Errorf("false negative on %q: expected match, got none", input) + } + } +} + +func BenchmarkFastReject_miss(b *testing.B) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("NUMBER", `\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip} port %{NUMBER:port}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + b.Fatal(err) + } + + // Input that does NOT match — should be fast-rejected by literal check + nonMatchingInput := "Accepted publickey for admin from 10.0.0.1 port 22 ssh2" + + b.ResetTimer() + for i := 0; i < b.N; i++ { + globalMap = p.Parse(nonMatchingInput) + } + b.ReportAllocs() +} + +func BenchmarkFastReject_hit(b *testing.B) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("NUMBER", `\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip} port %{NUMBER:port}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + b.Fatal(err) + } + + // Input that DOES match + matchingInput := "Failed password for root from 192.168.1.1 port 22" + + b.ResetTimer() + for i := 0; i < b.N; i++ { + globalMap = p.Parse(matchingInput) + } + b.ReportAllocs() +} + +func BenchmarkParseInto_hit(b *testing.B) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("NUMBER", `\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip} port %{NUMBER:port}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + b.Fatal(err) + } + + matchingInput := "Failed password for root from 192.168.1.1 port 22" + dest := make(map[string]string) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Clear dest between iterations + for k := range dest { + delete(dest, k) + } + p.ParseInto(matchingInput, dest) + } + b.ReportAllocs() +} + +func BenchmarkParseInto_miss(b *testing.B) { + h := New() + h.Must("WORD", `\b\w+\b`) + h.Must("IP", `\d+\.\d+\.\d+\.\d+`) + h.Must("NUMBER", `\d+`) + h.Must("SSHFAIL", `Failed %{WORD:method} for %{WORD:user} from %{IP:ip} port %{NUMBER:port}`) + + p, err := h.Get("SSHFAIL") + if err != nil { + b.Fatal(err) + } + + nonMatchingInput := "Accepted publickey for admin from 10.0.0.1 port 22 ssh2" + dest := make(map[string]string) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + p.ParseInto(nonMatchingInput, dest) + } + b.ReportAllocs() +} diff --git a/pattern.go b/pattern.go index c17503f..a03594d 100644 --- a/pattern.go +++ b/pattern.go @@ -5,5 +5,8 @@ type Pattern interface { String() string Names() []string Parse(input string) map[string]string + // ParseInto writes matched captures directly into dest, avoiding an intermediate + // map allocation. Returns true if the pattern matched and captures were written. + ParseInto(input string, dest map[string]string) bool NumSubexp() int } diff --git a/pattern_legacy.go b/pattern_legacy.go index 3382e78..905f6ab 100644 --- a/pattern_legacy.go +++ b/pattern_legacy.go @@ -1,16 +1,35 @@ package grokky -import "regexp" +import ( + "regexp" + "strings" +) -// Pattern is a pattern. -// Feel free to use the Pattern as regexp.Regexp. +// PatternLegacy is a compiled grok pattern using Go's standard regexp engine. type PatternLegacy struct { *regexp.Regexp - s map[string]int + s map[string]int + requiredLiterals []string // literals that must appear in any matching input +} + +// canMatch performs cheap pre-checks to determine if the input could possibly match. +// Returns false if the input definitely cannot match (fast rejection). +func (p *PatternLegacy) canMatch(input string) bool { + for _, lit := range p.requiredLiterals { + if !strings.Contains(input, lit) { + return false + } + } + return true } // Parse returns map (name->match) on input. The map can be empty. func (p *PatternLegacy) Parse(input string) map[string]string { + // Fast-reject: check required literals before running the regex engine + if !p.canMatch(input) { + return make(map[string]string) + } + ss := p.FindStringSubmatch(input) r := make(map[string]string) if len(ss) <= 1 { @@ -22,6 +41,24 @@ func (p *PatternLegacy) Parse(input string) map[string]string { return r } +// ParseInto writes matched captures directly into the provided dest map, +// avoiding the intermediate map allocation of Parse(). Returns true if the +// pattern matched and captures were written. +func (p *PatternLegacy) ParseInto(input string, dest map[string]string) bool { + if !p.canMatch(input) { + return false + } + + ss := p.FindStringSubmatch(input) + if len(ss) <= 1 { + return false + } + for sem, order := range p.s { + dest[sem] = ss[order] + } + return true +} + // Names returns all names that this pattern has func (p *PatternLegacy) Names() (ss []string) { ss = make([]string, 0, len(p.s)) diff --git a/pattern_re2.go b/pattern_re2.go index 304776e..78a968e 100644 --- a/pattern_re2.go +++ b/pattern_re2.go @@ -1,18 +1,36 @@ package grokky import ( + "strings" + "github.com/wasilibs/go-re2" ) -// Pattern is a pattern. -// Feel free to use the Pattern as regexp.Regexp. +// PatternRe2 is a compiled grok pattern using the RE2 regexp engine. type PatternRe2 struct { *re2.Regexp - s map[string]int + s map[string]int + requiredLiterals []string // literals that must appear in any matching input +} + +// canMatch performs cheap pre-checks to determine if the input could possibly match. +// Returns false if the input definitely cannot match (fast rejection). +func (p *PatternRe2) canMatch(input string) bool { + for _, lit := range p.requiredLiterals { + if !strings.Contains(input, lit) { + return false + } + } + return true } // Parse returns map (name->match) on input. The map can be empty. func (p *PatternRe2) Parse(input string) map[string]string { + // Fast-reject: check required literals before running the regex engine + if !p.canMatch(input) { + return make(map[string]string) + } + ss := p.FindStringSubmatch(input) r := make(map[string]string) if len(ss) <= 1 { @@ -24,6 +42,24 @@ func (p *PatternRe2) Parse(input string) map[string]string { return r } +// ParseInto writes matched captures directly into the provided dest map, +// avoiding the intermediate map allocation of Parse(). Returns true if the +// pattern matched and captures were written. +func (p *PatternRe2) ParseInto(input string, dest map[string]string) bool { + if !p.canMatch(input) { + return false + } + + ss := p.FindStringSubmatch(input) + if len(ss) <= 1 { + return false + } + for sem, order := range p.s { + dest[sem] = ss[order] + } + return true +} + // Names returns all names that this pattern has func (p *PatternRe2) Names() (ss []string) { ss = make([]string, 0, len(p.s))