Skip to content
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### New Features

- rules: pre-filter string rules whose Substring/Regex patterns are absent from the binary file, reducing redundant regex evaluation during per-function matching #2126

### Breaking Changes

### New Rules (0)
Expand Down
20 changes: 20 additions & 0 deletions capa/capabilities/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.features.common import String
from capa.capabilities.common import Capabilities, find_file_capabilities
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

Expand Down Expand Up @@ -163,7 +164,23 @@ def find_static_capabilities(
library_functions_list: list[rdoc.LibraryFunction] = []

assert isinstance(extractor, StaticFeatureExtractor)

functions: list[FunctionHandle] = list(extractor.get_functions())

# Pre-filter string rules based on strings found in the binary.
# For each rule whose required Substring/Regex patterns are provably absent
# from the binary's file-level strings, mark it as skippable in _match().
# This replaces repeated Regex.evaluate() calls (once per function × per rule)
# with a single file-level scan. See: https://github.com/mandiant/capa/issues/2126
#
# The upfront scan cost is O(|string_rules| × |file_strings|). For small
# binaries this overhead can exceed the savings, so we only activate the
# pre-filter when there are enough functions to justify it.
if len(functions) >= 10:
file_strings: frozenset[str] = frozenset(
feature.value for feature, _ in extractor.extract_file_features() if isinstance(feature, String)
)
ruleset.prepare_for_file(file_strings)
n_funcs: int = len(functions)
n_libs: int = 0
percentage: float = 0
Expand Down Expand Up @@ -239,6 +256,9 @@ def find_static_capabilities(
functions=tuple(function_feature_counts),
)

# Clear the string pre-filter so the ruleset is clean for potential reuse.
ruleset.prepare_for_file(frozenset())

matches: MatchResults = dict(
itertools.chain(
# each rule exists in exactly one scope,
Expand Down
103 changes: 103 additions & 0 deletions capa/rules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1463,6 +1463,11 @@ def __init__(
scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes
}

# Set of string-rule names whose required patterns are absent from the current binary.
# Populated by prepare_for_file(); empty means no pre-filtering is active.
# See: https://github.com/mandiant/capa/issues/2126
self._impossible_string_rule_names: set[str] = set()

@property
def file_rules(self):
return self.rules_by_scope[Scope.FILE]
Expand Down Expand Up @@ -1948,6 +1953,92 @@ def _sort_rules_by_index(rule_index_by_rule_name: dict[str, int], rules: list[Ru
"""
rules.sort(key=lambda r: rule_index_by_rule_name[r.name])

def prepare_for_file(self, file_strings: frozenset[str]) -> None:
    """
    Pre-filter string rules using the strings extracted from the binary file.

    Any rule whose required Substring/Regex patterns are provably absent from
    file_strings is recorded so that subsequent _match() calls can skip it,
    avoiding repeated Substring.evaluate() / Regex.evaluate() work for
    patterns that cannot possibly match.

    Call this before per-function analysis of a binary; call it again with an
    empty frozenset to clear the filter between binaries.

    See: https://github.com/mandiant/capa/issues/2126

    Performance note: the scan avoids per-string iteration by searching two
    concatenated forms, so each rule costs O(1) regex/`in` calls:

      - a \x01-joined concatenation for Substring patterns: a literal pattern
        cannot straddle a \x01 separator (rule patterns are printable ASCII;
        \x01 is assumed never to appear in extracted strings), so one C-level
        `in` check replaces N per-string comparisons;
      - a \n-joined concatenation for Regex patterns, searched with
        re.MULTILINE added so ^ / $ bind at line boundaries (otherwise
        re.search("^foo", "bar\nfoo") would miss). A per-string confirmation
        pass runs only when the concatenated scan hits, ruling out DOTALL
        false positives where `.*` spans a \n between adjacent strings.
    """
    if not file_strings:
        # No strings to reason about: disable the filter entirely.
        self._impossible_string_rule_names = set()
        return

    concat_substr: str = "\x01".join(file_strings)
    concat_regex: str = "\n".join(file_strings)

    def can_possibly_match(wanted) -> bool:
        # True unless *every* wanted pattern is provably absent from the file.
        for feat in wanted:
            if isinstance(feat, capa.features.common.Substring):
                if feat.value in concat_substr:
                    return True
            elif isinstance(feat, capa.features.common.Regex):
                # Re-compile with MULTILINE so ^ / $ respect \n boundaries.
                # re.compile() caches internally, so recompilation is paid
                # only on the first call for a given (pattern, flags) pair.
                ml_re = re.compile(feat.re.pattern, feat.re.flags | re.MULTILINE)
                # Concat hit must be confirmed per-string: a DOTALL `.*`
                # could bridge two adjacent strings across the \n separator.
                if ml_re.search(concat_regex) and any(feat.re.search(s) for s in file_strings):
                    return True
            else:
                # Unknown feature type: conservatively keep the rule.
                return True
        return False

    impossible: set[str] = set()
    seen_rule_names: set[str] = set()

    # The same rule may be indexed under several scopes; evaluate each rule
    # name exactly once, since the verdict depends only on file_strings.
    for feature_index in self._feature_indexes_by_scopes.values():
        for rule_name, wanted_strings in feature_index.string_rules.items():
            if rule_name in seen_rule_names:
                continue
            seen_rule_names.add(rule_name)
            if not can_possibly_match(wanted_strings):
                impossible.add(rule_name)

    if impossible:
        logger.debug(
            "pre-filter: %d/%d string rules skipped (patterns absent from binary)",
            len(impossible),
            len(seen_rule_names),
        )

    self._impossible_string_rule_names = impossible

def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[FeatureSet, ceng.MatchResults]:
"""
Match rules from this ruleset at the given scope against the given features.
Expand Down Expand Up @@ -2026,7 +2117,19 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea
string_features[feature] = locations

if string_features:
# Some extractors may synthesize stack strings that do not exist as contiguous
# file bytes. In that case, avoid file-level pre-filtering for this scope.
has_stack_string_characteristic = any(
isinstance(feature, capa.features.common.Characteristic) and feature.value == "stack string"
for feature in features
)
for rule_name, wanted_strings in feature_index.string_rules.items():
# Skip rules whose patterns are provably absent from the binary.
# prepare_for_file() pre-checks all file strings once and populates
# _impossible_string_rule_names to avoid repeated Regex.evaluate() work.
# See: https://github.com/mandiant/capa/issues/2126
if not has_stack_string_characteristic and rule_name in self._impossible_string_rule_names:
continue
for wanted_string in wanted_strings:
if wanted_string.evaluate(string_features):
candidate_rule_names.add(rule_name)
Expand Down
Loading
Loading