Skip to content
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### New Features

- rules: pre-filter string rules whose Substring/Regex patterns are absent from the binary file, reducing redundant regex evaluation during per-function matching #2126

### Breaking Changes

### New Rules (0)
Expand Down
15 changes: 15 additions & 0 deletions capa/capabilities/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.features.common import String
from capa.capabilities.common import Capabilities, find_file_capabilities
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

Expand Down Expand Up @@ -163,6 +164,17 @@ def find_static_capabilities(
library_functions_list: list[rdoc.LibraryFunction] = []

assert isinstance(extractor, StaticFeatureExtractor)

# Pre-filter string rules based on strings found in the binary.
# Collect all string values from the file's feature set and inform the ruleset
# so that rules whose required patterns are provably absent are skipped during
# per-function matching. This avoids repeated Regex.evaluate() calls that can
# never succeed. See: https://github.com/mandiant/capa/issues/2126
file_strings: frozenset[str] = frozenset(
feature.value for feature, _ in extractor.extract_file_features() if isinstance(feature, String)
)
ruleset.prepare_for_file(file_strings)

functions: list[FunctionHandle] = list(extractor.get_functions())
n_funcs: int = len(functions)
n_libs: int = 0
Expand Down Expand Up @@ -239,6 +251,9 @@ def find_static_capabilities(
functions=tuple(function_feature_counts),
)

# Clear the string pre-filter so the ruleset is clean for potential reuse.
ruleset.prepare_for_file(frozenset())

matches: MatchResults = dict(
itertools.chain(
# each rule exists in exactly one scope,
Expand Down
61 changes: 61 additions & 0 deletions capa/rules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1463,6 +1463,11 @@ def __init__(
scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes
}

# Set of string-rule names whose required patterns are absent from the current binary.
# Populated by prepare_for_file(); empty means no pre-filtering is active.
# See: https://github.com/mandiant/capa/issues/2126
self._impossible_string_rule_names: set[str] = set()

@property
def file_rules(self):
return self.rules_by_scope[Scope.FILE]
Expand Down Expand Up @@ -1948,6 +1953,56 @@ def _sort_rules_by_index(rule_index_by_rule_name: dict[str, int], rules: list[Ru
"""
rules.sort(key=lambda r: rule_index_by_rule_name[r.name])

def prepare_for_file(self, file_strings: frozenset[str]) -> None:
    """
    Pre-filter string rules based on strings extracted from the binary file.

    Rules whose required Substring/Regex patterns cannot match any string in
    `file_strings` are recorded in `self._impossible_string_rule_names` and
    skipped during subsequent _match() calls. This saves repeated
    Regex.evaluate() / Substring.evaluate() work for patterns that are
    provably absent from the binary.

    Call this before analyzing functions for a binary.
    Pass an empty frozenset to clear the filter between binaries.

    NOTE(review): this assumes any string feature seen during per-function
    matching (e.g. a reconstructed stack string) also appears in the
    file-scope string scan. Stack strings recovered by extractors may not be
    stored contiguously in the binary, so this filter could introduce false
    negatives for obfuscated samples — TODO confirm before relying on it.

    See: https://github.com/mandiant/capa/issues/2126
    """
    if not file_strings:
        # empty input clears any previously-computed filter (reset between binaries)
        self._impossible_string_rule_names = set()
        return

    impossible: set[str] = set()
    # the same rule may be indexed under several scopes (e.g. FUNCTION and
    # BASIC_BLOCK). its verdict depends only on file_strings, so evaluate
    # each rule name exactly once and count unique names for logging —
    # otherwise the skipped/total ratio in the debug log is misleading.
    seen_rule_names: set[str] = set()

    for feature_index in self._feature_indexes_by_scopes.values():
        for rule_name, wanted_strings in feature_index.string_rules.items():
            if rule_name in seen_rule_names:
                continue
            seen_rule_names.add(rule_name)

            can_match = False
            for feat in wanted_strings:
                if isinstance(feat, capa.features.common.Substring):
                    if any(feat.value in s for s in file_strings):
                        can_match = True
                        break
                elif isinstance(feat, capa.features.common.Regex):
                    if any(feat.re.search(s) for s in file_strings):
                        can_match = True
                        break
                else:
                    # unknown feature type: conservatively keep the rule
                    can_match = True
                    break
            if not can_match:
                impossible.add(rule_name)

    if impossible:
        logger.debug(
            "pre-filter: %d/%d string rules skipped (patterns absent from binary)",
            len(impossible),
            len(seen_rule_names),
        )

    self._impossible_string_rule_names = impossible

def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[FeatureSet, ceng.MatchResults]:
"""
Match rules from this ruleset at the given scope against the given features.
Expand Down Expand Up @@ -2027,6 +2082,12 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea

if string_features:
for rule_name, wanted_strings in feature_index.string_rules.items():
# Skip rules whose patterns are provably absent from the binary.
# prepare_for_file() pre-checks all file strings once and populates
# _impossible_string_rule_names to avoid repeated Regex.evaluate() work.
# See: https://github.com/mandiant/capa/issues/2126
if rule_name in self._impossible_string_rule_names:
continue
for wanted_string in wanted_strings:
if wanted_string.evaluate(string_features):
candidate_rule_names.add(rule_name)
Expand Down
Loading