-
Notifications
You must be signed in to change notification settings - Fork 691
rules: pre-filter string rules whose patterns are absent from the binary #2999
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
9dcd615
1a36f9e
bdc6dcd
ebd9a62
a4595fa
7bac4fa
4289ef3
f3e4a7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1463,6 +1463,11 @@ def __init__( | |
| scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes | ||
| } | ||
|
|
||
| # Set of string-rule names whose required patterns are absent from the current binary. | ||
| # Populated by prepare_for_file(); empty means no pre-filtering is active. | ||
| # See: https://github.com/mandiant/capa/issues/2126 | ||
| self._impossible_string_rule_names: set[str] = set() | ||
|
|
||
| @property | ||
| def file_rules(self): | ||
| return self.rules_by_scope[Scope.FILE] | ||
|
|
@@ -1948,6 +1953,56 @@ def _sort_rules_by_index(rule_index_by_rule_name: dict[str, int], rules: list[Ru | |
| """ | ||
| rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) | ||
|
|
||
| def prepare_for_file(self, file_strings: frozenset[str]) -> None: | ||
| """ | ||
| Pre-filter string rules based on strings extracted from the binary file. | ||
|
|
||
| Rules whose required Substring/Regex patterns cannot match any string in | ||
| file_strings will be skipped during subsequent _match() calls. This | ||
| saves repeated Regex.evaluate() / Substring.evaluate() work for patterns | ||
| that are provably absent from the binary. | ||
|
|
||
| Call this before analyzing functions for a binary. | ||
| Pass an empty frozenset to clear the filter between binaries. | ||
|
|
||
| See: https://github.com/mandiant/capa/issues/2126 | ||
| """ | ||
| if not file_strings: | ||
| self._impossible_string_rule_names = set() | ||
| return | ||
|
|
||
| impossible: set[str] = set() | ||
| total = 0 | ||
|
|
||
| for feature_index in self._feature_indexes_by_scopes.values(): | ||
| for rule_name, wanted_strings in feature_index.string_rules.items(): | ||
| total += 1 | ||
| can_match = False | ||
| for feat in wanted_strings: | ||
| if isinstance(feat, capa.features.common.Substring): | ||
| if any(feat.value in s for s in file_strings): | ||
| can_match = True | ||
| break | ||
| elif isinstance(feat, capa.features.common.Regex): | ||
| if any(feat.re.search(s) for s in file_strings): | ||
| can_match = True | ||
| break | ||
| else: | ||
| # unknown feature type: keep to be safe | ||
| can_match = True | ||
| break | ||
| if not can_match: | ||
| impossible.add(rule_name) | ||
|
|
||
| if impossible: | ||
| logger.debug( | ||
| "pre-filter: %d/%d string rules skipped (patterns absent from binary)", | ||
| len(impossible), | ||
| total, | ||
| ) | ||
|
Comment on lines
+1999
to
+2038
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Tracking unique rule names and skipping re-evaluation of rules already processed in a previous scope would be more efficient and provide clearer logging. impossible: set[str] = set()
all_string_rule_names: set[str] = set()
for feature_index in self._feature_indexes_by_scopes.values():
for rule_name, wanted_strings in feature_index.string_rules.items():
if rule_name in all_string_rule_names:
continue
all_string_rule_names.add(rule_name)
can_match = False
for feat in wanted_strings:
if isinstance(feat, capa.features.common.Substring):
if any(feat.value in s for s in file_strings):
can_match = True
break
elif isinstance(feat, capa.features.common.Regex):
if any(feat.re.search(s) for s in file_strings):
can_match = True
break
else:
# unknown feature type: keep to be safe
can_match = True
break
if not can_match:
impossible.add(rule_name)
if impossible:
logger.debug(
"pre-filter: %d/%d string rules skipped (patterns absent from binary)",
len(impossible),
len(all_string_rule_names),
) |
||
|
|
||
| self._impossible_string_rule_names = impossible | ||
|
|
||
| def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[FeatureSet, ceng.MatchResults]: | ||
| """ | ||
| Match rules from this ruleset at the given scope against the given features. | ||
|
|
@@ -2027,6 +2082,12 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea | |
|
|
||
| if string_features: | ||
| for rule_name, wanted_strings in feature_index.string_rules.items(): | ||
| # Skip rules whose patterns are provably absent from the binary. | ||
| # prepare_for_file() pre-checks all file strings once and populates | ||
| # _impossible_string_rule_names to avoid repeated Regex.evaluate() work. | ||
| # See: https://github.com/mandiant/capa/issues/2126 | ||
| if rule_name in self._impossible_string_rule_names: | ||
| continue | ||
| for wanted_string in wanted_strings: | ||
| if wanted_string.evaluate(string_features): | ||
| candidate_rule_names.add(rule_name) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pre-filtering logic assumes that any string feature found during analysis (e.g., at the function scope) must also be present as a contiguous string at the file scope. However,
capaextractors (like Vivisect or IDA) often reconstruct stack strings that are not stored contiguously in the binary's data sections.Since
extractor.extract_file_features()typically performs a bulk string scan of the raw bytes, it will not capture these reconstructed stack strings. Consequently, rules usingRegexorSubstringto match stack strings will be incorrectly marked asimpossibleand skipped during per-function matching, leading to false negatives for obfuscated binaries.You should consider if this optimization should be restricted to rules that are known to target static strings, or if there's a way to include stack string components in the pre-filter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hm this is a good point