Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
### New Features

- ghidra: support PyGhidra @mike-hunhoff #2788
- vmray: support parsing flog.txt (Download Function Log) without full ZIP @devs6186 #2452
- vmray: add flog.txt vs archive docs, fetch-vmray-flog.py helper, and fixture-based regression tests @devs6186 #2878
- vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835
- static: add function triage stage (skip/deprioritize/analyze logging) before function matching; library skip reporting unchanged
- static rules: add `connected blocks` scope/subscope with fixed depth=2 CFG neighborhoods and Vivisect CFG-edge support
- scripts: add `scripts/demo_connected_blocks_and_triage.py` to show triage counts and connected-block rule syntax

### Breaking Changes

Expand Down
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,22 @@ To use capa as a library or integrate with another tool, see [doc/installation.m

**Documentation:** [Usage and tips](doc/usage.md) · [Installation](doc/installation.md) · [Limitations](doc/limitations.md) · [FAQ](doc/faq.md)

## static pipeline triage and connected blocks

Recent static pipeline updates add:

- **function triage** before full function matching: functions may be conservatively marked as skip or deprioritize based on lightweight signals (library/FLIRT state, function size, API presence when available, and thunk/runtime naming patterns).
- **connected blocks** static subscope syntax:

```yaml
- connected blocks:
  - and:
    - api: kernel32.CreateFileA
    - api: kernel32.WriteFile
```

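Connected-block matching currently uses a fixed CFG neighborhood depth of `2`: each basic block is evaluated together with the blocks reachable from it within two CFG edges, followed in either direction. It requires a backend with CFG edge support (currently Vivisect). Other static backends return no CFG edges for this scope, so each neighborhood there contains only the seed basic block.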

# capa Explorer Web
The [capa Explorer Web](https://mandiant.github.io/capa/explorer/) enables you to interactively explore capa results in your web browser. Besides the online version you can download a standalone HTML file for local offline usage.

Expand Down
138 changes: 126 additions & 12 deletions capa/capabilities/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@
import itertools
import collections
from dataclasses import dataclass
from collections import deque

import capa.perf
import capa.helpers
import capa.engine
import capa.features.freeze as frz
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.capabilities.common import Capabilities, find_file_capabilities
from capa.capabilities.triage import TriageDecision, classify_function, classify_library_function
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -110,11 +113,47 @@ def find_basic_block_capabilities(
@dataclass
class CodeCapabilities:
    """
    rule matches collected while analyzing a single function,
    grouped by the scope at which each rule matched.

    the field order is part of the interface: instances are constructed positionally.
    """

    # matches found at the function scope.
    function_matches: MatchResults
    # matches found at the connected blocks scope (a seed basic block plus its CFG neighborhood).
    connected_block_matches: MatchResults
    # matches found at the basic block scope.
    basic_block_matches: MatchResults
    # matches found at the instruction scope.
    instruction_matches: MatchResults
    # number of distinct features collected for the function.
    feature_count: int


def _build_connected_block_adjacency(
    extractor: StaticFeatureExtractor, fh: FunctionHandle, bbs: tuple[BBHandle, ...]
) -> dict:
    """
    build the undirected basic block adjacency map for the given function.

    the result is cached in `fh.ctx` under the key "connected_blocks_adjacency",
    so later calls with the same function handle return the cached mapping
    without consulting the extractor (or `bbs`) again.

    args:
      extractor: feature extractor; queried via `get_cfg_edges(fh, bb)`.
      fh: handle of the function being analyzed; its `ctx` mapping holds the cache.
      bbs: the basic blocks of the function.

    returns:
      mapping from basic block address to the set of addresses of adjacent basic blocks.
      every block in `bbs` has an entry, even when it has no edges.
    """
    cached = fh.ctx.get("connected_blocks_adjacency")
    if cached is not None:
        return cached

    # only membership is needed here, so keep the addresses rather than the handles.
    known_addresses = {bb.address for bb in bbs}

    adjacency = collections.defaultdict(set)
    for bb in bbs:
        # register isolated blocks too, without clobbering edges that
        # an earlier block may already have recorded for this address.
        adjacency.setdefault(bb.address, set())

        # presumably `get_cfg_edges` yields the successor blocks of `bb` - confirm against the extractor.
        for succ in extractor.get_cfg_edges(fh, bb):
            if succ.address not in known_addresses:
                # ignore edges that leave the provided set of basic blocks.
                continue

            # record the edge in both directions: neighborhoods ignore edge direction.
            adjacency[bb.address].add(succ.address)
            adjacency[succ.address].add(bb.address)

    fh.ctx["connected_blocks_adjacency"] = adjacency
    return adjacency


def _collect_connected_neighborhood(adjacency: dict, seed, depth: int = 2) -> set:
    """
    collect the nodes that lie within `depth` edges of `seed`.

    performs a breadth-first walk over `adjacency`, starting at `seed`.

    args:
      adjacency: mapping from node to an iterable of its neighboring nodes.
        nodes missing from the mapping are treated as having no neighbors.
      seed: the node at which to start; it is always part of the result.
      depth: maximum number of edges to follow away from `seed`.
        with a depth of zero (or less) only `seed` is returned.

    returns:
      the set of reachable nodes, including `seed`.
    """
    seen = {seed}
    frontier = deque([(seed, 0)])
    while frontier:
        node, distance = frontier.popleft()
        if distance >= depth:
            # nodes at the depth limit are part of the result, but are not expanded further.
            continue

        for neighbor in adjacency.get(node, ()):
            if neighbor not in seen:
                seen.add(neighbor)
                frontier.append((neighbor, distance + 1))

    return seen


def find_code_capabilities(ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle) -> CodeCapabilities:
"""
find matches for the given rules within the given function.
Expand All @@ -126,33 +165,66 @@ def find_code_capabilities(ruleset: RuleSet, extractor: StaticFeatureExtractor,
# matches found at the basic block scope.
# might be found at different basic blocks, that's ok.
bb_matches: MatchResults = collections.defaultdict(list)
connected_block_matches: MatchResults = collections.defaultdict(list)

# matches found at the instruction scope.
# might be found at different instructions, that's ok.
insn_matches: MatchResults = collections.defaultdict(list)

for bb in extractor.get_basic_blocks(fh):
basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb)
for feature, vas in basic_block_capabilities.features.items():
function_features[feature].update(vas)

for rule_name, res in basic_block_capabilities.basic_block_matches.items():
bb_matches[rule_name].extend(res)

for rule_name, res in basic_block_capabilities.instruction_matches.items():
insn_matches[rule_name].extend(res)
has_connected_block_rules = bool(ruleset.connected_block_rules)
if has_connected_block_rules:
bbs = tuple(extractor.get_basic_blocks(fh))
bb_features_by_address: dict = {}
for bb in bbs:
basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb)
bb_features_by_address[bb.address] = basic_block_capabilities.features
for feature, vas in basic_block_capabilities.features.items():
function_features[feature].update(vas)

for rule_name, res in basic_block_capabilities.basic_block_matches.items():
bb_matches[rule_name].extend(res)

for rule_name, res in basic_block_capabilities.instruction_matches.items():
insn_matches[rule_name].extend(res)

adjacency = _build_connected_block_adjacency(extractor, fh, bbs)
for seed in bbs:
neighborhood = _collect_connected_neighborhood(adjacency, seed.address, depth=2)
neighborhood_features: FeatureSet = collections.defaultdict(set)
for bb_address in neighborhood:
for feature, vas in bb_features_by_address.get(bb_address, {}).items():
neighborhood_features[feature].update(vas)

_, matches = ruleset.match(Scope.CONNECTED_BLOCKS, neighborhood_features, seed.address)
for rule_name, res in matches.items():
connected_block_matches[rule_name].extend(res)
rule = ruleset[rule_name]
for va, _ in res:
capa.engine.index_rule_matches(function_features, rule, [va])
else:
for bb in extractor.get_basic_blocks(fh):
basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb)
for feature, vas in basic_block_capabilities.features.items():
function_features[feature].update(vas)

for rule_name, res in basic_block_capabilities.basic_block_matches.items():
bb_matches[rule_name].extend(res)

for rule_name, res in basic_block_capabilities.instruction_matches.items():
insn_matches[rule_name].extend(res)

for feature, va in itertools.chain(extractor.extract_function_features(fh), extractor.extract_global_features()):
function_features[feature].add(va)

_, function_matches = ruleset.match(Scope.FUNCTION, function_features, fh.address)
return CodeCapabilities(function_matches, bb_matches, insn_matches, len(function_features))
return CodeCapabilities(function_matches, connected_block_matches, bb_matches, insn_matches, len(function_features))


def find_static_capabilities(
ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None
) -> Capabilities:
all_function_matches: MatchResults = collections.defaultdict(list)
all_connected_block_matches: MatchResults = collections.defaultdict(list)
all_bb_matches: MatchResults = collections.defaultdict(list)
all_insn_matches: MatchResults = collections.defaultdict(list)

Expand All @@ -163,6 +235,7 @@ def find_static_capabilities(
functions: list[FunctionHandle] = list(extractor.get_functions())
n_funcs: int = len(functions)
n_libs: int = 0
triage_counts: collections.Counter = collections.Counter()
percentage: float = 0

with capa.helpers.CapaProgressBar(
Expand All @@ -171,7 +244,27 @@ def find_static_capabilities(
task = pbar.add_task(
"matching", total=n_funcs, unit="functions", postfix=f"skipped {n_libs} library functions, {percentage}%"
)
triage_results: dict = {}
prioritized_functions: list[FunctionHandle] = []
deprioritized_functions: list[FunctionHandle] = []
skipped_functions: list[FunctionHandle] = []
for f in functions:
if extractor.is_library_function(f.address):
triage_counts[TriageDecision.SKIP.value] += 1
classify_library_function(f)
prioritized_functions.append(f)
continue
triage = classify_function(extractor, f)
triage_results[f.address] = triage
triage_counts[triage.decision.value] += 1
if triage.decision == TriageDecision.SKIP:
skipped_functions.append(f)
elif triage.decision == TriageDecision.DEPRIORITIZE:
deprioritized_functions.append(f)
else:
prioritized_functions.append(f)

for f in itertools.chain(prioritized_functions, deprioritized_functions, skipped_functions):
t0 = time.time()
if extractor.is_library_function(f.address):
function_name = extractor.get_function_name(f.address)
Expand All @@ -185,6 +278,12 @@ def find_static_capabilities(
pbar.advance(task)
continue

triage = triage_results[f.address]
if triage.decision == TriageDecision.SKIP:
logger.debug("skipping triaged function %s (%s)", f.address, triage.reason)
pbar.advance(task)
continue

code_capabilities = find_code_capabilities(ruleset, extractor, f)
feature_counts.functions += (
rdoc.FunctionFeatureCount(
Expand All @@ -196,6 +295,7 @@ def find_static_capabilities(
match_count = 0
for name, matches_ in itertools.chain(
code_capabilities.function_matches.items(),
code_capabilities.connected_block_matches.items(),
code_capabilities.basic_block_matches.items(),
code_capabilities.instruction_matches.items(),
):
Expand All @@ -212,18 +312,31 @@ def find_static_capabilities(

for rule_name, res in code_capabilities.function_matches.items():
all_function_matches[rule_name].extend(res)
for rule_name, res in code_capabilities.connected_block_matches.items():
all_connected_block_matches[rule_name].extend(res)
for rule_name, res in code_capabilities.basic_block_matches.items():
all_bb_matches[rule_name].extend(res)
for rule_name, res in code_capabilities.instruction_matches.items():
all_insn_matches[rule_name].extend(res)

pbar.advance(task)

logger.debug(
"function triage summary: analyze=%d deprioritize=%d skip=%d (library=%d)",
triage_counts[TriageDecision.ANALYZE.value],
triage_counts[TriageDecision.DEPRIORITIZE.value],
triage_counts[TriageDecision.SKIP.value] - n_libs,
n_libs,
)

# collection of features that captures the rule matches within function, BB, and instruction scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
function_and_lower_features: FeatureSet = collections.defaultdict(set)
for rule_name, results in itertools.chain(
all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items()
all_function_matches.items(),
all_connected_block_matches.items(),
all_bb_matches.items(),
all_insn_matches.items(),
):
locations = {p[0] for p in results}
rule = ruleset[rule_name]
Expand All @@ -239,6 +352,7 @@ def find_static_capabilities(
# and we can merge the dictionaries naively.
all_insn_matches.items(),
all_bb_matches.items(),
all_connected_block_matches.items(),
all_function_matches.items(),
all_file_capabilities.matches.items(),
)
Expand Down
Loading
Loading