Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions scripts/capa-diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
capa-diff.py

Compare capabilities between two capa JSON result documents.

Example:

$ capa --json old.exe > old.json
$ capa --json new.exe > new.json
$ python scripts/capa-diff.py old.json new.json
added capabilities: 2
+ anti-debug via timeout
+ inject process
removed capabilities: 1
- check for mutex
"""

from __future__ import annotations

import json
import sys
import argparse
from pathlib import Path

import capa.render.utils as rutils
import capa.render.default as rdefault
import capa.render.result_document as rd


def _parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Parse the command-line arguments of the capability diff script.

    Args:
      argv: argument strings, without the program name.

    Returns:
      namespace with these attributes:
        old: Path to the baseline result document.
        new: Path to the target result document.
        output_format: "text" (default) or "json".
        include_subscope_rules: True only when --include-subscope-rules is given.
    """
    parser = argparse.ArgumentParser(description="Compare capabilities in two capa JSON result documents.")
    parser.add_argument("old", type=Path, help="path to older/baseline capa JSON result document")
    parser.add_argument("new", type=Path, help="path to newer/target capa JSON result document")
    parser.add_argument(
        "--format",
        # stored as `output_format` so the attribute name does not read like the `format` builtin
        dest="output_format",
        choices=("text", "json"),
        default="text",
        help="render output as text or json (default: text)",
    )
    parser.add_argument(
        "--include-subscope-rules",
        action="store_true",
        help="include rules that only matched as subrule references",
    )
    return parser.parse_args(argv)


def _load_result_document(path: Path) -> rd.ResultDocument:
    """
    Load a capa result document from a JSON file.

    The file is read as UTF-8 text and handed to `ResultDocument.model_validate_json`;
    errors from reading or validating are not caught here.
    """
    return rd.ResultDocument.model_validate_json(path.read_text(encoding="utf-8"))


def _collect_capabilities(doc: rd.ResultDocument, include_subscope_rules: bool = False) -> dict[str, dict[str, object]]:
    """
    Summarize the capability rules of a result document, keyed by rule name.

    Args:
      doc: the result document to inspect.
      include_subscope_rules: when False (the default), rules whose names are reported by
        `rdefault.find_subrule_matches(doc)` are left out of the result.

    Returns:
      mapping from rule name to a dict with the keys "name", "namespace" and
      "match_count" (the number of entries in the rule's `matches`).
    """
    hidden: set[str] = set()
    if not include_subscope_rules:
        hidden = rdefault.find_subrule_matches(doc)

    capabilities: dict[str, dict[str, object]] = {}
    for rule in rutils.capability_rules(doc):
        if rule.meta.name in hidden:
            continue

        # the name is stored both as the key and inside the entry,
        # so an entry is still self-describing once it is pulled out of the mapping
        capabilities[rule.meta.name] = {
            "name": rule.meta.name,
            "namespace": rule.meta.namespace,
            "match_count": len(rule.matches),
        }
    return capabilities


def _render_text(added: list[dict[str, object]], removed: list[dict[str, object]]) -> str:
Comment on lines +33 to +86
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For better type safety and code clarity, consider using a TypedDict for the structure of a capability. This makes the code easier to understand and maintain, as it explicitly defines the shape of the dictionary. This also allows for more precise type hints in function signatures.

from __future__ import annotations

import json
import sys
import argparse
from pathlib import Path
from typing import TypedDict

import capa.render.utils as rutils
import capa.render.default as rdefault
import capa.render.result_document as rd


class CapabilityInfo(TypedDict):
    """Summary of one matched capability rule, as reported in the diff output."""

    # rule name
    name: str
    # rule namespace; the annotation allows None
    namespace: str | None
    # number of matches recorded for the rule
    match_count: int


def _parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Build the argument parser for the diff script and apply it to `argv`.

    The resulting namespace carries `old` and `new` (both `Path`), `output_format`
    ("text" or "json", defaulting to "text") and the boolean `include_subscope_rules`.
    """
    cli = argparse.ArgumentParser(description="Compare capabilities in two capa JSON result documents.")

    # positional inputs: baseline document first, then the document compared against it
    cli.add_argument("old", type=Path, help="path to older/baseline capa JSON result document")
    cli.add_argument("new", type=Path, help="path to newer/target capa JSON result document")

    # optional switches
    supported_formats = ("text", "json")
    cli.add_argument(
        "--format",
        dest="output_format",
        choices=supported_formats,
        default="text",
        help="render output as text or json (default: text)",
    )
    cli.add_argument(
        "--include-subscope-rules",
        action="store_true",
        help="include rules that only matched as subrule references",
    )

    options = cli.parse_args(argv)
    return options


def _load_result_document(path: Path) -> rd.ResultDocument:
    """Read `path` as UTF-8 text and validate its contents as a `ResultDocument`."""
    raw_json = path.read_text(encoding="utf-8")
    return rd.ResultDocument.model_validate_json(raw_json)


def _collect_capabilities(doc: rd.ResultDocument, include_subscope_rules: bool = False) -> dict[str, CapabilityInfo]:
    """
    Map each capability rule name in `doc` to a short summary of that rule.

    Unless `include_subscope_rules` is set, any rule named by
    `rdefault.find_subrule_matches(doc)` is dropped. Each summary records the rule's
    name, its namespace, and how many matches it has.
    """
    # names to leave out; nothing is left out when the caller asked for everything
    suppressed = set() if include_subscope_rules else rdefault.find_subrule_matches(doc)

    return {
        entry.meta.name: {
            "name": entry.meta.name,
            "namespace": entry.meta.namespace,
            "match_count": len(entry.matches),
        }
        for entry in rutils.capability_rules(doc)
        if entry.meta.name not in suppressed
    }


def _render_text(added: list[CapabilityInfo], removed: list[CapabilityInfo]) -> str:
    """
    Render added and removed capabilities as a plain-text report.

    The report is an "added capabilities: N" header followed by one " + <name>" line per
    entry of `added`, then a "removed capabilities: N" header followed by one " - <name>"
    line per entry of `removed`. Entries are emitted in the order given, and only the
    "name" key of each entry is read.

    Returns:
      the report lines joined with newlines, without a trailing newline.
    """
    lines = [f"added capabilities: {len(added)}"]
    lines.extend(f" + {capability['name']}" for capability in added)

    lines.append(f"removed capabilities: {len(removed)}")
    lines.extend(f" - {capability['name']}" for capability in removed)

    return "\n".join(lines)


def main(argv: list[str] | None = None) -> int:
    """
    Compare the capabilities of two capa result documents and print the difference.

    Args:
      argv: command-line arguments without the program name; defaults to `sys.argv[1:]`.

    Returns:
      0 once the report has been printed. Errors raised while parsing arguments or
      loading the documents are not caught here.
    """
    if argv is None:
        argv = sys.argv[1:]

    args = _parse_args(argv)
    old_doc = _load_result_document(args.old)
    new_doc = _load_result_document(args.new)

    old_caps = _collect_capabilities(old_doc, include_subscope_rules=args.include_subscope_rules)
    new_caps = _collect_capabilities(new_doc, include_subscope_rules=args.include_subscope_rules)

    # _collect_capabilities keys every entry by its own "name", so ordering the names
    # gives the same result as ordering the entries by "name", while comparing plain `str`
    # keys rather than loosely typed dict values. Key views support set difference directly.
    added = [new_caps[name] for name in sorted(new_caps.keys() - old_caps.keys())]
    removed = [old_caps[name] for name in sorted(old_caps.keys() - new_caps.keys())]

    if args.output_format == "json":
        print(json.dumps({"added": added, "removed": removed}, indent=2))
    else:
        print(_render_text(added, removed))

    return 0


# run the CLI only when executed as a script; main()'s return value becomes the exit status
if __name__ == "__main__":
    sys.exit(main())
7 changes: 7 additions & 0 deletions tests/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ def get_rule_path():
pytest.param("show-features.py", ["-P", "MicrosoftEdgeUpdate.exe", get_cape_report_file_path()]),
pytest.param("show-unused-features.py", [get_binary_file_path()]),
pytest.param("capa-as-library.py", [get_binary_file_path()]),
pytest.param(
"capa-diff.py",
[
Path(__file__).resolve().parent / "data" / "rd" / "Practical Malware Analysis Lab 01-01.dll_.json",
Path(__file__).resolve().parent / "data" / "rd" / "Practical Malware Analysis Lab 01-01.dll_.json",
],
),
# not testing "minimize-vmray-results.py" as we don't currently upload full VMRay analysis archives
],
)
Expand Down
Loading