Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ This release includes Ghidra PyGhidra support, performance improvements, depende
- nursery/get-custom-http-header @msanchit-dev

### Bug Fixes
- address: add instance_id field to ProcessAddress/ThreadAddress to uniquely track recycled PID/TID lifecycles across all dynamic sandboxes @devs6186 #2619
- main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
- render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
- rules: handle empty or invalid YAML documents gracefully in `Rule.from_yaml` and `get_rules` @devs6186 #2900
Expand Down
114 changes: 92 additions & 22 deletions capa/features/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@
# limitations under the License.

import abc
from typing import Optional


def _process_sort_key(process: Optional["ProcessAddress"]) -> tuple:
"""Create a total ordering key for nested process addresses."""
if process is None:
return (0,)

instance_id = process.instance_id if process.instance_id is not None else -1
return (1, _process_sort_key(process.parent), process.pid, instance_id)


class Address(abc.ABC):
Expand Down Expand Up @@ -50,53 +60,110 @@ def __hash__(self):


class ProcessAddress(Address):
    """an address of a process in a dynamic execution trace

    Args:
        pid: process ID assigned by the OS (asserted to be positive)
        parent: full address of the parent process, so that the parent stays
            uniquely identifiable even if the OS recycled its PID.
            Use None for root/top-level processes (ppid == 0).
        instance_id: sandbox-specific unique identifier that tells apart
            processes whose OS-assigned PIDs collide due to reuse.
            For VMRay this is the monitor_id; for CAPE it is a sequential
            counter; for Drakvuf it is 0 (PID recycling is not tracked there).
    """

    def __init__(
        self,
        pid: int,
        parent: Optional["ProcessAddress"] = None,
        instance_id: Optional[int] = None,
    ):
        assert pid > 0
        assert parent is None or parent.pid > 0
        self.pid = pid
        self.parent = parent
        self.instance_id = instance_id

    @property
    def ppid(self) -> int:
        """OS parent PID (0 if no parent)."""
        if self.parent:
            return self.parent.pid
        return 0

    def _identity(self) -> tuple:
        # the fields that define equality and hashing of a process address
        return (self.parent, self.pid, self.instance_id)

    def __repr__(self):
        # ppid and instance_id are shown only when they are set
        fields = [f"pid: {self.pid}"]
        if self.parent is not None:
            fields.insert(0, f"ppid: {self.parent.pid}")
        if self.instance_id is not None:
            fields.append(f"instance_id: {self.instance_id}")
        return "process(" + ", ".join(fields) + ")"

    def __hash__(self):
        return hash(self._identity())

    def __eq__(self, other):
        if isinstance(other, ProcessAddress):
            return self._identity() == other._identity()
        # let Python try the reflected comparison for foreign types
        return NotImplemented

    def __lt__(self, other):
        assert isinstance(other, ProcessAddress)
        mine = _process_sort_key(self)
        theirs = _process_sort_key(other)
        return mine < theirs


class ThreadAddress(Address):
    """addresses a thread in a dynamic execution trace

    Args:
        process: address of the containing process
        tid: thread ID assigned by the OS (asserted to be non-negative)
        instance_id: sandbox-specific unique identifier that tells apart
            threads whose OS-assigned TIDs collide due to reuse.
            For VMRay this is the monitor_id; for CAPE it is a sequential
            counter; for Drakvuf it is 0 (TID recycling is not tracked there).
    """

    def __init__(
        self,
        process: ProcessAddress,
        tid: int,
        instance_id: Optional[int] = None,
    ):
        assert tid >= 0
        self.process = process
        self.tid = tid
        self.instance_id = instance_id

    def _identity(self) -> tuple:
        # the fields that define equality and hashing of a thread address
        return (self.process, self.tid, self.instance_id)

    def __repr__(self):
        # instance_id is shown only when it is set
        if self.instance_id is not None:
            suffix = f", instance_id: {self.instance_id}"
        else:
            suffix = ""
        return f"{self.process}, thread(tid: {self.tid}{suffix})"

    def __hash__(self):
        return hash(self._identity())

    def __eq__(self, other):
        if isinstance(other, ThreadAddress):
            return self._identity() == other._identity()
        # let Python try the reflected comparison for foreign types
        return NotImplemented

    def __lt__(self, other):
        assert isinstance(other, ThreadAddress)

        def sort_key(thread):
            # an unset instance_id sorts as -1 so None is never compared with an int
            rank = -1 if thread.instance_id is None else thread.instance_id
            return (_process_sort_key(thread.process), thread.tid, rank)

        return sort_key(self) < sort_key(other)


class DynamicCallAddress(Address):
Expand All @@ -114,7 +181,10 @@ def __hash__(self):
return hash((self.thread, self.id))

def __eq__(self, other):
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (other.thread, other.id)
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (
other.thread,
other.id,
)

def __lt__(self, other):
assert isinstance(other, DynamicCallAddress)
Expand Down
48 changes: 31 additions & 17 deletions capa/features/extractors/cape/file.py
Comment thread
mike-hunhoff marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@

from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.address import (
NO_ADDRESS,
Address,
ProcessAddress,
AbsoluteVirtualAddress,
)
from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.cape.models import CapeReport
from capa.features.extractors.base_extractor import ProcessHandle
Expand All @@ -28,24 +33,33 @@

def get_processes(report: CapeReport) -> Iterator[ProcessHandle]:
    """
    get all the created processes for a sample.

    each process receives a sequential instance_id so that ProcessAddress values
    stay unique even when the OS recycles a PID. the counter is kept per
    (parent_id, process_id) pair, in report order.

    the parent address is looked up among the processes already seen in the
    report (the most recent one with that PID wins), so a recycled parent PID is
    tracked uniquely as well. when the parent is not among them, a bare
    ProcessAddress carrying only the parent PID is used instead.

    handles are yielded one by one as they are built, in report order.
    """
    seq: dict[tuple[int, int], int] = {}
    # pid → latest ProcessAddress for parent lookups; report order matters because
    # a later process with the same pid replaces the earlier entry.
    proc_by_pid: dict[int, ProcessAddress] = {}

    for process in report.behavior.processes:
        key = (process.parent_id, process.process_id)
        id_ = seq.get(key, 0)
        seq[key] = id_ + 1

        parent_addr = proc_by_pid.get(process.parent_id)
        if parent_addr is None and process.parent_id:
            # parent not in CAPE report (e.g., OS/host process); create a skeleton entry
            # so that ppid is preserved for filtering and display.
            parent_addr = ProcessAddress(pid=process.parent_id)

        addr = ProcessAddress(pid=process.process_id, parent=parent_addr, instance_id=id_)
        proc_by_pid[process.process_id] = addr

        # stream the handle directly; no intermediate list is needed
        yield ProcessHandle(address=addr, inner=process)


def extract_import_names(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
Expand Down
14 changes: 11 additions & 3 deletions capa/features/extractors/cape/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,21 @@

def get_threads(ph: ProcessHandle) -> Iterator[ThreadHandle]:
    """
    get the threads associated with a given process.

    a TID that shows up more than once in the process' thread list is numbered
    0, 1, 2, ... in order of appearance; that number becomes the instance_id of
    the ThreadAddress, keeping addresses unique even when the OS recycles a TID.
    """
    process: Process = ph.inner
    occurrences: dict[int, int] = {}

    for tid in process.threads:
        nth = occurrences.get(tid, 0)
        occurrences[tid] = nth + 1
        yield ThreadHandle(
            address=ThreadAddress(process=ph.address, tid=tid, instance_id=nth),
            inner={},
        )


Expand Down
11 changes: 8 additions & 3 deletions capa/features/extractors/drakvuf/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
from capa.features.extractors.drakvuf.models import Call, DrakvufReport


def index_calls(report: DrakvufReport) -> dict[ProcessAddress, dict[ThreadAddress, list[Call]]]:
def index_calls(
report: DrakvufReport,
) -> dict[ProcessAddress, dict[ThreadAddress, list[Call]]]:
# this method organizes calls into processes and threads, and then sorts them based on
# timestamp so that we can address individual calls per index (CallAddress requires call index)
result: dict[ProcessAddress, dict[ThreadAddress, list[Call]]] = {}
Expand All @@ -29,8 +31,11 @@ def index_calls(report: DrakvufReport) -> dict[ProcessAddress, dict[ThreadAddres
# we ignore the pid 0 since it's a system process and it's unlikely for it to
# be hijacked or so on, in addition to capa addresses not supporting null pids
continue
proc_addr = ProcessAddress(pid=call.pid, ppid=call.ppid)
thread_addr = ThreadAddress(process=proc_addr, tid=call.tid)
parent_addr = (
ProcessAddress(pid=call.ppid, instance_id=0) if call.ppid else None
)
proc_addr = ProcessAddress(pid=call.pid, parent=parent_addr, instance_id=0)
thread_addr = ThreadAddress(process=proc_addr, tid=call.tid, instance_id=0)
if proc_addr not in result:
result[proc_addr] = {}
if thread_addr not in result[proc_addr]:
Expand Down
Loading
Loading