Skip to content

Commit 90ba1b0

Browse files
devs6186 and claude committed
address: add optional id field for unique tracking of recycled PID/TID lifecycles
Adds an optional `id` field to `ProcessAddress` and `ThreadAddress` that sandbox backends can populate with a sandbox-specific unique identifier (e.g. VMRay monitor_id, or a sequential counter for CAPE). When set, this field becomes part of equality/hashing so that two process or thread instances that share the same OS-assigned PID/TID are treated as distinct addresses throughout capa's pipeline.

This comprehensively fixes the ValueError crash in render (#2619) by solving the root uniqueness problem described in #2361: rather than merging recycled lifecycles into a single entry, each instance now gets its own identity.

Changes:
- address.py: add optional `id` to ProcessAddress and ThreadAddress; update __eq__, __hash__, __lt__, __repr__ accordingly; backward-compatible (id=None by default)
- freeze/__init__.py: extend from_capa/to_capa to encode/decode the new id fields using extended tuple lengths; old 2/3/4-element tuples still decoded correctly for backward compatibility
- vmray/extractor.py: pass monitor_id as id to both ProcessAddress and ThreadAddress so each VMRay monitor instance is uniquely tracked
- cape/file.py: detect PID reuse via two-pass counting and assign sequential ids; processes with unique PIDs keep id=None (no behavior change)
- render/verbose.py: add _format_process_fields / _format_thread_fields helpers that include the id in rendered output when present
- tests/test_address_uniqueness.py: 35 unit tests covering identity, hashing, sorting, freeze roundtrip (incl. backward compat), and compute_dynamic_layout behavior for both recycled TIDs and recycled PIDs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent da1abed commit 90ba1b0

File tree

7 files changed

+749
-70
lines changed

7 files changed

+749
-70
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
-
3434

3535
### Bug Fixes
36+
- address: add optional id field to ProcessAddress/ThreadAddress for unique tracking of recycled PID/TID lifecycles @devs6186 #2619
3637
- main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
3738
- render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
3839
- Fixed insecure deserialization vulnerability in YAML loading @0x1622 (#2770)

capa/features/address.py

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import abc
16+
from typing import Optional
1617

1718

1819
class Address(abc.ABC):
@@ -50,53 +51,83 @@ def __hash__(self):
5051

5152

5253
class ProcessAddress(Address):
53-
"""an address of a process in a dynamic execution trace"""
54-
55-
def __init__(self, pid: int, ppid: int = 0):
54+
"""an address of a process in a dynamic execution trace
55+
56+
Args:
57+
pid: process ID assigned by the OS
58+
ppid: parent process ID assigned by the OS
59+
id: optional sandbox-specific unique identifier to distinguish
60+
processes whose OS-assigned PIDs collide due to reuse.
61+
For VMRay this is the monitor_id; for other backends
62+
it may be a sequential counter or timestamp.
63+
"""
64+
65+
def __init__(self, pid: int, ppid: int = 0, id: Optional[int] = None):
5666
assert ppid >= 0
5767
assert pid > 0
5868
self.ppid = ppid
5969
self.pid = pid
70+
self.id = id
6071

6172
def __repr__(self):
62-
return "process(%s%s)" % (
63-
f"ppid: {self.ppid}, " if self.ppid > 0 else "",
64-
f"pid: {self.pid}",
65-
)
73+
parts = []
74+
if self.ppid > 0:
75+
parts.append(f"ppid: {self.ppid}")
76+
parts.append(f"pid: {self.pid}")
77+
if self.id is not None:
78+
parts.append(f"id: {self.id}")
79+
return "process(%s)" % ", ".join(parts)
6680

6781
def __hash__(self):
68-
return hash((self.ppid, self.pid))
82+
return hash((self.ppid, self.pid, self.id))
6983

7084
def __eq__(self, other):
7185
assert isinstance(other, ProcessAddress)
72-
return (self.ppid, self.pid) == (other.ppid, other.pid)
86+
return (self.ppid, self.pid, self.id) == (other.ppid, other.pid, other.id)
7387

7488
def __lt__(self, other):
7589
assert isinstance(other, ProcessAddress)
76-
return (self.ppid, self.pid) < (other.ppid, other.pid)
90+
# None is mapped to -1 so it sorts before any non-negative id
91+
self_id = self.id if self.id is not None else -1
92+
other_id = other.id if other.id is not None else -1
93+
return (self.ppid, self.pid, self_id) < (other.ppid, other.pid, other_id)
7794

7895

7996
class ThreadAddress(Address):
80-
"""addresses a thread in a dynamic execution trace"""
81-
82-
def __init__(self, process: ProcessAddress, tid: int):
97+
"""addresses a thread in a dynamic execution trace
98+
99+
Args:
100+
process: address of the containing process
101+
tid: thread ID assigned by the OS
102+
id: optional sandbox-specific unique identifier to distinguish
103+
threads whose OS-assigned TIDs collide due to reuse.
104+
For VMRay this is the monitor_id; for other backends
105+
it may be a sequential counter or timestamp.
106+
"""
107+
108+
def __init__(self, process: ProcessAddress, tid: int, id: Optional[int] = None):
83109
assert tid >= 0
84110
self.process = process
85111
self.tid = tid
112+
self.id = id
86113

87114
def __repr__(self):
88-
return f"{self.process}, thread(tid: {self.tid})"
115+
id_part = f", id: {self.id}" if self.id is not None else ""
116+
return f"{self.process}, thread(tid: {self.tid}{id_part})"
89117

90118
def __hash__(self):
91-
return hash((self.process, self.tid))
119+
return hash((self.process, self.tid, self.id))
92120

93121
def __eq__(self, other):
94122
assert isinstance(other, ThreadAddress)
95-
return (self.process, self.tid) == (other.process, other.tid)
123+
return (self.process, self.tid, self.id) == (other.process, other.tid, other.id)
96124

97125
def __lt__(self, other):
98126
assert isinstance(other, ThreadAddress)
99-
return (self.process, self.tid) < (other.process, other.tid)
127+
# None is mapped to -1 so it sorts before any non-negative id
128+
self_id = self.id if self.id is not None else -1
129+
other_id = other.id if other.id is not None else -1
130+
return (self.process, self.tid, self_id) < (other.process, other.tid, other_id)
100131

101132

102133
class DynamicCallAddress(Address):
@@ -114,7 +145,10 @@ def __hash__(self):
114145
return hash((self.thread, self.id))
115146

116147
def __eq__(self, other):
117-
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (other.thread, other.id)
148+
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (
149+
other.thread,
150+
other.id,
151+
)
118152

119153
def __lt__(self, other):
120154
assert isinstance(other, DynamicCallAddress)

capa/features/extractors/cape/file.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,24 +28,37 @@
2828

2929
def get_processes(report: CapeReport) -> Iterator[ProcessHandle]:
3030
"""
31-
get all the created processes for a sample
31+
get all the created processes for a sample.
32+
33+
when the OS recycles a PID, multiple processes in the report may share the
34+
same (ppid, pid) pair. we detect this and assign sequential ids so that
35+
each process receives a unique ProcessAddress.
3236
"""
33-
seen_processes = {}
37+
# first pass: count how many times each (ppid, pid) pair appears
38+
counts: dict[tuple[int, int], int] = {}
3439
for process in report.behavior.processes:
35-
addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id)
36-
yield ProcessHandle(address=addr, inner=process)
40+
key = (process.parent_id, process.process_id)
41+
counts[key] = counts.get(key, 0) + 1
3742

38-
# check for pid and ppid reuse
39-
if addr not in seen_processes:
40-
seen_processes[addr] = [process]
41-
else:
42-
logger.warning(
43-
"pid and ppid reuse detected between process %s and process%s: %s",
44-
process,
45-
"es" if len(seen_processes[addr]) > 1 else "",
46-
seen_processes[addr],
43+
# second pass: yield handles with sequential ids for reused pairs
44+
seq: dict[tuple[int, int], int] = {}
45+
for process in report.behavior.processes:
46+
key = (process.parent_id, process.process_id)
47+
seq[key] = seq.get(key, 0) + 1
48+
49+
# only assign ids when reuse is detected; otherwise keep id=None
50+
# for backward compatibility with existing addresses and freeze files
51+
id_ = seq[key] if counts[key] > 1 else None
52+
if id_ is not None:
53+
logger.debug(
54+
"pid reuse detected for ppid=%d, pid=%d: assigning id=%d",
55+
process.parent_id,
56+
process.process_id,
57+
id_,
4758
)
48-
seen_processes[addr].append(process)
59+
60+
addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id, id=id_)
61+
yield ProcessHandle(address=addr, inner=process)
4962

5063

5164
def extract_import_names(report: CapeReport) -> Iterator[tuple[Feature, Address]]:

capa/features/extractors/vmray/extractor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,9 @@ def get_processes(self) -> Iterator[ProcessHandle]:
9999
)
100100
continue
101101

102-
address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
102+
address: ProcessAddress = ProcessAddress(
103+
pid=monitor_process.pid, ppid=monitor_process.ppid, id=monitor_process.monitor_id
104+
)
103105
yield ProcessHandle(address, inner=monitor_process)
104106

105107
def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
@@ -114,7 +116,9 @@ def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
114116
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
115117
monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]
116118

117-
address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
119+
address: ThreadAddress = ThreadAddress(
120+
process=ph.address, tid=monitor_thread.tid, id=monitor_thread.monitor_id
121+
)
118122
yield ThreadHandle(address=address, inner=monitor_thread)
119123

120124
def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:

0 commit comments

Comments
 (0)