Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions capa/features/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import abc
from typing import Optional


class Address(abc.ABC):
Expand Down Expand Up @@ -52,51 +53,59 @@ def __hash__(self):
class ProcessAddress(Address):
    """an address of a process in a dynamic execution trace"""

    def __init__(self, pid: int, ppid: int = 0, id: Optional[int] = None):
        """
        args:
          pid: process id, must be positive.
          ppid: parent process id, must be non-negative; defaults to 0.
          id: optional extra identifier that takes part in equality, hashing,
            ordering and repr; None (the default) means "no identifier".
        """
        assert ppid >= 0
        assert pid > 0
        self.ppid = ppid
        self.pid = pid
        self.id = id

    def __repr__(self):
        # ppid is only shown when non-zero, id only when one was provided,
        # so addresses created without an id render exactly as before.
        ppid_part = f"ppid: {self.ppid}, " if self.ppid > 0 else ""
        id_part = f", id: {self.id}" if self.id is not None else ""
        return f"process({ppid_part}pid: {self.pid}{id_part})"

    def __hash__(self):
        # must cover the same fields as __eq__ so equal addresses hash equally.
        return hash((self.ppid, self.pid, self.id))

    def __eq__(self, other):
        assert isinstance(other, ProcessAddress)
        return (self.ppid, self.pid, self.id) == (other.ppid, other.pid, other.id)

    def __lt__(self, other):
        assert isinstance(other, ProcessAddress)
        # None cannot be ordered against an int, so a missing id is compared
        # as -1, which places it before any non-negative id.
        self_id = -1 if self.id is None else self.id
        other_id = -1 if other.id is None else other.id
        return (self.ppid, self.pid, self_id) < (other.ppid, other.pid, other_id)


class ThreadAddress(Address):
    """addresses a thread in a dynamic execution trace"""

    def __init__(self, process: ProcessAddress, tid: int, id: Optional[int] = None):
        """
        args:
          process: address of the process that owns the thread.
          tid: thread id, must be non-negative.
          id: optional extra identifier that takes part in equality, hashing,
            ordering and repr; None (the default) means "no identifier".
        """
        assert tid >= 0
        self.process = process
        self.tid = tid
        self.id = id

    def __repr__(self):
        # id is only shown when one was provided, so addresses created
        # without an id render exactly as before.
        id_part = f", id: {self.id}" if self.id is not None else ""
        return f"{self.process}, thread(tid: {self.tid}{id_part})"

    def __hash__(self):
        # must cover the same fields as __eq__ so equal addresses hash equally.
        return hash((self.process, self.tid, self.id))

    def __eq__(self, other):
        assert isinstance(other, ThreadAddress)
        return (self.process, self.tid, self.id) == (other.process, other.tid, other.id)

    def __lt__(self, other):
        assert isinstance(other, ThreadAddress)
        # None cannot be ordered against an int, so a missing id is compared
        # as -1, which places it before any non-negative id.
        self_id = -1 if self.id is None else self.id
        other_id = -1 if other.id is None else other.id
        return (self.process, self.tid, self_id) < (other.process, other.tid, other_id)


class DynamicCallAddress(Address):
Expand Down
24 changes: 17 additions & 7 deletions capa/features/extractors/cape/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,32 @@ def get_processes(report: CapeReport) -> Iterator[ProcessHandle]:
"""
get all the created processes for a sample
"""
seen_processes = {}
counts: dict[tuple[int, int], int] = {}
for process in report.behavior.processes:
addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id)
key = (process.parent_id, process.process_id)
counts[key] = counts.get(key, 0) + 1

seen_processes: dict[tuple[int, int], list] = {}
seq: dict[tuple[int, int], int] = {}
for process in report.behavior.processes:
key = (process.parent_id, process.process_id)
seq[key] = seq.get(key, 0) + 1
process_id = seq[key] - 1 if counts[key] > 1 else None

addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id, id=process_id)
yield ProcessHandle(address=addr, inner=process)

# check for pid and ppid reuse
if addr not in seen_processes:
seen_processes[addr] = [process]
if key not in seen_processes:
seen_processes[key] = [process]
else:
logger.warning(
"pid and ppid reuse detected between process %s and process%s: %s",
process,
"es" if len(seen_processes[addr]) > 1 else "",
seen_processes[addr],
"es" if len(seen_processes[key]) > 1 else "",
seen_processes[key],
)
seen_processes[addr].append(process)
seen_processes[key].append(process)


def extract_import_names(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
Expand Down
12 changes: 10 additions & 2 deletions capa/features/extractors/cape/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,16 @@ def get_threads(ph: ProcessHandle) -> Iterator[ThreadHandle]:
process: Process = ph.inner
threads: list[int] = process.threads

for thread in threads:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
counts: dict[int, int] = {}
for tid in threads:
counts[tid] = counts.get(tid, 0) + 1

seq: dict[int, int] = {}
for tid in threads:
seq[tid] = seq.get(tid, 0) + 1
thread_id = seq[tid] - 1 if counts[tid] > 1 else None

address: ThreadAddress = ThreadAddress(process=ph.address, tid=tid, id=thread_id)
yield ThreadHandle(address=address, inner={})
Comment on lines +38 to 49
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This function handles TID reuse by assigning a lifecycle ID, which is great. However, unlike the corresponding `get_processes` function, which warns about PID/PPID reuse, there is no warning here for TID reuse. For consistency and better diagnostics, consider adding a warning when a TID is reused within a process.

    seq: dict[int, int] = {}
    warned_tids: set[int] = set()
    for tid in threads:
        if counts[tid] > 1 and tid not in warned_tids:
            logger.warning("TID reuse detected for tid %d in process %s", tid, ph.address)
            warned_tids.add(tid)

        seq[tid] = seq.get(tid, 0) + 1
        thread_id = seq[tid] - 1 if counts[tid] > 1 else None

        address: ThreadAddress = ThreadAddress(process=ph.address, tid=tid, id=thread_id)
        yield ThreadHandle(address=address, inner={})



Expand Down
12 changes: 10 additions & 2 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ def get_processes(self) -> Iterator[ProcessHandle]:
)
continue

address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
address: ProcessAddress = ProcessAddress(
pid=monitor_process.pid,
ppid=monitor_process.ppid,
id=monitor_process.monitor_id,
)
yield ProcessHandle(address, inner=monitor_process)

def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
Expand All @@ -114,7 +118,11 @@ def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]

address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
address: ThreadAddress = ThreadAddress(
process=ph.address,
tid=monitor_thread.tid,
id=monitor_thread.monitor_id,
)
yield ThreadHandle(address=address, inner=monitor_thread)

def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:
Expand Down
112 changes: 91 additions & 21 deletions capa/features/freeze/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,40 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
return cls(type=AddressType.DN_TOKEN_OFFSET, value=(a.token, a.offset))

elif isinstance(a, capa.features.address.ProcessAddress):
return cls(type=AddressType.PROCESS, value=(a.ppid, a.pid))
if a.id is None:
return cls(type=AddressType.PROCESS, value=(a.ppid, a.pid))
return cls(type=AddressType.PROCESS, value=(a.ppid, a.pid, a.id))

elif isinstance(a, capa.features.address.ThreadAddress):
return cls(type=AddressType.THREAD, value=(a.process.ppid, a.process.pid, a.tid))
if a.process.id is None and a.id is None:
return cls(type=AddressType.THREAD, value=(a.process.ppid, a.process.pid, a.tid))
return cls(
type=AddressType.THREAD,
value=(
a.process.ppid,
a.process.pid,
a.tid,
a.process.id if a.process.id is not None else -1,
a.id if a.id is not None else -1,
),
)

elif isinstance(a, capa.features.address.DynamicCallAddress):
return cls(type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id))
if a.thread.process.id is None and a.thread.id is None:
return cls(
type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id)
)
return cls(
type=AddressType.CALL,
value=(
a.thread.process.ppid,
a.thread.process.pid,
a.thread.tid,
a.id,
a.thread.process.id if a.thread.process.id is not None else -1,
a.thread.id if a.thread.id is not None else -1,
),
)

elif a == capa.features.address.NO_ADDRESS or isinstance(a, capa.features.address._NoAddress):
return cls(type=AddressType.NO_ADDRESS, value=None)
Expand Down Expand Up @@ -137,30 +164,73 @@ def to_capa(self) -> capa.features.address.Address:

elif self.type is AddressType.PROCESS:
assert isinstance(self.value, tuple)
ppid, pid = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
return capa.features.address.ProcessAddress(ppid=ppid, pid=pid)
if len(self.value) == 2:
ppid, pid = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
return capa.features.address.ProcessAddress(ppid=ppid, pid=pid)
elif len(self.value) == 3:
ppid, pid, process_id = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(process_id, int)
return capa.features.address.ProcessAddress(
ppid=ppid, pid=pid, id=process_id if process_id >= 0 else None
)
else:
raise ValueError(f"invalid process address tuple shape: {self.value!r}")

elif self.type is AddressType.THREAD:
assert isinstance(self.value, tuple)
ppid, pid, tid = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
return capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
)
if len(self.value) == 3:
ppid, pid, tid = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
return capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
)
elif len(self.value) == 5:
ppid, pid, tid, process_id, thread_id = self.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
assert isinstance(process_id, int)
assert isinstance(thread_id, int)
return capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(
ppid=ppid, pid=pid, id=process_id if process_id >= 0 else None
),
tid=tid,
id=thread_id if thread_id >= 0 else None,
)
else:
raise ValueError(f"invalid thread address tuple shape: {self.value!r}")

elif self.type is AddressType.CALL:
assert isinstance(self.value, tuple)
ppid, pid, tid, id_ = self.value
return capa.features.address.DynamicCallAddress(
thread=capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
),
id=id_,
)
if len(self.value) == 4:
ppid, pid, tid, id_ = self.value
return capa.features.address.DynamicCallAddress(
thread=capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
),
id=id_,
)
elif len(self.value) == 6:
ppid, pid, tid, id_, process_id, thread_id = self.value
return capa.features.address.DynamicCallAddress(
thread=capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(
ppid=ppid, pid=pid, id=process_id if process_id >= 0 else None
),
tid=tid,
id=thread_id if thread_id >= 0 else None,
),
id=id_,
)
else:
raise ValueError(f"invalid call address tuple shape: {self.value!r}")

elif self.type is AddressType.NO_ADDRESS:
return capa.features.address.NO_ADDRESS
Expand Down
4 changes: 2 additions & 2 deletions capa/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,14 +712,14 @@ def result_rec(result: capa.features.common.Result):
threads_by_process[p.address] = []

for t in extractor.get_threads(p):
calls_by_thread[t.address] = []
calls_by_thread.setdefault(t.address, [])

for c in extractor.get_calls(p, t):
if c.address in matched_calls:
names_by_call[c.address] = extractor.get_call_name(p, t, c)
calls_by_thread[t.address].append(c.address)

if calls_by_thread[t.address]:
if calls_by_thread[t.address] and t.address not in matched_threads:
matched_threads.add(t.address)
threads_by_process[p.address].append(t.address)

Expand Down
21 changes: 18 additions & 3 deletions capa/render/proto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,12 @@ def addr_to_pb2(addr: frz.Address) -> capa_pb2.Address:

elif addr.type is AddressType.PROCESS:
assert isinstance(addr.value, tuple)
ppid, pid = addr.value
if len(addr.value) == 2:
ppid, pid = addr.value
elif len(addr.value) == 3:
ppid, pid, _process_id = addr.value
else:
raise ValueError(f"invalid process address tuple shape: {addr.value!r}")
assert isinstance(ppid, int)
assert isinstance(pid, int)
return capa_pb2.Address(
Expand All @@ -112,7 +117,12 @@ def addr_to_pb2(addr: frz.Address) -> capa_pb2.Address:

elif addr.type is AddressType.THREAD:
assert isinstance(addr.value, tuple)
ppid, pid, tid = addr.value
if len(addr.value) == 3:
ppid, pid, tid = addr.value
elif len(addr.value) == 5:
ppid, pid, tid, _process_id, _thread_id = addr.value
else:
raise ValueError(f"invalid thread address tuple shape: {addr.value!r}")
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
Expand All @@ -127,7 +137,12 @@ def addr_to_pb2(addr: frz.Address) -> capa_pb2.Address:

elif addr.type is AddressType.CALL:
assert isinstance(addr.value, tuple)
ppid, pid, tid, id_ = addr.value
if len(addr.value) == 4:
ppid, pid, tid, id_ = addr.value
elif len(addr.value) == 6:
ppid, pid, tid, id_, _process_id, _thread_id = addr.value
else:
raise ValueError(f"invalid call address tuple shape: {addr.value!r}")
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
Expand Down
Loading
Loading