-
Notifications
You must be signed in to change notification settings - Fork 696
Expand file tree
/
Copy path__init__.py
More file actions
2383 lines (1997 loc) · 95.2 KB
/
__init__.py
File metadata and controls
2383 lines (1997 loc) · 95.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import re
import copy
import uuid
import struct
import logging
import binascii
import collections
from enum import Enum
from pathlib import Path
from capa.helpers import assert_never
try:
from functools import lru_cache
except ImportError:
# need to type ignore this due to mypy bug here (duplicate name):
# https://github.com/python/mypy/issues/1153
from backports.functools_lru_cache import lru_cache # type: ignore
from typing import Any, Union, Callable, Iterator, Optional, cast
from dataclasses import asdict, dataclass
import yaml
import pydantic
import yaml.parser
import capa.perf
import capa.engine as ceng
import capa.features
import capa.optimizer
import capa.features.com
import capa.features.file
import capa.features.insn
import capa.features.common
import capa.features.basicblock
from capa.engine import Statement, FeatureSet
from capa.features.com import ComType
from capa.features.common import MAX_BYTES_FEATURE_SIZE, Feature
from capa.features.address import Address
logger = logging.getLogger(__name__)
# Fixed prefix size used to pre-filter extracted bytes features.
# This narrows candidate selection from all extracted bytes to those
# sharing a common 4-byte prefix while keeping the implementation simple.
# See: https://github.com/mandiant/capa/issues/2128
_BYTES_PREFIX_SIZE = 4
# these are the standard metadata fields, in the preferred order.
# when reformatted, any custom keys will come after these.
META_KEYS = (
"name",
"namespace",
"maec/analysis-conclusion",
"maec/analysis-conclusion-ov",
"maec/malware-family",
"maec/malware-category",
"maec/malware-category-ov",
"authors",
"description",
"lib",
"scopes",
"att&ck",
"mbc",
"references",
"examples",
)
# these are meta fields that are internal to capa,
# and added during rule reading/construction.
# they may help use manipulate or index rules,
# but should not be exposed to clients.
HIDDEN_META_KEYS = ("capa/nursery", "capa/path")
class Scope(str, Enum):
FILE = "file"
PROCESS = "process"
THREAD = "thread"
SPAN_OF_CALLS = "span of calls"
CALL = "call"
FUNCTION = "function"
BASIC_BLOCK = "basic block"
INSTRUCTION = "instruction"
# used only to specify supported features per scope.
# not used to validate rules.
GLOBAL = "global"
@classmethod
def to_yaml(cls, representer, node):
return representer.represent_str(f"{node.value}")
# these literals are used to check if the flavor
# of a rule is correct.
STATIC_SCOPES = {
Scope.FILE,
Scope.GLOBAL,
Scope.FUNCTION,
Scope.BASIC_BLOCK,
Scope.INSTRUCTION,
}
DYNAMIC_SCOPES = {
Scope.FILE,
Scope.GLOBAL,
Scope.PROCESS,
Scope.THREAD,
Scope.SPAN_OF_CALLS,
Scope.CALL,
}
@dataclass
class Scopes:
# when None, the scope is not supported by a rule
static: Optional[Scope] = None
# when None, the scope is not supported by a rule
dynamic: Optional[Scope] = None
def __contains__(self, scope: Scope) -> bool:
return (scope == self.static) or (scope == self.dynamic)
def __repr__(self) -> str:
if self.static and self.dynamic:
return f"static-scope: {self.static}, dynamic-scope: {self.dynamic}"
elif self.static:
return f"static-scope: {self.static}"
elif self.dynamic:
return f"dynamic-scope: {self.dynamic}"
else:
raise ValueError("invalid rules class. at least one scope must be specified")
@classmethod
def from_dict(self, scopes: dict[str, str]) -> "Scopes":
# make local copy so we don't make changes outside of this routine.
# we'll use the value None to indicate the scope is not supported.
scopes_: dict[str, Optional[str]] = dict(scopes)
# mark non-specified scopes as invalid
if "static" not in scopes_:
raise InvalidRule("static scope must be provided")
if "dynamic" not in scopes_:
raise InvalidRule("dynamic scope must be provided")
# check the syntax of the meta `scopes` field
if sorted(scopes_) != ["dynamic", "static"]:
raise InvalidRule("scope flavors can be either static or dynamic")
if scopes_["static"] == "unsupported":
scopes_["static"] = None
if scopes_["dynamic"] == "unsupported":
scopes_["dynamic"] = None
if (not scopes_["static"]) and (not scopes_["dynamic"]):
raise InvalidRule("invalid scopes value. At least one scope must be specified")
# check that all the specified scopes are valid
if scopes_["static"] and scopes_["static"] not in STATIC_SCOPES:
raise InvalidRule(f"{scopes_['static']} is not a valid static scope")
if scopes_["dynamic"] and scopes_["dynamic"] not in DYNAMIC_SCOPES:
raise InvalidRule(f"{scopes_['dynamic']} is not a valid dynamic scope")
return Scopes(
static=Scope(scopes_["static"]) if scopes_["static"] else None,
dynamic=Scope(scopes_["dynamic"]) if scopes_["dynamic"] else None,
)
SUPPORTED_FEATURES: dict[str, set] = {
Scope.GLOBAL: {
# these will be added to other scopes, see below.
capa.features.common.OS,
capa.features.common.Arch,
capa.features.common.Format,
},
Scope.FILE: {
capa.features.common.MatchedRule,
capa.features.file.Export,
capa.features.file.Import,
capa.features.file.Section,
capa.features.file.FunctionName,
capa.features.common.Characteristic("embedded pe"),
capa.features.common.String,
capa.features.common.Class,
capa.features.common.Namespace,
capa.features.common.Characteristic("mixed mode"),
capa.features.common.Characteristic("forwarded export"),
},
Scope.PROCESS: {
capa.features.common.MatchedRule,
},
Scope.THREAD: set(),
Scope.SPAN_OF_CALLS: set(),
Scope.CALL: {
capa.features.common.MatchedRule,
capa.features.common.Regex,
capa.features.common.String,
capa.features.common.Substring,
capa.features.insn.API,
capa.features.insn.Number,
},
Scope.FUNCTION: {
capa.features.common.MatchedRule,
capa.features.basicblock.BasicBlock,
capa.features.common.Characteristic("calls from"),
capa.features.common.Characteristic("calls to"),
capa.features.common.Characteristic("loop"),
capa.features.common.Characteristic("recursive call"),
# plus basic block scope features, see below
},
Scope.BASIC_BLOCK: {
capa.features.common.MatchedRule,
capa.features.common.Characteristic("tight loop"),
capa.features.common.Characteristic("stack string"),
# plus instruction scope features, see below
},
Scope.INSTRUCTION: {
capa.features.common.MatchedRule,
capa.features.insn.API,
capa.features.insn.Property,
capa.features.insn.Number,
capa.features.common.String,
capa.features.common.Bytes,
capa.features.insn.Offset,
capa.features.insn.Mnemonic,
capa.features.insn.OperandNumber,
capa.features.insn.OperandOffset,
capa.features.common.Characteristic("nzxor"),
capa.features.common.Characteristic("peb access"),
capa.features.common.Characteristic("fs access"),
capa.features.common.Characteristic("gs access"),
capa.features.common.Characteristic("indirect call"),
capa.features.common.Characteristic("call $+5"),
capa.features.common.Characteristic("cross section flow"),
capa.features.common.Characteristic("unmanaged call"),
capa.features.common.Class,
capa.features.common.Namespace,
},
}
# global scope features are available in all other scopes
SUPPORTED_FEATURES[Scope.INSTRUCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.FILE].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.PROCESS].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.THREAD].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.SPAN_OF_CALLS].update(SUPPORTED_FEATURES[Scope.GLOBAL])
SUPPORTED_FEATURES[Scope.CALL].update(SUPPORTED_FEATURES[Scope.GLOBAL])
# all call scope features are also span-of-calls features
SUPPORTED_FEATURES[Scope.SPAN_OF_CALLS].update(SUPPORTED_FEATURES[Scope.CALL])
# all span-of-calls scope features (and therefore, call features) are also thread features
SUPPORTED_FEATURES[Scope.THREAD].update(SUPPORTED_FEATURES[Scope.SPAN_OF_CALLS])
# all thread scope features are also process features
SUPPORTED_FEATURES[Scope.PROCESS].update(SUPPORTED_FEATURES[Scope.THREAD])
# all instruction scope features are also basic block features
SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.INSTRUCTION])
# all basic block scope features are also function scope features
SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK])
class InvalidRule(ValueError):
def __str__(self):
return f"invalid rule: {super().__str__()}"
def __repr__(self):
return str(self)
class InvalidRuleWithPath(InvalidRule):
def __init__(self, path, msg):
super().__init__(msg)
self.path = path
self.__cause__ = None
def __str__(self):
return f"invalid rule: {self.path}: {super(InvalidRule, self).__str__()}"
class InvalidRuleSet(ValueError):
def __str__(self):
return f"invalid rule set: {super().__str__()}"
def __repr__(self):
return str(self)
def ensure_feature_valid_for_scopes(scopes: Scopes, feature: Union[Feature, Statement]):
# construct a dict of all supported features
supported_features: set = set()
if scopes.static:
supported_features.update(SUPPORTED_FEATURES[scopes.static])
if scopes.dynamic:
supported_features.update(SUPPORTED_FEATURES[scopes.dynamic])
# if the given feature is a characteristic,
# check that is a valid characteristic for the given scope.
if (
isinstance(feature, capa.features.common.Characteristic)
and isinstance(feature.value, str)
and capa.features.common.Characteristic(feature.value) not in supported_features
):
raise InvalidRule(f"feature {feature} not supported for scopes {scopes}")
if not isinstance(feature, capa.features.common.Characteristic):
# features of this scope that are not Characteristics will be Type instances.
# check that the given feature is one of these types.
types_for_scope = filter(lambda t: isinstance(t, type), supported_features)
if not isinstance(feature, tuple(types_for_scope)):
raise InvalidRule(f"feature {feature} not supported for scopes {scopes}")
def translate_com_feature(com_name: str, com_type: ComType) -> ceng.Statement:
com_db = capa.features.com.load_com_database(com_type)
guids: Optional[list[str]] = com_db.get(com_name)
if not guids:
logger.error(" %s doesn't exist in COM %s database", com_name, com_type)
raise InvalidRule(f"'{com_name}' doesn't exist in COM {com_type} database")
com_features: list[Feature] = []
for guid in guids:
hex_chars = guid.replace("-", "")
h = [hex_chars[i : i + 2] for i in range(0, len(hex_chars), 2)]
reordered_hex_pairs = [
h[3],
h[2],
h[1],
h[0],
h[5],
h[4],
h[7],
h[6],
h[8],
h[9],
h[10],
h[11],
h[12],
h[13],
h[14],
h[15],
]
guid_bytes = bytes.fromhex("".join(reordered_hex_pairs))
prefix = capa.features.com.COM_PREFIXES[com_type]
symbol = prefix + com_name
com_features.append(capa.features.common.String(guid, f"{symbol} as GUID string"))
com_features.append(capa.features.common.Bytes(guid_bytes, f"{symbol} as bytes"))
return ceng.Or(com_features)
def parse_int(s: str) -> int:
if s.startswith("0x"):
return int(s, 0x10)
else:
return int(s, 10)
def parse_range(s: str):
"""
parse a string "(0, 1)" into a range (min, max).
min and/or max may by None to indicate an unbound range.
"""
# we want to use `{` characters, but this is a dict in yaml.
if not s.startswith("("):
raise InvalidRule(f"invalid range: {s}")
if not s.endswith(")"):
raise InvalidRule(f"invalid range: {s}")
s = s[len("(") : -len(")")]
min_spec, _, max_spec = s.partition(",")
min_spec = min_spec.strip()
max_spec = max_spec.strip()
min_ = None
if min_spec:
min_ = parse_int(min_spec)
if min_ < 0:
raise InvalidRule("range min less than zero")
max_ = None
if max_spec:
max_ = parse_int(max_spec)
if max_ < 0:
raise InvalidRule("range max less than zero")
if min_ is not None and max_ is not None:
if max_ < min_:
raise InvalidRule("range max less than min")
return min_, max_
def parse_feature(key: str):
# keep this in sync with supported features
if key == "api":
return capa.features.insn.API
elif key == "string":
return capa.features.common.StringFactory
elif key == "substring":
return capa.features.common.Substring
elif key == "bytes":
return capa.features.common.Bytes
elif key == "number":
return capa.features.insn.Number
elif key == "offset":
return capa.features.insn.Offset
elif key == "mnemonic":
return capa.features.insn.Mnemonic
elif key == "basic blocks":
return capa.features.basicblock.BasicBlock
elif key == "characteristic":
return capa.features.common.Characteristic
elif key == "export":
return capa.features.file.Export
elif key == "import":
return capa.features.file.Import
elif key == "section":
return capa.features.file.Section
elif key == "match":
return capa.features.common.MatchedRule
elif key == "function-name":
return capa.features.file.FunctionName
elif key == "os":
return capa.features.common.OS
elif key == "format":
return capa.features.common.Format
elif key == "arch":
return capa.features.common.Arch
elif key == "class":
return capa.features.common.Class
elif key == "namespace":
return capa.features.common.Namespace
elif key == "property":
return capa.features.insn.Property
else:
raise InvalidRule(f"unexpected statement: {key}")
# this is the separator between a feature value and its description
# when using the inline description syntax, like:
#
# number: 42 = ENUM_FAVORITE_NUMBER
DESCRIPTION_SEPARATOR = " = "
def parse_bytes(s: str) -> bytes:
try:
b = bytes.fromhex(s.replace(" ", ""))
except binascii.Error:
raise InvalidRule(f'unexpected bytes value: must be a valid hex sequence: "{s}"')
if len(b) > MAX_BYTES_FEATURE_SIZE:
raise InvalidRule(
f"unexpected bytes value: byte sequences must be no larger than {MAX_BYTES_FEATURE_SIZE} bytes"
)
return b
def parse_description(s: Union[str, int, bytes], value_type: str, description=None):
if value_type == "string":
# string features cannot have inline descriptions,
# so we assume the entire value is the string,
# like: `string: foo = bar` -> "foo = bar"
value = s
else:
# other features can have inline descriptions, like `number: 10 = CONST_FOO`.
# in this case, the RHS will be like `10 = CONST_FOO` or some other string
if isinstance(s, str):
if DESCRIPTION_SEPARATOR in s:
if description:
# there is already a description passed in as a sub node, like:
#
# - number: 10 = CONST_FOO
# description: CONST_FOO
raise InvalidRule(
f'unexpected value: "{s}", only one description allowed (inline description with `{DESCRIPTION_SEPARATOR}`)'
)
value, _, description = s.partition(DESCRIPTION_SEPARATOR)
if description == "":
# sanity check:
# there is an empty description, like `number: 10 =`
raise InvalidRule(f'unexpected value: "{s}", description cannot be empty')
else:
# this is a string, but there is no description,
# like: `api: CreateFileA`
value = s
# cast from the received string value to the appropriate type.
#
# without a description, this type would already be correct,
# but since we parsed the description from a string,
# we need to convert the value to the expected type.
#
# for example, from `number: 10 = CONST_FOO` we have
# the string "10" that needs to become the number 10.
if value_type == "bytes":
value = parse_bytes(value)
elif (
value_type in ("number", "offset")
or value_type.startswith(("number/", "offset/"))
or (
value_type.startswith("operand[")
and (value_type.endswith("].number") or value_type.endswith("].offset"))
)
):
try:
value = parse_int(value)
except ValueError:
raise InvalidRule(f'unexpected value: "{value}", must begin with numerical value')
else:
# the value might be a number, like: `number: 10`
value = s
return value, description
def pop_statement_description_entry(d):
"""
extracts the description for statements and removes the description entry from the document
a statement can only have one description
example:
the features definition
- or:
- description: statement description
- number: 1
description: feature description
becomes
<statement>: [
{ "description": "statement description" }, <-- extracted here
{ "number": 1, "description": "feature description" }
]
"""
if not isinstance(d, list):
return None
# identify child of form '{ "description": <description> }'
descriptions = list(filter(lambda c: isinstance(c, dict) and len(c) == 1 and "description" in c, d))
if len(descriptions) > 1:
raise InvalidRule("statements can only have one description")
if not descriptions:
return None
description = descriptions[0]
d.remove(description)
return description["description"]
def trim_dll_part(api: str) -> str:
# ordinal imports, like ws2_32.#1, keep dll
if ".#" in api:
return api
# kernel32.CreateFileA
if api.count(".") == 1:
if "::" not in api:
# skip System.Convert::FromBase64String
api = api.split(".")[1]
return api
def unique(sequence):
"""deduplicate the items in the given sequence, returning a list with the same order.
via: https://stackoverflow.com/a/58666031
"""
seen = set()
return [x for x in sequence if not (x in seen or seen.add(x))] # type: ignore [func-returns-value]
STATIC_SCOPE_ORDER = [
Scope.FILE,
Scope.FUNCTION,
Scope.BASIC_BLOCK,
Scope.INSTRUCTION,
]
DYNAMIC_SCOPE_ORDER = [
Scope.FILE,
Scope.PROCESS,
Scope.THREAD,
Scope.SPAN_OF_CALLS,
Scope.CALL,
]
def is_subscope_compatible(scope: Scope | None, subscope: Scope) -> bool:
if not scope:
return False
if subscope in STATIC_SCOPE_ORDER:
try:
return STATIC_SCOPE_ORDER.index(subscope) >= STATIC_SCOPE_ORDER.index(scope)
except ValueError:
return False
elif subscope in DYNAMIC_SCOPE_ORDER:
try:
return DYNAMIC_SCOPE_ORDER.index(subscope) >= DYNAMIC_SCOPE_ORDER.index(scope)
except ValueError:
return False
else:
raise ValueError("unexpected scope")
def build_statements(d, scopes: Scopes):
if len(d.keys()) > 2:
raise InvalidRule("too many statements")
key = list(d.keys())[0]
description = pop_statement_description_entry(d[key])
if key == "and":
return ceng.And(unique(build_statements(dd, scopes) for dd in d[key]), description=description)
elif key == "or":
return ceng.Or(unique(build_statements(dd, scopes) for dd in d[key]), description=description)
elif key == "not":
if len(d[key]) != 1:
raise InvalidRule("not statement must have exactly one child statement")
return ceng.Not(build_statements(d[key][0], scopes), description=description)
elif key.endswith(" or more"):
count = int(key[: -len("or more")])
return ceng.Some(count, unique(build_statements(dd, scopes) for dd in d[key]), description=description)
elif key == "optional":
# `optional` is an alias for `0 or more`
# which is useful for documenting behaviors,
# like with `write file`, we might say that `WriteFile` is optionally found alongside `CreateFileA`.
return ceng.Some(0, unique(build_statements(dd, scopes) for dd in d[key]), description=description)
elif key == "process":
if not is_subscope_compatible(scopes.dynamic, Scope.PROCESS):
raise InvalidRule("`process` subscope supported only for `file` scope")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.PROCESS, build_statements(d[key][0], Scopes(dynamic=Scope.PROCESS)), description=description
)
elif key == "thread":
if not is_subscope_compatible(scopes.dynamic, Scope.THREAD):
raise InvalidRule("`thread` subscope supported only for the `process` scope")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.THREAD, build_statements(d[key][0], Scopes(dynamic=Scope.THREAD)), description=description
)
elif key == "span of calls":
if not is_subscope_compatible(scopes.dynamic, Scope.SPAN_OF_CALLS):
raise InvalidRule("`span of calls` subscope supported only for the `process` and `thread` scopes")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.SPAN_OF_CALLS,
build_statements(d[key][0], Scopes(dynamic=Scope.SPAN_OF_CALLS)),
description=description,
)
elif key == "call":
if not is_subscope_compatible(scopes.dynamic, Scope.CALL):
raise InvalidRule("`call` subscope supported only for the `process`, `thread`, and `call` scopes")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.CALL, build_statements(d[key][0], Scopes(dynamic=Scope.CALL)), description=description
)
elif key == "function":
if not is_subscope_compatible(scopes.static, Scope.FUNCTION):
raise InvalidRule("`function` subscope supported only for `file` scope")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.FUNCTION, build_statements(d[key][0], Scopes(static=Scope.FUNCTION)), description=description
)
elif key == "basic block":
if not is_subscope_compatible(scopes.static, Scope.BASIC_BLOCK):
raise InvalidRule("`basic block` subscope supported only for `function` scope")
if len(d[key]) != 1:
raise InvalidRule("subscope must have exactly one child statement")
return ceng.Subscope(
Scope.BASIC_BLOCK, build_statements(d[key][0], Scopes(static=Scope.BASIC_BLOCK)), description=description
)
elif key == "instruction":
if not is_subscope_compatible(scopes.static, Scope.INSTRUCTION):
raise InvalidRule("`instruction` subscope supported only for `function` and `basic block` scope")
if len(d[key]) == 1:
statements = build_statements(d[key][0], Scopes(static=Scope.INSTRUCTION))
else:
# for instruction subscopes, we support a shorthand in which the top level AND is implied.
# the following are equivalent:
#
# - instruction:
# - and:
# - arch: i386
# - mnemonic: cmp
#
# - instruction:
# - arch: i386
# - mnemonic: cmp
#
statements = ceng.And(unique(build_statements(dd, Scopes(static=Scope.INSTRUCTION)) for dd in d[key]))
return ceng.Subscope(Scope.INSTRUCTION, statements, description=description)
elif key.startswith("count(") and key.endswith(")"):
# e.g.:
#
# count(basic block)
# count(mnemonic(mov))
# count(characteristic(nzxor))
term = key[len("count(") : -len(")")]
# when looking for the existence of such a feature, our rule might look like:
# - mnemonic: mov
#
# but here we deal with the form: `mnemonic(mov)`.
term, _, arg = term.partition("(")
Feature = parse_feature(term)
if arg:
arg = arg[: -len(")")]
# can't rely on yaml parsing ints embedded within strings
# like:
#
# count(offset(0xC))
# count(number(0x11223344))
# count(number(0x100 = description))
if term != "string":
value, description = parse_description(arg, term)
if term == "api":
value = trim_dll_part(value)
feature = Feature(value, description=description)
else:
# arg is string (which doesn't support inline descriptions), like:
#
# count(string(error))
#
# known problem that embedded newlines may not work here?
# this may become a problem (or not), so address it when encountered.
feature = Feature(arg)
else:
feature = Feature()
ensure_feature_valid_for_scopes(scopes, feature)
count = d[key]
if isinstance(count, int):
return ceng.Range(feature, min=count, max=count, description=description)
elif count.endswith(" or more"):
min = parse_int(count[: -len(" or more")])
max = None
return ceng.Range(feature, min=min, max=max, description=description)
elif count.endswith(" or fewer"):
min = None
max = parse_int(count[: -len(" or fewer")])
return ceng.Range(feature, min=min, max=max, description=description)
elif count.startswith("("):
min, max = parse_range(count)
return ceng.Range(feature, min=min, max=max, description=description)
else:
raise InvalidRule(f"unexpected range: {count}")
elif key == "string" and not isinstance(d[key], str):
raise InvalidRule(f"ambiguous string value {d[key]}, must be defined as explicit string")
elif key.startswith("operand[") and key.endswith("].number"):
index = key[len("operand[") : -len("].number")]
try:
index = int(index)
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(d[key], key, d.get("description"))
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandNumber(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif key.startswith("operand[") and key.endswith("].offset"):
index = key[len("operand[") : -len("].offset")]
try:
index = int(index)
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(d[key], key, d.get("description"))
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandOffset(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif (
(key == "os" and d[key] not in capa.features.common.VALID_OS)
or (key == "format" and d[key] not in capa.features.common.VALID_FORMAT)
or (key == "arch" and d[key] not in capa.features.common.VALID_ARCH)
):
raise InvalidRule(f"unexpected {key} value {d[key]}")
elif key.startswith("property/"):
access = key[len("property/") :]
if access not in capa.features.common.VALID_FEATURE_ACCESS:
raise InvalidRule(f"unexpected {key} access {access}")
value, description = parse_description(d[key], key, d.get("description"))
try:
feature = capa.features.insn.Property(value, access=access, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif key.startswith("com/"):
com_type_name = str(key[len("com/") :])
try:
com_type = ComType(com_type_name)
except ValueError:
raise InvalidRule(f"unexpected COM type: {com_type_name}")
value, description = parse_description(d[key], key, d.get("description"))
return translate_com_feature(value, com_type)
else:
Feature = parse_feature(key)
value, description = parse_description(d[key], key, d.get("description"))
if key == "api":
value = trim_dll_part(value)
try:
feature = Feature(value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
def first(s: list[Any]) -> Any:
return s[0]
def second(s: list[Any]) -> Any:
return s[1]
class Rule:
def __init__(self, name: str, scopes: Scopes, statement: Statement, meta, definition=""):
super().__init__()
self.name = name
self.scopes = scopes
self.statement = statement
self.meta = meta
self.definition = definition
def __str__(self):
return f"Rule(name={self.name})"
def __repr__(self):
return f"Rule(scope={self.scopes}, name={self.name})"
def get_dependencies(self, namespaces: dict[str, list["Rule"]]) -> set[str]:
"""
fetch the names of rules this rule relies upon.
these are only the direct dependencies; a user must
compute the transitive dependency graph themself, if they want it.
Args:
namespaces: mapping from namespace name to rules in it.
see `index_rules_by_namespace`.
Returns:
set[str]: names of rules upon which this rule depends.
"""
deps: set[str] = set()
def rec(statement):
if isinstance(statement, capa.features.common.MatchedRule):
# we're not sure at this point if the `statement.value` is
# really a rule name or a namespace name (we use `MatchedRule` for both cases).
# we'll give precedence to namespaces, and then assume if that does work,
# that it must be a rule name.
#
# we don't expect any collisions between namespaces and rule names, but it's possible.
# most likely would be collision between top level namespace (e.g. `host-interaction`) and rule name.
# but, namespaces tend to use `-` while rule names use ` `. so, unlikely, but possible.
if statement.value in namespaces:
# matches a namespace, so take precedence and don't even check rule names.
assert isinstance(statement.value, str)
deps.update(r.name for r in namespaces[statement.value])
else:
# not a namespace, assume it's a rule name.
assert isinstance(statement.value, str)
deps.add(statement.value)
elif isinstance(statement, ceng.Statement):
for child in statement.get_children():
rec(child)
# else: might be a Feature, etc.
# which we don't care about here.
rec(self.statement)
return deps
def _extract_subscope_rules_rec(self, statement):
if isinstance(statement, ceng.Statement):
# for each child that is a subscope,
for child in statement.get_children():
if not isinstance(child, ceng.Subscope):
continue
subscope = child
# create a new rule from it.
# the name is a randomly generated, hopefully unique value.
# ideally, this won't every be rendered to a user.
name = self.name + "/" + uuid.uuid4().hex
if subscope.scope in STATIC_SCOPES:
scopes = Scopes(static=subscope.scope)
elif subscope.scope in DYNAMIC_SCOPES:
scopes = Scopes(dynamic=subscope.scope)
else:
raise InvalidRule(f"scope {subscope.scope} is not a valid subscope")
new_rule = Rule(
name,
scopes,
subscope.child,
{
"name": name,
"scopes": asdict(scopes),
# these derived rules are never meant to be inspected separately,
# they are dependencies for the parent rule,
# so mark it as such.
"lib": True,
# metadata that indicates this is derived from a subscope statement
"capa/subscope-rule": True,
# metadata that links the child rule the parent rule
"capa/parent": self.name,
},
)
# update the existing statement to `match` the new rule
new_node = capa.features.common.MatchedRule(name)
statement.replace_child(subscope, new_node)
# and yield the new rule to our caller
yield new_rule
# now recurse to other nodes in the logic tree.
# note: we cannot recurse into the subscope sub-tree,
# because its been replaced by a `match` statement.