mandiant · EdoardoAllegrini · Mar 15, 2026 · Mar 15, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini
@@ -86,3 +86,6 @@ ignore_missing_imports = True
 
 [mypy-ghidra.*]
 ignore_missing_imports = True
+
+[mypy-tree_sitter.*]
+ignore_missing_imports = True
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### New Features
 
+- script: add tree-sitter based analysis of script files
 - ghidra: support PyGhidra @mike-hunhoff #2788
 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835
 

diff --git a/capa/features/address.py b/capa/features/address.py
@@ -145,6 +145,26 @@ def __hash__(self):
         return int.__hash__(self)
 
 
+class FileOffsetRangeAddress(Address):
+    """
+    an address range relative to the start of a file
+
+    the range is described by its `start_byte` and `end_byte` offsets.
+    instances compare, order, and hash by the `(start_byte, end_byte)` pair.
+    """
+
+    def __init__(self, start_byte: int, end_byte: int):
+        self.start_byte = start_byte
+        self.end_byte = end_byte
+
+    def __eq__(self, other):
+        # other kinds of address carry no byte offsets; hand the comparison back to
+        # Python instead of failing with AttributeError on the missing attributes.
+        if not isinstance(other, FileOffsetRangeAddress):
+            return NotImplemented
+        return (self.start_byte, self.end_byte) == (other.start_byte, other.end_byte)
+
+    def __lt__(self, other):
+        # implement < so that ranges sort by start offset, then by end offset.
+        # ordering against a foreign type is left to the other operand.
+        if not isinstance(other, FileOffsetRangeAddress):
+            return NotImplemented
+        return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte)
+
+    def __hash__(self):
+        # hash the same pair that __eq__ compares, so equal ranges hash equally.
+        return hash((self.start_byte, self.end_byte))
+
+    def __repr__(self):
+        return f"file(0x{self.start_byte:x}, 0x{self.end_byte:x})"
+
+
 class DNTokenAddress(int, Address):
     """a .NET token"""
 

diff --git a/capa/features/common.py b/capa/features/common.py
@@ -488,10 +488,17 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
         return Result(False, self, [])
 
 
+class ScriptLanguage(Feature):
+    """feature whose value names the language that a script is written in."""
+
+    def __init__(self, value: str, description=None):
+        super().__init__(value, description=description)
+        # set after the base initializer has run so that this label is the one that remains
+        # (presumably the base class assigns its own default name - confirm in Feature.__init__).
+        self.name = "script language"
+
+
 FORMAT_PE = "pe"
 FORMAT_ELF = "elf"
 FORMAT_DOTNET = "dotnet"
-VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
+# script source files, as recognized by capa.features.extractors.ts.autodetect.is_script
+FORMAT_SCRIPT = "script"
+VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_SCRIPT)
 # internal only, not to be used in rules
 FORMAT_AUTO = "auto"
 FORMAT_SC32 = "sc32"

diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py
@@ -35,13 +35,15 @@
     OS_WINDOWS,
     FORMAT_FREEZE,
     FORMAT_RESULT,
+    FORMAT_SCRIPT,
     Arch,
     Format,
     String,
     Feature,
 )
 from capa.features.freeze import is_freeze
 from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress
+from capa.features.extractors.ts.autodetect import is_script
 
 logger = logging.getLogger(__name__)
 
@@ -77,6 +79,8 @@ def extract_format(buf: bytes) -> Iterator[tuple[Feature, Address]]:
         # we don't know what it is exactly, but may support it (e.g. a dynamic CAPE sandbox report)
         # skip verdict here and let subsequent code analyze this further
         return
+    elif is_script(buf):
+        yield Format(FORMAT_SCRIPT), NO_ADDRESS
     else:
         # we likely end up here:
         #  1. handling a file format (e.g. macho)

diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py
@@ -0,0 +1,55 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple, Iterator
+
+from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage
+from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress
+
+# language identifiers.
+# Can be used to instantiate tree_sitter Language objects (see ts/query.py)
+LANG_CS = "c_sharp"
+LANG_HTML = "html"
+LANG_JS = "javascript"
+LANG_PY = "python"
+LANG_TEM = "embedded_template"
+
+# file extensions (without the leading dot), consulted by get_language_from_ext in ts/autodetect.py.
+# NOTE(review): the trailing-underscore variants presumably cover renamed/defanged samples - confirm.
+# NOTE(review): there is no EXT_* entry for LANG_JS, so JavaScript is never selected by extension
+# in the lookup shown in ts/autodetect.py - confirm this is intended.
+EXT_ASPX = ("aspx", "aspx_")
+EXT_CS = ("cs", "cs_")
+EXT_HTML = ("html", "html_")
+EXT_PY = ("py", "py_")
+
+
+# maps a language identifier to the human-readable name that extract_language
+# uses as the value of the ScriptLanguage feature.
+LANGUAGE_FEATURE_FORMAT = {
+    LANG_CS: "C#",
+    LANG_HTML: "HTML",
+    LANG_JS: "JavaScript",
+    LANG_PY: "Python",
+    LANG_TEM: "Embedded Template",
+}
+
+
+def extract_arch() -> Iterator[Tuple[Feature, Address]]:
+    """yield the one architecture feature reported for scripts: any architecture, at no address."""
+    arch = Arch(ARCH_ANY)
+    yield arch, NO_ADDRESS
+
+
+def extract_language(language: str, addr: FileOffsetRangeAddress) -> Iterator[Tuple[Feature, Address]]:
+    """
+    yield the script language feature, located at the given address.
+
+    args:
+      language: language identifier; must be a key of LANGUAGE_FEATURE_FORMAT.
+      addr: the address to report alongside the feature.
+
+    raises:
+      KeyError: upon iteration, when `language` is not a key of LANGUAGE_FEATURE_FORMAT.
+    """
+    display_name = LANGUAGE_FEATURE_FORMAT[language]
+    yield ScriptLanguage(display_name), addr
+
+
+def extract_os() -> Iterator[Tuple[Feature, Address]]:
+    """yield the one operating system feature reported for scripts: any OS, at no address."""
+    operating_system = OS(OS_ANY)
+    yield operating_system, NO_ADDRESS
+
+
+def extract_format() -> Iterator[Tuple[Feature, Address]]:
+    """yield the one file format feature reported for scripts: the script format, at no address."""
+    script_format = Format(FORMAT_SCRIPT)
+    yield script_format, NO_ADDRESS
diff --git a/capa/features/extractors/ts/__init__.py b/capa/features/extractors/ts/__init__.py
diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py
@@ -0,0 +1,79 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+from pathlib import Path
+
+from tree_sitter import Node, Tree, Parser, Language
+
+from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML
+from capa.features.extractors.ts.query import TS_LANGUAGES
+
+
+def is_script(buf: bytes) -> bool:
+    """
+    report whether the buffer is recognized as a script.
+
+    the buffer counts as a script when get_language_ts identifies a language for it;
+    a ValueError from that detection is treated as "not a script".
+    """
+    try:
+        detected = get_language_ts(buf)
+    except ValueError:
+        return False
+    return bool(detected)
+
+
+def _parse(ts_language: Language, buf: bytes) -> Optional[Tree]:
+    """
+    parse the buffer using a parser built for the given tree-sitter language.
+
+    returns the resulting tree, or None when building the parser or parsing
+    raises ValueError.
+    """
+    try:
+        return Parser(ts_language).parse(buf)
+    except ValueError:
+        return None
+
+
+def _contains_errors(ts_language: Language, node: Node) -> bool:
+    """
+    check whether the query for ERROR nodes captures anything under `node`.
+
+    args:
+      ts_language: the tree-sitter language that produced the tree; used to build the query.
+      node: the node to search, typically the root node of a parsed tree.
+
+    returns: True if the `(ERROR)` query yields at least one capture, False otherwise.
+    """
+    # captures() hands back a collection of captures, not a bool;
+    # coerce it so the result agrees with the annotated return type.
+    # NOTE(review): the query only looks for ERROR nodes; tree-sitter also exposes
+    # Node.has_error - consider it if other kinds of parse problems should count.
+    return bool(ts_language.query("(ERROR) @error").captures(node))
+
+
+def get_language_ts(buf: bytes) -> str:
+    """
+    detect the language of a script by trying each grammar registered in TS_LANGUAGES.
+
+    grammars are tried in the iteration order of TS_LANGUAGES;
+    the first one that parses the buffer without ERROR nodes wins.
+
+    args:
+      buf: the raw content of the candidate script.
+
+    returns: the TS_LANGUAGES key of the first grammar that parses `buf` cleanly.
+
+    raises:
+      ValueError: if the buffer is empty or only whitespace, or if no grammar parses it cleanly.
+    """
+    # a blank buffer carries no syntax that a grammar could reject (presumably it parses
+    # without errors everywhere), so a clean parse would merely name the first registered
+    # grammar; refuse it rather than report an arbitrary language.
+    if not buf.strip():
+        raise ValueError("failed to parse the language")
+
+    for language, ts_language in TS_LANGUAGES.items():
+        tree = _parse(ts_language, buf)
+        if tree and not _contains_errors(ts_language, tree.root_node):
+            return language
+    raise ValueError("failed to parse the language")
+
+
+def get_template_language_ts(buf: bytes) -> str:
+    """
+    detect the language of the given buffer, ignoring the template and HTML grammars.
+
+    every other grammar registered in TS_LANGUAGES is tried in iteration order;
+    the first one that parses the buffer without ERROR nodes wins.
+
+    returns: the TS_LANGUAGES key of the first grammar that parses `buf` cleanly.
+
+    raises:
+      ValueError: if none of the remaining grammars parses the buffer cleanly.
+    """
+    skipped = (LANG_TEM, LANG_HTML)
+    for name, grammar in TS_LANGUAGES.items():
+        if name not in skipped:
+            parsed = _parse(grammar, buf)
+            if parsed and not _contains_errors(grammar, parsed.root_node):
+                return name
+    raise ValueError("failed to parse the language")
+
+
+def get_language_from_ext(path: str) -> str:
+    """
+    pick the language of a script from the extension of its path.
+
+    args:
+      path: path of the script, as a string.
+
+    returns: the language identifier (LANG_*) associated with the extension.
+
+    raises:
+      ValueError: if the path does not end with one of the known extensions.
+    """
+    # the EXT_* tuples hold bare extensions; require the "." separator in front so that
+    # only a real extension matches (e.g. "happy" must not be mistaken for a ".py" file).
+    for extensions, language in (
+        (EXT_ASPX, LANG_TEM),
+        (EXT_CS, LANG_CS),
+        (EXT_HTML, LANG_HTML),
+        (EXT_PY, LANG_PY),
+    ):
+        if path.endswith(tuple(f".{ext}" for ext in extensions)):
+            return language
+    raise ValueError(f"{path} has an unrecognized or an unsupported extension.")
+
+
+def get_language(path: Path) -> str:
+    """
+    detect the language of the script stored at the given path.
+
+    the file content is tried first, via get_language_ts; when that raises ValueError,
+    the decision falls back to the extension of the path, via get_language_from_ext.
+
+    raises:
+      ValueError: if the extension fallback does not recognize the path either.
+    """
+    try:
+        language = get_language_ts(path.read_bytes())
+    except ValueError:
+        language = get_language_from_ext(str(path))
+    return language