Refactor CrossChannelExtractor into driver with pluggable synthesizers

max-rosenblattl · max-rosenblattl · commit e457736a8e77 · 2026-04-11T23:23:51.000-07:00
diff --git a/captionizer.py b/captionizer.py
@@ -54,6 +54,7 @@ def run(
     from mhc.transformer import MHCTransformer
     from mhc.constants import MHC_CHANNEL_CONFIG
     from extractors.cross_channel import CrossChannelExtractor
+    from synthesizers.sleep import SleepSynthesizer
     from extractors.statistical import StatisticalExtractor
     from extractors.structural import StructuralExtractor
     from models.local import LocalConfig, LocalModel
@@ -67,7 +68,7 @@ def run(
         StatisticalExtractor(MHC_CHANNEL_CONFIG),
         StructuralExtractor(MHC_CHANNEL_CONFIG),
         SemanticExtractor(MHC_CHANNEL_CONFIG),
-        CrossChannelExtractor(MHC_CHANNEL_CONFIG),
+        CrossChannelExtractor(MHC_CHANNEL_CONFIG, synthesizers=[SleepSynthesizer()]),
     ])
 
     captionizer = Captionizer(dataset, MHCTransformer(), annotator)
diff --git a/explorer.py b/explorer.py
@@ -18,6 +18,7 @@
 from annotator import Annotator
 from extractors import ChannelConfig
 from extractors.cross_channel import CrossChannelExtractor
+from synthesizers.sleep import SleepSynthesizer
 from extractors.statistical import StatisticalExtractor
 from extractors.structural import StructuralExtractor
 from mhc.constants import MHC_CHANNEL_CONFIG
@@ -91,7 +92,7 @@ def __init__(
         self.annotator = Annotator([
             StatisticalExtractor(channel_config),
             StructuralExtractor(channel_config),
-            CrossChannelExtractor(channel_config),
+            CrossChannelExtractor(channel_config, synthesizers=[SleepSynthesizer()]),
         ])
 
         self.row_index = min(max(0, row_index), len(self.dataset) - 1)
diff --git a/extractors/cross_channel.py b/extractors/cross_channel.py
@@ -6,57 +6,32 @@
 #
 from __future__ import annotations
 
-import numpy as np
-
 from extractors import CaptionExtractor, ChannelConfig
+from synthesizers import CrossChannelSynthesizer
 from timef.schema import Annotation, Recording
 
 
 class CrossChannelExtractor(CaptionExtractor):
+    """Driver that gathers cross-channel annotations from pluggable synthesizers."""
+
     caption_type = "cross_channel"
 
-    def __init__(self, config: ChannelConfig, min_duration: int = 5):
+    def __init__(self, config: ChannelConfig, synthesizers: list[CrossChannelSynthesizer]):
+        """Store the channel config and the synthesizers to run.
+
+        Args:
+            config: Channel configuration handed to every synthesizer.
+            synthesizers: Synthesizers invoked, in order, for each recording.
+        """
         super().__init__(config)
-        self.min_duration = min_duration
+        # Materialise once: a generator argument would otherwise be exhausted
+        # after the first recording, and later edits to the caller's list
+        # would silently change this extractor.
+        self.synthesizers = list(synthesizers)
 
     def extract(self, row: Recording) -> list[Annotation]:
-        try:
-            in_bed_idx = row.channel_names.index("sleep:inbed")
-            asleep_idx = row.channel_names.index("sleep:asleep")
-        except ValueError:
-            return []
-
-        in_bed = np.asarray(row.values[in_bed_idx], dtype=float)
-        asleep = np.asarray(row.values[asleep_idx], dtype=float)
-        if not np.any((~np.isnan(asleep)) & (asleep > 0)):
-            return []
-        mask = (~np.isnan(in_bed)) & (in_bed > 0) & ~((~np.isnan(asleep)) & (asleep > 0))
-
+        """Return every synthesizer's annotations for ``row``, in synthesizer order."""
         results: list[Annotation] = []
-        for start, end in _contiguous_windows(mask, self.min_duration):
-            end_inclusive = max(start, end - 1)
-            results.append(
-                Annotation(
-                    caption_type=self.caption_type,
-                    text=f"In bed but not sleeping from {self._time_label(start)} {start} to {self._time_label(end_inclusive)} {end_inclusive}.",
-                    channel_idxs=(asleep_idx, in_bed_idx),
-                    window=(start, end),
-                    label="in_bed_not_sleeping",
-                )
-            )
+        for synth in self.synthesizers:
+            results.extend(synth.synthesize(row, self.config))
         return results
-
-    def _time_label(self, value: int) -> str:
-        return "hour" if self.config.time_unit == "hours" else "minute"
-
-
-def _contiguous_windows(mask: np.ndarray, min_duration: int) -> list[tuple[int, int]]:
-    if not mask.any():
-        return []
-
-    padded = np.concatenate(([False], mask, [False]))
-    diffs = np.diff(padded.astype(np.int8))
-    starts = np.where(diffs == 1)[0]
-    ends = np.where(diffs == -1)[0]
-    keep = (ends - starts) >= min_duration
-    return list(zip(starts[keep].tolist(), ends[keep].tolist()))
diff --git a/synthesizers/__init__.py b/synthesizers/__init__.py
@@ -0,0 +1,55 @@
+#
+# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
+# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
+#
+# SPDX-License-Identifier: MIT
+#
+from __future__ import annotations
+
+import abc
+
+import numpy as np
+
+from extractors import ChannelConfig
+from timef.schema import Annotation, Recording
+
+
+class CrossChannelSynthesizer(abc.ABC):
+    """Interface for components that derive annotations from one recording."""
+
+    @abc.abstractmethod
+    def synthesize(self, row: Recording, config: ChannelConfig) -> list[Annotation]:
+        """Return the annotations this synthesizer derives from ``row``.
+
+        Args:
+            row: Recording to inspect.
+            config: Channel configuration supplied by the caller.
+        """
+
+
+def contiguous_windows(mask: np.ndarray, min_duration: int) -> list[tuple[int, int]]:
+    """Find runs of truthy samples at least ``min_duration`` samples long.
+
+    Args:
+        mask: Per-sample flags; expected to be one-dimensional. Elements are
+            interpreted by truthiness, so bool, int and float masks all work.
+        min_duration: Minimum run length, in samples, for a run to be kept.
+
+    Returns:
+        Half-open ``(start, end)`` index pairs in ascending order: ``start``
+        is the first sample of a run and ``end`` is one past its last.
+    """
+    # Coerce to bool first: with an integer mask such as [0, 2, 2, 0] the edge
+    # detection below would see steps of +2/-2 and silently report no window.
+    flags = np.asarray(mask, dtype=bool)
+    if not flags.any():
+        return []
+
+    # Pad with False on both sides so runs touching either end of the mask
+    # still produce a rising and a falling edge.
+    padded = np.concatenate(([False], flags, [False]))
+    edges = np.diff(padded.astype(np.int8))
+    starts = np.where(edges == 1)[0]
+    ends = np.where(edges == -1)[0]
+    keep = (ends - starts) >= min_duration
+    return list(zip(starts[keep].tolist(), ends[keep].tolist()))
diff --git a/synthesizers/sleep.py b/synthesizers/sleep.py
@@ -0,0 +1,92 @@
+#
+# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
+# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
+#
+# SPDX-License-Identifier: MIT
+#
+from __future__ import annotations
+
+import functools
+import json
+from pathlib import Path
+
+import numpy as np
+
+from extractors import CaptionExtractor, ChannelConfig
+from synthesizers import CrossChannelSynthesizer, contiguous_windows
+from timef.schema import Annotation, Recording
+
+
+@functools.lru_cache(maxsize=8)
+def _load_sleep_templates(templates_path: Path) -> tuple[str, ...]:
+    """Read the ``cross_channel.sleep`` caption templates from ``templates_path``.
+
+    Cached per path (which must therefore be hashable, as ``pathlib.Path``
+    is) so the file is parsed once per process rather than once per
+    recording; edits to the file are not picked up until restart.
+
+    Raises:
+        ValueError: If the template list is empty.
+    """
+    # JSON is UTF-8 by specification; do not rely on the locale default.
+    document = json.loads(templates_path.read_text(encoding="utf-8"))
+    templates = tuple(document["cross_channel"]["sleep"])
+    if not templates:
+        raise ValueError(f"No cross_channel.sleep templates in {templates_path}")
+    return templates
+
+
+class SleepSynthesizer(CrossChannelSynthesizer):
+    """Caption windows where the in-bed channel is positive but the asleep channel is not."""
+
+    def __init__(self, min_duration: int = 5):
+        """Set the minimum window length, in samples, worth captioning."""
+        self.min_duration = min_duration
+
+    def synthesize(self, row: Recording, config: ChannelConfig) -> list[Annotation]:
+        """Return one ``in_bed_not_sleeping`` annotation per qualifying window.
+
+        The result is empty when either sleep channel is missing, when the
+        recording has no asleep sample at all, or when no window reaches
+        ``min_duration``.
+        """
+        try:
+            in_bed_idx = row.channel_names.index("sleep:inbed")
+            asleep_idx = row.channel_names.index("sleep:asleep")
+        except ValueError:
+            return []
+
+        in_bed = np.asarray(row.values[in_bed_idx], dtype=float)
+        asleep = np.asarray(row.values[asleep_idx], dtype=float)
+        # NaN samples count as neither in bed nor asleep.
+        is_in_bed = ~np.isnan(in_bed) & (in_bed > 0)
+        is_asleep = ~np.isnan(asleep) & (asleep > 0)
+        if not is_asleep.any():
+            return []
+
+        windows = contiguous_windows(is_in_bed & ~is_asleep, self.min_duration)
+        if not windows:
+            # Nothing to caption, so skip template loading entirely.
+            return []
+
+        time_unit = "hour" if config.time_unit == "hours" else "minute"
+        templates = _load_sleep_templates(config.templates_path)
+        seed = CaptionExtractor._seed(row.row_id)
+
+        results: list[Annotation] = []
+        for i, (start, end) in enumerate(windows):
+            # ``end`` is exclusive; the caption reports the last included sample.
+            end_inclusive = max(start, end - 1)
+            # Offsetting by the window index rotates templates within a recording.
+            template = templates[(seed + i) % len(templates)]
+            text = template.format(time_unit=time_unit, start=start, end=end_inclusive)
+            results.append(
+                Annotation(
+                    caption_type="cross_channel",
+                    text=text,
+                    channel_idxs=(asleep_idx, in_bed_idx),
+                    window=(start, end),
+                    label="in_bed_not_sleeping",
+                )
+            )
+        return results
diff --git a/templates/templates.json b/templates/templates.json
@@ -42,5 +42,14 @@
     "{name} occurred at minute {start}.",
     "{name} was observed at minute {start}.",
     "Identified {name} at minute {start}."
-  ]
+  ],
+  "cross_channel": {
+    "sleep": [
+      "In bed but not sleeping from {time_unit} {start} to {end}.",
+      "The user was in bed but awake from {time_unit} {start} to {end}.",
+      "Awake while in bed between {time_unit} {start} and {end}.",
+      "In-bed wakefulness detected from {time_unit} {start} to {end}.",
+      "The user remained awake in bed from {time_unit} {start} to {end}."
+    ]
+  }
 }
diff --git a/visualizer.py b/visualizer.py
@@ -115,6 +115,7 @@ def _nan_regions(arr: np.ndarray, min_length: int = 30) -> list[tuple[int, int]]
     from mhc.transformer import MHCTransformer
     from mhc.constants import MHC_CHANNEL_CONFIG
     from extractors.cross_channel import CrossChannelExtractor
+    from synthesizers.sleep import SleepSynthesizer
     from extractors.statistical import StatisticalExtractor
     from extractors.structural import StructuralExtractor
     from annotator import Annotator
@@ -124,7 +125,7 @@ def _nan_regions(arr: np.ndarray, min_length: int = 30) -> list[tuple[int, int]]
     annotator = Annotator([
         StatisticalExtractor(MHC_CHANNEL_CONFIG),
         StructuralExtractor(MHC_CHANNEL_CONFIG),
-        CrossChannelExtractor(MHC_CHANNEL_CONFIG),
+        CrossChannelExtractor(MHC_CHANNEL_CONFIG, synthesizers=[SleepSynthesizer()]),
     ])
     captionizer = Captionizer(dataset, MHCTransformer(), annotator)
     result, _ = captionizer.run(max_rows=1)