Feature/evaluation pipeline #55
Merged
54 commits
- `6e8a9fe` Add evaluation framework scaffold with activity task (milanagm)
- `de9c281` removed stubs to make review easier (milanagm)
- `6dd3c83` Merge main and refactor shared split logic into mhc_label_lookup (milanagm)
- `968d25d` Add metabolic evaluation pipeline with caption-assisted eval support (milanagm)
- `116fdac` Update .gitignore and evaluation docs (milanagm)
- `1c5aa45` Add downstream label exploration notebook (milanagm)
- `9c2b623` removed activity related components - will push on new branch (milanagm)
- `4ae7876` Revert README changes from evaluation PR (milanagm)
- `7359eac` Remove notebooks figures, deferred to activity related eval that will … (milanagm)
- `8f9ee3d` Write predictions as CSV while keeping metrics as JSON (milanagm)
- `a7db502` Removed masking bootstrap metric failures / removed Exception Catch (milanagm)
- `a002894` Clean up README files and docstring wording (milanagm)
- `1fbc06f` Add ordinal metrics and align BMI label handling - expanded metrics (milanagm)
- `88f1301` removed separate weekly metabolic dataset script to expand the weekly … (milanagm)
- `762bc24` removed local env name (milanagm)
- `349ab92` updated requirements (added opentslm dependency) (milanagm)
- `7790643` removed split method from lookup back to base (milanagm)
- `98dc024` removed unused method (milanagm)
- `2f62dc0` Remove temporary caption generation placeholder (milanagm)
- `f54dda4` Pin opentslm to immutable commit SHA (milanagm)
- `bcd1446` Sanitize task and model names in result output paths (milanagm)
- `b0ce3e0` Move assign_split back to base dataset, support user-level label parq… (milanagm)
- `c3f8373` Replace EvalSample with get_eval_context adapter (milanagm)
- `24ce359` Simplify registries and CLI in run_eval (milanagm)
- `ece1b58` Use explicit None check for labels fallback in multiclass_metrics (milanagm)
- `4b10517` Remove exploration notebook, deferred until captioning pipeline deliv… (milanagm)
- `f284ae5` added required labels variable to pass from task to base QA dataset l… (milanagm)
- `c039034` Search shard subdirs recursively for (milanagm)
- `37da391` Decode MHC-benchmark (milanagm)
- `5799d0d` Store values as float32 ndarrays instead of nested Python lists (milanagm)
- `18e9569` added readme to eval to understand how to set things up (milanagm)
- `a32ece7` Cover nested README.md files in REUSE.toml glob (milanagm)
- `0d8b0b4` adjusted the prompt to LLM to get distinct answer (milanagm)
- `cc824ad` Move METABOLIC_LABEL_CONFIG to mhc_label_lookup (milanagm)
- `fb4b922` Remove --dataset arg, infer dataset via task.dataset_cls (milanagm)
- `9984756` Drop --device and --results-dir; auto-detect device in LocalModel (milanagm)
- `c75d287` Use opentslm from PyPI instead of git SHA (milanagm)
- `e14e032` Merge remote-tracking branch 'origin/main' into feature/evaluation-pi… (milanagm)
- `3c5d4c5` minor updates in read me (milanagm)
- `1a443bd` added docstring for clarification (milanagm)
- `4405c7d` move raw-samples cache from class-level to instance-level (milanagm)
- `79d7be9` Restructure evaluation around backend-dispatch; dropped caption inject… (milanagm)
- `fe5a67b` Add OpenTSLM and text-only backends under unified generate() interface (milanagm)
- `6898408` Add smoke-test scripts for testing (milanagm)
- `4f248f8` added SPDX-Headers for test scripts (milanagm)
- `e86fc60` Add additional binary channel detectors (#59) (KarlDeck)
- `bfe36fe` Merge origin/main into feature/evaluation-pipeline (milanagm)
- `602bd4a` Restore caption field on sample dict for MHCCaptionQADataset training… (milanagm)
- `70c1cce` Rename LabelLookup.join_ondate back to .join Aligns with main and PR … (milanagm)
- `ee74c6b` Move label decoding from LabelLookup.join into the eval caller - eva… (milanagm)
- `c486336` Whitelist peft enums (PeftType, TaskType) for resume checkpoint loads (milanagm)
- `ba46701` Allow peft's LoraConfig in torch.load(weights_only=True) checkpoints. (milanagm)
- `83802f6` removed smoke test shell scripts for merge w PR and the text only eva… (milanagm)
- `38cc995` Commit explicit Answer: as model intent — return unknown when candida… (milanagm)
Changed file: `.gitignore`

```
@@ -1,5 +1,6 @@
__pycache__/
data/
results/
.env
.claude/
docs/
```
Changed file: `REUSE.toml`

```diff
@@ -1,6 +1,6 @@
 version = 1

 [[annotations]]
-path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "README.md"]
+path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "**/README.md"]
 SPDX-FileCopyrightText = "2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)"
 SPDX-License-Identifier = "MIT"
```
New file (evaluation README):

# Evaluation Pipeline

LLM-based evaluation of MHC metabolic labels. For each weekly sample, the pipeline hands per-channel signals to a model (as plots or an encoded tensor, depending on the backend), asks it to predict a target label, parses the answer, and computes classification metrics.

Currently Sherlock-specific — paths below assume `$SCRATCH` and the MHC-benchmark repo at `~/MHC-benchmark`.

## Prerequisites

| Artifact | Path | How to build |
|---|---|---|
| Weekly HF dataset (stride=7) | `$SCRATCH/mhc-benchmark-weekly_hf_stride7/` | [1] |
| Captioned arrow shards | `$SCRATCH/exports/lean_full_stride7/shard_{0..3}/recordings_*.arrow` | [2] |
| Labels lookup parquet | `$SCRATCH/weekly_labels_lookup_ours_stride7.parquet` | [3] |
| OpenAI API key | `~/SensorTSLM/.env` (`OPENAI_API_KEY=...`) | manual |

Upstream MHC-benchmark deps (one-time DVC pulls inside `~/MHC-benchmark`, using its conda env):

```bash
cd ~/MHC-benchmark
dvc pull data/processed/daily_hourly_hf.dvc  # ~4 GB, feeds [1]
dvc pull data/labels.dvc                     # ~600 MB, feeds [3]
```
## [1] Weekly HF dataset (stride=7)

Non-overlapping 7-day windows aligned to each user's first valid day. ~156k rows, ~40 min.

```bash
srun --mem=32G -c 4 -t 02:30:00 -p normal bash -c '
export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
export PYTHONPATH=$HOME/MHC-benchmark/src:$HOME/MHC-benchmark
cd ~/MHC-benchmark
python -u -m data.processing.daily_hourly_hf_to_weekly_hf \
  --daily-hourly-path data/processed/daily_hourly_hf \
  --output-path $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
  --stride-days 7
'
```
## [2] Captioned arrow shards

Four parallel sbatch jobs, ~30 min total.

```bash
cd ~/SensorTSLM
export MHC_WEEKLY_DATASET_DIR=$SCRATCH/mhc-benchmark-weekly_hf_stride7
export TOTAL_ROWS=156428
./scripts/export_captions_sharded.sh weekly 4 $SCRATCH/exports/lean_full_stride7
# wait: squeue -u $USER
```
## [3] Labels lookup parquet

Reads `(user_id, week_start)` pairs from the stride=7 HF dataset and attaches labels from MHC-benchmark's `labels.json`. By construction, 100% aligned to our captions. ~2 min.

```bash
srun --mem=16G -c 2 -t 00:30:00 -p normal bash -c '
export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
cd ~/MHC-benchmark
python scripts/build_labels_lookup.py \
  --hf-dir $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
  --segment-type weekly \
  --output $SCRATCH/weekly_labels_lookup_ours_stride7.parquet
'
```
## Run Eval

Three backends, selected via `--backend` (default `openai`):

| Backend | What `--model` means | Example |
|---|---|---|
| `openai` | Chat model name at `OPENAI_BASE_URL` | `gpt-4o` (VLM, plots in prompt) |
| `vlm-local` | HF image-text-to-text model ID | `google/gemma-3-4b-it` |
| `opentslm` | HF Hub repo_id (auto-pulls backbone + weights), or an LLM backbone id with `--opentslm-checkpoint` pointing at a local `.pt` | `OpenTSLM/llama-3.2-1b-har-sp` |

Configuration via env vars (all optional):

- `MHC_EXPORT_DIR` (default `exports`) — Arrow shard directory
- `MHC_LABELS_PATH` (default `data/labels_dev_subset.parquet`) — labels parquet
- `OPENAI_BASE_URL` (default `https://api.openai.com/v1`) — OpenAI-compatible endpoint
- `MAX_SAMPLES` (unset = full eval) — cap on samples for debug runs

Split is hardcoded to `test`.

```bash
srun --mem=16G -c 2 -t 00:15:00 -p normal bash -c '
ml python/3.12.1 && ml libsndfile && \
source $SCRATCH/sensor-venv/bin/activate && \
export HF_HOME=$SCRATCH/hf_cache && \
export MHC_EXPORT_DIR=$SCRATCH/exports/lean_full_stride7 && \
export MHC_LABELS_PATH=$SCRATCH/weekly_labels_lookup_ours_stride7.parquet && \
export MAX_SAMPLES=5 && \
cd ~/SensorTSLM && \
python evaluation/run_eval.py --task biological_sex --model gpt-4o
'
```

Results are written to `results/<label>/<model>/<timestamp>/` relative to the current directory: `metrics.json` + `predictions.csv`.
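Run directories are timestamped with `%Y%m%d_%H%M%S_%f`, so lexicographic order matches chronological order and the newest run is simply the `max()` of the subdirectory names. As a sketch (the `load_latest_run` helper is illustrative, not part of the repo), the most recent finished run can be loaded like this:

```python
import csv
import json
from pathlib import Path

def load_latest_run(results_root: str, task: str, model: str):
    """Load metrics.json + predictions.csv from the newest run directory.

    Relies on the %Y%m%d_%H%M%S_%f timestamp naming, where lexicographic
    order equals chronological order.
    """
    base = Path(results_root) / task / model
    # max() over directory names picks the latest timestamp.
    latest = max(p for p in base.iterdir() if p.is_dir())
    metrics = json.loads((latest / "metrics.json").read_text())
    with open(latest / "predictions.csv", newline="") as f:
        predictions = list(csv.DictReader(f))
    return metrics, predictions
```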
## Available tasks

- `biological_sex` — binary (Male / Female)
- `bmi` — ordinal (Underweight … Morbid Obesity)
- `diabetes` — binary
- `hypertension` — binary
- `cardiovascular` — binary
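Each task parses model output into one of its labels, falling back to `unknown` (per the commit history, the prompt asks for an explicit `Answer:` line and unmatched replies map to unknown). A hypothetical sketch of that parsing style, not the repo's actual `parse_prediction`:

```python
def parse_labeled_answer(text: str, candidates: list[str]) -> str:
    """Sketch of an 'Answer:'-style label parser (illustration only).

    Takes the text after the last 'Answer:' marker (or the whole reply
    if no marker is present) and requires an exact, case-insensitive
    match against one candidate label; anything else maps to 'unknown'.
    """
    lowered = text.lower()
    idx = lowered.rfind("answer:")
    scope = text[idx + len("answer:"):] if idx != -1 else text
    stripped = scope.strip()
    if not stripped:
        return "unknown"
    # Only look at the first line after the marker; tolerate a period.
    first_line = stripped.splitlines()[0].strip().strip(".")
    for candidate in candidates:
        if first_line.lower() == candidate.lower():
            return candidate
    return "unknown"
```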
New file (SPDX header only):

```python
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
```
New file: `evaluation/evaluator.py`

```python
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
"""Evaluator — orchestrates task, model, and dataset into results.

Responsibilities:
- Iterate over samples (with optional cap).
- Call model.generate() for each sample.
- Parse predictions via the task.
- Aggregate metrics at the end.
- Return a structured EvalResult ready for the ResultsWriter.
"""
from __future__ import annotations

from dataclasses import dataclass

from tqdm import tqdm

from evaluation.tasks.base import EvalTask
from models.base import BaseModel, ModelResponse


@dataclass(slots=True)
class SampleResult:
    """Result for a single evaluated sample."""
    sample_id: str
    ground_truth: str
    raw_prediction: str
    parsed_prediction: str
    input_tokens: int
    output_tokens: int
    metadata: dict


@dataclass(slots=True)
class EvalResult:
    """Aggregated result for one model on one task."""
    model_name: str
    task_name: str
    per_sample: list[SampleResult]
    metrics: dict[str, float | int]


class Evaluator:
    """Runs a single (task, model, dataset) evaluation pass."""

    def run(
        self,
        task: EvalTask,
        model: BaseModel,
        dataset,
        model_name: str,
        max_samples: int | None = None,
    ) -> EvalResult:
        """Evaluate *model* on *dataset* using *task*.

        The dataset must be an OpenTSLM-style ``QADataset`` and expose
        ``get_eval_context(idx)`` returning ``(Recording, ground_truth)``.
        ``dataset[idx]`` provides the model-facing sample dict.
        """
        n = len(dataset) if max_samples is None else min(max_samples, len(dataset))
        per_sample: list[SampleResult] = []

        for i in tqdm(range(n), desc=f"{model_name} / {task.name}"):
            recording, ground_truth = dataset.get_eval_context(i)
            sample = dataset[i]

            response: ModelResponse = model.generate(sample=sample, recording=recording)
            parsed = task.parse_prediction(response.text)

            per_sample.append(SampleResult(
                sample_id=recording.row_id,
                ground_truth=ground_truth,
                raw_prediction=response.text,
                parsed_prediction=parsed,
                input_tokens=response.input_tokens,
                output_tokens=response.output_tokens,
                metadata={"user_id": recording.user_id, "date": recording.date},
            ))

        gts = [s.ground_truth for s in per_sample]
        preds = [s.parsed_prediction for s in per_sample]

        return EvalResult(
            model_name=model_name,
            task_name=task.name,
            per_sample=per_sample,
            metrics=task.aggregate_metrics(gts, preds),
        )
```
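To see how the pieces fit together without the repo installed, here is a condensed, self-contained dry run. The `Fake*` classes are stand-ins invented for illustration, and the loop mirrors, in simplified form, what `Evaluator.run` does with the real `EvalTask`, `BaseModel`, and dataset objects:

```python
from dataclasses import dataclass

@dataclass
class FakeResponse:
    """Stand-in for ModelResponse (hypothetical, illustration only)."""
    text: str
    input_tokens: int = 10
    output_tokens: int = 2

class FakeSexTask:
    """Stand-in for an EvalTask on the biological_sex label."""
    name = "biological_sex"

    def parse_prediction(self, text: str) -> str:
        # Check "female" first: "male" is a substring of "female".
        t = text.lower()
        if "female" in t:
            return "Female"
        if "male" in t:
            return "Male"
        return "unknown"

    def aggregate_metrics(self, gts, preds):
        correct = sum(g == p for g, p in zip(gts, preds))
        return {"accuracy": correct / len(gts), "n_samples": len(gts)}

class FakeModel:
    """Stand-in for a BaseModel backend; replays canned answers."""
    def __init__(self, answers):
        self._answers = iter(answers)

    def generate(self, sample, recording=None):
        return FakeResponse(text=next(self._answers))

# Simplified version of the Evaluator.run loop:
task = FakeSexTask()
model = FakeModel(["Answer: Female", "Answer: Male", "not sure"])
ground_truths = ["Female", "Male", "Male"]

preds = []
for gt in ground_truths:
    response = model.generate(sample={"question": "..."})
    preds.append(task.parse_prediction(response.text))

metrics = task.aggregate_metrics(ground_truths, preds)
# 2 of 3 correct: the "not sure" reply parses to "unknown".
```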
New file (SPDX header only):

```python
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
```
New file (results writer module):

```python
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
"""ResultsWriter — persists EvalResult to disk.

Output layout::

    results/
        <task_name>/
            <model_name>/
                <timestamp>/
                    metrics.json     ← aggregated metrics (F1, SE, …)
                    predictions.csv  ← per-sample ground truth + prediction
"""
from __future__ import annotations

import csv
import json
import re
from datetime import datetime
from pathlib import Path

from evaluation.evaluator import EvalResult


class ResultsWriter:
    """Writes EvalResult files under results/<task>/<model>/<timestamp>/.

    Args:
        results_dir: Root directory for all evaluation outputs.
    """

    def __init__(self, results_dir: Path | str = Path("results")) -> None:
        self.results_dir = Path(results_dir)

    def write(self, result: EvalResult) -> Path:
        """Persist *result* to disk and return the output directory path.

        Creates:
            metrics.json — aggregated metrics dict (directly JSON-serialisable).
            predictions.csv — one row per evaluated sample.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        safe_task = re.sub(r"[^\w\-.]", "_", result.task_name)
        safe_model = re.sub(r"[^\w\-.]", "_", result.model_name)
        out_dir = self.results_dir / safe_task / safe_model / timestamp
        out_dir.mkdir(parents=True, exist_ok=False)

        # metrics.json
        with open(out_dir / "metrics.json", "w") as f:
            json.dump(result.metrics, f, indent=2)

        # predictions.csv
        base_fields = [
            "sample_id",
            "ground_truth",
            "prediction",
            "raw_prediction",
            "input_tokens",
            "output_tokens",
        ]
        metadata_fields: list[str] = []
        seen_metadata: set[str] = set()
        for sample in result.per_sample:
            for key in sample.metadata:
                if key not in seen_metadata:
                    seen_metadata.add(key)
                    metadata_fields.append(key)

        with open(out_dir / "predictions.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=base_fields + metadata_fields)
            writer.writeheader()
            for sample in result.per_sample:
                row = {
                    "sample_id": sample.sample_id,
                    "ground_truth": sample.ground_truth,
                    "prediction": sample.parsed_prediction,
                    "input_tokens": sample.input_tokens,
                    "output_tokens": sample.output_tokens,
                    "raw_prediction": sample.raw_prediction,
                }
                row.update(sample.metadata)
                writer.writerow(row)
        print(f"Results written to: {out_dir}")
        return out_dir
```
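The sanitisation regex and timestamp format in `ResultsWriter.write` fully determine the on-disk layout. A standalone sketch of just that path logic (the `run_output_dir` helper is illustrative, not repo code):

```python
import re
from datetime import datetime
from pathlib import Path

def run_output_dir(root: str, task_name: str, model_name: str) -> Path:
    """Mirror the ResultsWriter path construction (sketch)."""
    # Same regex as ResultsWriter: any char that is not a word char,
    # "-", or "." becomes "_", so model IDs containing "/" (e.g.
    # "google/gemma-3-4b-it") cannot create extra path levels.
    safe_task = re.sub(r"[^\w\-.]", "_", task_name)
    safe_model = re.sub(r"[^\w\-.]", "_", model_name)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    return Path(root) / safe_task / safe_model / timestamp

d = run_output_dir("results", "biological_sex", "google/gemma-3-4b-it")
# -> results/biological_sex/google_gemma-3-4b-it/<timestamp>
```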
New file (SPDX header only):

```python
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
```