Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
6e8a9fe
Add evaluation framework scaffold with activity task
milanagm Apr 10, 2026
de9c281
removed stubs to make review easier
milanagm Apr 12, 2026
6dd3c83
Merge main and refactor shared split logic into mhc_label_lookup
milanagm Apr 13, 2026
968d25d
Add metabolic evaluation pipeline with caption-assisted eval support
milanagm Apr 13, 2026
116fdac
Update .gitignore and evaluation docs
milanagm Apr 13, 2026
1c5aa45
Add downstream label exploration notebook
milanagm Apr 13, 2026
9c2b623
removed activity related components - will push on new branch
milanagm Apr 15, 2026
4ae7876
Revert README changes from evaluation PR
milanagm Apr 15, 2026
7359eac
Remove notebooks figures, defered to activity related eval that will …
milanagm Apr 15, 2026
8f9ee3d
Write predictions as CSV while keeping metrics as JSON
milanagm Apr 15, 2026
a7db502
Removed masking bootstrap metric failures / removed Exception Catch
milanagm Apr 15, 2026
a002894
Clean up README files and docstring wording
milanagm Apr 15, 2026
1fbc06f
Add ordinal metrics and align BMI label handling - expanded metrics
milanagm Apr 15, 2026
88f1301
removed separate weekly metabolic dataset script to expand the weekly …
milanagm Apr 15, 2026
762bc24
removed local env name
milanagm Apr 15, 2026
349ab92
updated requirements (added opentslm dependency)
milanagm Apr 15, 2026
7790643
removed splut method from lookup back to base
milanagm Apr 16, 2026
98dc024
removed unused method
milanagm Apr 16, 2026
2f62dc0
Remove temporary caption generation placeholder
milanagm Apr 16, 2026
f54dda4
Pin opentslm to immutable commit SHA
milanagm Apr 16, 2026
bcd1446
Sanitize task and model names in result output paths
milanagm Apr 16, 2026
b0ce3e0
Move assign_split back to base dataset, support user-level label parq…
milanagm Apr 16, 2026
c3f8373
Replace EvalSample with get_eval_context adapter
milanagm Apr 16, 2026
24ce359
Simplify registries and CLI in run_eval
milanagm Apr 16, 2026
ece1b58
Use explicit None check for labels fallback in multiclass_metrics
milanagm Apr 16, 2026
4b10517
Remove exploration notebook, deferred until captioning pipeline deliv…
milanagm Apr 16, 2026
f284ae5
added required labels variable to pass from task to base QA dataset l…
milanagm Apr 19, 2026
c039034
Search shard subdirs recursively for
milanagm Apr 19, 2026
37da391
Decode MHC-benchmark
milanagm Apr 19, 2026
5799d0d
Store values as float32 ndarrays instead of nested Python lists
milanagm Apr 20, 2026
18e9569
added readme to eval to understand how to set things up
milanagm Apr 21, 2026
a32ece7
Cover nested README.md files in REUSE.toml glob
milanagm Apr 21, 2026
0d8b0b4
adjusted the prompt to the LLM to get a distinct answer
milanagm Apr 21, 2026
cc824ad
Move METABOLIC_LABEL_CONFIG to mhc_label_lookup
milanagm Apr 22, 2026
fb4b922
Remove --dataset arg, infer dataset via task.dataset_cls
milanagm Apr 22, 2026
9984756
Drop --device and --results-dir; auto-detect device in LocalModel
milanagm Apr 22, 2026
c75d287
Use opentslm from PyPI instead of git SHA
milanagm Apr 22, 2026
e14e032
Merge remote-tracking branch 'origin/main' into feature/evaluation-pi…
milanagm Apr 22, 2026
3c5d4c5
minor updates in read me
milanagm Apr 22, 2026
1a443bd
added docstring for clarification
milanagm Apr 22, 2026
4405c7d
move raw-samples cache from class-level to instance-level
milanagm Apr 23, 2026
79d7be9
Restructure evaluation around backend-dispatch; droped caption inject…
milanagm Apr 23, 2026
fe5a67b
Add OpenTSLM and text-only backends under unified generate() interface
milanagm Apr 24, 2026
6898408
Add smoke-test scripts for testing
milanagm Apr 24, 2026
4f248f8
added SPDX-Headers for test scripts
milanagm Apr 24, 2026
e86fc60
Add additional binary channel detectors (#59)
KarlDeck Apr 24, 2026
bfe36fe
Merge origin/main into feature/evaluation-pipeline
milanagm Apr 27, 2026
602bd4a
Restore caption field on sample dict for MHCCaptionQADataset training…
milanagm Apr 27, 2026
70c1cce
Rename LabelLookup.join_ondate back to .join Aligns with main and PR …
milanagm Apr 27, 2026
ee74c6b
Move label decoding from LabelLookup.join into the eval caller - eva…
milanagm Apr 27, 2026
c486336
Whitelist peft enums (PeftType, TaskType) for resume checkpoint loads
milanagm Apr 27, 2026
ba46701
Allow peft's LoraConfig in torch.load(weights_only=True) checkpoints.
milanagm Apr 28, 2026
83802f6
removed smoke test shell scripts for merge w PR and the text only eva…
milanagm Apr 29, 2026
38cc995
Commit explicit Answer: as model intent — return unknown when candida…
milanagm Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__pycache__/
data/
results/
.env
.claude/
docs/
Expand Down
2 changes: 1 addition & 1 deletion REUSE.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version = 1

[[annotations]]
path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "README.md"]
path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "**/README.md"]
SPDX-FileCopyrightText = "2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)"
SPDX-License-Identifier = "MIT"
14 changes: 4 additions & 10 deletions curriculum_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,6 @@
MHCMultiLabelQADataset,
)

# Allow peft's LoraConfig in torch.load(weights_only=True) checkpoints.
try:
from peft.tuners.lora.config import LoraConfig
torch.serialization.add_safe_globals([LoraConfig])
except ImportError:
pass
from opentslm.model_config import (
BATCH_SIZE,
GRAD_CLIP_NORM,
Expand Down Expand Up @@ -208,7 +202,7 @@ def _load_checkpoint(self, stage_name: str, eval_only: bool = False):
if not path.exists():
return None

checkpoint = torch.load(path, map_location="cpu", weights_only=True)
checkpoint = torch.load(path, map_location="cpu", weights_only=False)
self._apply_checkpoint_to_model(checkpoint)

print(f"Loaded checkpoint from {stage_name} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
Expand Down Expand Up @@ -250,7 +244,7 @@ def _load_latest_checkpoint(
if not path.exists():
return None

checkpoint = torch.load(path, map_location="cpu", weights_only=True)
checkpoint = torch.load(path, map_location="cpu", weights_only=False)
self._apply_checkpoint_to_model(checkpoint)
optimizer.load_state_dict(checkpoint["optimizer_state"])
scheduler.load_state_dict(checkpoint["scheduler_state"])
Expand Down Expand Up @@ -278,7 +272,7 @@ def _load_previous_stage_model(self, stage_name: str):
print(f"Warning: No checkpoint found for previous stage {prev_stage}")
return None

checkpoint = torch.load(path, map_location="cpu", weights_only=True)
checkpoint = torch.load(path, map_location="cpu", weights_only=False)
self._apply_checkpoint_to_model(checkpoint)

print(f"Loaded previous stage model from {prev_stage} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
Expand All @@ -289,7 +283,7 @@ def _load_initial_checkpoint(self):
if not path.exists():
raise FileNotFoundError(f"Initial checkpoint not found: {path}")

checkpoint = torch.load(path, map_location="cpu", weights_only=True)
checkpoint = torch.load(path, map_location="cpu", weights_only=False)
self._apply_checkpoint_to_model(checkpoint)

print(f"Loaded initial checkpoint from {path}")
Expand Down
115 changes: 115 additions & 0 deletions evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Evaluation Pipeline

LLM-based evaluation of MHC metabolic labels. For each weekly sample the
pipeline hands per-channel signals to a model (as plots or an encoded
tensor depending on the backend), asks it to predict a target label,
parses the answer and computes classification metrics.

Currently Sherlock-specific — paths below assume `$SCRATCH` and the
MHC-benchmark repo at `~/MHC-benchmark`.

## Prerequisites

| Artifact | Path | How to build |
|---|---|---|
| Weekly HF dataset (stride=7) | `$SCRATCH/mhc-benchmark-weekly_hf_stride7/` | [1] |
| Captioned arrow shards | `$SCRATCH/exports/lean_full_stride7/shard_{0..3}/recordings_*.arrow` | [2] |
| Labels lookup parquet | `$SCRATCH/weekly_labels_lookup_ours_stride7.parquet` | [3] |
| OpenAI API key | `~/SensorTSLM/.env` (`OPENAI_API_KEY=...`) | manual |

Upstream MHC-benchmark deps (one-time DVC pulls inside `~/MHC-benchmark`,
using its conda env):

```bash
cd ~/MHC-benchmark
dvc pull data/processed/daily_hourly_hf.dvc # ~4 GB, feeds [1]
dvc pull data/labels.dvc # ~600 MB, feeds [3]
```

## [1] Weekly HF dataset (stride=7)

Non-overlapping 7-day windows aligned to each user's first valid day.
~156k rows, ~40 min.

```bash
srun --mem=32G -c 4 -t 02:30:00 -p normal bash -c '
export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
export PYTHONPATH=$HOME/MHC-benchmark/src:$HOME/MHC-benchmark
cd ~/MHC-benchmark
python -u -m data.processing.daily_hourly_hf_to_weekly_hf \
--daily-hourly-path data/processed/daily_hourly_hf \
--output-path $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
--stride-days 7
'
```

## [2] Captioned arrow shards

Four parallel sbatch jobs, ~30 min total.

```bash
cd ~/SensorTSLM
export MHC_WEEKLY_DATASET_DIR=$SCRATCH/mhc-benchmark-weekly_hf_stride7
export TOTAL_ROWS=156428
./scripts/export_captions_sharded.sh weekly 4 $SCRATCH/exports/lean_full_stride7
# wait: squeue -u $USER
```

## [3] Labels lookup parquet

Reads `(user_id, week_start)` pairs from the stride=7 HF dataset and
attaches labels from MHC-benchmark's `labels.json`. By construction
100% aligned to our captions. ~2 min.

```bash
srun --mem=16G -c 2 -t 00:30:00 -p normal bash -c '
export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
cd ~/MHC-benchmark
python scripts/build_labels_lookup.py \
--hf-dir $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
--segment-type weekly \
--output $SCRATCH/weekly_labels_lookup_ours_stride7.parquet
'
```

## Run Eval

Three backends, selected via `--backend` (default `openai`):

| Backend | What `--model` means | Example |
|-------------|---------------------------------------|---------|
| `openai` | Chat model name at `OPENAI_BASE_URL` | `gpt-4o` (VLM, plots in prompt) |
| `vlm-local` | HF image-text-to-text model ID | `google/gemma-3-4b-it` |
| `opentslm` | HF Hub repo_id (auto-pulls backbone+weights) OR LLM backbone id with `--opentslm-checkpoint` for a local .pt | `OpenTSLM/llama-3.2-1b-har-sp` |

Configuration via env vars (all optional):
- `MHC_EXPORT_DIR` (default `exports`) — Arrow shard directory
- `MHC_LABELS_PATH` (default `data/labels_dev_subset.parquet`) — labels parquet
- `OPENAI_BASE_URL` (default `https://api.openai.com/v1`) — OpenAI-compatible endpoint
- `MAX_SAMPLES` (unset = full eval) — cap on samples for debug runs

Split is hardcoded to `test`.

```bash
srun --mem=16G -c 2 -t 00:15:00 -p normal bash -c '
ml python/3.12.1 && ml libsndfile && \
source $SCRATCH/sensor-venv/bin/activate && \
export HF_HOME=$SCRATCH/hf_cache && \
export MHC_EXPORT_DIR=$SCRATCH/exports/lean_full_stride7 && \
export MHC_LABELS_PATH=$SCRATCH/weekly_labels_lookup_ours_stride7.parquet && \
export MAX_SAMPLES=5 && \
cd ~/SensorTSLM && \
python evaluation/run_eval.py --task biological_sex --model gpt-4o
'
```

Results are written to `results/<label>/<model>/<timestamp>/` relative
to the current directory: `metrics.json` + `predictions.csv`.

## Available tasks

- `biological_sex` — binary (Male / Female)
- `bmi` — ordinal (Underweight … Morbid Obesity)
- `diabetes` — binary
- `hypertension` — binary
- `cardiovascular` — binary
6 changes: 6 additions & 0 deletions evaluation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
92 changes: 92 additions & 0 deletions evaluation/evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
"""Evaluator — orchestrates task, model, and dataset into results.

Responsibilities:
- Iterate over samples (with optional cap).
- Call model.generate() for each sample.
- Parse predictions via the task.
- Aggregate metrics at the end.
- Return a structured EvalResult ready for the ResultsWriter.
"""
from __future__ import annotations

from dataclasses import dataclass

from tqdm import tqdm

from evaluation.tasks.base import EvalTask
from models.base import BaseModel, ModelResponse


@dataclass(slots=True)
class SampleResult:
    """Result for a single evaluated sample.

    Produced by ``Evaluator.run`` — one instance per dataset row, pairing
    the ground-truth label with the model's raw and parsed answers plus
    token accounting for cost tracking.
    """
    # Unique identifier of the evaluated recording (Recording.row_id).
    sample_id: str
    # Ground-truth label string returned by dataset.get_eval_context().
    ground_truth: str
    # Verbatim model output text, before any parsing.
    raw_prediction: str
    # Label extracted from raw_prediction via task.parse_prediction().
    parsed_prediction: str
    # Token counts reported by the model backend for this single call.
    input_tokens: int
    output_tokens: int
    # Extra per-sample context for the predictions CSV
    # (currently user_id and date — see Evaluator.run).
    metadata: dict


@dataclass(slots=True)
class EvalResult:
    """Aggregated result for one model on one task.

    Bundles the per-sample records with the task-level metric dict so the
    ResultsWriter can persist both in a single call.
    """
    # Display name of the evaluated model (used in result output paths).
    model_name: str
    # Task identifier (task.name), e.g. the target label being predicted.
    task_name: str
    # One SampleResult per evaluated dataset row, in iteration order.
    per_sample: list[SampleResult]
    # Output of task.aggregate_metrics() over all ground truths/predictions.
    metrics: dict[str, float | int]


class Evaluator:
    """Runs a single (task, model, dataset) evaluation pass."""

    def run(
        self,
        task: EvalTask,
        model: BaseModel,
        dataset,
        model_name: str,
        max_samples: int | None = None,
    ) -> EvalResult:
        """Evaluate *model* on *dataset* using *task*.

        The dataset must be an OpenTSLM-style ``QADataset`` and expose
        ``get_eval_context(idx)`` returning ``(Recording, ground_truth)``.
        ``dataset[idx]`` provides the model-facing sample dict.

        Args:
            task: Defines prediction parsing and metric aggregation.
            model: Backend implementing ``generate(sample, recording)``.
            dataset: Indexable eval dataset (see above).
            model_name: Display name recorded in the result.
            max_samples: Optional cap on evaluated rows (debug runs).

        Returns:
            EvalResult with per-sample records and aggregated metrics.
        """
        # Cap the pass at max_samples when given, never exceeding the dataset.
        total = len(dataset)
        if max_samples is not None:
            total = min(max_samples, total)

        results: list[SampleResult] = []
        for idx in tqdm(range(total), desc=f"{model_name} / {task.name}"):
            recording, truth = dataset.get_eval_context(idx)
            model_input = dataset[idx]

            reply: ModelResponse = model.generate(sample=model_input, recording=recording)

            results.append(
                SampleResult(
                    sample_id=recording.row_id,
                    ground_truth=truth,
                    raw_prediction=reply.text,
                    parsed_prediction=task.parse_prediction(reply.text),
                    input_tokens=reply.input_tokens,
                    output_tokens=reply.output_tokens,
                    metadata={"user_id": recording.user_id, "date": recording.date},
                )
            )

        # Aggregate once over the full pass, in sample order.
        metrics = task.aggregate_metrics(
            [r.ground_truth for r in results],
            [r.parsed_prediction for r in results],
        )
        return EvalResult(
            model_name=model_name,
            task_name=task.name,
            per_sample=results,
            metrics=metrics,
        )
Comment thread
max-rosenblattl marked this conversation as resolved.
6 changes: 6 additions & 0 deletions evaluation/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
88 changes: 88 additions & 0 deletions evaluation/io/writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
"""ResultsWriter — persists EvalResult to disk.

Output layout::

results/
<task_name>/
<model_name>/
<timestamp>/
metrics.json ← aggregated metrics (F1, SE, …)
predictions.csv ← per-sample ground truth + prediction
"""
from __future__ import annotations

import csv
import json
import re
from datetime import datetime
from pathlib import Path

from evaluation.evaluator import EvalResult


class ResultsWriter:
    """Writes EvalResult files under results/<task>/<model>/<timestamp>/.

    Args:
        results_dir: Root directory for all evaluation outputs.
    """

    def __init__(self, results_dir: Path | str = Path("results")) -> None:
        self.results_dir = Path(results_dir)

    def write(self, result: EvalResult) -> Path:
        """Persist *result* to disk and return the output directory path.

        Creates:
            metrics.json — aggregated metrics dict (directly JSON-serialisable).
            predictions.csv — one row per evaluated sample.

        Raises:
            FileExistsError: if the timestamped directory already exists
                (``exist_ok=False`` guards against silently mixing two runs).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        # Sanitize task/model names so e.g. "meta-llama/Llama-3" cannot
        # create unintended nesting or escape the results tree.
        safe_task = re.sub(r"[^\w\-.]", "_", result.task_name)
        safe_model = re.sub(r"[^\w\-.]", "_", result.model_name)
        out_dir = self.results_dir / safe_task / safe_model / timestamp
        out_dir.mkdir(parents=True, exist_ok=False)

        # metrics.json — explicit UTF-8 so output does not depend on the
        # platform's locale default encoding.
        with open(out_dir / "metrics.json", "w", encoding="utf-8") as f:
            json.dump(result.metrics, f, indent=2)

        # predictions.csv: fixed columns first, then metadata keys in
        # first-seen order (samples may carry heterogeneous metadata).
        base_fields = [
            "sample_id",
            "ground_truth",
            "prediction",
            "raw_prediction",
            "input_tokens",
            "output_tokens",
        ]
        # dict preserves insertion order and dedupes in one pass.
        ordered_keys: dict[str, None] = {}
        for sample in result.per_sample:
            ordered_keys.update(dict.fromkeys(sample.metadata))
        metadata_fields = list(ordered_keys)

        # UTF-8 here is the actual fix: raw model output can contain
        # non-ASCII text, which would raise UnicodeEncodeError under a
        # non-UTF-8 locale default.
        with open(out_dir / "predictions.csv", "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=base_fields + metadata_fields)
            writer.writeheader()
            for sample in result.per_sample:
                row = {
                    "sample_id": sample.sample_id,
                    "ground_truth": sample.ground_truth,
                    "prediction": sample.parsed_prediction,
                    "raw_prediction": sample.raw_prediction,
                    "input_tokens": sample.input_tokens,
                    "output_tokens": sample.output_tokens,
                }
                # Samples missing a metadata key fall back to DictWriter's
                # empty-string restval.
                row.update(sample.metadata)
                writer.writerow(row)
        print(f"Results written to: {out_dir}")
        return out_dir
6 changes: 6 additions & 0 deletions evaluation/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#
# SPDX-FileCopyrightText: 2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)
# SPDX-FileCopyrightText: 2026 This source file is part of the SensorTSLM open-source project.
#
# SPDX-License-Identifier: MIT
#
Loading
Loading