diff --git a/.gitignore b/.gitignore
index 8b60180..ec14117 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 __pycache__/
 data/
+results/
 .env
 .claude/
 docs/
diff --git a/REUSE.toml b/REUSE.toml
index bd8a574..d2669c4 100644
--- a/REUSE.toml
+++ b/REUSE.toml
@@ -1,6 +1,6 @@
 version = 1
 
 [[annotations]]
-path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "README.md"]
+path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "**/README.md"]
 SPDX-FileCopyrightText = "2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)"
 SPDX-License-Identifier = "MIT"
diff --git a/curriculum_learning.py b/curriculum_learning.py
index 0d205de..4989d0f 100644
--- a/curriculum_learning.py
+++ b/curriculum_learning.py
@@ -36,12 +36,6 @@
     MHCMultiLabelQADataset,
 )
 
-# Allow peft's LoraConfig in torch.load(weights_only=True) checkpoints.
-try:
-    from peft.tuners.lora.config import LoraConfig
-    torch.serialization.add_safe_globals([LoraConfig])
-except ImportError:
-    pass
 from opentslm.model_config import (
     BATCH_SIZE,
     GRAD_CLIP_NORM,
@@ -208,7 +202,7 @@ def _load_checkpoint(self, stage_name: str, eval_only: bool = False):
         if not path.exists():
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded checkpoint from {stage_name} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
@@ -250,7 +244,7 @@ def _load_latest_checkpoint(
         if not path.exists():
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
         optimizer.load_state_dict(checkpoint["optimizer_state"])
         scheduler.load_state_dict(checkpoint["scheduler_state"])
@@ -278,7 +272,7 @@ def _load_previous_stage_model(self, stage_name: str):
             print(f"Warning: No checkpoint found for previous stage {prev_stage}")
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded previous stage model from {prev_stage} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
@@ -289,7 +283,7 @@ def _load_initial_checkpoint(self):
         if not path.exists():
             raise FileNotFoundError(f"Initial checkpoint not found: {path}")
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded initial checkpoint from {path}")
diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 0000000..c5613aa
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,115 @@
+# Evaluation Pipeline
+
+LLM-based evaluation of MHC metabolic labels. For each weekly sample the
+pipeline hands per-channel signals to a model (as plots or an encoded
+tensor depending on the backend), asks it to predict a target label,
+parses the answer and computes classification metrics.
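+
+As a rough sketch, the parse-and-score step looks like the following
+(hypothetical helper and an assumed label set for the `biological_sex`
+task; see `evaluation/run_eval.py` for the actual logic):
+
+```python
+# Illustrative only: map free-text model replies to labels, then score.
+import re
+
+from sklearn.metrics import accuracy_score, f1_score
+
+LABELS = ["female", "male"]  # assumed label set for biological_sex
+
+def parse_label(answer: str) -> str:
+    """Return the first known label mentioned in the model's reply."""
+    for label in LABELS:
+        # Word-boundary match so "male" is not found inside "female".
+        if re.search(rf"\b{label}\b", answer.lower()):
+            return label
+    return "unparseable"
+
+# Toy data standing in for model replies and gold labels.
+raw_answers = ["The user is likely male.", "Female, given the HR patterns."]
+golds = ["male", "female"]
+preds = [parse_label(a) for a in raw_answers]
+
+print("accuracy:", accuracy_score(golds, preds))
+print("macro F1:", f1_score(golds, preds, average="macro"))
+```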
+
+Currently Sherlock-specific: the paths below assume `$SCRATCH` and an
+MHC-benchmark checkout at `~/MHC-benchmark`.
+
+## Prerequisites
+
+| Artifact | Path | How to build |
+|---|---|---|
+| Weekly HF dataset (stride=7) | `$SCRATCH/mhc-benchmark-weekly_hf_stride7/` | [1] |
+| Captioned arrow shards | `$SCRATCH/exports/lean_full_stride7/shard_{0..3}/recordings_*.arrow` | [2] |
+| Labels lookup parquet | `$SCRATCH/weekly_labels_lookup_ours_stride7.parquet` | [3] |
+| OpenAI API key | `~/SensorTSLM/.env` (`OPENAI_API_KEY=...`) | manual |
+
+Upstream MHC-benchmark deps (one-time DVC pulls inside `~/MHC-benchmark`,
+using its conda env):
+
+```bash
+cd ~/MHC-benchmark
+dvc pull data/processed/daily_hourly_hf.dvc  # ~4 GB, feeds [1]
+dvc pull data/labels.dvc                     # ~600 MB, feeds [3]
+```
+
+## [1] Weekly HF dataset (stride=7)
+
+Builds non-overlapping 7-day windows aligned to each user's first valid
+day. ~156k rows, ~40 min.
+
+```bash
+srun --mem=32G -c 4 -t 02:30:00 -p normal bash -c '
+  export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
+  export PYTHONPATH=$HOME/MHC-benchmark/src:$HOME/MHC-benchmark
+  cd ~/MHC-benchmark
+  python -u -m data.processing.daily_hourly_hf_to_weekly_hf \
+    --daily-hourly-path data/processed/daily_hourly_hf \
+    --output-path $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
+    --stride-days 7
+'
+```
+
+## [2] Captioned arrow shards
+
+Four parallel sbatch jobs, ~30 min total.
+
+```bash
+cd ~/SensorTSLM
+export MHC_WEEKLY_DATASET_DIR=$SCRATCH/mhc-benchmark-weekly_hf_stride7
+export TOTAL_ROWS=156428
+./scripts/export_captions_sharded.sh weekly 4 $SCRATCH/exports/lean_full_stride7
+# wait for the jobs to finish: squeue -u $USER
+```
+
+## [3] Labels lookup parquet
+
+Reads `(user_id, week_start)` pairs from the stride=7 HF dataset and
+attaches labels from MHC-benchmark's `labels.json`, so it is by
+construction 100% aligned to our captions. ~2 min.
+
+```bash
+srun --mem=16G -c 2 -t 00:30:00 -p normal bash -c '
+  export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
+  cd ~/MHC-benchmark
+  python scripts/build_labels_lookup.py \
+    --hf-dir $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
+    --segment-type weekly \
+    --output $SCRATCH/weekly_labels_lookup_ours_stride7.parquet
+'
+```
+
+## Run Eval
+
+Three backends, selected via `--backend` (default `openai`):
+
+| Backend     | What `--model` means                 | Example |
+|-------------|--------------------------------------|---------|
+| `openai`    | Chat model name at `OPENAI_BASE_URL` | `gpt-4o` (VLM, plots in prompt) |
+| `vlm-local` | HF image-text-to-text model ID       | `google/gemma-3-4b-it` |
+| `opentslm`  | HF Hub repo_id (auto-pulls backbone and weights), or an LLM backbone ID plus `--opentslm-checkpoint` pointing at a local `.pt` | `OpenTSLM/llama-3.2-1b-har-sp` |
+
+Configuration via env vars (all optional; the sketch below shows how they
+resolve):
+
+- `MHC_EXPORT_DIR` (default `exports`): Arrow shard directory
+- `MHC_LABELS_PATH` (default `data/labels_dev_subset.parquet`): labels parquet
+- `OPENAI_BASE_URL` (default `https://api.openai.com/v1`): OpenAI-compatible endpoint
+- `MAX_SAMPLES` (unset = full eval): cap on samples for debug runs
+
+Split is hardcoded to `test`.
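+
+A minimal sketch of how these variables might be consumed (defaults
+mirror the list above; illustrative, not the literal `run_eval.py` code):
+
+```python
+import os
+
+# Fall back to the documented defaults when a variable is unset.
+export_dir = os.environ.get("MHC_EXPORT_DIR", "exports")
+labels_path = os.environ.get("MHC_LABELS_PATH", "data/labels_dev_subset.parquet")
+base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
+
+# An unset MAX_SAMPLES means "run the full eval".
+raw = os.environ.get("MAX_SAMPLES")
+max_samples = int(raw) if raw else None
+```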
+
+```bash
+srun --mem=16G -c 2 -t 00:15:00 -p normal bash -c '
+  ml python/3.12.1 && ml libsndfile && \
+  source $SCRATCH/sensor-venv/bin/activate && \
+  export HF_HOME=$SCRATCH/hf_cache && \
+  export MHC_EXPORT_DIR=$SCRATCH/exports/lean_full_stride7 && \
+  export MHC_LABELS_PATH=$SCRATCH/weekly_labels_lookup_ours_stride7.parquet && \
+  export MAX_SAMPLES=5 && \
+  cd ~/SensorTSLM && \
+  python evaluation/run_eval.py --task biological_sex --model gpt-4o
+'
+```
+
+Results are written to `results/