diff --git a/.gitignore b/.gitignore
index 8b60180..ec14117 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 __pycache__/
 data/
+results/
 .env
 .claude/
 docs/
diff --git a/REUSE.toml b/REUSE.toml
index bd8a574..d2669c4 100644
--- a/REUSE.toml
+++ b/REUSE.toml
@@ -1,6 +1,6 @@
 version = 1
 
 [[annotations]]
-path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "README.md"]
+path = ["assets/**", "data/**", "**/*.png", "*.svg", "*.png", "**/*.pt", "**/*.jsonl", "**/*.json", ".gitignore", ".env.example", "**/uv.lock", "LICENSE", "**/requirements.txt", "CONTRIBUTORS.md", "**/README.md"]
 SPDX-FileCopyrightText = "2026 Stanford University, ETH Zurich, and the project authors (see CONTRIBUTORS.md)"
 SPDX-License-Identifier = "MIT"
diff --git a/curriculum_learning.py b/curriculum_learning.py
index 0d205de..4989d0f 100644
--- a/curriculum_learning.py
+++ b/curriculum_learning.py
@@ -36,12 +36,6 @@
     MHCMultiLabelQADataset,
 )
 
-# Allow peft's LoraConfig in torch.load(weights_only=True) checkpoints.
-try:
-    from peft.tuners.lora.config import LoraConfig
-    torch.serialization.add_safe_globals([LoraConfig])
-except ImportError:
-    pass
 from opentslm.model_config import (
     BATCH_SIZE,
     GRAD_CLIP_NORM,
@@ -208,7 +202,7 @@ def _load_checkpoint(self, stage_name: str, eval_only: bool = False):
         if not path.exists():
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded checkpoint from {stage_name} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
@@ -250,7 +244,7 @@ def _load_latest_checkpoint(
         if not path.exists():
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
         optimizer.load_state_dict(checkpoint["optimizer_state"])
         scheduler.load_state_dict(checkpoint["scheduler_state"])
@@ -278,7 +272,7 @@ def _load_previous_stage_model(self, stage_name: str):
             print(f"Warning: No checkpoint found for previous stage {prev_stage}")
             return None
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded previous stage model from {prev_stage} (epoch={checkpoint['epoch']}, val_loss={checkpoint['val_loss']:.4f})")
@@ -289,7 +283,7 @@ def _load_initial_checkpoint(self):
         if not path.exists():
             raise FileNotFoundError(f"Initial checkpoint not found: {path}")
 
-        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
+        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
         self._apply_checkpoint_to_model(checkpoint)
 
         print(f"Loaded initial checkpoint from {path}")
diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 0000000..c5613aa
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,115 @@
+# Evaluation Pipeline
+
+LLM-based evaluation of MHC metabolic labels. For each weekly sample the
+pipeline hands per-channel signals to a model (as plots or an encoded
+tensor depending on the backend), asks it to predict a target label,
+parses the answer and computes classification metrics.
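+
+As a rough sketch, the parse-and-score step looks like the following
+(hypothetical helper and an assumed label set for the `biological_sex`
+task; see `evaluation/run_eval.py` for the actual logic):
+
+```python
+# Illustrative only: map free-text model replies to labels, then score.
+import re
+
+from sklearn.metrics import accuracy_score, f1_score
+
+LABELS = ["female", "male"]  # assumed label set for biological_sex
+
+def parse_label(answer: str) -> str:
+    """Return the first known label mentioned in the model's reply."""
+    for label in LABELS:
+        # Word-boundary match so "male" is not found inside "female".
+        if re.search(rf"\b{label}\b", answer.lower()):
+            return label
+    return "unparseable"
+
+# Toy data standing in for model replies and gold labels.
+raw_answers = ["The user is likely male.", "Female, given the HR patterns."]
+golds = ["male", "female"]
+preds = [parse_label(a) for a in raw_answers]
+
+print("accuracy:", accuracy_score(golds, preds))
+print("macro F1:", f1_score(golds, preds, average="macro"))
+```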
+
+Currently Sherlock-specific: the paths below assume `$SCRATCH` and an
+MHC-benchmark checkout at `~/MHC-benchmark`.
+
+## Prerequisites
+
+| Artifact | Path | How to build |
+|---|---|---|
+| Weekly HF dataset (stride=7) | `$SCRATCH/mhc-benchmark-weekly_hf_stride7/` | [1] |
+| Captioned arrow shards | `$SCRATCH/exports/lean_full_stride7/shard_{0..3}/recordings_*.arrow` | [2] |
+| Labels lookup parquet | `$SCRATCH/weekly_labels_lookup_ours_stride7.parquet` | [3] |
+| OpenAI API key | `~/SensorTSLM/.env` (`OPENAI_API_KEY=...`) | manual |
+
+Upstream MHC-benchmark deps (one-time DVC pulls inside `~/MHC-benchmark`,
+using its conda env):
+
+```bash
+cd ~/MHC-benchmark
+dvc pull data/processed/daily_hourly_hf.dvc  # ~4 GB, feeds [1]
+dvc pull data/labels.dvc                     # ~600 MB, feeds [3]
+```
+
+## [1] Weekly HF dataset (stride=7)
+
+Builds non-overlapping 7-day windows aligned to each user's first valid
+day. ~156k rows, ~40 min.
+
+```bash
+srun --mem=32G -c 4 -t 02:30:00 -p normal bash -c '
+  export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
+  export PYTHONPATH=$HOME/MHC-benchmark/src:$HOME/MHC-benchmark
+  cd ~/MHC-benchmark
+  python -u -m data.processing.daily_hourly_hf_to_weekly_hf \
+    --daily-hourly-path data/processed/daily_hourly_hf \
+    --output-path $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
+    --stride-days 7
+'
+```
+
+## [2] Captioned arrow shards
+
+Four parallel sbatch jobs, ~30 min total.
+
+```bash
+cd ~/SensorTSLM
+export MHC_WEEKLY_DATASET_DIR=$SCRATCH/mhc-benchmark-weekly_hf_stride7
+export TOTAL_ROWS=156428
+./scripts/export_captions_sharded.sh weekly 4 $SCRATCH/exports/lean_full_stride7
+# wait for the jobs to finish: squeue -u $USER
+```
+
+## [3] Labels lookup parquet
+
+Reads `(user_id, week_start)` pairs from the stride=7 HF dataset and
+attaches labels from MHC-benchmark's `labels.json`, so it is by
+construction 100% aligned to our captions. ~2 min.
+
+```bash
+srun --mem=16G -c 2 -t 00:30:00 -p normal bash -c '
+  export PATH=$SCRATCH/mhc-benchmark/miniforge3/envs/mhc-benchmark-run/bin:$PATH
+  cd ~/MHC-benchmark
+  python scripts/build_labels_lookup.py \
+    --hf-dir $SCRATCH/mhc-benchmark-weekly_hf_stride7 \
+    --segment-type weekly \
+    --output $SCRATCH/weekly_labels_lookup_ours_stride7.parquet
+'
+```
+
+## Run Eval
+
+Three backends, selected via `--backend` (default `openai`):
+
+| Backend     | What `--model` means                 | Example |
+|-------------|--------------------------------------|---------|
+| `openai`    | Chat model name at `OPENAI_BASE_URL` | `gpt-4o` (VLM, plots in prompt) |
+| `vlm-local` | HF image-text-to-text model ID       | `google/gemma-3-4b-it` |
+| `opentslm`  | HF Hub repo_id (auto-pulls backbone and weights), or an LLM backbone ID plus `--opentslm-checkpoint` pointing at a local `.pt` | `OpenTSLM/llama-3.2-1b-har-sp` |
+
+Configuration via env vars (all optional; the sketch below shows how they
+resolve):
+
+- `MHC_EXPORT_DIR` (default `exports`): Arrow shard directory
+- `MHC_LABELS_PATH` (default `data/labels_dev_subset.parquet`): labels parquet
+- `OPENAI_BASE_URL` (default `https://api.openai.com/v1`): OpenAI-compatible endpoint
+- `MAX_SAMPLES` (unset = full eval): cap on samples for debug runs
+
+Split is hardcoded to `test`.
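+
+A minimal sketch of how these variables might be consumed (defaults
+mirror the list above; illustrative, not the literal `run_eval.py` code):
+
+```python
+import os
+
+# Fall back to the documented defaults when a variable is unset.
+export_dir = os.environ.get("MHC_EXPORT_DIR", "exports")
+labels_path = os.environ.get("MHC_LABELS_PATH", "data/labels_dev_subset.parquet")
+base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
+
+# An unset MAX_SAMPLES means "run the full eval".
+raw = os.environ.get("MAX_SAMPLES")
+max_samples = int(raw) if raw else None
+```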
+
+```bash
+srun --mem=16G -c 2 -t 00:15:00 -p normal bash -c '
+  ml python/3.12.1 && ml libsndfile && \
+  source $SCRATCH/sensor-venv/bin/activate && \
+  export HF_HOME=$SCRATCH/hf_cache && \
+  export MHC_EXPORT_DIR=$SCRATCH/exports/lean_full_stride7 && \
+  export MHC_LABELS_PATH=$SCRATCH/weekly_labels_lookup_ours_stride7.parquet && \
+  export MAX_SAMPLES=5 && \
+  cd ~/SensorTSLM && \
+  python evaluation/run_eval.py --task biological_sex --model gpt-4o
+'
+```
+
+Results are written to `results/