77# Set LOCALCOWORK_MODELS_DIR env var to override.
88# Ollama-managed models use Ollama's own storage (~/.ollama/models/).
99#
10+ # IMPORTANT: When passing local_dir to huggingface_hub, point it at your
11+ # models directory (outside the project repo), never at the repo itself.
12+ # The huggingface_hub library creates a .cache/huggingface folder inside
13+ # local_dir for tracking downloads, which can grow large and should not
14+ # be committed to the project repo.
14+ #
15+ # Example to download to your models directory:
16+ # import os
17+ # from huggingface_hub import hf_hub_download
18+ # hf_hub_download('LiquidAI/LFM2-24B-A2B', 'LFM2-24B-A2B-Q4_K_M.gguf',
19+ # local_dir=os.path.expanduser('~/Projects/_models'))
20+ # (Note: shell syntax like ${HOME} does not expand inside a Python string.)
19+ #
1020# Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
1121# The config-loader resolves environment variables at load time.
1222
13- active_model : lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
23+ active_model : lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
1424
1525# Default model directory for non-Ollama model files (GGUF, MLX, etc.)
1626models_dir : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}"
@@ -76,11 +86,11 @@ models:
7686 model_name : " gpt-oss:20b"
7787 base_url : " http://localhost:11434/v1"
7888 context_window : 32768
79- tool_call_format : native_json # Native function calling + structured outputs
89+ tool_call_format : native_json # Native function calling + structured outputs
8090 temperature : 0.7
8191 max_tokens : 4096
8292 estimated_vram_gb : 14
83- force_json_response : false # Enable after live testing — triggers GBNF grammar enforcement
93+ force_json_response : false # Enable after live testing — triggers GBNF grammar enforcement
8494 capabilities :
8595 - text
8696 - tool_calling
@@ -92,7 +102,7 @@ models:
92102 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf"
93103 base_url : " http://localhost:8080/v1"
94104 context_window : 32768
95- tool_call_format : pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
105+ tool_call_format : pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
96106 temperature : 0.7
97107 max_tokens : 4096
98108 estimated_vram_gb : 14
@@ -102,20 +112,20 @@ models:
102112
103113 # LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
104114 # Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
105- # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
115+ # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access)
106116 # Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
107117 # Run: llama-server --model <path> --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
108118 lfm2-24b-a2b :
109- display_name : " LFM2-24B-A2B-Preview "
119+ display_name : " LFM2-24B-A2B"
110120 runtime : llama_cpp
111- model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview- Q4_K_M.gguf"
121+ model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
112122 base_url : " http://localhost:8080/v1"
113123 context_window : 32768
114- tool_call_format : bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
124+ tool_call_format : bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
115125 temperature : 0.7
116- tool_temperature : 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
126+ tool_temperature : 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
117127 max_tokens : 4096
118- estimated_vram_gb : 16 # Q4_K_M quantization estimate for 24B MoE
128+ estimated_vram_gb : 16 # Q4_K_M quantization estimate for 24B MoE
119129 capabilities :
120130 - text
121131 - tool_calling
@@ -130,7 +140,7 @@ models:
130140 tool_call_format : native_json
131141 temperature : 0.7
132142 max_tokens : 4096
133- estimated_vram_gb : 4 # Only ~3B active params
143+ estimated_vram_gb : 4 # Only ~3B active params
134144 capabilities :
135145 - text
136146 - tool_calling
@@ -152,7 +162,7 @@ models:
152162 tool_call_format : native_json
153163 temperature : 0.1
154164 max_tokens : 4096
155- estimated_vram_gb : 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
165+ estimated_vram_gb : 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
156166 capabilities :
157167 - text
158168 - vision
@@ -170,10 +180,10 @@ models:
170180 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf"
171181 base_url : " http://localhost:8082/v1"
172182 context_window : 32768
173- tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
183+ tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
174184 temperature : 0.1
175185 max_tokens : 512
176- estimated_vram_gb : 1.5 # Q8_0 quantization (1.2 GB)
186+ estimated_vram_gb : 1.5 # Q8_0 quantization (1.2 GB)
177187 role : tool_router
178188 fine_tuned :
179189 method : lora
@@ -198,7 +208,7 @@ models:
198208 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf"
199209 base_url : " http://localhost:8084/v1"
200210 context_window : 32768
201- tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
211+ tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
202212 temperature : 0.1
203213 max_tokens : 512
204214 estimated_vram_gb : 2.3
@@ -238,6 +248,24 @@ models:
238248 - text
239249 - tool_calling
240250
251+ # LM Studio headless server — any model loaded in LM Studio
252+ # Run `lms server start` or enable "Run LLM server on login" in app settings.
253+ # Default port is 1234. Uses OpenAI-compatible API.
254+ # Note: The model_name here is informational — update it to match the model ID loaded in LM Studio.
255+ lmstudio-default :
256+ display_name : " LM Studio (Default)"
257+ runtime : lmstudio
258+ model_name : " liquid/lfm2-24b-a2b" # Replace with your loaded model ID
259+ base_url : " http://localhost:1234/v1"
260+ context_window : 32768
261+ tool_call_format : native_json
262+ temperature : 0.7
263+ max_tokens : 4096
264+ estimated_vram_gb : null # Varies by loaded model
265+ capabilities :
266+ - text
267+ - tool_calling
268+
241269 # ─── Benchmark comparison models ────────────────────────────────────────
242270 # These models are benchmarked against LFM2-24B-A2B to demonstrate
243271 # scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE.
@@ -290,7 +318,7 @@ models:
290318 tool_temperature : 0.1
291319 max_tokens : 4096
292320 estimated_vram_gb : 20
293- deprecated : true # Partial run only (40/100), dropped from active benchmarks
321+ deprecated : true # Partial run only (40/100), dropped from active benchmarks
294322 capabilities :
295323 - text
296324 - tool_calling
@@ -358,7 +386,9 @@ models:
358386 - text
359387 - tool_calling
360388
361- # Runtime configurations
389+ # Runtime configurations (informational only — not used by the app)
390+ # These describe how to start each runtime for reference. The app
391+ # expects the runtime to already be running when it starts.
362392runtimes :
363393 ollama :
364394 command : " ollama serve"
@@ -371,6 +401,14 @@ runtimes:
371401 health_check : " http://localhost:8080/health"
372402 startup_timeout_seconds : 60
373403
404+ lmstudio :
405+ # Use `lms server start` CLI to start headless, or enable "Run LLM server
406+ # on login" in app settings (Cmd/Ctrl+,). Default port is 1234.
407+ command : " lms"
408+ args : ["server", "start"]
409+ health_check : " http://localhost:1234/v1/models"
410+ startup_timeout_seconds : 30
411+
374412 mlx :
375413 command : " mlx_lm.server"
376414 args : ["--model", "{model_path}", "--port", "8080"]
@@ -380,9 +418,9 @@ runtimes:
380418
381419# Fallback chain — used when the active model is unavailable
382420fallback_chain :
383- - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
384- - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
385- - static_response # Fallback 2 — hardcoded "model unavailable" message
421+ - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
422+ - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
423+ - static_response # Fallback 2 — hardcoded "model unavailable" message
386424
387425# Dual-model orchestrator (ADR-009)
388426# When enabled, the planner model decomposes multi-step workflows and
@@ -402,9 +440,9 @@ fallback_chain:
402440# to skip the orchestrator entirely and avoid the ~2-3s wasted planner call.
403441# See ADR-009 for full details.
404442orchestrator :
405- enabled : false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
443+ enabled : false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
406444 planner_model : lfm2-24b-a2b
407- router_model : lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
445+ router_model : lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
408446 router_top_k : 15
409447 max_plan_steps : 10
410448 step_retries : 3
@@ -414,4 +452,4 @@ orchestrator:
414452# ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
415453# The model selects 2-3 categories, then subsequent turns use only those tools.
416454# Saves ~7,170 tokens per turn and eliminates cross-server confusion.
417- two_pass_tool_selection : true # Active only when >30 tools registered; 21 curated tools use flat mode
455+ two_pass_tool_selection : true # Active only when >30 tools registered; 21 curated tools use flat mode
0 commit comments