Skip to content

Commit c787d50

Browse files
committed
fix: move HuggingFace cache to user home, add gitignore
- Add _models/.cache/ to .gitignore to prevent accidental cache commits
- Set HF_HOME and HF_HUB_CACHE in scripts to ensure cache goes to ~/.cache
- Add warning in config about using local_dir outside project repo
- Add note in .env.example about storing models outside project

This prevents large HuggingFace model caches from being stored in the project directory (fixes 77GB cache issue).
1 parent 0959763 commit c787d50

File tree

4 files changed

+179
-132
lines changed

4 files changed

+179
-132
lines changed

examples/localcowork/.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# ─── Model Configuration ─────────────────────────────────────────────────────
77

88
# Directory containing GGUF model files (downloaded from HuggingFace)
9+
# IMPORTANT: Store models OUTSIDE the project repo (e.g., ~/Projects/_models).
10+
# The huggingface_hub library creates .cache/huggingface for tracking downloads,
11+
# which should not be in the project directory.
912
# LOCALCOWORK_MODELS_DIR=~/Projects/_models
1013

1114
# Text model API endpoint (OpenAI-compatible). Set by start-model.sh.

examples/localcowork/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ src-tauri/gen/
1717
_models/*.gguf
1818
_models/*.bin
1919
_models/*.safetensors
20+
_models/.cache/
2021

2122
# ─── IDE ──────────────────────────────────────────────────────────────────
2223
.vscode/

examples/localcowork/_models/config.yaml

Lines changed: 61 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,20 @@
77
# Set LOCALCOWORK_MODELS_DIR env var to override.
88
# Ollama-managed models use Ollama's own storage (~/.ollama/models/).
99
#
10+
# IMPORTANT: When downloading models via huggingface_hub, use your home
11+
# directory (default) rather than this directory. The huggingface_hub
12+
# library creates a .cache/huggingface folder for tracking downloads,
13+
# which can grow large and should not be in the project repo.
14+
#
15+
# Example to download to your models directory:
16+
# from huggingface_hub import hf_hub_download
17+
# hf_hub_download('LiquidAI/LFM2-24B-A2B', 'LFM2-24B-A2B-Q4_K_M.gguf',
18+
# local_dir='${HOME}/Projects/_models')
19+
#
1020
# Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
1121
# The config-loader resolves environment variables at load time.
1222

13-
active_model: lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
23+
active_model: lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
1424

1525
# Default model directory for non-Ollama model files (GGUF, MLX, etc.)
1626
models_dir: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}"
@@ -76,11 +86,11 @@ models:
7686
model_name: "gpt-oss:20b"
7787
base_url: "http://localhost:11434/v1"
7888
context_window: 32768
79-
tool_call_format: native_json # Native function calling + structured outputs
89+
tool_call_format: native_json # Native function calling + structured outputs
8090
temperature: 0.7
8191
max_tokens: 4096
8292
estimated_vram_gb: 14
83-
force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement
93+
force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement
8494
capabilities:
8595
- text
8696
- tool_calling
@@ -92,7 +102,7 @@ models:
92102
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf"
93103
base_url: "http://localhost:8080/v1"
94104
context_window: 32768
95-
tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
105+
tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
96106
temperature: 0.7
97107
max_tokens: 4096
98108
estimated_vram_gb: 14
@@ -102,20 +112,20 @@ models:
102112

103113
# LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
104114
# Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
105-
# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
115+
# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access)
106116
# Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
107117
# Run: llama-server --model <path> --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
108118
lfm2-24b-a2b:
109-
display_name: "LFM2-24B-A2B-Preview"
119+
display_name: "LFM2-24B-A2B"
110120
runtime: llama_cpp
111-
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview-Q4_K_M.gguf"
121+
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
112122
base_url: "http://localhost:8080/v1"
113123
context_window: 32768
114-
tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
124+
tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
115125
temperature: 0.7
116-
tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
126+
tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
117127
max_tokens: 4096
118-
estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE
128+
estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE
119129
capabilities:
120130
- text
121131
- tool_calling
@@ -130,7 +140,7 @@ models:
130140
tool_call_format: native_json
131141
temperature: 0.7
132142
max_tokens: 4096
133-
estimated_vram_gb: 4 # Only ~3B active params
143+
estimated_vram_gb: 4 # Only ~3B active params
134144
capabilities:
135145
- text
136146
- tool_calling
@@ -152,7 +162,7 @@ models:
152162
tool_call_format: native_json
153163
temperature: 0.1
154164
max_tokens: 4096
155-
estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
165+
estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
156166
capabilities:
157167
- text
158168
- vision
@@ -170,10 +180,10 @@ models:
170180
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf"
171181
base_url: "http://localhost:8082/v1"
172182
context_window: 32768
173-
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
183+
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
174184
temperature: 0.1
175185
max_tokens: 512
176-
estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB)
186+
estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB)
177187
role: tool_router
178188
fine_tuned:
179189
method: lora
@@ -198,7 +208,7 @@ models:
198208
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf"
199209
base_url: "http://localhost:8084/v1"
200210
context_window: 32768
201-
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
211+
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
202212
temperature: 0.1
203213
max_tokens: 512
204214
estimated_vram_gb: 2.3
@@ -238,6 +248,24 @@ models:
238248
- text
239249
- tool_calling
240250

251+
# LM Studio headless server — any model loaded in LM Studio
252+
# Run `lms server start` or enable "Run LLM server on login" in app settings.
253+
# Default port is 1234. Uses OpenAI-compatible API.
254+
# Note: The model_name here is informational - update to match your loaded model.
255+
lmstudio-default:
256+
display_name: "LM Studio (Default)"
257+
runtime: lmstudio
258+
model_name: "liquid/lfm2-24b-a2b" # Replace with your loaded model ID
259+
base_url: "http://localhost:1234/v1"
260+
context_window: 32768
261+
tool_call_format: native_json
262+
temperature: 0.7
263+
max_tokens: 4096
264+
estimated_vram_gb: null # Varies by loaded model
265+
capabilities:
266+
- text
267+
- tool_calling
268+
241269
# ─── Benchmark comparison models ────────────────────────────────────────
242270
# These models are benchmarked against LFM2-24B-A2B to demonstrate
243271
# scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE.
@@ -290,7 +318,7 @@ models:
290318
tool_temperature: 0.1
291319
max_tokens: 4096
292320
estimated_vram_gb: 20
293-
deprecated: true # Partial run only (40/100), dropped from active benchmarks
321+
deprecated: true # Partial run only (40/100), dropped from active benchmarks
294322
capabilities:
295323
- text
296324
- tool_calling
@@ -358,7 +386,9 @@ models:
358386
- text
359387
- tool_calling
360388

361-
# Runtime configurations
389+
# Runtime configurations (informational only — not used by the app)
390+
# These describe how to start each runtime for reference. The app
391+
# expects the runtime to already be running when it starts.
362392
runtimes:
363393
ollama:
364394
command: "ollama serve"
@@ -371,6 +401,14 @@ runtimes:
371401
health_check: "http://localhost:8080/health"
372402
startup_timeout_seconds: 60
373403

404+
lmstudio:
405+
# Use `lms server start` CLI to start headless, or enable "Run LLM server
406+
# on login" in app settings (Cmd/Ctrl+,). Default port is 1234.
407+
command: "lms"
408+
args: ["server", "start"]
409+
health_check: "http://localhost:1234/v1/models"
410+
startup_timeout_seconds: 30
411+
374412
mlx:
375413
command: "mlx_lm.server"
376414
args: ["--model", "{model_path}", "--port", "8080"]
@@ -380,9 +418,9 @@ runtimes:
380418

381419
# Fallback chain — used when the active model is unavailable
382420
fallback_chain:
383-
- lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
384-
- qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
385-
- static_response # Fallback 2 — hardcoded "model unavailable" message
421+
- lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
422+
- qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
423+
- static_response # Fallback 2 — hardcoded "model unavailable" message
386424

387425
# Dual-model orchestrator (ADR-009)
388426
# When enabled, the planner model decomposes multi-step workflows and
@@ -402,9 +440,9 @@ fallback_chain:
402440
# to skip the orchestrator entirely and avoid the ~2-3s wasted planner call.
403441
# See ADR-009 for full details.
404442
orchestrator:
405-
enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
443+
enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
406444
planner_model: lfm2-24b-a2b
407-
router_model: lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
445+
router_model: lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
408446
router_top_k: 15
409447
max_plan_steps: 10
410448
step_retries: 3
@@ -414,4 +452,4 @@ orchestrator:
414452
# ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
415453
# The model selects 2-3 categories, then subsequent turns use only those tools.
416454
# Saves ~7,170 tokens per turn and eliminates cross-server confusion.
417-
two_pass_tool_selection: true # Active only when >30 tools registered; 21 curated tools use flat mode
455+
two_pass_tool_selection: true # Active only when >30 tools registered; 21 curated tools use flat mode

0 commit comments

Comments
 (0)