77# Set LOCALCOWORK_MODELS_DIR env var to override.
88# Ollama-managed models use Ollama's own storage (~/.ollama/models/).
99#
10+ # IMPORTANT: When passing local_dir to huggingface_hub, point it at your
11+ # models directory (outside the project repo), never at the repo itself.
12+ # The huggingface_hub library creates a .cache/huggingface folder inside
13+ # local_dir for tracking downloads, which can grow large and should not
14+ # be committed to the project repo.
14+ #
15+ # Example to download to your models directory:
16+ # import os
17+ # from huggingface_hub import hf_hub_download
18+ # hf_hub_download('LiquidAI/LFM2-24B-A2B', 'LFM2-24B-A2B-Q4_K_M.gguf',
19+ # local_dir=os.path.expanduser('~/Projects/_models'))
20+ # (Note: shell syntax like ${HOME} does not expand inside a Python string.)
19+ #
1020# Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
1121# The config-loader resolves environment variables at load time.
1222
13- active_model : lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
23+ active_model : lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
1424
1525# Default model directory for non-Ollama model files (GGUF, MLX, etc.)
1626models_dir : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}"
@@ -76,11 +86,11 @@ models:
7686 model_name : " gpt-oss:20b"
7787 base_url : " http://localhost:11434/v1"
7888 context_window : 32768
79- tool_call_format : native_json # Native function calling + structured outputs
89+ tool_call_format : native_json # Native function calling + structured outputs
8090 temperature : 0.7
8191 max_tokens : 4096
8292 estimated_vram_gb : 14
83- force_json_response : false # Enable after live testing — triggers GBNF grammar enforcement
93+ force_json_response : false # Enable after live testing — triggers GBNF grammar enforcement
8494 capabilities :
8595 - text
8696 - tool_calling
@@ -92,7 +102,7 @@ models:
92102 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf"
93103 base_url : " http://localhost:8080/v1"
94104 context_window : 32768
95- tool_call_format : pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
105+ tool_call_format : pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
96106 temperature : 0.7
97107 max_tokens : 4096
98108 estimated_vram_gb : 14
@@ -102,20 +112,20 @@ models:
102112
103113 # LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
104114 # Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
105- # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
115+ # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access)
106116 # Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
107117 # Run: llama-server --model <path> --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
108118 lfm2-24b-a2b :
109- display_name : " LFM2-24B-A2B-Preview "
119+ display_name : " LFM2-24B-A2B"
110120 runtime : llama_cpp
111- model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview- Q4_K_M.gguf"
121+ model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
112122 base_url : " http://localhost:8080/v1"
113123 context_window : 32768
114- tool_call_format : bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
124+ tool_call_format : bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
115125 temperature : 0.7
116- tool_temperature : 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
126+ tool_temperature : 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
117127 max_tokens : 4096
118- estimated_vram_gb : 16 # Q4_K_M quantization estimate for 24B MoE
128+ estimated_vram_gb : 16 # Q4_K_M quantization estimate for 24B MoE
119129 capabilities :
120130 - text
121131 - tool_calling
@@ -130,7 +140,7 @@ models:
130140 tool_call_format : native_json
131141 temperature : 0.7
132142 max_tokens : 4096
133- estimated_vram_gb : 4 # Only ~3B active params
143+ estimated_vram_gb : 4 # Only ~3B active params
134144 capabilities :
135145 - text
136146 - tool_calling
@@ -152,7 +162,7 @@ models:
152162 tool_call_format : native_json
153163 temperature : 0.1
154164 max_tokens : 4096
155- estimated_vram_gb : 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
165+ estimated_vram_gb : 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
156166 capabilities :
157167 - text
158168 - vision
@@ -170,10 +180,10 @@ models:
170180 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf"
171181 base_url : " http://localhost:8082/v1"
172182 context_window : 32768
173- tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
183+ tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
174184 temperature : 0.1
175185 max_tokens : 512
176- estimated_vram_gb : 1.5 # Q8_0 quantization (1.2 GB)
186+ estimated_vram_gb : 1.5 # Q8_0 quantization (1.2 GB)
177187 role : tool_router
178188 fine_tuned :
179189 method : lora
@@ -198,7 +208,7 @@ models:
198208 model_path : " ${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf"
199209 base_url : " http://localhost:8084/v1"
200210 context_window : 32768
201- tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
211+ tool_call_format : bracket # LFM2.5 bracket format: [server.tool(args)]
202212 temperature : 0.1
203213 max_tokens : 512
204214 estimated_vram_gb : 2.3
@@ -238,6 +248,24 @@ models:
238248 - text
239249 - tool_calling
240250
251+ # LM Studio headless server — any model loaded in LM Studio
252+ # Run `lms server start` or enable "Run LLM server on login" in app settings.
253+ # Default port is 1234. Uses OpenAI-compatible API.
254+ # Note: The model_name here is informational — update it to match the model ID loaded in LM Studio.
255+ lmstudio-default :
256+ display_name : " LM Studio (Default)"
257+ runtime : lmstudio
258+ model_name : " liquid/lfm2-24b-a2b" # Replace with your loaded model ID
259+ base_url : " http://localhost:1234/v1"
260+ context_window : 32768
261+ tool_call_format : native_json
262+ temperature : 0.7
263+ max_tokens : 4096
264+ estimated_vram_gb : null # Varies by loaded model
265+ capabilities :
266+ - text
267+ - tool_calling
268+
241269 # ─── Benchmark comparison models ────────────────────────────────────────
242270 # These models are benchmarked against LFM2-24B-A2B to demonstrate
243271 # scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE.
@@ -290,7 +318,7 @@ models:
290318 tool_temperature : 0.1
291319 max_tokens : 4096
292320 estimated_vram_gb : 20
293- deprecated : true # Partial run only (40/100), dropped from active benchmarks
321+ deprecated : true # Partial run only (40/100), dropped from active benchmarks
294322 capabilities :
295323 - text
296324 - tool_calling
@@ -358,7 +386,9 @@ models:
358386 - text
359387 - tool_calling
360388
361- # Runtime configurations
389+ # Runtime configurations (informational only — not used by the app)
390+ # These describe how to start each runtime for reference. The app
391+ # expects the runtime to already be running when it starts.
362392runtimes :
363393 ollama :
364394 command : " ollama serve"
@@ -371,6 +401,14 @@ runtimes:
371401 health_check : " http://localhost:8080/health"
372402 startup_timeout_seconds : 60
373403
404+ lmstudio :
405+ # Use `lms server start` CLI to start headless, or enable "Run LLM server
406+ # on login" in app settings (Cmd/Ctrl+,). Default port is 1234.
407+ command : " lms"
408+ args : ["server", "start"]
409+ health_check : " http://localhost:1234/v1/models"
410+ startup_timeout_seconds : 30
411+
374412 mlx :
375413 command : " mlx_lm.server"
376414 args : ["--model", "{model_path}", "--port", "8080"]
@@ -380,9 +418,9 @@ runtimes:
380418
381419# Fallback chain — used when the active model is unavailable
382420fallback_chain :
383- - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
384- - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
385- - static_response # Fallback 2 — hardcoded "model unavailable" message
421+ - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
422+ - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
423+ - static_response # Fallback 2 — hardcoded "model unavailable" message
386424
387425# Dual-model orchestrator (ADR-009)
388426# When enabled, the planner model decomposes multi-step workflows and
@@ -402,9 +440,9 @@ fallback_chain:
402440# to skip the orchestrator entirely and avoid the ~2-3s wasted planner call.
403441# See ADR-009 for full details.
404442orchestrator :
405- enabled : false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
443+ enabled : false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
406444 planner_model : lfm2-24b-a2b
407- router_model : lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
445+ router_model : lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
408446 router_top_k : 15
409447 max_plan_steps : 10
410448 step_retries : 3
@@ -414,4 +452,4 @@ orchestrator:
414452# ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
415453# The model selects 2-3 categories, then subsequent turns use only those tools.
416454# Saves ~7,170 tokens per turn and eliminates cross-server confusion.
417- two_pass_tool_selection : true # Active only when >30 tools registered; 21 curated tools use flat mode
455+ two_pass_tool_selection : true # Active only when >30 tools registered; 21 curated tools use flat mode
0 commit comments