74 changes: 51 additions & 23 deletions examples/localcowork/_models/config.yaml
@@ -10,7 +10,7 @@
# Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
# The config-loader resolves environment variables at load time.

active_model: lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy

# Default model directory for non-Ollama model files (GGUF, MLX, etc.)
models_dir: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}"
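The `${VAR:-default}` form mirrors shell parameter expansion: use the environment variable if it is set, otherwise fall back to the default. A minimal sketch of resolving one such placeholder (a hypothetical helper, separate from the app's config-loader):

```rust
use std::env;

// Sketch of `${VAR:-default}` expansion as the header describes; handles a
// single placeholder per value, which is all this config uses. Illustrative
// only, not the app's loader.
fn interpolate(raw: &str) -> String {
    let (Some(start), Some(end)) = (raw.find("${"), raw.find('}')) else {
        return raw.to_string();
    };
    let inner = &raw[start + 2..end]; // e.g. "LOCALCOWORK_MODELS_DIR:-~/Projects/_models"
    let (var, default) = inner.split_once(":-").unwrap_or((inner, ""));
    let value = env::var(var).unwrap_or_else(|_| default.to_string());
    format!("{}{}{}", &raw[..start], value, &raw[end + 1..])
}
```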
@@ -76,11 +76,11 @@ models:
model_name: "gpt-oss:20b"
base_url: "http://localhost:11434/v1"
context_window: 32768
tool_call_format: native_json # Native function calling + structured outputs
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 14
force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement
capabilities:
- text
- tool_calling
@@ -92,7 +92,7 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf"
base_url: "http://localhost:8080/v1"
context_window: 32768
tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 14
@@ -102,20 +102,20 @@ models:

# LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
# Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
-# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
+# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access)
# Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
# Run: llama-server --model <path> --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
lfm2-24b-a2b:
-display_name: "LFM2-24B-A2B-Preview"
+display_name: "LFM2-24B-A2B"
runtime: llama_cpp
-model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
base_url: "http://localhost:8080/v1"
context_window: 32768
tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
temperature: 0.7
tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
max_tokens: 4096
estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE
capabilities:
- text
- tool_calling
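The bracket format above is easy to recognize mechanically. A regex sketch of the idea, assuming the `regex` crate (the app's real parser is tool_call_parser.rs; this version is only illustrative):

```rust
use regex::Regex;

// Illustrative parse of the LFM2 bracket format, e.g.
// `[fs.read_file(path="notes.md")]` -> ("fs.read_file", `path="notes.md"`).
// The app's real parser is tool_call_parser.rs; this is only a sketch.
fn parse_bracket_call(text: &str) -> Option<(String, String)> {
    // Capture 1: dotted `server.tool` name; capture 2: raw argument string.
    let re = Regex::new(r"\[([A-Za-z_][\w.]*)\((.*?)\)\]").ok()?;
    let caps = re.captures(text)?;
    Some((caps[1].to_string(), caps[2].to_string()))
}
```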
@@ -130,7 +130,7 @@ models:
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 4 # Only ~3B active params
capabilities:
- text
- tool_calling
@@ -152,7 +152,7 @@ models:
tool_call_format: native_json
temperature: 0.1
max_tokens: 4096
estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
capabilities:
- text
- vision
@@ -170,10 +170,10 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf"
base_url: "http://localhost:8082/v1"
context_window: 32768
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
temperature: 0.1
max_tokens: 512
estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB)
role: tool_router
fine_tuned:
method: lora
@@ -198,7 +198,7 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf"
base_url: "http://localhost:8084/v1"
context_window: 32768
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
temperature: 0.1
max_tokens: 512
estimated_vram_gb: 2.3
@@ -238,6 +238,24 @@ models:
- text
- tool_calling

# LM Studio headless server — any model loaded in LM Studio
# Run `lms server start` or enable "Run LLM server on login" in app settings.
# Default port is 1234. Uses OpenAI-compatible API.
# Note: The model_name here is informational; update it to match your loaded model.
lmstudio-default:
display_name: "LM Studio (Default)"
runtime: lmstudio
model_name: "liquid/lfm2-24b-a2b" # Replace with your loaded model ID
base_url: "http://localhost:1234/v1"
context_window: 32768
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: null # Varies by loaded model
capabilities:
- text
- tool_calling
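To find the ID to put in `model_name`, you can ask the server what it has loaded. A minimal sketch, assuming the `reqwest` crate (with the `blocking` and `json` features) and `serde_json`; this helper is illustrative and not part of the app:

```rust
// Sketch: list the models a running LM Studio server exposes, so the
// `model_name` above can be set to a real loaded-model ID.
// Assumes the `reqwest` (blocking + json features) and `serde_json` crates.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let body: serde_json::Value =
        reqwest::blocking::get("http://localhost:1234/v1/models")?.json()?;
    // OpenAI-compatible shape: {"object": "list", "data": [{"id": "..."}]}
    for model in body["data"].as_array().into_iter().flatten() {
        println!("{}", model["id"].as_str().unwrap_or("?"));
    }
    Ok(())
}
```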

# ─── Benchmark comparison models ────────────────────────────────────────
# These models are benchmarked against LFM2-24B-A2B to demonstrate
# scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE.
@@ -290,7 +308,7 @@ models:
tool_temperature: 0.1
max_tokens: 4096
estimated_vram_gb: 20
deprecated: true # Partial run only (40/100), dropped from active benchmarks
capabilities:
- text
- tool_calling
@@ -358,7 +376,9 @@ models:
- text
- tool_calling

-# Runtime configurations
+# Runtime configurations (informational only — not used by the app)
+# These describe how to start each runtime for reference. The app
+# expects the runtime to already be running when it starts.
runtimes:
ollama:
command: "ollama serve"
@@ -371,6 +391,14 @@ runtimes:
health_check: "http://localhost:8080/health"
startup_timeout_seconds: 60

lmstudio:
# Use `lms server start` CLI to start headless, or enable "Run LLM server
# on login" in app settings (Cmd/Ctrl+,). Default port is 1234.
command: "lms"
args: ["server", "start"]
health_check: "http://localhost:1234/v1/models"
startup_timeout_seconds: 30
Comment on lines +394 to +400

Copilot AI (Mar 6, 2026):

The new runtimes.lmstudio section is currently ignored by the Rust loader because ModelsConfig has no runtimes field and Serde will drop unknown keys. If this is meant to drive runtime startup/health-check behavior, it should be added to ModelsConfig (and used) or the YAML should explicitly document that the runtimes map is informational-only so readers don’t assume it affects behavior.
Author:

@copilot open a new pull request to apply changes based on this feedback
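For reference, a minimal sketch of the change the comment asks for: give `ModelsConfig` an optional `runtimes` map so the section is actually deserialized. The `RuntimeConfig` shape here is an assumption read off the YAML above, not the crate's real type:

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Hypothetical shape inferred from the YAML `runtimes:` entries above.
#[derive(Debug, Deserialize)]
pub struct RuntimeConfig {
    pub command: Option<String>,
    #[serde(default)]
    pub args: Vec<String>,
    pub health_check: Option<String>,
    pub startup_timeout_seconds: Option<u64>,
}

#[derive(Debug, Deserialize)]
pub struct ModelsConfig {
    pub active_model: String,
    // `#[serde(default)]` keeps older configs without a `runtimes:` key valid.
    #[serde(default)]
    pub runtimes: HashMap<String, RuntimeConfig>,
    // ... existing fields (models, fallback_chain, orchestrator) unchanged
}
```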


mlx:
command: "mlx_lm.server"
args: ["--model", "{model_path}", "--port", "8080"]
@@ -380,9 +408,9 @@ runtimes:

# Fallback chain — used when the active model is unavailable
fallback_chain:
- lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
- qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
- static_response # Fallback 2 — hardcoded "model unavailable" message
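A sketch of how walking such a chain can look; this is a hypothetical helper, generic over the model type, not the app's actual resolution code:

```rust
use std::collections::HashMap;

// Hypothetical helper: try each key in the fallback chain in order and
// return the first one that exists in the config map. The `static_response`
// sentinel would be handled by the caller when this returns None.
fn resolve_with_fallback<'a, M>(
    models: &'a HashMap<String, M>,
    chain: &[&str],
) -> Option<(&'a str, &'a M)> {
    chain
        .iter()
        .find_map(|key| models.get_key_value(*key).map(|(k, m)| (k.as_str(), m)))
}
```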

# Dual-model orchestrator (ADR-009)
# When enabled, the planner model decomposes multi-step workflows and
@@ -402,9 +430,9 @@ fallback_chain:
# to skip the orchestrator entirely and avoid the ~2-3s wasted planner call.
# See ADR-009 for full details.
orchestrator:
enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
planner_model: lfm2-24b-a2b
router_model: lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
router_top_k: 15
max_plan_steps: 10
step_retries: 3
@@ -414,4 +442,4 @@ orchestrator:
# ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
# The model selects 2-3 categories, then subsequent turns use only those tools.
# Saves ~7,170 tokens per turn and eliminates cross-server confusion.
two_pass_tool_selection: true # Active only when >30 tools registered; 21 curated tools use flat mode
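The two-pass behavior reduces to a simple size gate. A sketch under assumed names (the threshold mirrors the comments above; the function and types are not the app's actual API):

```rust
// Illustrative gate for the two-pass selection described above; names and
// the 30-tool threshold mirror the YAML comments, not the app's real code.
fn tools_for_first_turn(all_tools: &[String], category_meta_tools: &[String]) -> Vec<String> {
    const TWO_PASS_THRESHOLD: usize = 30;
    if all_tools.len() > TWO_PASS_THRESHOLD {
        // Pass 1: ~15 category meta-tools (~1,500 tokens) instead of the
        // full registry (~8,670 tokens); later turns use chosen categories.
        category_meta_tools.to_vec()
    } else {
        // Small curated registries (e.g. ~20 tools) stay in flat mode.
        all_tools.to_vec()
    }
}
```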
28 changes: 28 additions & 0 deletions examples/localcowork/src-tauri/src/inference/client.rs
@@ -540,6 +540,24 @@ mod tests {
role: None,
},
);
models.insert(
"lmstudio-model".to_string(),
ModelConfig {
display_name: "LM Studio Model".to_string(),
runtime: "lmstudio".to_string(),
model_name: Some("lmstudio/default".to_string()),
model_path: None,
base_url: "http://localhost:1234/v1".to_string(),
context_window: 32768,
tool_call_format: ToolCallFormat::NativeJson,
temperature: 0.7,
max_tokens: 4096,
estimated_vram_gb: Some(8.0),
capabilities: vec!["text".to_string(), "tool_calling".to_string()],
force_json_response: false,
role: None,
},
);

ModelsConfig {
active_model: "model-a".to_string(),
@@ -579,6 +597,16 @@ mod tests {
assert!(result.is_err());
}

#[test]
fn test_lmstudio_model_config() {
let config = test_config();
// Create client targeting LM Studio model directly
let client = InferenceClient::from_config_with_model(config, "lmstudio-model").unwrap();
assert_eq!(client.current_model_key, "lmstudio-model");
assert_eq!(client.current_model_name(), "LM Studio Model");
assert_eq!(client.current_base_url(), "http://localhost:1234/v1");
}

#[test]
fn test_remaining_fallbacks() {
let client = InferenceClient::from_config(test_config()).unwrap();
65 changes: 63 additions & 2 deletions examples/localcowork/src-tauri/src/inference/config.rs
@@ -193,7 +193,9 @@ pub fn load_models_config(path: &Path) -> Result<ModelsConfig, InferenceError> {
/// Returns `(model_key, ModelConfig)` for the first available model.
/// "Available" here means it exists in the config — actual connectivity is
/// checked at runtime by the client.
-pub fn resolve_active_model(config: &ModelsConfig) -> Result<(String, ModelConfig), InferenceError> {
+pub fn resolve_active_model(
+    config: &ModelsConfig,
+) -> Result<(String, ModelConfig), InferenceError> {
// Try the explicitly active model first
if let Some(model) = config.models.get(&config.active_model) {
return Ok((config.active_model.clone(), model.clone()));
@@ -338,6 +340,65 @@ mod tests {
"#;
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
let model = config.models.get("test").unwrap();
-assert!(!model.force_json_response, "force_json_response should default to false");
+assert!(
+    !model.force_json_response,
+    "force_json_response should default to false"
+);
}

#[test]
fn test_lmstudio_model_config() {
let yaml = r#"
active_model: lmstudio-model
models:
lmstudio-model:
display_name: "LM Studio Model"
runtime: lmstudio
model_name: "lmstudio/default"
base_url: "http://localhost:1234/v1"
context_window: 32768
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
capabilities:
- text
- tool_calling
"#;
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
let model = config.models.get("lmstudio-model").unwrap();
assert_eq!(model.runtime, "lmstudio");
assert_eq!(model.base_url, "http://localhost:1234/v1");
assert_eq!(model.model_name.as_deref(), Some("lmstudio/default"));
assert_eq!(model.tool_call_format, ToolCallFormat::NativeJson);
}

#[test]
fn test_lmstudio_runtime_config() {
// Test that ModelsConfig can parse YAML with unknown runtimes field
// (the runtimes section is ignored but should not cause deserialization errors)
let yaml = r#"
active_model: test
runtimes:
lmstudio:
health_check: "http://localhost:1234/v1/models"
startup_timeout_seconds: 30
ollama:
command: "ollama serve"
health_check: "http://localhost:11434/api/tags"
models:
test:
display_name: "Test"
runtime: lmstudio
base_url: "http://localhost:1234/v1"
context_window: 4096
tool_call_format: native_json
temperature: 0.7
max_tokens: 1024
"#;
// Verify that unknown runtimes field is ignored and parsing succeeds
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
assert_eq!(config.active_model, "test");
assert!(config.models.contains_key("test"));
assert_eq!(config.models.get("test").unwrap().runtime, "lmstudio");
}
}