74 changes: 51 additions & 23 deletions examples/localcowork/_models/config.yaml
@@ -10,7 +10,7 @@
# Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
# The config-loader resolves environment variables at load time.

active_model: lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy

# Default model directory for non-Ollama model files (GGUF, MLX, etc.)
models_dir: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}"
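The `${VAR:-default}` form mirrors shell parameter expansion: use the environment variable if it is set, otherwise fall back to the default. A minimal sketch of resolving one such placeholder (a hypothetical helper, separate from the app's config-loader):

```rust
use std::env;

// Sketch of `${VAR:-default}` expansion as the header describes; handles a
// single placeholder per value, which is all this config uses. Illustrative
// only, not the app's loader.
fn interpolate(raw: &str) -> String {
    let (Some(start), Some(end)) = (raw.find("${"), raw.find('}')) else {
        return raw.to_string();
    };
    let inner = &raw[start + 2..end]; // e.g. "LOCALCOWORK_MODELS_DIR:-~/Projects/_models"
    let (var, default) = inner.split_once(":-").unwrap_or((inner, ""));
    let value = env::var(var).unwrap_or_else(|_| default.to_string());
    format!("{}{}{}", &raw[..start], value, &raw[end + 1..])
}
```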
@@ -76,11 +76,11 @@ models:
model_name: "gpt-oss:20b"
base_url: "http://localhost:11434/v1"
context_window: 32768
tool_call_format: native_json # Native function calling + structured outputs
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 14
force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement
capabilities:
- text
- tool_calling
@@ -92,7 +92,7 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf"
base_url: "http://localhost:8080/v1"
context_window: 32768
tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 14
@@ -102,20 +102,20 @@ models:

# LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
# Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
-# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
+# Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access)
# Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
# Run: llama-server --model <path> --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
lfm2-24b-a2b:
-display_name: "LFM2-24B-A2B-Preview"
+display_name: "LFM2-24B-A2B"
runtime: llama_cpp
-model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
base_url: "http://localhost:8080/v1"
context_window: 32768
tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
temperature: 0.7
tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3)
max_tokens: 4096
estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE
capabilities:
- text
- tool_calling
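The bracket format above is easy to recognize mechanically. A regex sketch of the idea, assuming the `regex` crate (the app's real parser is tool_call_parser.rs; this version is only illustrative):

```rust
use regex::Regex;

// Illustrative parse of the LFM2 bracket format, e.g.
// `[fs.read_file(path="notes.md")]` -> ("fs.read_file", `path="notes.md"`).
// The app's real parser is tool_call_parser.rs; this is only a sketch.
fn parse_bracket_call(text: &str) -> Option<(String, String)> {
    // Capture 1: dotted `server.tool` name; capture 2: raw argument string.
    let re = Regex::new(r"\[([A-Za-z_][\w.]*)\((.*?)\)\]").ok()?;
    let caps = re.captures(text)?;
    Some((caps[1].to_string(), caps[2].to_string()))
}
```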
@@ -130,7 +130,7 @@ models:
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: 4 # Only ~3B active params
capabilities:
- text
- tool_calling
@@ -152,7 +152,7 @@ models:
tool_call_format: native_json
temperature: 0.1
max_tokens: 4096
estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB)
capabilities:
- text
- vision
@@ -170,10 +170,10 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf"
base_url: "http://localhost:8082/v1"
context_window: 32768
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
temperature: 0.1
max_tokens: 512
estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB)
role: tool_router
fine_tuned:
method: lora
@@ -198,7 +198,7 @@ models:
model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf"
base_url: "http://localhost:8084/v1"
context_window: 32768
tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)]
temperature: 0.1
max_tokens: 512
estimated_vram_gb: 2.3
@@ -238,6 +238,24 @@ models:
- text
- tool_calling

# LM Studio headless server — any model loaded in LM Studio
# Run `lms server start` or enable "Run LLM server on login" in app settings.
# Default port is 1234. Uses OpenAI-compatible API.
# Note: The model_name here is informational; update it to match your loaded model.
lmstudio-default:
display_name: "LM Studio (Default)"
runtime: lmstudio
model_name: "liquid/lfm2-24b-a2b" # Replace with your loaded model ID
base_url: "http://localhost:1234/v1"
context_window: 32768
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
estimated_vram_gb: null # Varies by loaded model
capabilities:
- text
- tool_calling
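To find the ID to put in `model_name`, you can ask the server what it has loaded. A minimal sketch, assuming the `reqwest` crate (with the `blocking` and `json` features) and `serde_json`; this helper is illustrative and not part of the app:

```rust
// Sketch: list the models a running LM Studio server exposes, so the
// `model_name` above can be set to a real loaded-model ID.
// Assumes the `reqwest` (blocking + json features) and `serde_json` crates.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let body: serde_json::Value =
        reqwest::blocking::get("http://localhost:1234/v1/models")?.json()?;
    // OpenAI-compatible shape: {"object": "list", "data": [{"id": "..."}]}
    for model in body["data"].as_array().into_iter().flatten() {
        println!("{}", model["id"].as_str().unwrap_or("?"));
    }
    Ok(())
}
```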

# ─── Benchmark comparison models ────────────────────────────────────────
# These models are benchmarked against LFM2-24B-A2B to demonstrate
# scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE.
@@ -290,7 +308,7 @@ models:
tool_temperature: 0.1
max_tokens: 4096
estimated_vram_gb: 20
deprecated: true # Partial run only (40/100), dropped from active benchmarks
capabilities:
- text
- tool_calling
@@ -358,7 +376,9 @@ models:
- text
- tool_calling

-# Runtime configurations
+# Runtime configurations (informational only — not used by the app)
+# These describe how to start each runtime for reference. The app
+# expects the runtime to already be running when it starts.
runtimes:
ollama:
command: "ollama serve"
@@ -371,6 +391,14 @@ runtimes:
health_check: "http://localhost:8080/health"
startup_timeout_seconds: 60

lmstudio:
# Use `lms server start` CLI to start headless, or enable "Run LLM server
# on login" in app settings (Cmd/Ctrl+,). Default port is 1234.
command: "lms"
args: ["server", "start"]
health_check: "http://localhost:1234/v1/models"
startup_timeout_seconds: 30
Comment on lines +394 to +400

Copilot AI (Mar 6, 2026):

The new runtimes.lmstudio section is currently ignored by the Rust loader because ModelsConfig has no runtimes field and Serde will drop unknown keys. If this is meant to drive runtime startup/health-check behavior, it should be added to ModelsConfig (and used) or the YAML should explicitly document that the runtimes map is informational-only so readers don’t assume it affects behavior.
Author:

@copilot open a new pull request to apply changes based on this feedback
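For reference, a minimal sketch of the change the comment asks for: give `ModelsConfig` an optional `runtimes` map so the section is actually deserialized. The `RuntimeConfig` shape here is an assumption read off the YAML above, not the crate's real type:

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Hypothetical shape inferred from the YAML `runtimes:` entries above.
#[derive(Debug, Deserialize)]
pub struct RuntimeConfig {
    pub command: Option<String>,
    #[serde(default)]
    pub args: Vec<String>,
    pub health_check: Option<String>,
    pub startup_timeout_seconds: Option<u64>,
}

#[derive(Debug, Deserialize)]
pub struct ModelsConfig {
    pub active_model: String,
    // `#[serde(default)]` keeps older configs without a `runtimes:` key valid.
    #[serde(default)]
    pub runtimes: HashMap<String, RuntimeConfig>,
    // ... existing fields (models, fallback_chain, orchestrator) unchanged
}
```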


mlx:
command: "mlx_lm.server"
args: ["--model", "{model_path}", "--port", "8080"]
@@ -380,9 +408,9 @@ runtimes:

# Fallback chain — used when the active model is unavailable
fallback_chain:
- lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion
- qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE
- static_response # Fallback 2 — hardcoded "model unavailable" message
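A sketch of how walking such a chain can look; this is a hypothetical helper, generic over the model type, not the app's actual resolution code:

```rust
use std::collections::HashMap;

// Hypothetical helper: try each key in the fallback chain in order and
// return the first one that exists in the config map. The `static_response`
// sentinel would be handled by the caller when this returns None.
fn resolve_with_fallback<'a, M>(
    models: &'a HashMap<String, M>,
    chain: &[&str],
) -> Option<(&'a str, &'a M)> {
    chain
        .iter()
        .find_map(|key| models.get_key_value(*key).map(|(k, m)| (k.as_str(), m)))
}
```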

# Dual-model orchestrator (ADR-009)
# When enabled, the planner model decomposes multi-step workflows and
@@ -402,9 +430,9 @@ fallback_chain:
# to skip the orchestrator entirely and avoid the ~2-3s wasted planner call.
# See ADR-009 for full details.
orchestrator:
enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools.
planner_model: lfm2-24b-a2b
router_model: lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
router_top_k: 15
max_plan_steps: 10
step_retries: 3
@@ -414,4 +442,4 @@ orchestrator:
# ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
# The model selects 2-3 categories, then subsequent turns use only those tools.
# Saves ~7,170 tokens per turn and eliminates cross-server confusion.
two_pass_tool_selection: true # Active only when >30 tools registered; 21 curated tools use flat mode
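The two-pass behavior reduces to a simple size gate. A sketch under assumed names (the threshold mirrors the comments above; the function and types are not the app's actual API):

```rust
// Illustrative gate for the two-pass selection described above; names and
// the 30-tool threshold mirror the YAML comments, not the app's real code.
fn tools_for_first_turn(all_tools: &[String], category_meta_tools: &[String]) -> Vec<String> {
    const TWO_PASS_THRESHOLD: usize = 30;
    if all_tools.len() > TWO_PASS_THRESHOLD {
        // Pass 1: ~15 category meta-tools (~1,500 tokens) instead of the
        // full registry (~8,670 tokens); later turns use chosen categories.
        category_meta_tools.to_vec()
    } else {
        // Small curated registries (e.g. ~20 tools) stay in flat mode.
        all_tools.to_vec()
    }
}
```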
28 changes: 28 additions & 0 deletions examples/localcowork/src-tauri/src/inference/client.rs
@@ -540,6 +540,24 @@ mod tests {
role: None,
},
);
models.insert(
"lmstudio-model".to_string(),
ModelConfig {
display_name: "LM Studio Model".to_string(),
runtime: "lmstudio".to_string(),
model_name: Some("lmstudio/default".to_string()),
model_path: None,
base_url: "http://localhost:1234/v1".to_string(),
context_window: 32768,
tool_call_format: ToolCallFormat::NativeJson,
temperature: 0.7,
max_tokens: 4096,
estimated_vram_gb: Some(8.0),
capabilities: vec!["text".to_string(), "tool_calling".to_string()],
force_json_response: false,
role: None,
},
);

ModelsConfig {
active_model: "model-a".to_string(),
@@ -579,6 +597,16 @@ mod tests {
assert!(result.is_err());
}

#[test]
fn test_lmstudio_model_config() {
let config = test_config();
// Create client targeting LM Studio model directly
let client = InferenceClient::from_config_with_model(config, "lmstudio-model").unwrap();
assert_eq!(client.current_model_key, "lmstudio-model");
assert_eq!(client.current_model_name(), "LM Studio Model");
assert_eq!(client.current_base_url(), "http://localhost:1234/v1");
}

#[test]
fn test_remaining_fallbacks() {
let client = InferenceClient::from_config(test_config()).unwrap();
65 changes: 63 additions & 2 deletions examples/localcowork/src-tauri/src/inference/config.rs
@@ -193,7 +193,9 @@ pub fn load_models_config(path: &Path) -> Result<ModelsConfig, InferenceError> {
/// Returns `(model_key, ModelConfig)` for the first available model.
/// "Available" here means it exists in the config — actual connectivity is
/// checked at runtime by the client.
-pub fn resolve_active_model(config: &ModelsConfig) -> Result<(String, ModelConfig), InferenceError> {
+pub fn resolve_active_model(
+    config: &ModelsConfig,
+) -> Result<(String, ModelConfig), InferenceError> {
// Try the explicitly active model first
if let Some(model) = config.models.get(&config.active_model) {
return Ok((config.active_model.clone(), model.clone()));
@@ -338,6 +340,65 @@ mod tests {
"#;
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
let model = config.models.get("test").unwrap();
-assert!(!model.force_json_response, "force_json_response should default to false");
+assert!(
+    !model.force_json_response,
+    "force_json_response should default to false"
+);
}

#[test]
fn test_lmstudio_model_config() {
let yaml = r#"
active_model: lmstudio-model
models:
lmstudio-model:
display_name: "LM Studio Model"
runtime: lmstudio
model_name: "lmstudio/default"
base_url: "http://localhost:1234/v1"
context_window: 32768
tool_call_format: native_json
temperature: 0.7
max_tokens: 4096
capabilities:
- text
- tool_calling
"#;
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
let model = config.models.get("lmstudio-model").unwrap();
assert_eq!(model.runtime, "lmstudio");
assert_eq!(model.base_url, "http://localhost:1234/v1");
assert_eq!(model.model_name.as_deref(), Some("lmstudio/default"));
assert_eq!(model.tool_call_format, ToolCallFormat::NativeJson);
}

#[test]
fn test_lmstudio_runtime_config() {
// Test that ModelsConfig can parse YAML with unknown runtimes field
// (the runtimes section is ignored but should not cause deserialization errors)
let yaml = r#"
active_model: test
runtimes:
lmstudio:
health_check: "http://localhost:1234/v1/models"
startup_timeout_seconds: 30
ollama:
command: "ollama serve"
health_check: "http://localhost:11434/api/tags"
models:
test:
display_name: "Test"
runtime: lmstudio
base_url: "http://localhost:1234/v1"
context_window: 4096
tool_call_format: native_json
temperature: 0.7
max_tokens: 1024
"#;
// Verify that unknown runtimes field is ignored and parsing succeeds
let config: ModelsConfig = serde_yaml::from_str(yaml).unwrap();
assert_eq!(config.active_model, "test");
assert!(config.models.contains_key("test"));
assert_eq!(config.models.get("test").unwrap().runtime, "lmstudio");
}
}