diff --git a/examples/localcowork/README.md b/examples/localcowork/README.md
index a54467f..a23fce9 100644
--- a/examples/localcowork/README.md
+++ b/examples/localcowork/README.md
@@ -4,7 +4,7 @@

 **Tool-calling that actually feels instant on a laptop.**

-Building a local AI agent sounds great until you try to use one all day. The hard part isn't getting a model to understand you -- it's getting it to choose the right tool and do it fast enough that the experience feels interactive. This is where [LFM2-24B-A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview) shines: it's designed for tool dispatch on consumer hardware, where latency and memory aren't abstract constraints -- they decide whether your agent is a product or a demo.
+Building a local AI agent sounds great until you try to use one all day. The hard part isn't getting a model to understand you -- it's getting it to choose the right tool and do it fast enough that the experience feels interactive. This is where [LFM2-24B-A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF) shines: it's designed for tool dispatch on consumer hardware, where latency and memory aren't abstract constraints -- they decide whether your agent is a product or a demo.

 LocalCowork is a desktop AI agent that runs entirely on-device. No cloud APIs, no data leaving your machine. The model calls pre-built tools via the [Model Context Protocol](https://modelcontextprotocol.io/) (MCP), and every tool execution is logged to a local audit trail.

@@ -120,13 +120,13 @@ Full study with 8 models, 150+ scenarios, and 12 failure modes: [`docs/model-ana
 git clone && cd localCoWork
 ./scripts/setup-dev.sh

-# 2. Download LFM2-24B-A2B (~14 GB, requires HuggingFace access)
-# Request access: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview
+# 2. Download LFM2-24B-A2B (~14 GB)
+# https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF
 pip install huggingface-hub
 python3 -c "
 from huggingface_hub import hf_hub_download
-hf_hub_download('LiquidAI/LFM2-24B-A2B-Preview',
-                'LFM2-24B-A2B-Preview-Q4_K_M.gguf',
+hf_hub_download('LiquidAI/LFM2-24B-A2B-GGUF',
+                'LFM2-24B-A2B-Q4_K_M.gguf',
                 local_dir='$HOME/Projects/_models/')
 "

diff --git a/examples/localcowork/_models/config.yaml b/examples/localcowork/_models/config.yaml
index d61d1fb..2e0c687 100644
--- a/examples/localcowork/_models/config.yaml
+++ b/examples/localcowork/_models/config.yaml
@@ -100,15 +100,15 @@ models:
       - text
       - tool_calling

-  # LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview)
+  # LFM2-24B-A2B — Liquid AI's MoE hybrid model
   # Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio)
-  # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access)
+  # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF
   # Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md
   # Run: llama-server --model --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn
   lfm2-24b-a2b:
-    display_name: "LFM2-24B-A2B-Preview"
+    display_name: "LFM2-24B-A2B-GGUF"
     runtime: llama_cpp
-    model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+    model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf"
     base_url: "http://localhost:8080/v1"
     context_window: 32768
     tool_call_format: bracket  # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs
diff --git a/examples/localcowork/docs/model-analysis/README.md b/examples/localcowork/docs/model-analysis/README.md
index ad30352..cc40d2b 100644
--- a/examples/localcowork/docs/model-analysis/README.md
+++ b/examples/localcowork/docs/model-analysis/README.md
@@ -371,7 +371,7 @@ For the broader community building local AI agents: the era of "throw all tools

 ```bash
 # LFM2-24B-A2B via llama-server (default port 8080)
-llama-server -m _models/LFM2-24B-A2B-Preview-Q4_K_M.gguf --port 8080
+llama-server -m _models/LFM2-24B-A2B-Q4_K_M.gguf --port 8080

 # Any Ollama model (default port 11434)
 ollama run llama3.2
diff --git a/examples/localcowork/docs/model-analysis/dual-model-orchestrator-performance.md b/examples/localcowork/docs/model-analysis/dual-model-orchestrator-performance.md
index 9504830..7854169 100644
--- a/examples/localcowork/docs/model-analysis/dual-model-orchestrator-performance.md
+++ b/examples/localcowork/docs/model-analysis/dual-model-orchestrator-performance.md
@@ -441,7 +441,7 @@ The single-model agent loop exhibits five pathological behaviors, all eliminated
 ```bash
 # Only the 24B model is needed
 llama-server \
-  --model _models/LFM2-24B-A2B-Preview-Q4_K_M.gguf \
+  --model _models/LFM2-24B-A2B-Q4_K_M.gguf \
   --port 8080 \
   --ctx-size 32768 \
   --n-gpu-layers 99 \
@@ -452,7 +452,7 @@ llama-server \
 ```bash
 # Terminal 1: 24B planner/synthesizer
 llama-server \
-  --model _models/LFM2-24B-A2B-Preview-Q4_K_M.gguf \
+  --model _models/LFM2-24B-A2B-Q4_K_M.gguf \
   --port 8080 \
   --ctx-size 32768 \
   --n-gpu-layers 99 \
@@ -470,7 +470,7 @@ llama-server \
 ### Readiness Checklists

 **For Single-Model Mode (Flow A):**
-- [ ] `_models/LFM2-24B-A2B-Preview-Q4_K_M.gguf` exists
+- [ ] `_models/LFM2-24B-A2B-Q4_K_M.gguf` exists
 - [ ] llama-server running on port 8080
 - [ ] `active_model: lfm2-24b-a2b` in config.yaml
 - [ ] `orchestrator.enabled: false` in config.yaml
diff --git a/examples/localcowork/docs/model-analysis/lfm2-24b-a2b-benchmark.md b/examples/localcowork/docs/model-analysis/lfm2-24b-a2b-benchmark.md
index ec93adf..aa3ef41 100644
--- a/examples/localcowork/docs/model-analysis/lfm2-24b-a2b-benchmark.md
+++ b/examples/localcowork/docs/model-analysis/lfm2-24b-a2b-benchmark.md
@@ -1,4 +1,4 @@
-# LFM2-24B-A2B-Preview — Benchmark & Execution Results
+# LFM2-24B-A2B-GGUF — Benchmark & Execution Results

 **Status:** Production model (planner + synthesizer in dual-model orchestrator)
 **Date:** 2026-02-18 (updated with real-world execution traces, orchestrator A/B results)
@@ -17,7 +17,7 @@

 | Property | Value |
 | ---------------- | ------------------------------------------------------------------ |
-| Model | LFM2-24B-A2B-Preview |
+| Model | LFM2-24B-A2B-GGUF |
 | Architecture | Sparse MoE: gated short convolution + grouped query attention (GQA) |
 | Total params | 24B |
 | Active per token | 2.3B |
@@ -37,7 +37,7 @@
 | Decode speed | ~121 tokens/sec (Apple Silicon, Metal) |
 | GPU throughput | ~26.8K tok/s @ 1024 concurrent (H100 SXM5, vLLM) |
 | Tool call format | LFM bracket syntax (`<\|tool_call_start\|>...<\|tool_call_end\|>`) |
-| Source | https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated) |
+| Source | https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF |
 | Blog | https://www.liquid.ai/blog/lfm2-24b-a2b |

 ### Why this model
diff --git a/examples/localcowork/docs/model-analysis/ollama-demo-prompts.md b/examples/localcowork/docs/model-analysis/ollama-demo-prompts.md
index a112530..760dd60 100644
--- a/examples/localcowork/docs/model-analysis/ollama-demo-prompts.md
+++ b/examples/localcowork/docs/model-analysis/ollama-demo-prompts.md
@@ -2,7 +2,7 @@

 Curated prompts from our benchmark suite (100 single-step + 50 multi-step). Only prompts with **85%+ accuracy** on greedy sampling are included.

-Model: `LiquidAI/LFM2-24B-A2B-Preview` (Q4_K_M GGUF)
+Model: `LiquidAI/LFM2-24B-A2B-GGUF` (Q4_K_M GGUF)

 ---
diff --git a/examples/localcowork/docs/model-analysis/tool-calling-benchmark-results.md b/examples/localcowork/docs/model-analysis/tool-calling-benchmark-results.md
index 82d2351..5d5f1b7 100644
--- a/examples/localcowork/docs/model-analysis/tool-calling-benchmark-results.md
+++ b/examples/localcowork/docs/model-analysis/tool-calling-benchmark-results.md
@@ -33,7 +33,7 @@ The comparison is relevant for anyone deploying local AI agents on consumer hard

 | Model | HuggingFace ID | Ollama Tag | Runtime | Quantization |
 |-------|---------------|-----------|---------|-------------|
-| LFM2-24B-A2B | `LiquidAI/LFM2-24B-A2B-Preview` | N/A | llama-server | Q4_K_M (GGUF) |
+| LFM2-24B-A2B | `LiquidAI/LFM2-24B-A2B-GGUF` | N/A | llama-server | Q4_K_M (GGUF) |
 | Mistral-Small-24B | `mistralai/Mistral-Small-24B-Instruct-2501` | `mistral-small:24b` | Ollama | Q4_K_M |
 | Gemma 3 27B | `google/gemma-3-27b-it` | `gemma3:27b` | Ollama | Q4_K_M |
 | GPT-OSS-20B | `openai/gpt-oss-20b` | `gpt-oss:20b` | Ollama | MXFP4 (native, ~4.25 bits/param) |
diff --git a/examples/localcowork/docs/patterns/prompt-engineering-small-llms.md b/examples/localcowork/docs/patterns/prompt-engineering-small-llms.md
index 5357933..d3970ec 100644
--- a/examples/localcowork/docs/patterns/prompt-engineering-small-llms.md
+++ b/examples/localcowork/docs/patterns/prompt-engineering-small-llms.md
@@ -4,7 +4,7 @@
 > for tool-calling accuracy in LocalCowork.

 **Status:** Validated in production (March 2026)
-**Model:** LFM2-24B-A2B-Preview, Q4_K_M quantization, llama.cpp runtime
+**Model:** LFM2-24B-A2B-GGUF, Q4_K_M quantization, llama.cpp runtime
 **Implementation:** `src-tauri/src/commands/chat.rs` (`build_system_prompt`, `send_message`)

 **Key model trait:** LFM2-24B-A2B uses lightweight instruct post-training with
diff --git a/examples/localcowork/scripts/benchmark-lfm2-24b.sh b/examples/localcowork/scripts/benchmark-lfm2-24b.sh
index d001bd2..14408c1 100755
--- a/examples/localcowork/scripts/benchmark-lfm2-24b.sh
+++ b/examples/localcowork/scripts/benchmark-lfm2-24b.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # benchmark-lfm2-24b.sh — Benchmark LFM2-24B-A2B on both architectures
 #
-# Runs the full benchmark suite against LFM2-24B-A2B-Preview:
+# Runs the full benchmark suite against LFM2-24B-A2B-GGUF:
 #   Phase 1: Single-model agent loop (main branch architecture)
 #   Phase 2: Dual-model orchestrator (feat/dual-model-orchestrator architecture)
 #   Phase 3: Comparison report generation
@@ -13,7 +13,7 @@
 #   - npm install completed in project root
 #
 # Usage:
-#   ./scripts/benchmark-lfm2-24b.sh --path ~/Projects/_models/LFM2-24B-A2B-Preview-Q4_K_M.gguf
+#   ./scripts/benchmark-lfm2-24b.sh --path ~/Projects/_models/LFM2-24B-A2B-Q4_K_M.gguf
 #   ./scripts/benchmark-lfm2-24b.sh --path --phase single        # Phase 1 only
 #   ./scripts/benchmark-lfm2-24b.sh --path --phase orchestrator  # Phase 2 only
 #   ./scripts/benchmark-lfm2-24b.sh --phase report               # Phase 3 only (uses existing results)
diff --git a/examples/localcowork/scripts/setup-dev.sh b/examples/localcowork/scripts/setup-dev.sh
index d910f8a..9af237c 100755
--- a/examples/localcowork/scripts/setup-dev.sh
+++ b/examples/localcowork/scripts/setup-dev.sh
@@ -187,20 +187,20 @@ echo "   Models directory: $MODELS_DIR"
 echo ""

 # Primary model: LFM2-24B-A2B (production, 80% tool-calling accuracy)
-MAIN_MODEL="LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+MAIN_MODEL="LFM2-24B-A2B-Q4_K_M.gguf"
 if [ -f "$MODELS_DIR/$MAIN_MODEL" ]; then
     MAIN_SIZE=$(du -h "$MODELS_DIR/$MAIN_MODEL" | cut -f1)
     echo "✅ LFM2-24B-A2B found ($MAIN_SIZE)"
 else
     echo "❌ LFM2-24B-A2B not found — this is the primary production model"
     echo ""
-    echo "   Download from HuggingFace (gated — request access first):"
-    echo "   https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview"
+    echo "   Download from HuggingFace:"
+    echo "   https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF"
     echo ""
     echo "   pip install huggingface-hub"
     echo "   python3 -c \""
     echo "   from huggingface_hub import hf_hub_download"
-    echo "   hf_hub_download('LiquidAI/LFM2-24B-A2B-Preview',"
+    echo "   hf_hub_download('LiquidAI/LFM2-24B-A2B-GGUF',"
     echo "       '$MAIN_MODEL',"
     echo "       local_dir='$MODELS_DIR')"
     echo "   \""
diff --git a/examples/localcowork/scripts/start-model.sh b/examples/localcowork/scripts/start-model.sh
index f282661..f15b836 100755
--- a/examples/localcowork/scripts/start-model.sh
+++ b/examples/localcowork/scripts/start-model.sh
@@ -15,7 +15,7 @@ set -euo pipefail
 MODELS_DIR="${LOCALCOWORK_MODELS_DIR:-$HOME/Projects/_models}"

 # Main model (LFM2-24B-A2B)
-MAIN_MODEL="LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+MAIN_MODEL="LFM2-24B-A2B-Q4_K_M.gguf"
 MAIN_PORT=8080
 MAIN_CTX=32768

@@ -84,14 +84,14 @@ if [ -f "$MAIN_PATH" ]; then
 else
     echo "❌ Main model not found: $MAIN_PATH"
     echo ""
-    echo "   Download LFM2-24B-A2B from HuggingFace (gated — request access first):"
-    echo "   https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview"
+    echo "   Download LFM2-24B-A2B from HuggingFace:"
+    echo "   https://huggingface.co/LiquidAI/LFM2-24B-A2B-GGUF"
     echo ""
     echo "   pip install huggingface-hub"
     echo "   python3 -c \""
     echo "   from huggingface_hub import hf_hub_download"
-    echo "   hf_hub_download('LiquidAI/LFM2-24B-A2B-Preview',"
-    echo "       'LFM2-24B-A2B-Preview-Q4_K_M.gguf',"
+    echo "   hf_hub_download('LiquidAI/LFM2-24B-A2B-GGUF',"
+    echo "       'LFM2-24B-A2B-Q4_K_M.gguf',"
     echo "       local_dir='$MODELS_DIR')"
     echo "   \""
     if [ "$CHECK_ONLY" = true ]; then