diff --git a/examples/localcowork/.env.example b/examples/localcowork/.env.example
index 16e64bd..ed5eea9 100644
--- a/examples/localcowork/.env.example
+++ b/examples/localcowork/.env.example
@@ -6,6 +6,9 @@
 
 # ─── Model Configuration ─────────────────────────────────────────────────────
 # Directory containing GGUF model files (downloaded from HuggingFace)
+# IMPORTANT: Store models OUTSIDE the project repo (e.g., ~/Projects/_models).
+# The huggingface_hub library creates a .cache/huggingface folder for tracking
+# downloads, which should not be in the project directory.
 # LOCALCOWORK_MODELS_DIR=~/Projects/_models
 
 # Text model API endpoint (OpenAI-compatible). Set by start-model.sh.
diff --git a/examples/localcowork/.gitignore b/examples/localcowork/.gitignore
index 4f4a5d5..281c756 100644
--- a/examples/localcowork/.gitignore
+++ b/examples/localcowork/.gitignore
@@ -17,6 +17,7 @@ src-tauri/gen/
 _models/*.gguf
 _models/*.bin
 _models/*.safetensors
+_models/.cache/
 
 # ─── IDE ──────────────────────────────────────────────────────────────────
 .vscode/
diff --git a/examples/localcowork/_models/config.yaml b/examples/localcowork/_models/config.yaml
index d61d1fb..f827752 100644
--- a/examples/localcowork/_models/config.yaml
+++ b/examples/localcowork/_models/config.yaml
@@ -7,10 +7,20 @@
 # Set LOCALCOWORK_MODELS_DIR env var to override.
 # Ollama-managed models use Ollama's own storage (~/.ollama/models/).
 #
+# IMPORTANT: When downloading models via huggingface_hub, download into your
+# home-directory models folder (e.g., ~/Projects/_models) rather than into this
+# directory. The huggingface_hub library creates a .cache/huggingface folder for
+# tracking downloads, which can grow large and should not be in the project repo.
+#
+# Example of downloading into your models directory:
+#   import os; from huggingface_hub import hf_hub_download
+#   hf_hub_download('LiquidAI/LFM2-24B-A2B', 'LFM2-24B-A2B-Q4_K_M.gguf',
+#                   local_dir=os.path.expanduser('~/Projects/_models'))
+#
 # Model paths below use ${LOCALCOWORK_MODELS_DIR} for interpolation.
 # The config-loader resolves environment variables at load time.
 
-active_model: lfm2-24b-a2b # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
+active_model: lfm2-24b-a2b  # Sparse MoE: 24B total, 2.3B active, 64 experts top-4 — 80% tool accuracy
 
 # Default model directory for non-Ollama model files (GGUF, MLX, etc.)
models_dir: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}" @@ -76,11 +86,11 @@ models: model_name: "gpt-oss:20b" base_url: "http://localhost:11434/v1" context_window: 32768 - tool_call_format: native_json # Native function calling + structured outputs + tool_call_format: native_json # Native function calling + structured outputs temperature: 0.7 max_tokens: 4096 estimated_vram_gb: 14 - force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement + force_json_response: false # Enable after live testing — triggers GBNF grammar enforcement capabilities: - text - tool_calling @@ -92,7 +102,7 @@ models: model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/lfm25-24b-q4_k_m.gguf" base_url: "http://localhost:8080/v1" context_window: 32768 - tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON + tool_call_format: pythonic # LFM2.5 uses Pythonic calls; normalizer converts to JSON temperature: 0.7 max_tokens: 4096 estimated_vram_gb: 14 @@ -102,20 +112,20 @@ models: # LFM2-24B-A2B — Liquid AI's MoE hybrid model (private preview) # Architecture: 24B total, 2.3B active per token, 64 experts top-4, 40 layers (1:3 attn:conv ratio) - # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview (gated — request access) + # Download GGUF from: https://huggingface.co/LiquidAI/LFM2-24B-A2B (gated — request access) # Benchmark plan: docs/model-analysis/lfm2-24b-a2b-benchmark.md # Run: llama-server --model --port 8080 --ctx-size 32768 --n-gpu-layers 99 --flash-attn lfm2-24b-a2b: - display_name: "LFM2-24B-A2B-Preview" + display_name: "LFM2-24B-A2B" runtime: llama_cpp - model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Preview-Q4_K_M.gguf" + model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2-24B-A2B-Q4_K_M.gguf" base_url: "http://localhost:8080/v1" context_window: 32768 - tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs + tool_call_format: bracket # LFM2 bracket format: [server.tool(args)] parsed by tool_call_parser.rs temperature: 0.7 - tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3) + tool_temperature: 0.1 # Lower temperature for tool-calling turns (ADR-008 Layer 3) max_tokens: 4096 - estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE + estimated_vram_gb: 16 # Q4_K_M quantization estimate for 24B MoE capabilities: - text - tool_calling @@ -130,7 +140,7 @@ models: tool_call_format: native_json temperature: 0.7 max_tokens: 4096 - estimated_vram_gb: 4 # Only ~3B active params + estimated_vram_gb: 4 # Only ~3B active params capabilities: - text - tool_calling @@ -152,7 +162,7 @@ models: tool_call_format: native_json temperature: 0.1 max_tokens: 4096 - estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB) + estimated_vram_gb: 1.8 # Q8_0 model (1.25 GB) + mmproj (583 MB) capabilities: - text - vision @@ -170,10 +180,10 @@ models: model_path: "${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Router-FT-v2-Q8_0.gguf" base_url: "http://localhost:8082/v1" context_window: 32768 - tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)] + tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)] temperature: 0.1 max_tokens: 512 - estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB) + estimated_vram_gb: 1.5 # Q8_0 quantization (1.2 GB) role: tool_router fine_tuned: method: lora @@ -198,7 +208,7 @@ models: model_path: 
"${LOCALCOWORK_MODELS_DIR:-~/Projects/_models}/LFM2.5-1.2B-Instruct-F16.gguf" base_url: "http://localhost:8084/v1" context_window: 32768 - tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)] + tool_call_format: bracket # LFM2.5 bracket format: [server.tool(args)] temperature: 0.1 max_tokens: 512 estimated_vram_gb: 2.3 @@ -238,6 +248,24 @@ models: - text - tool_calling + # LM Studio headless server — any model loaded in LM Studio + # Run `lms server start` or enable "Run LLM server on login" in app settings. + # Default port is 1234. Uses OpenAI-compatible API. + # Note: The model_name here is informational - update to match your loaded model. + lmstudio-default: + display_name: "LM Studio (Default)" + runtime: lmstudio + model_name: "liquid/lfm2-24b-a2b" # Replace with your loaded model ID + base_url: "http://localhost:1234/v1" + context_window: 32768 + tool_call_format: native_json + temperature: 0.7 + max_tokens: 4096 + estimated_vram_gb: null # Varies by loaded model + capabilities: + - text + - tool_calling + # ─── Benchmark comparison models ──────────────────────────────────────── # These models are benchmarked against LFM2-24B-A2B to demonstrate # scaling efficiency of hybrid MoE conv+attn vs dense and standard MoE. @@ -290,7 +318,7 @@ models: tool_temperature: 0.1 max_tokens: 4096 estimated_vram_gb: 20 - deprecated: true # Partial run only (40/100), dropped from active benchmarks + deprecated: true # Partial run only (40/100), dropped from active benchmarks capabilities: - text - tool_calling @@ -358,7 +386,9 @@ models: - text - tool_calling -# Runtime configurations +# Runtime configurations (informational only — not used by the app) +# These describe how to start each runtime for reference. The app +# expects the runtime to already be running when it starts. runtimes: ollama: command: "ollama serve" @@ -371,6 +401,14 @@ runtimes: health_check: "http://localhost:8080/health" startup_timeout_seconds: 60 + lmstudio: + # Use `lms server start` CLI to start headless, or enable "Run LLM server + # on login" in app settings (Cmd/Ctrl+,). Default port is 1234. + command: "lms" + args: ["server", "start"] + health_check: "http://localhost:1234/v1/models" + startup_timeout_seconds: 30 + mlx: command: "mlx_lm.server" args: ["--model", "{model_path}", "--port", "8080"] @@ -380,9 +418,9 @@ runtimes: # Fallback chain — used when the active model is unavailable fallback_chain: - - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion - - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE - - static_response # Fallback 2 — hardcoded "model unavailable" message + - lfm2-24b-a2b # Primary — 78% single-step, 24% chain completion + - qwen3-30b-moe # Fallback 1 — Ollama-hosted Qwen3 MoE + - static_response # Fallback 2 — hardcoded "model unavailable" message # Dual-model orchestrator (ADR-009) # When enabled, the planner model decomposes multi-step workflows and @@ -402,9 +440,9 @@ fallback_chain: # to skip the orchestrator entirely and avoid the ~2-3s wasted planner call. # See ADR-009 for full details. orchestrator: - enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools. + enabled: false # With 20 curated tools, single-model loop is faster. Enable for 67+ tools. 
   planner_model: lfm2-24b-a2b
-  router_model: lfm25-1.2b-router-ft # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
+  router_model: lfm25-1.2b-router-ft  # Fine-tuned V2: 93.0% eval accuracy, 83.7% live (83 tools)
   router_top_k: 15
   max_plan_steps: 10
   step_retries: 3
@@ -414,4 +452,4 @@ orchestrator:
 # ~15 category meta-tools (~1,500 tokens) instead of all 67 tools (~8,670 tokens).
 # The model selects 2-3 categories, then subsequent turns use only those tools.
 # Saves ~7,170 tokens per turn and eliminates cross-server confusion.
-two_pass_tool_selection: true # Active only when >30 tools registered; 21 curated tools use flat mode
+two_pass_tool_selection: true  # Active only when >30 tools registered; 21 curated tools use flat mode
diff --git a/examples/localcowork/scripts/start-model.sh b/examples/localcowork/scripts/start-model.sh
index f282661..2570af3 100755
--- a/examples/localcowork/scripts/start-model.sh
+++ b/examples/localcowork/scripts/start-model.sh
@@ -14,8 +14,14 @@ set -euo pipefail
 
 MODELS_DIR="${LOCALCOWORK_MODELS_DIR:-$HOME/Projects/_models}"
 
+# Ensure HuggingFace cache is in home directory, NOT in project repo
+# This prevents .cache/huggingface from being created in the project
+export HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
+export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
+echo "📁 HuggingFace cache: HF_HOME=$HF_HOME"
+
 # Main model (LFM2-24B-A2B)
-MAIN_MODEL="LFM2-24B-A2B-Preview-Q4_K_M.gguf"
+MAIN_MODEL="LFM2-24B-A2B-Q4_K_M.gguf"
 MAIN_PORT=8080
 MAIN_CTX=32768
 
@@ -30,40 +36,40 @@ START_VISION=false
 CHECK_ONLY=false
 
 for arg in "$@"; do
-  case "$arg" in
-    --vision) START_VISION=true ;;
-    --check) CHECK_ONLY=true ;;
-    --help|-h)
-      echo "Usage: $0 [--vision] [--check]"
-      echo ""
-      echo "  --vision   Also start the vision model server (port $VISION_PORT)"
-      echo "  --check    Check if model files exist (don't start servers)"
-      echo ""
-      echo "Environment:"
-      echo "  LOCALCOWORK_MODELS_DIR   Model directory (default: ~/Projects/_models)"
-      exit 0
-      ;;
-    *)
-      echo "Unknown argument: $arg"
-      echo "Run '$0 --help' for usage."
-      exit 1
-      ;;
-  esac
+    case "$arg" in
+    --vision) START_VISION=true ;;
+    --check) CHECK_ONLY=true ;;
+    --help | -h)
+        echo "Usage: $0 [--vision] [--check]"
+        echo ""
+        echo "  --vision   Also start the vision model server (port $VISION_PORT)"
+        echo "  --check    Check if model files exist (don't start servers)"
+        echo ""
+        echo "Environment:"
+        echo "  LOCALCOWORK_MODELS_DIR   Model directory (default: ~/Projects/_models)"
+        exit 0
+        ;;
+    *)
+        echo "Unknown argument: $arg"
+        echo "Run '$0 --help' for usage."
+        exit 1
+        ;;
+    esac
 done
 
 # ── Check llama-server ───────────────────────────────────────────────────────
-if ! command -v llama-server &> /dev/null; then
-  echo "❌ llama-server not found."
-  echo ""
-  echo "Install via Homebrew (macOS):"
-  echo "  brew install llama.cpp"
-  echo ""
-  echo "Or build from source:"
-  echo "  git clone https://github.com/ggml-org/llama.cpp"
-  echo "  cd llama.cpp && cmake -B build && cmake --build build --config Release"
-  echo "  # Binary at: build/bin/llama-server"
-  exit 1
+if ! command -v llama-server &>/dev/null; then
+    echo "❌ llama-server not found."
+ echo "" + echo "Install via Homebrew (macOS):" + echo " brew install llama.cpp" + echo "" + echo "Or build from source:" + echo " git clone https://github.com/ggml-org/llama.cpp" + echo " cd llama.cpp && cmake -B build && cmake --build build --config Release" + echo " # Binary at: build/bin/llama-server" + exit 1 fi echo "✅ llama-server found: $(command -v llama-server)" @@ -79,55 +85,55 @@ VISION_PATH="$MODELS_DIR/$VISION_MODEL" MMPROJ_PATH="$MODELS_DIR/$VISION_MMPROJ" if [ -f "$MAIN_PATH" ]; then - MAIN_SIZE=$(du -h "$MAIN_PATH" | cut -f1) - echo "✅ Main model: $MAIN_MODEL ($MAIN_SIZE)" + MAIN_SIZE=$(du -h "$MAIN_PATH" | cut -f1) + echo "✅ Main model: $MAIN_MODEL ($MAIN_SIZE)" else - echo "❌ Main model not found: $MAIN_PATH" - echo "" - echo " Download LFM2-24B-A2B from HuggingFace (gated — request access first):" - echo " https://huggingface.co/LiquidAI/LFM2-24B-A2B-Preview" - echo "" - echo " pip install huggingface-hub" - echo " python3 -c \"" - echo " from huggingface_hub import hf_hub_download" - echo " hf_hub_download('LiquidAI/LFM2-24B-A2B-Preview'," - echo " 'LFM2-24B-A2B-Preview-Q4_K_M.gguf'," - echo " local_dir='$MODELS_DIR')" - echo " \"" - if [ "$CHECK_ONLY" = true ]; then - echo "" - else - exit 1 - fi + echo "❌ Main model not found: $MAIN_PATH" + echo "" + echo " Download LFM2-24B-A2B from HuggingFace (gated — request access first):" + echo " https://huggingface.co/LiquidAI/LFM2-24B-A2B" + echo "" + echo " pip install huggingface-hub" + echo " python3 -c \"" + echo " from huggingface_hub import hf_hub_download" + echo " hf_hub_download('LiquidAI/LFM2-24B-A2B'," + echo " 'LFM2-24B-A2B-Q4_K_M.gguf'," + echo " local_dir='$MODELS_DIR')" + echo " \"" + if [ "$CHECK_ONLY" = true ]; then + echo "" + else + exit 1 + fi fi if [ -f "$VISION_PATH" ] && [ -f "$MMPROJ_PATH" ]; then - echo "✅ Vision model: $VISION_MODEL + mmproj" + echo "✅ Vision model: $VISION_MODEL + mmproj" else - echo "⚠️ Vision model not found (optional — OCR falls back to Tesseract)" - if [ "$START_VISION" = true ]; then - echo "" - echo " Download from: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF" - echo "" - echo " pip install huggingface-hub" - echo " python3 -c \"" - echo " from huggingface_hub import hf_hub_download" - echo " for f in ['$VISION_MODEL', '$VISION_MMPROJ']:" - echo " hf_hub_download('LiquidAI/LFM2.5-VL-1.6B-GGUF', f," - echo " local_dir='$MODELS_DIR')" - echo " \"" - fi + echo "⚠️ Vision model not found (optional — OCR falls back to Tesseract)" + if [ "$START_VISION" = true ]; then + echo "" + echo " Download from: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF" + echo "" + echo " pip install huggingface-hub" + echo " python3 -c \"" + echo " from huggingface_hub import hf_hub_download" + echo " for f in ['$VISION_MODEL', '$VISION_MMPROJ']:" + echo " hf_hub_download('LiquidAI/LFM2.5-VL-1.6B-GGUF', f," + echo " local_dir='$MODELS_DIR')" + echo " \"" + fi fi if [ "$CHECK_ONLY" = true ]; then - exit 0 + exit 0 fi # ── Start main model server ───────────────────────────────────────────────── if [ ! -f "$MAIN_PATH" ]; then - echo "Cannot start server — main model file missing." - exit 1 + echo "Cannot start server — main model file missing." + exit 1 fi echo "" @@ -141,11 +147,11 @@ echo "" # Start main model in background llama-server \ - --model "$MAIN_PATH" \ - --port "$MAIN_PORT" \ - --ctx-size "$MAIN_CTX" \ - --n-gpu-layers 99 \ - --flash-attn & + --model "$MAIN_PATH" \ + --port "$MAIN_PORT" \ + --ctx-size "$MAIN_CTX" \ + --n-gpu-layers 99 \ + --flash-attn & MAIN_PID=$! 
echo " PID: $MAIN_PID" @@ -153,47 +159,47 @@ echo " PID: $MAIN_PID" # Wait for health check echo -n " Waiting for server..." for i in $(seq 1 60); do - if curl -sf "http://localhost:$MAIN_PORT/health" > /dev/null 2>&1; then - echo " ready!" - break - fi - if [ "$i" -eq 60 ]; then - echo " timeout (60s). Check logs above for errors." - exit 1 - fi - sleep 1 - echo -n "." + if curl -sf "http://localhost:$MAIN_PORT/health" >/dev/null 2>&1; then + echo " ready!" + break + fi + if [ "$i" -eq 60 ]; then + echo " timeout (60s). Check logs above for errors." + exit 1 + fi + sleep 1 + echo -n "." done # ── Start vision model server (optional) ───────────────────────────────────── if [ "$START_VISION" = true ] && [ -f "$VISION_PATH" ] && [ -f "$MMPROJ_PATH" ]; then - echo "" - echo "═══════════════════════════════════════════════════" - echo " Starting LFM2.5-VL-1.6B on port $VISION_PORT" - echo "═══════════════════════════════════════════════════" - - llama-server \ - --model "$VISION_PATH" \ - --mmproj "$MMPROJ_PATH" \ - --port "$VISION_PORT" \ - --ctx-size 32768 & - - VISION_PID=$! - echo " PID: $VISION_PID" - - echo -n " Waiting for server..." - for i in $(seq 1 60); do - if curl -sf "http://localhost:$VISION_PORT/health" > /dev/null 2>&1; then - echo " ready!" - break - fi - if [ "$i" -eq 60 ]; then - echo " timeout. Vision OCR will fall back to Tesseract." - fi - sleep 1 - echo -n "." - done + echo "" + echo "═══════════════════════════════════════════════════" + echo " Starting LFM2.5-VL-1.6B on port $VISION_PORT" + echo "═══════════════════════════════════════════════════" + + llama-server \ + --model "$VISION_PATH" \ + --mmproj "$MMPROJ_PATH" \ + --port "$VISION_PORT" \ + --ctx-size 32768 & + + VISION_PID=$! + echo " PID: $VISION_PID" + + echo -n " Waiting for server..." + for i in $(seq 1 60); do + if curl -sf "http://localhost:$VISION_PORT/health" >/dev/null 2>&1; then + echo " ready!" + break + fi + if [ "$i" -eq 60 ]; then + echo " timeout. Vision OCR will fall back to Tesseract." + fi + sleep 1 + echo -n "." + done fi # ── Summary ────────────────────────────────────────────────────────────────── @@ -204,7 +210,7 @@ echo " Model servers running" echo "═══════════════════════════════════════════════════" echo " Main: http://localhost:$MAIN_PORT/v1 (PID $MAIN_PID)" if [ "$START_VISION" = true ] && [ -n "${VISION_PID:-}" ]; then - echo " Vision: http://localhost:$VISION_PORT/v1 (PID $VISION_PID)" + echo " Vision: http://localhost:$VISION_PORT/v1 (PID $VISION_PID)" fi echo "" echo " In another terminal: cargo tauri dev"