64 changes: 32 additions & 32 deletions .github/workflows/populate_search_engine.yml
@@ -5,6 +5,12 @@ on:
   # - cron: "5 7 * * *" # every day at 07:05
   # to run this workflow manually from the Actions tab
   workflow_dispatch:
+    inputs:
+      full_rebuild:
+        description: 'Full rebuild (ignores existing embeddings)'
+        required: false
+        default: false
+        type: boolean

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -29,7 +35,19 @@ jobs:
       - name: Install doc-builder
         run: uv sync --extra dev

-      - name: Populate search engine from HF doc-build dataset
+      # Incremental mode (default): only process new/changed documents
+      - name: Populate search engine (incremental)
+        if: ${{ github.event.inputs.full_rebuild != 'true' }}
+        env:
+          HF_IE_URL: ${{ secrets.HF_IE_URL }}
+          HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
+          MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: uv run doc-builder populate-search-engine --incremental
+
+      # Full rebuild mode: process all documents (for manual runs when needed)
+      - name: Populate search engine (full rebuild)
+        if: ${{ github.event.inputs.full_rebuild == 'true' }}
         env:
           HF_IE_URL: ${{ secrets.HF_IE_URL }}
           HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
Expand All @@ -53,41 +71,23 @@ jobs:
- name: Install doc-builder
run: uv sync --extra dev

- name: Add gradio docs to meilisearch
# Incremental mode (default): only process new/changed Gradio documents
- name: Add gradio docs to meilisearch (incremental)
if: ${{ github.event.inputs.full_rebuild != 'true' }}
env:
HF_IE_URL: ${{ secrets.HF_IE_URL }}
HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
run: uv run doc-builder add-gradio-docs

# cleanup-job:
# needs: [process-docs, gradio-job]
# runs-on: ubuntu-latest
# if: always() # This ensures that the cleanup job runs regardless of the result
# steps:
# - name: Checkout doc-builder
# uses: actions/checkout@v4

# - name: Install uv
# uses: astral-sh/setup-uv@v4
# with:
# version: "latest"
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: uv run doc-builder add-gradio-docs --incremental

# - name: Set up Python 3.10
# run: uv python install 3.10

# - name: Install doc-builder
# run: uv sync --extra dev

# - name: Success Cleanup
# if: needs.process-docs.result == 'success' # Runs if job succeeded
# env:
# MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
# run: uv run doc-builder meilisearch-clean --swap
# Full rebuild mode: process all Gradio documents
- name: Add gradio docs to meilisearch (full rebuild)
if: ${{ github.event.inputs.full_rebuild == 'true' }}
env:
HF_IE_URL: ${{ secrets.HF_IE_URL }}
HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
run: uv run doc-builder add-gradio-docs

# - name: Failure Cleanup
# if: needs.process-docs.result == 'failure' # Runs if job failed
# env:
# MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
# run: uv run doc-builder meilisearch-clean

65 changes: 65 additions & 0 deletions README.md
@@ -17,6 +17,12 @@ This is the package we use to build the documentation of our Hugging Face repos.
    + [Code Quality & Formatting](#code-quality--formatting)
    + [Testing](#testing)
    + [Development Workflow](#development-workflow)
  * [Search Engine Population](#search-engine-population)
    + [How it works](#how-it-works)
    + [Incremental Updates](#incremental-updates)
    + [Commands](#commands)
    + [Migrations](#migrations)
    + [Environment Variables](#environment-variables)
  * [Writing documentation for Hugging Face libraries](#writing-documentation-for-hugging-face-libraries)
    + [Internal link to object](#internal-link-to-object)
    + [External link to object](#external-link-to-object)
@@ -325,6 +331,65 @@ uv run ruff format --check .
uv run python -m pytest -n 1 --dist=loadfile -s -v ./tests/
```

## Search Engine Population

The `doc-builder` includes tools for populating the HuggingFace documentation search engine with embeddings. This enables semantic search across all HuggingFace library documentation.

### How it works

1. Documentation is downloaded from the `hf-doc-build/doc-build` dataset
2. Documents are chunked and assigned deterministic IDs: `{library}-{page}-{sha256_hash_of_text[:8]}` (see the sketch below)
3. Embeddings are generated via an inference endpoint
4. Embeddings are uploaded to Meilisearch
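
The deterministic ID scheme from step 2 can be reproduced outside the pipeline. The snippet below is a minimal sketch of the idea; the real helper is `generate_doc_id` in `doc_builder.meilisearch_helper`, and `make_doc_id` here is only a hypothetical stand-in that mirrors the documented format:

```python
import hashlib

def make_doc_id(library: str, page: str, text: str) -> str:
    """Sketch of the documented scheme: {library}-{page}-{sha256_hash_of_text[:8]}."""
    text_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
    return f"{library}-{page}-{text_hash}"

# Illustrative values: the same chunk text always yields the same ID,
# which is what makes incremental updates and deduplication possible.
print(make_doc_id("transformers", "model_doc/bert", "BERT is a bidirectional transformer..."))
```

Because an ID depends only on the library, the page, and the chunk text, reruns over unchanged docs reproduce the same IDs, while any edited chunk surfaces as a new ID.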

### Incremental Updates

To avoid reprocessing unchanged documents, the system tracks document IDs in a HuggingFace dataset (`hf-doc-build/doc-builder-embeddings-tracker`). In incremental mode:

1. Existing document IDs are fetched from the tracker dataset
2. New IDs are reconstructed from source docs (deterministic, no Meilisearch needed)
3. Only new/changed documents are processed (see the sketch below)
4. New embeddings are uploaded directly to the main index
5. The tracker dataset is updated with new IDs
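
Conceptually, steps 1-3 boil down to a set difference between the IDs already in the tracker and the IDs reconstructed from the current docs. The sketch below assumes the tracker dataset exposes an `id` column on its `train` split and reuses the hypothetical `make_doc_id` helper from above; `select_new_chunks` is not part of the doc-builder API, just an illustration:

```python
from datasets import load_dataset

def select_new_chunks(chunks, tracker_repo="hf-doc-build/doc-builder-embeddings-tracker"):
    """Keep only the chunks whose deterministic ID is not tracked yet."""
    # Step 1: IDs that already have embeddings, read from the tracker dataset.
    existing_ids = set(load_dataset(tracker_repo, split="train")["id"])
    # Steps 2-3: rebuild IDs from the source chunks and keep only the unseen ones.
    return [
        c for c in chunks
        if make_doc_id(c.package_name, c.page, c.text) not in existing_ids
    ]
```

Only the chunks returned here would then be embedded, pushed to the main index, and have their IDs appended to the tracker dataset (steps 4-5).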

### Commands

**Populate search engine (incremental - recommended):**
```bash
uv run doc-builder populate-search-engine --incremental
```

**Populate search engine (full rebuild):**
```bash
uv run doc-builder populate-search-engine
```

**Add Gradio docs (incremental):**
```bash
uv run doc-builder add-gradio-docs --incremental
```

**Add Gradio docs (full rebuild):**
```bash
uv run doc-builder add-gradio-docs
```

### Migrations

**Initialize the tracker dataset (one-time setup):**
```bash
uv run python migrations/init_embeddings_tracker.py --hf_token <token>
```

This reconstructs all document IDs from the source docs at `hf-doc-build/doc-build` and pushes them to the tracker dataset. No Meilisearch access required.

### Environment Variables

- `HF_IE_URL`: Inference endpoint URL for embedding generation
- `HF_IE_TOKEN`: Token for the inference endpoint
- `MEILISEARCH_KEY`: Meilisearch API key
- `HF_TOKEN`: HuggingFace token for updating the tracker dataset (incremental mode)

## Writing documentation for Hugging Face libraries

`doc-builder` expects Markdown so you should write any new documentation in `".mdx"` files for tutorials, guides, API documentations. For docstrings, we follow the [Google format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) with the main difference that you should use Markdown instead of restructured text (hopefully, that will be easier!)
112 changes: 112 additions & 0 deletions migrations/init_embeddings_tracker.py
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Script to initialize the embeddings tracker dataset by processing docs from hf-doc-build/doc-build.
This reconstructs document IDs deterministically without needing Meilisearch.

Usage:
uv run python migrations/init_embeddings_tracker.py --hf_token <token>

The dataset will be created at: hf-doc-build/doc-builder-embeddings-tracker
"""

import argparse
import os
from pathlib import Path

from datasets import Dataset
from tqdm import tqdm

from doc_builder.meilisearch_helper import generate_doc_id
from doc_builder.process_hf_docs import process_all_libraries

# Dataset repository for tracking embeddings
EMBEDDINGS_TRACKER_REPO = "hf-doc-build/doc-builder-embeddings-tracker"


def main():
    parser = argparse.ArgumentParser(description="Initialize embeddings tracker dataset from hf-doc-build/doc-build")
    parser.add_argument(
        "--hf_token",
        type=str,
        required=False,
        help="HuggingFace token with write access (or set HF_TOKEN env var)",
    )
    parser.add_argument(
        "--repo",
        type=str,
        default=EMBEDDINGS_TRACKER_REPO,
        help=f"Dataset repository ID (default: {EMBEDDINGS_TRACKER_REPO})",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Directory for downloaded/extracted files (uses temp dir if not specified)",
    )
    args = parser.parse_args()

    hf_token = args.hf_token or os.environ.get("HF_TOKEN")

    if not hf_token:
        raise ValueError("HF_TOKEN is required. Set via --hf_token or HF_TOKEN env var.")

    # Process all libraries from hf-doc-build/doc-build (same as populate-search-engine)
    print("=" * 80)
    print("DOWNLOADING AND PROCESSING DOCS FROM hf-doc-build/doc-build")
    print("=" * 80)

    results = process_all_libraries(
        output_dir=Path(args.output_dir) if args.output_dir else None,
        excerpts_max_length=2000,  # Same as default in populate-search-engine
    )

    # Generate document IDs for all chunks
    print("\n" + "=" * 80)
    print("GENERATING DOCUMENT IDS")
    print("=" * 80)

    entries = []
    for _library_name, chunks in tqdm(results.items(), desc="Processing libraries"):
        for chunk in chunks:
            doc_id = generate_doc_id(chunk.package_name, chunk.page, chunk.text)
            entries.append(
                {
                    "id": doc_id,
                    "library": chunk.package_name,
                    "source_page_url": chunk.source_page_url,
                }
            )

    print(f"\nTotal document IDs generated: {len(entries)}")

    # Deduplicate by ID (in case of any duplicates)
    seen_ids = set()
    unique_entries = []
    for entry in entries:
        if entry["id"] not in seen_ids:
            seen_ids.add(entry["id"])
            unique_entries.append(entry)

    print(f"Unique document IDs: {len(unique_entries)}")

    # Create and push dataset
    print("\n" + "=" * 80)
    print("PUSHING TO HUGGINGFACE")
    print("=" * 80)

    dataset = Dataset.from_list(unique_entries)
    print(f"Created dataset with {len(dataset)} entries")
    print(f"Columns: {dataset.column_names}")

    print(f"Pushing to {args.repo}...")
    dataset.push_to_hub(args.repo, token=hf_token, private=False)

    print("\n" + "=" * 80)
    print("✅ MIGRATION COMPLETE")
    print("=" * 80)
    print(f"Dataset created at: https://huggingface.co/datasets/{args.repo}")
    print(f"Total documents tracked: {len(unique_entries)}")


if __name__ == "__main__":
    main()
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -74,7 +74,9 @@ dev = [
     "google-api-python-client",
     "requests",
     # quality dependencies
-    "ruff>=0.1.0"
+    "ruff>=0.1.0",
+    # embeddings/search engine dependencies
+    "datasets>=4.1.1",
 ]
release = ["twine"]
