VectifyAI · harshrathod0585 · Jun 2, 2026 · Jun 2, 2026
diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1,31 @@
+# Incremental Markdown Update Demo
+
+`incremental_update_demo.py` — index a Markdown doc, then incrementally update
+it so only changed sections are re-summarized.
+
+Set an API key first (e.g. `export OPENAI_API_KEY=...`) and configure the model
+in `pageindex/config.yaml`.
+
+```bash
+python examples/incremental_update_demo.py
+```
+
+## How it works
+
+- `client.get_doc_id_by_path(path)` returns the `doc_id` already indexed for a
+  file path, or `None`.
+- First run for a path → `client.index(path)` builds the tree fresh.
+- Later runs → `client.update(doc_id)` re-summarizes **only** the sections whose
+  content hash changed or that are new, plus their ancestors; all other
+  sections reuse their cached summary. An unchanged file returns
+  `{"status": "unchanged"}` (zero LLM work).
+
+Re-indexing the same file path reuses its `doc_id` and overwrites the same
+workspace JSON instead of creating a duplicate document.
+
+The script copies the sample (`documents/sample.md`) into a stable workspace
+path so re-runs reuse the same `doc_id`.
+
+## Workspace
+
+Indexed documents persist under `examples/workspace/` as `<doc_id>.json` plus a
+`_meta.json` index. Generated files there are throwaway test artifacts.
diff --git a/examples/documents/sample.md b/examples/documents/sample.md
@@ -0,0 +1,38 @@
+# PageIndex Overview
+
+PageIndex turns long documents into a navigable tree of sections, each with a
+summary, so agents can reason over structure instead of flat chunks. This
+sample doc is used by the incremental update demo.
+
+## 1. What PageIndex Does
+
+PageIndex parses a PDF or Markdown file into a hierarchical structure of nodes.
+Each node holds a title, its text, and a generated summary. The tree lets a
+retrieval agent walk from the document root down to the exact section that
+answers a question, without embedding every chunk into a vector store.
+
+## 2. Indexing
+
+Indexing builds the tree once. For Markdown, headings define the hierarchy; for
+PDFs, the table of contents and page layout are used. Every section is
+summarized, and the whole document gets a short description. The result is
+persisted in a workspace as JSON keyed by a document id.
+
+## 3. Incremental Update
+
+When a document changes, PageIndex avoids rebuilding everything. It hashes the
+file and each section: if the file hash is unchanged the update is skipped
+entirely, and if only some sections changed, only those (plus their ancestors)
+are re-summarized. Unchanged sections reuse their cached summary.
+
+## 4. Vectorless Retrieval
+
+Because the tree carries summaries at every level, an agent can retrieve by
+traversing the structure instead of doing nearest-neighbor search over
+embeddings. This keeps retrieval explainable and cheap to maintain.
+
+## Appendix: Key Methods
+
+`client.index(path)` builds the tree. `client.update(doc_id)` refreshes it
+incrementally. `client.get_doc_id_by_path(path)` resolves an existing document
+so the same file is never indexed twice.
diff --git a/examples/incremental_update_demo.py b/examples/incremental_update_demo.py
@@ -0,0 +1,59 @@
+"""
+Incremental Markdown Update with PageIndex - Demo
+
+Shows how PageIndexClient resolves a document by file path: the first run
+indexes it fresh; later runs find the same doc_id and call update(), which
+re-summarizes only the sections whose content changed.
+
+Flow:
+  - First run for a path → index() builds the tree fresh.
+  - Later runs → same doc_id is found, update() runs; with no content change
+    it reports "unchanged" (zero LLM work).
+
+The source document (documents/sample.md) is copied into the workspace under a
+stable path, so re-running the demo reuses the same doc_id. An API key is
+required to generate section summaries.
+
+Run:
+  python examples/incremental_update_demo.py
+"""
+import shutil
+from pathlib import Path
+
+from pageindex import PageIndexClient
+
+SOURCE_MD = Path(__file__).parent / "documents" / "sample.md"
+
+
+def ingest_or_update(client, doc_path):
+    """Update the document if its path is already indexed; index it otherwise."""
+    known_id = client.get_doc_id_by_path(str(doc_path))
+    if not known_id:
+        fresh_id = client.index(str(doc_path))
+        print(f"\n[{doc_path.name}] Indexed fresh. doc_id: {fresh_id}")
+        return fresh_id
+    outcome = client.update(known_id)
+    if outcome.get("status") == "unchanged":
+        print(f"\n[{doc_path.name}] Loaded from cache (unchanged): {known_id}")
+    else:
+        print(f"\n[{doc_path.name}] Incremental update done: {outcome}")
+    return known_id
+
+
+def main():
+    """Copy the sample doc to a stable workspace path, then index or update it."""
+    workspace = Path(__file__).parent / "workspace"
+    client = PageIndexClient(workspace=str(workspace))
+    # Stable copy inside the workspace so re-runs reuse the same doc_id.
+    workspace.mkdir(parents=True, exist_ok=True)
+    md_path = workspace / SOURCE_MD.name
+    shutil.copy(SOURCE_MD, md_path)
+
+    print("== Ingest or update ==")
+    ingest_or_update(client, md_path)
+
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pageindex/client.py b/pageindex/client.py
@@ -8,9 +8,24 @@
 import PyPDF2
 
 from .page_index import page_index
-from .page_index_md import md_to_tree
+from .page_index_md import (
+    md_to_tree,
+    extract_nodes_from_markdown,
+    extract_node_text_content,
+    get_node_summary,
+    build_tree_from_nodes,
+)
 from .retrieve import get_document, get_document_structure, get_page_content
-from .utils import ConfigLoader, remove_fields
+from .utils import (
+    ConfigLoader,
+    remove_fields,
+    hash_text,
+    compute_section_hashes,
+    find_ancestors,
+    structure_to_list,
+    write_node_id,
+    format_structure,
+)
 
 META_INDEX = "_meta.json"
 
@@ -60,7 +75,9 @@ def index(self, file_path: str, mode: str = "auto") -> str:
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"File not found: {file_path}")
 
-        doc_id = str(uuid.uuid4())
+        # Re-indexing the same file path reuses its doc_id (overwrites in place)
+        # instead of creating a duplicate document/JSON.
+        doc_id = self.get_doc_id_by_path(file_path) or str(uuid.uuid4())
         ext = os.path.splitext(file_path)[1].lower()
 
         is_pdf = ext == '.pdf'
@@ -112,6 +129,10 @@ def index(self, file_path: str, mode: str = "auto") -> str:
                     result = pool.submit(asyncio.run, coro).result()
             except RuntimeError:
                 result = asyncio.run(coro)
+            # Compute hashes from the raw file to enable incremental update().
+            _md_content = open(file_path, encoding='utf-8').read()
+            _node_list, _md_lines = extract_nodes_from_markdown(_md_content)
+            _flat_nodes = extract_node_text_content(_node_list, _md_lines)
             self.documents[doc_id] = {
                 'id': doc_id,
                 'type': 'md',
@@ -120,6 +141,8 @@ def index(self, file_path: str, mode: str = "auto") -> str:
                 'doc_description': result.get('doc_description', ''),
                 'line_count': result.get('line_count', 0),
                 'structure': result['structure'],
+                'file_hash': hash_text(_md_content),
+                'section_hashes': compute_section_hashes(_flat_nodes),
             }
         else:
             raise ValueError(f"Unsupported file format for: {file_path}")
@@ -205,6 +228,14 @@ def _load_workspace(self):
                 doc['path'] = str((self.workspace / doc['path']).resolve())
             self.documents[doc_id] = doc
 
+    def get_doc_id_by_path(self, file_path: str) -> str | None:
+        """Find the doc_id whose stored 'path' matches file_path.
+
+        file_path is user-expanded and made absolute before an exact string
+        comparison against each document's 'path'. The first match in
+        self.documents is returned; None means the path is not indexed.
+        """
+        wanted = os.path.abspath(os.path.expanduser(file_path))
+        for known_id, entry in self.documents.items():
+            if entry.get('path') == wanted:
+                return known_id
+        return None
+
     def _ensure_doc_loaded(self, doc_id: str):
         """Load full document JSON on demand (structure, pages, etc.)."""
         doc = self.documents.get(doc_id)
@@ -216,6 +247,108 @@ def _ensure_doc_loaded(self, doc_id: str):
         doc['structure'] = full.get('structure', [])
         if full.get('pages'):
             doc['pages'] = full['pages']
+        if full.get('section_hashes'):
+            doc['section_hashes'] = full['section_hashes']
+        if full.get('file_hash'):
+            doc['file_hash'] = full['file_hash']
+
+    def update(self, doc_id: str) -> dict:
+        """Incrementally update an indexed MD document.
+
+        Re-summarizes only sections whose own text changed or that are new
+        (plus their ancestors, whose roll-up may be affected); every other
+        section reuses its cached summary when one is available.
+
+        Args:
+            doc_id: id of a document already present in self.documents.
+
+        Returns:
+            {"status": "unchanged"} when the file hash equals the stored one;
+            otherwise {"status": "updated", "updated": [...], "added": [...],
+            "deleted": [...]}, each list holding sorted title paths.
+
+        Raises:
+            ValueError: if doc_id is unknown or the document type is not 'md'.
+        """
+        self._ensure_doc_loaded(doc_id)
+        doc = self.documents.get(doc_id)
+        if not doc:
+            raise ValueError(f"Unknown doc_id: {doc_id}")
+        if doc.get('type') != 'md':
+            raise ValueError("update() only supports MD documents")
+
+        # Context manager so the handle is closed even if read() raises.
+        with open(doc['path'], encoding='utf-8') as f:
+            content = f.read()
+
+        # Gate 1: file-level hash — skip entirely if nothing changed.
+        new_file_hash = hash_text(content)
+        if new_file_hash == doc.get('file_hash'):
+            return {"status": "unchanged"}
+
+        # Gate 2: section-level diff, keyed by title_path.
+        node_list, md_lines = extract_nodes_from_markdown(content)
+        new_nodes = extract_node_text_content(node_list, md_lines)
+        new_hashes = compute_section_hashes(new_nodes)
+        old_hashes = doc.get('section_hashes') or {}
+
+        new_keys = set(new_hashes)
+        old_keys = set(old_hashes)
+        added = new_keys - old_keys
+        deleted = old_keys - new_keys
+        changed = {p for p in new_keys & old_keys if new_hashes[p] != old_hashes[p]}
+
+        # Dirty sections plus the ancestors of each (roll-up summaries).
+        dirty = changed | added
+        to_summarize = set(dirty)
+        for path in dirty:
+            to_summarize.update(find_ancestors(path))
+
+        # Cached summaries keyed by title path. A stored node may lack a
+        # 'title_path' field, and its bare title can never equal the
+        # ' > '-joined path of a nested section, so the path is rebuilt from
+        # the node's position in the stored tree.
+        # NOTE(review): assumes tree ancestry matches the level-based
+        # title_path produced by extract_node_text_content — confirm.
+        def _cached_summaries(nodes, trail, out):
+            for n in nodes:
+                here = trail + [n.get('title', '')]
+                key = n.get('title_path') or ' > '.join(here)
+                out[key] = n.get('summary') or n.get('prefix_summary', '')
+                _cached_summaries(n.get('nodes') or [], here, out)
+            return out
+
+        old_structure = doc.get('structure') or []
+        if isinstance(old_structure, dict):
+            old_structure = [old_structure]
+        old_summary_map = _cached_summaries(old_structure, [], {})
+
+        nodes_by_path = {n['title_path']: n for n in new_nodes}
+
+        async def _regenerate():
+            # Summaries are awaited one at a time, in document order.
+            out = {}
+            for path, node in nodes_by_path.items():
+                cached = old_summary_map.get(path)
+                # A clean section with no usable cached summary is
+                # regenerated rather than stored with an empty summary.
+                if path in to_summarize or not cached:
+                    out[path] = await get_node_summary(node, summary_token_threshold=200, model=self.model)
+                else:
+                    out[path] = cached
+            return out
+
+        # Only the loop probe sits inside the try, so a RuntimeError raised
+        # while summarizing is not mistaken for "no running loop".
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            summaries = asyncio.run(_regenerate())
+        else:
+            # Already inside an event loop: run in a worker thread with its own loop.
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+                summaries = pool.submit(asyncio.run, _regenerate()).result()
+
+        for node in new_nodes:
+            node['summary'] = summaries.get(node['title_path'], '')
+
+        # Rebuild the tree with fresh node ids.
+        # NOTE(review): this relies on build_tree_from_nodes() carrying the
+        # 'summary' key from the flat nodes into the tree — confirm.
+        new_structure = build_tree_from_nodes(new_nodes)
+        write_node_id(new_structure)
+        new_structure = format_structure(
+            new_structure,
+            order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'],
+        )
+
+        doc['structure'] = new_structure
+        doc['file_hash'] = new_file_hash
+        doc['section_hashes'] = new_hashes
+        doc['line_count'] = content.count('\n') + 1
+
+        if self.workspace:
+            # Write to a temp file, then swap it in so the JSON is replaced in one step.
+            tmp = self.workspace / f"{doc_id}.tmp"
+            with open(tmp, "w", encoding="utf-8") as f:
+                json.dump(doc, f, ensure_ascii=False, indent=2)
+            os.replace(tmp, self.workspace / f"{doc_id}.json")
+            self._save_meta(doc_id, self._make_meta_entry(doc))
+
+        return {
+            "status": "updated",
+            "updated": sorted(changed),
+            "added": sorted(added),
+            "deleted": sorted(deleted),
+        }
 
     def get_document(self, doc_id: str) -> str:
         """Return document metadata JSON."""

diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
@@ -75,7 +75,19 @@ def extract_node_text_content(node_list, markdown_lines):
             'level': len(header_match.group(1))
         }
         all_nodes.append(processed_node)
-
+
+    # Build title_path per node using a level-keyed ancestor stack.
+    # Enables stable section identity across edits (incremental update).
+    ancestor_stack = {}
+    for node in all_nodes:
+        level = node['level']
+        for l in list(ancestor_stack.keys()):
+            if l >= level:
+                del ancestor_stack[l]
+        parts = [ancestor_stack[l] for l in sorted(ancestor_stack)] + [node['title']]
+        node['title_path'] = ' > '.join(parts)
+        ancestor_stack[level] = node['title']
+
     for i, node in enumerate(all_nodes):
         start_line = node['line_num'] - 1 
         if i + 1 < len(all_nodes):

diff --git a/pageindex/utils.py b/pageindex/utils.py
@@ -5,6 +5,7 @@
 from datetime import datetime
 import time
 import json
+import hashlib
 import PyPDF2
 import copy
 import asyncio
@@ -708,3 +709,22 @@ def print_wrapped(text, width=100):
     for line in text.splitlines():
         print(textwrap.fill(line, width=width))
 
+
+# ---------------------------------------------------------------------------
+# Incremental update helpers
+# ---------------------------------------------------------------------------
+
+def hash_text(text: str) -> str:
+    """Return the hex-encoded SHA-256 digest of text's UTF-8 encoding."""
+    digest = hashlib.sha256()
+    digest.update(text.encode("utf-8"))
+    return digest.hexdigest()
+
+
+def compute_section_hashes(node_list: list) -> dict:
+    """Map each node's title_path to the hash of its own text.
+
+    A node without a "text" key is hashed as the empty string. When two
+    nodes share a title_path, the later one in node_list wins.
+    """
+    hashes = {}
+    for entry in node_list:
+        key = entry["title_path"]
+        hashes[key] = hash_text(entry.get("text", ""))
+    return hashes
+
+
+def find_ancestors(title_path: str) -> list:
+    """List the ancestor paths of a ' > '-joined title path, outermost first.
+
+    A path with a single segment has no ancestors and yields an empty list.
+    """
+    segments = title_path.split(" > ")
+    ancestors = []
+    for depth in range(1, len(segments)):
+        ancestors.append(" > ".join(segments[:depth]))
+    return ancestors
+