Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@ __pycache__
.env*
.venv/
logs/
pageindex.egg-info/
dist/
*.db
venv/
uv.lock
73 changes: 73 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,79 @@ You can generate the PageIndex tree structure with this open-source repo, or use

---

# 🚀 SDK Usage

A unified `PageIndexClient` powers both local self-hosted and cloud-managed modes. The mode is auto-detected from whether you pass an `api_key`: with one, the client runs in cloud mode; without one, it runs in local mode.

### Install

```bash
pip install pageindex
```

### Quick start

```python
from pageindex import PageIndexClient

# Local mode — uses your LLM key (e.g. OPENAI_API_KEY in env).
client = PageIndexClient(model="gpt-4o-2024-11-20")

col = client.collection()
doc_id = col.add("path/to/your.pdf")

print(col.query("What is the main contribution?", doc_ids=doc_id))

# Cloud mode — fully managed, no LLM key needed:
# client = PageIndexClient(api_key="your-pageindex-api-key")
```

`col.query(...)` returns the answer string by default. Always pass `doc_ids` for reliable single-document QA — omitting it queries the entire collection, which is experimental (see below).

### Streaming queries

```python
import asyncio

async def main():
    async for ev in col.query("Explain multi-head attention", doc_ids=doc_id, stream=True):
        if ev.type == "answer_delta":
            print(ev.data, end="", flush=True)
        elif ev.type == "tool_call":
            print(f"\n[tool] {ev.data['name']}")

asyncio.run(main())
```

`ev.type` is one of: `tool_call`, `tool_result`, `answer_delta`, `answer_done`.

### Multi-document collections (experimental)

Passing `doc_ids` scopes the query to a specific subset of documents — this is the recommended path. `doc_ids` accepts a single id (`str`) or a list:

```python
col.query("What does this paper say?", doc_ids=doc1) # single
col.query("Compare these two papers", doc_ids=[doc1, doc2]) # multi
```

Omitting `doc_ids` queries the **entire collection** and lets the agent pick which docs to read. This is an **experimental** feature with a naive first implementation — we're actively working on better cross-document retrieval. When the collection holds more than one document, a `UserWarning` is emitted; set `PAGEINDEX_EXPERIMENTAL_MULTIDOC=1` to silence it.

### Environment variables

| Variable | Effect |
|---|---|
| `OPENAI_API_KEY` (or any LiteLLM `<PROVIDER>_API_KEY`) | LLM provider key — local mode |
| `PAGEINDEX_API_KEY` | PageIndex cloud key — cloud mode |
| `PAGEINDEX_EXPERIMENTAL_MULTIDOC` | Set to `1` to silence the warning when calling `col.query(...)` without `doc_ids` |

### Runnable examples

- [`examples/local_demo.py`](examples/local_demo.py) — local mode end-to-end (index a PDF + streaming QA)
- [`examples/cloud_demo.py`](examples/cloud_demo.py) — cloud mode end-to-end
- [`examples/agentic_vectorless_rag_demo.py`](examples/agentic_vectorless_rag_demo.py) — lower-level integration with the OpenAI Agents SDK

---

# ⚙️ Package Usage

You can follow these steps to generate a PageIndex tree from a PDF document.
Expand Down
62 changes: 62 additions & 0 deletions examples/cloud_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Agentic Vectorless RAG with PageIndex SDK - Cloud Demo

Uses CloudClient for fully-managed document indexing and QA.
No LLM API key needed — the cloud service handles everything.

Steps:
1 — Upload and index a PDF via PageIndex cloud
2 — Stream a question with tool call visibility

Requirements:
pip install pageindex
export PAGEINDEX_API_KEY=your-api-key
"""
import asyncio
import os
from pathlib import Path
import requests
from pageindex import CloudClient

_EXAMPLES_DIR = Path(__file__).parent
PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"

# Download PDF if needed.
# The body is streamed into a sibling ".part" file and only renamed to
# PDF_PATH once the download has finished. Writing straight to PDF_PATH would
# leave a truncated file behind if the transfer were interrupted, and the
# exists() check above would then accept that broken file on every later run.
if not PDF_PATH.exists():
    print(f"Downloading {PDF_URL} ...")
    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    with requests.get(PDF_URL, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    partial_path.replace(PDF_PATH)
    print("Download complete.\n")

# Raises KeyError if PAGEINDEX_API_KEY is not set in the environment.
client = CloudClient(api_key=os.environ["PAGEINDEX_API_KEY"])
col = client.collection()

doc_id = col.add(str(PDF_PATH))
print(f"Indexed: {doc_id}\n")

# Streaming query, scoped to the document that was just indexed. Passing
# doc_ids is the recommended path for single-document QA; without it the
# query would run against the whole collection (experimental multi-doc mode).
stream = col.query(
    "What is the main contribution of this paper?",
    doc_ids=doc_id,
    stream=True,
)

async def main():
    """Drain the module-level ``stream`` and echo its events to stdout.

    Answer text is printed incrementally as it arrives; each tool call is
    printed on its own line as ``[tool call] name(args)``.
    """
    # True while answer text has been written without a terminating newline.
    mid_line = False
    async for ev in stream:
        kind = ev.type
        if kind == "answer_delta":
            print(ev.data, end="", flush=True)
            mid_line = True
            continue
        if kind == "tool_call":
            # Finish any partially printed answer line before the tool line.
            if mid_line:
                print()
            mid_line = False
            args = ev.data.get("args", "")
            print(f"[tool call] {ev.data['name']}({args})")
            continue
        if kind == "answer_done":
            print()
            mid_line = False


asyncio.run(main())
149 changes: 149 additions & 0 deletions examples/demo_query_modes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""Demo: exercise Collection.query() in all modes.

Creates a temp workspace with 2 small markdown docs, then runs:
Case 1 — single-doc collection, no doc_ids (open mode, no warning)
Case 2 — multi-doc collection, no doc_ids (open mode, UserWarning)
Case 2b — same as Case 2 + PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 (warning silenced)
Case 3 — scoped: doc_ids=[one_id] (no list_documents call)
Case 4 — scoped: doc_ids=[id1, id2] (no list_documents call)

Requirements:
- OPENAI_API_KEY (or any LiteLLM-supported provider key) in env or .env
"""
import asyncio
import os
import shutil
import tempfile
import warnings
from pathlib import Path

# Load .env if present
env_file = Path(__file__).parent.parent / ".env"


def load_env_file(path: Path) -> None:
    """Copy ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Variables that are already set in the environment are left untouched, so
    real environment settings take precedence over the file. Blank lines,
    ``#`` comment lines and lines without ``=`` are skipped. An optional
    leading ``export `` on the key and one pair of matching quotes around the
    value are removed, so ``export KEY="value"`` yields ``value`` rather than
    ``"value"`` stored under the key ``export KEY``.

    Args:
        path: Location of the dotenv file. A missing file is ignored.
    """
    if not path.exists():
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()
        value = value.strip()
        # Drop one pair of matching surrounding quotes, as dotenv tools do.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if key:  # an empty variable name is not valid in the environment
            os.environ.setdefault(key, value)


load_env_file(env_file)

from pageindex import PageIndexClient


def banner(text: str) -> None:
    """Print *text* framed by a blank line and two 70-character ``=`` rules."""
    rule = "=" * 70
    print(f"\n{rule}\n{text}\n{rule}")


# Scratch directory for this run: holds the source documents and is also
# handed to the client as its storage path.
WORKSPACE = tempfile.mkdtemp(prefix="pi_demo_")
print(f"Workspace: {WORKSPACE}")

docs_dir = Path(WORKSPACE, "docs")
docs_dir.mkdir()

# Two tiny markdown documents on clearly different topics, so that the
# answers in the cases below can be attributed to one document or the other.
alpha_md = docs_dir / "alpha.md"
beta_md = docs_dir / "beta.md"
for _doc_path, _doc_body in (
    (
        alpha_md,
        "# Alpha\n\n"
        "## Introduction\n"
        "Alpha is about apples and their nutritional value.\n\n"
        "## Health benefits\n"
        "Apples contain fiber and vitamin C, support digestion, and may help "
        "regulate blood sugar.\n",
    ),
    (
        beta_md,
        "# Beta\n\n"
        "## Introduction\n"
        "Beta is about bananas and potassium.\n\n"
        "## Energy\n"
        "Bananas provide quick energy from natural sugars and are rich in "
        "potassium, supporting muscle function.\n",
    ),
):
    _doc_path.write_text(_doc_body)

client = PageIndexClient(model="gpt-4o-2024-11-20", storage_path=WORKSPACE)


async def stream_and_collect(coro_or_stream) -> list[str]:
    """Iterate a QueryStream, print tool calls and answer, return tool-call names.

    Args:
        coro_or_stream: An async-iterable stream of events, or a coroutine
            that resolves to one. Each event is read through its ``type``
            and ``data`` attributes.

    Returns:
        The ``name`` of every ``tool_call`` event, in the order received.
    """
    # A bare coroutine cannot be consumed with ``async for``; resolve it first
    # so both kinds of input promised by the parameter name are accepted.
    if asyncio.iscoroutine(coro_or_stream):
        coro_or_stream = await coro_or_stream

    calls: list[str] = []
    async for ev in coro_or_stream:
        if ev.type == "tool_call":
            name = ev.data["name"]
            calls.append(name)
            print(f" [tool] {name}({ev.data.get('args', '')})")
        elif ev.type == "answer_done":
            text = str(ev.data)
            # Keep the demo output compact: show at most 160 characters.
            print(f" [answer] {text[:160]}{'...' if len(text) > 160 else ''}")
    return calls


def _clip(text) -> str:
    """Render *text* for display: its first 160 characters, plus '...' if cut."""
    return f"{text[:160]}{'...' if len(text) > 160 else ''}"


def _query_recording_user_warnings(collection, question):
    """Run ``collection.query(question)`` and capture the warnings it raises.

    Returns an ``(answer, user_warnings)`` pair, where *user_warnings* holds
    the recorded warnings whose category is ``UserWarning`` or a subclass.
    """
    with warnings.catch_warnings(record=True) as recorded:
        # "always" turns off once-per-location suppression, so a warning that
        # was already shown by an earlier case is still recorded here.
        warnings.simplefilter("always")
        reply = collection.query(question)
    return reply, [w for w in recorded if issubclass(w.category, UserWarning)]


async def _run_scoped_case(collection, question, ids):
    """Stream a query limited to *ids* and check ``list_documents`` was not called."""
    calls = await stream_and_collect(
        collection.query(question, doc_ids=ids, stream=True)
    )
    assert "list_documents" not in calls, f"unexpected list_documents call: {calls}"
    print(f"Tools called: {calls}")


try:
    # ── Case 1 ────────────────────────────────────────────────────────────
    banner("Case 1: single-doc collection, no doc_ids (no warning expected)")
    single = client.collection("single_test")
    d_alpha_solo = single.add(str(alpha_md))
    print(f"Indexed: {d_alpha_solo}")
    answer, uw = _query_recording_user_warnings(single, "What is alpha about?")
    print(f"UserWarning count: {len(uw)} (expected 0)")
    print(f"Answer: {_clip(answer)}")

    # ── Case 2 ────────────────────────────────────────────────────────────
    banner("Case 2: multi-doc collection, no doc_ids (UserWarning expected)")
    multi = client.collection("multi_test")
    d1 = multi.add(str(alpha_md))
    d2 = multi.add(str(beta_md))
    print(f"Indexed: {d1}, {d2}")
    answer, uw = _query_recording_user_warnings(
        multi, "What are these documents about?"
    )
    print(f"UserWarning count: {len(uw)} (expected 1)")
    for w in uw:
        print(f" ⚠ {str(w.message)[:140]}")
    print(f"Answer: {_clip(answer)}")

    # ── Case 2b ───────────────────────────────────────────────────────────
    banner("Case 2b: same as Case 2 + PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 (silenced)")
    _flag = "PAGEINDEX_EXPERIMENTAL_MULTIDOC"
    prev = os.environ.get(_flag)
    os.environ[_flag] = "1"
    try:
        answer, uw = _query_recording_user_warnings(
            multi, "What are these documents about?"
        )
        print(f"UserWarning count: {len(uw)} (expected 0)")
        print(f"Answer: {_clip(answer)}")
    finally:
        # Put the variable back exactly as it was before this case ran.
        if prev is None:
            del os.environ[_flag]
        else:
            os.environ[_flag] = prev

    # ── Case 3 ────────────────────────────────────────────────────────────
    banner(f"Case 3: scoped, doc_ids=[{d1[:8]}…] (no list_documents)")
    asyncio.run(_run_scoped_case(multi, "What are apples good for?", [d1]))

    # ── Case 4 ────────────────────────────────────────────────────────────
    banner(f"Case 4: scoped, doc_ids=[{d1[:8]}…, {d2[:8]}…] (no list_documents)")
    asyncio.run(_run_scoped_case(multi, "Compare alpha and beta briefly.", [d1, d2]))

    print("\nAll cases passed.")

finally:
    # Always remove the temporary workspace, even when a case failed.
    shutil.rmtree(WORKSPACE, ignore_errors=True)
    print(f"\nCleaned up {WORKSPACE}")
69 changes: 69 additions & 0 deletions examples/local_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Agentic Vectorless RAG with PageIndex SDK - Local Demo

A simple example of using LocalClient for self-hosted document indexing
and agent-based QA. The agent uses the OpenAI Agents SDK to reason over
the document's tree-structure index.

Steps:
1 — Download and index a PDF
2 — Stream a question with tool call visibility

Requirements:
pip install pageindex
export OPENAI_API_KEY=your-api-key # or any LiteLLM-supported provider
"""
import asyncio
from pathlib import Path
import requests
from pageindex import LocalClient

_EXAMPLES_DIR = Path(__file__).parent
PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"
WORKSPACE = _EXAMPLES_DIR / "workspace"
MODEL = "gpt-4o-2024-11-20"  # any LiteLLM-supported model

# Download PDF if needed.
# The body is streamed into a sibling ".part" file and only renamed to
# PDF_PATH once the download has finished. Writing straight to PDF_PATH would
# leave a truncated file behind if the transfer were interrupted, and the
# exists() check above would then accept that broken file on every later run.
if not PDF_PATH.exists():
    print(f"Downloading {PDF_URL} ...")
    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    with requests.get(PDF_URL, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    partial_path.replace(PDF_PATH)
    print("Download complete.\n")

client = LocalClient(model=MODEL, storage_path=str(WORKSPACE))
col = client.collection()

doc_id = col.add(str(PDF_PATH))
print(f"Indexed: {doc_id}\n")

# Streaming query, scoped to the document that was just indexed. Passing
# doc_ids is the recommended path for single-document QA. WORKSPACE is a fixed
# directory, so the collection may still hold documents from earlier runs
# (NOTE(review): depends on how the client persists storage — confirm).
stream = col.query(
    "What is the main architecture proposed in this paper and how does self-attention work?",
    doc_ids=doc_id,
    stream=True,
)

async def main():
    """Drain the module-level ``stream`` and echo its events to stdout.

    Answer text is printed incrementally as it arrives; tool calls and tool
    results each get their own line, with long tool results cut to a preview.
    """
    # True while answer text has been written without a terminating newline.
    mid_line = False
    async for ev in stream:
        kind = ev.type
        if kind == "answer_delta":
            print(ev.data, end="", flush=True)
            mid_line = True
            continue
        if kind == "tool_call":
            # Finish any partially printed answer line before the tool line.
            if mid_line:
                print()
            mid_line = False
            print(f"[tool call] {ev.data['name']}")
            continue
        if kind == "tool_result":
            rendered = str(ev.data)
            # Only results longer than 200 characters are shortened.
            if len(rendered) > 200:
                preview = rendered[:200] + "..."
            else:
                preview = ev.data
            print(f"[tool output] {preview}")
            continue
        if kind == "answer_done":
            print()
            mid_line = False


asyncio.run(main())
Loading
Loading