diff --git a/README.md b/README.md index 1f7957e1..04cdf79a 100755 --- a/README.md +++ b/README.md @@ -1025,7 +1025,7 @@ leann remove my-docs
📋 Click to expand: Complete CLI Reference -You can use `leann --help`, or `leann build --help`, `leann search --help`, `leann ask --help`, `leann list --help`, `leann remove --help` to get the complete CLI reference. +You can use `leann --help`, or `leann build --help`, `leann update --help`, `leann search --help`, `leann ask --help`, `leann list --help`, `leann remove --help` to get the complete CLI reference. **Build Command:** ```bash @@ -1041,6 +1041,27 @@ Options: --recompute / --no-recompute Enable recomputation (default: true) ``` +**Update Command:** +```bash +leann update INDEX_NAME --docs DIRECTORY|FILE [DIRECTORY|FILE ...] [OPTIONS] + +# Add new documents to an existing index +# Note: Only works with HNSW indices built with --no-compact + +Options: + --file-types TYPES File extensions to include (e.g., '.txt,.pdf') + --include-hidden Include hidden files/directories + --doc-chunk-size N Document chunk size (default: 256) + --doc-chunk-overlap N Document chunk overlap (default: 128) + --code-chunk-size N Code chunk size (default: 512) + --code-chunk-overlap N Code chunk overlap (default: 50) + --use-ast-chunking Enable AST-aware chunking for code + +Examples: + leann update my-docs --docs ./new-documents + leann update my-code --docs ./new-src --file-types .py,.js +``` + **Search Command:** ```bash leann search INDEX_NAME QUERY [OPTIONS] diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 79b5e1d3..2733ac91 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -85,6 +85,7 @@ def create_parser(self) -> argparse.ArgumentParser: leann build my-files --docs ./file1.py ./file2.txt ./docs/ # Build index from files and directories leann build my-mixed --docs ./readme.md ./src/ ./config.json # Build index from mixed files/dirs leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files + leann update my-docs --docs ./new-documents # Add new 
documents to existing index leann search my-docs "query" # Search in my-docs index leann ask my-docs "question" # Ask my-docs index leann list # List all stored indexes @@ -237,6 +238,77 @@ def create_parser(self) -> argparse.ArgumentParser: help="Fall back to traditional chunking if AST chunking fails (default: True)", ) + # Update command + update_parser = subparsers.add_parser( + "update", help="Update existing index with new documents" + ) + update_parser.add_argument("index_name", help="Index name to update") + update_parser.add_argument( + "--docs", + type=str, + nargs="+", + required=True, + help="New documents directories and/or files to add", + ) + update_parser.add_argument( + "--file-types", + type=str, + help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.", + ) + update_parser.add_argument( + "--include-hidden", + action=argparse.BooleanOptionalAction, + default=False, + help="Include hidden files and directories (paths starting with '.') during indexing (default: false)", + ) + update_parser.add_argument( + "--doc-chunk-size", + type=int, + default=256, + help="Document chunk size in TOKENS (default: 256). Should match original build settings for consistency.", + ) + update_parser.add_argument( + "--doc-chunk-overlap", + type=int, + default=128, + help="Document chunk overlap in TOKENS (default: 128). Should match original build settings for consistency.", + ) + update_parser.add_argument( + "--code-chunk-size", + type=int, + default=512, + help="Code chunk size in TOKENS (default: 512). Should match original build settings for consistency.", + ) + update_parser.add_argument( + "--code-chunk-overlap", + type=int, + default=50, + help="Code chunk overlap in TOKENS (default: 50). 
async def update_index(self, args):
    """Add new documents to an existing, non-compact HNSW index.

    Reads ``documents.leann.meta.json`` from the index directory, refuses to
    continue unless the metadata names the ``hnsw`` backend and the index is
    not compact, chunks the paths in ``args.docs`` via ``self.load_documents``
    and passes the chunks to ``LeannBuilder.update_index``. Embedding and graph
    settings are taken from the stored metadata, not from the command line.

    Args:
        args: Parsed CLI namespace. This method reads ``index_name``, ``docs``,
            ``file_types``, ``include_hidden``, ``doc_chunk_size``,
            ``doc_chunk_overlap``, ``code_chunk_size`` and
            ``code_chunk_overlap``; the whole namespace is also forwarded to
            ``self.load_documents``.

    Returns:
        None. Validation failures and errors raised by
        ``builder.update_index`` are reported on stdout instead of being
        raised.
    """
    # Local import, as in the original body: the file-level imports are not
    # visible from this block.
    import json

    index_name = args.index_name
    docs_paths = args.docs

    # Check if index exists
    if not self.index_exists(index_name):
        print(f"❌ Index '{index_name}' not found.")
        print(f" Use 'leann build {index_name} --docs <path>' to create it first.")
        return

    index_dir = self.indexes_dir / index_name
    index_path = self.get_index_path(index_name)
    meta_path = index_dir / "documents.leann.meta.json"

    # Load and validate metadata
    print(f"📋 Loading index metadata for '{index_name}'...")
    try:
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        # Everything below uses meta.get(...); reject any other JSON shape here
        # so it is reported as a metadata error rather than an AttributeError.
        if not isinstance(meta, dict):
            raise ValueError(f"expected a JSON object, got {type(meta).__name__}")
    except (OSError, ValueError) as e:
        # OSError: unreadable/missing file. ValueError: invalid JSON
        # (json.JSONDecodeError), undecodable bytes, or the shape check above.
        print(f"❌ Error reading index metadata: {e}")
        return

    # Validate backend is HNSW
    backend_name = meta.get("backend_name")
    if backend_name != "hnsw":
        print(f"❌ Cannot update: Index uses '{backend_name}' backend.")
        print(" Only HNSW indices support updates.")
        return

    # A null or non-dict "backend_kwargs" is treated as empty. The fallback
    # lookups below are evaluated eagerly, so they need a real dict even when
    # the top-level key is present.
    meta_backend_kwargs = meta.get("backend_kwargs")
    if not isinstance(meta_backend_kwargs, dict):
        meta_backend_kwargs = {}

    # Validate index is not compact (a missing flag is treated as compact)
    is_compact = meta.get("is_compact", meta_backend_kwargs.get("is_compact", True))
    if is_compact:
        print("❌ Cannot update: Index is compact.")
        print(" Compact HNSW indices do not support in-place updates.")
        print(f" Rebuild with: leann build {index_name} --docs <path> --no-compact --force")
        return

    # Extract embedding configuration from metadata
    embedding_model = meta.get("embedding_model")
    embedding_mode = meta.get("embedding_mode")
    embedding_options = meta.get("embedding_options", {})
    graph_degree = meta_backend_kwargs.get("graph_degree", 32)
    complexity = meta_backend_kwargs.get("complexity", 64)
    # NOTE(review): a falsy "is_pruned" falls through to
    # backend_kwargs["is_recompute"] (default True) - confirm this precedence.
    is_recompute = meta.get("is_pruned") or meta_backend_kwargs.get("is_recompute", True)
    num_threads = meta_backend_kwargs.get("num_threads", 1)

    print("✅ Index configuration:")
    print(f" Backend: {backend_name}")
    print(f" Embedding model: {embedding_model}")
    print(f" Embedding mode: {embedding_mode}")
    print(f" Is compact: {is_compact}")
    print(f" Is recompute: {is_recompute}")

    # Display paths being added; anything that is neither a file nor a
    # directory is listed too instead of silently disappearing from the summary.
    files, directories, unrecognized = [], [], []
    for raw_path in docs_paths:
        candidate = Path(raw_path)
        if candidate.is_file():
            files.append(raw_path)
        elif candidate.is_dir():
            directories.append(raw_path)
        else:
            unrecognized.append(raw_path)

    print(f"\n📂 Adding {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
    if files:
        print(f" 📄 Files ({len(files)}):")
        for i, file_path in enumerate(files, 1):
            print(f" {i}. {Path(file_path).resolve()}")
    if directories:
        print(f" 📁 Directories ({len(directories)}):")
        for i, dir_path in enumerate(directories, 1):
            print(f" {i}. {Path(dir_path).resolve()}")
    if unrecognized:
        print(f" ⚠️ Not a file or directory ({len(unrecognized)}):")
        for i, other_path in enumerate(unrecognized, 1):
            print(f" {i}. {other_path}")

    def _chunk_params(label, size, overlap):
        """Return (size, overlap) with size >= 1 and 0 <= overlap < size."""
        size = max(1, int(size))
        overlap = max(0, int(overlap))
        if overlap >= size:
            print(
                f"⚠️ Adjusting {label} chunk overlap from {overlap} to {size - 1} (must be < chunk size)"
            )
            overlap = size - 1
        return size, overlap

    # Configure chunking based on CLI args
    doc_chunk_size, doc_chunk_overlap = _chunk_params(
        "doc", args.doc_chunk_size, args.doc_chunk_overlap
    )
    code_chunk_size, code_chunk_overlap = _chunk_params(
        "code", args.code_chunk_size, args.code_chunk_overlap
    )

    self.node_parser = SentenceSplitter(
        chunk_size=doc_chunk_size,
        chunk_overlap=doc_chunk_overlap,
        separator=" ",
        paragraph_separator="\n\n",
    )
    self.code_parser = SentenceSplitter(
        chunk_size=code_chunk_size,
        chunk_overlap=code_chunk_overlap,
        separator="\n",
        paragraph_separator="\n\n",
    )

    # Load new documents
    print("\n🔄 Loading new documents...")
    all_texts = self.load_documents(
        docs_paths, args.file_types, include_hidden=args.include_hidden, args=args
    )
    if not all_texts:
        print("❌ No new documents found to add")
        return

    print(f"✅ Loaded {len(all_texts)} new chunks")

    # Initialize builder with settings from existing index
    print(f"\n🔨 Updating index '{index_name}'...")
    builder = LeannBuilder(
        backend_name=backend_name,
        embedding_model=embedding_model,
        embedding_mode=embedding_mode,
        embedding_options=embedding_options or None,
        graph_degree=graph_degree,
        complexity=complexity,
        is_compact=is_compact,
        is_recompute=is_recompute,
        num_threads=num_threads,
    )

    # Add new texts to builder
    for chunk in all_texts:
        builder.add_text(chunk["text"], metadata=chunk["metadata"])

    # Call update_index instead of build_index
    try:
        builder.update_index(index_path)
    except ValueError as e:
        print(f"❌ Update failed: {e}")
    except Exception as e:
        # Top-level CLI boundary: report the failure rather than dump a traceback.
        print(f"❌ Unexpected error during update: {e}")
    else:
        print(f"✅ Index updated successfully at {index_path}")
        print(f" Added {len(all_texts)} new chunks to '{index_name}'")
from leann.cli import LeannCLI


def _parse_update(tmp_path, monkeypatch, *extra):
    """Parse ``update my-index`` followed by *extra* argv tokens.

    The working directory is switched to ``tmp_path`` first, so the
    ``LeannCLI`` object is constructed with the temporary directory as cwd.
    """
    monkeypatch.chdir(tmp_path)
    return LeannCLI().create_parser().parse_args(["update", "my-index", *extra])


def test_cli_update_accepts_required_args(tmp_path, monkeypatch):
    """The parser accepts the index name plus a single --docs path."""
    parsed = _parse_update(tmp_path, monkeypatch, "--docs", "./new-documents")

    assert parsed.command == "update"
    assert parsed.index_name == "my-index"
    assert parsed.docs == ["./new-documents"]


def test_cli_update_accepts_multiple_docs(tmp_path, monkeypatch):
    """Several paths after --docs are collected into one list, in order."""
    doc_paths = ["./docs1", "./docs2", "./file.txt"]
    parsed = _parse_update(tmp_path, monkeypatch, "--docs", *doc_paths)

    assert parsed.command == "update"
    assert parsed.index_name == "my-index"
    assert parsed.docs == doc_paths


def test_cli_update_accepts_chunking_options(tmp_path, monkeypatch):
    """Chunk size/overlap options are parsed into integers."""
    parsed = _parse_update(
        tmp_path,
        monkeypatch,
        "--docs",
        "./docs",
        "--doc-chunk-size",
        "512",
        "--doc-chunk-overlap",
        "64",
        "--code-chunk-size",
        "1024",
        "--code-chunk-overlap",
        "100",
    )

    assert parsed.command == "update"
    assert parsed.index_name == "my-index"
    expected = {
        "doc_chunk_size": 512,
        "doc_chunk_overlap": 64,
        "code_chunk_size": 1024,
        "code_chunk_overlap": 100,
    }
    for attr, value in expected.items():
        assert getattr(parsed, attr) == value


def test_cli_update_accepts_file_types(tmp_path, monkeypatch):
    """--file-types is kept as the raw comma-separated string."""
    parsed = _parse_update(
        tmp_path, monkeypatch, "--docs", "./docs", "--file-types", ".py,.js,.ts"
    )

    assert parsed.command == "update"
    assert parsed.index_name == "my-index"
    assert parsed.file_types == ".py,.js,.ts"


def test_cli_update_accepts_ast_chunking(tmp_path, monkeypatch):
    """AST chunking flag and its size/overlap options are accepted."""
    parsed = _parse_update(
        tmp_path,
        monkeypatch,
        "--docs",
        "./docs",
        "--use-ast-chunking",
        "--ast-chunk-size",
        "400",
        "--ast-chunk-overlap",
        "80",
    )

    assert parsed.command == "update"
    assert parsed.use_ast_chunking is True
    assert parsed.ast_chunk_size == 400
    assert parsed.ast_chunk_overlap == 80


def test_cli_update_accepts_include_hidden(tmp_path, monkeypatch):
    """--include-hidden switches the flag on."""
    parsed = _parse_update(tmp_path, monkeypatch, "--docs", "./docs", "--include-hidden")

    assert parsed.command == "update"
    assert parsed.include_hidden is True


def test_cli_update_default_values(tmp_path, monkeypatch):
    """With only the required arguments, every option carries its default."""
    parsed = _parse_update(tmp_path, monkeypatch, "--docs", "./docs")

    # Default chunking values
    assert parsed.doc_chunk_size == 256
    assert parsed.doc_chunk_overlap == 128
    assert parsed.code_chunk_size == 512
    assert parsed.code_chunk_overlap == 50

    # Default AST values
    assert parsed.ast_chunk_size == 300
    assert parsed.ast_chunk_overlap == 64
    assert parsed.ast_fallback_traditional is True

    # Default flags
    assert parsed.include_hidden is False
    assert parsed.use_ast_chunking is False