-
Notifications
You must be signed in to change notification settings - Fork 123
Reliability hardening suite #261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
a6f7fe8
7d65089
7e7ec1d
8013163
cd760dc
8002ada
9dfa72b
3eca0b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| version: 2 | ||
| updates: | ||
| - package-ecosystem: "github-actions" | ||
| directory: "/" | ||
| schedule: | ||
| interval: "weekly" | ||
| groups: | ||
| github-actions: | ||
| patterns: | ||
| - "*" | ||
|
|
||
| - package-ecosystem: "pip" | ||
| directory: "/graphrag_sdk" | ||
| schedule: | ||
| interval: "weekly" | ||
| groups: | ||
| python-dependencies: | ||
| patterns: | ||
| - "*" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,13 +3,15 @@ services: | |
| image: falkordb/falkordb:v4.18.0 | ||
| ports: | ||
| - "6379:6379" | ||
| - "3000:3000" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 M7 — Browser UI binds to all interfaces by default. ports:
- "3000:3000"Fine on a laptop, surprising on a shared dev host or CI runner — exposes the FalkorDB Browser UI on ports:
- "127.0.0.1:3000:3000"Or at minimum a one-line note in CONTRIBUTING.md so contributors aren't surprised. |
||
| volumes: | ||
| - falkordb_data:/data | ||
| healthcheck: | ||
| test: ["CMD", "redis-cli", "ping"] | ||
| interval: 5s | ||
| timeout: 3s | ||
| retries: 5 | ||
| start_period: 5s | ||
|
|
||
| volumes: | ||
| falkordb_data: | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,12 @@ | |
| from graphrag_sdk import __version__ | ||
| from graphrag_sdk.core.connection import ConnectionConfig, FalkorDBConnection | ||
| from graphrag_sdk.core.context import Context | ||
| from graphrag_sdk.core.exceptions import ConfigError, DatabaseError, DocumentNotFoundError | ||
| from graphrag_sdk.core.exceptions import ( | ||
| ConfigError, | ||
| DatabaseError, | ||
| DocumentNotFoundError, | ||
| LatencyBudgetExceededError, | ||
| ) | ||
| from graphrag_sdk.core.models import ( | ||
| ApplyChangesResult, | ||
| BatchEntry, | ||
|
|
@@ -1446,13 +1451,16 @@ async def retrieve( | |
| ctx = Context() | ||
|
|
||
| ctx.log(f"Retrieve: {question[:80]}...") | ||
| ctx.ensure_budget("graph config validation") | ||
|
|
||
| await self._validate_graph_config() | ||
| await self._validate_graph_config(ctx=ctx) | ||
|
|
||
| retrieval = strategy or self._retrieval_strategy | ||
| ctx.ensure_budget("retrieval strategy search") | ||
| retriever_result = await retrieval.search(question, ctx) | ||
|
|
||
| if reranker is not None: | ||
| ctx.ensure_budget("retrieval reranking") | ||
| retriever_result = await reranker.rerank(question, retriever_result, ctx) | ||
|
|
||
| ctx.log(f"Retrieved {len(retriever_result.items)} context items") | ||
|
|
@@ -1480,13 +1488,16 @@ def _validate_history( | |
| f"history[{i}]: each message must have 'role' and " | ||
| f"'content' keys, got {sorted(msg.keys())}" | ||
| ) | ||
| try: | ||
| validated.append(ChatMessage(role=msg["role"], content=msg["content"])) | ||
| except Exception: | ||
| role = msg["role"] | ||
| content = msg["content"] | ||
| if role not in {"system", "user", "assistant"}: | ||
| raise ValueError( | ||
| f"history[{i}]: invalid role '{msg['role']}'. " | ||
| f"history[{i}]: invalid role '{role}'. " | ||
| f"Must be one of: 'system', 'user', 'assistant'" | ||
| ) | ||
| if not isinstance(content, str): | ||
| raise ValueError(f"history[{i}]: content must be a string") | ||
| validated.append(ChatMessage(role=role, content=content)) | ||
| else: | ||
| raise TypeError( | ||
| f"history[{i}]: expected ChatMessage or dict, got {type(msg).__name__}" | ||
|
|
@@ -1515,8 +1526,11 @@ async def _rewrite_question_with_history( | |
| question=question, | ||
| ) | ||
| try: | ||
| ctx.ensure_budget("question rewrite LLM call") | ||
| resp = await self.llm.ainvoke(prompt) | ||
| rewritten = (resp.content or "").strip().splitlines()[0].strip() if resp.content else "" | ||
| except LatencyBudgetExceededError: | ||
| raise | ||
| except Exception as e: | ||
| # Broad catch is intentional (see docstring) — but log at WARNING | ||
| # with full traceback so programming bugs surface in operator | ||
|
|
@@ -1585,6 +1599,7 @@ async def completion( | |
| # Step 1: Optionally rewrite the question for retrieval. | ||
| retrieval_query = question | ||
| if validated_history and rewrite_question_with_history: | ||
| ctx.ensure_budget("question rewrite") | ||
| retrieval_query = await self._rewrite_question_with_history( | ||
| question, | ||
| validated_history, | ||
|
|
@@ -1594,6 +1609,7 @@ async def completion( | |
| ctx.log(f"Rewrote for retrieval: {retrieval_query[:80]}") | ||
|
|
||
| # Step 2: Retrieve + rerank (using possibly-rewritten query). | ||
| ctx.ensure_budget("completion retrieval") | ||
| retriever_result = await self.retrieve( | ||
| retrieval_query, | ||
| strategy=strategy, | ||
|
|
@@ -1635,6 +1651,7 @@ async def completion( | |
| ChatMessage(role="user", content=final_user_content), | ||
| ] | ||
|
|
||
| ctx.ensure_budget("completion LLM call") | ||
| llm_response = await self.llm.ainvoke_messages(messages) | ||
|
|
||
| result = RagResult( | ||
|
|
@@ -1675,7 +1692,7 @@ async def _write_graph_config(self) -> None: | |
| except Exception: | ||
| logger.debug("Failed to write graph config node", exc_info=True) | ||
|
|
||
| async def _validate_graph_config(self) -> None: | ||
| async def _validate_graph_config(self, ctx: Context | None = None) -> None: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 M3 — Positional async def _validate_graph_config(self, ctx: Context | None = None) -> None:This is a private method by name but it's on a public class. Any subclass overriding async def _validate_graph_config(self, *, ctx: Context | None = None) -> None:Keyword-only matches the rest of the codebase's |
||
| """Check that the current embedder matches the graph's stored config. | ||
|
|
||
| Two checks, both cached after first run: | ||
|
|
@@ -1697,6 +1714,8 @@ async def _validate_graph_config(self) -> None: | |
| return | ||
|
|
||
| try: | ||
| if ctx is not None: | ||
| ctx.ensure_budget("graph config query") | ||
| result = await self._graph_store.query_raw( | ||
| "MATCH (c:__GraphRAGConfig__ {id: 'default'}) " | ||
| "RETURN c.embedding_model, c.embedding_dimension" | ||
|
|
@@ -1721,6 +1740,8 @@ async def _validate_graph_config(self) -> None: | |
| ) | ||
| except ConfigError: | ||
| raise | ||
| except LatencyBudgetExceededError: | ||
| raise | ||
| except Exception: | ||
| # Don't mark as validated on transient failures — retry next call. | ||
| logger.debug("Failed to validate graph config", exc_info=True) | ||
|
|
@@ -1729,8 +1750,12 @@ async def _validate_graph_config(self) -> None: | |
| # Probe the embedder once: confirm it produces vectors of the | ||
| # configured dimension. Catches user error like | ||
| # ``embedding_dimension=256`` paired with a 1536-dim model. | ||
| if ctx is not None: | ||
| ctx.ensure_budget("graph config embedder probe") | ||
| try: | ||
| probe = await self.embedder.aembed_query("dim_check") | ||
| except LatencyBudgetExceededError: | ||
| raise | ||
| except Exception: | ||
| # Probe failure is non-fatal — but don't cache a "validated" | ||
| # state, otherwise a transient outage permanently disables | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,8 @@ | |
| from typing import Any | ||
| from urllib.parse import urlparse | ||
|
|
||
| from graphrag_sdk.core.exceptions import DatabaseError | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
|
|
@@ -180,7 +182,13 @@ async def query( | |
| last_exc = exc | ||
| # Don't retry non-transient errors (e.g. schema/index conflicts) | ||
| if self._is_non_transient(exc): | ||
| raise | ||
| logger.error( | ||
| "Non-transient FalkorDB query failure: %s: %s", | ||
| type(exc).__name__, | ||
| exc, | ||
| ) | ||
| logger.debug("Non-transient FalkorDB query failure details", exc_info=True) | ||
| raise DatabaseError(f"FalkorDB query failed: {exc}") from exc | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔴 C1 — Breaking change to a public class, undocumented. Previously the original exception ( try:
await conn.query(...)
except falkordb.ResponseError as e: # ← silently stops matching
if 'index' in str(e): ...…breaks. Either (a) preserve the original type (just |
||
| await self._breaker.record_failure() | ||
| logger.warning( | ||
| "Query attempt %d/%d failed: %s", | ||
|
|
@@ -193,7 +201,18 @@ async def query( | |
| break | ||
| base_delay = self.config.retry_delay * (2**attempt) | ||
| await asyncio.sleep(base_delay * (0.5 + random.random())) | ||
| raise last_exc # type: ignore[misc] | ||
| logger.error( | ||
| "FalkorDB query failed after %d attempts: %s: %s", | ||
| self.config.retry_count, | ||
| type(last_exc).__name__ if last_exc is not None else "UnknownError", | ||
| last_exc, | ||
| ) | ||
| if last_exc is not None: | ||
| logger.debug( | ||
| "FalkorDB query failure details", | ||
| exc_info=(type(last_exc), last_exc, last_exc.__traceback__), | ||
| ) | ||
| raise DatabaseError(f"FalkorDB query failed: {last_exc}") from last_exc | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔴 C1 (same issue) — retries-exhausted path.
|
||
|
|
||
| # Substrings that indicate a non-transient (permanent) error — | ||
| # retrying will never succeed. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 M6 — Verify dependabot pip ecosystem works with PEP-621
pyproject.toml.Dependabot's
pipparser handles[project.dependencies]but has known gaps with[project.optional-dependencies]groups (it tends to only update[project.dependencies]and miss thedev/extrasextras). Worth a dry run before relying on it, or switch touvecosystem if youruv.lockis the source of truth.