Skip to content

Commit fe9fc7e

Browse files
committed
fix(core): remove automatic embedding backfill
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 1207e8c commit fe9fc7e

File tree

3 files changed
+2 −364 lines changed

src/basic_memory/db.py

Lines changed: 0 additions & 103 deletions
Original file line number | Diff line number | Diff line change
@@ -44,101 +44,6 @@
4444
_session_maker: Optional[async_sessionmaker[AsyncSession]] = None
4545

4646

47-
async def _needs_semantic_embedding_backfill(
48-
app_config: BasicMemoryConfig,
49-
session_maker: async_sessionmaker[AsyncSession],
50-
) -> bool:
51-
"""Check if entities exist but vector embeddings are empty.
52-
53-
This is the reliable way to detect that embeddings need to be generated,
54-
regardless of how migrations were applied (fresh DB, upgrade, reset, etc.).
55-
"""
56-
if not app_config.semantic_search_enabled:
57-
return False
58-
59-
try:
60-
async with scoped_session(session_maker) as session:
61-
entity_count = (
62-
await session.execute(text("SELECT COUNT(*) FROM entity"))
63-
).scalar() or 0
64-
if entity_count == 0:
65-
return False
66-
67-
# Check if vector chunks table exists and is empty
68-
embedding_count = (
69-
await session.execute(text("SELECT COUNT(*) FROM search_vector_chunks"))
70-
).scalar() or 0
71-
72-
return embedding_count == 0
73-
except Exception as exc:
74-
# Table might not exist yet (pre-migration)
75-
logger.debug(f"Could not check embedding status: {exc}")
76-
return False
77-
78-
79-
async def _run_semantic_embedding_backfill(
80-
app_config: BasicMemoryConfig,
81-
session_maker: async_sessionmaker[AsyncSession],
82-
) -> None:
83-
"""Backfill semantic embeddings for all active projects/entities."""
84-
if not app_config.semantic_search_enabled:
85-
logger.info("Skipping automatic semantic embedding backfill: semantic search is disabled.")
86-
return
87-
88-
async with scoped_session(session_maker) as session:
89-
project_result = await session.execute(
90-
text("SELECT id, name FROM project WHERE is_active = :is_active ORDER BY id"),
91-
{"is_active": True},
92-
)
93-
projects = [(int(row[0]), str(row[1])) for row in project_result.fetchall()]
94-
95-
if not projects:
96-
logger.info("Skipping automatic semantic embedding backfill: no active projects found.")
97-
return
98-
99-
repository_class = (
100-
PostgresSearchRepository
101-
if app_config.database_backend == DatabaseBackend.POSTGRES
102-
else SQLiteSearchRepository
103-
)
104-
105-
total_entities = 0
106-
for project_id, project_name in projects:
107-
async with scoped_session(session_maker) as session:
108-
entity_result = await session.execute(
109-
text("SELECT id FROM entity WHERE project_id = :project_id ORDER BY id"),
110-
{"project_id": project_id},
111-
)
112-
entity_ids = [int(row[0]) for row in entity_result.fetchall()]
113-
114-
if not entity_ids:
115-
continue
116-
117-
total_entities += len(entity_ids)
118-
logger.info(
119-
"Automatic semantic embedding backfill: "
120-
f"project={project_name}, entities={len(entity_ids)}"
121-
)
122-
123-
search_repository = repository_class(
124-
session_maker,
125-
project_id=project_id,
126-
app_config=app_config,
127-
)
128-
batch_result = await search_repository.sync_entity_vectors_batch(entity_ids)
129-
if batch_result.entities_failed > 0:
130-
logger.warning(
131-
"Automatic semantic embedding backfill encountered entity failures: "
132-
f"project={project_name}, failed={batch_result.entities_failed}, "
133-
f"failed_entity_ids={batch_result.failed_entity_ids}"
134-
)
135-
136-
logger.info(
137-
"Automatic semantic embedding backfill complete: "
138-
f"projects={len(projects)}, entities={total_entities}"
139-
)
140-
141-
14247
class DatabaseType(Enum):
14348
"""Types of supported databases."""
14449

@@ -521,14 +426,6 @@ async def run_migrations(
521426
else:
522427
await SQLiteSearchRepository(session_maker, 1).init_search_index()
523428

524-
# Check if backfill is needed — actual backfill runs in background
525-
# from the MCP server lifespan to avoid blocking startup.
526-
if await _needs_semantic_embedding_backfill(app_config, session_maker):
527-
logger.info(
528-
"Semantic embeddings missing — backfill will run in background after startup"
529-
)
530-
else:
531-
logger.info("Semantic embeddings: up to date")
532429
except Exception as e: # pragma: no cover
533430
logger.error(f"Error running migrations: {e}")
534431
raise

src/basic_memory/mcp/server.py

Lines changed: 2 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
Basic Memory FastMCP server.
33
"""
44

5-
import asyncio
65
import time
76
from contextlib import asynccontextmanager
87

@@ -13,12 +12,7 @@
1312

1413
from basic_memory import db
1514
from basic_memory.cli.auth import CLIAuth
16-
from basic_memory.config import BasicMemoryConfig
17-
from basic_memory.db import (
18-
scoped_session,
19-
_needs_semantic_embedding_backfill,
20-
_run_semantic_embedding_backfill,
21-
)
15+
from basic_memory.db import scoped_session
2216
from basic_memory.mcp.container import McpContainer, set_container
2317
from basic_memory.services.initialization import initialize_app
2418
from basic_memory import telemetry
@@ -43,7 +37,7 @@ async def _log_embedding_status(session_maker: async_sessionmaker[AsyncSession])
4337
elif embedding_count == 0:
4438
logger.warning(
4539
f"Semantic embeddings: EMPTY — {entity_count} entities have no embeddings. "
46-
"Backfill running in background..."
40+
"Run 'bm reindex --embeddings' to build them."
4741
)
4842
else:
4943
logger.info(
@@ -54,20 +48,6 @@ async def _log_embedding_status(session_maker: async_sessionmaker[AsyncSession])
5448
logger.debug(f"Could not check embedding status at startup: {exc}")
5549

5650

57-
async def _background_embedding_backfill(
58-
config: BasicMemoryConfig,
59-
session_maker: async_sessionmaker[AsyncSession],
60-
) -> None:
61-
"""Run semantic embedding backfill in the background without blocking startup."""
62-
try:
63-
if await _needs_semantic_embedding_backfill(config, session_maker):
64-
logger.info("Background embedding backfill starting...")
65-
await _run_semantic_embedding_backfill(config, session_maker)
66-
await _log_embedding_status(session_maker)
67-
except Exception as exc:
68-
logger.error(f"Background embedding backfill failed: {exc}")
69-
70-
7151
@asynccontextmanager
7252
async def lifespan(app: FastMCP):
7353
"""Lifecycle manager for the MCP server.
@@ -133,14 +113,8 @@ async def lifespan(app: FastMCP):
133113
await initialize_app(container.config)
134114

135115
# Log embedding status so it's easy to spot in the logs
136-
backfill_task: asyncio.Task | None = None # type: ignore[type-arg]
137116
if config.semantic_search_enabled and db._session_maker is not None:
138117
await _log_embedding_status(db._session_maker)
139-
# Launch backfill in background so MCP server is ready immediately
140-
backfill_task = asyncio.create_task(
141-
_background_embedding_backfill(config, db._session_maker),
142-
name="embedding-backfill",
143-
)
144118

145119
# Create and start sync coordinator (lifecycle centralized in coordinator)
146120
sync_coordinator = container.create_sync_coordinator()
@@ -157,14 +131,6 @@ async def lifespan(app: FastMCP):
157131
):
158132
logger.debug("Shutting down Basic Memory MCP server")
159133

160-
# Cancel embedding backfill if still running
161-
if backfill_task is not None and not backfill_task.done():
162-
backfill_task.cancel()
163-
try:
164-
await backfill_task
165-
except asyncio.CancelledError:
166-
logger.info("Background embedding backfill cancelled during shutdown")
167-
168134
await sync_coordinator.stop()
169135

170136
# Only shutdown DB if we created it (not if test fixture provided it)

0 commit comments

Comments
 (0)