|
44 | 44 | _session_maker: Optional[async_sessionmaker[AsyncSession]] = None |
45 | 45 |
|
46 | 46 |
|
47 | | -async def _needs_semantic_embedding_backfill( |
48 | | - app_config: BasicMemoryConfig, |
49 | | - session_maker: async_sessionmaker[AsyncSession], |
50 | | -) -> bool: |
51 | | - """Check if entities exist but vector embeddings are empty. |
52 | | -
|
53 | | - This is the reliable way to detect that embeddings need to be generated, |
54 | | - regardless of how migrations were applied (fresh DB, upgrade, reset, etc.). |
55 | | - """ |
56 | | - if not app_config.semantic_search_enabled: |
57 | | - return False |
58 | | - |
59 | | - try: |
60 | | - async with scoped_session(session_maker) as session: |
61 | | - entity_count = ( |
62 | | - await session.execute(text("SELECT COUNT(*) FROM entity")) |
63 | | - ).scalar() or 0 |
64 | | - if entity_count == 0: |
65 | | - return False |
66 | | - |
67 | | - # Check if vector chunks table exists and is empty |
68 | | - embedding_count = ( |
69 | | - await session.execute(text("SELECT COUNT(*) FROM search_vector_chunks")) |
70 | | - ).scalar() or 0 |
71 | | - |
72 | | - return embedding_count == 0 |
73 | | - except Exception as exc: |
74 | | - # Table might not exist yet (pre-migration) |
75 | | - logger.debug(f"Could not check embedding status: {exc}") |
76 | | - return False |
77 | | - |
78 | | - |
79 | | -async def _run_semantic_embedding_backfill( |
80 | | - app_config: BasicMemoryConfig, |
81 | | - session_maker: async_sessionmaker[AsyncSession], |
82 | | -) -> None: |
83 | | - """Backfill semantic embeddings for all active projects/entities.""" |
84 | | - if not app_config.semantic_search_enabled: |
85 | | - logger.info("Skipping automatic semantic embedding backfill: semantic search is disabled.") |
86 | | - return |
87 | | - |
88 | | - async with scoped_session(session_maker) as session: |
89 | | - project_result = await session.execute( |
90 | | - text("SELECT id, name FROM project WHERE is_active = :is_active ORDER BY id"), |
91 | | - {"is_active": True}, |
92 | | - ) |
93 | | - projects = [(int(row[0]), str(row[1])) for row in project_result.fetchall()] |
94 | | - |
95 | | - if not projects: |
96 | | - logger.info("Skipping automatic semantic embedding backfill: no active projects found.") |
97 | | - return |
98 | | - |
99 | | - repository_class = ( |
100 | | - PostgresSearchRepository |
101 | | - if app_config.database_backend == DatabaseBackend.POSTGRES |
102 | | - else SQLiteSearchRepository |
103 | | - ) |
104 | | - |
105 | | - total_entities = 0 |
106 | | - for project_id, project_name in projects: |
107 | | - async with scoped_session(session_maker) as session: |
108 | | - entity_result = await session.execute( |
109 | | - text("SELECT id FROM entity WHERE project_id = :project_id ORDER BY id"), |
110 | | - {"project_id": project_id}, |
111 | | - ) |
112 | | - entity_ids = [int(row[0]) for row in entity_result.fetchall()] |
113 | | - |
114 | | - if not entity_ids: |
115 | | - continue |
116 | | - |
117 | | - total_entities += len(entity_ids) |
118 | | - logger.info( |
119 | | - "Automatic semantic embedding backfill: " |
120 | | - f"project={project_name}, entities={len(entity_ids)}" |
121 | | - ) |
122 | | - |
123 | | - search_repository = repository_class( |
124 | | - session_maker, |
125 | | - project_id=project_id, |
126 | | - app_config=app_config, |
127 | | - ) |
128 | | - batch_result = await search_repository.sync_entity_vectors_batch(entity_ids) |
129 | | - if batch_result.entities_failed > 0: |
130 | | - logger.warning( |
131 | | - "Automatic semantic embedding backfill encountered entity failures: " |
132 | | - f"project={project_name}, failed={batch_result.entities_failed}, " |
133 | | - f"failed_entity_ids={batch_result.failed_entity_ids}" |
134 | | - ) |
135 | | - |
136 | | - logger.info( |
137 | | - "Automatic semantic embedding backfill complete: " |
138 | | - f"projects={len(projects)}, entities={total_entities}" |
139 | | - ) |
140 | | - |
141 | | - |
142 | 47 | class DatabaseType(Enum): |
143 | 48 | """Types of supported databases.""" |
144 | 49 |
|
@@ -521,14 +426,6 @@ async def run_migrations( |
521 | 426 | else: |
522 | 427 | await SQLiteSearchRepository(session_maker, 1).init_search_index() |
523 | 428 |
|
524 | | - # Check if backfill is needed — actual backfill runs in background |
525 | | - # from the MCP server lifespan to avoid blocking startup. |
526 | | - if await _needs_semantic_embedding_backfill(app_config, session_maker): |
527 | | - logger.info( |
528 | | - "Semantic embeddings missing — backfill will run in background after startup" |
529 | | - ) |
530 | | - else: |
531 | | - logger.info("Semantic embeddings: up to date") |
532 | 429 | except Exception as e: # pragma: no cover |
533 | 430 | logger.error(f"Error running migrations: {e}") |
534 | 431 | raise |
|
0 commit comments