Skip to content

Commit 009d735

Browse files
phernandez authored and claude committed
perf(search): record vector-sync metrics at batch level, not per entity
The previous fix introduced a _METRIC_INSTRUMENTS cache + helper functions to work around instrument re-creation in the per-entity hot path. That was the same kind of wrapper we just deleted from telemetry.py — cache to work around a wrapper around logfire is not the right shape. Proper fix: record these metrics where they belong — once per batch, using the totals VectorSyncBatchResult already accumulates (prepare_seconds_total, queue_wait_seconds_total, etc.). The per-entity histogram calls in _log_vector_sync_complete are gone; that function now only emits the slow-entity warning log. Batch-level block now: - shares one `batch_attrs` dict across all recordings (was repeated 7x) - records 5 histograms + 6 counters per batch with direct logfire.metric_* calls, no cache, no helpers - _METRIC_INSTRUMENTS + _metric_histogram + _metric_counter all removed Tests updated: per-entity histogram counts (was 2 per 2-entity batch) are now 1 per batch. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 08e06e6 commit 009d735

2 files changed

Lines changed: 42 additions & 116 deletions

File tree

src/basic_memory/repository/search_repository_base.py

Lines changed: 36 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,6 @@
4141
_SQLITE_MAX_PREPARE_WINDOW = 8
4242

4343

44-
# Cache instruments so the per-entity hot path in _log_vector_sync_complete
45-
# doesn't re-enter the OTel MeterProvider lookup on every sample.
46-
_METRIC_INSTRUMENTS: dict[tuple[str, str, str], Any] = {}
47-
48-
49-
def _metric_histogram(name: str, unit: str = "") -> Any:
50-
key = ("histogram", name, unit)
51-
instrument = _METRIC_INSTRUMENTS.get(key)
52-
if instrument is None:
53-
instrument = logfire.metric_histogram(name, unit=unit)
54-
_METRIC_INSTRUMENTS[key] = instrument
55-
return instrument
56-
57-
58-
def _metric_counter(name: str) -> Any:
59-
key = ("counter", name, "")
60-
instrument = _METRIC_INSTRUMENTS.get(key)
61-
if instrument is None:
62-
instrument = logfire.metric_counter(name)
63-
_METRIC_INSTRUMENTS[key] = instrument
64-
return instrument
65-
66-
6744
@dataclass
6845
class VectorSyncBatchResult:
6946
"""Aggregate result for batched semantic vector sync runs."""
@@ -1092,57 +1069,42 @@ def emit_progress(entity_id: int) -> None:
10921069
write_seconds_total=result.write_seconds_total,
10931070
)
10941071
batch_total_seconds = time.perf_counter() - batch_start
1095-
_metric_histogram(
1096-
"vector_sync_batch_total_seconds",
1097-
unit="s",
1098-
).record(
1099-
batch_total_seconds,
1100-
attributes={
1101-
"backend": backend_name,
1102-
"skip_only_batch": result.embedding_jobs_total == 0,
1103-
},
1072+
batch_attrs = {
1073+
"backend": backend_name,
1074+
"skip_only_batch": result.embedding_jobs_total == 0,
1075+
}
1076+
logfire.metric_histogram("vector_sync_batch_total_seconds", unit="s").record(
1077+
batch_total_seconds, attributes=batch_attrs
11041078
)
1105-
_metric_counter("vector_sync_entities_total").add(
1106-
result.entities_total,
1107-
attributes={
1108-
"backend": backend_name,
1109-
"skip_only_batch": result.embedding_jobs_total == 0,
1110-
},
1079+
logfire.metric_histogram("vector_sync_prepare_seconds", unit="s").record(
1080+
result.prepare_seconds_total, attributes=batch_attrs
11111081
)
1112-
_metric_counter("vector_sync_entities_skipped").add(
1113-
result.entities_skipped,
1114-
attributes={
1115-
"backend": backend_name,
1116-
"skip_only_batch": result.embedding_jobs_total == 0,
1117-
},
1082+
logfire.metric_histogram("vector_sync_queue_wait_seconds", unit="s").record(
1083+
result.queue_wait_seconds_total, attributes=batch_attrs
11181084
)
1119-
_metric_counter("vector_sync_entities_deferred").add(
1120-
result.entities_deferred,
1121-
attributes={
1122-
"backend": backend_name,
1123-
"skip_only_batch": result.embedding_jobs_total == 0,
1124-
},
1085+
logfire.metric_histogram("vector_sync_embed_seconds", unit="s").record(
1086+
result.embed_seconds_total, attributes=batch_attrs
11251087
)
1126-
_metric_counter("vector_sync_embedding_jobs_total").add(
1127-
result.embedding_jobs_total,
1128-
attributes={
1129-
"backend": backend_name,
1130-
"skip_only_batch": result.embedding_jobs_total == 0,
1131-
},
1088+
logfire.metric_histogram("vector_sync_write_seconds", unit="s").record(
1089+
result.write_seconds_total, attributes=batch_attrs
11321090
)
1133-
_metric_counter("vector_sync_chunks_total").add(
1134-
result.chunks_total,
1135-
attributes={
1136-
"backend": backend_name,
1137-
"skip_only_batch": result.embedding_jobs_total == 0,
1138-
},
1091+
logfire.metric_counter("vector_sync_entities_total").add(
1092+
result.entities_total, attributes=batch_attrs
11391093
)
1140-
_metric_counter("vector_sync_chunks_skipped").add(
1141-
result.chunks_skipped,
1142-
attributes={
1143-
"backend": backend_name,
1144-
"skip_only_batch": result.embedding_jobs_total == 0,
1145-
},
1094+
logfire.metric_counter("vector_sync_entities_skipped").add(
1095+
result.entities_skipped, attributes=batch_attrs
1096+
)
1097+
logfire.metric_counter("vector_sync_entities_deferred").add(
1098+
result.entities_deferred, attributes=batch_attrs
1099+
)
1100+
logfire.metric_counter("vector_sync_embedding_jobs_total").add(
1101+
result.embedding_jobs_total, attributes=batch_attrs
1102+
)
1103+
logfire.metric_counter("vector_sync_chunks_total").add(
1104+
result.chunks_total, attributes=batch_attrs
1105+
)
1106+
logfire.metric_counter("vector_sync_chunks_skipped").add(
1107+
result.chunks_skipped, attributes=batch_attrs
11461108
)
11471109
if batch_span is not None:
11481110
batch_span.set_attributes(
@@ -1715,48 +1677,12 @@ def _log_vector_sync_complete(
17151677
shard_count: int,
17161678
remaining_jobs_after_shard: int,
17171679
) -> None:
1718-
"""Log completion and slow-entity warnings with a consistent format."""
1719-
backend_name = type(self).__name__.removesuffix("SearchRepository").lower()
1720-
_metric_histogram(
1721-
"vector_sync_prepare_seconds",
1722-
unit="s",
1723-
).record(
1724-
prepare_seconds,
1725-
attributes={
1726-
"backend": backend_name,
1727-
"skip_only_entity": entity_skipped and embedding_jobs_count == 0,
1728-
},
1729-
)
1730-
_metric_histogram(
1731-
"vector_sync_queue_wait_seconds",
1732-
unit="s",
1733-
).record(
1734-
queue_wait_seconds,
1735-
attributes={
1736-
"backend": backend_name,
1737-
"skip_only_entity": entity_skipped and embedding_jobs_count == 0,
1738-
},
1739-
)
1740-
_metric_histogram(
1741-
"vector_sync_embed_seconds",
1742-
unit="s",
1743-
).record(
1744-
embed_seconds,
1745-
attributes={
1746-
"backend": backend_name,
1747-
"skip_only_entity": entity_skipped and embedding_jobs_count == 0,
1748-
},
1749-
)
1750-
_metric_histogram(
1751-
"vector_sync_write_seconds",
1752-
unit="s",
1753-
).record(
1754-
write_seconds,
1755-
attributes={
1756-
"backend": backend_name,
1757-
"skip_only_entity": entity_skipped and embedding_jobs_count == 0,
1758-
},
1759-
)
1680+
"""Log completion and slow-entity warnings with a consistent format.
1681+
1682+
Per-entity timings are aggregated into `VectorSyncBatchResult` and
1683+
recorded as batch-level histograms once the batch completes — this
1684+
function stays on the per-entity hot path so it only emits logs.
1685+
"""
17601686
if total_seconds > 10:
17611687
logger.warning(
17621688
"Vector sync slow entity: project_id={project_id} entity_id={entity_id} "

tests/repository/test_semantic_search_base.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -712,8 +712,6 @@ def add(self, amount, attributes=None) -> None:
712712

713713
monkeypatch.setattr(repo, "_prepare_entity_vector_jobs_window", _stub_prepare_window)
714714
monkeypatch.setattr(repo, "_flush_embedding_jobs", _stub_flush)
715-
# Reset the module-level metric cache so the fake factories below win.
716-
monkeypatch.setattr(search_repository_base_module, "_METRIC_INSTRUMENTS", {})
717715
monkeypatch.setattr(
718716
search_repository_base_module.logfire,
719717
"metric_histogram",
@@ -732,12 +730,14 @@ def add(self, amount, attributes=None) -> None:
732730

733731
result = await repo.sync_entity_vectors_batch([1, 2])
734732

733+
# Batch-level histograms record once per batch using aggregated totals
734+
# from VectorSyncBatchResult — not per entity. See _sync_entity_vectors_internal.
735735
assert result.entities_synced == 2
736736
histogram_names = [name for name, _, _ in histogram_calls]
737-
assert histogram_names.count("vector_sync_prepare_seconds") == 2
738-
assert histogram_names.count("vector_sync_queue_wait_seconds") == 2
739-
assert histogram_names.count("vector_sync_embed_seconds") == 2
740-
assert histogram_names.count("vector_sync_write_seconds") == 2
737+
assert histogram_names.count("vector_sync_prepare_seconds") == 1
738+
assert histogram_names.count("vector_sync_queue_wait_seconds") == 1
739+
assert histogram_names.count("vector_sync_embed_seconds") == 1
740+
assert histogram_names.count("vector_sync_write_seconds") == 1
741741
assert histogram_names.count("vector_sync_batch_total_seconds") == 1
742742
assert [name for name, _, _ in counter_calls].count("vector_sync_entities_total") == 1
743743

0 commit comments

Comments (0)