-
Notifications
You must be signed in to change notification settings - Fork 10
Evaluation: Fast Evals #870
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 9 commits
bbfb6b0
cd4e0c6
ccf281b
74058f7
d13ab1c
3f8d24d
eb5b0a6
324777f
8da52fe
f6c9567
7653953
86053dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| """add run_mode column and unique run-name constraint to evaluation_run | ||
|
|
||
| Revision ID: 064 | ||
| Revises: 063 | ||
| Create Date: 2026-05-20 00:00:00.000000 | ||
|
|
||
| """ | ||
|
|
||
| import sqlalchemy as sa | ||
| from alembic import op | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "064" | ||
| down_revision = "063" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
| disable_per_migration_transaction = True | ||
|
|
||
| _UNIQUE_INDEX = "uq_evaluation_run_org_project_run_name" | ||
| _UNIQUE_CONSTRAINT = "uq_evaluation_run_org_project_run_name" | ||
|
|
||
|
|
||
| def upgrade(): | ||
| # 1. Add run_mode as nullable first so existing rows are backfilled by the | ||
| # server default, then tighten to NOT NULL. The server default is left | ||
| # in place as a safety net. | ||
| with op.get_context().autocommit_block(): | ||
| op.add_column( | ||
| "evaluation_run", | ||
| sa.Column( | ||
| "run_mode", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: can it be enum ? ['batch', 'fast_mode'] .. just suggestion |
||
| sa.String(length=10), | ||
| nullable=True, | ||
| server_default=sa.text("'batch'"), | ||
| comment="Execution mode: batch or fast", | ||
| ), | ||
| ) | ||
| op.execute("ALTER TABLE evaluation_run ALTER COLUMN run_mode SET NOT NULL") | ||
|
|
||
| # 2. Dedupe existing rows before adding the unique constraint. | ||
| # Keep the lowest-id row for each (organization_id, project_id, | ||
| # run_name) tuple and remove the rest. | ||
| with op.get_context().autocommit_block(): | ||
| op.execute( | ||
| """ | ||
| DELETE FROM evaluation_run | ||
| WHERE id IN ( | ||
| SELECT id | ||
| FROM ( | ||
| SELECT id, | ||
| ROW_NUMBER() OVER ( | ||
| PARTITION BY organization_id, project_id, run_name | ||
| ORDER BY id ASC | ||
| ) AS rn | ||
| FROM evaluation_run | ||
| ) sub | ||
| WHERE rn > 1 | ||
| ) | ||
| """ | ||
| ) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| # 3. Build the unique index CONCURRENTLY so the scan does not take an | ||
| # AccessExclusiveLock, then attach it as a named constraint via | ||
| # ADD CONSTRAINT ... USING INDEX (brief catalog-only lock). | ||
| with op.get_context().autocommit_block(): | ||
| op.execute( | ||
| f"CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS " | ||
| f'"{_UNIQUE_INDEX}" ' | ||
| f"ON evaluation_run (organization_id, project_id, run_name)" | ||
| ) | ||
| op.execute( | ||
| f"ALTER TABLE evaluation_run " | ||
| f'ADD CONSTRAINT "{_UNIQUE_CONSTRAINT}" ' | ||
| f'UNIQUE USING INDEX "{_UNIQUE_INDEX}"' | ||
| ) | ||
|
|
||
|
|
||
| def downgrade(): | ||
| # Reverse in opposite order to upgrade(). | ||
| op.execute( | ||
| f"ALTER TABLE evaluation_run " | ||
| f'DROP CONSTRAINT IF EXISTS "{_UNIQUE_CONSTRAINT}"' | ||
| ) | ||
| with op.get_context().autocommit_block(): | ||
| op.execute(f'DROP INDEX CONCURRENTLY IF EXISTS "{_UNIQUE_INDEX}"') | ||
| op.drop_column("evaluation_run", "run_mode") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,10 @@ | ||
| List all datasets for the current organization and project. | ||
|
|
||
| Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, and object store URL. | ||
| Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, object store URL, and an `eligible_for_fast` flag that is `true` when the dataset's unique-row count is within `EVAL_FAST_MAX_UNIQUE_ROWS` (and so can be used with `run_mode="fast"` on `POST /evaluations`). | ||
|
|
||
| ## Query parameters | ||
|
|
||
| | Parameter | Description | | ||
| | --- | --- | | ||
| | `limit` / `offset` | Pagination (default 50 / 0; max limit 100) | | ||
| | `eligible_for` | If set to `fast`, the response is filtered to only datasets where `eligible_for_fast` is `true` | |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,14 +17,19 @@ | |
| from app.core.cloud import get_cloud_storage | ||
| from app.crud.evaluations import ( | ||
| get_dataset_by_id, | ||
| ) | ||
| from app.crud.evaluations import ( | ||
| list_datasets as list_evaluation_datasets, | ||
| ) | ||
| from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud | ||
| from app.models.evaluation import DatasetUploadResponse, EvaluationDataset | ||
| from app.services.evaluations import ( | ||
| upload_dataset as upload_evaluation_dataset, | ||
| is_dataset_fast_eligible, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This oneliner check needn't be wrapped around a function |
||
| validate_csv_file, | ||
| ) | ||
| from app.services.evaluations import ( | ||
| upload_dataset as upload_evaluation_dataset, | ||
| ) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please combine these into a single import block |
||
| from app.utils import ( | ||
| APIResponse, | ||
| load_description, | ||
|
|
@@ -39,13 +44,15 @@ def _dataset_to_response( | |
| dataset: EvaluationDataset, signed_url: str | None = None | ||
| ) -> DatasetUploadResponse: | ||
| """Convert a dataset model to a DatasetUploadResponse.""" | ||
| original_items = dataset.dataset_metadata.get("original_items_count", 0) | ||
| return DatasetUploadResponse( | ||
| dataset_id=dataset.id, | ||
| dataset_name=dataset.name, | ||
| description=dataset.description, | ||
| total_items=dataset.dataset_metadata.get("total_items_count", 0), | ||
| original_items=dataset.dataset_metadata.get("original_items_count", 0), | ||
| original_items=original_items, | ||
| duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), | ||
| eligible_for_fast=is_dataset_fast_eligible(original_items_count=original_items), | ||
| langfuse_dataset_id=dataset.langfuse_dataset_id, | ||
| object_store_url=dataset.object_store_url, | ||
| signed_url=signed_url, | ||
|
|
@@ -104,6 +111,15 @@ def list_datasets( | |
| default=50, ge=1, le=100, description="Maximum number of datasets to return" | ||
| ), | ||
| offset: int = Query(default=0, ge=0, description="Number of datasets to skip"), | ||
| eligible_for: str | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is taking only one enum type . So instead of typecasting to bool in line no 131, we can have a single queryParam |
||
| | None = Query( | ||
| default=None, | ||
| description=( | ||
| "If 'fast', return only datasets eligible for run_mode='fast' " | ||
| "(unique-row count within EVAL_FAST_MAX_UNIQUE_ROWS)." | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick: description not required, the queryParam is self-explanatory |
||
| ), | ||
| enum=["fast"], | ||
| ), | ||
| ) -> APIResponse[list[DatasetUploadResponse]]: | ||
| """List evaluation datasets.""" | ||
| datasets = list_evaluation_datasets( | ||
|
|
@@ -114,9 +130,15 @@ def list_datasets( | |
| offset=offset, | ||
| ) | ||
|
|
||
| return APIResponse.success_response( | ||
| data=[_dataset_to_response(dataset) for dataset in datasets] | ||
| ) | ||
| dataset_responses = [_dataset_to_response(dataset) for dataset in datasets] | ||
| if eligible_for == "fast": | ||
| dataset_responses = [ | ||
| dataset_response | ||
| for dataset_response in dataset_responses | ||
| if dataset_response.eligible_for_fast | ||
| ] | ||
|
|
||
| return APIResponse.success_response(data=dataset_responses) | ||
|
|
||
|
|
||
| @router.get( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import logging | ||
| from uuid import UUID | ||
|
|
||
| from asgi_correlation_id import correlation_id | ||
| from fastapi import ( | ||
| APIRouter, | ||
| Body, | ||
|
|
@@ -12,13 +13,14 @@ | |
| ) | ||
|
|
||
| from app.api.deps import AuthContextDep, SessionDep | ||
| from app.api.permissions import Permission, require_permission | ||
| from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud | ||
| from app.crud.evaluations.core import group_traces_by_question_id | ||
| from app.models.evaluation import EvaluationRunPublic | ||
| from app.api.permissions import Permission, require_permission | ||
| from app.models.evaluation import EvaluationRunPublic, RunModeEnum | ||
| from app.services.evaluations import ( | ||
| get_evaluation_with_scores, | ||
| start_evaluation, | ||
| validate_and_start_fast_evaluation, | ||
| ) | ||
| from app.utils import ( | ||
| APIResponse, | ||
|
|
@@ -45,8 +47,32 @@ def evaluate( | |
| ), | ||
| config_id: UUID = Body(..., description="Stored config ID"), | ||
| config_version: int = Body(..., ge=1, description="Stored config version"), | ||
| run_mode: RunModeEnum = Body( | ||
| default=RunModeEnum.BATCH, | ||
| description="Execution mode: 'batch' (default) or 'fast'", | ||
| ), | ||
| ) -> APIResponse[EvaluationRunPublic]: | ||
| """Start an evaluation run.""" | ||
| logger.info( | ||
| f"[evaluate] Starting evaluation | run_mode={run_mode.value} | " | ||
| f"experiment_name={experiment_name} | dataset_id={dataset_id} | " | ||
| f"org_id={auth_context.organization_.id} | " | ||
| f"project_id={auth_context.project_.id}" | ||
| ) | ||
|
|
||
| if run_mode == RunModeEnum.FAST: | ||
| eval_run = validate_and_start_fast_evaluation( | ||
| session=session, | ||
| dataset_id=dataset_id, | ||
| run_name=experiment_name, | ||
| config_id=config_id, | ||
| config_version=config_version, | ||
| organization_id=auth_context.organization_.id, | ||
| project_id=auth_context.project_.id, | ||
| trace_id=correlation_id.get() or "N/A", | ||
| ) | ||
| return APIResponse.success_response(data=eval_run) | ||
|
|
||
| eval_run = start_evaluation( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick: method name can be changed to |
||
| session=session, | ||
| dataset_id=dataset_id, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| """ | ||
| Celery task for the synchronous (fast) text-evaluation pipeline. | ||
|
|
||
| This module hosts the single orchestrator task per fast evaluation run. The | ||
| heavy lifting lives in `app/services/evaluations/fast.py`; this task is a thin | ||
| shim that sets the correlation id, attaches the OTel parent context, and | ||
| delegates. | ||
|
|
||
| See `Fast Evaluation SRD.md` for the design (queue, retries, idempotency). | ||
| """ | ||
|
|
||
| import logging | ||
|
|
||
| from celery import current_task | ||
|
|
||
| from app.celery.celery_app import celery_app | ||
| from app.celery.tasks.job_execution import _run_with_otel_parent, _set_trace | ||
| from app.celery.utils import gevent_timeout | ||
| from app.core.config import settings | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| @celery_app.task(bind=True, queue="evaluations", priority=6) | ||
| @gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_evaluation_fast") | ||
| def run_evaluation_fast( | ||
| self, eval_run_id: int, trace_id: str = "N/A", **kwargs | ||
| ) -> None: | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| """Run the fast evaluation pipeline for one EvaluationRun. | ||
|
|
||
| Idempotency: each stage is skipped on retry when its `batch_job` marker is | ||
| already set on the EvaluationRun, so Celery redelivery never re-calls | ||
| OpenAI for work that already succeeded. | ||
|
|
||
| Args: | ||
| eval_run_id: ID of the EvaluationRun (run_mode="fast"). | ||
| trace_id: Correlation id from the enqueueing request, propagated into | ||
| the worker for log correlation. | ||
| """ | ||
| from app.services.evaluations.fast import execute_fast_evaluation | ||
|
|
||
| _set_trace(trace_id) | ||
| logger.info( | ||
| f"[run_evaluation_fast] Starting fast evaluation task | " | ||
| f"eval_run_id={eval_run_id} | task_id={current_task.request.id}" | ||
| ) | ||
|
|
||
| return _run_with_otel_parent( | ||
| self, | ||
| lambda: execute_fast_evaluation(eval_run_id=eval_run_id), | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -171,6 +171,12 @@ def AWS_S3_BUCKET(self) -> str: | |
| DOC_TRANSFORMATION_PENDING_THRESHOLD_MINUTES: int = 30 | ||
| PENDING_JOB_QUERY_TIMEOUT_MS: int = 1000 | ||
|
|
||
| # Fast evaluation (run_mode="fast") configuration. | ||
| # See "Fast Evaluation SRD.md" for the full design rationale. | ||
| EVAL_FAST_MAX_UNIQUE_ROWS: int = 10 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These three env variables could be inside a |
||
| EVAL_FAST_FAILURE_THRESHOLD: float = 0.5 | ||
| EVAL_FAST_API_CONCURRENCY: int = 10 | ||
|
|
||
| @computed_field # type: ignore[prop-decorator] | ||
| @property | ||
| def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
increase the revision to 65 .. since revision 64 already exist