Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
from ami.utils.storages import ConnectionTestResult

from ..models import (
NULL_DETECTIONS_FILTER,
Classification,
Deployment,
Detection,
Expand Down Expand Up @@ -704,9 +703,7 @@ def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet:
if has_detections is not None:
has_detections = BooleanField(required=False).clean(has_detections)
queryset = queryset.annotate(
has_detections=models.Exists(
Detection.objects.filter(source_image=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
),
has_detections=models.Exists(Detection.objects.valid().filter(source_image=models.OuterRef("pk"))),
).filter(has_detections=has_detections)
return queryset

Expand Down Expand Up @@ -756,7 +753,7 @@ def prefetch_detections(self, queryset: QuerySet, project: Project | None = None
score = get_default_classification_threshold(project, self.request)

prefetch_queryset = (
Detection.objects.exclude(NULL_DETECTIONS_FILTER)
Detection.objects.valid()
.annotate(
determination_score=models.Max("occurrence__detections__classifications__score"),
# Store whether this occurrence should be included based on default filters
Expand Down Expand Up @@ -1096,7 +1093,7 @@ class DetectionViewSet(DefaultViewSet, ProjectMixin):
"""

require_project_for_list = True # Unfiltered list scans are too expensive on this table
queryset = Detection.objects.exclude(NULL_DETECTIONS_FILTER).select_related("source_image", "detection_algorithm")
queryset = Detection.objects.valid().select_related("source_image", "detection_algorithm")
serializer_class = DetectionSerializer
filterset_fields = ["source_image", "detection_algorithm", "source_image__project"]
ordering_fields = ["created_at", "updated_at", "detection_score", "timestamp"]
Expand Down
95 changes: 95 additions & 0 deletions ami/main/management/commands/cleanup_null_only_occurrences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Delete phantom Occurrences and dangling null-marker Detections left by the Issue #1310
field bug, on a per-project basis.

The bug created two categories of rows that should never have been persisted:
- Occurrence rows with no real detections (their only detections are null-marker
sentinels, or they have none at all), surfaced as ghost rows in the API.
- Detection rows that mark a SourceImage as "processed" while no real detections
exist for it — these prevent filter_processed_images from re-yielding the image
on the next ML run.

After cleanup, the source images become eligible for re-processing.

Dry-run by default. Pass --commit to delete.
"""

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.db.models import Exists, OuterRef

from ami.main.models import Detection, Occurrence, Project


class Command(BaseCommand):
    """
    Per-project cleanup of the rows left behind by the Issue #1310 bug.

    Two kinds of rows are targeted:
      * "phantom" Occurrences that have no valid (real) Detection attached, and
      * "dangling" null-marker Detections on source images that have no valid Detection.

    The command only reports counts unless ``--commit`` is passed.
    """

    help = "Delete phantom Occurrences and dangling null-marker Detections (Issue #1310)."

    def add_arguments(self, parser):
        """Register the required ``--project`` id and the opt-in ``--commit`` flag."""
        parser.add_argument("--project", type=int, required=True, help="Project ID to clean up.")
        parser.add_argument("--commit", action="store_true", help="Actually delete. Defaults to dry-run.")

    @staticmethod
    def _phantom_occurrences(project):
        """
        Return the project's occurrences that have no valid detection behind them.

        This is intentionally narrower than Occurrence.valid(), which additionally drops
        occurrences with a null determination. An occurrence that has a real detection but
        no determination is a partial write, not Issue #1310 debris. Detection.occurrence
        is on_delete=SET_NULL, so deleting such an occurrence would null the real
        detection's occurrence FK and strand a classified detection on an image that
        filter_processed_images then skips forever. Those rows are left for a separate,
        targeted repair.
        """
        real_detection_for_occurrence = Detection.objects.valid().filter(occurrence_id=OuterRef("pk"))
        return Occurrence.objects.filter(project=project).exclude(Exists(real_detection_for_occurrence))

    @staticmethod
    def _dangling_null_markers(project):
        """Return the project's null-marker detections whose source image has no valid detection."""
        real_detection_on_same_image = Detection.objects.valid().filter(source_image_id=OuterRef("source_image_id"))
        return (
            Detection.objects.filter(source_image__project=project)
            .null_markers()
            .annotate(_has_valid=Exists(real_detection_on_same_image))
            .filter(_has_valid=False)
        )

    def handle(self, *args, **options):
        """
        Count the targeted rows, print a summary, and delete them when ``--commit`` is set.

        Raises:
            CommandError: if no Project exists with the given ``--project`` id.
        """
        project_id: int = options["project"]
        commit: bool = options["commit"]

        try:
            project = Project.objects.get(pk=project_id)
        except Project.DoesNotExist as err:
            raise CommandError(f"Project {project_id} does not exist") from err

        phantom_occs = self._phantom_occurrences(project)
        dangling_null_markers = self._dangling_null_markers(project)

        # Counts are taken up front: they feed both the dry-run report and the final summary.
        phantom_count = phantom_occs.count()
        null_count = dangling_null_markers.count()

        out = self.stdout
        out.write(f"Project #{project.pk} ({project.name}):")
        out.write(f" Phantom occurrences (no real detection backing them): {phantom_count}")
        out.write(f" Dangling null-marker detections on images with no real detections: {null_count}")

        if not (phantom_count or null_count):
            out.write(self.style.SUCCESS("Nothing to clean up."))
            return

        if not commit:
            out.write(self.style.WARNING("Dry run — pass --commit to delete."))
            return

        # Both deletes succeed or fail together.
        with transaction.atomic():
            dangling_null_markers.delete()
            phantom_occs.delete()

        # The summary uses the pre-computed counts of directly targeted rows. The totals
        # returned by .delete() also include cascade-deleted related rows, which would
        # inflate the numbers and mislead the operator about what was targeted.
        summary = f"Deleted {phantom_count} phantom occurrences and {null_count} dangling null markers."
        out.write(self.style.SUCCESS(summary))
91 changes: 78 additions & 13 deletions ami/main/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,28 @@ class TaxonRank(OrderedEnum):
]
)

NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[])


def bbox_is_null(bbox) -> bool:
"""In-memory equivalent of NULL_DETECTIONS_FILTER for already-fetched bbox values."""
return bbox is None or bbox == []
"""In-memory equivalent of null_detections_q() for an already-fetched bbox value."""
return bbox is None


def null_detections_q(prefix: str = "") -> Q:
    """
    Build the Q expression that selects null-marker Detection rows.

    A null marker is identified solely by ``bbox IS NULL``; no other sentinel form is
    matched.

    Args:
        prefix: Optional lookup path prepended to ``bbox__isnull`` so the expression can
            be applied from a related model, e.g. ``"images__detections__"`` inside an
            aggregate filter on a parent table. Empty by default, which targets the
            Detection table itself.

    Returns:
        A Q object equivalent to ``Q(<prefix>bbox__isnull=True)``.

    When querying Detection directly, prefer Detection.objects.null_markers() / .valid().
    """
    lookup = f"{prefix}bbox__isnull"
    return Q(**{lookup: True})


# Single source of truth for "this Detection is a null marker", shared by
# DetectionQuerySet.valid() / .null_markers(). Defined via null_detections_q() so the
# constant and the helper cannot drift apart.
NULL_DETECTIONS_FILTER = null_detections_q()


def get_media_url(path: str) -> str:
Expand Down Expand Up @@ -836,7 +852,7 @@ def get_detections_count(self) -> int | None:
was processed and no detections were found) to stay consistent with
``SourceImage.get_detections_count`` and ``Event.get_detections_count``.
"""
qs = Detection.objects.filter(source_image__deployment=self).exclude(NULL_DETECTIONS_FILTER)
qs = Detection.objects.filter(source_image__deployment=self).valid()
filter_q = build_occurrence_default_filters_q(
project=self.project,
request=None,
Expand Down Expand Up @@ -1271,7 +1287,7 @@ def get_detections_count(self) -> int | None:
Excludes null-bbox placeholder detections to stay consistent with
``SourceImage.get_detections_count`` and ``Deployment.get_detections_count``.
"""
qs = Detection.objects.filter(source_image__event=self).exclude(NULL_DETECTIONS_FILTER)
qs = Detection.objects.filter(source_image__event=self).valid()
filter_q = build_occurrence_default_filters_q(
project=self.project,
request=None,
Expand Down Expand Up @@ -2238,7 +2254,7 @@ def get_detections_count(self) -> int:
Excludes detections without bounding boxes — those are placeholder records
indicating the image was successfully processed and no detections were found.
"""
qs = self.detections.exclude(NULL_DETECTIONS_FILTER)
qs = self.detections.all().valid()
project = self.project
if not project:
return qs.distinct().count()
Expand Down Expand Up @@ -2518,7 +2534,7 @@ def update_detection_counts(
if null_only:
qs = qs.filter(detections_count__isnull=True)

detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).valid()
if project is not None:
filter_q = build_occurrence_default_filters_q(
project=project,
Expand Down Expand Up @@ -3024,7 +3040,23 @@ def save(self, *args, **kwargs):


class DetectionQuerySet(BaseQuerySet):
def null_detections(self):
def valid(self):
    """
    Return only real detections, with null-marker sentinels excluded.

    A null marker is a row recording that an algorithm ran against an image and found
    nothing. Any consumer that wants "the detections" should go through this method
    rather than filtering on bbox by hand.

    Predicates that may be folded in here later: soft-delete tombstones, detections
    missing an algorithm reference, detections missing classifications.
    """
    real_detections = self.exclude(NULL_DETECTIONS_FILTER)
    return real_detections

def null_markers(self):
    """
    Return only the null-marker sentinel rows.

    Each such row records that an algorithm ran against an image and found nothing, so
    this is only useful for SourceImage-level "has this been processed?" questions.
    Consumers that want actual detections should call .valid() instead.
    """
    sentinel_rows = self.filter(NULL_DETECTIONS_FILTER)
    return sentinel_rows


Expand Down Expand Up @@ -3102,6 +3134,25 @@ class Detection(BaseModel):

objects = DetectionManager()

NULL_BBOX = None
"""Canonical bbox value for null markers (rows that record 'an algorithm ran but
found nothing'). Null markers are stored as SQL NULL; use Detection.build_null_marker()
to construct them."""

@property
def is_null_marker(self) -> bool:
    """Whether this row is a 'no detections found by this algorithm' sentinel (bbox is None)."""
    stored_bbox = self.bbox
    return stored_bbox is None

@classmethod
def build_null_marker(cls, source_image, detection_algorithm) -> "Detection":
    """
    Create an unsaved null-marker instance for one image/algorithm pair.

    Args:
        source_image: The image the algorithm was run against.
        detection_algorithm: The algorithm that ran and found nothing.

    Returns:
        A new instance whose bbox is ``cls.NULL_BBOX``. Nothing is written to the
        database; the caller is responsible for saving it.
    """
    marker_fields = {
        "source_image": source_image,
        "bbox": cls.NULL_BBOX,
        "detection_algorithm": detection_algorithm,
    }
    return cls(**marker_fields)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

def get_bbox(self):
if self.bbox:
return BoundingBox(
Expand Down Expand Up @@ -3222,7 +3273,20 @@ def __str__(self) -> str:

class OccurrenceQuerySet(BaseQuerySet):
def valid(self):
return self.exclude(detections__isnull=True)
"""
Occurrences fit to surface in API responses: at least one real detection AND
a determination set.

Excludes:
- Occurrences with no detections at all (empty occurrences)
- Occurrences whose only detections are null-marker sentinels (Issue #1310:
field bug created phantom occurrences with no real bounding box backing
them)
- Occurrences with determination__isnull=True (no taxonomic identification,
same field bug shape)
"""
has_valid_detection = Exists(Detection.objects.valid().filter(occurrence_id=OuterRef("pk")))
return self.filter(has_valid_detection).exclude(determination__isnull=True)

def with_detections_count(self):
return self.annotate(detections_count=models.Count("detections", distinct=True))
Expand Down Expand Up @@ -4621,7 +4685,7 @@ def with_source_images_with_detections_count(self):
return self.annotate(
source_images_with_detections_count=models.Count(
"images",
filter=(~models.Q(images__detections__bbox__isnull=True) & ~models.Q(images__detections__bbox=[])),
filter=~null_detections_q("images__detections__"),
distinct=True,
Comment thread
coderabbitai[bot] marked this conversation as resolved.
)
)
Expand Down Expand Up @@ -5013,10 +5077,11 @@ def sample_greatest_file_size_from_each_event(self, num_each: int = 1):
return captures

def sample_detections_only(self):
"""Sample all source images with detections"""
"""Sample all source images with at least one real (non-null-marker) detection."""

qs = self.get_queryset()
return qs.filter(detections__isnull=False).distinct()
valid_detection_image_ids = Detection.objects.valid().values("source_image_id")
return qs.filter(pk__in=valid_detection_image_ids).distinct()

def sample_full(
self,
Expand Down
Loading