RolnickLab · mihow · Jun 23, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/ami/main/api/views.py b/ami/main/api/views.py
@@ -39,7 +39,6 @@
 from ami.utils.storages import ConnectionTestResult
 
 from ..models import (
-    NULL_DETECTIONS_FILTER,
     Classification,
     Deployment,
     Detection,
@@ -704,9 +703,7 @@ def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet:
         if has_detections is not None:
             has_detections = BooleanField(required=False).clean(has_detections)
             queryset = queryset.annotate(
-                has_detections=models.Exists(
-                    Detection.objects.filter(source_image=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
-                ),
+                has_detections=models.Exists(Detection.objects.valid().filter(source_image=models.OuterRef("pk"))),
             ).filter(has_detections=has_detections)
         return queryset
 
@@ -756,7 +753,7 @@ def prefetch_detections(self, queryset: QuerySet, project: Project | None = None
         score = get_default_classification_threshold(project, self.request)
 
         prefetch_queryset = (
-            Detection.objects.exclude(NULL_DETECTIONS_FILTER)
+            Detection.objects.valid()
             .annotate(
                 determination_score=models.Max("occurrence__detections__classifications__score"),
                 # Store whether this occurrence should be included based on default filters
@@ -1096,7 +1093,7 @@ class DetectionViewSet(DefaultViewSet, ProjectMixin):
     """
 
     require_project_for_list = True  # Unfiltered list scans are too expensive on this table
-    queryset = Detection.objects.exclude(NULL_DETECTIONS_FILTER).select_related("source_image", "detection_algorithm")
+    queryset = Detection.objects.valid().select_related("source_image", "detection_algorithm")
     serializer_class = DetectionSerializer
     filterset_fields = ["source_image", "detection_algorithm", "source_image__project"]
     ordering_fields = ["created_at", "updated_at", "detection_score", "timestamp"]

diff --git a/ami/main/management/commands/cleanup_null_only_occurrences.py b/ami/main/management/commands/cleanup_null_only_occurrences.py
@@ -0,0 +1,95 @@
+"""
+Delete phantom Occurrences and dangling null-marker Detections left by the Issue #1310
+field bug, on a per-project basis.
+
+The bug created two categories of rows that should never have been persisted:
+- Occurrence rows with no real detections (their only detections are null-marker
+  sentinels, or they have none at all), surfaced as ghost rows in the API.
+- Detection rows that mark a SourceImage as "processed" while no real detections
+  exist for it — these prevent filter_processed_images from re-yielding the image
+  on the next ML run.
+
+After cleanup, the source images become eligible for re-processing.
+
+Dry-run by default. Pass --commit to delete.
+"""
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db import transaction
+from django.db.models import Exists, OuterRef
+
+from ami.main.models import Detection, Occurrence, Project
+
+
+class Command(BaseCommand):
+    help = "Delete phantom Occurrences and dangling null-marker Detections (Issue #1310)."
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--project",
+            type=int,
+            required=True,
+            help="Project ID to clean up.",
+        )
+        parser.add_argument(
+            "--commit",
+            action="store_true",
+            help="Actually delete. Defaults to dry-run.",
+        )
+
+    def handle(self, *args, **options):
+        project_id: int = options["project"]
+        commit: bool = options["commit"]
+
+        try:
+            project = Project.objects.get(pk=project_id)
+        except Project.DoesNotExist as err:
+            raise CommandError(f"Project {project_id} does not exist") from err
+
+        all_occs = Occurrence.objects.filter(project=project)
+        # Phantom = an occurrence with NO real (valid) detection backing it: its only detections
+        # are null-marker sentinels, or it has none at all. This is the Issue #1310 debris.
+        #
+        # Deliberately narrower than Occurrence.valid(): valid() ALSO excludes occurrences whose
+        # determination is null, but an occurrence that has a real detection and merely a missing
+        # determination is a different (partial-write) shape, not #1310 debris. Deleting it would
+        # SET_NULL the real detection's occurrence FK (Detection.occurrence is on_delete=SET_NULL),
+        # stranding a classified detection on an image that filter_processed_images then skips
+        # forever. Those are left for a separate, targeted repair.
+        has_valid_detection = Exists(Detection.objects.valid().filter(occurrence_id=OuterRef("pk")))
+        phantom_occs = all_occs.exclude(has_valid_detection)
+
+        valid_detections_on_image = Detection.objects.valid().filter(source_image_id=OuterRef("source_image_id"))
+        dangling_null_markers = (
+            Detection.objects.filter(source_image__project=project)
+            .null_markers()
+            .annotate(_has_valid=Exists(valid_detections_on_image))
+            .filter(_has_valid=False)
+        )
+
+        phantom_count = phantom_occs.count()
+        null_count = dangling_null_markers.count()
+
+        self.stdout.write(f"Project #{project.pk} ({project.name}):")
+        self.stdout.write(f"  Phantom occurrences (no real detection backing them): {phantom_count}")
+        self.stdout.write(f"  Dangling null-marker detections on images with no real detections: {null_count}")
+
+        if phantom_count == 0 and null_count == 0:
+            self.stdout.write(self.style.SUCCESS("Nothing to clean up."))
+            return
+
+        if not commit:
+            self.stdout.write(self.style.WARNING("Dry run — pass --commit to delete."))
+            return
+
+        with transaction.atomic():
+            dangling_null_markers.delete()
+            phantom_occs.delete()
+
+        # Report the pre-calculated counts of the rows we targeted directly. The tuple from
+        # .delete() also counts cascade-deleted related rows (e.g. classifications under a
+        # phantom occurrence's detections), which would inflate the numbers and confuse the
+        # operator about what the command actually targeted.
+        self.stdout.write(
+            self.style.SUCCESS(f"Deleted {phantom_count} phantom occurrences and {null_count} dangling null markers.")
+        )
diff --git a/ami/main/models.py b/ami/main/models.py
@@ -97,12 +97,28 @@ class TaxonRank(OrderedEnum):
     ]
 )
 
-NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[])
-
 
 def bbox_is_null(bbox) -> bool:
-    """In-memory equivalent of NULL_DETECTIONS_FILTER for already-fetched bbox values."""
-    return bbox is None or bbox == []
+    """In-memory equivalent of null_detections_q(): True only for None; an empty list is not a null marker."""
+    return bbox is None
+
+
+def null_detections_q(prefix: str = "") -> Q:
+    """
+    Build the Q predicate matching null-marker Detection rows; SQL NULL bbox is the only sentinel.
+
+    `prefix` is prepended to the lookup so the predicate can be applied across a relation,
+    e.g. null_detections_q("images__detections__") inside an aggregate filter on a parent
+    table. For direct Detection queries prefer Detection.objects.null_markers() / .valid().
+    """
+    lookup = f"{prefix}bbox__isnull"
+    return Q(**{lookup: True})
+
+
+# Single source of truth for "this Detection is a null marker" (unprefixed form: bbox IS NULL),
+# shared by DetectionQuerySet.valid() / .null_markers(). Defined via null_detections_q() so the
+# constant and the helper cannot drift apart.
+NULL_DETECTIONS_FILTER = null_detections_q()
 
 
 def get_media_url(path: str) -> str:
@@ -836,7 +852,7 @@ def get_detections_count(self) -> int | None:
         was processed and no detections were found) to stay consistent with
         ``SourceImage.get_detections_count`` and ``Event.get_detections_count``.
         """
-        qs = Detection.objects.filter(source_image__deployment=self).exclude(NULL_DETECTIONS_FILTER)
+        qs = Detection.objects.filter(source_image__deployment=self).valid()
         filter_q = build_occurrence_default_filters_q(
             project=self.project,
             request=None,
@@ -1271,7 +1287,7 @@ def get_detections_count(self) -> int | None:
         Excludes null-bbox placeholder detections to stay consistent with
         ``SourceImage.get_detections_count`` and ``Deployment.get_detections_count``.
         """
-        qs = Detection.objects.filter(source_image__event=self).exclude(NULL_DETECTIONS_FILTER)
+        qs = Detection.objects.filter(source_image__event=self).valid()
         filter_q = build_occurrence_default_filters_q(
             project=self.project,
             request=None,
@@ -2238,7 +2254,7 @@ def get_detections_count(self) -> int:
         Excludes detections without bounding boxes — those are placeholder records
         indicating the image was successfully processed and no detections were found.
         """
-        qs = self.detections.exclude(NULL_DETECTIONS_FILTER)
+        qs = self.detections.all().valid()
         project = self.project
         if not project:
             return qs.distinct().count()
@@ -2518,7 +2534,7 @@ def update_detection_counts(
     if null_only:
         qs = qs.filter(detections_count__isnull=True)
 
-    detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
+    detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).valid()
     if project is not None:
         filter_q = build_occurrence_default_filters_q(
             project=project,
@@ -3024,7 +3040,23 @@ def save(self, *args, **kwargs):
 
 
 class DetectionQuerySet(BaseQuerySet):
-    def null_detections(self):
+    def valid(self):
+        """
+        Detections suitable for consumer queries — excludes null-marker sentinels.
+
+        Null markers are rows that record "an algorithm ran against this image and
+        found nothing." Consumers asking "give me detections" should always go
+        through .valid(). Future predicates to fold in here: soft-delete tombstones,
+        detections missing an algorithm reference, detections missing classifications.
+        """
+        return self.exclude(NULL_DETECTIONS_FILTER)
+
+    def null_markers(self):
+        """
+        Sentinel rows that record "this algorithm ran against this image and found nothing"
+        (this method was previously named null_detections()). Only relevant for SourceImage-level
+        "has this been processed?" questions. Detection consumers should use .valid() instead.
+        """
         return self.filter(NULL_DETECTIONS_FILTER)
 
 
@@ -3102,6 +3134,25 @@ class Detection(BaseModel):
 
     objects = DetectionManager()
 
+    NULL_BBOX = None
+    """Canonical bbox value for null markers (rows that record 'an algorithm ran but
+    found nothing'). Null markers are stored as SQL NULL; use Detection.build_null_marker()
+    to construct them."""
+
+    @property
+    def is_null_marker(self) -> bool:
+        """True for sentinel rows representing 'no detections found'; delegates to bbox_is_null()."""
+        return bbox_is_null(self.bbox)
+
+    @classmethod
+    def build_null_marker(cls, source_image, detection_algorithm) -> "Detection":
+        """
+        Return an unsaved null-marker Detection (bbox set to NULL_BBOX) tying the given
+        image to the given algorithm; persisting it is left to the caller.
+        """
+        marker = cls(source_image=source_image, detection_algorithm=detection_algorithm, bbox=cls.NULL_BBOX)
+        return marker
+
     def get_bbox(self):
         if self.bbox:
             return BoundingBox(
@@ -3222,7 +3273,20 @@ def __str__(self) -> str:
 
 class OccurrenceQuerySet(BaseQuerySet):
     def valid(self):
-        return self.exclude(detections__isnull=True)
+        """
+        Restrict to occurrences that are fit to surface in API responses.
+
+        An occurrence is kept only when both conditions hold:
+          - it has at least one real detection, as defined by Detection.objects.valid().
+            This drops occurrences with no detections at all and occurrences whose only
+            detections are null-marker sentinels (the phantom rows from Issue #1310).
+          - its determination is set. Occurrences with determination__isnull=True carry
+            no taxonomic identification and are dropped.
+        """
+        # Correlated subquery: real detections attached to the occurrence row being tested.
+        real_detections = Detection.objects.valid().filter(occurrence_id=OuterRef("pk"))
+        backed_by_real_detection = Exists(real_detections)
+        return self.filter(backed_by_real_detection).exclude(determination__isnull=True)
 
     def with_detections_count(self):
         return self.annotate(detections_count=models.Count("detections", distinct=True))
@@ -4621,7 +4685,7 @@ def with_source_images_with_detections_count(self):
         return self.annotate(
             source_images_with_detections_count=models.Count(
                 "images",
-                filter=(~models.Q(images__detections__bbox__isnull=True) & ~models.Q(images__detections__bbox=[])),
+                filter=~null_detections_q("images__detections__"),
                 distinct=True,
             )
         )
@@ -5013,10 +5077,11 @@ def sample_greatest_file_size_from_each_event(self, num_each: int = 1):
         return captures
 
     def sample_detections_only(self):
-        """Sample all source images with detections"""
+        """Return the source images that have at least one valid (non-null-marker) detection, deduplicated."""
 
         qs = self.get_queryset()
-        return qs.filter(detections__isnull=False).distinct()
+        valid_detection_image_ids = Detection.objects.valid().values("source_image_id")
+        return qs.filter(pk__in=valid_detection_image_ids).distinct()
 
     def sample_full(
         self,