diff --git a/src/openedx_core/__init__.py b/src/openedx_core/__init__.py index 01d5f52bf..05861982c 100644 --- a/src/openedx_core/__init__.py +++ b/src/openedx_core/__init__.py @@ -6,4 +6,4 @@ """ # The version for the entire repository -__version__ = "0.39.1" +__version__ = "0.39.2" diff --git a/src/openedx_tagging/api.py b/src/openedx_tagging/api.py index 0998a6ab5..5763cb7df 100644 --- a/src/openedx_tagging/api.py +++ b/src/openedx_tagging/api.py @@ -12,7 +12,8 @@ """ from __future__ import annotations +from collections import Counter, defaultdict from typing import Any from django.db import models, transaction from django.db.models import F, QuerySet, Value @@ -116,7 +117,6 @@ def search_tags( taxonomy: Taxonomy, search_term: str, exclude_object_id: str | None = None, - include_counts: bool = False, ) -> TagDataQuerySet: """ Returns a list of all tags that contains `search_term` of the given @@ -138,7 +138,6 @@ def search_tags( qs = taxonomy.cast().get_filtered_tags( search_term=search_term, excluded_values=excluded_values, - include_counts=include_counts, ) return qs @@ -525,3 +524,56 @@ def unmark_copied_tags(object_id: str) -> None: Update copied object tags on the given object to mark them as "not copied". """ ObjectTag.objects.filter(object_id=object_id).update(is_copied=False) + + +def add_usage_counts(taxonomy: Taxonomy, tag_data: TagDataQuerySet) -> TagDataQuerySet: + """ + Add usage counts to the query result. + + Not a simple raw count of each tag's usage. A tag can be directly + applied to an object, which can be a course, library, module, + or something else. + + A tag can also be indirectly applied when some of its children + are applied to an object, it is considered automatically applied. + So, if the tags "Chemistry" and "Physics" are applied once + each to different objects, their parent tag "Natural Science" is + considered indirectly applied to 2 objects.
+ + Deduplication: A tag can only be applied to a single object once. + So if two child tags are applied to the same object, e.g. + "Chemistry" and "Physics" are applied to the same course, the + parent tag, "Natural Science" is only applied to it once, + because no tag can be applied to the same object twice. + + Object tags without a lineage (e.g. free-text tags) are counted by their own value. + + For performance reasons, the counts are computed in memory and written + into each row of `tag_data` (which evaluates it) rather than annotated + onto the QuerySet, which would require a very expensive annotation to + join the in-memory data to the original QuerySet. + """ + + # "_value" is assumed to hold the raw value of free-text tags (tag is NULL) - TODO confirm. + object_tags = taxonomy.objecttag_set.values_list("object_id", "tag__lineage", "_value") + tag_counts: Counter[str] = Counter() + seen_by_object: defaultdict[str, set[str]] = defaultdict(set) + + for object_id, tag_lineage, raw_value in object_tags: + if tag_lineage: + # Tab-separated ancestry; skip the empty segment left by a trailing tab. + values = {value for value in tag_lineage.split("\t") if value} + else: + # No lineage (e.g. free-text tag): fall back to the tag's own value. + values = {raw_value} if raw_value else set() + # De-duplicate: count each tag value at most once per object. + unseen = values - seen_by_object[object_id] + + tag_counts.update(unseen) + seen_by_object[object_id] |= unseen + + # In-memory 'annotation'; this is faster than using annotate() on the QuerySet.
+ for row in tag_data: + row["usage_count"] = tag_counts.get(row["value"], 0) + + return tag_data diff --git a/src/openedx_tagging/models/base.py b/src/openedx_tagging/models/base.py index d72b8be68..000d4d338 100644 --- a/src/openedx_tagging/models/base.py +++ b/src/openedx_tagging/models/base.py @@ -426,7 +426,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments depth: int | None = None, parent_tag_value: str | None = None, search_term: str | None = None, - include_counts: bool = False, excluded_values: list[str] | None = None, ) -> TagDataQuerySet: """ @@ -451,7 +450,7 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments if self.allow_free_text: if parent_tag_value is not None: raise ValueError("Cannot specify a parent tag ID for free text taxonomies") - result = self._get_filtered_tags_free_text(search_term=search_term, include_counts=include_counts) + result = self._get_filtered_tags_free_text(search_term=search_term) if excluded_values: return result.exclude(value__in=excluded_values) else: @@ -460,7 +459,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments result = self._get_filtered_tags_one_level( parent_tag_value=parent_tag_value, search_term=search_term, - include_counts=include_counts, ) if excluded_values: return result.exclude(value__in=excluded_values) @@ -470,7 +468,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments return self._get_filtered_tags_deep( parent_tag_value=parent_tag_value, search_term=search_term, - include_counts=include_counts, excluded_values=excluded_values, ) else: @@ -479,7 +476,6 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments def _get_filtered_tags_free_text( self, search_term: str | None, - include_counts: bool, ) -> TagDataQuerySet: """ Implementation of get_filtered_tags() for free text taxonomies.
@@ -499,16 +495,13 @@ def _get_filtered_tags_free_text( _id=Value(None, output_field=models.CharField()), ) qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id").order_by("value") - if include_counts: - return qs.annotate(usage_count=models.Count("value")) - else: - return qs.distinct() # type: ignore[return-value] + + return qs.distinct() # type: ignore[return-value] def _get_filtered_tags_one_level( self, parent_tag_value: str | None, search_term: str | None, - include_counts: bool, ) -> TagDataQuerySet: """ Implementation of get_filtered_tags() for closed taxonomies, where @@ -531,24 +524,13 @@ def _get_filtered_tags_one_level( qs = qs.annotate(_id=F("id")) # ID has an underscore to encourage use of 'value' rather than this internal ID qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id") qs = qs.order_by("value") - if include_counts: - # We need to include the count of how many times this tag is used to tag objects. - # You'd think we could just use: - # qs = qs.annotate(usage_count=models.Count("objecttag__pk")) - # but that adds another join which starts creating a cross product and the children and usage_count become - # intertwined and multiplied with each other. So we use a subquery. - obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate( - # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027 - count=models.Func(F('id'), function='Count') - ) - qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count'))) + return qs # type: ignore[return-value] def _get_filtered_tags_deep( self, parent_tag_value: str | None, search_term: str | None, - include_counts: bool, excluded_values: list[str] | None, ) -> TagDataQuerySet: """ @@ -615,17 +597,7 @@ def _get_filtered_tags_deep( # lineage is a case-insensitive column storing "Root\tParent\t...\tThisValue\t", so # ordering by it gives the tree sort order that we want. 
qs = qs.order_by("lineage") - if include_counts: - # Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level() - obj_tags = ( - ObjectTag.objects.filter(tag_id=models.OuterRef("pk")) - .order_by() - .annotate( - # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027 - count=models.Func(F("id"), function="Count") - ) - ) - qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count"))) + return qs # type: ignore[return-value] def add_tag( diff --git a/src/openedx_tagging/rest_api/v1/views.py b/src/openedx_tagging/rest_api/v1/views.py index 14de82809..348b5b143 100644 --- a/src/openedx_tagging/rest_api/v1/views.py +++ b/src/openedx_tagging/rest_api/v1/views.py @@ -17,6 +17,7 @@ from ...api import ( TagDoesNotExist, add_tag_to_taxonomy, + add_usage_counts, create_taxonomy, delete_tags_from_taxonomy, get_object_tag_counts, @@ -844,21 +845,33 @@ def get_queryset(self) -> TagDataQuerySet: parent_tag_value=parent_tag_value, search_term=search_term, depth=depth, - include_counts=include_counts, ) if depth == 1: # We're already returning just a single level. It will be paginated normally. + if include_counts: + results_with_counts = add_usage_counts(self.get_taxonomy(), results) + return results_with_counts + return results elif full_depth_threshold and len(results) < full_depth_threshold: # We can load and display all the tags in this (sub)tree at once: self.pagination_class = DisabledTagsPagination + if include_counts: + results_with_counts = add_usage_counts(self.get_taxonomy(), results) + return results_with_counts + return results else: # We had to do a deep query, but we will only return one level of results. # This is because the user did not request a deep response (via full_depth_threshold) or the result was too # large (larger than the threshold). # It will be paginated normally. 
- return results.filter(parent_value=parent_tag_value) + filtered_results = results.filter(parent_value=parent_tag_value) + if include_counts: + # Annotate only the single level we return, not the whole deep result. + return add_usage_counts(self.get_taxonomy(), filtered_results) + + return filtered_results def post(self, request, *args, **kwargs): """ diff --git a/tests/openedx_tagging/test_api.py b/tests/openedx_tagging/test_api.py index 9da007bd3..83ed1010b 100644 --- a/tests/openedx_tagging/test_api.py +++ b/tests/openedx_tagging/test_api.py @@ -752,53 +752,53 @@ def get_object_tags(): @ddt.data( ("ChA", [ - "Archaea (used: 1, children: 2)", - " Euryarchaeida (used: 0, children: 0)", - " Proteoarchaeota (used: 0, children: 0)", - "Bacteria (used: 0, children: 1)", # does not contain "cha" but a child does - " Archaebacteria (used: 1, children: 0)", + "Archaea (children: 2)", + " Euryarchaeida (children: 0)", + " Proteoarchaeota (children: 0)", + "Bacteria (children: 1)", # does not contain "cha" but a child does + " Archaebacteria (children: 0)", ]), ("ar", [ - "Archaea (used: 1, children: 2)", - " Euryarchaeida (used: 0, children: 0)", - " Proteoarchaeota (used: 0, children: 0)", - "Bacteria (used: 0, children: 1)", # does not contain "ar" but a child does - " Archaebacteria (used: 1, children: 0)", - "Eukaryota (used: 0, children: 1)", - " Animalia (used: 1, children: 2)", # does not contain "ar" but a child does - " Arthropoda (used: 1, children: 0)", - " Cnidaria (used: 0, children: 0)", + "Archaea (children: 2)", + " Euryarchaeida (children: 0)", + " Proteoarchaeota (children: 0)", + "Bacteria (children: 1)", # does not contain "ar" but a child does + " Archaebacteria (children: 0)", + "Eukaryota (children: 1)", + " Animalia (children: 2)", # does not contain "ar" but a child does + " Arthropoda (children: 0)", + " Cnidaria (children: 0)", ]), ("aE", [ - "Archaea (used: 1, children: 2)", - " Euryarchaeida (used: 0, children: 0)", - " Proteoarchaeota (used: 0, children: 0)", - "Bacteria (used: 
0, children: 1)", # does not contain "ae" but a child does - " Archaebacteria (used: 1, children: 0)", - "Eukaryota (used: 0, children: 1)", # does not contain "ae" but a child does - " Plantae (used: 1, children: 0)", + "Archaea (children: 2)", + " Euryarchaeida (children: 0)", + " Proteoarchaeota (children: 0)", + "Bacteria (children: 1)", # does not contain "ae" but a child does + " Archaebacteria (children: 0)", + "Eukaryota (children: 1)", # does not contain "ae" but a child does + " Plantae (children: 0)", ]), ("a", [ - "Archaea (used: 1, children: 3)", - " DPANN (used: 0, children: 0)", - " Euryarchaeida (used: 0, children: 0)", - " Proteoarchaeota (used: 0, children: 0)", - "Bacteria (used: 0, children: 2)", - " Archaebacteria (used: 1, children: 0)", - " Eubacteria (used: 0, children: 0)", - "Eukaryota (used: 0, children: 4)", - " Animalia (used: 1, children: 7)", - " Arthropoda (used: 1, children: 0)", - " Chordata (used: 0, children: 1)", - " Mammalia (used: 0, children: 0)", - " Cnidaria (used: 0, children: 0)", - " Ctenophora (used: 0, children: 0)", - " Gastrotrich (used: 1, children: 0)", - " Placozoa (used: 1, children: 0)", - " Porifera (used: 0, children: 0)", - " Monera (used: 1, children: 0)", - " Plantae (used: 1, children: 0)", - " Protista (used: 0, children: 0)", + "Archaea (children: 3)", + " DPANN (children: 0)", + " Euryarchaeida (children: 0)", + " Proteoarchaeota (children: 0)", + "Bacteria (children: 2)", + " Archaebacteria (children: 0)", + " Eubacteria (children: 0)", + "Eukaryota (children: 4)", + " Animalia (children: 7)", + " Arthropoda (children: 0)", + " Chordata (children: 1)", + " Mammalia (children: 0)", + " Cnidaria (children: 0)", + " Ctenophora (children: 0)", + " Gastrotrich (children: 0)", + " Placozoa (children: 0)", + " Porifera (children: 0)", + " Monera (children: 0)", + " Plantae (children: 0)", + " Protista (children: 0)", ]), ) @ddt.unpack @@ -817,7 +817,7 @@ def test_autocomplete_tags_closed(self, search: str, 
expected: list[str]) -> Non _value=value, ).save() - result = tagging_api.search_tags(closed_taxonomy, search, include_counts=True) + result = tagging_api.search_tags(closed_taxonomy, search) assert pretty_format_tags(result, parent=False) == expected def test_autocomplete_tags_closed_omit_object(self) -> None: diff --git a/tests/openedx_tagging/test_models.py b/tests/openedx_tagging/test_models.py index f7b2b9359..acea53011 100644 --- a/tests/openedx_tagging/test_models.py +++ b/tests/openedx_tagging/test_models.py @@ -324,9 +324,9 @@ class TestFilteredTagsClosedTaxonomy(TestTagTaxonomyMixin, TestCase): def test_get_root(self) -> None: """ Test basic retrieval of root tags in the closed taxonomy, using - get_filtered_tags(). Without counts included. + get_filtered_tags(). """ - result = list(self.taxonomy.get_filtered_tags(depth=1, include_counts=False)) + result = list(self.taxonomy.get_filtered_tags(depth=1)) common_fields = {"depth": 0, "parent_value": None, "external_id": None} for r in result: del r["_id"] # Remove the internal database IDs; they aren't interesting here and a other tests check them @@ -342,8 +342,8 @@ def test_get_child_tags_one_level(self) -> None: Test basic retrieval of tags one level below the "Eukaryota" root tag in the closed taxonomy, using get_filtered_tags(). With counts included. 
""" - result = list(self.taxonomy.get_filtered_tags(depth=1, parent_tag_value="Eukaryota", include_counts=True)) - common_fields = {"depth": 1, "parent_value": "Eukaryota", "usage_count": 0, "external_id": None} + result = list(self.taxonomy.get_filtered_tags(depth=1, parent_tag_value="Eukaryota")) + common_fields = {"depth": 1, "parent_value": "Eukaryota", "external_id": None} for r in result: del r["_id"] # Remove the internal database IDs; they aren't interesting here and a other tests check them assert result == [ @@ -379,13 +379,12 @@ def test_get_depth_1_search_term(self) -> None: """ Filter the root tags to only those that match a search term """ - result = list(self.taxonomy.get_filtered_tags(depth=1, search_term="ARCH", include_counts=True)) + result = list(self.taxonomy.get_filtered_tags(depth=1, search_term="ARCH")) assert result == [ { "value": "Archaea", "child_count": 3, "depth": 0, - "usage_count": 0, "parent_value": None, "external_id": None, "_id": 2, # These IDs are hard-coded in the test fixture file @@ -504,13 +503,12 @@ def test_tags_deep(self) -> None: """ Test getting a deep tag in the taxonomy """ - result = list(self.taxonomy.get_filtered_tags(parent_tag_value="Chordata", include_counts=True)) + result = list(self.taxonomy.get_filtered_tags(parent_tag_value="Chordata")) assert result == [ { "value": "Mammalia", "parent_value": "Chordata", "depth": 3, - "usage_count": 0, "child_count": 0, "external_id": None, "_id": 21, # These IDs are hard-coded in the test fixture file @@ -543,29 +541,6 @@ def test_get_external_id(self) -> None: assert result[0]["value"] == "Bacteria" assert result[0]["external_id"] == "bct001" - def test_usage_count(self) -> None: - """ - Test that the usage count in the results is right - """ - api.tag_object(object_id="obj01", taxonomy=self.taxonomy, tags=["Bacteria"]) - api.tag_object(object_id="obj02", taxonomy=self.taxonomy, tags=["Bacteria"]) - api.tag_object(object_id="obj03", taxonomy=self.taxonomy, 
tags=["Bacteria"]) - api.tag_object(object_id="obj04", taxonomy=self.taxonomy, tags=["Eubacteria"]) - # Now the API should reflect these usage counts: - result = pretty_format_tags(self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True)) - assert result == [ - "Bacteria (None) (used: 3, children: 2)", - " Archaebacteria (Bacteria) (used: 0, children: 0)", - " Eubacteria (Bacteria) (used: 1, children: 0)", - ] - # Same with depth=1, which uses a different query internally: - result1 = pretty_format_tags( - self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True, depth=1) - ) - assert result1 == [ - "Bacteria (None) (used: 3, children: 2)", - ] - def test_tree_sort(self) -> None: """ Verify that taxonomies can be sorted correctly in tree orer (case insensitive). @@ -615,9 +590,8 @@ def setUp(self): def test_get_filtered_tags(self): """ Test basic retrieval of all tags in the taxonomy. - Without counts included. """ - result = list(self.taxonomy.get_filtered_tags(include_counts=False)) + result = list(self.taxonomy.get_filtered_tags()) common_fields = {"child_count": 0, "depth": 0, "parent_value": None, "external_id": None, "_id": None} assert result == [ # These should appear in alphabetical order: @@ -626,20 +600,6 @@ def test_get_filtered_tags(self): {"value": "triple", **common_fields}, ] - def test_get_filtered_tags_with_count(self): - """ - Test basic retrieval of all tags in the taxonomy. - Without counts included. 
- """ - result = list(self.taxonomy.get_filtered_tags(include_counts=True)) - common_fields = {"child_count": 0, "depth": 0, "parent_value": None, "external_id": None, "_id": None} - assert result == [ - # These should appear in alphabetical order: - {"value": "double", "usage_count": 2, **common_fields}, - {"value": "solo", "usage_count": 1, **common_fields}, - {"value": "triple", "usage_count": 3, **common_fields}, - ] - def test_get_filtered_tags_num_queries(self): """ Test that the number of queries used by get_filtered_tags() is fixed @@ -647,22 +607,20 @@ def test_get_filtered_tags_num_queries(self): """ with self.assertNumQueries(1): self.test_get_filtered_tags() - with self.assertNumQueries(1): - self.test_get_filtered_tags_with_count() def test_get_filtered_tags_with_search(self) -> None: """ Test basic retrieval of only matching tags. """ - result1 = list(self.taxonomy.get_filtered_tags(search_term="le", include_counts=True)) + result1 = list(self.taxonomy.get_filtered_tags(search_term="le")) common_fields = {"child_count": 0, "depth": 0, "parent_value": None, "external_id": None, "_id": None} assert result1 == [ # These should appear in alphabetical order: - {"value": "double", "usage_count": 2, **common_fields}, - {"value": "triple", "usage_count": 3, **common_fields}, + {"value": "double", **common_fields}, + {"value": "triple", **common_fields}, ] # And it should be case insensitive: - result2 = list(self.taxonomy.get_filtered_tags(search_term="LE", include_counts=True)) + result2 = list(self.taxonomy.get_filtered_tags(search_term="LE")) assert result1 == result2 diff --git a/tests/openedx_tagging/test_views.py b/tests/openedx_tagging/test_views.py index c87c3a95d..a7510d6ac 100644 --- a/tests/openedx_tagging/test_views.py +++ b/tests/openedx_tagging/test_views.py @@ -2570,6 +2570,436 @@ def test_delete_tag_in_taxonomy_without_subtags(self): existing_tag.refresh_from_db() +class TestTaxonomyTagsUsageCount(TestTaxonomyViewMixin): + """ + Tests the 
usage count of tags in a taxonomy, verifies that the + usage count is correct according to the rules as described in the + comments in src/openedx_tagging/api.py:add_usage_counts() + """ + + # Taxonomy reference as used in tests below + # + # - Bacteria + # |- Eubacteria + # |- Archaebacteria + # - Archaea + # |- DPANN + # |- Euryarchaeida + # |- Proteoarchaeota + # - Eukaryota (Root) + # |- Animalia (L1) + # | |- Arthropoda + # | |- Chordata (L2) + # | | |- Mammalia (L3) + # | | | |- Carnivora (L4) + # | | | | |- Felidae (L5) + # | | | | | |- Felis (L6) + # | | | |- Canidae + # | |- Cnidaria + # | |- Ctenophora + # | |- Gastrotrich + # | |- Placozoa + # | |- Porifera + # |- Fungi + # |- Monera + # |- Plantae + # |- Protista + + def setUp(self): + super().setUp() + self.taxonomy = Taxonomy.objects.create(name="Usage Count Taxonomy") + self.taxonomy_url = TAXONOMY_TAGS_URL.format(pk=self.taxonomy.pk) + + def test_simple_usage_count_with_lineage_and_deduplication(self): + """ + Test that usage counts correctly 'roll up' from children to parents, + while deduplicating multiple tags applied to the same object. + + This test is a basic case to verify that the tags are correctly + counted according to business rules and deduplication + requirements; Animalia and Eukaryota should not be counted + more than once per object, the children should be counted once each. 
+ """ + # --- Setup Hierarchy --- + # Eukaryota -> Animalia -> (Arthropoda, Chordata, Cnidaria) + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + arthropoda = Tag.objects.create(taxonomy=self.taxonomy, value="Arthropoda", parent=animalia) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + cnidaria = Tag.objects.create(taxonomy=self.taxonomy, value="Cnidaria", parent=animalia) + + # --- Setup Tagging --- + # Tags applied as: + # obj1: Arthropoda, Chordata, Cnidaria + # obj2: Arthropoda + obj1_id = "obj1" + obj2_id = "obj2" + + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=arthropoda, object_id=obj1_id) + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=chordata, object_id=obj1_id) + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=cnidaria, object_id=obj1_id) + + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=arthropoda, object_id=obj2_id) + + self.client.force_authenticate(user=self.staff) + + # --- Request all tags with counts --- + response = self.client.get(self.taxonomy_url + "?include_counts&full_depth_threshold=100") + assert response.status_code == status.HTTP_200_OK + + results = {tag["value"]: tag for tag in response.data["results"]} + + # --- Verification --- + # Arthropoda: applied to obj1, obj2 -> count: 2 + assert results["Arthropoda"]["usage_count"] == 2 + + # Chordata: applied to obj1 -> count: 1 + assert results["Chordata"]["usage_count"] == 1 + + # Cnidaria: applied to obj1 -> count: 1 + assert results["Cnidaria"]["usage_count"] == 1 + + # Animalia: applied to obj1 (via Arthropoda, Chordata, Cnidaria) and obj2 (via Arthropoda). + # Should be 2, because it counts '1' per object regardless of how many children are applied. 
+ assert results["Animalia"]["usage_count"] == 2 + + # Eukaryota: same logic as Animalia -> count: 2 + assert results["Eukaryota"]["usage_count"] == 2 + + def test_usage_count_through_multiple_levels(self): + """ + Test that usage count is correctly calculated across multiple levels. + Apply a simple set of tags to some objects and verify that the + usage_counts are correctly calculated, verifying that the ancestor + tags are correctly applied and de-duplicated across the entire depth + of the taxonomy. + """ + # --- Setup Hierarchy --- + # Eukaryota -> Animalia -> Chordata -> Mammalia + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + mammalia = Tag.objects.create(taxonomy=self.taxonomy, value="Mammalia", parent=chordata) + + # --- Setup Tagging --- + # obj1: Mammalia + # obj2: Chordata + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=mammalia, object_id="obj1") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=chordata, object_id="obj2") + + self.client.force_authenticate(user=self.staff) + + # --- Request tags with counts --- + response = self.client.get(self.taxonomy_url + "?include_counts&full_depth_threshold=100") + assert response.status_code == status.HTTP_200_OK + results = {tag["value"]: tag for tag in response.data["results"]} + + # --- Verification --- + # Mammalia: obj1 -> 1 + assert results["Mammalia"]["usage_count"] == 1 + # Chordata: obj1 (via Mammalia), obj2 -> 2 + assert results["Chordata"]["usage_count"] == 2 + # Animalia: obj1 (via Mammalia), obj2 (via Chordata) -> 2 + assert results["Animalia"]["usage_count"] == 2 + # Eukaryota: obj1 (via Mammalia), obj2 (via Chordata) -> 2 + assert results["Eukaryota"]["usage_count"] == 2 + + def test_usage_count_across_different_objects(self): + """ + Verify that counts are not erroneously 
shared between different objects + that are tagged with distinct branches of the same hierarchy. + """ + # --- Setup Hierarchy --- + # Eukaryota -> (Animalia, Fungi) + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + fungi = Tag.objects.create(taxonomy=self.taxonomy, value="Fungi", parent=eukaryota) + + # --- Setup Tagging --- + # obj1: Animalia + # obj2: Fungi + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=animalia, object_id="obj1") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=fungi, object_id="obj2") + + self.client.force_authenticate(user=self.staff) + + # --- Request tags with counts --- + response = self.client.get(self.taxonomy_url + "?include_counts&full_depth_threshold=100") + assert response.status_code == status.HTTP_200_OK + results = {tag["value"]: tag for tag in response.data["results"]} + + # --- Verification --- + assert results["Animalia"]["usage_count"] == 1 + assert results["Fungi"]["usage_count"] == 1 + # Eukaryota should have 2 because it's used on obj1 (via Animalia) and obj2 (via Fungi) + assert results["Eukaryota"]["usage_count"] == 2 + + def test_usage_count_max_depth(self): + """ + Verify usage_count through a six-level-deep hierarchy, ensuring redundant + tagging on the same object is deduplicated. 
+ """ + # --- Setup Hierarchy (6 Levels) --- + # Eukaryota -> Animalia -> Chordata -> Mammalia -> Carnivora -> Felidae + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + mammalia = Tag.objects.create(taxonomy=self.taxonomy, value="Mammalia", parent=chordata) + carnivora = Tag.objects.create(taxonomy=self.taxonomy, value="Carnivora", parent=mammalia) + felidae = Tag.objects.create(taxonomy=self.taxonomy, value="Felidae", parent=carnivora) + + # --- Setup Tagging --- + # obj1: Tagged at Felidae AND Carnivora (Redundant tagging) + # Should count as '1' for all tags in its lineage. + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=felidae, object_id="obj1") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=carnivora, object_id="obj1") + + # obj2: Tagged at Chordata + # Should count as '1' for Chordata, Animalia, and Eukaryota. 
+ ObjectTag.objects.create(taxonomy=self.taxonomy, tag=chordata, object_id="obj2") + + self.client.force_authenticate(user=self.staff) + response = self.client.get(self.taxonomy_url + "?include_counts&full_depth_threshold=7") + assert response.status_code == status.HTTP_200_OK + results = {tag["value"]: tag for tag in response.data["results"]} + + # --- Verification --- + # Felidae: obj1 -> 1 + assert results["Felidae"]["usage_count"] == 1 + # Carnivora: obj1 (twice, but deduplicated) -> 1 + assert results["Carnivora"]["usage_count"] == 1 + # Mammalia: obj1 (via Carnivora/Felidae) -> 1 + assert results["Mammalia"]["usage_count"] == 1 + # Chordata: obj1 (via Mammalia), obj2 -> 2 + assert results["Chordata"]["usage_count"] == 2 + # Animalia: obj1 (via Chordata), obj2 (via Chordata) -> 2 + assert results["Animalia"]["usage_count"] == 2 + # Eukaryota: obj1 (via Animalia), obj2 (via Animalia) -> 2 + assert results["Eukaryota"]["usage_count"] == 2 + + def test_usage_count_only_at_root_when_child_applied(self): + """ + Verify that usage_count for a tag is correct, even if we only query for + the root level tag and is only used indirectly because a child is applied. 
+ """ + # Eukaryota -> Animalia -> Chordata + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + + # Tag an object with the deepest tag + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=chordata, object_id="obj1") + + self.client.force_authenticate(user=self.staff) + + # --- Check Root Level (depth=1) --- + # Should show Eukaryota with count 1 + resp_root = self.client.get(self.taxonomy_url + "?include_counts") + results_root = {tag["value"]: tag for tag in resp_root.data["results"]} + assert results_root["Eukaryota"]["usage_count"] == 1 + + def test_usage_count_returns_zero(self): + """ + Ensure usage_count is 0 (int) for unused tags, not None. + """ + Tag.objects.create(taxonomy=self.taxonomy, value="Protista") + + self.client.force_authenticate(user=self.staff) + response = self.client.get(self.taxonomy_url + "?include_counts") + results = {tag["value"]: tag for tag in response.data["results"]} + + assert isinstance(results["Protista"]["usage_count"], int) + assert results["Protista"]["usage_count"] == 0 + + def test_usage_count_with_search_term(self): + """ + Verify usage_count is correct even when the result set is filtered by search. 
+ """ + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=animalia, object_id="obj1") + + self.client.force_authenticate(user=self.staff) + + # Search for "Ani" + response = self.client.get(self.taxonomy_url + "?include_counts&search_term=Ani&full_depth_threshold=100") + results = {tag["value"]: tag for tag in response.data["results"]} + + # "Animalia" should match and have count 1 + assert "Animalia" in results + assert results["Animalia"]["usage_count"] == 1 + + def test_usage_count_search_permutations(self): + """ + Extensively test search logic across various depths and match types (partial/complete). + Uses the same search terms and result structure as 'test_api.py' as a handy example. + Ensures usage_count correctly rolls up even when child tags match but parents don't. + """ + # --- Setup Hierarchy (Matching tagging.yaml used in test_api.py) --- + # Bacteria + # - Eubacteria + # - Archaebacteria + # Archaea + # - DPANN + # - Euryarchaeida + # - Proteoarchaeota + # Eukaryota + # - Animalia + # - Arthropoda + # - Chordata + # - Mammalia + # - Cnidaria + # - Ctenophora + # - Gastrotrich + # - Placozoa + # - Porifera + # - Fungi + # - Monera + # - Plantae + # - Protista + + # Roots + bacteria = Tag.objects.create(taxonomy=self.taxonomy, value="Bacteria") + archaea = Tag.objects.create(taxonomy=self.taxonomy, value="Archaea") + eukaryota = Tag.objects.create(taxonomy=self.taxonomy, value="Eukaryota") + + # Bacteria branch + Tag.objects.create(taxonomy=self.taxonomy, value="Eubacteria", parent=bacteria) + archaebacteria = Tag.objects.create(taxonomy=self.taxonomy, value="Archaebacteria", parent=bacteria) + + # Archaea branch + Tag.objects.create(taxonomy=self.taxonomy, value="DPANN", parent=archaea) + euryarchaeida = Tag.objects.create(taxonomy=self.taxonomy, value="Euryarchaeida", 
parent=archaea) + proteoarchaeota = Tag.objects.create(taxonomy=self.taxonomy, value="Proteoarchaeota", parent=archaea) + + # Eukaryota branch + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia", parent=eukaryota) + arthropoda = Tag.objects.create(taxonomy=self.taxonomy, value="Arthropoda", parent=animalia) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + cnidaria = Tag.objects.create(taxonomy=self.taxonomy, value="Cnidaria", parent=animalia) + Tag.objects.create(taxonomy=self.taxonomy, value="Ctenophora", parent=animalia) + Tag.objects.create(taxonomy=self.taxonomy, value="Gastrotrich", parent=animalia) + Tag.objects.create(taxonomy=self.taxonomy, value="Placozoa", parent=animalia) + Tag.objects.create(taxonomy=self.taxonomy, value="Porifera", parent=animalia) + Tag.objects.create(taxonomy=self.taxonomy, value="Mammalia", parent=chordata) + Tag.objects.create(taxonomy=self.taxonomy, value="Fungi", parent=eukaryota) + Tag.objects.create(taxonomy=self.taxonomy, value="Monera", parent=eukaryota) + plantae = Tag.objects.create(taxonomy=self.taxonomy, value="Plantae", parent=eukaryota) + Tag.objects.create(taxonomy=self.taxonomy, value="Protista", parent=eukaryota) + + # --- Setup Tagging to Exercise usage_counts --- + # Tag a few objects to create counts. 
+ # obj1: Archaebacteria, Arthropoda + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=archaebacteria, object_id="obj1") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=arthropoda, object_id="obj1") + + # obj2: Euryarchaeida, Cnidaria + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=euryarchaeida, object_id="obj2") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=cnidaria, object_id="obj2") + + # obj3: Proteoarchaeota, Plantae + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=proteoarchaeota, object_id="obj3") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=plantae, object_id="obj3") + + self.client.force_authenticate(user=self.staff) + + # SCENARIO 1: search="ChA" + url = self.taxonomy_url + "?include_counts&search_term=ChA&full_depth_threshold=100" + resp = self.client.get(url) + assert resp.status_code == status.HTTP_200_OK + assert pretty_format_tags(resp.data["results"], parent=False) == [ + "Archaea (used: 2, children: 2)", + " Euryarchaeida (used: 1, children: 0)", + " Proteoarchaeota (used: 1, children: 0)", + "Bacteria (used: 1, children: 1)", + " Archaebacteria (used: 1, children: 0)", + ] + + # SCENARIO 2: search="ar" + url = self.taxonomy_url + "?include_counts&search_term=ar&full_depth_threshold=100" + resp = self.client.get(url) + assert resp.status_code == status.HTTP_200_OK + assert pretty_format_tags(resp.data["results"], parent=False) == [ + "Archaea (used: 2, children: 2)", + " Euryarchaeida (used: 1, children: 0)", + " Proteoarchaeota (used: 1, children: 0)", + "Bacteria (used: 1, children: 1)", + " Archaebacteria (used: 1, children: 0)", + "Eukaryota (used: 3, children: 1)", + " Animalia (used: 2, children: 2)", + " Arthropoda (used: 1, children: 0)", + " Cnidaria (used: 1, children: 0)", + ] + + # SCENARIO 3: search="aE" + url = self.taxonomy_url + "?include_counts&search_term=aE&full_depth_threshold=100" + resp = self.client.get(url) + assert resp.status_code == status.HTTP_200_OK + assert 
pretty_format_tags(resp.data["results"], parent=False) == [ + "Archaea (used: 2, children: 2)", + " Euryarchaeida (used: 1, children: 0)", + " Proteoarchaeota (used: 1, children: 0)", + "Bacteria (used: 1, children: 1)", + " Archaebacteria (used: 1, children: 0)", + "Eukaryota (used: 3, children: 1)", + " Plantae (used: 1, children: 0)", + ] + + # SCENARIO 4: search="a" + url = self.taxonomy_url + "?include_counts&search_term=a&full_depth_threshold=100" + resp = self.client.get(url) + assert resp.status_code == status.HTTP_200_OK + assert pretty_format_tags(resp.data["results"], parent=False) == [ + "Archaea (used: 2, children: 3)", + " DPANN (used: 0, children: 0)", + " Euryarchaeida (used: 1, children: 0)", + " Proteoarchaeota (used: 1, children: 0)", + "Bacteria (used: 1, children: 2)", + " Archaebacteria (used: 1, children: 0)", + " Eubacteria (used: 0, children: 0)", + "Eukaryota (used: 3, children: 4)", + " Animalia (used: 2, children: 7)", + " Arthropoda (used: 1, children: 0)", + " Chordata (used: 0, children: 1)", + " Mammalia (used: 0, children: 0)", + " Cnidaria (used: 1, children: 0)", + " Ctenophora (used: 0, children: 0)", + " Gastrotrich (used: 0, children: 0)", + " Placozoa (used: 0, children: 0)", + " Porifera (used: 0, children: 0)", + " Monera (used: 0, children: 0)", + " Plantae (used: 1, children: 0)", + " Protista (used: 0, children: 0)", + ] + + def test_usage_count_sibling_and_ancestor_deduplication(self): + """ + Test deduplication when multiple children of the same parent are applied to the same object. 
+ """ + animalia = Tag.objects.create(taxonomy=self.taxonomy, value="Animalia") + arthropoda = Tag.objects.create(taxonomy=self.taxonomy, value="Arthropoda", parent=animalia) + chordata = Tag.objects.create(taxonomy=self.taxonomy, value="Chordata", parent=animalia) + + # obj1: tagged with both siblings + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=arthropoda, object_id="obj1") + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=chordata, object_id="obj1") + + # obj2: tagged with only one sibling + ObjectTag.objects.create(taxonomy=self.taxonomy, tag=arthropoda, object_id="obj2") + + self.client.force_authenticate(user=self.staff) + response = self.client.get(self.taxonomy_url + "?include_counts&full_depth_threshold=100") + results = {tag["value"]: tag for tag in response.data["results"]} + + # Arthropoda: obj1, obj2 -> 2 + assert results["Arthropoda"]["usage_count"] == 2 + # Chordata: obj1 -> 1 + assert results["Chordata"]["usage_count"] == 1 + # Animalia: obj1 (via Arthropoda/Chordata), obj2 (via Arthropoda) -> 2 + # Deduplication check: obj1 only counts as 1 for Animalia even though it has both Arthropoda and Chordata. + assert results["Animalia"]["usage_count"] == 2 + + class ImportTaxonomyMixin(TestTaxonomyViewMixin): """ Mixin to test importing taxonomies.