Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
0473c0e
feat: #253 new branch to reduce noise; adding BE logic to implement c…
tbain Mar 18, 2026
2c0b959
feat: #253 Fixing API tests with regard to count logic changes
tbain Mar 23, 2026
8846805
Merge branch 'main' of https://github.com/openedx/openedx-core into t…
tbain Mar 25, 2026
b01964b
feat: #253 Resolving merge conflict with upstream main branch
tbain Mar 26, 2026
a23afe8
feat: #253 Fixing pylint issues
tbain Mar 26, 2026
3df68ab
feat: #253 Fixing pycodestyle issue
tbain Mar 26, 2026
435808c
feat: #253 Fixing pycodestyle issue
tbain Mar 26, 2026
457313b
feat: #253 Addressing first round Code review comments
tbain Mar 27, 2026
a14c56e
feat: #253 fixing count depth issue and updating appropriate unit tests
tbain Mar 27, 2026
2055a07
feat: #253 fixing spelling errors in comments
tbain Mar 27, 2026
939f18c
feat: #253 Fixing code review comments; fix incorrect unit test & fil…
tbain Mar 30, 2026
5762c33
feat: #253 adjusting comments per code review feedback
tbain Apr 1, 2026
79be83a
Merge branch 'main' of https://github.com/openedx/openedx-core into t…
tbain Apr 1, 2026
c2f79d2
feat: #253 fixing unit tests to work with upstream updates
tbain Apr 1, 2026
c017e8a
feat: #253 Changing usage_count to being in-mem/python based instead …
tbain Apr 3, 2026
7d42793
feat: #253 Fixing code quality pipeline issues
tbain Apr 3, 2026
1e47967
feat: #253 Moving usage_count logic out to API level, cleaning up/add…
tbain Apr 8, 2026
a87c877
Merge branch 'main' of https://github.com/openedx/openedx-core into t…
tbain Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 67 additions & 22 deletions src/openedx_tagging/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from __future__ import annotations

import logging
import operator
import re
from functools import reduce
from typing import List, Self

from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import F, Value
from django.db.models.functions import Concat, Length, Replace, Substr
from django.db.models import Count, F, IntegerField, Q, Subquery, Value
from django.db.models.functions import Coalesce, Concat, Length, Replace, Substr
from django.utils.functional import cached_property
from django.utils.module_loading import import_string
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -539,16 +541,7 @@ def _get_filtered_tags_one_level(
qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
qs = qs.order_by("value")
if include_counts:
# We need to include the count of how many times this tag is used to tag objects.
# You'd think we could just use:
# qs = qs.annotate(usage_count=models.Count("objecttag__pk"))
# but that adds another join which starts creating a cross product and the children and usage_count become
# intertwined and multiplied with each other. So we use a subquery.
obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F('id'), function='Count')
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count')))
qs = self.add_counts_query(qs)
return qs # type: ignore[return-value]

def _get_filtered_tags_deep(
Expand Down Expand Up @@ -644,18 +637,70 @@ def _get_filtered_tags_deep(
# ordering by it gives the tree sort order that we want.
qs = qs.order_by("lineage")
if include_counts:
# Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
obj_tags = (
ObjectTag.objects.filter(tag_id=models.OuterRef("pk"))
.order_by()
.annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F("id"), function="Count")
)
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count")))
qs = self.add_counts_query(qs)

return qs # type: ignore[return-value]

def add_counts_query(self, qs: models.QuerySet):
"""
Adds a subquery to the passed-in queryset that returns the usage_count
for a given tag, or the appropriate count with de-deuplication per Object
for the parents of a used child tag
:param qs: The QuerySet to annotate with usage counts.
:return: the queryset annotated with the usage counts
"""
# Adds a subquery to the passed-in queryset that returns the number
# of times a tag has been used.
#
# Note: The count is not a simple count, we need to do a 'roll up'
# where we count the number of times a tag is directly used and applied,
# but then that also needs to add a "1" count to the lineage tags
# (parent, grandparent, etc.), but de-duplicate counts for any children
# so that if we have "2" child tags, it only counts towards "1" for the
# parent.
# This query gets the raw counts for each tag usage, gets the distinct
# usages (so de-duplicates counts) by actual application to an "Object"
# (library, course, course module, course section, etc.), which creates
# a count per tag, annotated to that particular tag from the passed-in
# queryset.

# Since Depth may change depending on the value of TAXONOMY_MAX_DEPTH, dynamically
# build a list of lineage paths to be used in the query, so we're not hard coding to
# a certain number of levels. This will build an array containing something like:
# ['tag_id', 'tag__parent_id', 'tag__parent__parent_id', 'tag__parent__parent__parent_id', ...]
lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]

# Combine the above-built lineage with a Q query against the OuterRef("pk"),
lineage_query_list = [Q(**{path: models.OuterRef("pk")}) for path in lineage_paths]

usage_count_qs = ObjectTag.objects.filter(
# Combine the logic built above with an or operator to flesh out a
# lineage query of the form:
# ```
# Q(tag_id=OuterRef('pk')) |
# Q(tag__parent_id=OuterRef('pk')) |
# Q(tag__parent__parent_id=OuterRef('pk')) |
# ...
# ```
# Previously the above was hard coded and needed to be changed with every
# change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
# (Thanks Google for helping me build this)

reduce(operator.or_, lineage_query_list)
).values('object_id').distinct().annotate(
intermediate_grouping=Value(1, output_field=IntegerField())
).values('intermediate_grouping').annotate(
total_usage=Count('object_id', distinct=True)
).values('total_usage')

qs = qs.annotate(
usage_count=Coalesce(
Subquery(usage_count_qs, output_field=IntegerField()),
0 # Coalesce ensures we return 0 instead of None if there are no usages
)
)
return qs

def add_tag(
self,
tag_value: str,
Expand Down
22 changes: 11 additions & 11 deletions tests/openedx_tagging/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,43 +755,43 @@ def get_object_tags():
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "cha" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "cha" but a child does
" Archaebacteria (used: 1, children: 0)",
]),
("ar", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ar" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ar" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1 + 2)",
" Animalia (used: 1, children: 2)", # does not contain "ar" but a child does
"Eukaryota (used: 6, children: 1 + 2)",
" Animalia (used: 4, children: 2)", # does not contain "ar" but a child does
" Arthropoda (used: 1, children: 0)",
" Cnidaria (used: 0, children: 0)",
]),
("aE", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ae" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ae" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1)", # does not contain "ae" but a child does
"Eukaryota (used: 6, children: 1)", # does not contain "ae" but a child does
" Plantae (used: 1, children: 0)",
]),
("a", [
"Archaea (used: 1, children: 3)",
" DPANN (used: 0, children: 0)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 2)",
"Bacteria (used: 1, children: 2)",
" Archaebacteria (used: 1, children: 0)",
" Eubacteria (used: 0, children: 0)",
"Eukaryota (used: 0, children: 4 + 8)",
" Animalia (used: 1, children: 7 + 1)",
"Eukaryota (used: 6, children: 4 + 8)",
" Animalia (used: 4, children: 7 + 1)",
" Arthropoda (used: 1, children: 0)",
" Chordata (used: 0, children: 1)",
" Chordata (used: 0, children: 1)", # <<< Chordata has a matching child but we only support searching
" Mammalia (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)", # 3 levels deep at once for now.
" Ctenophora (used: 0, children: 0)",
" Gastrotrich (used: 1, children: 0)",
" Placozoa (used: 1, children: 0)",
Expand Down
Loading
Loading