Skip to content

Commit e4a5bd4

Browse files
committed
Convert management command to address licensing metadata issues
1 parent c2f2777 commit e4a5bd4

3 files changed

Lines changed: 377 additions & 108 deletions

File tree

contentcuration/contentcuration/management/commands/fix_missing_import_sources.py

Lines changed: 155 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import csv
2+
import io
23
import logging
34
import time
5+
import uuid
6+
from pathlib import Path
7+
from typing import Optional
8+
from typing import Tuple
49

510
from django.core.management.base import BaseCommand
611
from django.db.models import Exists
@@ -12,17 +17,97 @@
1217

1318
from contentcuration.models import Channel
1419
from contentcuration.models import ContentNode
20+
from contentcuration.models import License
1521

1622

1723
logger = logging.getLogger(__name__)
1824

1925

26+
class LicensingFixesLookup:
    """
    Consolidates logic for reading and processing the licensing fixes from the CSV.

    Rows are indexed by ``"<channel id hex>:<kind>"``. A row with a blank ``kind`` is stored
    under ``"<channel id hex>:"`` and acts as the channel-wide fallback in ``get_info``.
    """

    def __init__(self):
        # "<channel id hex>:<kind>" -> CSV row dict; `load` overwrites the row's "license_id"
        # with the database id whenever the row names a license
        self._lookup = {}
        # License id -> License instance, for every license in the database
        self._license_lookup = {}

    def load(self, fp: io.TextIOBase):
        """
        Loads the data from the CSV file, and the necessary license data from the database

        :param fp: An open text stream of the CSV; it must have a header row with at least the
            `channel_id` column, and normally `kind`, `license_name`, `license_description`
            and `copyright_holder`
        :raises ValueError: If a row's channel_id is not a valid UUID, or if a license named
            in the CSV does not exist in the database
        """
        reader = csv.DictReader(fp)
        license_names = set()

        # create a lookup index by channel ID from the CSV data
        for row in reader:
            try:
                channel_hex = uuid.UUID(row["channel_id"]).hex
            except ValueError as err:
                # same exception type as before, but now pointing at the offending row
                raise ValueError(
                    f"Invalid channel_id {row['channel_id']!r} on CSV line {reader.line_num}"
                ) from err

            # DictReader fills the missing fields of a short row with None; treat that like
            # a blank kind so the row is still indexed as the channel-wide entry
            kind = row.get("kind") or ""
            self._lookup[f"{channel_hex}:{kind}"] = row

            license_name = row.get("license_name")
            if license_name:
                license_names.add(license_name)

        # load all licenses, regardless of whether they are named in the CSV
        license_lookup_by_name = {}
        for lic in License.objects.all():
            self._license_lookup[lic.id] = lic
            license_lookup_by_name[lic.license_name] = lic
            license_names.discard(lic.license_name)

        # ensure we've found all the licenses; anything left was not in the database
        if license_names:
            raise ValueError(f"Could not find all licenses: {license_names}")

        # we now are certain all licenses are found
        for info in self._lookup.values():
            license_name = info.get("license_name")
            if license_name:
                info["license_id"] = license_lookup_by_name[license_name].id

    def get_info(
        self,
        channel_id: str,
        kind: str,
        license_id: Optional[int],
        license_description: Optional[str],
        copyright_holder: Optional[str],
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """
        Determines the complete licensing metadata, given the current metadata, and comparing it
        with what would make the node complete.

        Existing values on the node always win; the CSV only fills in what is missing, and a
        blank CSV cell never replaces the node's current value.

        :param channel_id: The channel the node was sourced from
        :param kind: The content kind of the node
        :param license_id: The current license_id of the node
        :param license_description: The current license_description of the node
        :param copyright_holder: The current copyright_holder of the node
        :return: A tuple of (license_id, license_description, copyright_holder) to use on the
            node. The license_id is None when neither the node nor the CSV provides one. The
            inputs are returned unchanged when no CSV row matches or the license is unknown.
        """
        # first check kind-specific metadata, fallback to channel-wide (no kind)
        info = self._lookup.get(f"{channel_id}:{kind}")
        if info is None:
            info = self._lookup.get(f"{channel_id}:")

        if info is None:
            logger.warning(f"Failed to find licensing info for channel: {channel_id}")
            return license_id, license_description, copyright_holder

        resolved_license_id = license_id or info.get("license_id")
        if not resolved_license_id:
            return None, license_description, copyright_holder

        license_obj = self._license_lookup.get(resolved_license_id)
        if license_obj is None:
            # Without the license object we cannot tell which fields it requires, so hand the
            # node's metadata back untouched instead of failing the whole run
            logger.warning(
                f"Unknown license {resolved_license_id!r} for channel: {channel_id}"
            )
            return license_id, license_description, copyright_holder

        # only fill the fields this license needs, and only when the CSV has a value
        if license_obj.is_custom and not license_description:
            license_description = info.get("license_description") or license_description

        if license_obj.copyright_holder_required and not copyright_holder:
            copyright_holder = info.get("copyright_holder") or copyright_holder

        return resolved_license_id, license_description, copyright_holder
104+
105+
20106
class Command(BaseCommand):
21107
"""
22108
Audits nodes that have imported content from public channels and whether the imported content
23-
has a missing source node.
24-
25-
TODO: this does not yet FIX them
109+
has a missing source node. We've determined that pretty much all of these have incomplete
110+
licensing data
26111
"""
27112

28113
def handle(self, *args, **options):
@@ -71,32 +156,27 @@ def handle(self, *args, **options):
71156

72157
logger.info("=== Iterating over private destination channels. ===")
73158
channel_count = 0
74-
total_node_count = 0
75-
76-
with open("fix_missing_import_sources.csv", "w", newline="") as csv_file:
77-
csv_writer = csv.DictWriter(
78-
csv_file,
79-
fieldnames=[
80-
"channel_id",
81-
"channel_name",
82-
"contentnode_id",
83-
"contentnode_title",
84-
"public_channel_id",
85-
"public_channel_name",
86-
"public_channel_deleted",
87-
],
88-
)
89-
csv_writer.writeheader()
159+
total_fixed = 0
160+
lookup = LicensingFixesLookup()
161+
162+
command_dir = Path(__file__).parent
163+
csv_path = command_dir / "licensing_fixes_lookup.csv"
164+
165+
with csv_path.open("r", encoding="utf-8", newline="") as csv_file:
166+
lookup.load(csv_file)
90167

91-
for channel in destination_channels.iterator():
92-
node_count = self.handle_channel(csv_writer, channel)
168+
# skip using an iterator here, to limit transaction duration to `handle_channel`
169+
for channel in destination_channels:
170+
node_count = self.handle_channel(lookup, channel)
93171

94-
if node_count > 0:
95-
total_node_count += node_count
96-
channel_count += 1
172+
if node_count > 0:
173+
total_fixed += node_count
174+
channel_count += 1
97175

98176
logger.info("=== Done iterating over private destination channels. ===")
99-
logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
177+
logger.info(
178+
f"Fixed incomplete licensing data on {total_fixed} nodes across {channel_count} channels."
179+
)
100180
logger.info(f"Finished in {time.time() - start}")
101181

102182
def get_public_cte(self) -> With:
@@ -110,7 +190,15 @@ def get_public_cte(self) -> With:
110190
name="public_cte",
111191
)
112192

113-
def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
193+
def handle_channel(self, lookup: LicensingFixesLookup, channel: dict) -> int:
194+
"""
195+
Goes through the nodes of the channel, that were imported from public channels, but no
196+
longer have a valid source node. For each node, it applies license metadata as necessary
197+
198+
:param lookup: The lookup utility to pull licensing data from
199+
:param channel: The channel to fix
200+
:return: The total node count that are now marked complete as a result of the fixes
201+
"""
114202
public_cte = self.get_public_cte()
115203
channel_id = channel["id"]
116204
channel_name = channel["name"]
@@ -136,29 +224,51 @@ def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
136224
)
137225
)
138226
)
139-
.values(
140-
"public_channel_id",
141-
"public_channel_name",
142-
"public_channel_deleted",
143-
contentnode_id=F("id"),
144-
contentnode_title=F("title"),
145-
)
146227
)
147228

148229
# Count and log results
149230
node_count = missing_source_nodes.count()
231+
processed = 0
232+
was_complete = 0
233+
unfixed = 0
234+
now_complete = 0
150235

151-
# TODO: this will be replaced with logic to correct the missing source nodes
152-
if node_count > 0:
236+
def _log():
153237
logger.info(
154-
f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
238+
f"Fixing {channel_id}:{channel_name}\ttotal: {node_count}; before: {was_complete} unfixed: {unfixed}; after: {now_complete};"
155239
)
156-
row_dict = {
157-
"channel_id": channel_id,
158-
"channel_name": channel_name,
159-
}
160-
for node_dict in missing_source_nodes.iterator():
161-
row_dict.update(node_dict)
162-
csv_writer.writerow(row_dict)
163-
164-
return node_count
240+
241+
if node_count > 0:
242+
for node in missing_source_nodes.iterator():
243+
# determine the new license metadata
244+
license_id, license_description, copyright_holder = lookup.get_info(
245+
node.original_channel_id,
246+
node.kind,
247+
node.license_id,
248+
node.license_description,
249+
node.copyright_holder,
250+
)
251+
252+
# if there isn't a license, there's nothing to do
253+
if not license_id:
254+
unfixed += 1
255+
# cannot fix
256+
continue
257+
258+
if node.complete:
259+
was_complete += 1
260+
261+
# apply updates
262+
node.license_id = license_id
263+
node.license_description = license_description
264+
node.copyright_holder = copyright_holder
265+
if not node.mark_complete():
266+
now_complete += 1
267+
node.save()
268+
processed += 1
269+
if processed % 100 == 0:
270+
_log()
271+
272+
_log()
273+
274+
return now_complete - was_complete
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
channel_id,channel_name,kind,license_id,license_name,license_description,copyright_holder
2+
f9d3e0e4-6ea2-5789-bbed-672ff6a399ed,African Storybook Library (multiple languages),,,CC BY,,African Storybook Initiative
3+
d0ef6f71-e4fe-4e54-bb87-d7dab5eeaae2,Be Strong: Internet safety resources,,,CC BY-NC-ND,,Vodafone
4+
2d7b056d-668a-58ee-9244-ccf76108cbdb,Book Dash,,,CC BY,,http://bookdash.org/
5+
922e9c57-6c2f-59e5-9389-142b136308ff,Career Girls,,,Special Permissions,For use on Kolibri,Career Girls
6+
da53f90b-1be2-5752-a046-82bbc353659f,Ciencia NASA,,,CC BY,,NASA
7+
0294a064-f722-4899-887c-e07bd47f9991,Citoyennes de la Terre,,,CC BY,,Florence Piron
8+
604ad3b8-5d84-4dd8-9ee7-0fa12a9a5a6e,CREE+,,,CC BY-NC-SA,,"Publicado por el Lic. Edelberto Andino(edelberto.andino.ea@gmail.com) para ser utilizado con fines educativos únicamente, no debe ser utilizado con fines lucrativos de ninguna índole."
9+
ef2ead65-de76-4ea4-a27b-ba6df5282c74,CSpathshala - सीएसपाठशाला (हिंदी),,,CC BY,,ए सि एम् इंडिया
10+
7e68bc59-d430-4e71-8a07-50b1b87125ad,Cultura Emprendedora,,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
11+
c51a0f84-2fed-427c-95ac-ff9bb4a21e3c,EENET Inclusive Education Training Materials,,,CC BY-NC-SA,,Enabling Education Network (EENET)
12+
0e173fca-6e90-52f8-a474-a2fb84055faf,Global Digital Library - Book Catalog,,,CC BY,,Enabling Writers Initiative
13+
624e09bb-5eeb-4d20-aa8d-e62e7b4778a0,How to get started with Kolibri,,,CC BY-NC,,Learning Equality
14+
378cf412-8c85-4c27-95c1-00b5aca7a3ed,Inclusive Home Learning Activities,,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",EENET – Enabling Education Network
15+
d76da4d3-6cfd-5927-9b57-5dfc6017aa13,Kamkalima (العربيّة),,,CC BY-NC-ND,,Kamkalima
16+
2fd54ca4-7a8f-59c9-9fce-faaa3894c19e,Khan Academy (English - CBSE India Curriculum),video,,CC BY-NC-SA,,Khan Academy
17+
2fd54ca4-7a8f-59c9-9fce-faaa3894c19e,Khan Academy (English - CBSE India Curriculum),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
18+
c9d7f950-ab6b-5a11-99e3-d6c10d7f0103,Khan Academy (English - US curriculum),video,,CC BY-NC-SA,,Khan Academy
19+
c9d7f950-ab6b-5a11-99e3-d6c10d7f0103,Khan Academy (English - US curriculum),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
20+
c1f2b7e6-ac9f-56a2-bb44-fa7a48b66dce,Khan Academy (Español),video,,CC BY-NC-SA,,Khan Academy
21+
c1f2b7e6-ac9f-56a2-bb44-fa7a48b66dce,Khan Academy (Español),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
22+
878ec2e6-f88c-5c26-8b1b-e6f202833cd4,Khan Academy (Français),video,,CC BY-NC-SA,,Khan Academy
23+
878ec2e6-f88c-5c26-8b1b-e6f202833cd4,Khan Academy (Français),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
24+
801a5f02-9420-5569-8918-edcff6494185,Khan Academy (Italiano),video,,CC BY-NC-SA,,Khan Academy
25+
801a5f02-9420-5569-8918-edcff6494185,Khan Academy (Italiano),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
26+
ec164fee-25ee-5262-96e6-8f7c10b1e169,Khan Academy (Kiswahili),video,,CC BY-NC-SA,,Khan Academy
27+
ec164fee-25ee-5262-96e6-8f7c10b1e169,Khan Academy (Kiswahili),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
28+
2ac071c4-6723-54f2-aa78-953448f81e50,Khan Academy (Português - Brasil),video,,CC BY-NC-SA,,Khan Academy
29+
2ac071c4-6723-54f2-aa78-953448f81e50,Khan Academy (Português - Brasil),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
30+
c3231d84-4f8d-5bb1-b4cb-c6a7ddd91eb7,Khan Academy (Português (Portugal)),video,,CC BY-NC-SA,,Khan Academy
31+
c3231d84-4f8d-5bb1-b4cb-c6a7ddd91eb7,Khan Academy (Português (Portugal)),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
32+
09ee940e-1069-53a2-b671-6e1020a0ce3f,Khan Academy (български език),video,,CC BY-NC-SA,,Khan Academy
33+
09ee940e-1069-53a2-b671-6e1020a0ce3f,Khan Academy (български език),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
34+
a53592c9-72a8-594e-9b69-5aa127493ff6,Khan Academy (हिन्दी),video,,CC BY-NC-SA,,Khan Academy
35+
a53592c9-72a8-594e-9b69-5aa127493ff6,Khan Academy (हिन्दी),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
36+
a03496a6-de09-5e7b-a9d2-4291a487c78d,Khan Academy (বাংলা),video,,CC BY-NC-SA,,Khan Academy
37+
a03496a6-de09-5e7b-a9d2-4291a487c78d,Khan Academy (বাংলা),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
38+
5357e525-81c3-567d-a4f5-6d56badfeac7,Khan Academy (ગુજરાતી),video,,CC BY-NC-SA,,Khan Academy
39+
5357e525-81c3-567d-a4f5-6d56badfeac7,Khan Academy (ગુજરાતી),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
40+
2b608c6f-d4c3-5c34-b738-7e3dd7b53265,Khan Academy (ဗမာစာ),video,,CC BY-NC-SA,,Khan Academy
41+
2b608c6f-d4c3-5c34-b738-7e3dd7b53265,Khan Academy (ဗမာစာ),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
42+
f5b71417-b1f6-57fc-a4d1-aaecd23e4067,Khan Academy (ភាសាខ្មែរ),video,,CC BY-NC-SA,,Khan Academy
43+
f5b71417-b1f6-57fc-a4d1-aaecd23e4067,Khan Academy (ភាសាខ្មែរ),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
44+
ec599e77-f9ad-5802-8975-e8a26e6f1821,Khan Academy (中文(中国)),video,,CC BY-NC-SA,,Khan Academy
45+
ec599e77-f9ad-5802-8975-e8a26e6f1821,Khan Academy (中文(中国)),exercise,,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
46+
913efe9f-14c6-5cb1-b234-02f21f056e99,MIT Blossoms,,,CC BY-NC-SA,,MIT Blossoms
47+
fc47aee8-2e01-53e2-a301-97d3fdee1128,Open Stax,,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Rice University
48+
b8bd7770-063d-40a8-bd9b-30d4703927b5,PBS SoCal: Family Math,,,All Rights Reserved,,PBS SoCal
49+
197934f1-4430-5350-b582-0c7c4dd8e194,PhET Interactive Simulations,,,CC BY,,"PhET Interactive Simulations, University of Colorado Boulder"
50+
aa254505-59b5-5bd7-9bc9-0c09dfb805d2,PhET simulações interativas,,,CC BY,,"PhET Interactive Simulations, University of Colorado Boulder"
51+
889f0c34-b275-507a-b8d3-7d2da2d03aa9,PhET – інтерактивне моделювання,,,CC BY,,"PhET Interactive Simulations, University of Colorado Boulder"
52+
f6cb302e-f659-4db4-b4a0-4b4991a595c2,Plan Educativo TIC Basico,,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
53+
e832106c-6398-54e1-8161-6015a8b87910,PraDigi,,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",PraDigi
54+
131e543d-becf-5776-bb13-cfcfddf05605,Pratham Books' StoryWeaver,,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Pratham Books
55+
f758ac6a-d39c-452f-9566-58da6ad7d3cc,Project Based Learning with Kolibri,,,CC BY,,Learning Equality
56+
305b12ea-5ea8-4fa1-8f93-3705c23f5ee0,School of Thought,,,CC BY,,School of Thought
57+
3e464ee1-2f6a-50a7-81cd-df59147b48b1,Sikana (English),,,CC BY-NC-ND,,Sikana Education
58+
30c71c99-c42c-57d1-81e8-aeafd2e15e5f,Sikana (Español),,,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
59+
8ef625db-6e86-506c-9a3b-ac891e413fff,Sikana (Français),,,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
60+
f4715a77-6972-5c72-9d25-d29977b8b308,Similasyon Enteraktif PhET,,,CC BY,,"PhET Interactive Simulations, University of Colorado Boulder"
61+
8fa678af-1dd0-5329-bf32-18c549b84996,Simulaciones interactivas PhET,,,CC BY,,"PhET Interactive Simulations, University of Colorado Boulder"
62+
a9b25ac9-8147-42c8-83ce-1b0579448337,TESSA - Teacher Resources,,,CC BY-NC-SA,,Open University
63+
74f36493-bb47-5b62-935f-a8705ed59fed,Thoughtful Learning,,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Thoughtful Learning
64+
000409f8-1dbe-5d1b-a671-01cb9fed4530,Touchable Earth (en),,,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
65+
b336c2e2-c45c-53d5-b24e-5c476a54b077,Touchable Earth (fr),,,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
66+
08a53136-a155-5f64-b049-6b3e1318b0cd,Ubongo Kids,,,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Ubongo Media
67+
237e5975-bce2-5bf6-aff3-98f4c17516f3,,,,,,

0 commit comments

Comments
 (0)