Skip to content

Commit 60ded2b

Browse files
committed
Convert management command to address licensing metadata issues
1 parent 050e85b commit 60ded2b

File tree

3 files changed

+350
-108
lines changed

3 files changed

+350
-108
lines changed

contentcuration/contentcuration/management/commands/fix_missing_import_sources.py

Lines changed: 147 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import csv
2+
import io
23
import logging
34
import time
5+
import uuid
6+
from pathlib import Path
7+
from typing import Optional
8+
from typing import Tuple
49

510
from django.core.management.base import BaseCommand
611
from django.db.models import Exists
@@ -12,17 +17,90 @@
1217

1318
from contentcuration.models import Channel
1419
from contentcuration.models import ContentNode
20+
from contentcuration.models import License
1521

1622

1723
logger = logging.getLogger(__name__)
1824

1925

26+
class LicensingFixesLookup(object):
    """
    Consolidates logic for reading and processing the licensing fixes from the CSV.

    Call `load` once to populate the lookup, then call `get_info` for each node to
    determine the licensing metadata that should be applied to it.
    """

    def __init__(self):
        # CSV rows, keyed by the hex form of the channel's UUID
        self._lookup = {}
        # License objects from the database, keyed by license ID
        self._license_lookup = {}

    def load(self, fp: io.TextIOWrapper):
        """
        Loads the data from the CSV file, and the necessary license data from the database

        :param fp: An open text handle to the CSV; rows are read with `csv.DictReader` and must
            provide at least the `channel_id` and `license_name` columns
        :raises ValueError: If a `channel_id` is not a valid UUID, or if a license named in the
            CSV does not exist in the database
        """
        reader = csv.DictReader(fp)
        license_names = set()

        # create a lookup index by channel ID from the CSV data
        for row in reader:
            # normalize to hex, since the CSV may hold the dashed UUID form
            self._lookup[uuid.UUID(row["channel_id"]).hex] = row
            if row["license_name"]:
                license_names.add(row["license_name"])

        # load all licenses, regardless of whether they are named in the CSV
        license_lookup_by_name = {}
        for lic in License.objects.all():
            self._license_lookup[lic.id] = lic
            license_lookup_by_name[lic.license_name] = lic
            # whatever remains in the set afterwards was not found in the database
            license_names.discard(lic.license_name)

        # ensure we've found all the licenses
        if license_names:
            raise ValueError(f"Could not find all licenses: {license_names}")

        # we now are certain all licenses are found
        for info in self._lookup.values():
            if info["license_name"]:
                info["license_id"] = license_lookup_by_name[info["license_name"]].id

    def get_info(
        self,
        channel_id: str,
        license_id: Optional[int],
        license_description: Optional[str],
        copyright_holder: Optional[str],
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """
        Determines the complete licensing metadata, given the current metadata, and comparing it
        with what would make the node complete.

        Existing (truthy) values on the node are never overwritten; the CSV data is only used
        to fill in what is missing.

        :param channel_id: The channel the node was sourced from
        :param license_id: The current license_id of the node
        :param license_description: The current license_description of the node
        :param copyright_holder: The current copyright_holder of the node
        :return: A tuple of (license_id, license_description, copyright_holder) to use on the node.
            The license_id is None when neither the node nor the CSV provides a license. The
            current values are returned unchanged when the channel or the license is unknown.
        """
        info = self._lookup.get(channel_id)
        if info is None:
            logger.warning(f"Failed to find licensing info for channel: {channel_id}")
            return license_id, license_description, copyright_holder

        # prefer the node's own license, falling back to the one resolved from the CSV
        resolved_license_id = license_id or info.get("license_id")

        if not resolved_license_id:
            return None, license_description, copyright_holder

        license_obj = self._license_lookup.get(resolved_license_id)
        if license_obj is None:
            # Without the license object we cannot tell which fields it requires, so leave the
            # node's metadata untouched instead of failing on an attribute access
            logger.warning(
                f"Failed to find license {resolved_license_id} for channel: {channel_id}"
            )
            return license_id, license_description, copyright_holder

        if license_obj.is_custom and not license_description:
            license_description = info["license_description"]

        if license_obj.copyright_holder_required and not copyright_holder:
            copyright_holder = info["copyright_holder"]

        return resolved_license_id, license_description, copyright_holder
97+
98+
2099
class Command(BaseCommand):
21100
"""
22101
Audits nodes that have imported content from public channels and whether the imported content
23-
has a missing source node.
24-
25-
TODO: this does not yet FIX them
102+
has a missing source node. We've determined that pretty much all of these have incomplete
103+
licensing data
26104
"""
27105

28106
def handle(self, *args, **options):
@@ -71,32 +149,27 @@ def handle(self, *args, **options):
71149

72150
logger.info("=== Iterating over private destination channels. ===")
73151
channel_count = 0
74-
total_node_count = 0
75-
76-
with open("fix_missing_import_sources.csv", "w", newline="") as csv_file:
77-
csv_writer = csv.DictWriter(
78-
csv_file,
79-
fieldnames=[
80-
"channel_id",
81-
"channel_name",
82-
"contentnode_id",
83-
"contentnode_title",
84-
"public_channel_id",
85-
"public_channel_name",
86-
"public_channel_deleted",
87-
],
88-
)
89-
csv_writer.writeheader()
152+
total_fixed = 0
153+
lookup = LicensingFixesLookup()
154+
155+
command_dir = Path(__file__).parent
156+
csv_path = command_dir / "licensing_fixes_lookup.csv"
157+
158+
with csv_path.open("r", encoding="utf-8", newline="") as csv_file:
159+
lookup.load(csv_file)
90160

91-
for channel in destination_channels.iterator():
92-
node_count = self.handle_channel(csv_writer, channel)
161+
# skip using an iterator here, to limit transaction duration to `handle_channel`
162+
for channel in destination_channels:
163+
node_count = self.handle_channel(lookup, channel)
93164

94-
if node_count > 0:
95-
total_node_count += node_count
96-
channel_count += 1
165+
if node_count > 0:
166+
total_fixed += node_count
167+
channel_count += 1
97168

98169
logger.info("=== Done iterating over private destination channels. ===")
99-
logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
170+
logger.info(
171+
f"Fixed incomplete licensing data on {total_fixed} nodes across {channel_count} channels."
172+
)
100173
logger.info(f"Finished in {time.time() - start}")
101174

102175
def get_public_cte(self) -> With:
@@ -110,7 +183,15 @@ def get_public_cte(self) -> With:
110183
name="public_cte",
111184
)
112185

113-
def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
186+
def handle_channel(self, lookup: LicensingFixesLookup, channel: dict) -> int:
187+
"""
188+
Goes through the nodes of the channel, that were imported from public channels, but no
189+
longer have a valid source node. For each node, it applies license metadata as necessary
190+
191+
:param lookup: The lookup utility to pull licensing data from
192+
:param channel: The channel to fix
193+
:return: The total node count that are now marked complete as a result of the fixes
194+
"""
114195
public_cte = self.get_public_cte()
115196
channel_id = channel["id"]
116197
channel_name = channel["name"]
@@ -136,29 +217,50 @@ def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
136217
)
137218
)
138219
)
139-
.values(
140-
"public_channel_id",
141-
"public_channel_name",
142-
"public_channel_deleted",
143-
contentnode_id=F("id"),
144-
contentnode_title=F("title"),
145-
)
146220
)
147221

148222
# Count and log results
149223
node_count = missing_source_nodes.count()
224+
processed = 0
225+
was_complete = 0
226+
unfixed = 0
227+
now_complete = 0
150228

151-
# TODO: this will be replaced with logic to correct the missing source nodes
152-
if node_count > 0:
229+
def _log():
153230
logger.info(
154-
f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
231+
f"Fixing {channel_id}:{channel_name}\ttotal: {node_count}; before: {was_complete} unfixed: {unfixed}; after: {now_complete};"
155232
)
156-
row_dict = {
157-
"channel_id": channel_id,
158-
"channel_name": channel_name,
159-
}
160-
for node_dict in missing_source_nodes.iterator():
161-
row_dict.update(node_dict)
162-
csv_writer.writerow(row_dict)
163-
164-
return node_count
233+
234+
if node_count > 0:
235+
for node in missing_source_nodes.iterator():
236+
# determine the new license metadata
237+
license_id, license_description, copyright_holder = lookup.get_info(
238+
node.original_channel_id,
239+
node.license_id,
240+
node.license_description,
241+
node.copyright_holder,
242+
)
243+
244+
# if there isn't a license, there's nothing to do
245+
if not license_id:
246+
unfixed += 1
247+
# cannot fix
248+
continue
249+
250+
if node.complete:
251+
was_complete += 1
252+
253+
# apply updates
254+
node.license_id = license_id
255+
node.license_description = license_description
256+
node.copyright_holder = copyright_holder
257+
if not node.mark_complete():
258+
now_complete += 1
259+
node.save()
260+
processed += 1
261+
if processed % 100 == 0:
262+
_log()
263+
264+
_log()
265+
266+
return now_complete - was_complete
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
channel_id,channel_name,license_id,license_name,license_description,copyright_holder
2+
f9d3e0e4-6ea2-5789-bbed-672ff6a399ed,African Storybook Library (multiple languages),,CC BY,"",African Storybook Initiative
3+
d0ef6f71-e4fe-4e54-bb87-d7dab5eeaae2,Be Strong: Internet safety resources,,CC BY-NC-ND,"",Vodafone
4+
2d7b056d-668a-58ee-9244-ccf76108cbdb,Book Dash,,CC BY,"",http://bookdash.org/
5+
922e9c57-6c2f-59e5-9389-142b136308ff,Career Girls,,Special Permissions,For use on Kolibri,Career Girls
6+
da53f90b-1be2-5752-a046-82bbc353659f,Ciencia NASA,,,,""
7+
0294a064-f722-4899-887c-e07bd47f9991,Citoyennes de la Terre,,CC BY,"",Florence Piron
8+
604ad3b8-5d84-4dd8-9ee7-0fa12a9a5a6e,CREE+,,CC BY-NC-SA,"","Publicado por el Lic. Edelberto Andino(edelberto.andino.ea@gmail.com) para ser utilizado con fines educativos únicamente, no debe ser utilizado con fines lucrativos de ninguna índole."
9+
ef2ead65-de76-4ea4-a27b-ba6df5282c74,CSpathshala - सीएसपाठशाला (हिंदी),,CC BY,"",ए सि एम् इंडिया
10+
7e68bc59-d430-4e71-8a07-50b1b87125ad,Cultura Emprendedora,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
11+
c51a0f84-2fed-427c-95ac-ff9bb4a21e3c,EENET Inclusive Education Training Materials,,CC BY-NC-SA,"",Enabling Education Network (EENET)
12+
0e173fca-6e90-52f8-a474-a2fb84055faf,Global Digital Library - Book Catalog,,CC BY,"",Enabling Writers Initiative
13+
624e09bb-5eeb-4d20-aa8d-e62e7b4778a0,How to get started with Kolibri,,CC BY-NC,"",Learning Equality
14+
378cf412-8c85-4c27-95c1-00b5aca7a3ed,Inclusive Home Learning Activities,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",EENET – Enabling Education Network
15+
d76da4d3-6cfd-5927-9b57-5dfc6017aa13,Kamkalima (العربيّة),,CC BY-NC-ND,"",Kamkalima
16+
2fd54ca4-7a8f-59c9-9fce-faaa3894c19e,Khan Academy (English - CBSE India Curriculum),,CC BY-NC-SA,"",Khan Academy
17+
c9d7f950-ab6b-5a11-99e3-d6c10d7f0103,Khan Academy (English - US curriculum),,CC BY-NC-SA,"",Khan Academy
18+
c1f2b7e6-ac9f-56a2-bb44-fa7a48b66dce,Khan Academy (Español),,CC BY-NC-SA,"",Khan Academy
19+
878ec2e6-f88c-5c26-8b1b-e6f202833cd4,Khan Academy (Français),,CC BY-NC-SA,"",Khan Academy
20+
801a5f02-9420-5569-8918-edcff6494185,Khan Academy (Italiano),,CC BY-NC-SA,"",Khan Academy
21+
ec164fee-25ee-5262-96e6-8f7c10b1e169,Khan Academy (Kiswahili),,CC BY-NC-SA,"",Khan Academy
22+
2ac071c4-6723-54f2-aa78-953448f81e50,Khan Academy (Português - Brasil),,CC BY-NC-SA,"",Khan Academy
23+
c3231d84-4f8d-5bb1-b4cb-c6a7ddd91eb7,Khan Academy (Português (Portugal)),,CC BY-NC-SA,"",Khan Academy
24+
09ee940e-1069-53a2-b671-6e1020a0ce3f,Khan Academy (български език),,CC BY-NC-SA,"",Khan Academy
25+
a53592c9-72a8-594e-9b69-5aa127493ff6,Khan Academy (हिन्दी),,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
26+
a03496a6-de09-5e7b-a9d2-4291a487c78d,Khan Academy (বাংলা),,CC BY-NC-SA,"",Khan Academy
27+
5357e525-81c3-567d-a4f5-6d56badfeac7,Khan Academy (ગુજરાતી),,CC BY-NC-SA,"",Khan Academy
28+
2b608c6f-d4c3-5c34-b738-7e3dd7b53265,Khan Academy (ဗမာစာ),,CC BY-NC-SA,"",Khan Academy
29+
f5b71417-b1f6-57fc-a4d1-aaecd23e4067,Khan Academy (ភាសាខ្មែរ),,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
30+
ec599e77-f9ad-5802-8975-e8a26e6f1821,Khan Academy (中文(中国)),,CC BY-NC-SA,"",Khan Academy
31+
913efe9f-14c6-5cb1-b234-02f21f056e99,MIT Blossoms,,CC BY-NC-SA,"",MIT Blossoms
32+
fc47aee8-2e01-53e2-a301-97d3fdee1128,Open Stax,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Rice University
33+
b8bd7770-063d-40a8-bd9b-30d4703927b5,PBS SoCal: Family Math,,All Rights Reserved,"",PBS SoCal
34+
197934f1-4430-5350-b582-0c7c4dd8e194,PhET Interactive Simulations,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
35+
aa254505-59b5-5bd7-9bc9-0c09dfb805d2,PhET simulações interativas,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
36+
889f0c34-b275-507a-b8d3-7d2da2d03aa9,PhET – інтерактивне моделювання,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
37+
f6cb302e-f659-4db4-b4a0-4b4991a595c2,Plan Educativo TIC Basico,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
38+
e832106c-6398-54e1-8161-6015a8b87910,PraDigi,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",PraDigi
39+
131e543d-becf-5776-bb13-cfcfddf05605,Pratham Books' StoryWeaver,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Pratham Books
40+
f758ac6a-d39c-452f-9566-58da6ad7d3cc,Project Based Learning with Kolibri,,,"",""
41+
305b12ea-5ea8-4fa1-8f93-3705c23f5ee0,School of Thought,,CC BY,"",School of Thought
42+
3e464ee1-2f6a-50a7-81cd-df59147b48b1,Sikana (English),,CC BY-NC-ND,"",Sikana Education
43+
30c71c99-c42c-57d1-81e8-aeafd2e15e5f,Sikana (Español),,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
44+
8ef625db-6e86-506c-9a3b-ac891e413fff,Sikana (Français),,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
45+
f4715a77-6972-5c72-9d25-d29977b8b308,Similasyon Enteraktif PhET,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
46+
8fa678af-1dd0-5329-bf32-18c549b84996,Simulaciones interactivas PhET,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
47+
a9b25ac9-8147-42c8-83ce-1b0579448337,TESSA - Teacher Resources,,CC BY-NC-SA,"",Open University
48+
74f36493-bb47-5b62-935f-a8705ed59fed,Thoughtful Learning,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Thoughtful Learning
49+
000409f8-1dbe-5d1b-a671-01cb9fed4530,Touchable Earth (en),,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
50+
b336c2e2-c45c-53d5-b24e-5c476a54b077,Touchable Earth (fr),,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
51+
08a53136-a155-5f64-b049-6b3e1318b0cd,Ubongo Kids,,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Ubongo Media
52+
237e5975-bce2-5bf6-aff3-98f4c17516f3,,,,,

0 commit comments

Comments
 (0)