Skip to content

Commit daedef4

Browse files
committed
Fix static analysis issues and handle 400 errors as end-of-data
1 parent f40dbf4 commit daedef4

1 file changed

Lines changed: 49 additions & 23 deletions

File tree

scripts/1-fetch/doaj_fetch.py

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
Focus: Journal-level CC license adoption and temporal trends.
66
Note: Articles do not contain license information in the DOAJ API.
77
8-
This script focuses on essential data for quantifying Creative Commons adoption:
8+
This script focuses on essential data for quantifying Creative Commons
9+
adoption:
910
- Journal CC license counts by type
1011
- Temporal trends (year-by-year adoption)
1112
@@ -63,8 +64,12 @@
6364

6465
# File Paths
6566
FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv")
66-
FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv")
67-
FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv")
67+
FILE_DOAJ_COUNTRY = shared.path_join(
68+
PATHS["data_1-fetch"], "doaj_3_count_by_country.csv"
69+
)
70+
FILE_DOAJ_LANGUAGE = shared.path_join(
71+
PATHS["data_1-fetch"], "doaj_5_count_by_language.csv"
72+
)
6873
FILE_PROVENANCE = shared.path_join(
6974
PATHS["data_1-fetch"], "doaj_provenance.yaml"
7075
)
@@ -75,7 +80,12 @@
7580
# CSV Headers
7681
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
7782
HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"]
78-
HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"]
83+
HEADER_LANGUAGE = [
84+
"TOOL_IDENTIFIER",
85+
"LANGUAGE_CODE",
86+
"LANGUAGE_NAME",
87+
"COUNT",
88+
]
7989
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
8090

8191
# Runtime variables
@@ -165,13 +175,13 @@ def extract_license_types(license_info):
165175
"""Extract all CC license types from DOAJ license information."""
166176
if not license_info:
167177
return []
168-
178+
169179
cc_licenses = []
170180
for lic in license_info:
171181
lic_type = lic.get("type", "")
172182
if lic_type in CC_LICENSE_TYPES:
173183
cc_licenses.append(lic_type)
174-
184+
175185
return cc_licenses
176186

177187

@@ -183,7 +193,9 @@ def process_journals(session, args):
183193
country_counts = defaultdict(Counter)
184194
language_counts = defaultdict(Counter)
185195
year_counts = defaultdict(Counter)
186-
processed_journals = set() # Track unique journals to avoid double counting
196+
processed_journals = (
197+
set()
198+
) # Track unique journals to avoid double counting
187199

188200
total_processed = 0
189201
page = 1
@@ -200,6 +212,10 @@ def process_journals(session, args):
200212
response.raise_for_status()
201213
data = response.json()
202214
except requests.HTTPError as e:
215+
# Handle 400 errors as end of data (DOAJ API behavior)
216+
if hasattr(e, "response") and e.response.status_code == 400:
217+
LOGGER.info(f"Reached end of available data at page {page}")
218+
break
203219
raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
204220
except requests.RequestException as e:
205221
raise shared.QuantifyingException(f"Request Exception: {e}", 1)
@@ -260,7 +276,7 @@ def process_journals(session, args):
260276
if isinstance(publisher_info, dict):
261277
country_code = publisher_info.get("country", "Unknown")
262278
country_counts[license_type][country_code] += 1
263-
279+
264280
# Extract language information
265281
languages = bibjson.get("language", [])
266282
if languages:
@@ -332,12 +348,14 @@ def save_count_data(
332348
for lic, countries in country_counts.items():
333349
for country_code, count in countries.items():
334350
country_name = get_country_name(country_code)
335-
writer.writerow({
336-
"TOOL_IDENTIFIER": lic,
337-
"COUNTRY_CODE": country_code,
338-
"COUNTRY_NAME": country_name,
339-
"COUNT": count,
340-
})
351+
writer.writerow(
352+
{
353+
"TOOL_IDENTIFIER": lic,
354+
"COUNTRY_CODE": country_code,
355+
"COUNTRY_NAME": country_name,
356+
"COUNT": count,
357+
}
358+
)
341359

342360
# Save language counts with pycountry names
343361
with open(
@@ -350,12 +368,14 @@ def save_count_data(
350368
for lic, languages in language_counts.items():
351369
for lang_code, count in languages.items():
352370
lang_name = get_language_name(lang_code)
353-
writer.writerow({
354-
"TOOL_IDENTIFIER": lic,
355-
"LANGUAGE_CODE": lang_code,
356-
"LANGUAGE_NAME": lang_name,
357-
"COUNT": count,
358-
})
371+
writer.writerow(
372+
{
373+
"TOOL_IDENTIFIER": lic,
374+
"LANGUAGE_CODE": lang_code,
375+
"LANGUAGE_NAME": lang_name,
376+
"COUNT": count,
377+
}
378+
)
359379

360380
# Save year counts
361381
with open(
@@ -405,7 +425,10 @@ def query_doaj(args):
405425
"quarter": QUARTER,
406426
"script": os.path.basename(__file__),
407427
"api_version": "v4",
408-
"note": "Journal-level CC license data only - article counts not available via DOAJ API",
428+
"note": (
429+
"Journal-level CC license data only - "
430+
"article counts not available via DOAJ API"
431+
),
409432
}
410433

411434
try:
@@ -425,11 +448,14 @@ def query_doaj(args):
425448
)
426449

427450
LOGGER.info(f"Unique CC-licensed journals processed: {journals_processed}")
428-
451+
429452
# Calculate total license availability instances
430453
total_license_instances = sum(license_counts.values())
431454
LOGGER.info(f"Total CC license type instances: {total_license_instances}")
432-
LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type")
455+
LOGGER.info(
456+
"Note: Journals supporting multiple CC license types are "
457+
"counted once per license type"
458+
)
433459

434460

435461
def main():

0 commit comments

Comments
 (0)