Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scraper/src/gutenberg2zim/iso639.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,5 +240,6 @@ def language_name(code: str) -> str:
"myn": [],
"nai": [],
"nah": ["nhe"],
"nap": ["nap"], # Neapolitan: no ISO 639-1 code, already valid ISO 639-3
"fur": ["fvr"], # Friulian: IETF fur maps to ISO 639-3 fvr
}
37 changes: 30 additions & 7 deletions scraper/src/gutenberg2zim/zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,27 @@


def get_zim_language_metadata(
    languages: list[str], books: list[CatalogEntry]
) -> list[str]:
    """Return ZIM language codes for *languages*, most-used first.

    Each requested language is translated to zero or more ZIM language codes:

    1. ``ZIM_LANGUAGES_MAP`` is consulted first (explicit overrides, which may
       map to several codes or to an empty list).
    2. Otherwise ``ISO_MATRIX`` is tried (2-letter → 3-letter, per the
       mapping's use here).
    3. Otherwise the code is kept as-is when it is a key of
       ``ISO_MATRIX_REV``, i.e. it is already a known 3-letter code.

    Languages that resolve to nothing are skipped and reported in a single
    warning; they do not raise.

    Args:
        languages: Language codes requested for this ZIM.
        books: Catalog entries; each entry's ``languages`` is tested for
            membership of the requested code to rank the result.

    Returns:
        The resolved ZIM language codes, sorted by the number of books in the
        originating language, in descending order. Empty when nothing could be
        resolved.
    """
    unresolved: list[str] = []
    language_counts: dict[str, int] = {}
    for lang in languages:
        zim_langs = ZIM_LANGUAGES_MAP.get(
            lang,
            # Try ISO_MATRIX (2-letter → 3-letter), fall back to
            # ISO_MATRIX_REV to check if already a valid 3-letter code
            [ISO_MATRIX.get(lang, lang if lang in ISO_MATRIX_REV else None)],
        )
        # Drop the None placeholder produced when neither lookup matched.
        resolved = [zl for zl in zim_langs if zl]
        if not resolved:
            unresolved.append(lang)
            continue
        # The count depends only on the source language, so compute it once
        # rather than once per target code.
        book_count = sum(lang in book.languages for book in books)
        for zim_lang in resolved:
            # NOTE: if two source languages map to the same ZIM code, the
            # later one overwrites the earlier count (counts are not summed).
            language_counts[zim_lang] = book_count
    if unresolved:
        logger.warning(
            f"Could not resolve ZIM language metadata for: {', '.join(unresolved)}. "
            "Books with these languages will still be included, but they won't appear "
            "in ZIM language metadata."
        )
    return sorted(language_counts, key=lambda lang: language_counts[lang], reverse=True)


Expand Down Expand Up @@ -123,9 +138,17 @@ def build_zimfile(
logger.info(f"Removing existing ZIM file {zim_file}")
zim_path.unlink(missing_ok=True)

resolved_languages = zim_languages or get_zim_language_metadata(languages, books)
if not resolved_languages:
raise ValueError(
f"Cannot resolve language metadata for: "
f"{', '.join(languages)}. "
"Use --zim-languages to override."
)

Global.setup(
filename=zim_path,
language=zim_languages or get_zim_language_metadata(languages, books),
language=resolved_languages,
title=title,
description=description,
long_description=long_description,
Expand Down