Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 4 additions & 16 deletions provider/icu4x-datagen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ struct Cli {
ucd_tag: String,

#[arg(long, value_name = "PATH")]
#[arg(help = "Path to a local Unihan.zip file or directory.")]
#[arg(help = "[DEPRECATED] Path to a local Unihan.zip file or directory.")]
#[cfg(feature = "provider")]
unihan_root: Option<PathBuf>,

Expand Down Expand Up @@ -444,10 +444,6 @@ fn run(cli: Cli) -> eyre::Result<()> {
);
} else if SourceDataProvider::is_missing_segmenter_lstm_error(e) {
eyre::bail!("Segmentation LSTM data is required for this invocation, set --segmenter-lstm-root or --segmenter-lstm-tag");
} else if SourceDataProvider::is_missing_unihan_error(e) {
eyre::bail!(
"Unihan data is required for this invocation, set --unihan-root or --ucd-tag"
);
} else if SourceDataProvider::is_missing_ucd_error(e) {
eyre::bail!("UCD data is required for this invocation, set --ucd-root or --ucd-tag");
} else if SourceDataProvider::is_missing_tzdb_error(e) {
Expand Down Expand Up @@ -529,17 +525,9 @@ fn run(cli: Cli) -> eyre::Result<()> {
(None, _) => p,
};

p = match (cli.unihan_root, cli.ucd_tag.as_str()) {
(Some(path), _) => p.with_unihan(&path)?,
#[cfg(feature = "networking")]
(_, "latest") => p.with_unihan_for_tag(SourceDataProvider::TESTED_UCD_TAG),
#[cfg(feature = "networking")]
(_, "latest-tag") => p.with_unihan_for_tag("latest"),
#[cfg(feature = "networking")]
(_, tag) => p.with_unihan_for_tag(tag),
#[cfg(not(feature = "networking"))]
(None, _) => p,
};
if cli.unihan_root.is_some() {
log::warn!("Ignoring --unihan-root, use --ucd-root instead")
}

p = match (cli.ucd_root, cli.ucd_tag.as_str()) {
(Some(path), _) => p.with_ucd(&path)?,
Expand Down
84 changes: 30 additions & 54 deletions provider/source/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use icu::calendar::{Date, Iso};
use icu::time::zone::UtcOffset;
use icu::time::Time;
use icu_provider::prelude::*;
use source::{AbstractFs, SerdeCache, TzdbCache};
use source::{AbstractFs, SerdeCache, TzdbCache, UnicodeCache};
use std::collections::{BTreeSet, HashSet};
use std::fmt::Debug;
use std::path::Path;
Expand Down Expand Up @@ -99,8 +99,7 @@ pub struct SourceDataProvider {
icuexport_paths: Option<Arc<SerdeCache>>,
segmenter_lstm_paths: Option<Arc<SerdeCache>>,
tzdb_paths: Option<Arc<TzdbCache>>,
unihan_paths: Option<Arc<AbstractFs>>,
ucd_paths: Option<Arc<AbstractFs>>,
unicode_paths: Option<Arc<UnicodeCache>>,
trie_type: TrieType,
collation_root_han: CollationRootHan,
pub(crate) timezone_horizon: time_zones::Timestamp,
Expand Down Expand Up @@ -163,7 +162,7 @@ impl SourceDataProvider {
.with_icuexport_for_tag(Self::TESTED_ICUEXPORT_TAG)
.with_segmenter_lstm_for_tag(Self::TESTED_SEGMENTER_LSTM_TAG)
.with_tzdb_for_tag(Self::TESTED_TZDB_TAG)
.with_unihan_for_tag(Self::TESTED_UCD_TAG)
.with_ucd_for_tag(Self::TESTED_UCD_TAG)
})
.clone()
}
Expand All @@ -179,8 +178,7 @@ impl SourceDataProvider {
icuexport_paths: None,
segmenter_lstm_paths: None,
tzdb_paths: None,
unihan_paths: None,
ucd_paths: None,
unicode_paths: None,
trie_type: Default::default(),
timezone_horizon: time_zones::Timestamp::try_offset_only_from_str(
"2015-01-01T00:00:00Z",
Expand Down Expand Up @@ -222,20 +220,17 @@ impl SourceDataProvider {
})
}

/// Adds Unihan source data to the provider. The path should point to the Unihan ZIP file
/// (see [Unicode Character Database](https://www.unicode.org/ucd/)).
pub fn with_unihan(self, root: &Path) -> Result<Self, DataError> {
Ok(Self {
unihan_paths: Some(Arc::new(AbstractFs::new(root)?)),
..self
})
/// Deprecated, see [`Self::with_ucd`].
#[deprecated(since = "2.3.0", note = "use .with_ucd")]
pub fn with_unihan(self, _root: &Path) -> Result<Self, DataError> {
panic!("Use `.with_ucd` to set UCD data, which includes Unihan data.");
}

/// Adds UCD source data to the provider. The path should point to a
/// directory containing `security/IdentifierStatus.txt`.
/// Adds Unicode source data to the provider. The path should point to a
/// directory structure matching <https://www.unicode.org/Public/{version}/>.
pub fn with_ucd(self, root: &Path) -> Result<Self, DataError> {
Ok(Self {
ucd_paths: Some(Arc::new(AbstractFs::new(root)?)),
unicode_paths: Some(Arc::new(UnicodeCache::new_local(AbstractFs::new(root)?))),
..self
})
}
Expand All @@ -258,7 +253,7 @@ impl SourceDataProvider {
#[cfg(feature = "networking")]
pub fn with_cldr_for_tag(self, tag: &str) -> Self {
Self {
cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new_from_url(format!(
cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new_zip_from_url(format!(
"https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip",
))))),
..self
Expand All @@ -285,7 +280,7 @@ impl SourceDataProvider {
)
};
Self {
icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(url)))),
icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_zip_from_url(url)))),
..self
}
}
Expand All @@ -299,41 +294,32 @@ impl SourceDataProvider {
#[cfg(feature = "networking")]
pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Self {
Self {
segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(format!(
segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_zip_from_url(format!(
"https://github.com/unicode-org/lstm_word_segmentation/releases/download/{tag}/models.zip"
))))),
..self
}
}

/// Adds UCD Unihan source data to the provider. The data will be downloaded from unicode.org
/// using the given version tag (see [Unicode Character Database](https://www.unicode.org/ucd/)).
///
/// Also see: [`TESTED_UCD_TAG`](Self::TESTED_UCD_TAG)
/// Deprecated, see [`Self::with_ucd_for_tag`].
///
/// ✨ *Enabled with the `networking` Cargo feature.*
#[cfg(feature = "networking")]
pub fn with_unihan_for_tag(self, tag: &str) -> Self {
Self {
unihan_paths: Some(Arc::new(AbstractFs::new_from_url(format!(
"https://www.unicode.org/Public/{tag}/ucd/Unihan.zip"
)))),
..self
}
#[deprecated(since = "2.3.0", note = "use .with_ucd_for_tag")]
pub fn with_unihan_for_tag(self, _tag: &str) -> Self {
panic!("Use `.with_ucd_for_tag` to set UCD data, which includes Unihan data.");
}

/// Adds UCD source data to the provider. The data will be downloaded from unicode.org
/// using the given version tag (see [Unicode Character Database](https://www.unicode.org/ucd/)).
/// Adds Unicode source data to the provider. The data will be downloaded from
/// <https://unicode.org/Public> using the given version tag.
///
/// Also see: [`TESTED_UCD_TAG`](Self::TESTED_UCD_TAG)
///
/// ✨ *Enabled with the `networking` Cargo feature.*
#[cfg(feature = "networking")]
pub fn with_ucd_for_tag(self, tag: &str) -> Self {
Self {
ucd_paths: Some(Arc::new(AbstractFs::new_from_url(format!(
"https://www.unicode.org/Public/{tag}/"
)))),
unicode_paths: Some(Arc::new(UnicodeCache::new_remote(tag))),
..self
}
}
Expand All @@ -347,9 +333,9 @@ impl SourceDataProvider {
#[cfg(feature = "networking")]
pub fn with_tzdb_for_tag(self, tag: &str) -> Self {
Self {
tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new_from_url(format!(
"https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz",
))))),
tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new_tar_from_url(
format!("https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz",),
)))),
..self
}
}
Expand All @@ -364,11 +350,8 @@ impl SourceDataProvider {
"Missing segmenter data. Use `.with_segmenter_lstm[_for_tag]` to set segmenter data.",
);

const MISSING_UNIHAN_ERROR: DataError =
DataError::custom("Missing Unihan data. Use `.with_unihan[_for_tag]` to set Unihan data.");

const MISSING_UCD_ERROR: DataError =
DataError::custom("Missing UCD data. Use `.with_ucd` to set UCD data.");
DataError::custom("Missing UCD data. Use `.with_ucd[_for_tag]` to set UCD data.");

const MISSING_TZDB_ERROR: DataError =
DataError::custom("Missing tzdb data. Use `.with_tzdb[_for_tag]` to set tzdb data.");
Expand Down Expand Up @@ -398,9 +381,9 @@ impl SourceDataProvider {
}

/// Identifies errors that are due to missing Unihan data (Unihan data is now provided as part of the UCD data).
pub fn is_missing_unihan_error(mut e: DataError) -> bool {
e.marker = None;
e == Self::MISSING_UNIHAN_ERROR
#[deprecated]
pub fn is_missing_unihan_error(e: DataError) -> bool {
Self::is_missing_ucd_error(e)
}

/// Identifies errors that are due to missing UCD data.
Expand All @@ -426,15 +409,8 @@ impl SourceDataProvider {
}

#[allow(dead_code)]
fn unihan(&self) -> Result<&AbstractFs, DataError> {
self.unihan_paths
.as_deref()
.ok_or(Self::MISSING_UNIHAN_ERROR)
}

#[allow(dead_code)]
fn ucd(&self) -> Result<&AbstractFs, DataError> {
self.ucd_paths.as_deref().ok_or(Self::MISSING_UCD_ERROR)
fn unicode(&self) -> Result<&UnicodeCache, DataError> {
self.unicode_paths.as_deref().ok_or(Self::MISSING_UCD_ERROR)
}

fn tzdb(&self) -> Result<&TzdbCache, DataError> {
Expand Down
13 changes: 5 additions & 8 deletions provider/source/src/segmenter/unihan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

//! This module contains provider implementations for Unihan radicals.

use crate::AbstractFs;
use crate::source::UnicodeCache;
use crate::{IterableDataProviderCached, SourceDataProvider};
use icu::collections::codepointinvlist::CodePointInversionListBuilder;
use icu::segmenter::provider::radical::{SegmenterUnihanRadicalV1, UnihanRadicalsData};
Expand All @@ -15,11 +15,10 @@ use std::collections::HashSet;

#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
fn build_unihan_radicals_data(
unihan: &AbstractFs,
ucd: &AbstractFs,
unicode: &UnicodeCache,
trie_type: crate::TrieType,
) -> Result<UnihanRadicalsData<'static>, DataError> {
let identifier_status = ucd.read_to_string("security/IdentifierStatus.txt")?;
let identifier_status = unicode.read_to_string("security/IdentifierStatus.txt")?;
let mut id_builder = CodePointInversionListBuilder::new();
for line in identifier_status.lines() {
if line.starts_with('#') || line.trim().is_empty() {
Expand All @@ -37,7 +36,7 @@ fn build_unihan_radicals_data(
}
let identifier_status = id_builder.build();

let raw_content = unihan.read_to_string("Unihan_IRGSources.txt")?;
let raw_content = unicode.read_to_string("ucd/unihan/Unihan_IRGSources.txt")?;
let mut builder = CodePointTrieBuilder::new(0u8, 0u8, trie_type.into());

for line in raw_content.lines() {
Expand Down Expand Up @@ -85,9 +84,7 @@ impl DataProvider<SegmenterUnihanRadicalV1> for SourceDataProvider {
{
self.check_req::<SegmenterUnihanRadicalV1>(req)?;

let unihan = self.unihan()?;
let ucd = self.ucd()?;
let data = build_unihan_radicals_data(unihan, ucd, self.trie_type())?;
let data = build_unihan_radicals_data(self.unicode()?, self.trie_type())?;

Ok(DataResponse {
metadata: Default::default(),
Expand Down
Loading
Loading