Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 1 addition & 18 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ members = [
"tools/make/depcheck",
"tools/make/diplomat-coverage",
"tools/make/diplomat-gen",
"tools/make/download-repo-sources",
"tools/md-tests",
"tools/noalloctest",
"tools/changelog",
Expand Down
1 change: 1 addition & 0 deletions provider/source/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ simple_logger = { workspace = true }
icu = { path = "../../components/icu", default-features = false, features = ["unstable"] }
num-bigint = { workspace = true }
num-rational = { workspace = true }
crlify = { workspace = true }

[features]
default = ["use_wasm", "networking"]
Expand Down
6 changes: 3 additions & 3 deletions provider/source/src/cldr_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use crate::cldr_serde::eras::EraData;
use crate::datetime::DatagenCalendar;
use crate::source::SerdeCache;
use crate::source::{AbstractFs, SerdeCache};
use crate::CoverageLevel;
use icu::locale::provider::{
LocaleLikelySubtagsExtendedV1, LocaleLikelySubtagsLanguageV1, LocaleLikelySubtagsScriptRegionV1,
Expand Down Expand Up @@ -46,9 +46,9 @@ pub(crate) struct CldrCache {
}

impl CldrCache {
pub(crate) fn from_serde_cache(serde_cache: SerdeCache) -> Self {
pub(crate) fn new(root: AbstractFs) -> Self {
CldrCache {
serde_cache,
serde_cache: SerdeCache::new(root),
dir_suffix: Default::default(),
extended_locale_expander: Default::default(),
calendar_eras: Default::default(),
Expand Down
31 changes: 16 additions & 15 deletions provider/source/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,7 @@ impl SourceDataProvider {
/// [GitHub releases](https://github.com/unicode-org/cldr-json/releases)).
pub fn with_cldr(self, root: &Path) -> Result<Self, DataError> {
Ok(Self {
cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(
AbstractFs::new(root)?,
)))),
cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new(root)?))),
..self
})
}
Expand Down Expand Up @@ -246,10 +244,7 @@ impl SourceDataProvider {
/// `tz` directory or ZIP file (see [GitHub](https://github.com/eggert/tz)).
pub fn with_tzdb(self, root: &Path) -> Result<Self, DataError> {
Ok(Self {
tzdb_paths: Some(Arc::new(TzdbCache {
root: AbstractFs::new(root)?,
transitions: Default::default(),
})),
tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new(root)?))),
..self
})
}
Expand All @@ -263,9 +258,9 @@ impl SourceDataProvider {
#[cfg(feature = "networking")]
pub fn with_cldr_for_tag(self, tag: &str) -> Self {
Self {
cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(format!(
cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new_from_url(format!(
"https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip",
)))))),
))))),
..self
}
}
Expand Down Expand Up @@ -352,12 +347,9 @@ impl SourceDataProvider {
#[cfg(feature = "networking")]
pub fn with_tzdb_for_tag(self, tag: &str) -> Self {
Self {
tzdb_paths: Some(Arc::new(TzdbCache {
root: AbstractFs::new_from_url(format!(
"https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz",
)),
transitions: Default::default(),
})),
tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new_from_url(format!(
"https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz",
))))),
..self
}
}
Expand Down Expand Up @@ -658,6 +650,15 @@ enum TrieType {
Small,
}

/// Maps this crate's `TrieType` setting onto the variant of the same name in
/// `icu::collections::codepointtrie::TrieType`.
impl From<TrieType> for icu::collections::codepointtrie::TrieType {
    /// Returns the `Fast` or `Small` variant matching `other`.
    fn from(other: TrieType) -> Self {
        use icu::collections::codepointtrie::TrieType as Target;

        // No wildcard arm, so the compiler flags any variant added later.
        match other {
            TrieType::Fast => Target::Fast,
            TrieType::Small => Target::Small,
        }
    }
}

impl std::fmt::Display for TrieType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Expand Down
1 change: 0 additions & 1 deletion provider/source/src/segmenter/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

use crate::IterableDataProviderCached;
use crate::SourceDataProvider;
use icu::locale::langid;
use icu::segmenter::provider::SegmenterDictionaryAutoV1;
use icu::segmenter::provider::SegmenterDictionaryExtendedV1;
use icu::segmenter::provider::UCharDictionaryBreakData;
Expand Down
15 changes: 5 additions & 10 deletions provider/source/src/segmenter/lstm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
//! This module contains provider implementations backed by LSTM segmentation data.

use crate::{IterableDataProviderCached, SourceDataProvider};
use icu::locale::langid;
use icu::segmenter::provider::{
LstmData, LstmDataFloat32, LstmMatrix1, LstmMatrix2, LstmMatrix3, ModelType,
SegmenterLstmAutoV1,
Expand Down Expand Up @@ -208,15 +207,11 @@ impl DataProvider<SegmenterLstmAutoV1> for SourceDataProvider {

impl IterableDataProviderCached<SegmenterLstmAutoV1> for SourceDataProvider {
fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
const SUPPORTED: [&DataMarkerAttributes; 4] = [
DataMarkerAttributes::from_str_or_panic("Burmese_codepoints_exclusive_model4_heavy"),
DataMarkerAttributes::from_str_or_panic("Khmer_codepoints_exclusive_model4_heavy"),
DataMarkerAttributes::from_str_or_panic("Lao_codepoints_exclusive_model4_heavy"),
DataMarkerAttributes::from_str_or_panic("Thai_codepoints_exclusive_model4_heavy"),
];
Ok(SUPPORTED
.into_iter()
.map(DataIdentifierCow::from_marker_attributes)
Ok(self
.segmenter_lstm()?
.list("")?
.filter_map(|p| DataMarkerAttributes::try_from_string(p).ok())
.map(DataIdentifierCow::from_marker_attributes_owned)
.collect())
}
}
Expand Down
147 changes: 32 additions & 115 deletions provider/source/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

//! This module contains provider implementations backed by built-in segmentation data.

#![allow(dead_code)]
#![allow(unused_imports)]
#![cfg_attr(
not(any(feature = "use_wasm", feature = "use_icu4c")),
allow(dead_code, unused_imports)
)]

use crate::source::{include_files, SerdeCache};
use crate::SourceDataProvider;
use icu::collections::codepointtrie;
use icu::properties::{
props::{
EastAsianWidth, GeneralCategory, GraphemeClusterBreak, IndicConjunctBreak, LineBreak,
Expand All @@ -19,12 +21,9 @@ use icu::properties::{
use icu::segmenter::options::WordType;
use icu::segmenter::provider::*;
use icu_provider::prelude::*;
use std::cmp;
use std::collections::HashSet;
use std::fmt::Debug;
use std::ops::RangeInclusive;
use std::sync::OnceLock;
use zerovec::ZeroVec;

mod dictionary;
mod lstm;
Expand Down Expand Up @@ -98,11 +97,8 @@ fn generate_rule_break_data(
use icu::properties::{props::ExtendedPictographic, PropertyParser};
use icu_codepointtrie_builder::CodePointTrieBuilder;

let segmenter = provider
.icuexport()
.unwrap()
.read_and_parse_toml::<SegmenterRuleTable>(rules_file)
.expect("The data should be valid!");
let segmenter =
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: are we no longer caching? Don't we want to cache?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need caching because these files are only loaded once, and even if they were loaded more than once, the TOML parsing is only a tiny part of the work done in the implementation.

These don't belong in the icuexportdata source, which I'm trying to slim down.

toml::from_str::<SegmenterRuleTable>(rules_file).expect("The data should be valid!");

let data = CodePointMapData::<WordBreak>::try_new_unstable(provider)
.expect("The data should be valid!");
Expand Down Expand Up @@ -197,14 +193,7 @@ fn generate_rule_break_data(
// the default unassigned values, so it's ok to omit them in the table.
const CODEPOINT_TABLE_LEN: usize = 0xE1000;

let mut properties_trie = CodePointTrieBuilder::new(
0u8,
0,
match trie_type {
crate::TrieType::Fast => codepointtrie::TrieType::Fast,
crate::TrieType::Small => codepointtrie::TrieType::Small,
},
);
let mut properties_trie = CodePointTrieBuilder::new(0u8, 0, trie_type.into());
let mut properties_names = Vec::<String>::new();
let mut simple_properties_count = 0;

Expand Down Expand Up @@ -635,27 +624,16 @@ fn generate_rule_break_data(

#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
fn generate_rule_break_data_override(
provider: &SourceDataProvider,
_provider: &SourceDataProvider,
rules_file: &str,
trie_type: crate::TrieType,
) -> RuleBreakDataOverride<'static> {
use icu_codepointtrie_builder::CodePointTrieBuilder;

let segmenter = provider
.icuexport()
.unwrap()
.read_and_parse_toml::<SegmenterRuleTable>(rules_file)
.expect("The data should be valid!");
let segmenter =
toml::from_str::<SegmenterRuleTable>(rules_file).expect("The data should be valid!");

const CODEPOINT_TABLE_LEN: usize = 0xE1000;
let mut properties_trie = CodePointTrieBuilder::new(
0u8,
0,
match trie_type {
crate::TrieType::Fast => codepointtrie::TrieType::Fast,
crate::TrieType::Small => codepointtrie::TrieType::Small,
},
);
let mut properties_trie = CodePointTrieBuilder::new(0u8, 0, trie_type.into());
let mut properties_names = Vec::<String>::new();

properties_names.push("Unknown".to_string());
Expand Down Expand Up @@ -714,7 +692,7 @@ macro_rules! implement {
self.check_req::<$marker>(req)?;
let data = generate_rule_break_data(
&hardcoded_segmenter_provider(),
$rules,
include_str!(concat!("../../data/segmenter/", $rules)),
self.trie_type(),
);

Expand Down Expand Up @@ -748,7 +726,7 @@ macro_rules! implement_override {
self.check_req::<$marker>(req)?;
let data = generate_rule_break_data_override(
&hardcoded_segmenter_provider(),
$rules,
include_str!(concat!("../../data/segmenter/", $rules)),
self.trie_type(),
);

Expand All @@ -773,95 +751,34 @@ macro_rules! implement_override {
}

fn hardcoded_segmenter_provider() -> SourceDataProvider {
use crate::{
source::{AbstractFs, SerdeCache},
SourceDataProvider,
};
// Singleton so that all instantiations share the same cache.
static SINGLETON: OnceLock<SourceDataProvider> = OnceLock::new();
SINGLETON
.get_or_init(|| {
let mut provider = SourceDataProvider::new_custom();
provider.icuexport_paths =
Some(std::sync::Arc::new(SerdeCache::new(AbstractFs::Memory(
[
(
"uprops/small/ea.toml",
include_bytes!("../../data/segmenter/uprops/small/ea.toml").as_slice(),
),
(
"uprops/small/ExtPict.toml",
include_bytes!("../../data/segmenter/uprops/small/ExtPict.toml")
.as_slice(),
),
(
"uprops/small/gc.toml",
include_bytes!("../../data/segmenter/uprops/small/gc.toml").as_slice(),
),
(
"uprops/small/GCB.toml",
include_bytes!("../../data/segmenter/uprops/small/GCB.toml").as_slice(),
),
(
"uprops/small/InCB.toml",
include_bytes!("../../data/segmenter/uprops/small/InCB.toml")
.as_slice(),
),
(
"uprops/small/lb.toml",
include_bytes!("../../data/segmenter/uprops/small/lb.toml").as_slice(),
),
(
"uprops/small/SB.toml",
include_bytes!("../../data/segmenter/uprops/small/SB.toml").as_slice(),
),
(
"uprops/small/sc.toml",
include_bytes!("../../data/segmenter/uprops/small/sc.toml").as_slice(),
),
(
"uprops/small/WB.toml",
include_bytes!("../../data/segmenter/uprops/small/WB.toml").as_slice(),
),
(
"segmenter/grapheme.toml",
include_bytes!("../../data/segmenter/grapheme.toml").as_slice(),
),
(
"segmenter/line.toml",
include_bytes!("../../data/segmenter/line.toml").as_slice(),
),
(
"segmenter/sentence.toml",
include_bytes!("../../data/segmenter/sentence.toml").as_slice(),
),
(
"segmenter/word.toml",
include_bytes!("../../data/segmenter/word.toml").as_slice(),
),
]
.into_iter()
.collect(),
))));
provider.icuexport_paths = Some(std::sync::Arc::new(SerdeCache::new(include_files!(
"../../data/segmenter/";
"uprops/small/ea.toml",
"uprops/small/ExtPict.toml",
"uprops/small/gc.toml",
"uprops/small/GCB.toml",
"uprops/small/InCB.toml",
"uprops/small/lb.toml",
"uprops/small/SB.toml",
"uprops/small/sc.toml",
"uprops/small/WB.toml",
))));
provider
})
.clone()
}

implement!(SegmenterBreakLineV1, "segmenter/line.toml");
implement!(SegmenterBreakGraphemeClusterV1, "segmenter/grapheme.toml");
implement!(SegmenterBreakWordV1, "segmenter/word.toml");
implement!(SegmenterBreakSentenceV1, "segmenter/sentence.toml");
implement_override!(
SegmenterBreakWordOverrideV1,
"segmenter/word.toml",
["fi", "sv"]
);
implement_override!(
SegmenterBreakSentenceOverrideV1,
"segmenter/sentence.toml",
["el"]
);
implement!(SegmenterBreakLineV1, "line.toml");
implement!(SegmenterBreakGraphemeClusterV1, "grapheme.toml");
implement!(SegmenterBreakWordV1, "word.toml");
implement!(SegmenterBreakSentenceV1, "sentence.toml");
implement_override!(SegmenterBreakWordOverrideV1, "word.toml", ["fi", "sv"]);
implement_override!(SegmenterBreakSentenceOverrideV1, "sentence.toml", ["el"]);

#[cfg(test)]
mod tests {
Expand Down
Loading
Loading