diff --git a/provider/icu4x-datagen/src/main.rs b/provider/icu4x-datagen/src/main.rs index deea097ff2e..efec818e3c0 100644 --- a/provider/icu4x-datagen/src/main.rs +++ b/provider/icu4x-datagen/src/main.rs @@ -183,7 +183,7 @@ struct Cli { ucd_tag: String, #[arg(long, value_name = "PATH")] - #[arg(help = "Path to a local Unihan.zip file or directory.")] + #[arg(help = "[DEPRECATED] Path to a local Unihan.zip file or directory.")] #[cfg(feature = "provider")] unihan_root: Option, @@ -444,10 +444,6 @@ fn run(cli: Cli) -> eyre::Result<()> { ); } else if SourceDataProvider::is_missing_segmenter_lstm_error(e) { eyre::bail!("Segmentation LSTM data is required for this invocation, set --segmenter-lstm-root or --segmenter-lstm-tag"); - } else if SourceDataProvider::is_missing_unihan_error(e) { - eyre::bail!( - "Unihan data is required for this invocation, set --unihan-root or --ucd-tag" - ); } else if SourceDataProvider::is_missing_ucd_error(e) { eyre::bail!("UCD data is required for this invocation, set --ucd-root or --ucd-tag"); } else if SourceDataProvider::is_missing_tzdb_error(e) { @@ -529,17 +525,9 @@ fn run(cli: Cli) -> eyre::Result<()> { (None, _) => p, }; - p = match (cli.unihan_root, cli.ucd_tag.as_str()) { - (Some(path), _) => p.with_unihan(&path)?, - #[cfg(feature = "networking")] - (_, "latest") => p.with_unihan_for_tag(SourceDataProvider::TESTED_UCD_TAG), - #[cfg(feature = "networking")] - (_, "latest-tag") => p.with_unihan_for_tag("latest"), - #[cfg(feature = "networking")] - (_, tag) => p.with_unihan_for_tag(tag), - #[cfg(not(feature = "networking"))] - (None, _) => p, - }; + if cli.unihan_root.is_some() { + log::warn!("Ignoring --unihan-root, use --ucd-root instead") + } p = match (cli.ucd_root, cli.ucd_tag.as_str()) { (Some(path), _) => p.with_ucd(&path)?, diff --git a/provider/source/src/lib.rs b/provider/source/src/lib.rs index f55f866d10c..dcbcaa32672 100644 --- a/provider/source/src/lib.rs +++ b/provider/source/src/lib.rs @@ -36,7 +36,7 @@ use 
icu::calendar::{Date, Iso}; use icu::time::zone::UtcOffset; use icu::time::Time; use icu_provider::prelude::*; -use source::{AbstractFs, SerdeCache, TzdbCache}; +use source::{AbstractFs, SerdeCache, TzdbCache, UnicodeCache}; use std::collections::{BTreeSet, HashSet}; use std::fmt::Debug; use std::path::Path; @@ -99,8 +99,7 @@ pub struct SourceDataProvider { icuexport_paths: Option>, segmenter_lstm_paths: Option>, tzdb_paths: Option>, - unihan_paths: Option>, - ucd_paths: Option>, + unicode_paths: Option>, trie_type: TrieType, collation_root_han: CollationRootHan, pub(crate) timezone_horizon: time_zones::Timestamp, @@ -163,7 +162,7 @@ impl SourceDataProvider { .with_icuexport_for_tag(Self::TESTED_ICUEXPORT_TAG) .with_segmenter_lstm_for_tag(Self::TESTED_SEGMENTER_LSTM_TAG) .with_tzdb_for_tag(Self::TESTED_TZDB_TAG) - .with_unihan_for_tag(Self::TESTED_UCD_TAG) + .with_ucd_for_tag(Self::TESTED_UCD_TAG) }) .clone() } @@ -179,8 +178,7 @@ impl SourceDataProvider { icuexport_paths: None, segmenter_lstm_paths: None, tzdb_paths: None, - unihan_paths: None, - ucd_paths: None, + unicode_paths: None, trie_type: Default::default(), timezone_horizon: time_zones::Timestamp::try_offset_only_from_str( "2015-01-01T00:00:00Z", @@ -222,20 +220,17 @@ impl SourceDataProvider { }) } - /// Adds Unihan source data to the provider. The path should point to the Unihan ZIP file - /// (see [Unicode Character Database](https://www.unicode.org/ucd/)). - pub fn with_unihan(self, root: &Path) -> Result { - Ok(Self { - unihan_paths: Some(Arc::new(AbstractFs::new(root)?)), - ..self - }) + /// Deprecated, see [`Self::with_ucd`]. + #[deprecated(since = "2.3.0", note = "use .with_ucd")] + pub fn with_unihan(self, _root: &Path) -> Result { + panic!("Use `.with_ucd` to set UCD data, which includes Unihan data."); } - /// Adds UCD source data to the provider. The path should point to a - /// directory containing `security/IdentifierStatus.txt`. + /// Adds Unicode source data to the provider. 
The path should point to a + /// directory structure matching `https://www.unicode.org/Public/{version}/`. pub fn with_ucd(self, root: &Path) -> Result { Ok(Self { - ucd_paths: Some(Arc::new(AbstractFs::new(root)?)), + unicode_paths: Some(Arc::new(UnicodeCache::new_local(AbstractFs::new(root)?))), ..self }) } @@ -258,7 +253,7 @@ impl SourceDataProvider { #[cfg(feature = "networking")] pub fn with_cldr_for_tag(self, tag: &str) -> Self { Self { - cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new_from_url(format!( + cldr_paths: Some(Arc::new(CldrCache::new(AbstractFs::new_zip_from_url(format!( "https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip", ))))), ..self @@ -285,7 +280,7 @@ impl SourceDataProvider { ) }; Self { - icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(url)))), + icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_zip_from_url(url)))), ..self } } @@ -299,31 +294,24 @@ impl SourceDataProvider { #[cfg(feature = "networking")] pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Self { Self { - segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(format!( + segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_zip_from_url(format!( "https://github.com/unicode-org/lstm_word_segmentation/releases/download/{tag}/models.zip" ))))), ..self } } - /// Adds UCD Unihan source data to the provider. The data will be downloaded from unicode.org - /// using the given version tag (see [Unicode Character Database](https://www.unicode.org/ucd/)). - /// - /// Also see: [`TESTED_UCD_TAG`](Self::TESTED_UCD_TAG) + /// Deprecated, see [`Self::with_ucd_for_tag`]. 
/// /// ✨ *Enabled with the `networking` Cargo feature.* #[cfg(feature = "networking")] - pub fn with_unihan_for_tag(self, tag: &str) -> Self { - Self { - unihan_paths: Some(Arc::new(AbstractFs::new_from_url(format!( - "https://www.unicode.org/Public/{tag}/ucd/Unihan.zip" - )))), - ..self - } + #[deprecated(since = "2.3.0", note = "use .with_ucd_for_tag")] + pub fn with_unihan_for_tag(self, _tag: &str) -> Self { + panic!("Use `.with_ucd_for_tag` to set UCD data, which includes Unihan data."); } - /// Adds UCD source data to the provider. The data will be downloaded from unicode.org - /// using the given version tag (see [Unicode Character Database](https://www.unicode.org/ucd/)). + /// Adds Unicode source data to the provider. The data will be downloaded from `https://www.unicode.org/Public/{tag}/` + /// using the given version tag. /// /// Also see: [`TESTED_UCD_TAG`](Self::TESTED_UCD_TAG) /// @@ -331,9 +319,7 @@ impl SourceDataProvider { #[cfg(feature = "networking")] pub fn with_ucd_for_tag(self, tag: &str) -> Self { Self { - ucd_paths: Some(Arc::new(AbstractFs::new_from_url(format!( - "https://www.unicode.org/Public/{tag}/" - )))), + unicode_paths: Some(Arc::new(UnicodeCache::new_remote(tag))), ..self } } @@ -347,9 +333,9 @@ impl SourceDataProvider { #[cfg(feature = "networking")] pub fn with_tzdb_for_tag(self, tag: &str) -> Self { Self { - tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new_from_url(format!( - "https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz", - ))))), + tzdb_paths: Some(Arc::new(TzdbCache::new(AbstractFs::new_tar_from_url( + format!("https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz",), + )))), ..self } } @@ -364,11 +350,8 @@ impl SourceDataProvider { "Missing segmenter data. Use `.with_segmenter_lstm[_for_tag]` to set segmenter data.", ); - const MISSING_UNIHAN_ERROR: DataError = - DataError::custom("Missing Unihan data. 
Use `.with_unihan[_for_tag]` to set Unihan data."); - const MISSING_UCD_ERROR: DataError = - DataError::custom("Missing UCD data. Use `.with_ucd` to set UCD data."); + DataError::custom("Missing UCD data. Use `.with_ucd[_for_tag]` to set UCD data."); const MISSING_TZDB_ERROR: DataError = DataError::custom("Missing tzdb data. Use `.with_tzdb[_for_tag]` to set tzdb data."); @@ -398,9 +381,9 @@ impl SourceDataProvider { } /// Identifies errors that are due to missing UCD data. - pub fn is_missing_unihan_error(mut e: DataError) -> bool { - e.marker = None; - e == Self::MISSING_UNIHAN_ERROR + #[deprecated] + pub fn is_missing_unihan_error(e: DataError) -> bool { + Self::is_missing_ucd_error(e) } /// Identifies errors that are due to missing UCD data. @@ -426,15 +409,8 @@ impl SourceDataProvider { } #[allow(dead_code)] - fn unihan(&self) -> Result<&AbstractFs, DataError> { - self.unihan_paths - .as_deref() - .ok_or(Self::MISSING_UNIHAN_ERROR) - } - - #[allow(dead_code)] - fn ucd(&self) -> Result<&AbstractFs, DataError> { - self.ucd_paths.as_deref().ok_or(Self::MISSING_UCD_ERROR) + fn unicode(&self) -> Result<&UnicodeCache, DataError> { + self.unicode_paths.as_deref().ok_or(Self::MISSING_UCD_ERROR) } fn tzdb(&self) -> Result<&TzdbCache, DataError> { diff --git a/provider/source/src/segmenter/unihan.rs b/provider/source/src/segmenter/unihan.rs index ed57d78ae33..d15a48f2918 100644 --- a/provider/source/src/segmenter/unihan.rs +++ b/provider/source/src/segmenter/unihan.rs @@ -4,7 +4,7 @@ //! This module contains provider implementations for Unihan radicals. 
-use crate::AbstractFs; +use crate::source::UnicodeCache; use crate::{IterableDataProviderCached, SourceDataProvider}; use icu::collections::codepointinvlist::CodePointInversionListBuilder; use icu::segmenter::provider::radical::{SegmenterUnihanRadicalV1, UnihanRadicalsData}; @@ -15,11 +15,10 @@ use std::collections::HashSet; #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] fn build_unihan_radicals_data( - unihan: &AbstractFs, - ucd: &AbstractFs, + unicode: &UnicodeCache, trie_type: crate::TrieType, ) -> Result, DataError> { - let identifier_status = ucd.read_to_string("security/IdentifierStatus.txt")?; + let identifier_status = unicode.read_to_string("security/IdentifierStatus.txt")?; let mut id_builder = CodePointInversionListBuilder::new(); for line in identifier_status.lines() { if line.starts_with('#') || line.trim().is_empty() { @@ -37,7 +36,7 @@ fn build_unihan_radicals_data( } let identifier_status = id_builder.build(); - let raw_content = unihan.read_to_string("Unihan_IRGSources.txt")?; + let raw_content = unicode.read_to_string("ucd/unihan/Unihan_IRGSources.txt")?; let mut builder = CodePointTrieBuilder::new(0u8, 0u8, trie_type.into()); for line in raw_content.lines() { @@ -85,9 +84,7 @@ impl DataProvider for SourceDataProvider { { self.check_req::(req)?; - let unihan = self.unihan()?; - let ucd = self.ucd()?; - let data = build_unihan_radicals_data(unihan, ucd, self.trie_type())?; + let data = build_unihan_radicals_data(self.unicode()?, self.trie_type())?; Ok(DataResponse { metadata: Default::default(), diff --git a/provider/source/src/source.rs b/provider/source/src/source.rs index fee281a5865..75b582da589 100644 --- a/provider/source/src/source.rs +++ b/provider/source/src/source.rs @@ -102,15 +102,43 @@ pub(crate) struct ZipData { file_list: HashSet, } +impl ZipData { + fn try_new(bytes: Vec) -> Result { + let archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|e| DataError::custom("Invalid ZIP file").with_display_context(&e))?; + + let 
file_list = archive.file_names().map(String::from).collect(); + Ok(Self { archive, file_list }) + } +} + pub(crate) struct TarArchive { archive: Vec, file_list: HashSet, } +impl TarArchive { + fn try_new(bytes: Vec) -> Result { + use std::io::Read; + let mut archive = Vec::new(); + flate2::read::GzDecoder::new(Cursor::new(bytes)).read_to_end(&mut archive)?; + let file_list = tar::Archive::new(Cursor::new(&archive)) + .entries_with_seek() + .map(|e| { + e.into_iter() + .filter_map(|e| Some(e.ok()?.path().ok()?.as_os_str().to_str()?.to_string())) + })? + .collect::>(); + Ok(TarArchive { archive, file_list }) + } +} + pub(crate) enum AbstractFs { Fs(PathBuf), Zip(RwLock>), Tar(RwLock>), + #[cfg(feature = "networking")] + Http(String), Memory(BTreeMap<&'static str, &'static [u8]>), } @@ -128,75 +156,72 @@ impl AbstractFs { { Ok(Self::Fs(root.to_path_buf())) } else if root.extension().is_some_and(|ext| ext == "zip") { - let archive = ZipArchive::new(Cursor::new(std::fs::read(root)?)).map_err(|e| { - DataError::custom("Invalid ZIP file") - .with_display_context(&e) - .with_path_context(root) - })?; - let file_list = archive.file_names().map(String::from).collect(); - Ok(Self::Zip(RwLock::new(Ok(ZipData { archive, file_list })))) - } else if root.ends_with(".tar.gz") { - use std::io::Read; - let mut data = Vec::new(); - flate2::read::GzDecoder::new(Cursor::new(std::fs::read(root)?)) - .read_to_end(&mut data)?; - - let file_list = tar::Archive::new(Cursor::new(&data)) - .entries_with_seek() - .map(|e| { - e.into_iter().filter_map(|e| { - Some(e.ok()?.path().ok()?.as_os_str().to_str()?.to_string()) - }) - })? 
- .collect::>(); - - Ok(Self::Tar(RwLock::new(Ok(TarArchive { - archive: data, - file_list, - })))) + Ok(Self::Zip(RwLock::new(Ok(ZipData::try_new( + std::fs::read(root)?, + )?)))) + } else if root.extension().is_some_and(|ext| ext == "gz") { + Ok(Self::Tar(RwLock::new(Ok(TarArchive::try_new( + std::fs::read(root)?, + )?)))) } else { Err(DataError::custom("unsupported archive type").with_display_context(&root.display())) } } + #[cfg(feature = "networking")] + pub fn new_zip_from_url(path: String) -> Self { + Self::Zip(RwLock::new(Err(path))) + } + + #[cfg(feature = "networking")] + pub fn new_tar_from_url(path: String) -> Self { + Self::Tar(RwLock::new(Err(path))) + } + #[cfg(feature = "networking")] pub fn new_from_url(path: String) -> Self { - if path.ends_with(".zip") { - Self::Zip(RwLock::new(Err(path))) - } else { - Self::Tar(RwLock::new(Err(path))) - } + // We store the path without trailing / and add them ourselves + Self::Http(path.trim_end_matches('/').to_string()) } - fn init(&self) -> Result<(), DataError> { - #[cfg(feature = "networking")] - fn download(resource: &String) -> Result { - let root = std::env::var_os("ICU4X_SOURCE_CACHE") - .map(PathBuf::from) - .unwrap_or_else(|| std::env::temp_dir().join("icu4x-source-cache/")) - .join(resource.rsplit("//").next().unwrap()); - if !root.exists() { - log::info!("Downloading {resource}"); - std::fs::create_dir_all(root.parent().unwrap())?; - let mut retry = 5; - let mut response = loop { - match ureq::get(resource).call() { - Ok(r) => break r.into_body().into_reader(), - Err(e) if retry > 0 => { - log::warn!("Download error {e:?}, retrying..."); - std::thread::sleep(std::time::Duration::from_secs(2)); - retry -= 1; - } - Err(e) => { - return Err(DataError::custom("Download").with_display_context(&e)) - } - } - }; - std::io::copy(&mut response, &mut BufWriter::new(File::create(&root)?))?; - } - Ok(root) + #[cfg(feature = "networking")] + fn download(resource: &String) -> Result { + let root = 
std::env::var_os("ICU4X_SOURCE_CACHE") + .map(PathBuf::from) + .unwrap_or_else(|| std::env::temp_dir().join("icu4x-source-cache/")) + .join(resource.rsplit("//").next().unwrap()); + if root.exists() { + return Ok(root); } + static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _one_download_at_a_time = LOCK.lock().unwrap(); + if root.exists() { + return Ok(root); + } + log::info!("Downloading {resource}"); + std::fs::create_dir_all(root.parent().unwrap())?; + let mut retry = 5; + let mut response = loop { + match ureq::get(resource).call() { + Ok(r) => break r.into_body().into_reader(), + Err(e) if retry > 0 => { + log::warn!("Download error {e:?}, retrying..."); + std::thread::sleep(std::time::Duration::from_secs(2)); + retry -= 1; + } + Err(e) => return Err(DataError::custom("Download").with_display_context(&e)), + } + }; + // Cannot write directly to the final path because we don't want other threads to read the partial file + std::io::copy( + &mut response, + &mut BufWriter::new(File::create(root.with_extension("tmp"))?), + )?; + std::fs::rename(root.with_extension("tmp"), &root)?; + Ok(root) + } + fn init(&self) -> Result<(), DataError> { #[cfg(feature = "networking")] if let Self::Zip(lock) = self { if lock.read().expect("poison").is_ok() { @@ -209,17 +234,11 @@ impl AbstractFs { return Ok(()); }; - let root = download(resource)?; - - let archive = ZipArchive::new(Cursor::new(std::fs::read(&root)?)).map_err(|e| { - DataError::custom("Invalid ZIP file") - .with_display_context(&e) - .with_path_context(&root) - })?; + let root = Self::download(resource)?; - let file_list = archive.file_names().map(String::from).collect(); - - *lock = Ok(ZipData { archive, file_list }); + *lock = + Ok(ZipData::try_new(std::fs::read(&root)?) 
+ .map_err(|e| e.with_path_context(&root))?); } else if let Self::Tar(lock) = self { if lock.read().expect("poison").is_ok() { return Ok(()); @@ -231,24 +250,10 @@ impl AbstractFs { return Ok(()); }; - use std::io::Read; - let mut data = Vec::new(); - flate2::read::GzDecoder::new(Cursor::new(std::fs::read(&download(resource)?)?)) - .read_to_end(&mut data)?; + let root = Self::download(resource)?; - let file_list = tar::Archive::new(Cursor::new(&data)) - .entries_with_seek() - .map(|e| { - e.into_iter().filter_map(|e| { - Some(e.ok()?.path().ok()?.as_os_str().to_str()?.to_string()) - }) - })? - .collect::>(); - - *lock = Ok(TarArchive { - archive: data, - file_list, - }) + *lock = Ok(TarArchive::try_new(std::fs::read(&root)?) + .map_err(|e| e.with_path_context(&root))?); } Ok(()) } @@ -304,6 +309,8 @@ impl AbstractFs { .with_display_context(path) }) } + #[cfg(feature = "networking")] + Self::Http(url) => Ok(std::fs::read(Self::download(&format!("{url}/{path}"))?)?), Self::Memory(map) => map.get(path).copied().map(Vec::from).ok_or_else(|| { DataError::custom("Not found in icu4x-datagen's data/").with_display_context(path) }), @@ -352,6 +359,12 @@ impl AbstractFs { .map(String::from) .collect::>() .into_iter(), + #[cfg(feature = "networking")] + Self::Http(url) => { + return Err( + DataError::custom("Cannot list HTTP directories").with_display_context(url) + ) + } Self::Memory(map) => map .keys() .copied() @@ -383,6 +396,8 @@ impl AbstractFs { .unwrap() // init called .file_list .contains(path), + #[cfg(feature = "networking")] + Self::Http(url) => Self::download(&format!("{url}/{path}")).is_ok(), Self::Memory(map) => map.contains_key(path), }) } @@ -493,6 +508,110 @@ impl TzdbCache { } } +// A cache representing https://unicode.org/Public/{version}/ +#[derive(Debug)] +pub(crate) struct UnicodeCache { + root: AbstractFs, + // The `ucd/UCD.zip` file. Requests matching `ucd/[^unihan]` will be resolved through + // the ZIP file instead of downloading individual files. 
+ ucd_zip: Option, + // The `ucd/Unihan.zip` file. Requests matching `ucd/unihan/` will be resolved through + // the ZIP file instead of downloading individual files. + unihan_zip: Option, + // The `security/uts39-data-X.0.0.zip` file. Requests matching `security/` will be + // resolved through the ZIP file instead of downloading individual files. + uts_35_zip: Option, + // Cached file contents. It's all text files, so we cache them as strings. + file_cache: FrozenMap, +} + +impl UnicodeCache { + #[cfg(feature = "networking")] + pub fn new_remote(version: &str) -> Self { + let root = AbstractFs::new_from_url(format!("https://www.unicode.org/Public/{version}/")); + let ucd_zip = AbstractFs::new_zip_from_url(format!( + "https://www.unicode.org/Public/{version}/ucd/UCD.zip" + )); + let unihan_zip = AbstractFs::new_zip_from_url(format!( + "https://www.unicode.org/Public/{version}/ucd/Unihan.zip" + )); + let uts_35_zip = AbstractFs::new_zip_from_url(format!( + "https://www.unicode.org/Public/{version}/security/uts39-data-{version}.zip" + )); + Self { + root, + ucd_zip: Some(ucd_zip), + unihan_zip: Some(unihan_zip), + uts_35_zip: Some(uts_35_zip), + file_cache: FrozenMap::new(), + } + } + + pub fn new_local(root: AbstractFs) -> Self { + Self { + root, + ucd_zip: None, + unihan_zip: None, + uts_35_zip: None, + file_cache: FrozenMap::new(), + } + } + + #[allow(dead_code)] + pub fn file_exists(&self, file: &str) -> Result { + if self.file_cache.get(file).is_some() { + return Ok(true); + } + + if let (Some(unihan_zip), Some(unihan_path)) = + (self.unihan_zip.as_ref(), file.strip_prefix("ucd/unihan/")) + { + Ok(unihan_zip.file_exists(unihan_path)?) + } else if let (Some(ucd_zip), Some(ucd_path)) = + (self.ucd_zip.as_ref(), file.strip_prefix("ucd/")) + { + Ok(ucd_zip.file_exists(ucd_path)?) + } else if let (Some(uts_35_zip), Some(uts_35_path)) = + (self.uts_35_zip.as_ref(), file.strip_prefix("security/")) + { + Ok(uts_35_zip.file_exists(uts_35_path)?) 
+ } else { + Ok(self.root.file_exists(file)?) + } + } + + #[allow(dead_code)] // only used with CodePointTrieBuilder, which is feature-gated + pub fn read_to_string(&self, file: &str) -> Result<&str, DataError> { + if let Some(x) = self.file_cache.get(file) { + return Ok(x); + } + + if let (Some(unihan_zip), Some(unihan_path)) = + (self.unihan_zip.as_ref(), file.strip_prefix("ucd/unihan/")) + { + Ok(self + .file_cache + .insert(file.to_string(), unihan_zip.read_to_string(unihan_path)?)) + } else if let (Some(ucd_zip), Some(ucd_path)) = + (self.ucd_zip.as_ref(), file.strip_prefix("ucd/")) + { + Ok(self + .file_cache + .insert(file.to_string(), ucd_zip.read_to_string(ucd_path)?)) + } else if let (Some(uts_35_zip), Some(uts_35_path)) = + (self.uts_35_zip.as_ref(), file.strip_prefix("security/")) + { + Ok(self + .file_cache + .insert(file.to_string(), uts_35_zip.read_to_string(uts_35_path)?)) + } else { + Ok(self + .file_cache + .insert(file.to_string(), self.root.read_to_string(file)?)) + } + } +} + macro_rules! include_files { ($base:literal; $($file:literal),* $(,)?) 
=> { crate::source::AbstractFs::Memory([ diff --git a/provider/source/src/tests/data.rs b/provider/source/src/tests/data.rs index 73f4d2ef470..60cd2702ea7 100644 --- a/provider/source/src/tests/data.rs +++ b/provider/source/src/tests/data.rs @@ -667,18 +667,11 @@ pub fn lstm_data() -> AbstractFs { } #[rustfmt::skip] -pub fn unihan_data() -> AbstractFs { +pub fn unicode_data() -> AbstractFs { include_files!( - "../../tests/data/unihan/"; - "Unihan_IRGSources.txt" - ) -} - -#[rustfmt::skip] -pub fn ucd_data() -> AbstractFs { - include_files!( - "../../tests/data/ucd/"; - "security/IdentifierStatus.txt" + "../../tests/data/unicode/"; + "security/IdentifierStatus.txt", + "ucd/unihan/Unihan_IRGSources.txt" ) } diff --git a/provider/source/src/tests/download_repo_sources.rs b/provider/source/src/tests/download_repo_sources.rs index 983607b8ee5..0ffdeaceda3 100644 --- a/provider/source/src/tests/download_repo_sources.rs +++ b/provider/source/src/tests/download_repo_sources.rs @@ -2,7 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::source::AbstractFs; +use crate::source::{AbstractFs, UnicodeCache}; use crate::SourceDataProvider; use icu::locale::{langid, LanguageIdentifier}; use icu_provider::DataError; @@ -38,6 +38,29 @@ impl AbstractFs { } } +impl UnicodeCache { + pub fn dump( + &self, + target: &Path, + mut files: BTreeSet, + ) -> Result, DataError> { + std::fs::remove_dir_all(target)?; + + for file in files.clone() { + if !self.file_exists(&file).unwrap() { + files.remove(&file); + continue; + } + + std::fs::create_dir_all(target.join(&file).parent().unwrap())?; + crlify::BufWriterWithLineEndingFix::new(File::create(target.join(&file))?) 
+ .write_all(self.read_to_string(&file)?.as_bytes())?; + } + + Ok(files) + } +} + #[test] #[ignore] fn download_repo_sources() { @@ -95,15 +118,15 @@ fn download_repo_sources() { ) .unwrap(); - let unihan_files = provider - .unihan_paths + let unicode_files = provider + .unicode_paths .unwrap() .dump( - &out_root.join("unihan"), - UNIHAN_GLOB.iter().copied().map(String::from).collect(), + &out_root.join("unicode"), + UNICODE_GLOB.iter().copied().map(String::from).collect(), ) .unwrap(); - let irg_path = out_root.join("unihan/Unihan_IRGSources.txt"); + let irg_path = out_root.join("unicode/ucd/unihan/Unihan_IRGSources.txt"); std::io::copy( &mut BufReader::new(File::open(&irg_path).unwrap()) .lines() @@ -116,42 +139,6 @@ fn download_repo_sources() { ) .unwrap(); - // Cannot use AbstractFs::dump because UCD is not a functioning data source - std::fs::remove_dir_all(out_root.join("ucd")).unwrap(); - let mut ucd_files = BTreeSet::new(); - for spath in UCD_GLOB { - let path = out_root.join("ucd").join(spath); - std::fs::create_dir_all(path.parent().unwrap()).unwrap(); - std::io::copy( - &mut ureq::get(&format!( - "https://www.unicode.org/Public/{}/security/IdentifierStatus.txt", - SourceDataProvider::TESTED_UCD_TAG, - )) - .call() - .map_err(|e| DataError::custom("Download").with_display_context(&e)) - .unwrap() - .into_body() - .into_reader(), - &mut crlify::BufWriterWithLineEndingFix::new(File::create(path).unwrap()), - ) - .unwrap(); - ucd_files.insert(spath.to_string()); - } - let identifier_status_path = out_root.join("ucd/security/IdentifierStatus.txt"); - std::io::copy( - &mut BufReader::new(File::open(&identifier_status_path).unwrap()) - .lines() - .map_while(Result::ok) - .filter(|l| l.contains("CJK") || l.starts_with('#')) - .collect::>() - .join("\n") - .as_bytes(), - &mut crlify::BufWriterWithLineEndingFix::new( - File::create(&identifier_status_path).unwrap(), - ), - ) - .unwrap(); - let mut tzdb_files = provider .tzdb_paths .unwrap() @@ -192,12 +179,11 @@ 
fn download_repo_sources() { tzdb_files.remove("Makefile"); tzdb_files.remove("ziguard.awk"); - let [cldr_files, icuexport_files, lstm_files, unihan_files, ucd_files, tzdb_files] = [ + let [cldr_files, icuexport_files, lstm_files, unicode_files, tzdb_files] = [ cldr_files, icuexport_files, lstm_files, - unihan_files, - ucd_files, + unicode_files, tzdb_files, ] .map(|files| { @@ -246,18 +232,10 @@ pub fn lstm_data() -> AbstractFs {{ }} #[rustfmt::skip] -pub fn unihan_data() -> AbstractFs {{ - include_files!( - \"../../tests/data/unihan/\"; - {unihan_files} - ) -}} - -#[rustfmt::skip] -pub fn ucd_data() -> AbstractFs {{ +pub fn unicode_data() -> AbstractFs {{ include_files!( - \"../../tests/data/ucd/\"; - {ucd_files} + \"../../tests/data/unicode/\"; + {unicode_files} ) }} diff --git a/provider/source/src/tests/mod.rs b/provider/source/src/tests/mod.rs index 7e0c0502a7f..19764fc56d6 100644 --- a/provider/source/src/tests/mod.rs +++ b/provider/source/src/tests/mod.rs @@ -13,7 +13,7 @@ mod make_testdata; include!("data.rs"); use crate::cldr_cache::CldrCache; -use crate::source::{SerdeCache, TzdbCache}; +use crate::source::{SerdeCache, TzdbCache, UnicodeCache}; use crate::SourceDataProvider; use std::sync::{Arc, OnceLock}; @@ -27,8 +27,7 @@ impl SourceDataProvider { cldr_paths: Some(Arc::new(CldrCache::new(cldr_data()))), icuexport_paths: Some(Arc::new(SerdeCache::new(icuexport_data()))), segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(lstm_data()))), - unihan_paths: Some(Arc::new(unihan_data())), - ucd_paths: Some(Arc::new(ucd_data())), + unicode_paths: Some(Arc::new(UnicodeCache::new_local(unicode_data()))), tzdb_paths: Some(Arc::new(TzdbCache::new(tzdb_data()))), ..SourceDataProvider::new_custom() }) diff --git a/provider/source/tests/data/ucd/security/IdentifierStatus.txt b/provider/source/tests/data/unicode/security/IdentifierStatus.txt similarity index 77% rename from provider/source/tests/data/ucd/security/IdentifierStatus.txt rename to 
provider/source/tests/data/unicode/security/IdentifierStatus.txt index a052c71f31d..21a79503afe 100644 --- a/provider/source/tests/data/ucd/security/IdentifierStatus.txt +++ b/provider/source/tests/data/unicode/security/IdentifierStatus.txt @@ -15,6 +15,7 @@ # Field 1: Identifier_Status value # See the "Identifier_Status and Identifier_Type" table of UTS #39: # https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type + # # For the purpose of regular expressions, the property Identifier_Status is defined as # an enumerated property of code points. @@ -22,10 +23,397 @@ # The possible values are: # Allowed, Restricted # The short name of each value is the same as its long name. + # All code points not explicitly listed for Identifier_Status # have the value Restricted. + # @missing: 0000..10FFFF; Restricted + + # Identifier_Status: Allowed + +0027 ; Allowed # 1.1 APOSTROPHE +002D..002E ; Allowed # 1.1 [2] HYPHEN-MINUS..FULL STOP +0030..003A ; Allowed # 1.1 [11] DIGIT ZERO..COLON +0041..005A ; Allowed # 1.1 [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z +005F ; Allowed # 1.1 LOW LINE +0061..007A ; Allowed # 1.1 [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z +00B7 ; Allowed # 1.1 MIDDLE DOT +00C0..00D6 ; Allowed # 1.1 [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS +00D8..00F6 ; Allowed # 1.1 [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS +00F8..0113 ; Allowed # 1.1 [28] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER E WITH MACRON +0116..012B ; Allowed # 1.1 [22] LATIN CAPITAL LETTER E WITH DOT ABOVE..LATIN SMALL LETTER I WITH MACRON +012E..0131 ; Allowed # 1.1 [4] LATIN CAPITAL LETTER I WITH OGONEK..LATIN SMALL LETTER DOTLESS I +0134..0137 ; Allowed # 1.1 [4] LATIN CAPITAL LETTER J WITH CIRCUMFLEX..LATIN SMALL LETTER K WITH CEDILLA +0139..013E ; Allowed # 1.1 [6] LATIN CAPITAL LETTER L WITH ACUTE..LATIN SMALL LETTER L WITH CARON +0141..0148 ; Allowed # 1.1 [8] LATIN CAPITAL LETTER L WITH 
STROKE..LATIN SMALL LETTER N WITH CARON +014A..014D ; Allowed # 1.1 [4] LATIN CAPITAL LETTER ENG..LATIN SMALL LETTER O WITH MACRON +0150..0155 ; Allowed # 1.1 [6] LATIN CAPITAL LETTER O WITH DOUBLE ACUTE..LATIN SMALL LETTER R WITH ACUTE +0158..0161 ; Allowed # 1.1 [10] LATIN CAPITAL LETTER R WITH CARON..LATIN SMALL LETTER S WITH CARON +0164..017E ; Allowed # 1.1 [27] LATIN CAPITAL LETTER T WITH CARON..LATIN SMALL LETTER Z WITH CARON +0181 ; Allowed # 1.1 LATIN CAPITAL LETTER B WITH HOOK +0186 ; Allowed # 1.1 LATIN CAPITAL LETTER OPEN O +0189..018A ; Allowed # 1.1 [2] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH HOOK +018E..0192 ; Allowed # 1.1 [5] LATIN CAPITAL LETTER REVERSED E..LATIN SMALL LETTER F WITH HOOK +0194 ; Allowed # 1.1 LATIN CAPITAL LETTER GAMMA +0196..0199 ; Allowed # 1.1 [4] LATIN CAPITAL LETTER IOTA..LATIN SMALL LETTER K WITH HOOK +019D ; Allowed # 1.1 LATIN CAPITAL LETTER N WITH LEFT HOOK +01A0..01A1 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER O WITH HORN..LATIN SMALL LETTER O WITH HORN +01AF..01B0 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER U WITH HORN..LATIN SMALL LETTER U WITH HORN +01B2..01B4 ; Allowed # 1.1 [3] LATIN CAPITAL LETTER V WITH HOOK..LATIN SMALL LETTER Y WITH HOOK +01B7 ; Allowed # 1.1 LATIN CAPITAL LETTER EZH +01CD..01D4 ; Allowed # 1.1 [8] LATIN CAPITAL LETTER A WITH CARON..LATIN SMALL LETTER U WITH CARON +01DD ; Allowed # 1.1 LATIN SMALL LETTER TURNED E +01E6..01E9 ; Allowed # 1.1 [4] LATIN CAPITAL LETTER G WITH CARON..LATIN SMALL LETTER K WITH CARON +01EE..01EF ; Allowed # 1.1 [2] LATIN CAPITAL LETTER EZH WITH CARON..LATIN SMALL LETTER EZH WITH CARON +01F8..01F9 ; Allowed # 3.0 [2] LATIN CAPITAL LETTER N WITH GRAVE..LATIN SMALL LETTER N WITH GRAVE +0218..021B ; Allowed # 3.0 [4] LATIN CAPITAL LETTER S WITH COMMA BELOW..LATIN SMALL LETTER T WITH COMMA BELOW +0244 ; Allowed # 5.0 LATIN CAPITAL LETTER U BAR +024C..024D ; Allowed # 5.0 [2] LATIN CAPITAL LETTER R WITH STROKE..LATIN SMALL LETTER R WITH STROKE +0253..0254 ; 
Allowed # 1.1 [2] LATIN SMALL LETTER B WITH HOOK..LATIN SMALL LETTER OPEN O +0256..0257 ; Allowed # 1.1 [2] LATIN SMALL LETTER D WITH TAIL..LATIN SMALL LETTER D WITH HOOK +0259 ; Allowed # 1.1 LATIN SMALL LETTER SCHWA +025B ; Allowed # 1.1 LATIN SMALL LETTER OPEN E +0263 ; Allowed # 1.1 LATIN SMALL LETTER GAMMA +0268..0269 ; Allowed # 1.1 [2] LATIN SMALL LETTER I WITH STROKE..LATIN SMALL LETTER IOTA +0272 ; Allowed # 1.1 LATIN SMALL LETTER N WITH LEFT HOOK +0289 ; Allowed # 1.1 LATIN SMALL LETTER U BAR +028B ; Allowed # 1.1 LATIN SMALL LETTER V WITH HOOK +0292 ; Allowed # 1.1 LATIN SMALL LETTER EZH +02BB..02BC ; Allowed # 1.1 [2] MODIFIER LETTER TURNED COMMA..MODIFIER LETTER APOSTROPHE +0300..0304 ; Allowed # 1.1 [5] COMBINING GRAVE ACCENT..COMBINING MACRON +0306..030C ; Allowed # 1.1 [7] COMBINING BREVE..COMBINING CARON +031B ; Allowed # 1.1 COMBINING HORN +0323 ; Allowed # 1.1 COMBINING DOT BELOW +0326..0328 ; Allowed # 1.1 [3] COMBINING COMMA BELOW..COMBINING OGONEK +0331 ; Allowed # 1.1 COMBINING MACRON BELOW +0386 ; Allowed # 1.1 GREEK CAPITAL LETTER ALPHA WITH TONOS +0388..038A ; Allowed # 1.1 [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS +038C ; Allowed # 1.1 GREEK CAPITAL LETTER OMICRON WITH TONOS +038E..03A1 ; Allowed # 1.1 [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO +03A3..03CE ; Allowed # 1.1 [44] GREEK CAPITAL LETTER SIGMA..GREEK SMALL LETTER OMEGA WITH TONOS +0401..040C ; Allowed # 1.1 [12] CYRILLIC CAPITAL LETTER IO..CYRILLIC CAPITAL LETTER KJE +040E..044F ; Allowed # 1.1 [66] CYRILLIC CAPITAL LETTER SHORT U..CYRILLIC SMALL LETTER YA +0451..045C ; Allowed # 1.1 [12] CYRILLIC SMALL LETTER IO..CYRILLIC SMALL LETTER KJE +045E..045F ; Allowed # 1.1 [2] CYRILLIC SMALL LETTER SHORT U..CYRILLIC SMALL LETTER DZHE +0490..049B ; Allowed # 1.1 [12] CYRILLIC CAPITAL LETTER GHE WITH UPTURN..CYRILLIC SMALL LETTER KA WITH DESCENDER +049E..04A5 ; Allowed # 1.1 [8] CYRILLIC CAPITAL LETTER KA WITH 
STROKE..CYRILLIC SMALL LIGATURE EN GHE +04A8..04B7 ; Allowed # 1.1 [16] CYRILLIC CAPITAL LETTER ABKHASIAN HA..CYRILLIC SMALL LETTER CHE WITH DESCENDER +04BA..04C0 ; Allowed # 1.1 [7] CYRILLIC CAPITAL LETTER SHHA..CYRILLIC LETTER PALOCHKA +04CF ; Allowed # 5.0 CYRILLIC SMALL LETTER PALOCHKA +04D0..04D9 ; Allowed # 1.1 [10] CYRILLIC CAPITAL LETTER A WITH BREVE..CYRILLIC SMALL LETTER SCHWA +04DC..04E9 ; Allowed # 1.1 [14] CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS..CYRILLIC SMALL LETTER BARRED O +04EE..04F5 ; Allowed # 1.1 [8] CYRILLIC CAPITAL LETTER U WITH MACRON..CYRILLIC SMALL LETTER CHE WITH DIAERESIS +04F8..04F9 ; Allowed # 1.1 [2] CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS..CYRILLIC SMALL LETTER YERU WITH DIAERESIS +0524..0525 ; Allowed # 5.2 [2] CYRILLIC CAPITAL LETTER PE WITH DESCENDER..CYRILLIC SMALL LETTER PE WITH DESCENDER +0531..0556 ; Allowed # 1.1 [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH +0561..0586 ; Allowed # 1.1 [38] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LETTER FEH +058A ; Allowed # 3.0 ARMENIAN HYPHEN +05D0..05EA ; Allowed # 1.1 [27] HEBREW LETTER ALEF..HEBREW LETTER TAV +05F3..05F4 ; Allowed # 1.1 [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM +0620 ; Allowed # 6.0 ARABIC LETTER KASHMIRI YEH +0621..063A ; Allowed # 1.1 [26] ARABIC LETTER HAMZA..ARABIC LETTER GHAIN +063D ; Allowed # 5.1 ARABIC LETTER FARSI YEH WITH INVERTED V +0641..0652 ; Allowed # 1.1 [18] ARABIC LETTER FEH..ARABIC SUKUN +0654..0655 ; Allowed # 3.0 [2] ARABIC HAMZA ABOVE..ARABIC HAMZA BELOW +0660..0669 ; Allowed # 1.1 [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE +0670 ; Allowed # 1.1 ARABIC LETTER SUPERSCRIPT ALEF +0672 ; Allowed # 1.1 ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE +0674 ; Allowed # 1.1 ARABIC LETTER HIGH HAMZA +0679..068F ; Allowed # 1.1 [23] ARABIC LETTER TTEH..ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS +0691..069A ; Allowed # 1.1 [10] ARABIC LETTER RREH..ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE +069F..06A0 
; Allowed # 1.1 [2] ARABIC LETTER TAH WITH THREE DOTS ABOVE..ARABIC LETTER AIN WITH THREE DOTS ABOVE +06A2 ; Allowed # 1.1 ARABIC LETTER FEH WITH DOT MOVED BELOW +06A4..06AB ; Allowed # 1.1 [8] ARABIC LETTER VEH..ARABIC LETTER KAF WITH RING +06AD..06B1 ; Allowed # 1.1 [5] ARABIC LETTER NG..ARABIC LETTER NGOEH +06B3 ; Allowed # 1.1 ARABIC LETTER GUEH +06B5..06B7 ; Allowed # 1.1 [3] ARABIC LETTER LAM WITH SMALL V..ARABIC LETTER LAM WITH THREE DOTS ABOVE +06BA..06BE ; Allowed # 1.1 [5] ARABIC LETTER NOON GHUNNA..ARABIC LETTER HEH DOACHASHMEE +06C0..06CE ; Allowed # 1.1 [15] ARABIC LETTER HEH WITH YEH ABOVE..ARABIC LETTER YEH WITH SMALL V +06CF ; Allowed # 3.0 ARABIC LETTER WAW WITH DOT ABOVE +06D0..06D3 ; Allowed # 1.1 [4] ARABIC LETTER E..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE +06D5 ; Allowed # 1.1 ARABIC LETTER AE +06EE..06EF ; Allowed # 4.0 [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V +06F0..06F9 ; Allowed # 1.1 [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE +06FD..06FE ; Allowed # 3.0 [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN +06FF ; Allowed # 4.0 ARABIC LETTER HEH WITH INVERTED V +0751..0752 ; Allowed # 4.1 [2] ARABIC LETTER BEH WITH DOT BELOW AND THREE DOTS ABOVE..ARABIC LETTER BEH WITH THREE DOTS POINTING UPWARDS BELOW +0756 ; Allowed # 4.1 ARABIC LETTER BEH WITH SMALL V +0760 ; Allowed # 4.1 ARABIC LETTER FEH WITH TWO DOTS BELOW +0762..0763 ; Allowed # 4.1 [2] ARABIC LETTER KEHEH WITH DOT ABOVE..ARABIC LETTER KEHEH WITH THREE DOTS ABOVE +0766..0768 ; Allowed # 4.1 [3] ARABIC LETTER MEEM WITH DOT BELOW..ARABIC LETTER NOON WITH SMALL TAH +076A ; Allowed # 4.1 ARABIC LETTER LAM WITH BAR +076E..0771 ; Allowed # 5.1 [4] ARABIC LETTER HAH WITH SMALL ARABIC LETTER TAH BELOW..ARABIC LETTER REH WITH SMALL ARABIC LETTER TAH AND TWO DOTS +0780..07B0 ; Allowed # 3.0 [49] THAANA LETTER HAA..THAANA SUKUN +07B1 ; Allowed # 3.2 THAANA LETTER NAA +088F ; Allowed # 17.0 ARABIC LETTER NOON WITH 
RING ABOVE +08A0 ; Allowed # 6.1 ARABIC LETTER BEH WITH SMALL V BELOW +08A2..08A9 ; Allowed # 6.1 [8] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER YEH WITH TWO DOTS BELOW AND DOT ABOVE +08BB..08BD ; Allowed # 9.0 [3] ARABIC LETTER AFRICAN FEH..ARABIC LETTER AFRICAN NOON +08BE..08C2 ; Allowed # 13.0 [5] ARABIC LETTER PEH WITH SMALL V..ARABIC LETTER KEHEH WITH SMALL V +08C7 ; Allowed # 13.0 ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE +0901..0903 ; Allowed # 1.1 [3] DEVANAGARI SIGN CANDRABINDU..DEVANAGARI SIGN VISARGA +0905..090B ; Allowed # 1.1 [7] DEVANAGARI LETTER A..DEVANAGARI LETTER VOCALIC R +090D..0928 ; Allowed # 1.1 [28] DEVANAGARI LETTER CANDRA E..DEVANAGARI LETTER NA +092A..0933 ; Allowed # 1.1 [10] DEVANAGARI LETTER PA..DEVANAGARI LETTER LLA +0935..0939 ; Allowed # 1.1 [5] DEVANAGARI LETTER VA..DEVANAGARI LETTER HA +093A..093B ; Allowed # 6.0 [2] DEVANAGARI VOWEL SIGN OE..DEVANAGARI VOWEL SIGN OOE +093C ; Allowed # 1.1 DEVANAGARI SIGN NUKTA +093E..0943 ; Allowed # 1.1 [6] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN VOCALIC R +0945..094D ; Allowed # 1.1 [9] DEVANAGARI VOWEL SIGN CANDRA E..DEVANAGARI SIGN VIRAMA +094F ; Allowed # 6.0 DEVANAGARI VOWEL SIGN AW +0956..0957 ; Allowed # 6.0 [2] DEVANAGARI VOWEL SIGN UE..DEVANAGARI VOWEL SIGN UUE +0966..096F ; Allowed # 1.1 [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE +0972 ; Allowed # 5.1 DEVANAGARI LETTER CANDRA A +0973..0977 ; Allowed # 6.0 [5] DEVANAGARI LETTER OE..DEVANAGARI LETTER UUE +097B..097C ; Allowed # 5.0 [2] DEVANAGARI LETTER GGA..DEVANAGARI LETTER JJA +097E..097F ; Allowed # 5.0 [2] DEVANAGARI LETTER DDDA..DEVANAGARI LETTER BBA +0981..0983 ; Allowed # 1.1 [3] BENGALI SIGN CANDRABINDU..BENGALI SIGN VISARGA +0985..098B ; Allowed # 1.1 [7] BENGALI LETTER A..BENGALI LETTER VOCALIC R +098F..0990 ; Allowed # 1.1 [2] BENGALI LETTER E..BENGALI LETTER AI +0993..09A8 ; Allowed # 1.1 [22] BENGALI LETTER O..BENGALI LETTER NA +09AA..09B0 ; Allowed # 1.1 [7] BENGALI LETTER PA..BENGALI 
LETTER RA +09B2 ; Allowed # 1.1 BENGALI LETTER LA +09B6..09B9 ; Allowed # 1.1 [4] BENGALI LETTER SHA..BENGALI LETTER HA +09BC ; Allowed # 1.1 BENGALI SIGN NUKTA +09BE..09C4 ; Allowed # 1.1 [7] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN VOCALIC RR +09C7..09C8 ; Allowed # 1.1 [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI +09CB..09CD ; Allowed # 1.1 [3] BENGALI VOWEL SIGN O..BENGALI SIGN VIRAMA +09CE ; Allowed # 4.1 BENGALI LETTER KHANDA TA +09E6..09F1 ; Allowed # 1.1 [12] BENGALI DIGIT ZERO..BENGALI LETTER RA WITH LOWER DIAGONAL +0A02 ; Allowed # 1.1 GURMUKHI SIGN BINDI +0A05..0A0A ; Allowed # 1.1 [6] GURMUKHI LETTER A..GURMUKHI LETTER UU +0A0F..0A10 ; Allowed # 1.1 [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI +0A13..0A28 ; Allowed # 1.1 [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA +0A2A..0A30 ; Allowed # 1.1 [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA +0A32 ; Allowed # 1.1 GURMUKHI LETTER LA +0A35 ; Allowed # 1.1 GURMUKHI LETTER VA +0A38..0A39 ; Allowed # 1.1 [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA +0A3C ; Allowed # 1.1 GURMUKHI SIGN NUKTA +0A3E..0A42 ; Allowed # 1.1 [5] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN UU +0A47..0A48 ; Allowed # 1.1 [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI +0A4B..0A4D ; Allowed # 1.1 [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA +0A5C ; Allowed # 1.1 GURMUKHI LETTER RRA +0A70..0A71 ; Allowed # 1.1 [2] GURMUKHI TIPPI..GURMUKHI ADDAK +0A82..0A83 ; Allowed # 1.1 [2] GUJARATI SIGN ANUSVARA..GUJARATI SIGN VISARGA +0A85..0A8B ; Allowed # 1.1 [7] GUJARATI LETTER A..GUJARATI LETTER VOCALIC R +0A8C ; Allowed # 4.0 GUJARATI LETTER VOCALIC L +0A8D ; Allowed # 1.1 GUJARATI VOWEL CANDRA E +0A8F..0A91 ; Allowed # 1.1 [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O +0A93..0AA8 ; Allowed # 1.1 [22] GUJARATI LETTER O..GUJARATI LETTER NA +0AAA..0AB0 ; Allowed # 1.1 [7] GUJARATI LETTER PA..GUJARATI LETTER RA +0AB2..0AB3 ; Allowed # 1.1 [2] GUJARATI LETTER LA..GUJARATI LETTER LLA +0AB5..0AB9 ; Allowed # 1.1 [5] GUJARATI LETTER VA..GUJARATI 
LETTER HA +0ABC ; Allowed # 1.1 GUJARATI SIGN NUKTA +0ABE..0AC5 ; Allowed # 1.1 [8] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN CANDRA E +0AC7..0AC9 ; Allowed # 1.1 [3] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN CANDRA O +0ACB..0ACD ; Allowed # 1.1 [3] GUJARATI VOWEL SIGN O..GUJARATI SIGN VIRAMA +0AE6..0AEF ; Allowed # 1.1 [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE +0B01..0B03 ; Allowed # 1.1 [3] ORIYA SIGN CANDRABINDU..ORIYA SIGN VISARGA +0B05..0B0B ; Allowed # 1.1 [7] ORIYA LETTER A..ORIYA LETTER VOCALIC R +0B0F..0B10 ; Allowed # 1.1 [2] ORIYA LETTER E..ORIYA LETTER AI +0B13..0B28 ; Allowed # 1.1 [22] ORIYA LETTER O..ORIYA LETTER NA +0B2A..0B30 ; Allowed # 1.1 [7] ORIYA LETTER PA..ORIYA LETTER RA +0B32..0B33 ; Allowed # 1.1 [2] ORIYA LETTER LA..ORIYA LETTER LLA +0B36..0B39 ; Allowed # 1.1 [4] ORIYA LETTER SHA..ORIYA LETTER HA +0B3C ; Allowed # 1.1 ORIYA SIGN NUKTA +0B3E..0B43 ; Allowed # 1.1 [6] ORIYA VOWEL SIGN AA..ORIYA VOWEL SIGN VOCALIC R +0B47..0B48 ; Allowed # 1.1 [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI +0B4B..0B4D ; Allowed # 1.1 [3] ORIYA VOWEL SIGN O..ORIYA SIGN VIRAMA +0B56 ; Allowed # 1.1 ORIYA AI LENGTH MARK +0B5F ; Allowed # 1.1 ORIYA LETTER YYA +0B71 ; Allowed # 4.0 ORIYA LETTER WA +0B83 ; Allowed # 1.1 TAMIL SIGN VISARGA +0B85..0B8A ; Allowed # 1.1 [6] TAMIL LETTER A..TAMIL LETTER UU +0B8E..0B90 ; Allowed # 1.1 [3] TAMIL LETTER E..TAMIL LETTER AI +0B92..0B95 ; Allowed # 1.1 [4] TAMIL LETTER O..TAMIL LETTER KA +0B99..0B9A ; Allowed # 1.1 [2] TAMIL LETTER NGA..TAMIL LETTER CA +0B9C ; Allowed # 1.1 TAMIL LETTER JA +0B9E..0B9F ; Allowed # 1.1 [2] TAMIL LETTER NYA..TAMIL LETTER TTA +0BA3..0BA4 ; Allowed # 1.1 [2] TAMIL LETTER NNA..TAMIL LETTER TA +0BA8..0BAA ; Allowed # 1.1 [3] TAMIL LETTER NA..TAMIL LETTER PA +0BAE..0BB5 ; Allowed # 1.1 [8] TAMIL LETTER MA..TAMIL LETTER VA +0BB6 ; Allowed # 4.1 TAMIL LETTER SHA +0BB7..0BB9 ; Allowed # 1.1 [3] TAMIL LETTER SSA..TAMIL LETTER HA +0BBE..0BC2 ; Allowed # 1.1 [5] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN 
UU +0BC6..0BC8 ; Allowed # 1.1 [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI +0BCA..0BCD ; Allowed # 1.1 [4] TAMIL VOWEL SIGN O..TAMIL SIGN VIRAMA +0C02..0C03 ; Allowed # 1.1 [2] TELUGU SIGN ANUSVARA..TELUGU SIGN VISARGA +0C05..0C0B ; Allowed # 1.1 [7] TELUGU LETTER A..TELUGU LETTER VOCALIC R +0C0E..0C10 ; Allowed # 1.1 [3] TELUGU LETTER E..TELUGU LETTER AI +0C12..0C28 ; Allowed # 1.1 [23] TELUGU LETTER O..TELUGU LETTER NA +0C2A..0C30 ; Allowed # 1.1 [7] TELUGU LETTER PA..TELUGU LETTER RA +0C32..0C33 ; Allowed # 1.1 [2] TELUGU LETTER LA..TELUGU LETTER LLA +0C35..0C39 ; Allowed # 1.1 [5] TELUGU LETTER VA..TELUGU LETTER HA +0C3E..0C44 ; Allowed # 1.1 [7] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN VOCALIC RR +0C46..0C48 ; Allowed # 1.1 [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI +0C4A..0C4D ; Allowed # 1.1 [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA +0C82..0C83 ; Allowed # 1.1 [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA +0C85..0C8B ; Allowed # 1.1 [7] KANNADA LETTER A..KANNADA LETTER VOCALIC R +0C8E..0C90 ; Allowed # 1.1 [3] KANNADA LETTER E..KANNADA LETTER AI +0C92..0CA8 ; Allowed # 1.1 [23] KANNADA LETTER O..KANNADA LETTER NA +0CAA..0CB0 ; Allowed # 1.1 [7] KANNADA LETTER PA..KANNADA LETTER RA +0CB2..0CB3 ; Allowed # 1.1 [2] KANNADA LETTER LA..KANNADA LETTER LLA +0CB5..0CB9 ; Allowed # 1.1 [5] KANNADA LETTER VA..KANNADA LETTER HA +0CBE..0CC3 ; Allowed # 1.1 [6] KANNADA VOWEL SIGN AA..KANNADA VOWEL SIGN VOCALIC R +0CC6..0CC8 ; Allowed # 1.1 [3] KANNADA VOWEL SIGN E..KANNADA VOWEL SIGN AI +0CCA..0CCD ; Allowed # 1.1 [4] KANNADA VOWEL SIGN O..KANNADA SIGN VIRAMA +0CE6..0CEF ; Allowed # 1.1 [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE +0D02..0D03 ; Allowed # 1.1 [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA +0D05..0D0B ; Allowed # 1.1 [7] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC R +0D0E..0D10 ; Allowed # 1.1 [3] MALAYALAM LETTER E..MALAYALAM LETTER AI +0D12..0D28 ; Allowed # 1.1 [23] MALAYALAM LETTER O..MALAYALAM LETTER NA +0D2A..0D39 ; Allowed # 1.1 [16] 
MALAYALAM LETTER PA..MALAYALAM LETTER HA +0D3E..0D43 ; Allowed # 1.1 [6] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN VOCALIC R +0D46..0D48 ; Allowed # 1.1 [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI +0D4A..0D4B ; Allowed # 1.1 [2] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN OO +0D4D ; Allowed # 1.1 MALAYALAM SIGN VIRAMA +0D57 ; Allowed # 1.1 MALAYALAM AU LENGTH MARK +0D7A..0D7F ; Allowed # 5.1 [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K +0D82..0D83 ; Allowed # 3.0 [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA +0D85..0D8D ; Allowed # 3.0 [9] SINHALA LETTER AYANNA..SINHALA LETTER IRUYANNA +0D91..0D96 ; Allowed # 3.0 [6] SINHALA LETTER EYANNA..SINHALA LETTER AUYANNA +0D9A..0D9D ; Allowed # 3.0 [4] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER MAHAAPRAANA GAYANNA +0D9F..0DB1 ; Allowed # 3.0 [19] SINHALA LETTER SANYAKA GAYANNA..SINHALA LETTER DANTAJA NAYANNA +0DB3..0DBB ; Allowed # 3.0 [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA +0DBD ; Allowed # 3.0 SINHALA LETTER DANTAJA LAYANNA +0DC0..0DC6 ; Allowed # 3.0 [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA +0DCA ; Allowed # 3.0 SINHALA SIGN AL-LAKUNA +0DCF..0DD4 ; Allowed # 3.0 [6] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA +0DD6 ; Allowed # 3.0 SINHALA VOWEL SIGN DIGA PAA-PILLA +0DD8..0DDE ; Allowed # 3.0 [7] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA +0DF2 ; Allowed # 3.0 SINHALA VOWEL SIGN DIGA GAETTA-PILLA +0E01..0E32 ; Allowed # 1.1 [50] THAI CHARACTER KO KAI..THAI CHARACTER SARA AA +0E34..0E3A ; Allowed # 1.1 [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU +0E40..0E4D ; Allowed # 1.1 [14] THAI CHARACTER SARA E..THAI CHARACTER NIKHAHIT +0E50..0E59 ; Allowed # 1.1 [10] THAI DIGIT ZERO..THAI DIGIT NINE +0E81..0E82 ; Allowed # 1.1 [2] LAO LETTER KO..LAO LETTER KHO SUNG +0E84 ; Allowed # 1.1 LAO LETTER KHO TAM +0E87..0E88 ; Allowed # 1.1 [2] LAO LETTER NGO..LAO LETTER CO +0E8A ; Allowed # 1.1 LAO LETTER SO TAM 
+0E8D ; Allowed # 1.1 LAO LETTER NYO +0E94..0E97 ; Allowed # 1.1 [4] LAO LETTER DO..LAO LETTER THO TAM +0E99..0E9F ; Allowed # 1.1 [7] LAO LETTER NO..LAO LETTER FO SUNG +0EA1..0EA3 ; Allowed # 1.1 [3] LAO LETTER MO..LAO LETTER LO LING +0EA5 ; Allowed # 1.1 LAO LETTER LO LOOT +0EA7 ; Allowed # 1.1 LAO LETTER WO +0EAA..0EAB ; Allowed # 1.1 [2] LAO LETTER SO SUNG..LAO LETTER HO SUNG +0EAD..0EAE ; Allowed # 1.1 [2] LAO LETTER O..LAO LETTER HO TAM +0EB0..0EB2 ; Allowed # 1.1 [3] LAO VOWEL SIGN A..LAO VOWEL SIGN AA +0EB4..0EB9 ; Allowed # 1.1 [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU +0EBB..0EBD ; Allowed # 1.1 [3] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN NYO +0EC0..0EC4 ; Allowed # 1.1 [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI +0EC6 ; Allowed # 1.1 LAO KO LA +0EC8..0ECD ; Allowed # 1.1 [6] LAO TONE MAI EK..LAO NIGGAHITA +0ED0..0ED9 ; Allowed # 1.1 [10] LAO DIGIT ZERO..LAO DIGIT NINE +0F0B ; Allowed # 2.0 TIBETAN MARK INTERSYLLABIC TSHEG +0F20..0F29 ; Allowed # 2.0 [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE +0F40..0F42 ; Allowed # 2.0 [3] TIBETAN LETTER KA..TIBETAN LETTER GA +0F44..0F47 ; Allowed # 2.0 [4] TIBETAN LETTER NGA..TIBETAN LETTER JA +0F49..0F4C ; Allowed # 2.0 [4] TIBETAN LETTER NYA..TIBETAN LETTER DDA +0F4E..0F51 ; Allowed # 2.0 [4] TIBETAN LETTER NNA..TIBETAN LETTER DA +0F53..0F56 ; Allowed # 2.0 [4] TIBETAN LETTER NA..TIBETAN LETTER BA +0F58..0F5B ; Allowed # 2.0 [4] TIBETAN LETTER MA..TIBETAN LETTER DZA +0F5D..0F68 ; Allowed # 2.0 [12] TIBETAN LETTER WA..TIBETAN LETTER A +0F71..0F72 ; Allowed # 2.0 [2] TIBETAN VOWEL SIGN AA..TIBETAN VOWEL SIGN I +0F74 ; Allowed # 2.0 TIBETAN VOWEL SIGN U +0F7A..0F80 ; Allowed # 2.0 [7] TIBETAN VOWEL SIGN E..TIBETAN VOWEL SIGN REVERSED I +0F84 ; Allowed # 2.0 TIBETAN MARK HALANTA +0F90..0F92 ; Allowed # 2.0 [3] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER GA +0F94..0F95 ; Allowed # 2.0 [2] TIBETAN SUBJOINED LETTER NGA..TIBETAN SUBJOINED LETTER CA +0F96 ; Allowed # 3.0 TIBETAN SUBJOINED LETTER CHA +0F97 ; Allowed # 
2.0 TIBETAN SUBJOINED LETTER JA +0F99..0F9C ; Allowed # 2.0 [4] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER DDA +0F9E..0FA1 ; Allowed # 2.0 [4] TIBETAN SUBJOINED LETTER NNA..TIBETAN SUBJOINED LETTER DA +0FA3..0FA6 ; Allowed # 2.0 [4] TIBETAN SUBJOINED LETTER NA..TIBETAN SUBJOINED LETTER BA +0FA8..0FAB ; Allowed # 2.0 [4] TIBETAN SUBJOINED LETTER MA..TIBETAN SUBJOINED LETTER DZA +0FAD ; Allowed # 2.0 TIBETAN SUBJOINED LETTER WA +0FB1..0FB7 ; Allowed # 2.0 [7] TIBETAN SUBJOINED LETTER YA..TIBETAN SUBJOINED LETTER HA +0FB8 ; Allowed # 3.0 TIBETAN SUBJOINED LETTER A +0FBA..0FBC ; Allowed # 3.0 [3] TIBETAN SUBJOINED LETTER FIXED-FORM WA..TIBETAN SUBJOINED LETTER FIXED-FORM RA +1000..1021 ; Allowed # 3.0 [34] MYANMAR LETTER KA..MYANMAR LETTER A +1022 ; Allowed # 5.1 MYANMAR LETTER SHAN A +1023..1027 ; Allowed # 3.0 [5] MYANMAR LETTER I..MYANMAR LETTER E +1028 ; Allowed # 5.1 MYANMAR LETTER MON E +1029..102A ; Allowed # 3.0 [2] MYANMAR LETTER O..MYANMAR LETTER AU +102B ; Allowed # 5.1 MYANMAR VOWEL SIGN TALL AA +102C..1032 ; Allowed # 3.0 [7] MYANMAR VOWEL SIGN AA..MYANMAR VOWEL SIGN AI +1033..1035 ; Allowed # 5.1 [3] MYANMAR VOWEL SIGN MON II..MYANMAR VOWEL SIGN E ABOVE +1036..1039 ; Allowed # 3.0 [4] MYANMAR SIGN ANUSVARA..MYANMAR SIGN VIRAMA +103A..103F ; Allowed # 5.1 [6] MYANMAR SIGN ASAT..MYANMAR LETTER GREAT SA +1040..1049 ; Allowed # 3.0 [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE +105A..1064 ; Allowed # 5.1 [11] MYANMAR LETTER MON NGA..MYANMAR TONE MARK SGAW KAREN KE PHO +1075..108A ; Allowed # 5.1 [22] MYANMAR LETTER SHAN KA..MYANMAR SIGN SHAN TONE-6 +108F ; Allowed # 5.1 MYANMAR SIGN RUMAI PALAUNG TONE-5 +10C7 ; Allowed # 6.1 GEORGIAN CAPITAL LETTER YN +10CD ; Allowed # 6.1 GEORGIAN CAPITAL LETTER AEN +10D0..10F0 ; Allowed # 1.1 [33] GEORGIAN LETTER AN..GEORGIAN LETTER HAE +1200..1206 ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE HO +1208..1246 ; Allowed # 3.0 [63] ETHIOPIC SYLLABLE LA..ETHIOPIC SYLLABLE QO +1247 ; Allowed # 4.1 
ETHIOPIC SYLLABLE QOA +1248 ; Allowed # 3.0 ETHIOPIC SYLLABLE QWA +124A..124D ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE +1250..1256 ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO +1258 ; Allowed # 3.0 ETHIOPIC SYLLABLE QHWA +125A..125D ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE +1260..1286 ; Allowed # 3.0 [39] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XO +1288 ; Allowed # 3.0 ETHIOPIC SYLLABLE XWA +128A..128D ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE +1290..12AE ; Allowed # 3.0 [31] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KO +12B0 ; Allowed # 3.0 ETHIOPIC SYLLABLE KWA +12B2..12B5 ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE +12B8..12BE ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO +12C0 ; Allowed # 3.0 ETHIOPIC SYLLABLE KXWA +12C2..12C5 ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE +12C8..12CE ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE WO +12CF ; Allowed # 4.1 ETHIOPIC SYLLABLE WOA +12D0..12D6 ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE PHARYNGEAL A..ETHIOPIC SYLLABLE PHARYNGEAL O +12D8..12EE ; Allowed # 3.0 [23] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE YO +12EF ; Allowed # 4.1 ETHIOPIC SYLLABLE YOA +12F0..12F7 ; Allowed # 3.0 [8] ETHIOPIC SYLLABLE DA..ETHIOPIC SYLLABLE DWA +1300..130E ; Allowed # 3.0 [15] ETHIOPIC SYLLABLE JA..ETHIOPIC SYLLABLE GO +1310 ; Allowed # 3.0 ETHIOPIC SYLLABLE GWA +1312..1315 ; Allowed # 3.0 [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE +1318..131E ; Allowed # 3.0 [7] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE GGO +1320..1346 ; Allowed # 3.0 [39] ETHIOPIC SYLLABLE THA..ETHIOPIC SYLLABLE TZO +1348..1359 ; Allowed # 3.0 [18] ETHIOPIC SYLLABLE FA..ETHIOPIC SYLLABLE MYA +1780..179C ; Allowed # 3.0 [29] KHMER LETTER KA..KHMER LETTER VO +179F..17A2 ; Allowed # 3.0 [4] KHMER LETTER SA..KHMER LETTER QA +17A5..17A7 ; Allowed # 3.0 [3] KHMER INDEPENDENT VOWEL QI..KHMER INDEPENDENT VOWEL QU 
+17AA..17B3 ; Allowed # 3.0 [10] KHMER INDEPENDENT VOWEL QUUV..KHMER INDEPENDENT VOWEL QAU +17B6..17CD ; Allowed # 3.0 [24] KHMER VOWEL SIGN AA..KHMER SIGN TOANDAKHIAT +17D0 ; Allowed # 3.0 KHMER SIGN SAMYOK SANNYA +17D2 ; Allowed # 3.0 KHMER SIGN COENG +17E0..17E9 ; Allowed # 3.0 [10] KHMER DIGIT ZERO..KHMER DIGIT NINE +1C90..1CBA ; Allowed # 11.0 [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD..1CBF ; Allowed # 11.0 [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E0C..1E0D ; Allowed # 1.1 [2] LATIN CAPITAL LETTER D WITH DOT BELOW..LATIN SMALL LETTER D WITH DOT BELOW +1E12..1E13 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW..LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW +1E20..1E21 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER G WITH MACRON..LATIN SMALL LETTER G WITH MACRON +1E24..1E25 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER H WITH DOT BELOW..LATIN SMALL LETTER H WITH DOT BELOW +1E36..1E37 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER L WITH DOT BELOW..LATIN SMALL LETTER L WITH DOT BELOW +1E3C..1E3F ; Allowed # 1.1 [4] LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW..LATIN SMALL LETTER M WITH ACUTE +1E42..1E4B ; Allowed # 1.1 [10] LATIN CAPITAL LETTER M WITH DOT BELOW..LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW +1E5A..1E5B ; Allowed # 1.1 [2] LATIN CAPITAL LETTER R WITH DOT BELOW..LATIN SMALL LETTER R WITH DOT BELOW +1E62..1E63 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER S WITH DOT BELOW..LATIN SMALL LETTER S WITH DOT BELOW +1E6C..1E6D ; Allowed # 1.1 [2] LATIN CAPITAL LETTER T WITH DOT BELOW..LATIN SMALL LETTER T WITH DOT BELOW +1E70..1E71 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW..LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW +1E8C..1E8D ; Allowed # 1.1 [2] LATIN CAPITAL LETTER X WITH DIAERESIS..LATIN SMALL LETTER X WITH DIAERESIS +1E92..1E93 ; Allowed # 1.1 [2] LATIN CAPITAL LETTER Z WITH DOT BELOW..LATIN SMALL LETTER Z WITH DOT BELOW +1E9E ; Allowed # 5.1 LATIN 
CAPITAL LETTER SHARP S +1EA0..1EF9 ; Allowed # 1.1 [90] LATIN CAPITAL LETTER A WITH DOT BELOW..LATIN SMALL LETTER Y WITH TILDE +1FA0..1FAF ; Allowed # 1.1 [16] GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2..1FB4 ; Allowed # 1.1 [3] GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FEC ; Allowed # 1.1 GREEK CAPITAL LETTER RHO WITH DASIA +2010 ; Allowed # 1.1 HYPHEN +2019 ; Allowed # 1.1 RIGHT SINGLE QUOTATION MARK +2027 ; Allowed # 1.1 HYPHENATION POINT +3005..3007 ; Allowed # 1.1 [3] IDEOGRAPHIC ITERATION MARK..IDEOGRAPHIC NUMBER ZERO +3041..3094 ; Allowed # 1.1 [84] HIRAGANA LETTER SMALL A..HIRAGANA LETTER VU +3095..3096 ; Allowed # 3.2 [2] HIRAGANA LETTER SMALL KA..HIRAGANA LETTER SMALL KE +309D..309E ; Allowed # 1.1 [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK +30A0 ; Allowed # 3.2 KATAKANA-HIRAGANA DOUBLE HYPHEN +30A1..30FE ; Allowed # 1.1 [94] KATAKANA LETTER SMALL A..KATAKANA VOICED ITERATION MARK 3447 ; Allowed # 3.0 CJK UNIFIED IDEOGRAPH-3447 3473 ; Allowed # 3.0 CJK UNIFIED IDEOGRAPH-3473 34E4 ; Allowed # 3.0 CJK UNIFIED IDEOGRAPH-34E4 @@ -1224,6 +1612,17 @@ 9F6E..9FA0 ; Allowed # 1.1 [51] CJK UNIFIED IDEOGRAPH-9F6E..CJK UNIFIED IDEOGRAPH-9FA0 9FA2 ; Allowed # 1.1 CJK UNIFIED IDEOGRAPH-9FA2 9FA4..9FA5 ; Allowed # 1.1 [2] CJK UNIFIED IDEOGRAPH-9FA4..CJK UNIFIED IDEOGRAPH-9FA5 +A78D ; Allowed # 6.0 LATIN CAPITAL LETTER TURNED H +A7AA ; Allowed # 6.1 LATIN CAPITAL LETTER H WITH HOOK +AA7B ; Allowed # 5.2 MYANMAR SIGN PAO KAREN TONE +AC00..D7A3 ; Allowed # 2.0 [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH +11301 ; Allowed # 7.0 GRANTHA SIGN CANDRABINDU +11303 ; Allowed # 7.0 GRANTHA SIGN VISARGA +1133C ; Allowed # 7.0 GRANTHA SIGN NUKTA +1E7E0..1E7E6 ; Allowed # 14.0 [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO +1E7E8..1E7EB ; Allowed # 14.0 [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC 
SYLLABLE HHWE +1E7ED..1E7EE ; Allowed # 14.0 [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE +1E7F0..1E7FE ; Allowed # 14.0 [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 2070E ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-2070E 20731 ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-20731 20779 ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-20779 @@ -1282,4 +1681,5 @@ 28CCD ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-28CCD 28CD2 ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-28CD2 29D98 ; Allowed # 3.1 CJK UNIFIED IDEOGRAPH-29D98 -# Total code points: 33791 \ No newline at end of file + +# Total code points: 33791 diff --git a/provider/source/tests/data/unihan/Unihan_IRGSources.txt b/provider/source/tests/data/unicode/ucd/unihan/Unihan_IRGSources.txt similarity index 100% rename from provider/source/tests/data/unihan/Unihan_IRGSources.txt rename to provider/source/tests/data/unicode/ucd/unihan/Unihan_IRGSources.txt diff --git a/provider/source/tests/globs.rs.data b/provider/source/tests/globs.rs.data index 248d3355b12..4edcf02307f 100644 --- a/provider/source/tests/globs.rs.data +++ b/provider/source/tests/globs.rs.data @@ -206,9 +206,13 @@ const LSTM_GLOB: &[&str] = &[ "Thai_graphclust_model4_heavy/weights.json", ]; -const UNIHAN_GLOB: &[&str] = &["Unihan_IRGSources.txt"]; - -const UCD_GLOB: &[&str] = &["security/IdentifierStatus.txt"]; +const UNICODE_GLOB: &[&str] = &[ + "security/IdentifierStatus.txt", + // WARNING: This file is manually filtered in download_repo_sources.rs to only include + // the data that is needed by current implementations. If you need this file for other + // logic, adapt/remove the filtering. 
+ "ucd/unihan/Unihan_IRGSources.txt", +]; const TZDB_GLOB: &[&str] = &[ "africa", diff --git a/tools/make/bakeddata/src/main.rs b/tools/make/bakeddata/src/main.rs index 0670bda4bc9..a1ac2592fd7 100644 --- a/tools/make/bakeddata/src/main.rs +++ b/tools/make/bakeddata/src/main.rs @@ -76,8 +76,6 @@ fn main() { }; let source = SourceDataProvider::new() - .with_ucd(Path::new("provider/source/tests/data/ucd")) - .unwrap() .with_tzdb(Path::new("provider/source/tests/data/tzdb")) .unwrap();