Skip to content

Commit 4b148df

Browse files
authored
Fix datagen to propagate extension keywords and aux keys to child locales (#4533)
Fixes #4488. If `ar-SA` is requested and `ar-u-nu-latn` exists, `ar-SA-u-nu-latn` should be included, even if that locale isn't in CLDR data by itself.
1 parent c003764 commit 4b148df

17 files changed

+1513
-57
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
- Datagen shows elapsed time for keys that are slow to generate (https://github.com/unicode-org/icu4x/pull/4469)
1919
- Datagen performance improvement by caching supported locales (https://github.com/unicode-org/icu4x/pull/4470)
2020
- Never use fallback for baked segmentation data (https://github.com/unicode-org/icu4x/pull/4510)
21+
- Propagate extension keywords and auxiliary keys to explicit locales (https://github.com/unicode-org/icu4x/pull/4533)
2122
- `icu_provider`
2223
- (Small breakage) `DataPayload::new_owned()` is no longer `const`, this was a mistake (https://github.com/unicode-org/icu4x/pull/4456)
2324
- `icu_provider_blob`

provider/datagen/src/driver.rs

+115-49
Original file line numberDiff line numberDiff line change
@@ -468,97 +468,163 @@ impl DatagenDriver {
468468
}
469469
}
470470

471+
/// The two locale sets returned together by `make_explicit_implicit_sets`.
struct ExplicitImplicitLocaleSets {
472+
/// The explicitly requested langids, plus versions of them that carry aux keys and
/// extension keywords found on supported locales.
explicit: HashSet<DataLocale>,
473+
/// Supported locales reachable by fallback from an explicit locale; the default locale
/// is always inserted as well.
implicit: HashSet<DataLocale>,
474+
}
475+
476+
/// Resolves the set of explicit langids and the supported locales into two sets of locales:
477+
///
478+
/// - `explicit` contains the explicit langids but with aux keys and extension keywords included.
479+
/// For example, if `ar-SA` is requested (explicit langid), and `ar` and `ar-u-nu-latn` are supported,
480+
/// then `ar-SA` and `ar-SA-u-nu-latn` will be returned as `explicit`.
481+
/// - `implicit` contains all supported locales reachable by fallback from an `explicit` locale.
482+
/// These locales can be included without increasing data payload size.
483+
fn make_explicit_implicit_sets(
484+
key: DataKey,
485+
explicit_langids: &HashSet<LanguageIdentifier>,
486+
supported_map: &HashMap<LanguageIdentifier, HashSet<DataLocale>>,
487+
fallbacker: &Lazy<
488+
Result<LocaleFallbacker, DataError>,
489+
impl FnOnce() -> Result<LocaleFallbacker, DataError>,
490+
>,
491+
) -> Result<ExplicitImplicitLocaleSets, DataError> {
492+
// `implicit` always starts out containing the default locale.
let mut implicit = HashSet::new();
493+
// TODO: Make including the default locale configurable
494+
implicit.insert(DataLocale::default());
495+
496+
let mut explicit: HashSet<DataLocale> = Default::default();
497+
for explicit_langid in explicit_langids.iter() {
498+
// The requested langid itself is always explicit, whether or not `supported_map`
// has an entry for it.
explicit.insert(explicit_langid.into());
499+
// Supported locales whose langid matches exactly are explicit too.
if let Some(locales) = supported_map.get(explicit_langid) {
500+
explicit.extend(locales.iter().cloned()); // adds ar-EG-u-nu-latn
501+
}
502+
// `und` is skipped before the fallbacker is touched. NOTE(review): presumably this
// both avoids forcing the lazy fallbacker when only `und` is requested and reflects
// that the walk below would stop immediately for `und` -- confirm.
if explicit_langid == &LanguageIdentifier::UND {
503+
continue;
504+
}
505+
// Shadows the `fallbacker` parameter with the resolved `LocaleFallbacker`. If the lazy
// initializer produced an error, that error is copied out of the shared reference and
// returned.
let fallbacker = fallbacker.as_ref().map_err(|e| *e)?;
506+
let fallbacker_with_config = fallbacker.for_config(key.fallback_config());
507+
// Walk the fallback chain starting at the explicit langid, stopping once the iterator
// reports `und`.
let mut iter = fallbacker_with_config.fallback_for(explicit_langid.into());
508+
while !iter.get().is_und() {
509+
// Every locale visited on the chain is implicit.
implicit.insert(iter.get().clone());
510+
// Inherit aux keys and extension keywords from parent locales
511+
let iter_langid = iter.get().get_langid();
512+
if let Some(locales) = supported_map.get(&iter_langid) {
513+
implicit.extend(locales.iter().cloned()); // adds ar-u-nu-latn
514+
// Copy each supported locale found at this step and swap in the explicit
// langid, leaving the rest of the locale as it was.
for locale in locales {
515+
let mut morphed_locale = locale.clone();
516+
morphed_locale.set_langid(explicit_langid.clone());
517+
explicit.insert(morphed_locale); // adds ar-SA-u-nu-latn
518+
}
519+
}
520+
iter.step();
521+
}
522+
}
523+
Ok(ExplicitImplicitLocaleSets { explicit, implicit })
524+
}
525+
471526
/// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen
472527
/// provider's options bag. The locales may be later optionally deduplicated for fallback.
473528
fn select_locales_for_key(
474529
provider: &dyn ExportableProvider,
475530
key: DataKey,
476531
fallback: FallbackMode,
477-
locales: Option<&HashSet<LanguageIdentifier>>,
532+
explicit_langids: Option<&HashSet<LanguageIdentifier>>,
478533
additional_collations: &HashSet<String>,
479534
segmenter_models: &[String],
480535
fallbacker: &Lazy<
481536
Result<LocaleFallbacker, DataError>,
482537
impl FnOnce() -> Result<LocaleFallbacker, DataError>,
483538
>,
484539
) -> Result<HashSet<icu_provider::DataLocale>, DataError> {
485-
let mut result = provider
540+
// A map from langid to data locales. Keys that have aux keys or extension keywords
541+
// may have multiple data locales per langid.
542+
let mut supported_map: HashMap<LanguageIdentifier, HashSet<DataLocale>> = Default::default();
543+
for locale in provider
486544
.supported_locales_for_key(key)
487545
.map_err(|e| e.with_key(key))?
488-
.into_iter()
489-
.collect::<HashSet<DataLocale>>();
546+
{
547+
use std::collections::hash_map::Entry;
548+
match supported_map.entry(locale.get_langid()) {
549+
Entry::Occupied(mut entry) => entry.get_mut().insert(locale),
550+
Entry::Vacant(entry) => entry.insert(Default::default()).insert(locale),
551+
};
552+
}
490553

491554
if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY
492555
|| key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY
493556
{
494-
result.retain(|locale| {
495-
let model = crate::transform::segmenter::dictionary::data_locale_to_model_name(locale);
496-
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
557+
supported_map.retain(|_, locales| {
558+
locales.retain(|locale| {
559+
let model =
560+
crate::transform::segmenter::dictionary::data_locale_to_model_name(locale);
561+
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
562+
});
563+
!locales.is_empty()
497564
});
498565
// Don't perform additional locale filtering
499-
return Ok(result);
566+
return Ok(supported_map.into_values().flatten().collect());
500567
} else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY {
501-
result.retain(|locale| {
502-
let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale);
503-
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
568+
supported_map.retain(|_, locales| {
569+
locales.retain(|locale| {
570+
let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale);
571+
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
572+
});
573+
!locales.is_empty()
504574
});
505575
// Don't perform additional locale filtering
506-
return Ok(result);
576+
return Ok(supported_map.into_values().flatten().collect());
507577
} else if key == icu_collator::provider::CollationDataV1Marker::KEY
508578
|| key == icu_collator::provider::CollationDiacriticsV1Marker::KEY
509579
|| key == icu_collator::provider::CollationJamoV1Marker::KEY
510580
|| key == icu_collator::provider::CollationMetadataV1Marker::KEY
511581
|| key == icu_collator::provider::CollationReorderingV1Marker::KEY
512582
|| key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY
513583
{
514-
result.retain(|locale| {
515-
let Some(collation) = locale
516-
.get_unicode_ext(&key!("co"))
517-
.and_then(|co| co.as_single_subtag().copied())
518-
else {
519-
return true;
520-
};
521-
additional_collations.contains(collation.as_str())
522-
|| if collation.starts_with("search") {
523-
additional_collations.contains("search*")
524-
} else {
525-
!["big5han", "gb2312"].contains(&collation.as_str())
526-
}
584+
supported_map.retain(|_, locales| {
585+
locales.retain(|locale| {
586+
let Some(collation) = locale
587+
.get_unicode_ext(&key!("co"))
588+
.and_then(|co| co.as_single_subtag().copied())
589+
else {
590+
return true;
591+
};
592+
additional_collations.contains(collation.as_str())
593+
|| if collation.starts_with("search") {
594+
additional_collations.contains("search*")
595+
} else {
596+
!["big5han", "gb2312"].contains(&collation.as_str())
597+
}
598+
});
599+
!locales.is_empty()
527600
});
528601
}
529602

530-
result = match (locales, fallback) {
603+
let result = match (explicit_langids, fallback) {
531604
// Case 1: `None` simply exports all supported locales for this key.
532-
(None, _) => result,
605+
(None, _) => supported_map.into_values().flatten().collect(),
533606
// Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches
534607
// one of the explicit locales. This ensures extensions are included. In addition, any
535608
// explicit locales are added to the list, even if they themselves don't contain data;
536609
// fallback should be performed upon exporting.
537-
(Some(explicit), FallbackMode::Preresolved) => result
538-
.into_iter()
539-
.chain(explicit.iter().map(|langid| langid.into()))
540-
.filter(|locale| explicit.contains(&locale.get_langid()))
541-
.collect(),
610+
(Some(explicit_langids), FallbackMode::Preresolved) => {
611+
let ExplicitImplicitLocaleSets { explicit, .. } =
612+
make_explicit_implicit_sets(key, explicit_langids, &supported_map, fallbacker)?;
613+
explicit
614+
}
542615
// Case 3: All other modes resolve to the "ancestors and descendants" strategy.
543-
(Some(explicit), _) => {
544-
let include_und = explicit.contains(&LanguageIdentifier::UND);
545-
let explicit: HashSet<DataLocale> = explicit.iter().map(DataLocale::from).collect();
546-
let mut implicit = HashSet::new();
547-
// TODO: Make including the default locale configurable
548-
implicit.insert(DataLocale::default());
616+
(Some(explicit_langids), _) => {
617+
let include_und = explicit_langids.contains(&LanguageIdentifier::UND);
618+
619+
let ExplicitImplicitLocaleSets { explicit, implicit } =
620+
make_explicit_implicit_sets(key, explicit_langids, &supported_map, fallbacker)?;
621+
549622
let fallbacker = fallbacker.as_ref().map_err(|e| *e)?;
550623
let fallbacker_with_config = fallbacker.for_config(key.fallback_config());
551624

552-
for locale in explicit.iter() {
553-
let mut iter = fallbacker_with_config.fallback_for(locale.clone());
554-
while !iter.get().is_und() {
555-
implicit.insert(iter.get().clone());
556-
iter.step();
557-
}
558-
}
559-
560-
result
561-
.into_iter()
625+
supported_map
626+
.into_values()
627+
.flatten()
562628
.chain(explicit.iter().cloned())
563629
.filter(|locale_orig| {
564630
let mut locale = locale_orig.clone();
@@ -705,7 +771,7 @@ fn test_collation_filtering() {
705771
Some(&HashSet::from_iter([cas.language.clone()])),
706772
&HashSet::from_iter(cas.include_collations.iter().copied().map(String::from)),
707773
&[],
708-
&once_cell::sync::Lazy::new(|| unreachable!()),
774+
&once_cell::sync::Lazy::new(|| Ok(LocaleFallbacker::new_without_data())),
709775
)
710776
.unwrap()
711777
.into_iter()

0 commit comments

Comments
 (0)