Skip to content

Improve correctness of POSIX locale conversion #6575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 46 additions & 46 deletions utils/env_preferences/src/parse/aliases.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,59 @@

//! Platform-specific conversion from locale strings to BCP-47 identifiers.

/// Find a BCP-47 language/region from a list of POSIX locale aliases.
/// Find a BCP-47 language/region from a list of POSIX 'locale name' aliases.
///
/// Some of these conversions are approximate and not exact (e.g. "C"->"und").
/// This is based on GNU libc's `intl/locale.alias` file, with some manual processing
/// to remove duplicates. The original file is available at:
/// to remove duplicates. The notable exception is that the default `C`/`POSIX` locales
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// to remove duplicates. The notable exception is that the default `C`/`POSIX` locales
/// to remove duplicates. The notable addition is that the default `C`/`POSIX` locales

/// map to `en-US`. The original file is available at:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// map to `en-US`. The original file is available at:
/// map to `en-US-posix`. The original file is available at:

/// <https://sourceware.org/git/?p=glibc.git;a=blob;f=intl/locale.alias;hb=HEAD>
pub fn find_posix_alias(
alias: &str,
pub fn find_posix_locale_name_alias(
posix_locale: &str,
) -> Option<(
icu_locale_core::subtags::Language,
Option<icu_locale_core::subtags::Region>,
icu_locale_core::subtags::Region,
)> {
use icu_locale_core::subtags::{language, region, Language};
use icu_locale_core::subtags::{language, region};

match alias {
"C" | "POSIX" => Some((Language::UNKNOWN, None)),
"bokmal" => Some((language!("nb"), Some(region!("NO")))),
"catalan" => Some((language!("ca"), Some(region!("ES")))),
"croatian" => Some((language!("hr"), Some(region!("HR")))),
"czech" => Some((language!("cs"), Some(region!("CZ")))),
"danish" => Some((language!("da"), Some(region!("DK")))),
"dansk" => Some((language!("da"), Some(region!("DK")))),
"deutsch" => Some((language!("de"), Some(region!("DE")))),
"dutch" => Some((language!("nl"), Some(region!("NL")))),
"eesti" => Some((language!("et"), Some(region!("EE")))),
"estonian" => Some((language!("et"), Some(region!("EE")))),
"finnish" => Some((language!("fi"), Some(region!("FI")))),
"french" => Some((language!("fr"), Some(region!("FR")))),
"galego" => Some((language!("gl"), Some(region!("ES")))),
"galician" => Some((language!("gl"), Some(region!("ES")))),
"german" => Some((language!("de"), Some(region!("DE")))),
"greek" => Some((language!("el"), Some(region!("GR")))),
"hebrew" => Some((language!("he"), Some(region!("IL")))),
"hrvatski" => Some((language!("hr"), Some(region!("HR")))),
"hungarian" => Some((language!("hu"), Some(region!("HU")))),
"icelandic" => Some((language!("is"), Some(region!("IS")))),
"italian" => Some((language!("it"), Some(region!("IT")))),
"japanese" => Some((language!("ja"), Some(region!("JP")))),
"korean" => Some((language!("ko"), Some(region!("KR")))),
"lithuanian" => Some((language!("lt"), Some(region!("LT")))),
"norwegian" => Some((language!("nb"), Some(region!("NO")))),
"nynorsk" => Some((language!("nn"), Some(region!("NO")))),
"polish" => Some((language!("pl"), Some(region!("PL")))),
"portuguese" => Some((language!("pt"), Some(region!("PT")))),
"romanian" => Some((language!("ro"), Some(region!("RO")))),
"russian" => Some((language!("ru"), Some(region!("RU")))),
"slovak" => Some((language!("sk"), Some(region!("SK")))),
"slovene" => Some((language!("sl"), Some(region!("SI")))),
"slovenian" => Some((language!("sl"), Some(region!("SI")))),
"spanish" => Some((language!("es"), Some(region!("ES")))),
"swedish" => Some((language!("sv"), Some(region!("SE")))),
"thai" => Some((language!("th"), Some(region!("TH")))),
"turkish" => Some((language!("tr"), Some(region!("TR")))),
match posix_locale {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this whole match block to remove the language name checks and unify the "C" and "POSIX" case handling with the block below where you currently append -posix.

The X11 (see https://www.x.org/releases/X11R6.9.0/src-single/X11R6.9.0-src.tar.bz2 ) language name aliases are obsolete, so we shouldn't carry the data or spend time matching them. They resolved to legacy-encoding locales and present-day distros generate UTF-8 locales, so even on the libc-level the aliases don't resolve to an available locale and result in C.

"C" | "POSIX" => Some((language!("en"), region!("US"))),
"bokmal" => Some((language!("nb"), region!("NO"))),
"catalan" => Some((language!("ca"), region!("ES"))),
"croatian" => Some((language!("hr"), region!("HR"))),
"czech" => Some((language!("cs"), region!("CZ"))),
"danish" => Some((language!("da"), region!("DK"))),
"dansk" => Some((language!("da"), region!("DK"))),
"deutsch" => Some((language!("de"), region!("DE"))),
"dutch" => Some((language!("nl"), region!("NL"))),
"eesti" => Some((language!("et"), region!("EE"))),
"estonian" => Some((language!("et"), region!("EE"))),
"finnish" => Some((language!("fi"), region!("FI"))),
"french" => Some((language!("fr"), region!("FR"))),
"galego" => Some((language!("gl"), region!("ES"))),
"galician" => Some((language!("gl"), region!("ES"))),
"german" => Some((language!("de"), region!("DE"))),
"greek" => Some((language!("el"), region!("GR"))),
"hebrew" => Some((language!("he"), region!("IL"))),
"hrvatski" => Some((language!("hr"), region!("HR"))),
"hungarian" => Some((language!("hu"), region!("HU"))),
"icelandic" => Some((language!("is"), region!("IS"))),
"italian" => Some((language!("it"), region!("IT"))),
"japanese" => Some((language!("ja"), region!("JP"))),
"korean" => Some((language!("ko"), region!("KR"))),
"lithuanian" => Some((language!("lt"), region!("LT"))),
"norwegian" => Some((language!("nb"), region!("NO"))),
"nynorsk" => Some((language!("nn"), region!("NO"))),
"polish" => Some((language!("pl"), region!("PL"))),
"portuguese" => Some((language!("pt"), region!("PT"))),
"romanian" => Some((language!("ro"), region!("RO"))),
"russian" => Some((language!("ru"), region!("RU"))),
"slovak" => Some((language!("sk"), region!("SK"))),
"slovene" => Some((language!("sl"), region!("SI"))),
"slovenian" => Some((language!("sl"), region!("SI"))),
"spanish" => Some((language!("es"), region!("ES"))),
"swedish" => Some((language!("sv"), region!("SE"))),
"thai" => Some((language!("th"), region!("TH"))),
"turkish" => Some((language!("tr"), region!("TR"))),
_ => None,
}
}
Expand Down
45 changes: 26 additions & 19 deletions utils/env_preferences/src/parse/posix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
//! # use env_preferences::LocaleError;
//! # fn main() -> Result<(), LocaleError> {
//! let posix_locale = PosixLocale::try_from_str("en_US")?;
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US-posix"));
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US"));
//! # Ok(())
//! # }
//! ```
Expand All @@ -25,7 +25,7 @@ use icu_locale_core::{LanguageIdentifier, Locale};

use crate::ParseError;

use super::aliases::find_posix_alias;
use super::aliases::find_posix_locale_name_alias;

#[derive(Display, Debug, PartialEq)]
/// An error while parsing a POSIX locale identifier
Expand Down Expand Up @@ -208,17 +208,17 @@ impl<'src> PosixLocale<'src> {
/// // Locales will always include the `posix` variant
/// assert_eq!(
/// PosixLocale::try_from_str("en_US")?.try_convert_lossy()?,
/// locale!("en-US-posix")
/// locale!("en-US")
/// );
/// // The codeset field will be ignored
/// assert_eq!(
/// PosixLocale::try_from_str("en_US.iso88591")?.try_convert_lossy()?,
/// locale!("en-US-posix")
/// locale!("en-US")
/// );
/// // Any unknown modifiers will be ignored
/// assert_eq!(
/// PosixLocale::try_from_str("en_US@unknown")?.try_convert_lossy()?,
/// locale!("en-US-posix")
/// locale!("en-US")
/// );
/// # Ok(())
/// # }
Expand All @@ -233,49 +233,60 @@ impl<'src> PosixLocale<'src> {
/// // The default "C"/"POSIX" locale will be converted to "und"
/// assert_eq!(
/// PosixLocale::try_from_str("C")?.try_convert_lossy()?,
/// locale!("und-posix")
/// locale!("en-US-posix")
/// );
/// assert_eq!(
/// PosixLocale::try_from_str("POSIX")?.try_convert_lossy()?,
/// locale!("und-posix")
/// locale!("en-US-posix")
/// );
///
/// // Known language aliases will be converted to the matching BCP-47 identifier
/// assert_eq!(
/// PosixLocale::try_from_str("french")?.try_convert_lossy()?,
/// locale!("fr-FR-posix")
/// locale!("fr-FR")
/// );
///
/// // Known script modifiers will be converted to the matching CLDR keys
/// assert_eq!(
/// PosixLocale::try_from_str("uz_UZ@cyrillic")?.try_convert_lossy()?,
/// locale!("uz-Cyrl-UZ-posix")
/// locale!("uz-Cyrl-UZ")
/// );
/// assert_eq!(
/// PosixLocale::try_from_str("ks_IN@devanagari")?.try_convert_lossy()?,
/// locale!("ks-Deva-IN-posix")
/// locale!("ks-Deva-IN")
/// );
/// assert_eq!(
/// PosixLocale::try_from_str("be_BY@latin")?.try_convert_lossy()?,
/// locale!("be-Latn-BY-posix")
/// locale!("be-Latn-BY")
/// );
///
/// // Other known modifiers are handled accordingly
/// assert_eq!(
/// PosixLocale::try_from_str("en_US@euro")?.try_convert_lossy()?,
/// locale!("en-US-posix-u-cu-eur")
/// locale!("en-US-u-cu-eur")
/// );
/// assert_eq!(
/// PosixLocale::try_from_str("aa_ER@saaho")?.try_convert_lossy()?,
/// locale!("ssy-ER-posix")
/// locale!("ssy-ER")
/// );
/// # Ok(())
/// # }
/// ```
pub fn try_convert_lossy(&self) -> Result<Locale, ParseError> {
let mut extensions = Extensions::new();
let mut script = None;
let mut variants = Vec::new();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the comment in line 310 is obsolete


// The default "C"/"POSIX" locale should map to "en-US-posix",
// which is the default behaviour in ICU4C:
// https://github.com/unicode-org/icu/blob/795d7ac82c4b29cf721d0ad62c0b178347d453bf/icu4c/source/common/putil.cpp#L1738
if self.language == "C" || self.language == "POSIX" {
variants.push(variant!("posix"))
}

// Check if the language matches a known alias (e.g. "nynorsk"->("nn", "NO"))
let (mut language, region) = match find_posix_alias(self.language) {
Some((language, region)) => (language, region),
let (mut language, region) = match find_posix_locale_name_alias(self.language) {
Some((language, region)) => (language, Some(region)),
None => {
let language = Language::try_from_str(self.language)?;
let region = self.territory.map(Region::try_from_str).transpose()?;
Expand All @@ -284,10 +295,6 @@ impl<'src> PosixLocale<'src> {
}
};

let mut extensions = Extensions::new();
let mut script = None;
let mut variants = vec![variant!("posix")];

if let Some(modifier) = self.modifier {
match modifier.to_ascii_lowercase().as_str() {
"euro" => {
Expand Down
100 changes: 50 additions & 50 deletions utils/env_preferences/tests/parse/posix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,80 +15,80 @@ fn expect_success(src: &str, expected: &str) {

#[test]
fn default_locale() {
expect_success("C", "und-posix");
expect_success("POSIX", "und-posix");
expect_success("C", "en-US-posix");
expect_success("POSIX", "en-US-posix");
}

#[test]
fn region() {
expect_success("en_US", "en-US-posix");
expect_success("ne_NP", "ne-NP-posix");
expect_success("zh_TW", "zh-TW-posix");
expect_success("en_US", "en-US");
expect_success("ne_NP", "ne-NP");
expect_success("zh_TW", "zh-TW");
}

#[test]
fn codeset_ignored() {
expect_success("lv_LV.iso885913", "lv-LV-posix");
expect_success("hy_AM.armscii8", "hy-AM-posix");
expect_success("lv_LV.iso885913", "lv-LV");
expect_success("hy_AM.armscii8", "hy-AM");
}

#[test]
fn modifier() {
// Currency
expect_success("it_IT@euro", "it-IT-posix-u-cu-eur");
expect_success("it_IT@euro", "it-IT-u-cu-eur");

// Script
expect_success("uz_UZ@cyrillic", "uz-Cyrl-UZ-posix");
expect_success("sd_IN@devanagari", "sd-Deva-IN-posix");
expect_success("sr_RS@latin", "sr-Latn-RS-posix");
expect_success("uz_UZ@cyrillic", "uz-Cyrl-UZ");
expect_success("sd_IN@devanagari", "sd-Deva-IN");
expect_success("sr_RS@latin", "sr-Latn-RS");

// Language
expect_success("aa_ER@saaho", "ssy-ER-posix");
expect_success("aa_ER@saaho", "ssy-ER");

// Variant
expect_success("ca_ES@valencia", "ca-ES-posix-valencia");
expect_success("ca_ES@valencia", "ca-ES-valencia");
}

#[test]
fn alias() {
const CASES: [(&str, &str); 37] = [
("bokmal", "nb-NO-posix"),
("catalan", "ca-ES-posix"),
("croatian", "hr-HR-posix"),
("czech", "cs-CZ-posix"),
("danish", "da-DK-posix"),
("dansk", "da-DK-posix"),
("deutsch", "de-DE-posix"),
("dutch", "nl-NL-posix"),
("eesti", "et-EE-posix"),
("estonian", "et-EE-posix"),
("finnish", "fi-FI-posix"),
("french", "fr-FR-posix"),
("galego", "gl-ES-posix"),
("galician", "gl-ES-posix"),
("german", "de-DE-posix"),
("greek", "el-GR-posix"),
("hebrew", "he-IL-posix"),
("hrvatski", "hr-HR-posix"),
("hungarian", "hu-HU-posix"),
("icelandic", "is-IS-posix"),
("italian", "it-IT-posix"),
("japanese", "ja-JP-posix"),
("korean", "ko-KR-posix"),
("lithuanian", "lt-LT-posix"),
("norwegian", "nb-NO-posix"),
("nynorsk", "nn-NO-posix"),
("polish", "pl-PL-posix"),
("portuguese", "pt-PT-posix"),
("romanian", "ro-RO-posix"),
("russian", "ru-RU-posix"),
("slovak", "sk-SK-posix"),
("slovene", "sl-SI-posix"),
("slovenian", "sl-SI-posix"),
("spanish", "es-ES-posix"),
("swedish", "sv-SE-posix"),
("thai", "th-TH-posix"),
("turkish", "tr-TR-posix"),
("bokmal", "nb-NO"),
("catalan", "ca-ES"),
("croatian", "hr-HR"),
("czech", "cs-CZ"),
("danish", "da-DK"),
("dansk", "da-DK"),
("deutsch", "de-DE"),
("dutch", "nl-NL"),
("eesti", "et-EE"),
("estonian", "et-EE"),
("finnish", "fi-FI"),
("french", "fr-FR"),
("galego", "gl-ES"),
("galician", "gl-ES"),
("german", "de-DE"),
("greek", "el-GR"),
("hebrew", "he-IL"),
("hrvatski", "hr-HR"),
("hungarian", "hu-HU"),
("icelandic", "is-IS"),
("italian", "it-IT"),
("japanese", "ja-JP"),
("korean", "ko-KR"),
("lithuanian", "lt-LT"),
("norwegian", "nb-NO"),
("nynorsk", "nn-NO"),
("polish", "pl-PL"),
("portuguese", "pt-PT"),
("romanian", "ro-RO"),
("russian", "ru-RU"),
("slovak", "sk-SK"),
("slovene", "sl-SI"),
("slovenian", "sl-SI"),
("spanish", "es-ES"),
("swedish", "sv-SE"),
("thai", "th-TH"),
("turkish", "tr-TR"),
];

for (src, expected) in CASES {
Expand Down