-
Notifications
You must be signed in to change notification settings - Fork 213
Improve correctness of POSIX locale conversion #6575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,59 +4,59 @@ | |||||
|
||||||
//! Platform-specific conversion from locale strings to BCP-47 identifiers. | ||||||
|
||||||
/// Find a BCP-47 language/region from a list of POSIX locale aliases. | ||||||
/// Find a BCP-47 language/region from a list of POSIX 'locale name' aliases. | ||||||
/// | ||||||
/// Some of these conversions are approximate and not exact (e.g. "C"->"und"). | ||||||
/// This is based on GNU libc's `intl/locale.alias` file, with some manual processing | ||||||
/// to remove duplicates. The original file is available at: | ||||||
/// to remove duplicates. The notable exception is that the default `C`/`POSIX` locales | ||||||
/// map to `en-US`. The original file is available at: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
/// <https://sourceware.org/git/?p=glibc.git;a=blob;f=intl/locale.alias;hb=HEAD> | ||||||
pub fn find_posix_alias( | ||||||
alias: &str, | ||||||
pub fn find_posix_locale_name_alias( | ||||||
posix_locale: &str, | ||||||
) -> Option<( | ||||||
icu_locale_core::subtags::Language, | ||||||
Option<icu_locale_core::subtags::Region>, | ||||||
icu_locale_core::subtags::Region, | ||||||
)> { | ||||||
use icu_locale_core::subtags::{language, region, Language}; | ||||||
use icu_locale_core::subtags::{language, region}; | ||||||
|
||||||
match alias { | ||||||
"C" | "POSIX" => Some((Language::UNKNOWN, None)), | ||||||
"bokmal" => Some((language!("nb"), Some(region!("NO")))), | ||||||
"catalan" => Some((language!("ca"), Some(region!("ES")))), | ||||||
"croatian" => Some((language!("hr"), Some(region!("HR")))), | ||||||
"czech" => Some((language!("cs"), Some(region!("CZ")))), | ||||||
"danish" => Some((language!("da"), Some(region!("DK")))), | ||||||
"dansk" => Some((language!("da"), Some(region!("DK")))), | ||||||
"deutsch" => Some((language!("de"), Some(region!("DE")))), | ||||||
"dutch" => Some((language!("nl"), Some(region!("NL")))), | ||||||
"eesti" => Some((language!("et"), Some(region!("EE")))), | ||||||
"estonian" => Some((language!("et"), Some(region!("EE")))), | ||||||
"finnish" => Some((language!("fi"), Some(region!("FI")))), | ||||||
"french" => Some((language!("fr"), Some(region!("FR")))), | ||||||
"galego" => Some((language!("gl"), Some(region!("ES")))), | ||||||
"galician" => Some((language!("gl"), Some(region!("ES")))), | ||||||
"german" => Some((language!("de"), Some(region!("DE")))), | ||||||
"greek" => Some((language!("el"), Some(region!("GR")))), | ||||||
"hebrew" => Some((language!("he"), Some(region!("IL")))), | ||||||
"hrvatski" => Some((language!("hr"), Some(region!("HR")))), | ||||||
"hungarian" => Some((language!("hu"), Some(region!("HU")))), | ||||||
"icelandic" => Some((language!("is"), Some(region!("IS")))), | ||||||
"italian" => Some((language!("it"), Some(region!("IT")))), | ||||||
"japanese" => Some((language!("ja"), Some(region!("JP")))), | ||||||
"korean" => Some((language!("ko"), Some(region!("KR")))), | ||||||
"lithuanian" => Some((language!("lt"), Some(region!("LT")))), | ||||||
"norwegian" => Some((language!("nb"), Some(region!("NO")))), | ||||||
"nynorsk" => Some((language!("nn"), Some(region!("NO")))), | ||||||
"polish" => Some((language!("pl"), Some(region!("PL")))), | ||||||
"portuguese" => Some((language!("pt"), Some(region!("PT")))), | ||||||
"romanian" => Some((language!("ro"), Some(region!("RO")))), | ||||||
"russian" => Some((language!("ru"), Some(region!("RU")))), | ||||||
"slovak" => Some((language!("sk"), Some(region!("SK")))), | ||||||
"slovene" => Some((language!("sl"), Some(region!("SI")))), | ||||||
"slovenian" => Some((language!("sl"), Some(region!("SI")))), | ||||||
"spanish" => Some((language!("es"), Some(region!("ES")))), | ||||||
"swedish" => Some((language!("sv"), Some(region!("SE")))), | ||||||
"thai" => Some((language!("th"), Some(region!("TH")))), | ||||||
"turkish" => Some((language!("tr"), Some(region!("TR")))), | ||||||
match posix_locale { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove this whole The X11 (see https://www.x.org/releases/X11R6.9.0/src-single/X11R6.9.0-src.tar.bz2 ) language name aliases are obsolete, so we shouldn't carry the data or spend time matching them. They resolved to legacy-encoding locales and present-day distros generate UTF-8 locales, so even on the libc-level the aliases don't resolve to an available locale and result in |
||||||
"C" | "POSIX" => Some((language!("en"), region!("US"))), | ||||||
"bokmal" => Some((language!("nb"), region!("NO"))), | ||||||
"catalan" => Some((language!("ca"), region!("ES"))), | ||||||
"croatian" => Some((language!("hr"), region!("HR"))), | ||||||
"czech" => Some((language!("cs"), region!("CZ"))), | ||||||
"danish" => Some((language!("da"), region!("DK"))), | ||||||
"dansk" => Some((language!("da"), region!("DK"))), | ||||||
"deutsch" => Some((language!("de"), region!("DE"))), | ||||||
"dutch" => Some((language!("nl"), region!("NL"))), | ||||||
"eesti" => Some((language!("et"), region!("EE"))), | ||||||
"estonian" => Some((language!("et"), region!("EE"))), | ||||||
"finnish" => Some((language!("fi"), region!("FI"))), | ||||||
"french" => Some((language!("fr"), region!("FR"))), | ||||||
"galego" => Some((language!("gl"), region!("ES"))), | ||||||
"galician" => Some((language!("gl"), region!("ES"))), | ||||||
"german" => Some((language!("de"), region!("DE"))), | ||||||
"greek" => Some((language!("el"), region!("GR"))), | ||||||
"hebrew" => Some((language!("he"), region!("IL"))), | ||||||
"hrvatski" => Some((language!("hr"), region!("HR"))), | ||||||
"hungarian" => Some((language!("hu"), region!("HU"))), | ||||||
"icelandic" => Some((language!("is"), region!("IS"))), | ||||||
"italian" => Some((language!("it"), region!("IT"))), | ||||||
"japanese" => Some((language!("ja"), region!("JP"))), | ||||||
"korean" => Some((language!("ko"), region!("KR"))), | ||||||
"lithuanian" => Some((language!("lt"), region!("LT"))), | ||||||
"norwegian" => Some((language!("nb"), region!("NO"))), | ||||||
"nynorsk" => Some((language!("nn"), region!("NO"))), | ||||||
"polish" => Some((language!("pl"), region!("PL"))), | ||||||
"portuguese" => Some((language!("pt"), region!("PT"))), | ||||||
"romanian" => Some((language!("ro"), region!("RO"))), | ||||||
"russian" => Some((language!("ru"), region!("RU"))), | ||||||
"slovak" => Some((language!("sk"), region!("SK"))), | ||||||
"slovene" => Some((language!("sl"), region!("SI"))), | ||||||
"slovenian" => Some((language!("sl"), region!("SI"))), | ||||||
"spanish" => Some((language!("es"), region!("ES"))), | ||||||
"swedish" => Some((language!("sv"), region!("SE"))), | ||||||
"thai" => Some((language!("th"), region!("TH"))), | ||||||
"turkish" => Some((language!("tr"), region!("TR"))), | ||||||
_ => None, | ||||||
} | ||||||
} | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
//! # use env_preferences::LocaleError; | ||
//! # fn main() -> Result<(), LocaleError> { | ||
//! let posix_locale = PosixLocale::try_from_str("en_US")?; | ||
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US-posix")); | ||
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US")); | ||
//! # Ok(()) | ||
//! # } | ||
//! ``` | ||
|
@@ -25,7 +25,7 @@ use icu_locale_core::{LanguageIdentifier, Locale}; | |
|
||
use crate::ParseError; | ||
|
||
use super::aliases::find_posix_alias; | ||
use super::aliases::find_posix_locale_name_alias; | ||
|
||
#[derive(Display, Debug, PartialEq)] | ||
/// An error while parsing a POSIX locale identifier | ||
|
@@ -208,17 +208,17 @@ impl<'src> PosixLocale<'src> { | |
/// // Locales will always include the `posix` variant | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("en_US")?.try_convert_lossy()?, | ||
/// locale!("en-US-posix") | ||
/// locale!("en-US") | ||
/// ); | ||
/// // The codeset field will be ignored | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("en_US.iso88591")?.try_convert_lossy()?, | ||
/// locale!("en-US-posix") | ||
/// locale!("en-US") | ||
/// ); | ||
/// // Any unknown modifiers will be ignored | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("en_US@unknown")?.try_convert_lossy()?, | ||
/// locale!("en-US-posix") | ||
/// locale!("en-US") | ||
/// ); | ||
/// # Ok(()) | ||
/// # } | ||
|
@@ -233,49 +233,60 @@ impl<'src> PosixLocale<'src> { | |
/// // The default "C"/"POSIX" locale will be converted to "und" | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("C")?.try_convert_lossy()?, | ||
/// locale!("und-posix") | ||
/// locale!("en-US-posix") | ||
/// ); | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("POSIX")?.try_convert_lossy()?, | ||
/// locale!("und-posix") | ||
/// locale!("en-US-posix") | ||
/// ); | ||
/// | ||
/// // Known language aliases will be converted to the matching BCP-47 identifier | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("french")?.try_convert_lossy()?, | ||
/// locale!("fr-FR-posix") | ||
/// locale!("fr-FR") | ||
/// ); | ||
/// | ||
/// // Known script modifiers will be converted to the matching CLDR keys | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("uz_UZ@cyrillic")?.try_convert_lossy()?, | ||
/// locale!("uz-Cyrl-UZ-posix") | ||
/// locale!("uz-Cyrl-UZ") | ||
/// ); | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("ks_IN@devanagari")?.try_convert_lossy()?, | ||
/// locale!("ks-Deva-IN-posix") | ||
/// locale!("ks-Deva-IN") | ||
/// ); | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("be_BY@latin")?.try_convert_lossy()?, | ||
/// locale!("be-Latn-BY-posix") | ||
/// locale!("be-Latn-BY") | ||
/// ); | ||
/// | ||
/// // Other known modifiers are handled accordingly | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("en_US@euro")?.try_convert_lossy()?, | ||
/// locale!("en-US-posix-u-cu-eur") | ||
/// locale!("en-US-u-cu-eur") | ||
/// ); | ||
/// assert_eq!( | ||
/// PosixLocale::try_from_str("aa_ER@saaho")?.try_convert_lossy()?, | ||
/// locale!("ssy-ER-posix") | ||
/// locale!("ssy-ER") | ||
/// ); | ||
/// # Ok(()) | ||
/// # } | ||
/// ``` | ||
pub fn try_convert_lossy(&self) -> Result<Locale, ParseError> { | ||
let mut extensions = Extensions::new(); | ||
let mut script = None; | ||
let mut variants = Vec::new(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the comment in line 310 is obsolete |
||
|
||
// The default "C"/"POSIX" locale should map to "en-US-posix", | ||
// which is the default behaviour in ICU4C: | ||
// https://github.com/unicode-org/icu/blob/795d7ac82c4b29cf721d0ad62c0b178347d453bf/icu4c/source/common/putil.cpp#L1738 | ||
if self.language == "C" || self.language == "POSIX" { | ||
variants.push(variant!("posix")) | ||
} | ||
|
||
// Check if the language matches a known alias (e.g. "nynorsk"->("nn", "NO")) | ||
let (mut language, region) = match find_posix_alias(self.language) { | ||
Some((language, region)) => (language, region), | ||
let (mut language, region) = match find_posix_locale_name_alias(self.language) { | ||
Some((language, region)) => (language, Some(region)), | ||
None => { | ||
let language = Language::try_from_str(self.language)?; | ||
let region = self.territory.map(Region::try_from_str).transpose()?; | ||
|
@@ -284,10 +295,6 @@ impl<'src> PosixLocale<'src> { | |
} | ||
}; | ||
|
||
let mut extensions = Extensions::new(); | ||
let mut script = None; | ||
let mut variants = vec![variant!("posix")]; | ||
|
||
if let Some(modifier) = self.modifier { | ||
match modifier.to_ascii_lowercase().as_str() { | ||
"euro" => { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.