lib.rs - mozsearch

mozilla-central/third_party/rust/unic-langid-impl/src/lib.rs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

mod errors;

mod layout_table;

#[cfg(feature = "likelysubtags")]

pub mod likelysubtags;

#[doc(hidden)]

pub mod parser;

#[cfg(feature = "serde")]

mod serde;

pub mod subtags;

pub use crate::errors::LanguageIdentifierError;

use std::fmt::Write;

use std::iter::Peekable;

use std::str::FromStr;

/// Enum representing available character direction orientations.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum CharacterDirection {
    /// Right To Left
    ///
    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
    RTL,
    /// Left To Right
    ///
    /// Used in languages such as French, Spanish, English, German etc.
    LTR,
    /// Top To Bottom
    ///
    /// Used in Traditional Mongolian
    TTB,
}

/// Owned `(language, script, region, variants)` tuple, used as the return
/// type of `LanguageIdentifier::into_parts`.
type PartsTuple = (

    subtags::Language,

    Option<subtags::Script>,

    Option<subtags::Region>,

    Vec<subtags::Variant>,

);

/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.

///

/// # Examples

///

/// ```

/// use unic_langid_impl::LanguageIdentifier;

///

/// let li: LanguageIdentifier = "en-US".parse()

///     .expect("Failed to parse.");

///

/// assert_eq!(li.language, "en");

/// assert_eq!(li.script, None);

/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));

/// assert_eq!(li.variants().len(), 0);

/// ```

///

/// # Parsing

///

/// Unicode recognizes three levels of standard conformance for any language identifier:

///

///  * *well-formed* - syntactically correct

///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...

///  * *canonical* - valid and no deprecated codes or structure.

///

/// At the moment parsing normalizes a well-formed language identifier converting

/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.

///

/// Any bogus subtags will cause the parsing to fail with an error.

/// No subtag validation is performed.

///

/// # Examples:

///

/// ```

/// use unic_langid_impl::LanguageIdentifier;

///

/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()

///     .expect("Failed to parse.");

///

/// assert_eq!(li.language, "en");

/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));

/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));

/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);

/// ```

#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]

pub struct LanguageIdentifier {

    pub language: subtags::Language,

    pub script: Option<subtags::Script>,

    pub region: Option<subtags::Region>,

    variants: Option<Box<[subtags::Variant]>>,

impl LanguageIdentifier {

    /// A constructor which takes a utf8 slice, parses it and

    /// produces a well-formed `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li.to_string(), "en-US");

    /// ```

    pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {

        Ok(parser::parse_language_identifier(v)?)

    /// A constructor which takes already-parsed subtags, sorts and deduplicates

    /// the variants, and produces a well-formed `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let li = LanguageIdentifier::from_parts(

    ///     "fr".parse().expect("Parsing failed."),

    ///     None,

    ///     Some("CA".parse().expect("Parsing failed.")),

    ///     &[]

    /// );

///

    /// assert_eq!(li.to_string(), "fr-CA");

    /// ```

    pub fn from_parts(

        language: subtags::Language,

        script: Option<subtags::Script>,

        region: Option<subtags::Region>,

        variants: &[subtags::Variant],

    ) -> Self {

        let variants = if !variants.is_empty() {

            let mut v = variants.to_vec();

            v.sort_unstable();

            v.dedup();

            Some(v.into_boxed_slice())

        } else {

            None

};

        Self {

            language,

            script,

            region,

            variants,

    /// # Unchecked

///

    /// This function accepts subtags expecting variants

    /// to be deduplicated and ordered.

    pub const fn from_raw_parts_unchecked(

        language: subtags::Language,

        script: Option<subtags::Script>,

        region: Option<subtags::Region>,

        variants: Option<Box<[subtags::Variant]>>,

    ) -> Self {

        Self {

            language,

            script,

            region,

            variants,

    #[doc(hidden)]

    /// This method is used by `unic-locale` to handle partial

    /// subtag iterator.

///

    /// Not stable.

    pub fn try_from_iter<'a>(

        iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,

        allow_extension: bool,

    ) -> Result<LanguageIdentifier, LanguageIdentifierError> {

        Ok(parser::parse_language_identifier_from_iter(

            iter,

            allow_extension,

)?)

    /// Consumes `LanguageIdentifier` and returns its subtags as a

    /// `(language, script, region, variants)` tuple.

///

    /// Primarily used for storing internal representation and restoring via

    /// `from_raw_parts_unchecked`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

    /// use tinystr::{TinyStr8, TinyStr4};

///

    /// let li: LanguageIdentifier = "en-US".parse()

    ///     .expect("Parsing failed.");

///

    /// let (lang, script, region, variants) = li.into_parts();

///

    /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked(

    /// //     lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }),

    /// //    script.map(|s| unsafe { TinyStr4::new_unchecked(s) }),

    /// //    region.map(|r| unsafe { TinyStr4::new_unchecked(r) }),

    /// //    variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()),

    /// //);

///

    /// //assert_eq!(li2.to_string(), "en-US");

    /// ```

    pub fn into_parts(self) -> PartsTuple {

            self.language,

            self.script,

            self.region,

            self.variants.map_or_else(Vec::new, |v| v.to_vec()),

    /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier>`

    /// allowing for either side to use the missing fields as wildcards.

///

    /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let li1: LanguageIdentifier = "en".parse()

    ///     .expect("Parsing failed.");

///

    /// let li2: LanguageIdentifier = "en-US".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_ne!(li1, li2); // "en" != "en-US"

    /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"

///

    /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"

    /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"

    /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"

    /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"

    /// ```

    pub fn matches<O: AsRef<Self>>(

        &self,

        other: &O,

        self_as_range: bool,

        other_as_range: bool,

    ) -> bool {

        let other = other.as_ref();

        self.language

            .matches(other.language, self_as_range, other_as_range)

            && subtag_matches(&self.script, &other.script, self_as_range, other_as_range)

            && subtag_matches(&self.region, &other.region, self_as_range, other_as_range)

            && subtags_match(

                &self.variants,

                &other.variants,

                self_as_range,

                other_as_range,

    /// Returns an iterator over the variant subtags of the `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);

///

    /// let li2: LanguageIdentifier = "de".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li2.variants().len(), 0);

    /// ```

    pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {

        let variants: &[_] = match self.variants {

            Some(ref v) => v,

            None => &[],

};

        variants.iter()

    /// Sets variant subtags of the `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let mut li: LanguageIdentifier = "ca-ES".parse()

    ///     .expect("Parsing failed.");

///

    /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);

///

    /// assert_eq!(li.to_string(), "ca-ES-valencia");

    /// ```

    pub fn set_variants(&mut self, variants: &[subtags::Variant]) {

        let mut v = variants.to_vec();

        if v.is_empty() {

            self.variants = None;

        } else {

            v.sort_unstable();

            v.dedup();

            self.variants = Some(v.into_boxed_slice());

    /// Tests if a variant subtag is present in the `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let mut li: LanguageIdentifier = "ca-ES-macos".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);

    /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);

    /// ```

    pub fn has_variant(&self, variant: subtags::Variant) -> bool {

        if let Some(variants) = &self.variants {

            variants.contains(&variant)

        } else {

            false

    /// Clears variant subtags of the `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()

    ///     .expect("Parsing failed.");

///

    /// li.clear_variants();

///

    /// assert_eq!(li.to_string(), "ca-ES");

    /// ```

    pub fn clear_variants(&mut self) {

        self.variants = None;

    /// Extends the `LanguageIdentifier` adding likely subtags based

    /// on tables provided by CLDR.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let mut li: LanguageIdentifier = "en-US".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li.maximize(), true);

    /// assert_eq!(li.to_string(), "en-Latn-US");

    /// ```

    #[cfg(feature = "likelysubtags")]

    pub fn maximize(&mut self) -> bool {

        if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {

            self.language = new_li.0;

            self.script = new_li.1;

            self.region = new_li.2;

            true

        } else {

            false

    /// Reduces the `LanguageIdentifier` by removing likely subtags based

    /// on tables provided by CLDR.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::LanguageIdentifier;

///

    /// let mut li: LanguageIdentifier = "en-Latn-US".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li.minimize(), true);

    /// assert_eq!(li.to_string(), "en");

    /// ```

    #[cfg(feature = "likelysubtags")]

    pub fn minimize(&mut self) -> bool {

        if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {

            self.language = new_li.0;

            self.script = new_li.1;

            self.region = new_li.2;

            true

        } else {

            false

    /// Returns character direction of the `LanguageIdentifier`.

///

    /// # Examples

///

    /// ```

    /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};

///

    /// let li1: LanguageIdentifier = "es-AR".parse()

    ///     .expect("Parsing failed.");

    /// let li2: LanguageIdentifier = "fa".parse()

    ///     .expect("Parsing failed.");

///

    /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);

    /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);

    /// ```

    pub fn character_direction(&self) -> CharacterDirection {

        match (self.language.into(), self.script) {

            (_, Some(script))

                if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) =>

                CharacterDirection::LTR

            (_, Some(script))

                if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>

                CharacterDirection::RTL

            (_, Some(script))

                if layout_table::SCRIPTS_CHARACTER_DIRECTION_TTB.contains(&script.into()) =>

                CharacterDirection::TTB

            (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {

                #[cfg(feature = "likelysubtags")]

                if let Some((_, Some(script), _)) =

                    likelysubtags::maximize(self.language, None, self.region)

                    if layout_table::SCRIPTS_CHARACTER_DIRECTION_LTR.contains(&script.into()) {

                        return CharacterDirection::LTR;

                CharacterDirection::RTL

            _ => CharacterDirection::LTR,

impl FromStr for LanguageIdentifier {

    type Err = LanguageIdentifierError;

    fn from_str(source: &str) -> Result<Self, Self::Err> {

        Self::from_bytes(source.as_bytes())

impl AsRef<LanguageIdentifier> for LanguageIdentifier {

    #[inline(always)]

    fn as_ref(&self) -> &LanguageIdentifier {

        self

impl std::fmt::Display for LanguageIdentifier {

    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {

        self.language.fmt(f)?;

        if let Some(ref script) = self.script {

            f.write_char('-')?;

            script.fmt(f)?;

        if let Some(ref region) = self.region {

            f.write_char('-')?;

            region.fmt(f)?;

        if let Some(variants) = &self.variants {

            for variant in variants.iter() {

                f.write_char('-')?;

                variant.fmt(f)?;

        Ok(())

impl PartialEq<&str> for LanguageIdentifier {

    fn eq(&self, other: &&str) -> bool {

        self.to_string().as_str() == *other

/// Compares two optional subtags, optionally treating a missing one as a wildcard.
///
/// Returns `true` when `subtag1` is `None` and `as_range1` is set, when
/// `subtag2` is `None` and `as_range2` is set, or when both values are equal.
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none()) || subtag1 == subtag2
}

/// Returns `true` when `subtag` is `None` or holds an empty slice.
fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
    subtag.as_ref().map_or(true, |t| t.is_empty())
}

fn subtags_match<P: PartialEq>(

    subtag1: &Option<Box<[P]>>,

    subtag2: &Option<Box<[P]>>,

    as_range1: bool,

    as_range2: bool,

) -> bool {

    // or is some and is empty!

    (as_range1 && is_option_empty(subtag1))

        || (as_range2 && is_option_empty(subtag2))

        || subtag1 == subtag2

/// This is a best-effort operation that performs all available levels of canonicalization.

///

/// At the moment the operation will normalize casing and the separator, but in the future

/// it may also validate and update from deprecated subtags to canonical ones.

///

/// # Examples

///

/// ```

/// use unic_langid_impl::canonicalize;

///

/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));

/// ```

pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {

    let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?;

    Ok(lang_id.to_string())

#[test]
fn invalid_subtag() {
    // A subtag made of non-ASCII characters is expected to be rejected.
    assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err());
}