Source code

Revision control

Copy as Markdown

Other Tools

// Copyright The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! This module provides the lower-level API for UTS 46.
//!
//! [`Uts46::process`] is the core that the other convenience
//! methods build on.
//!
//! UTS 46 flags map to this API as follows:
//!
//! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
//! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
//! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`].
//! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, or Chrome set this flag to _false_.
//! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browser clearly behave as if this was _false_).
use crate::punycode::Decoder;
use crate::punycode::InternalCaller;
use alloc::borrow::Cow;
use alloc::string::String;
use core::fmt::Write;
use icu_normalizer::properties::CanonicalCombiningClassMap;
use icu_normalizer::uts46::Uts46Mapper;
use icu_properties::maps::CodePointMapDataBorrowed;
use icu_properties::BidiClass;
use icu_properties::CanonicalCombiningClass;
use icu_properties::GeneralCategory;
use icu_properties::JoiningType;
use smallvec::SmallVec;
use utf8_iter::Utf8CharsEx;
/// ICU4C-compatible constraint.
const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2000;
/// ICU4C-compatible constraint. (Note: ICU4C measures
/// UTF-16 and we measure UTF-32. This means that we
/// allow longer non-BMP inputs. For this implementation,
/// the denial-of-service scaling does not depend on BMP vs.
/// non-BMP: only the scalar values matter.)
///
const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1000;
/// For keeping track of what kind of numerals have been
/// seen in an RTL label.
#[derive(Debug, PartialEq, Eq)]
enum RtlNumeralState {
Undecided,
European,
Arabic,
}
/// Computes the mask for upper-case ASCII.
const fn upper_case_mask() -> u128 {
let mut accu = 0u128;
let mut b = 0u8;
while b < 128 {
if (b >= b'A') && (b <= b'Z') {
accu |= 1u128 << b;
}
b += 1;
}
accu
}
/// Bit set for upper-case ASCII.
const UPPER_CASE_MASK: u128 = upper_case_mask();
/// Computes the mask for glyphless ASCII.
const fn glyphless_mask() -> u128 {
let mut accu = 0u128;
let mut b = 0u8;
while b < 128 {
if (b <= b' ') || (b == 0x7F) {
accu |= 1u128 << b;
}
b += 1;
}
accu
}
/// Bit set for glyphless ASCII.
const GLYPHLESS_MASK: u128 = glyphless_mask();
/// The mask for the ASCII dot.
const DOT_MASK: u128 = 1 << b'.';
/// Computes the ASCII deny list for STD3 ASCII rules.
const fn ldh_mask() -> u128 {
let mut accu = 0u128;
let mut b = 0u8;
while b < 128 {
if !((b >= b'a' && b <= b'z') || (b >= b'0' && b <= b'9') || b == b'-' || b == b'.') {
accu |= 1u128 << b;
}
b += 1;
}
accu
}
/// Turns a joining type into a mask for comparing with multiple type at once.
const fn joining_type_to_mask(jt: JoiningType) -> u32 {
1u32 << jt.0
}
/// Mask for checking for both left and dual joining.
const LEFT_OR_DUAL_JOINING_MASK: u32 =
joining_type_to_mask(JoiningType::LeftJoining) | joining_type_to_mask(JoiningType::DualJoining);
/// Mask for checking for both left and dual joining.
const RIGHT_OR_DUAL_JOINING_MASK: u32 = joining_type_to_mask(JoiningType::RightJoining)
| joining_type_to_mask(JoiningType::DualJoining);
/// Turns a bidi class into a mask for comparing with multiple classes at once.
const fn bidi_class_to_mask(bc: BidiClass) -> u32 {
1u32 << bc.0
}
/// Mask for checking if the domain is a bidi domain.
const RTL_MASK: u32 = bidi_class_to_mask(BidiClass::RightToLeft)
| bidi_class_to_mask(BidiClass::ArabicLetter)
| bidi_class_to_mask(BidiClass::ArabicNumber);
/// Mask for allowable bidi classes in the first character of a label
/// (either LTR or RTL) in a bidi domain.
const FIRST_BC_MASK: u32 = bidi_class_to_mask(BidiClass::LeftToRight)
| bidi_class_to_mask(BidiClass::RightToLeft)
| bidi_class_to_mask(BidiClass::ArabicLetter);
// Mask for allowable bidi classes of the last (non-Non-Spacing Mark)
// character in an LTR label in a bidi domain.
const LAST_LTR_MASK: u32 =
bidi_class_to_mask(BidiClass::LeftToRight) | bidi_class_to_mask(BidiClass::EuropeanNumber);
// Mask for allowable bidi classes of the last (non-Non-Spacing Mark)
// character in an RTL label in a bidi domain.
const LAST_RTL_MASK: u32 = bidi_class_to_mask(BidiClass::RightToLeft)
| bidi_class_to_mask(BidiClass::ArabicLetter)
| bidi_class_to_mask(BidiClass::EuropeanNumber)
| bidi_class_to_mask(BidiClass::ArabicNumber);
// Mask for allowable bidi classes of the middle characters in an LTR label in a bidi domain.
const MIDDLE_LTR_MASK: u32 = bidi_class_to_mask(BidiClass::LeftToRight)
| bidi_class_to_mask(BidiClass::EuropeanNumber)
| bidi_class_to_mask(BidiClass::EuropeanSeparator)
| bidi_class_to_mask(BidiClass::CommonSeparator)
| bidi_class_to_mask(BidiClass::EuropeanTerminator)
| bidi_class_to_mask(BidiClass::OtherNeutral)
| bidi_class_to_mask(BidiClass::BoundaryNeutral)
| bidi_class_to_mask(BidiClass::NonspacingMark);
// Mask for allowable bidi classes of the middle characters in an RTL label in a bidi domain.
const MIDDLE_RTL_MASK: u32 = bidi_class_to_mask(BidiClass::RightToLeft)
| bidi_class_to_mask(BidiClass::ArabicLetter)
| bidi_class_to_mask(BidiClass::ArabicNumber)
| bidi_class_to_mask(BidiClass::EuropeanNumber)
| bidi_class_to_mask(BidiClass::EuropeanSeparator)
| bidi_class_to_mask(BidiClass::CommonSeparator)
| bidi_class_to_mask(BidiClass::EuropeanTerminator)
| bidi_class_to_mask(BidiClass::OtherNeutral)
| bidi_class_to_mask(BidiClass::BoundaryNeutral)
| bidi_class_to_mask(BidiClass::NonspacingMark);
/// Turns a genecal category into a mask for comparing with multiple categories at once.
const fn general_category_to_mask(gc: GeneralCategory) -> u32 {
1 << (gc as u32)
}
/// Mask for the disallowed general categories of the first character in a label.
const MARK_MASK: u32 = general_category_to_mask(GeneralCategory::NonspacingMark)
| general_category_to_mask(GeneralCategory::SpacingMark)
| general_category_to_mask(GeneralCategory::EnclosingMark);
const PUNYCODE_PREFIX: u32 =
((b'-' as u32) << 24) | ((b'-' as u32) << 16) | ((b'N' as u32) << 8) | b'X' as u32;
const PUNYCODE_PREFIX_MASK: u32 = (0xFF << 24) | (0xFF << 16) | (0xDF << 8) | 0xDF;
fn write_punycode_label<W: Write + ?Sized>(
label: &[char],
sink: &mut W,
) -> Result<(), ProcessingError> {
sink.write_str("xn--")?;
crate::punycode::encode_into::<_, _, InternalCaller>(label.iter().copied(), sink)?;
Ok(())
}
#[inline(always)]
fn has_punycode_prefix(slice: &[u8]) -> bool {
if slice.len() < 4 {
return false;
}
// Sadly, the optimizer doesn't figure out that more idiomatic code
// should compile to masking on 32-bit value.
let a = slice[0];
let b = slice[1];
let c = slice[2];
let d = slice[3];
let u = (u32::from(d) << 24) | (u32::from(c) << 16) | (u32::from(b) << 8) | u32::from(a);
(u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX
}
#[inline(always)]
fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool {
u.wrapping_sub(start) <= (end - start)
}
#[inline(always)]
fn in_inclusive_range_char(c: char, start: char, end: char) -> bool {
u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
#[inline(always)]
fn is_passthrough_ascii_label(label: &[u8]) -> bool {
// XXX if we aren't performing _CheckHyphens_, this could
// check for "xn--" and pass through YouTube CDN node names.
if label.len() >= 4 && label[2] == b'-' && label[3] == b'-' {
return false;
}
if let Some((&first, tail)) = label.split_first() {
// We need to check the first and last character
// more strictly in case this turns out to be a
// label in a bidi domain name. This has the side
// effect that this function only accepts labels
// that also conform to the STD3 rules.
//
// XXX: If we are in the fail-fast mode (i.e. we don't need
// to be able to overwrite anything with U+FFFD), we could
// merely record that we've seen a digit here and error out
// if we later discover that the domain name is a bidi
// domain name.
if !in_inclusive_range8(first, b'a', b'z') {
return false;
}
for &b in tail {
// If we used LDH_MASK, we'd have to check
// the bytes for the ASCII range anyhow.
if in_inclusive_range8(b, b'a', b'z') {
continue;
}
if in_inclusive_range8(b, b'0', b'9') {
continue;
}
if b == b'-' {
continue;
}
return false;
}
label.last() != Some(&b'-')
} else {
// empty
true
}
}
#[inline(always)]
fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) {
if let Some(pos) = label.iter().position(|b| !b.is_ascii()) {
if pos == 0 {
// First is non-ASCII
(&[], label)
} else {
// Leave one ASCII character in the suffix
// in case it's a letter that a combining
// character combines with.
let (head, tail) = label.split_at(pos - 1);
(head, tail)
}
} else {
// All ASCII
(label, &[])
}
}
// Input known to be lower-case, but may contain non-ASCII.
#[inline(always)]
fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char {
if let Some(shifted) = 1u128.checked_shl(u32::from(c)) {
if (deny_list & shifted) == 0 {
c
} else {
'\u{FFFD}'
}
} else {
c
}
}
// Input known to be ASCII, but may contain upper case ASCII.
#[inline(always)]
fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char {
if (deny_list & (1u128 << b)) == 0 {
return char::from(b);
}
if in_inclusive_range8(b, b'A', b'Z') {
return char::from(b + 0x20);
}
'\u{FFFD}'
}
#[inline(always)]
fn is_ascii(label: &[char]) -> bool {
for c in label.iter() {
if !c.is_ascii() {
return false;
}
}
true
}
#[derive(PartialEq, Eq, Copy, Clone)]
enum PunycodeClassification {
Ascii,
Unicode,
Error,
}
#[inline(always)]
fn classify_for_punycode(label: &[char]) -> PunycodeClassification {
let mut iter = label.iter().copied();
loop {
if let Some(c) = iter.next() {
if c.is_ascii() {
continue;
}
if c == '\u{FFFD}' {
return PunycodeClassification::Error;
}
for c in iter {
if c == '\u{FFFD}' {
return PunycodeClassification::Error;
}
}
return PunycodeClassification::Unicode;
}
return PunycodeClassification::Ascii;
}
}
/// The ASCII deny list to be applied.
#[derive(PartialEq, Eq, Copy, Clone)]
#[repr(transparent)]
pub struct AsciiDenyList {
bits: u128,
}
impl AsciiDenyList {
/// Computes (preferably at compile time) an ASCII deny list.
///
/// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below
/// as well as U+007F DELETE for convenience without having to list
/// these characters in the `deny_list` string.
///
/// `deny_list` is the list of ASCII characters to deny. This
/// list must not contain any of:
/// * Letters
/// * Digits
/// * Hyphen
/// * Dot (period / full-stop)
/// * Non-ASCII
///
/// # Panics
///
/// If the deny list contains characters listed as prohibited above.
pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self {
let mut bits = UPPER_CASE_MASK;
if deny_glyphless {
bits |= GLYPHLESS_MASK;
}
let mut i = 0;
let bytes = deny_list.as_bytes();
while i < bytes.len() {
let b = bytes[i];
assert!(b < 0x80, "ASCII deny list must be ASCII.");
// assert_ne not yet available in const context.
assert!(b != b'.', "ASCII deny list must not contain the dot.");
assert!(b != b'-', "ASCII deny list must not contain the hyphen.");
assert!(
!((b >= b'0') && (b <= b'9')),
"ASCII deny list must not contain digits."
);
assert!(
!((b >= b'a') && (b <= b'z')),
"ASCII deny list must not contain letters."
);
assert!(
!((b >= b'A') && (b <= b'Z')),
"ASCII deny list must not contain letters."
);
bits |= 1u128 << b;
i += 1;
}
AsciiDenyList { bits }
}
/// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_.
///
/// Equivalent to `AsciiDenyList::new(false, "")`.
///
/// Note: Not denying the space and control characters can result in
/// strange behavior. Without a deny list provided to the UTS 46
/// operation, the caller is expected perform filtering afterwards,
/// but it's more efficient to use `AsciiDenyList` than post-processing,
/// because the internals of this crate can optimize away checks in
/// certain cases.
pub const EMPTY: AsciiDenyList = AsciiDenyList::new(false, "");
/// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_.
///
/// Note that this deny list rejects the underscore, which occurs in
/// pseudo-hosts used by various TXT record-based protocols, and also
/// characters that may occurs in non-DNS naming, such as NetBIOS.
pub const STD3: AsciiDenyList = AsciiDenyList { bits: ldh_mask() };
/// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard.
///
/// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^|")`.
///
/// Note that this deny list rejects IPv6 addresses, so (as in URL
/// parsing) you need to check for IPv6 addresses first and not
/// put them through UTS 46 processing.
pub const URL: AsciiDenyList = AsciiDenyList::new(true, "%#/:<>?@[\\]^|");
}
/// The _CheckHyphens_ mode.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive] // non_exhaustive in case a middle mode that prohibits only first and last position needs to be added
pub enum Hyphens {
/// _CheckHyphens=false_: Do not place positional restrictions on hyphens.
///
/// This mode is used by the WHATWG URL Standard for normal User Agent processing
/// (i.e. not conformance checking).
Allow,
/// Prohibit hyphens in the first and last position in the label but allow in
/// the third and fourth position.
///
/// Note that this mode rejects real-world names, including some GitHub user pages.
CheckFirstLast,
/// _CheckHyphens=true_: Prohibit hyphens in the first, third, fourth,
/// and last position in the label.
///
/// Note that this mode rejects real-world names, including YouTube CDN nodes
/// and some GitHub user pages.
Check,
}
/// The UTS 46 _VerifyDNSLength_ flag.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum DnsLength {
/// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS naming systems.)
Ignore,
/// _VerifyDNSLength=true_ with the exception that the trailing root label dot is
/// allowed.
VerifyAllowRootDot,
/// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.)
Verify,
}
/// Policy for customizing behavior in case of an error.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum ErrorPolicy {
/// Return as early as possible without producing output in case of error.
FailFast,
/// In case of error, mark errors with the REPLACEMENT CHARACTER. (The output
/// containing REPLACEMENT CHARACTERs may be show to the user to illustrate
/// what was wrong but must not be used for naming in a network protocol.)
MarkErrors,
}
/// The success outcome of [`Uts46::process`]
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingSuccess {
/// There were no errors. The caller must consider the input to be the output.
///
/// This asserts that the input can be safely passed to [`core::str::from_utf8_unchecked`].
///
/// (Distinct from `WroteToSink` in order to allow `Cow` behavior to be implemented on top of
/// [`Uts46::process`].)
Passthrough,
/// There were no errors. The caller must consider what was written to the sink to be the output.
///
/// (Distinct from `Passthrough` in order to allow `Cow` behavior to be implemented on top of
/// [`Uts46::process`].)
WroteToSink,
}
/// The failure outcome of [`Uts46::process`]
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingError {
/// There was a validity error according to the chosen options.
///
/// In case of `Operation::ToAscii`, there is no output. Otherwise, output was written to the
/// sink and the output contains at least one U+FFFD REPLACEMENT CHARACTER to denote an error.
ValidityError,
/// The sink emitted [`core::fmt::Error`]. The partial output written to the sink must not
/// be used.
SinkError,
}
impl From<core::fmt::Error> for ProcessingError {
fn from(_: core::fmt::Error) -> Self {
ProcessingError::SinkError
}
}
impl From<crate::punycode::PunycodeEncodeError> for ProcessingError {
fn from(_: crate::punycode::PunycodeEncodeError) -> Self {
unreachable!(
"Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH"
);
}
}
#[derive(Debug, Clone, Copy)]
enum AlreadyAsciiLabel<'a> {
MixedCaseAscii(&'a [u8]),
MixedCasePunycode(&'a [u8]),
Other,
}
/// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation.
///
/// If the second argument is `false`, the trailing root label dot is allowed.
///
/// # Panics
///
/// Panics in debug mode if the argument isn't ASCII.
pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool {
let bytes = domain_name.as_bytes();
debug_assert!(bytes.is_ascii());
let domain_name_without_trailing_dot = if let Some(without) = bytes.strip_suffix(b".") {
if !allow_trailing_dot {
return false;
}
without
} else {
bytes
};
if domain_name_without_trailing_dot.len() > 253 {
return false;
}
for label in domain_name_without_trailing_dot.split(|b| *b == b'.') {
if label.is_empty() {
return false;
}
if label.len() > 63 {
return false;
}
}
true
}
/// An implementation of UTS #46.
pub struct Uts46 {
mapper: Uts46Mapper,
canonical_combining_class: CanonicalCombiningClassMap,
general_category: CodePointMapDataBorrowed<'static, GeneralCategory>,
bidi_class: CodePointMapDataBorrowed<'static, BidiClass>,
joining_type: CodePointMapDataBorrowed<'static, JoiningType>,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46 {
fn default() -> Self {
Self::new()
}
}
impl Uts46 {
/// Constructor using data compiled into the binary.
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
Self {
mapper: Uts46Mapper::new(),
canonical_combining_class: CanonicalCombiningClassMap::new(),
general_category: icu_properties::maps::general_category(),
bidi_class: icu_properties::maps::bidi_class(),
joining_type: icu_properties::maps::joining_type(),
}
}
// XXX Should there be an `icu_provider` feature for enabling
// a constructor for run-time data loading?
/// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation
/// from UTS #46 with the options indicated.
///
/// # Arguments
///
/// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
/// this method and input that is not well-formed UTF-8 is treated as an error. If you
/// already have a `&str`, call `.as_bytes()` on it.)
/// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
/// processing is handled via this argument. Most callers are probably the best off
/// by using [`AsciiDenyList::URL`] here.
/// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
/// off by using [`Hyphens::Allow`] here.
/// * `dns_length` - The UTS 46 _VerifyDNSLength_ flag.
pub fn to_ascii<'a>(
&self,
domain_name: &'a [u8],
ascii_deny_list: AsciiDenyList,
hyphens: Hyphens,
dns_length: DnsLength,
) -> Result<Cow<'a, str>, crate::Errors> {
let mut s = String::new();
match self.process(
domain_name,
ascii_deny_list,
hyphens,
ErrorPolicy::FailFast,
|_, _, _| false,
&mut s,
None,
) {
// SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
Ok(ProcessingSuccess::Passthrough) => {
let cow = Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) });
if dns_length != DnsLength::Ignore
&& !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
{
Err(crate::Errors::default())
} else {
Ok(cow)
}
}
Ok(ProcessingSuccess::WroteToSink) => {
let cow: Cow<'_, str> = Cow::Owned(s);
if dns_length != DnsLength::Ignore
&& !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
{
Err(crate::Errors::default())
} else {
Ok(cow)
}
}
Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
Err(ProcessingError::SinkError) => unreachable!(),
}
}
/// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
/// from UTS #46 according to the options given. When there
/// are errors, there is still output, which may be rendered user, even through
/// the output must not be used in networking protocols. Errors are denoted
/// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the
/// return tuple is `Err`, the first item of the return tuple is guaranteed to contain
/// at least one U+FFFD.)
///
/// Most applications probably shouldn't use this method and should be using
/// [`Uts46::to_user_interface`] instead.
///
/// # Arguments
///
/// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
/// this method and input that is not well-formed UTF-8 is treated as an error. If you
/// already have a `&str`, call `.as_bytes()` on it.)
/// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
/// processing is handled via this argument. Most callers are probably the best off
/// by using [`AsciiDenyList::URL`] here.
/// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
/// off by using [`Hyphens::Allow`] here.
pub fn to_unicode<'a>(
&self,
domain_name: &'a [u8],
ascii_deny_list: AsciiDenyList,
hyphens: Hyphens,
) -> (Cow<'a, str>, Result<(), crate::Errors>) {
self.to_user_interface(domain_name, ascii_deny_list, hyphens, |_, _, _| true)
}
/// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
/// from UTS #46 according to options given with some
/// error-free Unicode labels output according to
/// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by
/// application policy implemented via the `output_as_unicode` closure. The purpose
/// is to convert user-visible domains to the Unicode form in general but to render
/// potentially misleading labels as Punycode.
///
/// This is an imperfect security mechanism, because [the Punycode form itself may be
/// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
/// However, since this mechanism is common practice, this API provides support for The
/// the mechanism.
///
/// ASCII labels always pass through as ASCII and labels with errors always pass through
/// as Unicode. For non-erroneous labels that contain at least one non-ASCII character
/// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label,
/// the TLD (potentially empty), and a flag indicating whether the domain name as a whole
/// is a bidi domain name. If the return value is `true`, the label passes through as
/// Unicode. If the return value is `false`, the label is converted to Punycode.
///
/// When there are errors, there is still output, which may be rendered user, even through
/// the output must not be used in networking protocols. Errors are denoted by
/// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item
/// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain
/// at least one U+FFFD.) Labels that contain errors are not converted to Punycode.
///
/// # Arguments
///
/// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
/// this method and input that is not well-formed UTF-8 is treated as an error. If you
/// already have a `&str`, call `.as_bytes()` on it.)
/// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
/// processing is handled via this argument. Most callers are probably the best off
/// by using [`AsciiDenyList::URL`] here.
/// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
/// off by using [`Hyphens::Allow`] here.
/// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
/// (as opposed to Punycode). The first argument is the label for which a decision is
/// needed (always non-empty slice). The second argument is the TLD (potentially empty).
/// The third argument is `true` iff the domain name as a whole is a bidi domain name.
/// Only non-erroneous labels that contain at least one non-ASCII character are passed
/// to the closure as the first argument. The second and third argument values are
/// guaranteed to remain the same during a single call to `process`, and the closure
/// may cache computations derived from the second and third argument (hence the
/// `FnMut` type).
pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
&self,
domain_name: &'a [u8],
ascii_deny_list: AsciiDenyList,
hyphens: Hyphens,
output_as_unicode: OutputUnicode,
) -> (Cow<'a, str>, Result<(), crate::Errors>) {
let mut s = String::new();
match self.process(
domain_name,
ascii_deny_list,
hyphens,
ErrorPolicy::MarkErrors,
output_as_unicode,
&mut s,
None,
) {
// SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
Ok(ProcessingSuccess::Passthrough) => (
Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }),
Ok(()),
),
Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())),
Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())),
Err(ProcessingError::SinkError) => unreachable!(),
}
}
/// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and
/// [`Uts46::to_user_interface`] are built on to allow support for output types other
/// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language).
///
/// # Arguments
///
/// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
/// this method and input that is not well-formed UTF-8 is treated as an error. If you
/// already have a `&str`, call `.as_bytes()` on it.)
/// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
/// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
/// processing is handled via this argument. Most callers are probably the best off
/// by using [`AsciiDenyList::URL`] here.
/// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
/// off by using [`Hyphens::Allow`] here.
/// * `error_policy` - Whether to fail fast or to produce output that may be rendered
/// for the user to examine in case of errors.
/// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
/// (as opposed to Punycode). The first argument is the label for which a decision is
/// needed (always non-empty slice). The second argument is the TLD (potentially empty).
/// The third argument is `true` iff the domain name as a whole is a bidi domain name.
/// Only non-erroneous labels that contain at least one non-ASCII character are passed
/// to the closure as the first argument. The second and third argument values are
/// guaranteed to remain the same during a single call to `process`, and the closure
/// may cache computations derived from the second and third argument (hence the
/// `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be
/// passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must
/// be passed as the closure. A more complex closure may be used to prepare a domain
/// name for display in a user interface so that labels are converted to the Unicode
/// form in general but potentially misleading labels are converted to the Punycode
/// form.
/// `sink` - The object that receives the output (in the non-passthrough case).
/// `ascii_sink` - A second sink that receives the _ToASCII_ form only if there
/// were no errors and `sink` received at least one character of non-ASCII output.
/// The purpose of this argument is to enable a user interface display form of the
/// domain and the _ToASCII_ form of the domain to be computed efficiently together.
/// This argument is useless when `output_as_unicode` always returns `false`, in
/// which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives
/// no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the
/// output received by `sink` also as the _ToASCII_ result.
///
/// # Return value
///
/// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat
/// `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This
/// return value asserts that calling `core::str::from_utf8_unchecked(domain_name)`
/// is safe.)
/// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat was was written
/// to `sink` as the output. If another sink was passed as `ascii_sink` but it did
/// not receive output, the caller must treat what was written to `sink` also as
/// the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller
/// must treat what was written to `ascii_sink` as the _ToASCII_ output.
/// * `Err(ProcessingError::ValidityError)` - The input was in error and must
/// not be used for DNS lookup or otherwise in a network protocol. If `error_policy`
/// was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed
/// to the user as an illustration of where the error was or the errors were.
/// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned
/// [`core::fmt::Error`]. The partial output written to `sink` `ascii_sink` must not
/// be used. If `W` never returns [`core::fmt::Error`], this method never returns
/// `Err(ProcessingError::SinkError)`.
///
/// # Safety-usable invariant
///
/// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is
/// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note:
/// Other return values do _not_ imply that `domain_name` wasn't ASCII!)
///
/// # Security considerations
///
/// Showing labels whose Unicode form might mislead the user as Punycode instead is
/// an imperfect security mechanism, because [the Punycode form itself may be resemble
/// However, since this mechanism is common practice, this API provides support for the
/// the mechanism.
///
/// Punycode processing is quadratic, so to avoid denial of service, this method imposes
/// length limits on Punycode treating especially long inputs as being in error. These
/// limits are well higher than the DNS length limits and are not more restrictive than
/// the limits imposed by ICU4C.
#[allow(clippy::too_many_arguments)]
pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
&self,
domain_name: &[u8],
ascii_deny_list: AsciiDenyList,
hyphens: Hyphens,
error_policy: ErrorPolicy,
mut output_as_unicode: OutputUnicode,
sink: &mut W,
ascii_sink: Option<&mut W>,
) -> Result<ProcessingSuccess, ProcessingError> {
let fail_fast = error_policy == ErrorPolicy::FailFast;
let mut domain_buffer = SmallVec::<[char; 253]>::new();
let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; 8]>::new();
// `process_inner` could be pasted inline here, but it's out of line in order
// to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`.
let (passthrough_up_to, is_bidi, had_errors) = self.process_inner(
domain_name,
ascii_deny_list,
hyphens,
fail_fast,
&mut domain_buffer,
&mut already_punycode,
);
if passthrough_up_to == domain_name.len() {
debug_assert!(!had_errors);
return Ok(ProcessingSuccess::Passthrough);
}
// Checked only after passthrough as a micro optimization.
if fail_fast && had_errors {
return Err(ProcessingError::ValidityError);
}
debug_assert_eq!(had_errors, domain_buffer.contains(&'\u{FFFD}'));
let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.']) {
without_dot
} else {
&domain_buffer[..]
};
// unwrap is OK, because we always have at least one label
let tld = without_dot.rsplit(|c| *c == '.').next().unwrap();
let mut had_unicode_output = false;
let mut seen_label = false;
let mut already_punycode_iter = already_punycode.iter();
let mut passthrough_up_to_extended = passthrough_up_to;
let mut flushed_prefix = false;
for label in domain_buffer.split(|c| *c == '.') {
// Unwrap is OK, because there are supposed to be as many items in
// `already_punycode` as there are labels.
let input_punycode = *already_punycode_iter.next().unwrap();
if seen_label {
if flushed_prefix {
sink.write_char('.')?;
} else {
debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
passthrough_up_to_extended += 1;
if passthrough_up_to_extended == domain_name.len() {
debug_assert!(!had_errors);
return Ok(ProcessingSuccess::Passthrough);
}
}
}
seen_label = true;
if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
if let Some(first_upper_case) =
mixed_case.iter().position(|c| c.is_ascii_uppercase())
{
let (head, tail) = mixed_case.split_at(first_upper_case);
let slice_to_write = if flushed_prefix {
head
} else {
flushed_prefix = true;
passthrough_up_to_extended += head.len();
debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
&domain_name[..passthrough_up_to_extended]
};
// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
for c in tail.iter() {
sink.write_char(char::from(c.to_ascii_lowercase()))?;
}
} else if flushed_prefix {
// SAFETY: `mixed_case` is known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
} else {
passthrough_up_to_extended += mixed_case.len();
if passthrough_up_to_extended == domain_name.len() {
debug_assert!(!had_errors);
return Ok(ProcessingSuccess::Passthrough);
}
}
continue;
}
let potentially_punycode = if fail_fast {
debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error);
!is_ascii(label)
} else {
classify_for_punycode(label) == PunycodeClassification::Unicode
};
let passthrough = if potentially_punycode {
let unicode = output_as_unicode(label, tld, is_bidi);
had_unicode_output |= unicode;
unicode
} else {
true
};
if passthrough {
if !flushed_prefix {
flushed_prefix = true;
// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
})?;
}
for c in label.iter().copied() {
sink.write_char(c)?;
}
} else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode {
if let Some(first_upper_case) =
mixed_case.iter().position(|c| c.is_ascii_uppercase())
{
let (head, tail) = mixed_case.split_at(first_upper_case);
let slice_to_write = if flushed_prefix {
head
} else {
flushed_prefix = true;
passthrough_up_to_extended += head.len();
debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
&domain_name[..passthrough_up_to_extended]
};
// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
for c in tail.iter() {
sink.write_char(char::from(c.to_ascii_lowercase()))?;
}
} else if flushed_prefix {
// SAFETY: `mixed_case` is known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
} else {
passthrough_up_to_extended += mixed_case.len();
if passthrough_up_to_extended == domain_name.len() {
debug_assert!(!had_errors);
return Ok(ProcessingSuccess::Passthrough);
}
}
} else {
if !flushed_prefix {
flushed_prefix = true;
// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
})?;
}
write_punycode_label(label, sink)?;
}
}
if had_errors {
return Err(ProcessingError::ValidityError);
}
if had_unicode_output {
if let Some(sink) = ascii_sink {
let mut seen_label = false;
let mut already_punycode_iter = already_punycode.iter();
let mut passthrough_up_to_extended = passthrough_up_to;
let mut flushed_prefix = false;
for label in domain_buffer.split(|c| *c == '.') {
// Unwrap is OK, because there are supposed to be as many items in
// `already_punycode` as there are labels.
let input_punycode = *already_punycode_iter.next().unwrap();
if seen_label {
if flushed_prefix {
sink.write_char('.')?;
} else {
debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
passthrough_up_to_extended += 1;
}
}
seen_label = true;
if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
if let Some(first_upper_case) =
mixed_case.iter().position(|c| c.is_ascii_uppercase())
{
let (head, tail) = mixed_case.split_at(first_upper_case);
let slice_to_write = if flushed_prefix {
head
} else {
flushed_prefix = true;
passthrough_up_to_extended += head.len();
debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
&domain_name[..passthrough_up_to_extended]
};
// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(slice_to_write)
})?;
for c in tail.iter() {
sink.write_char(char::from(c.to_ascii_lowercase()))?;
}
} else if flushed_prefix {
// SAFETY: `mixed_case` is known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
} else {
passthrough_up_to_extended += mixed_case.len();
}
continue;
}
if is_ascii(label) {
if !flushed_prefix {
flushed_prefix = true;
// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(
&domain_name[..passthrough_up_to_extended],
)
})?;
}
for c in label.iter().copied() {
sink.write_char(c)?;
}
} else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode
{
if let Some(first_upper_case) =
mixed_case.iter().position(|c| c.is_ascii_uppercase())
{
let (head, tail) = mixed_case.split_at(first_upper_case);
let slice_to_write = if flushed_prefix {
head
} else {
flushed_prefix = true;
passthrough_up_to_extended += head.len();
debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
&domain_name[..passthrough_up_to_extended]
};
// SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(slice_to_write)
})?;
for c in tail.iter() {
sink.write_char(char::from(c.to_ascii_lowercase()))?;
}
} else if flushed_prefix {
// SAFETY: `mixed_case` is known to be ASCII.
sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
} else {
passthrough_up_to_extended += mixed_case.len();
}
} else {
if !flushed_prefix {
flushed_prefix = true;
// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(
&domain_name[..passthrough_up_to_extended],
)
})?;
}
write_punycode_label(label, sink)?;
}
}
if !flushed_prefix {
// SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
sink.write_str(unsafe {
core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
})?;
}
}
}
Ok(ProcessingSuccess::WroteToSink)
}
/// The part of `process` that doesn't need to be generic over the sink and
/// can avoid monomorphizing in the interest of code size.
#[inline(never)]
fn process_inner<'a>(
&self,
domain_name: &'a [u8],
ascii_deny_list: AsciiDenyList,
hyphens: Hyphens,
fail_fast: bool,
domain_buffer: &mut SmallVec<[char; 253]>,
already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>,
) -> (usize, bool, bool) {
// Sadly, this even faster-path ASCII tier is needed to avoid regressing
// performance.
let mut iter = domain_name.iter();
let mut most_recent_label_start = iter.clone();
let tail = loop {
if let Some(&b) = iter.next() {
if in_inclusive_range8(b, b'a', b'z') {
continue;
}
if b == b'.' {
most_recent_label_start = iter.clone();
continue;
}
break most_recent_label_start.as_slice();
} else {
// Success! The whole input passes through on the fastest path!
return (domain_name.len(), false, false);
}
};
let deny_list = ascii_deny_list.bits;
let deny_list_deny_dot = deny_list | DOT_MASK;
let mut had_errors = false;
let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name`
// 253 ASCII characters is the max length for a valid domain name
// (excluding the root dot).
let mut current_label_start; // Index into `domain_buffer`
let mut seen_label = false;
let mut in_prefix = true;
for label in tail.split(|b| *b == b'.') {
// We check for passthrough only for the prefix. That is, if we
// haven't moved on and started filling `domain_buffer`. Keeping
// this stuff in one loop where the first items keep being skipped
// once they have been skipped at least once instead of working
// this into a fancier loop structure in order to make sure that
// no item from the iterator is lost or processed twice.
// Furthermore, after the passthrough fails, restarting the
// normalization process after each pre-existing ASCII dot also
// provides an opportunity for the processing to get back onto
// an ASCII fast path that bypasses the normalizer for ASCII
// after a pre-existing ASCII dot (pre-existing in the sense
// of not coming from e.g. normalizing an ideographic dot).
if in_prefix && is_passthrough_ascii_label(label) {
if seen_label {
debug_assert_eq!(domain_name[passthrough_up_to], b'.');
passthrough_up_to += 1;
}
seen_label = true;
passthrough_up_to += label.len();
continue;
}
if seen_label {
if in_prefix {
debug_assert_eq!(domain_name[passthrough_up_to], b'.');
passthrough_up_to += 1;
} else {
domain_buffer.push('.');
}
}
seen_label = true;
in_prefix = false;
current_label_start = domain_buffer.len();
if !label.is_empty() {
let (ascii, non_ascii) = split_ascii_fast_path_prefix(label);
let non_punycode_ascii_label = if non_ascii.is_empty() {
if has_punycode_prefix(ascii) {
if (ascii.last() != Some(&b'-'))
&& (ascii.len() - 4 <= PUNYCODE_DECODE_MAX_INPUT_LENGTH)
{
if let Ok(decode) =
Decoder::default().decode::<u8, InternalCaller>(&ascii[4..])
{
// 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
// characters.
let mut label_buffer = SmallVec::<[char; 59]>::new();
label_buffer.extend(decode);
if self.after_punycode_decode(
domain_buffer,
current_label_start,
&label_buffer,
deny_list_deny_dot,
fail_fast,
&mut had_errors,
) {
return (0, false, true);
}
if self.check_label(
hyphens,
&mut domain_buffer[current_label_start..],
fail_fast,
&mut had_errors,
true,
true,
) {
return (0, false, true);
}
} else {
// Punycode failed
if fail_fast {
return (0, false, true);
}
had_errors = true;
domain_buffer.push('\u{FFFD}');
let mut iter = ascii.iter();
// Discard the first character that we replaced.
let _ = iter.next();
domain_buffer.extend(iter.map(|c| {
// Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
// not matter.
apply_ascii_deny_list_to_potentially_upper_case_ascii(
*c, deny_list,
)
}));
};
// If there were errors, we won't be trying to use this
// anyway later, so it's fine to put it here unconditionally.
already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label));
continue;
} else if fail_fast {
return (0, false, true);
}
// Else fall through to the complex path and rediscover error
// there.
false
} else {
true
}
} else {
false
};
for c in ascii.iter().map(|c| {
// Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
// not matter.
apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list)
}) {
if c == '\u{FFFD}' {
if fail_fast {
return (0, false, true);
}
had_errors = true;
}
domain_buffer.push(c);
}
if non_punycode_ascii_label {
if hyphens != Hyphens::Allow
&& check_hyphens(
&mut domain_buffer[current_label_start..],
hyphens == Hyphens::CheckFirstLast,
fail_fast,
&mut had_errors,
)
{
return (0, false, true);
}
already_punycode.push(if had_errors {
AlreadyAsciiLabel::Other
} else {
AlreadyAsciiLabel::MixedCaseAscii(label)
});
continue;
}
already_punycode.push(AlreadyAsciiLabel::Other);
let mut first_needs_combining_mark_check = ascii.is_empty();
let mut needs_contextj_check = !non_ascii.is_empty();
let mut mapping = self
.mapper
.map_normalize(non_ascii.chars())
.map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list));
loop {
let n = mapping.next();
match n {
None | Some('.') => {
if domain_buffer[current_label_start..]
.starts_with(&['x', 'n', '-', '-'])
{
let mut punycode_precondition_failed = false;
for c in domain_buffer[current_label_start + 4..].iter_mut() {
if !c.is_ascii() {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*c = '\u{FFFD}';
punycode_precondition_failed = true;
}
}
if let Some(last) = domain_buffer.last_mut() {
if *last == '-' {
// Either there's nothing after the "xn--" prefix
// and we got the last hyphen of "xn--", or there
// are no Punycode digits after the last delimiter
// which would result in Punycode decode outputting
// ASCII only.
if fail_fast {
return (0, false, true);
}
had_errors = true;
*last = '\u{FFFD}';
punycode_precondition_failed = true;
}
} else {
unreachable!();
}
// Reject excessively long input
if domain_buffer.len() - current_label_start - 4
> PUNYCODE_DECODE_MAX_INPUT_LENGTH
{
if fail_fast {
return (0, false, true);
}
had_errors = true;
domain_buffer[current_label_start
+ 4
+ PUNYCODE_DECODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
punycode_precondition_failed = true;
}
if !punycode_precondition_failed {
if let Ok(decode) = Decoder::default()
.decode::<char, InternalCaller>(
&domain_buffer[current_label_start + 4..],
)
{
first_needs_combining_mark_check = true;
needs_contextj_check = true;
// 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
// characters.
let mut label_buffer = SmallVec::<[char; 59]>::new();
label_buffer.extend(decode);
domain_buffer.truncate(current_label_start);
if self.after_punycode_decode(
domain_buffer,
current_label_start,
&label_buffer,
deny_list_deny_dot,
fail_fast,
&mut had_errors,
) {
return (0, false, true);
}
} else {
// Punycode failed
if fail_fast {
return (0, false, true);
}
had_errors = true;
domain_buffer[current_label_start] = '\u{FFFD}';
needs_contextj_check = false; // ASCII label
first_needs_combining_mark_check = false;
};
} else {
first_needs_combining_mark_check = false;
needs_contextj_check = false; // Non-ASCII already turned to U+FFFD.
}
}
if self.check_label(
hyphens,
&mut domain_buffer[current_label_start..],
fail_fast,
&mut had_errors,
first_needs_combining_mark_check,
needs_contextj_check,
) {
return (0, false, true);
}
if n.is_none() {
break;
}
domain_buffer.push('.');
current_label_start = domain_buffer.len();
first_needs_combining_mark_check = true;
needs_contextj_check = true;
already_punycode.push(AlreadyAsciiLabel::Other);
}
Some(c) => {
if c == '\u{FFFD}' {
if fail_fast {
return (0, false, true);
}
had_errors = true;
}
domain_buffer.push(c);
}
}
}
} else {
// Empty label
already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label));
}
}
let is_bidi = self.is_bidi(domain_buffer);
if is_bidi {
for label in domain_buffer.split_mut(|c| *c == '.') {
if let Some((first, tail)) = label.split_first_mut() {
let first_bc = self.bidi_class.get(*first);
if (FIRST_BC_MASK & bidi_class_to_mask(first_bc)) == 0 {
// Neither RTL label nor LTR label
if fail_fast {
return (0, false, true);
}
had_errors = true;
*first = '\u{FFFD}';
continue;
}
let is_ltr = first_bc == BidiClass::LeftToRight;
// Trim NSM
let mut middle = tail;
#[allow(clippy::while_let_loop)]
loop {
if let Some((last, prior)) = middle.split_last_mut() {
let last_bc = self.bidi_class.get(*last);
if last_bc == BidiClass::NonspacingMark {
middle = prior;
continue;
}
let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK };
if (bidi_class_to_mask(last_bc) & last_mask) == 0 {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*last = '\u{FFFD}';
}
if is_ltr {
for c in prior.iter_mut() {
let bc = self.bidi_class.get(*c);
if (bidi_class_to_mask(bc) & MIDDLE_LTR_MASK) == 0 {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*c = '\u{FFFD}';
}
}
} else {
let mut numeral_state = RtlNumeralState::Undecided;
for c in prior.iter_mut() {
let bc = self.bidi_class.get(*c);
if (bidi_class_to_mask(bc) & MIDDLE_RTL_MASK) == 0 {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*c = '\u{FFFD}';
} else {
match numeral_state {
RtlNumeralState::Undecided => {
if bc == BidiClass::EuropeanNumber {
numeral_state = RtlNumeralState::European;
} else if bc == BidiClass::ArabicNumber {
numeral_state = RtlNumeralState::Arabic;
}
}
RtlNumeralState::European => {
if bc == BidiClass::ArabicNumber {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*c = '\u{FFFD}';
}
}
RtlNumeralState::Arabic => {
if bc == BidiClass::EuropeanNumber {
if fail_fast {
return (0, false, true);
}
had_errors = true;
*c = '\u{FFFD}';
}
}
}
}
}
if (numeral_state == RtlNumeralState::European
&& last_bc == BidiClass::ArabicNumber)
|| (numeral_state == RtlNumeralState::Arabic
&& last_bc == BidiClass::EuropeanNumber)
{
if fail_fast {
return (0, false, true);
}
had_errors = true;
*last = '\u{FFFD}';
}
}
break;
} else {
// One-character label or label where
// everything after the first character
// is just non-spacing marks.
break;
}
}
}
}
}
(passthrough_up_to, is_bidi, had_errors)
}
#[inline(never)]
fn after_punycode_decode(
&self,
domain_buffer: &mut SmallVec<[char; 253]>,
current_label_start: usize,
label_buffer: &[char],
deny_list_deny_dot: u128,
fail_fast: bool,
had_errors: &mut bool,
) -> bool {
for c in self
.mapper
.normalize_validate(label_buffer.iter().copied())
.map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot))
{
if c == '\u{FFFD}' {
if fail_fast {
return true;
}
*had_errors = true;
}
domain_buffer.push(c);
}
let normalized = &mut domain_buffer[current_label_start..];
if let Err(()) =
normalized
.iter_mut()
.zip(label_buffer.iter())
.try_for_each(|(norm_c, decoded_c)| {
if *norm_c == *decoded_c {
Ok(())
} else {
// Mark the first difference
*norm_c = '\u{FFFD}';
Err(())
}
})
{
if fail_fast {
return true;
}
*had_errors = true;
}
false
}
#[inline(never)]
fn check_label(
&self,
hyphens: Hyphens,
mut_label: &mut [char],
fail_fast: bool,
had_errors: &mut bool,
first_needs_combining_mark_check: bool,
needs_contextj_check: bool,
) -> bool {
if hyphens != Hyphens::Allow
&& check_hyphens(
mut_label,
hyphens == Hyphens::CheckFirstLast,
fail_fast,
had_errors,
)
{
return true;
}
if first_needs_combining_mark_check {
if let Some(first) = mut_label.first_mut() {
if (general_category_to_mask(self.general_category.get(*first)) & MARK_MASK) != 0 {
if fail_fast {
return true;
}
*had_errors = true;
*first = '\u{FFFD}';
}
}
}
if needs_contextj_check {
// ContextJ
for i in 0..mut_label.len() {
let c = mut_label[i];
if !in_inclusive_range_char(c, '\u{200C}', '\u{200D}') {
continue;
}
let (head, joiner_and_tail) = mut_label.split_at_mut(i);
if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() {
if let Some(previous) = head.last() {
if self.canonical_combining_class.get(*previous)
== CanonicalCombiningClass::Virama
{
continue;
}
} else {
// No preceding character
if fail_fast {
return true;
}
*had_errors = true;
*joiner = '\u{FFFD}';
continue;
}
if c == '\u{200D}' {
// ZWJ only has the virama rule
if fail_fast {
return true;
}
*had_errors = true;
*joiner = '\u{FFFD}';
continue;
}
debug_assert_eq!(c, '\u{200C}');
if !self.has_appropriately_joining_char(
head.iter().rev().copied(),
LEFT_OR_DUAL_JOINING_MASK,
) || !self.has_appropriately_joining_char(
tail.iter().copied(),
RIGHT_OR_DUAL_JOINING_MASK,
) {
if fail_fast {
return true;
}
*had_errors = true;
*joiner = '\u{FFFD}';
}
} else {
debug_assert!(false);
}
}
}
if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH {
// Limit quadratic behavior
if fail_fast {
return true;
}
*had_errors = true;
mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
}
false
}
#[inline(always)]
fn has_appropriately_joining_char<I: Iterator<Item = char>>(
&self,
iter: I,
required_mask: u32,
) -> bool {
for c in iter {
let jt = self.joining_type.get(c);
if (joining_type_to_mask(jt) & required_mask) != 0 {
return true;
}
if jt == JoiningType::Transparent {
continue;
}
return false;
}
false
}
#[inline(always)]
fn is_bidi(&self, buffer: &[char]) -> bool {
for &c in buffer {
if c < '\u{0590}' {
// Below Hebrew
continue;
}
if in_inclusive_range_char(c, '\u{0900}', '\u{FB1C}') {
debug_assert_ne!(c, '\u{200F}'); // disallowed
continue;
}
if in_inclusive_range_char(c, '\u{1F000}', '\u{3FFFF}') {
continue;
}
if in_inclusive_range_char(c, '\u{FF00}', '\u{107FF}') {
continue;
}
if in_inclusive_range_char(c, '\u{11000}', '\u{1E7FF}') {
continue;
}
if (RTL_MASK & bidi_class_to_mask(self.bidi_class.get(c))) != 0 {
return true;
}
}
false
}
}
fn check_hyphens(
mut_label: &mut [char],
allow_third_fourth: bool,
fail_fast: bool,
had_errors: &mut bool,
) -> bool {
if let Some(first) = mut_label.first_mut() {
if *first == '-' {
if fail_fast {
return true;
}
*had_errors = true;
*first = '\u{FFFD}';
}
}
if let Some(last) = mut_label.last_mut() {
if *last == '-' {
if fail_fast {
return true;
}
*had_errors = true;
*last = '\u{FFFD}';
}
}
if allow_third_fourth {
return false;
}
if mut_label.len() >= 4 && mut_label[2] == '-' && mut_label[3] == '-' {
if fail_fast {
return true;
}
*had_errors = true;
mut_label[2] = '\u{FFFD}';
mut_label[3] = '\u{FFFD}';
}
false
}