utf.rs - mozsearch

Enable keyboard shortcuts

/*

 * Copyright Stalwart Labs Ltd. See the COPYING

 * file at the top-level directory of this distribution.

 * Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or

 * https://www.apache.org/licenses/LICENSE-2.0> or the MIT license

 * <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your

 * option. This file may not be copied, modified, or distributed

 * except according to those terms.

*/

use std::char::{decode_utf16, REPLACEMENT_CHARACTER};

use crate::decoders::base64::BASE64_MAP;

struct Utf7DecoderState {

    utf16_bytes: Vec<u16>,

    pending_byte: Option<u8>,

    b64_bytes: u32,

fn add_utf16_bytes(state: &mut Utf7DecoderState, n_bytes: usize) {

    debug_assert!(n_bytes < std::mem::size_of::<u32>());

    for byte in state.b64_bytes.to_le_bytes()[0..n_bytes].iter() {

        if let Some(pending_byte) = state.pending_byte {

            state

                .utf16_bytes

                .push(u16::from_be_bytes([pending_byte, *byte]));

            state.pending_byte = None;

        } else {

            state.pending_byte = Some(*byte);

pub fn decoder_utf7(bytes: &[u8]) -> String {

    let mut result = String::with_capacity(bytes.len());

    let mut byte_count: u8 = 0;

    let mut in_b64 = false;

    let mut state = Utf7DecoderState {

        utf16_bytes: Vec::with_capacity(10),

        pending_byte: None,

        b64_bytes: 0,

};

    for byte in bytes {

        if in_b64 {

            let val = BASE64_MAP[byte_count as usize][*byte as usize];

            if val < 0x01ffffff {

                byte_count = (byte_count + 1) & 3;

                if byte_count == 1 {

                    state.b64_bytes = val;

                } else {

                    state.b64_bytes |= val;

                    if byte_count == 0 {

                        add_utf16_bytes(&mut state, 3);

            } else {

                match byte_count {

                    1 | 2 => {

                        add_utf16_bytes(&mut state, 1);

                    3 => {

                        add_utf16_bytes(&mut state, 2);

                    _ => (),

                if !state.utf16_bytes.is_empty() {

                    result.push_str(

                        decode_utf16(state.utf16_bytes.drain(..))

                            .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))

                            .collect::<String>()

                            .as_str(),

);

                } else if byte_count > 0 || state.pending_byte.is_some() {

                    result.push(REPLACEMENT_CHARACTER);

                } else {

                    result.push('+');

                    result.push(char::from(*byte));

                state.pending_byte = None;

                byte_count = 0;

                in_b64 = false;

        } else if byte == &b'+' {

            in_b64 = true;

        } else {

            result.push(char::from(*byte));

    result.shrink_to_fit();

    result

fn decoder_utf16_(bytes: &[u8], fnc: fn([u8; 2]) -> u16) -> String {

    if bytes.len() >= 2 {

        decode_utf16(bytes.chunks_exact(2).map(|c| fnc([c[0], c[1]])))

            .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))

            .collect::<String>()

    } else {

        "".to_string()

pub fn decoder_utf16_le(bytes: &[u8]) -> String {

    decoder_utf16_(bytes, u16::from_le_bytes)

pub fn decoder_utf16_be(bytes: &[u8]) -> String {

    decoder_utf16_(bytes, u16::from_be_bytes)

#[allow(clippy::type_complexity)]

pub fn decoder_utf16(bytes: &[u8]) -> String {

    // Read BOM

    let (bytes, fnc): (&[u8], fn([u8; 2]) -> u16) = match bytes.get(0..2) {

        Some([0xfe, 0xff]) => (bytes.get(2..).unwrap_or(&[]), u16::from_be_bytes),

        Some([0xff, 0xfe]) => (bytes.get(2..).unwrap_or(&[]), u16::from_le_bytes),

        _ => (bytes, u16::from_le_bytes),

};

    decoder_utf16_(bytes, fnc)

// Not currently used at the moment

pub fn decoder_utf8(bytes: &[u8]) -> String {

    String::from_utf8_lossy(bytes).into_owned()

#[cfg(test)]

mod tests {

    use crate::decoders::charsets::utf::decoder_utf7;

    #[test]

    fn decode_utf7() {

        let inputs = [

            ("Hello, World+ACE-", "Hello, World!"),

            ("Hi Mom -+Jjo--!", "Hi Mom -☺-!"),

            ("+ZeVnLIqe-", "日本語"),

            ("Item 3 is +AKM-1.", "Item 3 is £1."),

            ("Plus minus +- -+ +--", "Plus minus +- -+ +--"),

                "+APw-ber ihre mi+AN8-liche Lage+ADs- +ACI-wir",

                "über ihre mißliche Lage; \"wir",

),

                concat!(

                    "+ACI-The sayings of Confucius,+ACI- James R. Ware, trans.  +U/BTFw-:\n",

                    "+ZYeB9FH6ckh5Pg-, 1980.\n",

                    "+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-:  +Ti1XC2b4Xpc-, 1990."

),

                concat!(

                    "\"The sayings of Confucius,\" James R. Ware, trans.  台北:\n",

                    "文致出版社, 1980.\n",

                    "四書五經, 宋元人注, 北京:  中國書店, 1990."

),

),

];

        for input in inputs {

            assert_eq!(decoder_utf7(input.0.as_bytes()), input.1);