builder.rs - mozsearch

// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT

// file at the top-level directory of this distribution.

//

// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or

// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license

// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your

// option. This file may not be copied, modified, or distributed

// except according to those terms.

/// Functions to compile human-readable patterns into a mapped_hyph

/// flattened representation of the hyphenation state machine.

use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind};

use std::collections::HashMap;

use std::convert::TryInto;

use std::hash::{Hash,Hasher};

// Wrap a HashMap so that we can implement the Hash trait.

#[derive(PartialEq, Eq, Clone)]

struct TransitionMap (HashMap<u8,i32>);

impl TransitionMap {

    fn new() -> TransitionMap {

        TransitionMap(HashMap::<u8,i32>::new())

impl Hash for TransitionMap {

    fn hash<H: Hasher>(&self, state: &mut H) {

        // We only look at the values here; that's likely to be enough

        // for a reasonable hash.

        let mut transitions: Vec<&i32> = self.0.values().collect();

        transitions.sort();

        for t in transitions {

            t.hash(state);

#[derive(PartialEq, Eq, Hash, Clone)]

struct State {

    match_string: Option<Vec<u8>>,

    #[allow(dead_code)]

    repl_string: Option<Vec<u8>>,

    #[allow(dead_code)]

    repl_index: i32,

    #[allow(dead_code)]

    repl_cut: i32,

    fallback_state: i32,

    transitions: TransitionMap,

impl State {

    fn new() -> State {

        State {

            match_string: None,

            repl_string: None,

            repl_index: -1,

            repl_cut: -1,

            fallback_state: -1,

            transitions: TransitionMap::new(),

/// Structures returned by the read_dic_file() function;

/// array of these can then be passed to write_hyf_file()

/// to create the flattened output.

struct LevelBuilder {

    states: Vec<State>,

    str_to_state: HashMap<Vec<u8>,i32>,

    encoding: Option<String>,

    nohyphen: Option<String>,

    lh_min: u8,

    rh_min: u8,

    clh_min: u8,

    crh_min: u8,

impl LevelBuilder {

    fn new() -> LevelBuilder {

        let mut result = LevelBuilder {

            states: Vec::<State>::new(),

            str_to_state: HashMap::<Vec<u8>,i32>::new(),

            encoding: None,

            nohyphen: None,

            lh_min: 0,

            rh_min: 0,

            clh_min: 0,

            crh_min: 0,

};

        // Initialize the builder with an empty start state.

        result.str_to_state.insert(vec![], 0);

        result.states.push(State::new());

        result

    fn find_state_number_for(&mut self, text: &[u8]) -> i32 {

        let count = self.states.len() as i32;

        let index = *self.str_to_state.entry(text.to_vec()).or_insert(count);

        if index == count {

            self.states.push(State::new());

        index

    fn add_pattern(&mut self, pattern: &str) {

        let mut bytes = pattern.as_bytes();

        let mut text = Vec::<u8>::with_capacity(bytes.len());

        let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1);

        let mut repl_str = None;

        let mut repl_index = 0;

        let mut repl_cut = 0;

        // Check for replacement rule (non-standard hyphenation spelling change).

        if let Some(slash) = bytes.iter().position(|x| *x == b'/') {

            let parts = bytes.split_at(slash);

            bytes = parts.0;

            let mut it = parts.1[1 ..].split(|x| *x == b',');

            if let Some(repl) = it.next() {

                repl_str = Some(repl.to_vec());

            if let Some(num) = it.next() {

                repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1;

            if let Some(num) = it.next() {

                repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap();

        // Separate the input pattern into parallel arrays of text (bytes) and digits.

        let mut got_digit = false;

        for byte in bytes {

            if *byte <= b'9' && *byte >= b'0' {

                if got_digit {

                    warn!("invalid pattern \"{}\": consecutive digits", pattern);

                    return;

                digits.push(*byte);

                got_digit = true;

            } else {

                text.push(*byte);

                if got_digit {

                    got_digit = false;

                } else {

                    digits.push(b'0');

        if !got_digit {

            digits.push(b'0');

        if repl_str.is_none() {

            // Optimize away leading zeroes from the digits array.

            while !digits.is_empty() && digits[0] == b'0' {

                digits.remove(0);

        } else {

            // Convert repl_index and repl_cut from Unicode char to byte indexing.

            let start = if text[0] == b'.' { 1 } else { 0 };

            if start == 1 {

                if digits[0] != b'0' {

                    warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern);

                    return;

                digits.remove(0);

            let word = std::str::from_utf8(&text[start..]).unwrap();

            let mut chars: Vec<_> = word.char_indices().collect();

            chars.push((word.len(), '.'));

            repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32;

            repl_index = chars[repl_index as usize].0 as i32;

        // Create the new state, or add pattern into an existing state

        // (which should not already have a match_string).

        let mut state_num = self.find_state_number_for(&text);

        let mut state = &mut self.states[state_num as usize];

        if state.match_string.is_some() {

            warn!("duplicate pattern \"{}\" discarded", pattern);

            return;

        if !digits.is_empty() {

            state.match_string = Some(digits);

        if repl_str.is_some() {

            state.repl_string = repl_str;

            state.repl_index = repl_index;

            state.repl_cut = repl_cut;

        // Set up prefix transitions, inserting additional states as needed.

        while !text.is_empty() {

            let last_state = state_num;

            let ch = *text.last().unwrap();

            text.truncate(text.len() - 1);

            state_num = self.find_state_number_for(&text);

            if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) {

                assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern);

                break;

    fn merge_duplicate_states(&mut self) {

        // We loop here because when we eliminate a duplicate, and update the transitons

        // that referenced it, we may thereby create new duplicates that another pass

        // will find and compress further.

        loop {

            let orig_len = self.states.len();

            // Used to map State records to the (first) index at which they occur.

            let mut state_to_index = HashMap::<&State,i32>::new();

            // Mapping of old->new state indexes, and whether each old state is

            // a duplicate that should be dropped.

            let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len);

            let mut next_new_index: i32 = 0;

            for index in 0 .. self.states.len() {

                // Find existing index for this state, or allocate the next new index to it.

                let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index);

                // Record the mapping, and whether the state was a duplicate.

                mappings.push((new_index, new_index != next_new_index));

                // If we used next_new_index for this state, increment it.

                if new_index == next_new_index {

                    next_new_index += 1;

            // If we didn't find any duplicates, next_new_index will have kept pace with

            // index, so we know we're finished.

            if next_new_index as usize == self.states.len() {

                break;

            // Iterate over all the states, either deleting them or updating indexes

            // according to the mapping we created; then repeat the search.

            for index in (0 .. self.states.len()).rev() {

                if mappings[index].1 {

                    self.states.remove(index);

                } else {

                    let state = &mut self.states[index];

                    if state.fallback_state != -1 {

                        state.fallback_state = mappings[state.fallback_state as usize].0;

                    for t in state.transitions.0.iter_mut() {

                        *t.1 = mappings[*t.1 as usize].0;

    fn flatten(&self) -> Vec<u8> {

        // Calculate total space needed for state data, and build the state_to_offset table.

        let mut state_data_size = 0;

        let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len());

        for state in &self.states {

            state_to_offset.push(state_data_size);

            state_data_size += if state.repl_string.is_some() { 12 } else { 8 };

            state_data_size += state.transitions.0.len() * 4;

        // Helper to map a state index to its offset in the final data block.

        let get_state_offset_for = |state_index: i32| -> u32 {

            if state_index < 0 {

                return super::INVALID_STATE_OFFSET;

            state_to_offset[state_index as usize] as u32

};

        // Helper to map a byte string to its offset in the final data block, and

        // store the bytes into string_data unless using an already-existing string.

        let mut string_to_offset = HashMap::<Vec<u8>,usize>::new();

        let mut string_data = Vec::<u8>::new();

        let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 {

            if bytes.is_none() {

                return super::INVALID_STRING_OFFSET;

            assert!(bytes.as_ref().unwrap().len() < 256);

            let new_offset = string_data.len();

            let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset);

            if offset == new_offset {

                string_data.push(bytes.as_ref().unwrap().len() as u8);

                string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref());

            offset.try_into().unwrap()

};

        // Handle nohyphen string list if present, converting comma separators to NULs

        // and trimming any surplus whitespace.

        let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET;

        let mut nohyphen_count: u16 = 0;

        if self.nohyphen.is_some() {

            let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect();

            nohyphen_count = nohyphen_strings.len().try_into().unwrap();

            nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec()));

        let mut state_data = Vec::<u8>::with_capacity(state_data_size);

        for state in &self.states {

            state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes());

            state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes());

            state_data.push(state.transitions.0.len() as u8);

            // Determine whether to use an extended state record, and if so add the

            // replacement string and index fields.

            if state.repl_string.is_none() {

                state_data.push(0);

            } else {

                state_data.push(1);

                state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes());

                state_data.push(state.repl_index as u8);

                state_data.push(state.repl_cut as u8);

            // Collect transitions into an array so we can sort them.

            let mut transitions = vec![];

            for (key, value) in state.transitions.0.iter() {

                transitions.push((*key, get_state_offset_for(*value)))

            transitions.sort();

            for t in transitions {

                // New state offset is stored as a 24-bit value, so we do this manually.

                state_data.push((t.1 & 0xff) as u8);

                state_data.push(((t.1 >> 8) & 0xff) as u8);

                state_data.push(((t.1 >> 16) & 0xff) as u8);

                state_data.push(t.0);

        assert_eq!(state_data.len(), state_data_size);

        // Pad string data to a 4-byte boundary

        while string_data.len() & 3 != 0 {

            string_data.push(0);

        let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len();

        let mut result = Vec::<u8>::with_capacity(total_size);

        let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32;

        let string_data_base: u32 = state_data_base + state_data_size as u32;

        result.extend(&state_data_base.to_le_bytes());

        result.extend(&string_data_base.to_le_bytes());

        result.extend(&nohyphen_string_offset.to_le_bytes());

        result.extend(&nohyphen_count.to_le_bytes());

        result.push(self.lh_min);

        result.push(self.rh_min);

        result.push(self.clh_min);

        result.push(self.crh_min);

        result.extend(state_data.iter());

        result.extend(string_data.iter());

        assert_eq!(result.len(), total_size);

        result

/// Read a libhyphen-style pattern file and create the corresponding state

/// machine transitions, etc.

/// The returned Vec can be passed to write_hyf_file() to generate a flattened

/// representation of the state machine in mapped_hyph's binary format.

fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> {

    let reader = BufReader::new(dic_file);

    let mut builders = Vec::<LevelBuilder>::new();

    builders.push(LevelBuilder::new());

    let mut builder = &mut builders[0];

    for (index, line) in reader.lines().enumerate() {

        let mut trimmed = line.unwrap().trim().to_string();

        // Strip comments.

        if let Some(i) = trimmed.find('%') {

            trimmed = trimmed[..i].trim().to_string();

        // Ignore empty lines.

        if trimmed.is_empty() {

            continue;

        // Uppercase indicates keyword rather than pattern.

        if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' {

            // First line is encoding; we only support UTF-8.

            if builder.encoding.is_none() {

                if trimmed != "UTF-8" {

                    return Err("Only UTF-8 patterns are accepted!");

};

                builder.encoding = Some(trimmed);

                continue;

            // Check for valid keyword-value pairs.

            if trimmed.contains(' ') {

                let parts: Vec<&str> = trimmed.split(' ').collect();

                if parts.len() != 2 {

                    warn!("unrecognized keyword/values: {}", trimmed);

                    continue;

                let keyword = parts[0];

                let value = parts[1];

                match keyword {

                    "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(),

                    "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(),

                    "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(),

                    "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(),

                    "NOHYPHEN" => builder.nohyphen = Some(trimmed),

                    _ => warn!("unknown keyword: {}", trimmed),

                continue;

            // Start a new hyphenation level?

            if trimmed == "NEXTLEVEL" {

                builders.push(LevelBuilder::new());

                builder = builders.last_mut().unwrap();

                continue;

            warn!("unknown keyword: {}", trimmed);

            continue;

        // Patterns should always be provided in lowercase; complain if not, and discard

        // the bad pattern.

        if trimmed != trimmed.to_lowercase() {

            warn!("pattern \"{}\" not lowercased at line {}", trimmed, index);

            continue;

        builder.add_pattern(&trimmed);

    // Create default first (compound-word) level if only one level was provided.

    // (Maybe this should be optional? Currently just copying libhyphen behavior.)

    if builders.len() == 1 {

        let (lh_min, rh_min, clh_min, crh_min) =

            (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min);

        builders.insert(0, LevelBuilder::new());

        builder = builders.first_mut().unwrap();

        builder.add_pattern("1-1");

        builder.add_pattern("1'1");

        builder.add_pattern("1\u{2013}1"); // en-dash

        builder.add_pattern("1\u{2019}1"); // curly apostrophe

        builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string());

        builder.lh_min = lh_min;

        builder.rh_min = rh_min;

        builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 };

        builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 };

    // Put in fallback states in each builder.

    for builder in &mut builders {

        for (key, state_index) in builder.str_to_state.iter() {

            if key.is_empty() {

                continue;

            let mut fallback_key = key.clone();

            while !fallback_key.is_empty() {

                fallback_key.remove(0);

                if builder.str_to_state.contains_key(&fallback_key) {

                    break;

            builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key];

    if compress {

        // Merge duplicate states to reduce size.

        for builder in &mut builders {

            builder.merge_duplicate_states();

    Ok(builders)

/// Write out the state machines representing a set of hyphenation rules

/// to the given output stream.

fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {

    if levels.is_empty() {

        return Err(Error::from(ErrorKind::InvalidData));

    let mut flattened = vec![];

    for level in levels {

        flattened.push(level.flatten());

    // Write file header: magic number, count of levels.

    hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?;

    let level_count: u32 = flattened.len() as u32;

    hyf_file.write_all(&level_count.to_le_bytes())?;

    // Write array of offsets to each level. First level will begin immediately

    // after the array of offsets.

    let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count;

    for flat in &flattened {

        hyf_file.write_all(&offset.to_le_bytes())?;

        offset += flat.len() as u32;

    // Write the flattened data for each level.

    for flat in &flattened {

        hyf_file.write_all(&flat)?;

    Ok(())

/// The public API to the compilation process: reads `dic_file` and writes compiled tables

/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the

/// size of the output is performed.

pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> {

    match read_dic_file(dic_file, compress) {

        Ok(dic) => write_hyf_file(hyf_file, dic),

        Err(e) => {

            warn!("parse error: {}", e);

            return Err(Error::from(ErrorKind::InvalidData))

Revision control

Copy as Markdown

Other Tools