update_unicode.py - mozsearch

comm-central/third_party/rust/jsparagus/update_unicode.py

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

#!/usr/bin/env python3

""" Generate Unicode data table for parser

"""

import argparse

import io

import re

import sys

from contextlib import closing

from itertools import tee, zip_longest

from urllib.request import urlopen

from zipfile import ZipFile

# These are also part of IdentifierPart §11.6 Names and Keywords
compatibility_identifier_part = [
    ord('\N{ZERO WIDTH NON-JOINER}'),
    ord('\N{ZERO WIDTH JOINER}'),
]

# Bit flags stored per code point in the generated character-info table.
FLAG_ID_START = 1 << 0
FLAG_ID_CONTINUE = 1 << 1

def download_derived_core_properties(version):
    """Fetch UCD.zip for `version` and return DerivedCoreProperties.txt as text.

    `version` is either a Unicode version string, which selects the
    `<version>/ucd` directory, or the literal 'UNIDATA', which selects the
    `UNIDATA` directory directly.  The whole archive is read into memory and
    the member is decoded with the default codec of `bytes.decode()`.
    """
    baseurl = 'https://unicode.org/Public'
    # Only numbered versions keep their archive under an extra 'ucd' level.
    directory = version if version == 'UNIDATA' else '{}/ucd'.format(version)
    request_url = '{}/{}/UCD.zip'.format(baseurl, directory)

    # ZipFile needs a seekable file object, so buffer the response first.
    with closing(urlopen(request_url)) as response:
        archive_bytes = io.BytesIO(response.read())

    with ZipFile(archive_bytes) as archive:
        return archive.read('DerivedCoreProperties.txt').decode()

def read_derived_core_properties(derived_core_properties):
    """Yield `(code_point, property_name)` pairs from DerivedCoreProperties.txt.

    `derived_core_properties` is the text content of the file.  Each data line
    has the form `XXXX ; Prop # comment` or `XXXX..YYYY ; Prop # comment`;
    a range is expanded into one pair per code point (both ends inclusive).

    Blank lines, whitespace-only lines and comment lines are skipped, and
    trailing carriage returns are tolerated so CRLF content parses too.
    A data line without a `;` separator raises IndexError.
    """
    for raw_line in derived_core_properties.split('\n'):
        # Drop the trailing comment first; what is left is empty for blank
        # lines, comment-only lines and lines holding just '\r' or spaces.
        data = raw_line.split('#', 1)[0].strip()
        if not data:
            continue

        fields = data.split(';')
        char_range = fields[0].strip()
        char_property = fields[1].strip()

        first, separator, last = char_range.partition('..')
        start = int(first, 16)
        # A single code point is treated as a one-element range.
        end = int(last, 16) if separator else start
        for char in range(start, end + 1):
            yield (char, char_property)

def process_derived_core_properties(derived_core_properties):
    """Parse DerivedCoreProperties.txt content.

    Returns a tuple `(version, id_start, id_continue)` where `version` is the
    version string taken from the `# DerivedCoreProperties-<version>.txt`
    header at the very start of the text, and `id_start` / `id_continue` are
    sets of integer code points carrying the `ID_Start` / `ID_Continue`
    property.

    Raises AssertionError when the text does not start with that header.
    """
    id_start = set()
    id_continue = set()

    # Raw string: '\.' is not a valid escape in a normal string literal.
    # The dot before 'txt' is escaped so it only matches a literal '.'.
    m = re.match(r'# DerivedCoreProperties-([0-9.]+)\.txt', derived_core_properties)
    assert m, 'content does not start with a DerivedCoreProperties version header'
    version = m.group(1)

    for (char, prop) in read_derived_core_properties(derived_core_properties):
        if prop == 'ID_Start':
            id_start.add(char)
        elif prop == 'ID_Continue':
            id_continue.add(char)

    return (version, id_start, id_continue)

def int_ranges(ints):
    """Yield `(first, last)` inclusive ranges of consecutive integers.

    `ints` is any iterable of integers, in any order.  Duplicates are ignored
    so the yielded ranges never overlap, and an empty iterable yields nothing.
    Ranges are produced in ascending order.
    """
    values = sorted(set(ints))
    if not values:
        # Nothing to group; the generator simply ends.
        return

    start = previous = values[0]
    for value in values[1:]:
        if value != previous + 1:
            # A gap closes the current run and opens a new one.
            yield (start, previous)
            start = value
        previous = value
    yield (start, previous)

def process_unicode_data(derived_core_properties):
    """Build the identifier lookup data from DerivedCoreProperties.txt content.

    Returns a 7-tuple:

    * `version` -- version string reported by process_derived_core_properties
    * `table` -- list of distinct flag values; entry 0 is the "no flags" dummy
    * `index` -- list with one entry per BMP code point (0..0xFFFF) giving the
      position in `table` of that code point's flags
    * `id_start`, `id_continue` -- the full code point sets
    * `non_bmp_id_start_set`, `non_bmp_id_continue_set` -- dicts used as sets
      (code point -> 1) holding the members above U+FFFF
    """
    MAX_BMP = 0xffff

    dummy = 0
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)
    non_bmp_id_start_set = {}
    non_bmp_id_continue_set = {}

    (version, id_start, id_continue) = process_derived_core_properties(derived_core_properties)
    codes = id_start.union(id_continue)

    # NOTE(review): only code points already in ID_Start/ID_Continue are
    # visited, so the compatibility_identifier_part test below cannot add a
    # code point that is absent from both sets -- confirm whether those
    # characters should be added to `codes`.
    for code in codes:
        if code > MAX_BMP:
            # Non-BMP code points are not indexed; they are only recorded.
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_continue_set[code] = 1
            continue

        flags = 0
        if code in id_start:
            flags |= FLAG_ID_START
        if code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_ID_CONTINUE

        # Each distinct flag combination is stored once and shared.
        i = cache.get(flags)
        if i is None:
            assert flags not in table
            cache[flags] = i = len(table)
            table.append(flags)
        index[code] = i

    return (
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
    )

def getsize(data):
    """Return the byte width (1, 2 or 4) needed to hold the largest item.

    `data` is a non-empty collection of integers; its maximum must be below
    2**32 (checked with an assertion).
    """
    largest = max(data)
    assert largest < 2**32

    # Try the widths from narrowest to widest and take the first that fits.
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4

def splitbins(t):
    """Split table `t` into a two-level lookup: t -> (t1, t2, shift).

    `t` is a sequence of ints.  The result satisfies, for every valid i,

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    with mask == 2**shift - 1.  Every shift from 0 up to the largest one
    that leaves a non-zero high part of the last index is tried, and the
    first one with the smallest combined size of t1 and t2 (as measured by
    getsize) is kept.  A summary is printed to stderr and the chosen split
    is verified exhaustively before it is returned.
    """
    # Number of times the last valid index can be halved before reaching 1.
    last_index = len(t) - 1
    maxshift = last_index.bit_length() - 1 if last_index > 0 else 0

    t = tuple(t)    # tuple slices are hashable, so they can key a dict
    smallest = sys.maxsize  # best total size seen so far
    for candidate in range(maxshift + 1):
        chunk_len = 2**candidate
        offsets = {}    # chunk -> position of its first copy in `low`
        high = []
        low = []
        for pos in range(0, len(t), chunk_len):
            chunk = t[pos:pos + chunk_len]
            offset = offsets.get(chunk)
            if offset is None:
                # First time this chunk is seen: store it once.
                offset = offsets[chunk] = len(low)
                low.extend(chunk)
            high.append(offset >> candidate)

        # Strict comparison keeps the earliest shift on ties.
        cost = len(high) * getsize(high) + len(low) * getsize(low)
        if cost < smallest:
            best = high, low, candidate
            smallest = cost

    t1, t2, shift = best
    print("Best:", end=' ', file=sys.stderr)
    print("%d+%d bins at shift %d; %d bytes" % (
        len(t1), len(t2), shift, smallest), file=sys.stderr)
    print("Size of original table:", len(t) * getsize(t),
          "bytes", file=sys.stderr)

    # Check every position of the original table against the split form.
    mask = 2**shift - 1
    for i, expected in enumerate(t):
        assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

def write_table(f, name, type, table, formatter, per_line):
    """Write `table` to `f` as a Rust `pub const` slice declaration.

    `name` and `type` are the Rust constant name and element type, and
    `formatter` turns one item into its Rust source text.  Rows are indented
    by four spaces and a row is ended after `per_line` items.
    """
    emit = f.write
    emit(f"\n\npub const {name}: &'static [{type}] = &[\n\n")

    filled = 0  # number of items already written on the current row
    for entry in table:
        if not filled:
            emit('    ')
        emit(f'{formatter(entry)},')
        filled += 1
        if filled == per_line:
            emit("\n\n")
            filled = 0

    emit("\n];\n\n")

def write_func(f, name, group_set):
    """Write a Rust predicate `pub fn <name>(c: char) -> bool` to `f`.

    `group_set` holds integer code points (a dict keyed by code point, or any
    other iterable of ints).  The emitted function returns true when `c` falls
    in one of the consecutive ranges of `group_set`, and false otherwise.
    An empty `group_set` emits a function that always returns false.
    """
    f.write(f"\n\npub fn {name}(c: char) -> bool {{")

    # Skip range computation entirely for an empty set.
    ranges = int_ranges(group_set) if group_set else ()
    for (from_code, to_code) in ranges:
        f.write(
            f"\n\n    if c >= '\\u{{{from_code:X}}}' && c <= '\\u{{{to_code:X}}}' {{"
            "\n\n        return true;"
            "\n\n    }"
        )

    # The final '}' closes the Rust function body.
    f.write("\n\n    false\n\n}\n\n")

def make_unicode_file(version, table, index,
                      id_start, id_continue,
                      non_bmp_id_start_set, non_bmp_id_continue_set):
    """Write the generated Rust module `crates/parser/src/unicode_data.rs`.

    The arguments are the values returned by process_unicode_data.  The file
    path is relative to the current working directory.  The output contains:

    * the flag constants and the `CharInfo` struct with its accessors,
    * `CHAR_INFO_TABLE` plus the two-level `INDEX1` / `INDEX2` tables built by
      splitbins, and the `char_info` lookup function using them,
    * `IS_ID_START_TABLE` / `IS_ID_CONTINUE_TABLE` for code points 0..127,
    * `is_id_start_non_bmp` / `is_id_continue_non_bmp` range predicates.
    """
    index1, index2, shift = splitbins(index)

    # Verify that the split tables give the same flags for every indexed
    # code point (positions of `index`, not its values).
    for char, entry in enumerate(index):
        test = table[entry]

        idx = index1[char >> shift]
        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

        assert test == table[idx]

    with open('crates/parser/src/unicode_data.rs', 'w') as f:
        f.write(f"""\

// Generated by update_unicode.py DO NOT MODIFY

// Unicode version: {version}

""")

        f.write(f"""

const FLAG_ID_START: u8 = {FLAG_ID_START};

const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};

""")

        # Every Rust item opened here is closed with its own '}'.
        f.write("""

pub struct CharInfo {

    flags: u8,

}

impl CharInfo {

    pub fn is_id_start(&self) -> bool {

        self.flags & FLAG_ID_START != 0

    }

    pub fn is_id_continue(&self) -> bool {

        self.flags & FLAG_ID_CONTINUE != 0

    }

}

""")

        write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
                    lambda flag: f"CharInfo {{ flags: {flag} }}",
                    1)
        write_table(f, 'INDEX1', 'u8', index1,
                    lambda i: f'{i:4d}', 8)
        write_table(f, 'INDEX2', 'u8', index2,
                    lambda i: f'{i:4d}', 8)

        f.write(f"""

const SHIFT: usize = {shift};

""")

        f.write("""

pub fn char_info(c: char) -> &'static CharInfo {

    let code = c as usize;

    let index = INDEX1[code >> SHIFT] as usize;

    let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;

    &CHAR_INFO_TABLE[index]

}

""")

        def format_bool(b):
            # 'true ' is padded to the width of 'false' to align columns.
            if b:
                return 'true '
            else:
                return 'false'

        write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_start), 8)
        write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_continue), 8)

        write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
        write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)

# Command-line entry: download the requested Unicode data, process it and
# write the generated Rust file.
parser = argparse.ArgumentParser(description='Generate Unicode data table for parser')
# VERSION is positional, so the examples show the bare value to pass.
parser.add_argument('VERSION',
                    help='Unicode version number to download from '
                    '<https://unicode.org/Public>. The number must match '
                    'a published Unicode version, e.g. use '
                    '"8.0.0" to download Unicode 8 files. Alternatively use '
                    '"UNIDATA" to download the latest published version.')
# NOTE(review): PATH_TO_JSPARAGUS is required but never read below;
# make_unicode_file is called without it -- confirm the intended use.
parser.add_argument('PATH_TO_JSPARAGUS',
                    help='Path to jsparagus')
args = parser.parse_args()

derived_core_properties = download_derived_core_properties(args.VERSION)

(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
) = process_unicode_data(derived_core_properties)

make_unicode_file(
    version,
    table,
    index,
    id_start,
    id_continue,
    non_bmp_id_start_set,
    non_bmp_id_continue_set,
)