// gen-regexp-special-case.cc

// Copyright 2020 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "irregexp/imported/special-case.h"

namespace v8 {

namespace internal {

// First code point of the surrogate range (U+D800). PrintSpecial skips
// everything from here through kSurrogateEnd, inclusive.
static const base::uc32 kSurrogateStart = 0xd800;

// Last code point of the surrogate range (U+DFFF).
static const base::uc32 kSurrogateEnd = 0xdfff;

// First code point outside the BMP (U+10000); used as the exclusive upper
// bound of the scan in PrintSpecial.
static const base::uc32 kNonBmpStart = 0x10000;

// The following code generates "src/regexp/special-case.cc".

// Writes C++ source text to |out| that defines three entities for |name|:
//   1. icu::UnicodeSet Build<name>(): rebuilds |set| one range at a time
//      (single-element ranges use the one-argument add()) and freezes it.
//   2. struct <name>Data: holds the built set as a const member.
//   3. RegExpCaseFolding::<name>(): returns that set through a
//      base::LazyInstance.
//
// Range bounds are written after a literal "0x" prefix, so the stream is
// switched to hexadecimal here instead of depending on the caller's stream
// state. |out| is left in hexadecimal mode on return.
void PrintSet(std::ofstream& out, const char* name,
              const icu::UnicodeSet& set) {
  out << std::hex;

  out << "icu::UnicodeSet Build" << name << "() {\n"
      << "  icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t i = 0; i < range_count; i++) {
    const auto range_start = set.getRangeStart(i);
    const auto range_end = set.getRangeEnd(i);
    if (range_start == range_end) {
      out << "  set.add(0x" << range_start << ");\n";
    } else {
      out << "  set.add(0x" << range_start << ", 0x" << range_end << ");\n";
    }
  }
  out << "  set.freeze();\n"
      << "  return set;\n"
      << "}\n\n";

  out << "struct " << name << "Data {\n"
      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
      << "  const icu::UnicodeSet set;\n"
      << "};\n\n";

  out << "//static\n"
      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
      << "      LAZY_INSTANCE_INITIALIZER;\n"
      << "  return set.Pointer()->set;\n"
      << "}\n\n";
}

// Computes two sets of BMP code points by comparing the case-insensitive
// closure produced by icu::UnicodeSet::closeOver with
// RegExpCaseFolding::Canonicalize, sanity-checks the result, and writes both
// sets to |out| via PrintSet under the names "IgnoreSet" and "SpecialAddSet".
void PrintSpecial(std::ofstream& out) {
  icu::UnicodeSet current;
  icu::UnicodeSet special_add;
  icu::UnicodeSet ignore;
  UErrorCode status = U_ZERO_ERROR;
  // NOTE(review): |upper| is never read below; only the construction status
  // is checked. Presumably kept as a check that ICU can build property-based
  // sets -- confirm before removing.
  icu::UnicodeSet upper("[\\p{Lu}]", status);
  CHECK(U_SUCCESS(status));

  // Iterate through all chars in BMP except surrogates.
  for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
    if (i >= static_cast<UChar32>(kSurrogateStart) &&
        i <= static_cast<UChar32>(kSurrogateEnd)) {
      continue;  // Ignore surrogate range
    }
    current.set(i, i);
    current.closeOver(USET_CASE_INSENSITIVE);

    // Check to see if all characters in the case-folding equivalence
    // class as defined by UnicodeSet::closeOver all map to the same
    // canonical value.
    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
    bool class_has_matching_canonical_char = false;
    bool class_has_non_matching_canonical_char = false;
    for (int32_t j = 0; j < current.getRangeCount(); j++) {
      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
           c++) {
        if (c == i) {
          continue;  // Only the *other* members of the class matter.
        }
        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
        if (canonical == other_canonical) {
          class_has_matching_canonical_char = true;
        } else {
          class_has_non_matching_canonical_char = true;
        }
      }
    }

    // i is recorded only if some other character in its equivalence class
    // has a different canonical value:
    //  - If at least one other character still shares i's canonical value,
    //    i needs special handling and goes into special_add.
    //  - If no other character shares i's canonical value, i can be
    //    ignored when adding alternatives for case-independent comparison,
    //    so it goes into ignore.
    if (class_has_non_matching_canonical_char) {
      if (class_has_matching_canonical_char) {
        special_add.add(i);
      } else {
        ignore.add(i);
      }
    }
  }

  // Verify that no Unicode equivalence class contains two non-trivial
  // JS equivalence classes. Every character in SpecialAddSet has the
  // same canonical value as every other non-IgnoreSet character in
  // its Unicode equivalence class. Therefore, if we call closeOver on
  // a set containing no IgnoreSet characters, the only characters
  // that must be removed from the result are in IgnoreSet. This fact
  // is used in CharacterRange::AddCaseEquivalents.
  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
    for (UChar32 c = special_add.getRangeStart(i);
         c <= special_add.getRangeEnd(i); c++) {
      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
      current.set(c, c);
      current.closeOver(USET_CASE_INSENSITIVE);
      current.removeAll(ignore);
      for (int32_t j = 0; j < current.getRangeCount(); j++) {
        for (UChar32 c2 = current.getRangeStart(j);
             c2 <= current.getRangeEnd(j); c2++) {
          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
        }
      }
    }
  }

  PrintSet(out, "IgnoreSet", ignore);
  PrintSet(out, "SpecialAddSet", special_add);
}

// Opens |header_filename| for writing and emits the complete generated
// source file: a license/provenance banner, the V8_INTL_SUPPORT guard and
// includes, the v8::internal namespace, and the set definitions produced by
// PrintSpecial. If the file cannot be opened, an error is printed to stderr
// and the process exits with status 1 instead of silently producing nothing.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Failed to open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // Code points are emitted after a literal "0x" prefix, so integers must
  // be formatted as hexadecimal for the rest of the output.
  out << std::hex;

  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
      << "// Use of this source code is governed by a BSD-style license that\n"
      << "// can be found in the LICENSE file.\n\n"
      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
      << "// The following functions are used to build UnicodeSets\n"
      << "// for special cases where the case-folding algorithm used by\n"
      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
      << "// Semantics: Canonicalize) step 3.\n\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/base/lazy-instance.h\"\n\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";

  PrintSpecial(out);

  out << "\n"
      << "}  // namespace internal\n"
      << "}  // namespace v8\n"
      << "#endif  // V8_INTL_SUPPORT\n";
}

}  // namespace internal

}  // namespace v8

// Entry point of the generator. Expects exactly one command-line argument:
// the path of the file to generate. Prints a usage message to stderr and
// returns 1 when the argument count is wrong; otherwise writes the file via
// v8::internal::WriteHeader and returns 0.
int main(int argc, const char** argv) {
  if (argc != 2) {
    // argv[0] may be null when argc is 0; avoid streaming a null pointer.
    const char* program_name =
        (argc > 0 && argv[0] != nullptr) ? argv[0] : "gen-regexp-special-case";
    std::cerr << "Usage: " << program_name << " <output filename>\n";
    return 1;
  }
  v8::internal::WriteHeader(argv[1]);
  return 0;
}