RegExpAPI.cpp - mozsearch

mozilla-central/js/src/irregexp/RegExpAPI.cpp (file symbol)

Enable keyboard shortcuts

Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-

 * vim: set ts=8 sts=2 et sw=2 tw=80:

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Copyright 2020 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

#include "irregexp/RegExpAPI.h"

#include "mozilla/ArrayUtils.h"

#include "mozilla/Casting.h"

#include "frontend/FrontendContext.h"  // AutoReportFrontendContext

#include "frontend/TokenStream.h"

#include "gc/GC.h"

#include "gc/Zone.h"

#include "irregexp/imported/regexp-ast.h"

#include "irregexp/imported/regexp-bytecode-generator.h"

#include "irregexp/imported/regexp-compiler.h"

#include "irregexp/imported/regexp-interpreter.h"

#include "irregexp/imported/regexp-macro-assembler-arch.h"

#include "irregexp/imported/regexp-macro-assembler-tracer.h"

#include "irregexp/imported/regexp-parser.h"

#include "irregexp/imported/regexp-stack.h"

#include "irregexp/imported/regexp.h"

#include "irregexp/RegExpNativeMacroAssembler.h"

#include "irregexp/RegExpShim.h"

#include "jit/JitCommon.h"

#include "js/ColumnNumber.h"  // JS::ColumnNumberOneOrigin, JS::ColumnNumberOffset

#include "js/friend/ErrorMessages.h"  // JSMSG_*

#include "js/friend/StackLimits.h"    // js::ReportOverRecursed

#include "util/StringBuilder.h"

#include "vm/MatchPairs.h"

#include "vm/PlainObject.h"

#include "vm/RegExpShared.h"

namespace js {

namespace irregexp {

using mozilla::AssertedCast;

using mozilla::Maybe;

using mozilla::Nothing;

using mozilla::PointerRangeSize;

using mozilla::Some;

using frontend::DummyTokenStream;

using frontend::TokenStreamAnyChars;

using v8::internal::DisallowGarbageCollection;

using v8::internal::HandleScope;

using v8::internal::InputOutputData;

using v8::internal::IrregexpInterpreter;

using v8::internal::NativeRegExpMacroAssembler;

using v8::internal::RegExpBytecodeGenerator;

using v8::internal::RegExpCapture;

using v8::internal::RegExpCompileData;

using v8::internal::RegExpCompiler;

using v8::internal::RegExpError;

using v8::internal::RegExpMacroAssembler;

using v8::internal::RegExpMacroAssemblerTracer;

using v8::internal::RegExpNode;

using v8::internal::RegExpParser;

using v8::internal::SMRegExpMacroAssembler;

using v8::internal::Zone;

using v8::internal::ZoneVector;

using V8HandleString = v8::internal::Handle<v8::internal::String>;

using V8HandleRegExp = v8::internal::Handle<v8::internal::IrRegExpData>;

using namespace v8::internal::regexp_compiler_constants;

static uint32_t ErrorNumber(RegExpError err) {

  switch (err) {

    case RegExpError::kNone:

      return JSMSG_NOT_AN_ERROR;

    case RegExpError::kStackOverflow:

      return JSMSG_OVER_RECURSED;

    case RegExpError::kAnalysisStackOverflow:

      return JSMSG_OVER_RECURSED;

    case RegExpError::kTooLarge:

      return JSMSG_TOO_MANY_PARENS;

    case RegExpError::kUnterminatedGroup:

      return JSMSG_MISSING_PAREN;

    case RegExpError::kUnmatchedParen:

      return JSMSG_UNMATCHED_RIGHT_PAREN;

    case RegExpError::kEscapeAtEndOfPattern:

      return JSMSG_ESCAPE_AT_END_OF_REGEXP;

    case RegExpError::kInvalidPropertyName:

      return JSMSG_INVALID_PROPERTY_NAME;

    case RegExpError::kInvalidEscape:

      return JSMSG_INVALID_IDENTITY_ESCAPE;

    case RegExpError::kInvalidDecimalEscape:

      return JSMSG_INVALID_DECIMAL_ESCAPE;

    case RegExpError::kInvalidUnicodeEscape:

      return JSMSG_INVALID_UNICODE_ESCAPE;

    case RegExpError::kNothingToRepeat:

      return JSMSG_NOTHING_TO_REPEAT;

    case RegExpError::kLoneQuantifierBrackets:

      // Note: V8 reports the same error for both ']' and '}'.

      return JSMSG_RAW_BRACKET_IN_REGEXP;

    case RegExpError::kRangeOutOfOrder:

      return JSMSG_NUMBERS_OUT_OF_ORDER;

    case RegExpError::kIncompleteQuantifier:

      return JSMSG_INCOMPLETE_QUANTIFIER;

    case RegExpError::kInvalidQuantifier:

      return JSMSG_INVALID_QUANTIFIER;

    case RegExpError::kInvalidGroup:

      return JSMSG_INVALID_GROUP;

    case RegExpError::kRepeatedFlag:

      return JSMSG_REPEATED_FLAG;

    case RegExpError::kInvalidFlagGroup:

      return JSMSG_INVALID_FLAG_GROUP;

    case RegExpError::kMultipleFlagDashes:

      return JSMSG_MULTIPLE_FLAG_DASHES;

    case RegExpError::kNotLinear:

      // V8 has an experimental non-backtracking engine. We do not

      // support it yet.

      MOZ_CRASH("Non-backtracking execution not supported");

    case RegExpError::kTooManyCaptures:

      return JSMSG_TOO_MANY_PARENS;

    case RegExpError::kInvalidCaptureGroupName:

      return JSMSG_INVALID_CAPTURE_NAME;

    case RegExpError::kDuplicateCaptureGroupName:

      return JSMSG_DUPLICATE_CAPTURE_NAME;

    case RegExpError::kInvalidNamedReference:

      return JSMSG_INVALID_NAMED_REF;

    case RegExpError::kInvalidNamedCaptureReference:

      return JSMSG_INVALID_NAMED_CAPTURE_REF;

    case RegExpError::kInvalidClassPropertyName:

      return JSMSG_INVALID_CLASS_PROPERTY_NAME;

    case RegExpError::kInvalidCharacterClass:

      return JSMSG_RANGE_WITH_CLASS_ESCAPE;

    case RegExpError::kUnterminatedCharacterClass:

      return JSMSG_UNTERM_CLASS;

    case RegExpError::kOutOfOrderCharacterClass:

      return JSMSG_BAD_CLASS_RANGE;

    case RegExpError::kInvalidClassSetOperation:

      return JSMSG_INVALID_CLASS_SET_OP;

    case RegExpError::kInvalidCharacterInClass:

      return JSMSG_INVALID_CHAR_IN_CLASS;

    case RegExpError::kNegatedCharacterClassWithStrings:

      return JSMSG_NEGATED_CLASS_WITH_STR;

    case RegExpError::NumErrors:

      MOZ_CRASH("Unreachable");

  MOZ_CRASH("Unreachable");

Isolate* CreateIsolate(JSContext* cx) {

  auto isolate = MakeUnique<Isolate>(cx);

  if (!isolate || !isolate->init()) {

    return nullptr;

  return isolate.release();

void TraceIsolate(JSTracer* trc, Isolate* isolate) { isolate->trace(trc); }

void DestroyIsolate(Isolate* isolate) {

  MOZ_ASSERT(isolate->liveHandles() == 0);

  MOZ_ASSERT(isolate->livePseudoHandles() == 0);

  js_delete(isolate);

size_t IsolateSizeOfIncludingThis(Isolate* isolate,

                                  mozilla::MallocSizeOf mallocSizeOf) {

  return isolate->sizeOfIncludingThis(mallocSizeOf);

static JS::ColumnNumberOffset ComputeColumnOffset(const Latin1Char* begin,

                                                  const Latin1Char* end) {

  return JS::ColumnNumberOffset(

      AssertedCast<uint32_t>(PointerRangeSize(begin, end)));

static JS::ColumnNumberOffset ComputeColumnOffset(const char16_t* begin,

                                                  const char16_t* end) {

  return JS::ColumnNumberOffset(

      AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end)));

// This function is varargs purely so it can call ReportCompileErrorLatin1.

// We never call it with additional arguments.

template <typename CharT>

static void ReportSyntaxError(TokenStreamAnyChars& ts,

                              mozilla::Maybe<uint32_t> line,

                              mozilla::Maybe<JS::ColumnNumberOneOrigin> column,

                              RegExpCompileData& result, CharT* start,

                              size_t length, ...) {

  MOZ_ASSERT(line.isSome() == column.isSome());

  Maybe<gc::AutoSuppressGC> suppressGC;

  if (JSContext* maybeCx = ts.context()->maybeCurrentJSContext()) {

    suppressGC.emplace(maybeCx);

  uint32_t errorNumber = ErrorNumber(result.error);

  if (errorNumber == JSMSG_OVER_RECURSED) {

    ReportOverRecursed(ts.context());

    return;

  uint32_t offset = std::max(result.error_pos, 0);

  MOZ_ASSERT(offset <= length);

  ErrorMetadata err;

  // Ordinarily this indicates whether line-of-context information can be

  // added, but we entirely ignore that here because we create a

  // a line of context based on the expression source.

  uint32_t location = ts.currentToken().pos.begin;

  if (ts.fillExceptingContext(&err, location)) {

    JS::ColumnNumberOffset columnOffset =

        ComputeColumnOffset(start, start + offset);

    if (line.isSome()) {

      // If this pattern is being checked by the frontend Parser instead

      // of other API entry points like |new RegExp|, then the parser will

      // have provided both a line and column pointing at the *beginning*

      // of the RegExp literal inside the source text.

      // We adjust the columnNumber to point to the actual syntax error

      // inside the literal.

      err.lineNumber = *line;

      err.columnNumber = *column + columnOffset;

    } else {

      // Line breaks are not significant in pattern text in the same way as

      // in source text, so act as though pattern text is a single line, then

      // compute a column based on "code point" count (treating a lone

      // surrogate as a "code point" in UTF-16).  Gak.

      err.lineNumber = 1;

      err.columnNumber = JS::ColumnNumberOneOrigin() + columnOffset;

  // For most error reporting, the line of context derives from the token

  // stream.  So when location information doesn't come from the token

  // stream, we can't give a line of context.  But here the "line of context"

  // can be (and is) derived from the pattern text, so we can provide it no

  // matter if the location is derived from the caller.

  const CharT* windowStart =

      (offset > ErrorMetadata::lineOfContextRadius)

          ? start + (offset - ErrorMetadata::lineOfContextRadius)

          : start;

  const CharT* windowEnd =

      (length - offset > ErrorMetadata::lineOfContextRadius)

          ? start + offset + ErrorMetadata::lineOfContextRadius

          : start + length;

  size_t windowLength = PointerRangeSize(windowStart, windowEnd);

  MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2);

  // Create the windowed string, not including the potential line

  // terminator.

  StringBuilder windowBuf(ts.context());

  if (!windowBuf.append(windowStart, windowEnd)) {

    return;

  // The line of context must be null-terminated, and StringBuilder doesn't

  // make that happen unless we force it to.

  if (!windowBuf.append('\0')) {

    return;

  err.lineOfContext.reset(windowBuf.stealChars());

  if (!err.lineOfContext) {

    return;

  err.lineLength = windowLength;

  err.tokenOffset = offset - (windowStart - start);

  va_list args;

  va_start(args, length);

  ReportCompileErrorLatin1VA(ts.context(), std::move(err), nullptr, errorNumber,

                             &args);

  va_end(args);

static void ReportSyntaxError(TokenStreamAnyChars& ts,

                              RegExpCompileData& result,

                              Handle<JSAtom*> pattern) {

  JS::AutoCheckCannotGC nogc_;

  if (pattern->hasLatin1Chars()) {

    ReportSyntaxError(ts, Nothing(), Nothing(), result,

                      pattern->latin1Chars(nogc_), pattern->length());

  } else {

    ReportSyntaxError(ts, Nothing(), Nothing(), result,

                      pattern->twoByteChars(nogc_), pattern->length());

template <typename CharT>

static bool CheckPatternSyntaxImpl(js::LifoAlloc& alloc,

                                   JS::NativeStackLimit stackLimit,

                                   const CharT* input, uint32_t inputLength,

                                   JS::RegExpFlags flags,

                                   RegExpCompileData* result,

                                   JS::AutoAssertNoGC& nogc) {

  LifoAllocScope allocScope(&alloc);

  Zone zone(allocScope.alloc());

  return RegExpParser::VerifyRegExpSyntax(&zone, stackLimit, input, inputLength,

                                          flags, result, nogc);

bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit,

                        TokenStreamAnyChars& ts,

                        const mozilla::Range<const char16_t> chars,

                        JS::RegExpFlags flags, mozilla::Maybe<uint32_t> line,

                        mozilla::Maybe<JS::ColumnNumberOneOrigin> column) {

  RegExpCompileData result;

  JS::AutoAssertNoGC nogc;

  if (!CheckPatternSyntaxImpl(alloc, stackLimit, chars.begin().get(),

                              chars.length(), flags, &result, nogc)) {

    ReportSyntaxError(ts, line, column, result, chars.begin().get(),

                      chars.length());

    return false;

  return true;

bool CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit,

                        TokenStreamAnyChars& ts, Handle<JSAtom*> pattern,

                        JS::RegExpFlags flags) {

  RegExpCompileData result;

  JS::AutoAssertNoGC nogc(cx);

  if (pattern->hasLatin1Chars()) {

    if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit,

                                pattern->latin1Chars(nogc), pattern->length(),

                                flags, &result, nogc)) {

      ReportSyntaxError(ts, result, pattern);

      return false;

    return true;

  if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit,

                              pattern->twoByteChars(nogc), pattern->length(),

                              flags, &result, nogc)) {

    ReportSyntaxError(ts, result, pattern);

    return false;

  return true;

// A regexp is a good candidate for Boyer-Moore if it has at least 3

// times as many characters as it has unique characters. Note that

// table lookups in irregexp are done modulo tableSize (128).

template <typename CharT>

static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {

  const uint32_t tableSize =

      v8::internal::NativeRegExpMacroAssembler::kTableSize;

  bool character_found[tableSize] = {};

  uint32_t different = 0;

  for (uint32_t i = 0; i < length; i++) {

    uint32_t ch = chars[i] % tableSize;

    if (!character_found[ch]) {

      character_found[ch] = true;

      different++;

      // We declare a regexp low-alphabet if it has at least 3 times as many

      // characters as it has different characters.

      if (different * 3 > length) {

        return false;

  return true;

// Identifies the sort of pattern where Boyer-Moore is faster than string search

static bool UseBoyerMoore(Handle<JSAtom*> pattern, JS::AutoAssertNoGC& nogc) {

  size_t length =

      std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());

  if (length <= kPatternTooShortForBoyerMoore) {

    return false;

  if (pattern->hasLatin1Chars()) {

    return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);

  MOZ_ASSERT(pattern->hasTwoByteChars());

  return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);

// Sample character frequency information for use in Boyer-Moore.

static void SampleCharacters(Handle<JSLinearString*> sample_subject,

                             RegExpCompiler& compiler) {

  static const int kSampleSize = 128;

  int chars_sampled = 0;

  int length = sample_subject->length();

  int half_way = (length - kSampleSize) / 2;

  for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize;

       i++, chars_sampled++) {

    compiler.frequency_collator()->CountCharacter(

        sample_subject->latin1OrTwoByteChar(i));

// Recursively walking the AST for a deeply nested regexp (like

// `/(a(a(a(a(a(a(a(...(a)...))))))))/`) may overflow the stack while

// compiling. To avoid this, we use V8's implementation of the Visitor

// pattern to walk the AST first with an overly large stack frame.

class RegExpDepthCheck final : public v8::internal::RegExpVisitor {

 public:

  explicit RegExpDepthCheck(JSContext* cx) : cx_(cx) {}

  bool check(v8::internal::RegExpTree* root) {

    return !!root->Accept(this, nullptr);

  // Leaf nodes with no children

#define LEAF_DEPTH(Kind)                                                \

  void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \

    uint8_t padding[FRAME_PADDING];                                     \

    dummy_ = padding; /* Prevent padding from being optimized away.*/   \

    AutoCheckRecursionLimit recursion(cx_);                             \

    return (void*)recursion.checkDontReport(cx_);                       \

  LEAF_DEPTH(Assertion)

  LEAF_DEPTH(Atom)

  LEAF_DEPTH(BackReference)

  LEAF_DEPTH(ClassSetOperand)

  LEAF_DEPTH(ClassRanges)

  LEAF_DEPTH(Empty)

  LEAF_DEPTH(Text)

#undef LEAF_DEPTH

  // Wrapper nodes with one child

#define WRAPPER_DEPTH(Kind)                                             \

  void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \

    uint8_t padding[FRAME_PADDING];                                     \

    dummy_ = padding; /* Prevent padding from being optimized away.*/   \

    AutoCheckRecursionLimit recursion(cx_);                             \

    if (!recursion.checkDontReport(cx_)) {                              \

      return nullptr;                                                   \

    }                                                                   \

    return node->body()->Accept(this, nullptr);                         \

  WRAPPER_DEPTH(Capture)

  WRAPPER_DEPTH(Group)

  WRAPPER_DEPTH(Lookaround)

  WRAPPER_DEPTH(Quantifier)

#undef WRAPPER_DEPTH

  void* VisitAlternative(v8::internal::RegExpAlternative* node,

                         void*) override {

    uint8_t padding[FRAME_PADDING];

    dummy_ = padding; /* Prevent padding from being optimized away.*/

    AutoCheckRecursionLimit recursion(cx_);

    if (!recursion.checkDontReport(cx_)) {

      return nullptr;

    for (auto* child : *node->nodes()) {

      if (!child->Accept(this, nullptr)) {

        return nullptr;

    return (void*)true;

  void* VisitDisjunction(v8::internal::RegExpDisjunction* node,

                         void*) override {

    uint8_t padding[FRAME_PADDING];

    dummy_ = padding; /* Prevent padding from being optimized away.*/

    AutoCheckRecursionLimit recursion(cx_);

    if (!recursion.checkDontReport(cx_)) {

      return nullptr;

    for (auto* child : *node->alternatives()) {

      if (!child->Accept(this, nullptr)) {

        return nullptr;

    return (void*)true;

  void* VisitClassSetExpression(v8::internal::RegExpClassSetExpression* node,

                                void*) override {

    uint8_t padding[FRAME_PADDING];

    dummy_ = padding; /* Prevent padding from being optimized away.*/

    AutoCheckRecursionLimit recursion(cx_);

    if (!recursion.checkDontReport(cx_)) {

      return nullptr;

    for (auto* child : *node->operands()) {

      if (!child->Accept(this, nullptr)) {

        return nullptr;

    return (void*)true;

 private:

  JSContext* cx_;

  void* dummy_ = nullptr;

  // This size is picked to be comfortably larger than any

  // RegExp*::ToNode stack frame.

  static const size_t FRAME_PADDING = 256;

};

enum class AssembleResult {

  Success,

  TooLarge,

  OutOfMemory,

};

[[nodiscard]] static AssembleResult Assemble(

    JSContext* cx, RegExpCompiler* compiler, RegExpCompileData* data,

    MutableHandleRegExpShared re, Handle<JSAtom*> pattern, Zone* zone,

    bool useNativeCode, bool isLatin1) {

  // Because we create a StackMacroAssembler, this function is not allowed

  // to GC. If needed, we allocate and throw errors in the caller.

  jit::TempAllocator temp(&cx->tempLifoAlloc());

  Maybe<jit::JitContext> jctx;

  Maybe<js::jit::StackMacroAssembler> stack_masm;

  UniquePtr<RegExpMacroAssembler> masm;

  if (useNativeCode) {

    NativeRegExpMacroAssembler::Mode mode =

        isLatin1 ? NativeRegExpMacroAssembler::LATIN1

                 : NativeRegExpMacroAssembler::UC16;

    // If we are compiling native code, we need a macroassembler,

    // which needs a jit context.

    jctx.emplace(cx);

    stack_masm.emplace(cx, temp);

#ifdef DEBUG

    // It would be much preferable to use `class AutoCreatedBy` here, but we

    // may be operating without an assembler at all if `useNativeCode` is

    // `false`, so there's no place to put such a call.

    stack_masm.ref().pushCreator("Assemble() in RegExpAPI.cpp");

#endif

    uint32_t num_capture_registers = re->pairCount() * 2;

    masm = MakeUnique<SMRegExpMacroAssembler>(cx, stack_masm.ref(), zone, mode,

                                              num_capture_registers);

  } else {

    masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, zone);

  if (!masm) {

    ReportOutOfMemory(cx);

    return AssembleResult::OutOfMemory;

  bool isLargePattern =

      pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize;

  masm->set_slow_safe(isLargePattern);

  if (compiler->optimize()) {

    compiler->set_optimize(!isLargePattern);

  // When matching a regexp with known maximum length that is anchored

  // at the end, we may be able to skip the beginning of long input

  // strings. This decision is made here because it depends on

  // information in the AST that isn't replicated in the Node

  // structure used inside the compiler.

  bool is_start_anchored = data->tree->IsAnchoredAtStart();

  bool is_end_anchored = data->tree->IsAnchoredAtEnd();

  int max_length = data->tree->max_match();

  static const int kMaxBacksearchLimit = 1024;

  if (is_end_anchored && !is_start_anchored && !re->sticky() &&

      max_length < kMaxBacksearchLimit) {

    masm->SetCurrentPositionFromEnd(max_length);

  if (re->global()) {

    RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;

    if (data->tree->min_match() > 0) {

      mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;

    } else if (re->unicode()) {

      mode = RegExpMacroAssembler::GLOBAL_UNICODE;

    masm->set_global_mode(mode);

  // The masm tracer works as a thin wrapper around another macroassembler.

  RegExpMacroAssembler* masm_ptr = masm.get();

#ifdef DEBUG

  UniquePtr<RegExpMacroAssembler> tracer_masm;

  if (jit::JitOptions.trace_regexp_assembler) {

    tracer_masm = MakeUnique<RegExpMacroAssemblerTracer>(cx->isolate, masm_ptr);

    masm_ptr = tracer_masm.get();

#endif

  // Compile the regexp.

  V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate);

  RegExpCompiler::CompilationResult result = compiler->Assemble(

      cx->isolate, masm_ptr, data->node, data->capture_count, wrappedPattern);

  if (useNativeCode) {

#ifdef DEBUG

    // See comment referencing `pushCreator` above.

    stack_masm.ref().popCreator();

#endif

  if (!result.Succeeded()) {

    MOZ_ASSERT(result.error == RegExpError::kTooLarge);

    return AssembleResult::TooLarge;

  if (result.code->value().isUndefined()) {

    // SMRegExpMacroAssembler::GetCode returns undefined on OOM.

    MOZ_ASSERT(useNativeCode);

    return AssembleResult::OutOfMemory;

  re->updateMaxRegisters(result.num_registers);

  if (useNativeCode) {

    // Transfer ownership of the tables from the macroassembler to the

    // RegExpShared.

    SMRegExpMacroAssembler::TableVector& tables =

        static_cast<SMRegExpMacroAssembler*>(masm.get())->tables();

    for (uint32_t i = 0; i < tables.length(); i++) {

      if (!re->addTable(std::move(tables[i]))) {

        ReportOutOfMemory(cx);

        return AssembleResult::OutOfMemory;

    re->setJitCode(v8::internal::Code::cast(*result.code).inner(), isLatin1);

  } else {

    // Transfer ownership of the bytecode from the HandleScope to the

    // RegExpShared.

    ByteArray bytecode =

        v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate);

    uint32_t length = bytecode->length();

    re->setByteCode(bytecode.release(), isLatin1);

    js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode);

  return AssembleResult::Success;

struct RegExpNamedCapture {

  const ZoneVector<char16_t>* name;

  js::Vector<uint32_t> indices;

  RegExpNamedCapture(JSContext* cx, const ZoneVector<char16_t>* name)

      : name(name), indices(cx) {}

};

struct RegExpNamedCaptureIndexLess {

  bool operator()(const RegExpNamedCapture& lhs,

                  const RegExpNamedCapture& rhs) const {

    // Every name must have at least one corresponding capture index, and all

    // the capture indices must be distinct. This allows us to sort on the

    // lowest capture index.

    MOZ_ASSERT(!lhs.indices.empty());

    MOZ_ASSERT(!rhs.indices.empty());

    MOZ_ASSERT_IF(&lhs != &rhs, lhs.indices[0] != rhs.indices[0]);

    return lhs.indices[0] < rhs.indices[0];

};

bool InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re,

                             ZoneVector<RegExpCapture*>* namedCaptures) {

  // The irregexp parser returns named capture information in the form

  // of a ZoneVector of RegExpCaptures nodes, each of which stores the

  // capture name and the corresponding capture index. We create a

  // template object with a property for each capture name, and store

  // the capture indices as a heap-allocated array.

  uint32_t numNamedCaptures = namedCaptures->size();

  // The input vector of named captures is already sorted by name, and then by

  // capture index if there are duplicates. We iterate through the captures,

  // creating groups for each set of indices corresponding to a name. Usually,

  // there will be a 1:1 mapping.

  js::Vector<RegExpNamedCapture> groups(cx);

  if (!groups.reserve(numNamedCaptures)) {

    js::ReportOutOfMemory(cx);

    return false;

  const ZoneVector<char16_t>* prevName = nullptr;

  uint32_t numDistinctNamedCaptures = 0;

  for (uint32_t i = 0; i < numNamedCaptures; i++) {

    RegExpCapture* capture = (*namedCaptures)[i];

    const ZoneVector<char16_t>* name = capture->name();

    if (!prevName || *name != *prevName) {

      if (!groups.emplaceBack(RegExpNamedCapture(cx, name))) {

        js::ReportOutOfMemory(cx);

        return false;

      numDistinctNamedCaptures++;

      prevName = name;

    // Make sure we're getting the indices in the order we expect

    MOZ_ASSERT_IF(!groups.back().indices.empty(),

                  groups.back().indices.back() < (uint32_t)capture->index());

    if (!groups.back().indices.emplaceBack(capture->index())) {

      js::ReportOutOfMemory(cx);

      return false;

  // The capture name map must be sorted by index.

  std::sort(groups.begin(), groups.end(), RegExpNamedCaptureIndexLess{});

  // Create a plain template object.

  Rooted<js::PlainObject*> templateObject(

      cx, js::NewPlainObjectWithProto(cx, nullptr, TenuredObject));

  if (!templateObject) {

    return false;

  // Allocate the capture index array.

  uint32_t arraySize = numNamedCaptures * sizeof(uint32_t);

  UniquePtr<uint32_t[], JS::FreePolicy> captureIndices(

      static_cast<uint32_t*>(js_malloc(arraySize)));

  if (!captureIndices) {

    js::ReportOutOfMemory(cx);

    return false;

  // Allocate the capture slice index array, if necessary. We only use this

  // if we have duplicate named capture groups.

  bool hasDuplicateNames = numNamedCaptures != numDistinctNamedCaptures;

  UniquePtr<uint32_t[], JS::FreePolicy> sliceIndices;

  if (hasDuplicateNames) {

    arraySize = numDistinctNamedCaptures * sizeof(uint32_t);

    sliceIndices.reset(static_cast<uint32_t*>(js_malloc(arraySize)));

    if (!sliceIndices) {

      js::ReportOutOfMemory(cx);

      return false;

  // Initialize the properties of the template and store capture indices.

  RootedId id(cx);

  RootedValue dummyString(cx, StringValue(cx->runtime()->emptyString));

  size_t insertIndex = 0;

  for (size_t i = 0; i < numDistinctNamedCaptures; ++i) {

    RegExpNamedCapture& group = groups[i];

    // We store the names as properties on the template object, in the order of

    // their lowest capture index.

    JSAtom* name = js::AtomizeChars(cx, group.name->data(), group.name->size());

    if (!name) {

      return false;

    id = NameToId(name->asPropertyName());

    if (!NativeDefineDataProperty(cx, templateObject, id, dummyString,

                                  JSPROP_ENUMERATE)) {

      return false;

    // The slice index keeps track of the captureIndex where indices

    // corresponding to a name start. The difference between the current slice

    // index and the next slice index is used to calculate how many values to

    // return in the slice. This is only needed when we have duplicate capture

    // names, otherwise, there's a 1:1 mapping, and we don't need the extra

    // data.

    if (hasDuplicateNames) {

      sliceIndices[i] = insertIndex;

    for (uint32_t captureIndex : groups[i].indices) {

      captureIndices[insertIndex++] = captureIndex;

  RegExpShared::InitializeNamedCaptures(

      cx, re, numNamedCaptures, numDistinctNamedCaptures, templateObject,

      captureIndices.release(), sliceIndices.release());

  return true;

bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,

                    Handle<JSLinearString*> input,

                    RegExpShared::CodeKind codeKind) {

  Rooted<JSAtom*> pattern(cx, re->getSource());

  JS::RegExpFlags flags = re->getFlags();

  LifoAllocScope allocScope(&cx->tempLifoAlloc());

  HandleScope handleScope(cx->isolate);

  Zone zone(allocScope.alloc());

  RegExpCompileData data;

    V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate);

    if (!RegExpParser::ParseRegExpFromHeapString(

            cx->isolate, &zone, wrappedPattern, flags, &data)) {

      AutoReportFrontendContext fc(cx);

      JS::CompileOptions options(cx);

      DummyTokenStream dummyTokenStream(&fc, options);

      ReportSyntaxError(dummyTokenStream, data, pattern);

      return false;

  // Avoid stack overflow while recursively walking the AST.

  RegExpDepthCheck depthCheck(cx);

  if (!depthCheck.check(data.tree)) {

    JS_ReportErrorASCII(cx, "regexp too big");

    cx->reportResourceExhaustion();

    return false;

  if (re->kind() == RegExpShared::Kind::Unparsed) {

    // This is the first time we have compiled this regexp.

    // First, check to see if we should use simple string search

    // with an atom.

    if (!flags.ignoreCase() && !flags.sticky()) {

      Rooted<JSAtom*> searchAtom(cx);

      if (data.simple) {

        // The parse-tree is a single atom that is equal to the pattern.

        searchAtom = re->getSource();

      } else if (data.tree->IsAtom() && data.capture_count == 0) {

        // The parse-tree is a single atom that is not equal to the pattern.

        v8::internal::RegExpAtom* atom = data.tree->AsAtom();

        const char16_t* twoByteChars = atom->data().begin();

        searchAtom = AtomizeChars(cx, twoByteChars, atom->length());

        if (!searchAtom) {

          return false;

      JS::AutoAssertNoGC nogc(cx);

      if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {

        re->useAtomMatch(searchAtom);

        return true;

    if (data.named_captures) {

      if (!InitializeNamedCaptures(cx, re, data.named_captures)) {

        return false;

    // All fallible initialization has succeeded, so we can change state.

    // Add one to capture_count to account for the whole-match capture.

    uint32_t pairCount = data.capture_count + 1;

    re->useRegExpMatch(pairCount);

  MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp);

  RegExpCompiler compiler(cx->isolate, &zone, data.capture_count, flags,

                          input->hasLatin1Chars());

  bool isLatin1 = input->hasLatin1Chars();

  SampleCharacters(input, compiler);

  data.node = compiler.PreprocessRegExp(&data, isLatin1);

  data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node);

  if (data.error != RegExpError::kNone) {

    MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow);

    ReportOverRecursed(cx);

    return false;

  bool useNativeCode = codeKind == RegExpShared::CodeKind::Jitcode;

  MOZ_ASSERT_IF(useNativeCode, IsNativeRegExpEnabled());

  switch (Assemble(cx, &compiler, &data, re, pattern, &zone, useNativeCode,

                   isLatin1)) {

    case AssembleResult::TooLarge:

      JS_ReportErrorASCII(cx, "regexp too big");

      cx->reportResourceExhaustion();

      return false;

    case AssembleResult::OutOfMemory:

      MOZ_ASSERT(cx->isThrowingOutOfMemory());

      return false;

    case AssembleResult::Success:

      break;

  return true;

template <typename CharT>

RegExpRunStatus ExecuteRaw(jit::JitCode* code, const CharT* chars,

                           size_t length, size_t startIndex,

                           VectorMatchPairs* matches) {

  InputOutputData data(chars, chars + length, startIndex, matches);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Error) ==

                v8::internal::RegExp::kInternalRegExpException);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Success) ==

                v8::internal::RegExp::kInternalRegExpSuccess);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) ==

                v8::internal::RegExp::kInternalRegExpFailure);

  using RegExpCodeSignature = int (*)(InputOutputData*);

  auto function = reinterpret_cast<RegExpCodeSignature>(code->raw());

    JS::AutoSuppressGCAnalysis nogc;

    return (RegExpRunStatus)CALL_GENERATED_1(function, &data);

RegExpRunStatus Interpret(JSContext* cx, MutableHandleRegExpShared re,

                          Handle<JSLinearString*> input, size_t startIndex,

                          VectorMatchPairs* matches) {

  MOZ_ASSERT(re->getByteCode(input->hasLatin1Chars()));

  HandleScope handleScope(cx->isolate);

  V8HandleRegExp wrappedRegExp(v8::internal::IrRegExpData(re), cx->isolate);

  V8HandleString wrappedInput(v8::internal::String(input), cx->isolate);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Error) ==

                v8::internal::RegExp::kInternalRegExpException);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Success) ==

                v8::internal::RegExp::kInternalRegExpSuccess);

  static_assert(static_cast<int32_t>(RegExpRunStatus::Success_NotFound) ==

                v8::internal::RegExp::kInternalRegExpFailure);

  RegExpRunStatus status =

      (RegExpRunStatus)IrregexpInterpreter::MatchForCallFromRuntime(

          cx->isolate, wrappedRegExp, wrappedInput, matches->pairsRaw(),

          uint32_t(matches->pairCount() * 2), uint32_t(startIndex));

  MOZ_ASSERT(status == RegExpRunStatus::Error ||

             status == RegExpRunStatus::Success ||

             status == RegExpRunStatus::Success_NotFound);

  return status;

RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re,

                        Handle<JSLinearString*> input, size_t startIndex,

                        VectorMatchPairs* matches) {

  bool latin1 = input->hasLatin1Chars();

  jit::JitCode* jitCode = re->getJitCode(latin1);

  bool isCompiled = !!jitCode;

  // Reset the Irregexp backtrack stack if it grows during execution.

  irregexp::RegExpStackScope stackScope(cx->isolate);

  if (isCompiled) {

    JS::AutoCheckCannotGC nogc;

    if (latin1) {

      return ExecuteRaw(jitCode, input->latin1Chars(nogc), input->length(),

                        startIndex, matches);

    return ExecuteRaw(jitCode, input->twoByteChars(nogc), input->length(),

                      startIndex, matches);

  return Interpret(cx, re, input, startIndex, matches);

RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern,

                                  Handle<JSLinearString*> input,

                                  JS::RegExpFlags flags, size_t startIndex,

                                  VectorMatchPairs* matches,

                                  RegExpShared::CodeKind codeKind) {

  RootedRegExpShared re(cx, cx->zone()->regExps().get(cx, pattern, flags));

  if (!RegExpShared::compileIfNecessary(cx, &re, input, codeKind)) {

    return RegExpRunStatus::Error;

  return RegExpShared::execute(cx, &re, input, startIndex, matches);

bool GrowBacktrackStack(RegExpStack* regexp_stack) {

  return SMRegExpMacroAssembler::GrowBacktrackStack(regexp_stack);

uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1,

                                          const char16_t* substring2,

                                          size_t byteLength) {

  return SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(

      substring1, substring2, byteLength);

uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1,

                                       const char16_t* substring2,

                                       size_t byteLength) {

  return SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode(

      substring1, substring2, byteLength);

bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges) {

  return SMRegExpMacroAssembler::IsCharacterInRangeArray(c, ranges);

#ifdef DEBUG

bool IsolateShouldSimulateInterrupt(Isolate* isolate) {

  return isolate->shouldSimulateInterrupt_ != 0;

void IsolateSetShouldSimulateInterrupt(Isolate* isolate) {

  isolate->shouldSimulateInterrupt_ = 1;

void IsolateClearShouldSimulateInterrupt(Isolate* isolate) {

  isolate->shouldSimulateInterrupt_ = 0;

#endif

}  // namespace irregexp

}  // namespace js