regex.rs - mozsearch

/*!

A lazy DFA backed `Regex`.

This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements

convenience routines you might have come to expect, such as finding a match

and iterating over all non-overlapping matches. This `Regex` type is limited

in its capabilities to what a lazy DFA can provide. Therefore, APIs involving

capturing groups, for example, are not provided.

Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that

finds the end offset of a match, where as the other is a "reverse" DFA that

find the start offset of a match.

See the [parent module](crate::hybrid) for examples.

*/

use crate::{

    hybrid::{

        dfa::{self, DFA},

        error::BuildError,

},

    nfa::thompson,

    util::{

        iter,

        search::{Anchored, Input, Match, MatchError, MatchKind},

},

};

/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")

/// for searching.

///

/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a

/// "reverse" DFA. The forward DFA is responsible for detecting the end of

/// a match while the reverse DFA is responsible for detecting the start

/// of a match. Thus, in order to find the bounds of any given match, a

/// forward search must first be run followed by a reverse search. A match

/// found by the forward DFA guarantees that the reverse DFA will also find

/// a match.

///

/// # Fallibility

///

/// Most of the search routines defined on this type will _panic_ when the

/// underlying search fails. This might be because the DFA gave up because it

/// saw a quit byte, whether configured explicitly or via heuristic Unicode

/// word boundary support, although neither are enabled by default. It might

/// also fail if the underlying DFA determines it isn't making effective use of

/// the cache (which also never happens by default). Or it might fail because

/// an invalid `Input` configuration is given, for example, with an unsupported

/// [`Anchored`] mode.

///

/// If you need to handle these error cases instead of allowing them to trigger

/// a panic, then the lower level [`Regex::try_search`] provides a fallible API

/// that never panics.

///

/// # Example

///

/// This example shows how to cause a search to terminate if it sees a

/// `\n` byte, and handle the error returned. This could be useful if, for

/// example, you wanted to prevent a user supplied pattern from matching

/// across a line boundary.

///

/// ```

/// # if cfg!(miri) { return Ok(()); } // miri takes too long

/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};

///

/// let re = Regex::builder()

///     .dfa(dfa::Config::new().quit(b'\n', true))

///     .build(r"foo\p{any}+bar")?;

/// let mut cache = re.create_cache();

///

/// let input = Input::new("foo\nbar");

/// // Normally this would produce a match, since \p{any} contains '\n'.

/// // But since we instructed the automaton to enter a quit state if a

/// // '\n' is observed, this produces a match error instead.

/// let expected = MatchError::quit(b'\n', 3);

/// let got = re.try_search(&mut cache, &input).unwrap_err();

/// assert_eq!(expected, got);

///

/// # Ok::<(), Box<dyn std::error::Error>>(())

/// ```

#[derive(Debug)]

pub struct Regex {

    /// The forward lazy DFA. This can only find the end of a match.

    forward: DFA,

    /// The reverse lazy DFA. This can only find the start of a match.

///

    /// This is built with 'all' match semantics (instead of leftmost-first)

    /// so that it always finds the longest possible match (which corresponds

    /// to the leftmost starting position). It is also compiled as an anchored

    /// matcher and has 'starts_for_each_pattern' enabled. Including starting

    /// states for each pattern is necessary to ensure that we only look for

    /// matches of a pattern that matched in the forward direction. Otherwise,

    /// we might wind up finding the "leftmost" starting position of a totally

    /// different pattern!

    reverse: DFA,

/// Convenience routines for regex and cache construction.

impl Regex {

    /// Parse the given regular expression using the default configuration and

    /// return the corresponding regex.

///

    /// If you want a non-default configuration, then use the [`Builder`] to

    /// set your own configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{hybrid::regex::Regex, Match};

///

    /// let re = Regex::new("foo[0-9]+bar")?;

    /// let mut cache = re.create_cache();

    /// assert_eq!(

    ///     Some(Match::must(0, 3..14)),

    ///     re.find(&mut cache, "zzzfoo12345barzzz"),

    /// );

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[cfg(feature = "syntax")]

    pub fn new(pattern: &str) -> Result<Regex, BuildError> {

        Regex::builder().build(pattern)

    /// Like `new`, but parses multiple patterns into a single "multi regex."

    /// This similarly uses the default regex configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{hybrid::regex::Regex, Match};

///

    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;

    /// let mut cache = re.create_cache();

///

    /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");

    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());

    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());

    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());

    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());

    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());

    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());

    /// assert_eq!(None, it.next());

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[cfg(feature = "syntax")]

    pub fn new_many<P: AsRef<str>>(

        patterns: &[P],

    ) -> Result<Regex, BuildError> {

        Regex::builder().build_many(patterns)

    /// Return a builder for configuring the construction of a `Regex`.

///

    /// This is a convenience routine to avoid needing to import the

    /// [`Builder`] type in common cases.

///

    /// # Example

///

    /// This example shows how to use the builder to disable UTF-8 mode

    /// everywhere.

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::{

    ///     hybrid::regex::Regex, nfa::thompson, util::syntax, Match,

    /// };

///

    /// let re = Regex::builder()

    ///     .syntax(syntax::Config::new().utf8(false))

    ///     .thompson(thompson::Config::new().utf8(false))

    ///     .build(r"foo(?-u:[^b])ar.*")?;

    /// let mut cache = re.create_cache();

///

    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";

    /// let expected = Some(Match::must(0, 1..9));

    /// let got = re.find(&mut cache, haystack);

    /// assert_eq!(expected, got);

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn builder() -> Builder {

        Builder::new()

    /// Create a new cache for this `Regex`.

///

    /// The cache returned should only be used for searches for this

    /// `Regex`. If you want to reuse the cache for another `Regex`, then

    /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,

    /// [`Regex::reset_cache`]).

    pub fn create_cache(&self) -> Cache {

        Cache::new(self)

    /// Reset the given cache such that it can be used for searching with the

    /// this `Regex` (and only this `Regex`).

///

    /// A cache reset permits reusing memory already allocated in this cache

    /// with a different `Regex`.

///

    /// Resetting a cache sets its "clear count" to 0. This is relevant if the

    /// `Regex` has been configured to "give up" after it has cleared the cache

    /// a certain number of times.

///

    /// # Example

///

    /// This shows how to re-purpose a cache for use with a different `Regex`.

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::{hybrid::regex::Regex, Match};

///

    /// let re1 = Regex::new(r"\w")?;

    /// let re2 = Regex::new(r"\W")?;

///

    /// let mut cache = re1.create_cache();

    /// assert_eq!(

    ///     Some(Match::must(0, 0..2)),

    ///     re1.find(&mut cache, "Δ"),

    /// );

///

    /// // Using 'cache' with re2 is not allowed. It may result in panics or

    /// // incorrect results. In order to re-purpose the cache, we must reset

    /// // it with the Regex we'd like to use it with.

    /// //

    /// // Similarly, after this reset, using the cache with 're1' is also not

    /// // allowed.

    /// re2.reset_cache(&mut cache);

    /// assert_eq!(

    ///     Some(Match::must(0, 0..3)),

    ///     re2.find(&mut cache, "☃"),

    /// );

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn reset_cache(&self, cache: &mut Cache) {

        self.forward().reset_cache(&mut cache.forward);

        self.reverse().reset_cache(&mut cache.reverse);

/// Standard infallible search routines for finding and iterating over matches.

impl Regex {

    /// Returns true if and only if this regex matches the given haystack.

///

    /// This routine may short circuit if it knows that scanning future input

    /// will never lead to a different result. In particular, if the underlying

    /// DFA enters a match state or a dead state, then this routine will return

    /// `true` or `false`, respectively, without inspecting any future input.

///

    /// # Panics

///

    /// This routine panics if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the lazy DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the lazy DFA quitting.

    /// * The configuration of the lazy DFA may also permit it to "give up"

    /// on a search if it makes ineffective use of its transition table

    /// cache. The default configuration does not enable this by default,

    /// although it is typically a good idea to.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search panics, callers cannot know whether a match exists or

    /// not.

///

    /// Use [`Regex::try_search`] if you want to handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::hybrid::regex::Regex;

///

    /// let re = Regex::new("foo[0-9]+bar")?;

    /// let mut cache = re.create_cache();

///

    /// assert!(re.is_match(&mut cache, "foo12345bar"));

    /// assert!(!re.is_match(&mut cache, "foobar"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn is_match<'h, I: Into<Input<'h>>>(

        &self,

        cache: &mut Cache,

        input: I,

    ) -> bool {

        // Not only can we do an "earliest" search, but we can avoid doing a

        // reverse scan too.

        self.forward()

            .try_search_fwd(&mut cache.forward, &input.into().earliest(true))

            .unwrap()

            .is_some()

    /// Returns the start and end offset of the leftmost match. If no match

    /// exists, then `None` is returned.

///

    /// # Panics

///

    /// This routine panics if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the lazy DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the lazy DFA quitting.

    /// * The configuration of the lazy DFA may also permit it to "give up"

    /// on a search if it makes ineffective use of its transition table

    /// cache. The default configuration does not enable this by default,

    /// although it is typically a good idea to.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search panics, callers cannot know whether a match exists or

    /// not.

///

    /// Use [`Regex::try_search`] if you want to handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, hybrid::regex::Regex};

///

    /// let re = Regex::new("foo[0-9]+")?;

    /// let mut cache = re.create_cache();

    /// assert_eq!(

    ///     Some(Match::must(0, 3..11)),

    ///     re.find(&mut cache, "zzzfoo12345zzz"),

    /// );

///

    /// // Even though a match is found after reading the first byte (`a`),

    /// // the default leftmost-first match semantics demand that we find the

    /// // earliest match that prefers earlier parts of the pattern over latter

    /// // parts.

    /// let re = Regex::new("abc|a")?;

    /// let mut cache = re.create_cache();

    /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn find<'h, I: Into<Input<'h>>>(

        &self,

        cache: &mut Cache,

        input: I,

    ) -> Option<Match> {

        self.try_search(cache, &input.into()).unwrap()

    /// Returns an iterator over all non-overlapping leftmost matches in the

    /// given bytes. If no match exists, then the iterator yields no elements.

///

    /// # Panics

///

    /// This routine panics if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the lazy DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the lazy DFA quitting.

    /// * The configuration of the lazy DFA may also permit it to "give up"

    /// on a search if it makes ineffective use of its transition table

    /// cache. The default configuration does not enable this by default,

    /// although it is typically a good idea to.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search panics, callers cannot know whether a match exists or

    /// not.

///

    /// The above conditions also apply to the iterator returned as well. For

    /// example, if the lazy DFA gives up or quits during a search using this

    /// method, then a panic will occur during iteration.

///

    /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)

    /// if you want to handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{hybrid::regex::Regex, Match};

///

    /// let re = Regex::new("foo[0-9]+")?;

    /// let mut cache = re.create_cache();

///

    /// let text = "foo1 foo12 foo123";

    /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();

    /// assert_eq!(matches, vec![

    ///     Match::must(0, 0..4),

    ///     Match::must(0, 5..10),

    ///     Match::must(0, 11..17),

    /// ]);

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(

        &'r self,

        cache: &'c mut Cache,

        input: I,

    ) -> FindMatches<'r, 'c, 'h> {

        let it = iter::Searcher::new(input.into());

        FindMatches { re: self, cache, it }

/// Lower level "search" primitives that accept a `&Input` for cheap reuse

/// and return an error if one occurs instead of panicking.

impl Regex {

    /// Returns the start and end offset of the leftmost match. If no match

    /// exists, then `None` is returned.

///

    /// This is like [`Regex::find`] but with two differences:

///

    /// 1. It is not generic over `Into<Input>` and instead accepts a

    /// `&Input`. This permits reusing the same `Input` for multiple searches

    /// without needing to create a new one. This _may_ help with latency.

    /// 2. It returns an error if the search could not complete where as

    /// [`Regex::find`] will panic.

///

    /// # Errors

///

    /// This routine errors if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the lazy DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the lazy DFA quitting.

    /// * The configuration of the lazy DFA may also permit it to "give up"

    /// on a search if it makes ineffective use of its transition table

    /// cache. The default configuration does not enable this by default,

    /// although it is typically a good idea to.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search returns an error, callers cannot know whether a match

    /// exists or not.

    #[inline]

    pub fn try_search(

        &self,

        cache: &mut Cache,

        input: &Input<'_>,

    ) -> Result<Option<Match>, MatchError> {

        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);

        let end = match self.forward().try_search_fwd(fcache, input)? {

            None => return Ok(None),

            Some(end) => end,

};

        // This special cases an empty match at the beginning of the search. If

        // our end matches our start, then since a reverse DFA can't match past

        // the start, it must follow that our starting position is also our end

        // position. So short circuit and skip the reverse search.

        if input.start() == end.offset() {

            return Ok(Some(Match::new(

                end.pattern(),

                end.offset()..end.offset(),

            )));

        // We can also skip the reverse search if we know our search was

        // anchored. This occurs either when the input config is anchored or

        // when we know the regex itself is anchored. In this case, we know the

        // start of the match, if one is found, must be the start of the

        // search.

        if self.is_anchored(input) {

            return Ok(Some(Match::new(

                end.pattern(),

                input.start()..end.offset(),

            )));

        // N.B. I have tentatively convinced myself that it isn't necessary

        // to specify the specific pattern for the reverse search since the

        // reverse search will always find the same pattern to match as the

        // forward search. But I lack a rigorous proof. Why not just provide

        // the pattern anyway? Well, if it is needed, then leaving it out

        // gives us a chance to find a witness. (Also, if we don't need to

        // specify the pattern, then we don't need to build the reverse DFA

        // with 'starts_for_each_pattern' enabled. It doesn't matter too much

        // for the lazy DFA, but does make the overall DFA bigger.)

//

        // We also need to be careful to disable 'earliest' for the reverse

        // search, since it could be enabled for the forward search. In the

        // reverse case, to satisfy "leftmost" criteria, we need to match as

        // much as we can. We also need to be careful to make the search

        // anchored. We don't want the reverse search to report any matches

        // other than the one beginning at the end of our forward search.

        let revsearch = input

            .clone()

            .span(input.start()..end.offset())

            .anchored(Anchored::Yes)

            .earliest(false);

        let start = self

            .reverse()

            .try_search_rev(rcache, &revsearch)?

            .expect("reverse search must match if forward search does");

        debug_assert_eq!(

            start.pattern(),

            end.pattern(),

            "forward and reverse search must match same pattern",

);

        debug_assert!(start.offset() <= end.offset());

        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))

    /// Returns true if either the given input specifies an anchored search

    /// or if the underlying NFA is always anchored.

    fn is_anchored(&self, input: &Input<'_>) -> bool {

        match input.get_anchored() {

            Anchored::No => {

                self.forward().get_nfa().is_always_start_anchored()

            Anchored::Yes | Anchored::Pattern(_) => true,

/// Non-search APIs for querying information about the regex and setting a

/// prefilter.

impl Regex {

    /// Return the underlying lazy DFA responsible for forward matching.

///

    /// This is useful for accessing the underlying lazy DFA and using it

    /// directly if the situation calls for it.

    pub fn forward(&self) -> &DFA {

        &self.forward

    /// Return the underlying lazy DFA responsible for reverse matching.

///

    /// This is useful for accessing the underlying lazy DFA and using it

    /// directly if the situation calls for it.

    pub fn reverse(&self) -> &DFA {

        &self.reverse

    /// Returns the total number of patterns matched by this regex.

///

    /// # Example

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::hybrid::regex::Regex;

///

    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;

    /// assert_eq!(3, re.pattern_len());

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn pattern_len(&self) -> usize {

        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());

        self.forward().pattern_len()

/// An iterator over all non-overlapping matches for an infallible search.

///

/// The iterator yields a [`Match`] value until no more matches could be found.

/// If the underlying regex engine returns an error, then a panic occurs.

///

/// The lifetime parameters are as follows:

///

/// * `'r` represents the lifetime of the regex object.

/// * `'h` represents the lifetime of the haystack being searched.

/// * `'c` represents the lifetime of the regex cache.

///

/// This iterator can be created with the [`Regex::find_iter`] method.

#[derive(Debug)]

pub struct FindMatches<'r, 'c, 'h> {

    re: &'r Regex,

    cache: &'c mut Cache,

    it: iter::Searcher<'h>,

impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {

    type Item = Match;

    #[inline]

    fn next(&mut self) -> Option<Match> {

        let FindMatches { re, ref mut cache, ref mut it } = *self;

        it.advance(|input| re.try_search(cache, input))

/// A cache represents a partially computed forward and reverse DFA.

///

/// A cache is the key component that differentiates a classical DFA and a

/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a

/// complete transition table that can handle all possible inputs, a hybrid

/// NFA/DFA starts with an empty transition table and builds only the parts

/// required during search. The parts that are built are stored in a cache. For

/// this reason, a cache is a required parameter for nearly every operation on

/// a [`Regex`].

///

/// Caches can be created from their corresponding `Regex` via

/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`

/// that created it, or the `Regex` that was most recently used to reset it

/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in

/// panics or incorrect results.

#[derive(Debug, Clone)]

pub struct Cache {

    forward: dfa::Cache,

    reverse: dfa::Cache,

impl Cache {

    /// Create a new cache for the given `Regex`.

///

    /// The cache returned should only be used for searches for the given

    /// `Regex`. If you want to reuse the cache for another `Regex`, then you

    /// must call [`Cache::reset`] with that `Regex`.

    pub fn new(re: &Regex) -> Cache {

        let forward = dfa::Cache::new(re.forward());

        let reverse = dfa::Cache::new(re.reverse());

        Cache { forward, reverse }

    /// Reset this cache such that it can be used for searching with the given

    /// `Regex` (and only that `Regex`).

///

    /// A cache reset permits reusing memory already allocated in this cache

    /// with a different `Regex`.

///

    /// Resetting a cache sets its "clear count" to 0. This is relevant if the

    /// `Regex` has been configured to "give up" after it has cleared the cache

    /// a certain number of times.

///

    /// # Example

///

    /// This shows how to re-purpose a cache for use with a different `Regex`.

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::{hybrid::regex::Regex, Match};

///

    /// let re1 = Regex::new(r"\w")?;

    /// let re2 = Regex::new(r"\W")?;

///

    /// let mut cache = re1.create_cache();

    /// assert_eq!(

    ///     Some(Match::must(0, 0..2)),

    ///     re1.find(&mut cache, "Δ"),

    /// );

///

    /// // Using 'cache' with re2 is not allowed. It may result in panics or

    /// // incorrect results. In order to re-purpose the cache, we must reset

    /// // it with the Regex we'd like to use it with.

    /// //

    /// // Similarly, after this reset, using the cache with 're1' is also not

    /// // allowed.

    /// cache.reset(&re2);

    /// assert_eq!(

    ///     Some(Match::must(0, 0..3)),

    ///     re2.find(&mut cache, "☃"),

    /// );

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn reset(&mut self, re: &Regex) {

        self.forward.reset(re.forward());

        self.reverse.reset(re.reverse());

    /// Return a reference to the forward cache.

    pub fn forward(&mut self) -> &dfa::Cache {

        &self.forward

    /// Return a reference to the reverse cache.

    pub fn reverse(&mut self) -> &dfa::Cache {

        &self.reverse

    /// Return a mutable reference to the forward cache.

///

    /// If you need mutable references to both the forward and reverse caches,

    /// then use [`Cache::as_parts_mut`].

    pub fn forward_mut(&mut self) -> &mut dfa::Cache {

        &mut self.forward

    /// Return a mutable reference to the reverse cache.

///

    /// If you need mutable references to both the forward and reverse caches,

    /// then use [`Cache::as_parts_mut`].

    pub fn reverse_mut(&mut self) -> &mut dfa::Cache {

        &mut self.reverse

    /// Return references to the forward and reverse caches, respectively.

    pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {

        (&self.forward, &self.reverse)

    /// Return mutable references to the forward and reverse caches,

    /// respectively.

    pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {

        (&mut self.forward, &mut self.reverse)

    /// Returns the heap memory usage, in bytes, as a sum of the forward and

    /// reverse lazy DFA caches.

///

    /// This does **not** include the stack size used up by this cache. To

    /// compute that, use `std::mem::size_of::<Cache>()`.

    pub fn memory_usage(&self) -> usize {

        self.forward.memory_usage() + self.reverse.memory_usage()

/// A builder for a regex based on a hybrid NFA/DFA.

///

/// This builder permits configuring options for the syntax of a pattern, the

/// NFA construction, the lazy DFA construction and finally the regex searching

/// itself. This builder is different from a general purpose regex builder

/// in that it permits fine grain configuration of the construction process.

/// The trade off for this is complexity, and the possibility of setting a

/// configuration that might not make sense. For example, there are two

/// different UTF-8 modes:

///

/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls

/// whether the pattern itself can contain sub-expressions that match invalid

/// UTF-8.

/// * [`thompson::Config::utf8`] controls how the regex iterators themselves

/// advance the starting position of the next search when a match with zero

/// length is found.

///

/// Generally speaking, callers will want to either enable all of these or

/// disable all of these.

///

/// Internally, building a regex requires building two hybrid NFA/DFAs,

/// where one is responsible for finding the end of a match and the other is

/// responsible for finding the start of a match. If you only need to detect

/// whether something matched, or only the end of a match, then you should use

/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper

/// than building two of them.

///

/// # Example

///

/// This example shows how to disable UTF-8 mode in the syntax and the regex

/// itself. This is generally what you want for matching on arbitrary bytes.

///

/// ```

/// # if cfg!(miri) { return Ok(()); } // miri takes too long

/// use regex_automata::{

///     hybrid::regex::Regex, nfa::thompson, util::syntax, Match,

/// };

///

/// let re = Regex::builder()

///     .syntax(syntax::Config::new().utf8(false))

///     .thompson(thompson::Config::new().utf8(false))

///     .build(r"foo(?-u:[^b])ar.*")?;

/// let mut cache = re.create_cache();

///

/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";

/// let expected = Some(Match::must(0, 1..9));

/// let got = re.find(&mut cache, haystack);

/// assert_eq!(expected, got);

/// // Notice that `(?-u:[^b])` matches invalid UTF-8,

/// // but the subsequent `.*` does not! Disabling UTF-8

/// // on the syntax permits this.

/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);

///

/// # Ok::<(), Box<dyn std::error::Error>>(())

/// ```

#[derive(Clone, Debug)]

pub struct Builder {

    dfa: dfa::Builder,

impl Builder {

    /// Create a new regex builder with the default configuration.

    pub fn new() -> Builder {

        Builder { dfa: DFA::builder() }

    /// Build a regex from the given pattern.

///

    /// If there was a problem parsing or compiling the pattern, then an error

    /// is returned.

    #[cfg(feature = "syntax")]

    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {

        self.build_many(&[pattern])

    /// Build a regex from the given patterns.

    #[cfg(feature = "syntax")]

    pub fn build_many<P: AsRef<str>>(

        &self,

        patterns: &[P],

    ) -> Result<Regex, BuildError> {

        let forward = self.dfa.build_many(patterns)?;

        let reverse = self

            .dfa

            .clone()

            .configure(

                DFA::config()

                    .prefilter(None)

                    .specialize_start_states(false)

                    .match_kind(MatchKind::All),

            .thompson(thompson::Config::new().reverse(true))

            .build_many(patterns)?;

        Ok(self.build_from_dfas(forward, reverse))

    /// Build a regex from its component forward and reverse hybrid NFA/DFAs.

///

    /// This is useful when you've built a forward and reverse lazy DFA

    /// separately, and want to combine them into a single regex. Once build,

    /// the individual DFAs given can still be accessed via [`Regex::forward`]

    /// and [`Regex::reverse`].

///

    /// It is important that the reverse lazy DFA be compiled under the

    /// following conditions:

///

    /// * It should use [`MatchKind::All`] semantics.

    /// * It should match in reverse.

    /// * Otherwise, its configuration should match the forward DFA.

///

    /// If these conditions aren't satisfied, then the behavior of searches is

    /// unspecified.

///

    /// Note that when using this constructor, no configuration is applied.

    /// Since this routine provides the DFAs to the builder, there is no

    /// opportunity to apply other configuration options.

///

    /// # Example

///

    /// This shows how to build individual lazy forward and reverse DFAs, and

    /// then combine them into a single `Regex`.

///

    /// ```

    /// use regex_automata::{

    ///     hybrid::{dfa::DFA, regex::Regex},

    ///     nfa::thompson,

    ///     MatchKind,

    /// };

///

    /// let fwd = DFA::new(r"foo[0-9]+")?;

    /// let rev = DFA::builder()

    ///     .configure(DFA::config().match_kind(MatchKind::All))

    ///     .thompson(thompson::Config::new().reverse(true))

    ///     .build(r"foo[0-9]+")?;

///

    /// let re = Regex::builder().build_from_dfas(fwd, rev);

    /// let mut cache = re.create_cache();

    /// assert_eq!(true, re.is_match(&mut cache, "foo123"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {

        Regex { forward, reverse }

    /// Set the syntax configuration for this builder using

    /// [`syntax::Config`](crate::util::syntax::Config).

///

    /// This permits setting things like case insensitivity, Unicode and multi

    /// line mode.

    #[cfg(feature = "syntax")]

    pub fn syntax(

        &mut self,

        config: crate::util::syntax::Config,

    ) -> &mut Builder {

        self.dfa.syntax(config);

        self

    /// Set the Thompson NFA configuration for this builder using

    /// [`nfa::thompson::Config`](thompson::Config).

///

    /// This permits setting things like whether additional time should be

    /// spent shrinking the size of the NFA.

    #[cfg(feature = "syntax")]

    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {

        self.dfa.thompson(config);

        self

    /// Set the lazy DFA compilation configuration for this builder using

    /// [`dfa::Config`](dfa::Config).

///

    /// This permits setting things like whether Unicode word boundaries should

    /// be heuristically supported or settings how the behavior of the cache.

    pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {

        self.dfa.configure(config);

        self

impl Default for Builder {

    fn default() -> Builder {

        Builder::new()

Revision control

Copy as Markdown

Other Tools