bytes.rs - mozsearch

comm-central/third_party/rust/regex/src/regexset/bytes.rs

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

use alloc::string::String;

use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};

use crate::{bytes::RegexSetBuilder, Error};

/// Match multiple, possibly overlapping, regexes in a single search.

///

/// A regex set corresponds to the union of zero or more regular expressions.

/// That is, a regex set will match a haystack when at least one of its

/// constituent regexes matches. A regex set as it's formulated here provides a

/// touch more power: it will also report *which* regular expressions in the

/// set match. Indeed, this is the key difference between regex sets and a

/// single `Regex` with many alternates, since only one alternate can match at

/// a time.

///

/// For example, consider regular expressions to match email addresses and

/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a

/// regex set is constructed from those regexes, then searching the haystack

/// `foo@example.com` will report both regexes as matching. Of course, one

/// could accomplish this by compiling each regex on its own and doing two

/// searches over the haystack. The key advantage of using a regex set is

/// that it will report the matching regexes using a *single pass through the

/// haystack*. If one has hundreds or thousands of regexes to match repeatedly

/// (like a URL router for a complex web application or a user agent matcher),

/// then a regex set *can* realize huge performance gains.

///

/// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet`

/// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this

/// `RegexSet` is permitted to match invalid UTF-8.

///

/// # Limitations

///

/// Regex sets are limited to answering the following two questions:

///

/// 1. Does any regex in the set match?

/// 2. If so, which regexes in the set match?

///

/// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask

/// (1) instead of (2) since the matching engines can stop after the first

/// match is found.

///

/// You cannot directly extract [`Match`][crate::bytes::Match] or

/// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need

/// these operations, the recommended approach is to compile each pattern in

/// the set independently and scan the exact same haystack a second time with

/// those independently compiled patterns:

///

/// ```

/// use regex::bytes::{Regex, RegexSet};

///

/// let patterns = ["foo", "bar"];

/// // Both patterns will match different ranges of this string.

/// let hay = b"barfoo";

///

/// // Compile a set matching any of our patterns.

/// let set = RegexSet::new(patterns).unwrap();

/// // Compile each pattern independently.

/// let regexes: Vec<_> = set

///     .patterns()

///     .iter()

///     .map(|pat| Regex::new(pat).unwrap())

///     .collect();

///

/// // Match against the whole set first and identify the individual

/// // matching patterns.

/// let matches: Vec<&[u8]> = set

///     .matches(hay)

///     .into_iter()

///     // Dereference the match index to get the corresponding

///     // compiled pattern.

///     .map(|index| &regexes[index])

///     // To get match locations or any other info, we then have to search the

///     // exact same haystack again, using our separately-compiled pattern.

///     .map(|re| re.find(hay).unwrap().as_bytes())

///     .collect();

///

/// // Matches arrive in the order the constituent patterns were declared,

/// // not the order they appear in the haystack.

/// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches);

/// ```

///

/// # Performance

///

/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,

/// search takes `O(m * n)` time, where `m` is proportional to the size of the

/// regex set and `n` is proportional to the length of the haystack.

///

/// # Trait implementations

///

/// The `Default` trait is implemented for `RegexSet`. The default value

/// is an empty set. An empty set can also be explicitly constructed via

/// [`RegexSet::empty`].

///

/// # Example

///

/// This shows how the above two regexes (for matching email addresses and

/// domains) might work:

///

/// ```

/// use regex::bytes::RegexSet;

///

/// let set = RegexSet::new(&[

///     r"[a-z]+@[a-z]+\.(com|org|net)",

///     r"[a-z]+\.(com|org|net)",

/// ]).unwrap();

///

/// // Ask whether any regexes in the set match.

/// assert!(set.is_match(b"foo@example.com"));

///

/// // Identify which regexes in the set match.

/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();

/// assert_eq!(vec![0, 1], matches);

///

/// // Try again, but with a haystack that only matches one of the regexes.

/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();

/// assert_eq!(vec![1], matches);

///

/// // Try again, but with a haystack that doesn't match any regex in the set.

/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();

/// assert!(matches.is_empty());

/// ```

///

/// Note that it would be possible to adapt the above example to using `Regex`

/// with an expression like:

///

/// ```text

/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))

/// ```

///

/// After a match, one could then inspect the capture groups to figure out

/// which alternates matched. The problem is that it is hard to make this

/// approach scale when there are many regexes since the overlap between each

/// alternate isn't always obvious to reason about.

#[derive(Clone)]

pub struct RegexSet {

    pub(crate) meta: meta::Regex,

    pub(crate) patterns: alloc::sync::Arc<[String]>,

impl RegexSet {

    /// Create a new regex set with the given regular expressions.

///

    /// This takes an iterator of `S`, where `S` is something that can produce

    /// a `&str`. If any of the strings in the iterator are not valid regular

    /// expressions, then an error is returned.

///

    /// # Example

///

    /// Create a new regex set from an iterator of strings:

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();

    /// assert!(set.is_match(b"foo"));

    /// ```

    pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>

    where

        S: AsRef<str>,

        I: IntoIterator<Item = S>,

        RegexSetBuilder::new(exprs).build()

    /// Create a new empty regex set.

///

    /// An empty regex never matches anything.

///

    /// This is a convenience function for `RegexSet::new([])`, but doesn't

    /// require one to specify the type of the input.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::empty();

    /// assert!(set.is_empty());

    /// // an empty set matches nothing

    /// assert!(!set.is_match(b""));

    /// ```

    pub fn empty() -> RegexSet {

        let empty: [&str; 0] = [];

        RegexSetBuilder::new(empty).build().unwrap()

    /// Returns true if and only if one of the regexes in this set matches

    /// the haystack given.

///

    /// This method should be preferred if you only need to test whether any

    /// of the regexes in the set should match, but don't care about *which*

    /// regexes matched. This is because the underlying matching engine will

    /// quit immediately after seeing the first match instead of continuing to

    /// find all matches.

///

    /// Note that as with searches using [`Regex`](crate::bytes::Regex), the

    /// expression is unanchored by default. That is, if the regex does not

    /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted

    /// to match anywhere in the haystack.

///

    /// # Example

///

    /// Tests whether a set matches somewhere in a haystack:

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();

    /// assert!(set.is_match(b"foo"));

    /// assert!(!set.is_match("☃".as_bytes()));

    /// ```

    #[inline]

    pub fn is_match(&self, haystack: &[u8]) -> bool {

        self.is_match_at(haystack, 0)

    /// Returns true if and only if one of the regexes in this set matches the

    /// haystack given, with the search starting at the offset given.

///

    /// The significance of the starting point is that it takes the surrounding

    /// context into consideration. For example, the `\A` anchor can only

    /// match when `start == 0`.

///

    /// # Panics

///

    /// This panics when `start >= haystack.len() + 1`.

///

    /// # Example

///

    /// This example shows the significance of `start`. Namely, consider a

    /// haystack `foobar` and a desire to execute a search starting at offset

    /// `3`. You could search a substring explicitly, but then the look-around

    /// assertions won't work correctly. Instead, you can use this method to

    /// specify the start position of a search.

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();

    /// let hay = b"foobar";

    /// // We get a match here, but it's probably not intended.

    /// assert!(set.is_match(&hay[3..]));

    /// // No match because the  assertions take the context into account.

    /// assert!(!set.is_match_at(hay, 3));

    /// ```

    #[inline]

    pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {

        self.meta.is_match(Input::new(haystack).span(start..haystack.len()))

    /// Returns the set of regexes that match in the given haystack.

///

    /// The set returned contains the index of each regex that matches in

    /// the given haystack. The index is in correspondence with the order of

    /// regular expressions given to `RegexSet`'s constructor.

///

    /// The set can also be used to iterate over the matched indices. The order

    /// of iteration is always ascending with respect to the matching indices.

///

    /// Note that as with searches using [`Regex`](crate::bytes::Regex), the

    /// expression is unanchored by default. That is, if the regex does not

    /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted

    /// to match anywhere in the haystack.

///

    /// # Example

///

    /// Tests which regular expressions match the given haystack:

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([

    ///     r"\w+",

    ///     r"\d+",

    ///     r"\pL+",

    ///     r"foo",

    ///     r"bar",

    ///     r"barfoo",

    ///     r"foobar",

    /// ]).unwrap();

    /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect();

    /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);

///

    /// // You can also test whether a particular regex matched:

    /// let matches = set.matches(b"foobar");

    /// assert!(!matches.matched(5));

    /// assert!(matches.matched(6));

    /// ```

    #[inline]

    pub fn matches(&self, haystack: &[u8]) -> SetMatches {

        self.matches_at(haystack, 0)

    /// Returns the set of regexes that match in the given haystack.

///

    /// The set returned contains the index of each regex that matches in

    /// the given haystack. The index is in correspondence with the order of

    /// regular expressions given to `RegexSet`'s constructor.

///

    /// The set can also be used to iterate over the matched indices. The order

    /// of iteration is always ascending with respect to the matching indices.

///

    /// The significance of the starting point is that it takes the surrounding

    /// context into consideration. For example, the `\A` anchor can only

    /// match when `start == 0`.

///

    /// # Panics

///

    /// This panics when `start >= haystack.len() + 1`.

///

    /// # Example

///

    /// Tests which regular expressions match the given haystack:

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();

    /// let hay = b"foobar";

    /// // We get matches here, but it's probably not intended.

    /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();

    /// assert_eq!(matches, vec![0, 1]);

    /// // No matches because the  assertions take the context into account.

    /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();

    /// assert_eq!(matches, vec![]);

    /// ```

    #[inline]

    pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches {

        let input = Input::new(haystack).span(start..haystack.len());

        let mut patset = PatternSet::new(self.meta.pattern_len());

        self.meta.which_overlapping_matches(&input, &mut patset);

        SetMatches(patset)

    /// Returns the same as matches, but starts the search at the given

    /// offset and stores the matches into the slice given.

///

    /// The significance of the starting point is that it takes the surrounding

    /// context into consideration. For example, the `\A` anchor can only

    /// match when `start == 0`.

///

    /// `matches` must have a length that is at least the number of regexes

    /// in this set.

///

    /// This method returns true if and only if at least one member of

    /// `matches` is true after executing the set against `haystack`.

    #[doc(hidden)]

    #[inline]

    pub fn matches_read_at(

        &self,

        matches: &mut [bool],

        haystack: &[u8],

        start: usize,

    ) -> bool {

        // This is pretty dumb. We should try to fix this, but the

        // regex-automata API doesn't provide a way to store matches in an

        // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and

        // thus not public... But regex-capi currently uses it. We should

        // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet

        // is in regex-automata, not regex. So maybe we should just accept a

        // 'SetMatches', which is basically just a newtype around PatternSet.

        let mut patset = PatternSet::new(self.meta.pattern_len());

        let mut input = Input::new(haystack);

        input.set_start(start);

        self.meta.which_overlapping_matches(&input, &mut patset);

        for pid in patset.iter() {

            matches[pid] = true;

        !patset.is_empty()

    /// An alias for `matches_read_at` to preserve backward compatibility.

///

    /// The `regex-capi` crate used this method, so to avoid breaking that

    /// crate, we continue to export it as an undocumented API.

    #[doc(hidden)]

    #[inline]

    pub fn read_matches_at(

        &self,

        matches: &mut [bool],

        haystack: &[u8],

        start: usize,

    ) -> bool {

        self.matches_read_at(matches, haystack, start)

    /// Returns the total number of regexes in this set.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// assert_eq!(0, RegexSet::empty().len());

    /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());

    /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());

    /// ```

    #[inline]

    pub fn len(&self) -> usize {

        self.meta.pattern_len()

    /// Returns `true` if this set contains no regexes.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// assert!(RegexSet::empty().is_empty());

    /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());

    /// ```

    #[inline]

    pub fn is_empty(&self) -> bool {

        self.meta.pattern_len() == 0

    /// Returns the regex patterns that this regex set was constructed from.

///

    /// This function can be used to determine the pattern for a match. The

    /// slice returned has exactly as many patterns givens to this regex set,

    /// and the order of the slice is the same as the order of the patterns

    /// provided to the set.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new(&[

    ///     r"\w+",

    ///     r"\d+",

    ///     r"\pL+",

    ///     r"foo",

    ///     r"bar",

    ///     r"barfoo",

    ///     r"foobar",

    /// ]).unwrap();

    /// let matches: Vec<_> = set

    ///     .matches(b"foobar")

    ///     .into_iter()

    ///     .map(|index| &set.patterns()[index])

    ///     .collect();

    /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);

    /// ```

    #[inline]

    pub fn patterns(&self) -> &[String] {

        &self.patterns

impl Default for RegexSet {

    fn default() -> Self {

        RegexSet::empty()

/// A set of matches returned by a regex set.
///
/// Values of this type are constructed by [`RegexSet::matches`] and
/// [`RegexSet::matches_at`]. It is a newtype wrapper around a
/// `regex_automata::PatternSet`.
#[derive(Clone, Debug)]
pub struct SetMatches(PatternSet);

impl SetMatches {

    /// Whether this set contains any matches.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new(&[

    ///     r"[a-z]+@[a-z]+\.(com|org|net)",

    ///     r"[a-z]+\.(com|org|net)",

    /// ]).unwrap();

    /// let matches = set.matches(b"foo@example.com");

    /// assert!(matches.matched_any());

    /// ```

    #[inline]

    pub fn matched_any(&self) -> bool {

        !self.0.is_empty()

    /// Whether the regex at the given index matched.

///

    /// The index for a regex is determined by its insertion order upon the

    /// initial construction of a `RegexSet`, starting at `0`.

///

    /// # Panics

///

    /// If `index` is greater than or equal to the number of regexes in the

    /// original set that produced these matches. Equivalently, when `index`

    /// is greater than or equal to [`SetMatches::len`].

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([

    ///     r"[a-z]+@[a-z]+\.(com|org|net)",

    ///     r"[a-z]+\.(com|org|net)",

    /// ]).unwrap();

    /// let matches = set.matches(b"example.com");

    /// assert!(!matches.matched(0));

    /// assert!(matches.matched(1));

    /// ```

    #[inline]

    pub fn matched(&self, index: usize) -> bool {

        self.0.contains(PatternID::new_unchecked(index))

    /// The total number of regexes in the set that created these matches.

///

    /// **WARNING:** This always returns the same value as [`RegexSet::len`].

    /// In particular, it does *not* return the number of elements yielded by

    /// [`SetMatches::iter`]. The only way to determine the total number of

    /// matched regexes is to iterate over them.

///

    /// # Example

///

    /// Notice that this method returns the total number of regexes in the

    /// original set, and *not* the total number of regexes that matched.

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([

    ///     r"[a-z]+@[a-z]+\.(com|org|net)",

    ///     r"[a-z]+\.(com|org|net)",

    /// ]).unwrap();

    /// let matches = set.matches(b"example.com");

    /// // Total number of patterns that matched.

    /// assert_eq!(1, matches.iter().count());

    /// // Total number of patterns in the set.

    /// assert_eq!(2, matches.len());

    /// ```

    #[inline]

    pub fn len(&self) -> usize {

        self.0.capacity()

    /// Returns an iterator over the indices of the regexes that matched.

///

    /// This will always produces matches in ascending order, where the index

    /// yielded corresponds to the index of the regex that matched with respect

    /// to its position when initially building the set.

///

    /// # Example

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([

    ///     r"[0-9]",

    ///     r"[a-z]",

    ///     r"[A-Z]",

    ///     r"\p{Greek}",

    /// ]).unwrap();

    /// let hay = "βa1".as_bytes();

    /// let matches: Vec<_> = set.matches(hay).iter().collect();

    /// assert_eq!(matches, vec![0, 1, 3]);

    /// ```

///

    /// Note that `SetMatches` also implemnets the `IntoIterator` trait, so

    /// this method is not always needed. For example:

///

    /// ```

    /// use regex::bytes::RegexSet;

///

    /// let set = RegexSet::new([

    ///     r"[0-9]",

    ///     r"[a-z]",

    ///     r"[A-Z]",

    ///     r"\p{Greek}",

    /// ]).unwrap();

    /// let hay = "βa1".as_bytes();

    /// let mut matches = vec![];

    /// for index in set.matches(hay) {

    ///     matches.push(index);

    /// }

    /// assert_eq!(matches, vec![0, 1, 3]);

    /// ```

    #[inline]

    pub fn iter(&self) -> SetMatchesIter<'_> {

        SetMatchesIter(self.0.iter())

impl IntoIterator for SetMatches {

    type IntoIter = SetMatchesIntoIter;

    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {

        let it = 0..self.0.capacity();

        SetMatchesIntoIter { patset: self.0, it }

impl<'a> IntoIterator for &'a SetMatches {

    type IntoIter = SetMatchesIter<'a>;

    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {

        self.iter()

/// An owned iterator over the set of matches from a regex set.

///

/// This will always produces matches in ascending order of index, where the

/// index corresponds to the index of the regex that matched with respect to

/// its position when initially building the set.

///

/// This iterator is created by calling `SetMatches::into_iter` via the

/// `IntoIterator` trait. This is automatically done in `for` loops.

///

/// # Example

///

/// ```

/// use regex::bytes::RegexSet;

///

/// let set = RegexSet::new([

///     r"[0-9]",

///     r"[a-z]",

///     r"[A-Z]",

///     r"\p{Greek}",

/// ]).unwrap();

/// let hay = "βa1".as_bytes();

/// let mut matches = vec![];

/// for index in set.matches(hay) {

///     matches.push(index);

/// }

/// assert_eq!(matches, vec![0, 1, 3]);

/// ```

#[derive(Debug)]

pub struct SetMatchesIntoIter {

    patset: PatternSet,

    it: core::ops::Range<usize>,

impl Iterator for SetMatchesIntoIter {

    type Item = usize;

    fn next(&mut self) -> Option<usize> {

        loop {

            let id = self.it.next()?;

            if self.patset.contains(PatternID::new_unchecked(id)) {

                return Some(id);

    fn size_hint(&self) -> (usize, Option<usize>) {

        self.it.size_hint()

impl DoubleEndedIterator for SetMatchesIntoIter {

    fn next_back(&mut self) -> Option<usize> {

        loop {

            let id = self.it.next_back()?;

            if self.patset.contains(PatternID::new_unchecked(id)) {

                return Some(id);

// `next` and `next_back` return `None` only when the candidate index range is
// exhausted, and an exhausted range stays exhausted, so this iterator keeps
// returning `None` after the first `None`.
impl core::iter::FusedIterator for SetMatchesIntoIter {}

/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
/// created this iterator.
///
/// This will always produce matches in ascending order, where the index
/// corresponds to the index of the regex that matched with respect to its
/// position when initially building the set.
///
/// This iterator is created by the [`SetMatches::iter`] method. It is a
/// newtype wrapper around a `regex_automata::PatternSetIter`.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(PatternSetIter<'a>);

impl<'a> Iterator for SetMatchesIter<'a> {

    type Item = usize;

    fn next(&mut self) -> Option<usize> {

        self.0.next().map(|pid| pid.as_usize())

    fn size_hint(&self) -> (usize, Option<usize>) {

        self.0.size_hint()

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {

    fn next_back(&mut self) -> Option<usize> {

        self.0.next_back().map(|pid| pid.as_usize())

// Marker impl: this iterator only forwards to the wrapped `PatternSetIter`.
// NOTE(review): this presumes `PatternSetIter` keeps returning `None` once
// exhausted — confirm against regex-automata.
impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}

impl core::fmt::Debug for RegexSet {

    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {

        write!(f, "RegexSet({:?})", self.patterns())