Revision control

Copy as Markdown

Other Tools

use core::{char, cmp, fmt::Debug, slice};
use alloc::vec::Vec;
use crate::unicode;
// This module contains an *internal* implementation of interval sets.
//
// The primary invariant that interval sets guards is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
//
// Since case folding (as implemented below) breaks that invariant, we roll
// that into this API even though it is a little out of place in an otherwise
// generic interval set. (Hence the reason why the `unicode` module is imported
// here.)
//
// Some of the implementation complexity here is a result of me wanting to
// preserve the sequential representation without using additional memory.
// In many cases, we do use linear extra memory, but it is at most 2x and it
// is amortized. If we relaxed the memory requirements, this implementation
// could become much simpler. The extra memory is honestly probably OK, but
// character classes (especially of the Unicode variety) can become quite
// large, and it would be nice to keep regex compilation snappy even in debug
// builds. (In the past, I have been careless with this area of code and it has
// caused slow regex compilations in debug mode, so this isn't entirely
// unwarranted.)
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
/// A sorted set of non-overlapping ranges.
ranges: Vec<I>,
/// While not required at all for correctness, we keep track of whether an
/// interval set has been case folded or not. This helps us avoid doing
/// redundant work if, for example, a set has already been cased folded.
/// And note that whether a set is folded or not is preserved through
/// all of the pairwise set operations. That is, if both interval sets
/// have been case folded, then any of difference, union, intersection or
/// symmetric difference all produce a case folded set.
///
/// Note that when this is true, it *must* be the case that the set is case
/// folded. But when it's false, the set *may* be case folded. In other
/// words, we only set this to true when we know it to be case, but we're
/// okay with it being false if it would otherwise be costly to determine
/// whether it should be true. This means code cannot assume that a false
/// value necessarily indicates that the set is not case folded.
///
/// Bottom line: this is a performance optimization.
folded: bool,
}
impl<I: Interval> Eq for IntervalSet<I> {}
// We implement PartialEq manually so that we don't consider the set's internal
// 'folded' property to be part of its identity. The 'folded' property is
// strictly an optimization.
impl<I: Interval> PartialEq for IntervalSet<I> {
fn eq(&self, other: &IntervalSet<I>) -> bool {
self.ranges.eq(&other.ranges)
}
}
impl<I: Interval> IntervalSet<I> {
/// Create a new set from a sequence of intervals. Each interval is
/// specified as a pair of bounds, where both bounds are inclusive.
///
/// The given ranges do not need to be in any specific order, and ranges
/// may overlap.
pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
let ranges: Vec<I> = intervals.into_iter().collect();
// An empty set is case folded.
let folded = ranges.is_empty();
let mut set = IntervalSet { ranges, folded };
set.canonicalize();
set
}
/// Add a new interval to this set.
pub fn push(&mut self, interval: I) {
// TODO: This could be faster. e.g., Push the interval such that
// it preserves canonicalization.
self.ranges.push(interval);
self.canonicalize();
// We don't know whether the new interval added here is considered
// case folded, so we conservatively assume that the entire set is
// no longer case folded if it was previously.
self.folded = false;
}
/// Return an iterator over all intervals in this set.
///
/// The iterator yields intervals in ascending order.
pub fn iter(&self) -> IntervalSetIter<'_, I> {
IntervalSetIter(self.ranges.iter())
}
/// Return an immutable slice of intervals in this set.
///
/// The sequence returned is in canonical ordering.
pub fn intervals(&self) -> &[I] {
&self.ranges
}
/// Expand this interval set such that it contains all case folded
/// characters. For example, if this class consists of the range `a-z`,
/// then applying case folding will result in the class containing both the
/// ranges `a-z` and `A-Z`.
///
/// This returns an error if the necessary case mapping data is not
/// available.
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
if self.folded {
return Ok(());
}
let len = self.ranges.len();
for i in 0..len {
let range = self.ranges[i];
if let Err(err) = range.case_fold_simple(&mut self.ranges) {
self.canonicalize();
return Err(err);
}
}
self.canonicalize();
self.folded = true;
Ok(())
}
/// Union this set with the given set, in place.
pub fn union(&mut self, other: &IntervalSet<I>) {
if other.ranges.is_empty() || self.ranges == other.ranges {
return;
}
// This could almost certainly be done more efficiently.
self.ranges.extend(&other.ranges);
self.canonicalize();
self.folded = self.folded && other.folded;
}
/// Intersect this set with the given set, in place.
pub fn intersect(&mut self, other: &IntervalSet<I>) {
if self.ranges.is_empty() {
return;
}
if other.ranges.is_empty() {
self.ranges.clear();
// An empty set is case folded.
self.folded = true;
return;
}
// There should be a way to do this in-place with constant memory,
// but I couldn't figure out a simple way to do it. So just append
// the intersection to the end of this range, and then drain it before
// we're done.
let drain_end = self.ranges.len();
let mut ita = 0..drain_end;
let mut itb = 0..other.ranges.len();
let mut a = ita.next().unwrap();
let mut b = itb.next().unwrap();
loop {
if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
self.ranges.push(ab);
}
let (it, aorb) =
if self.ranges[a].upper() < other.ranges[b].upper() {
(&mut ita, &mut a)
} else {
(&mut itb, &mut b)
};
match it.next() {
Some(v) => *aorb = v,
None => break,
}
}
self.ranges.drain(..drain_end);
self.folded = self.folded && other.folded;
}
/// Subtract the given set from this set, in place.
pub fn difference(&mut self, other: &IntervalSet<I>) {
if self.ranges.is_empty() || other.ranges.is_empty() {
return;
}
// This algorithm is (to me) surprisingly complex. A search of the
// interwebs indicate that this is a potentially interesting problem.
// Folks seem to suggest interval or segment trees, but I'd like to
// avoid the overhead (both runtime and conceptual) of that.
//
// The following is basically my Shitty First Draft. Therefore, in
// order to grok it, you probably need to read each line carefully.
// Simplifications are most welcome!
//
// Remember, we can assume the canonical format invariant here, which
// says that all ranges are sorted, not overlapping and not adjacent in
// each class.
let drain_end = self.ranges.len();
let (mut a, mut b) = (0, 0);
'LOOP: while a < drain_end && b < other.ranges.len() {
// Basically, the easy cases are when neither range overlaps with
// each other. If the `b` range is less than our current `a`
// range, then we can skip it and move on.
if other.ranges[b].upper() < self.ranges[a].lower() {
b += 1;
continue;
}
// ... similarly for the `a` range. If it's less than the smallest
// `b` range, then we can add it as-is.
if self.ranges[a].upper() < other.ranges[b].lower() {
let range = self.ranges[a];
self.ranges.push(range);
a += 1;
continue;
}
// Otherwise, we have overlapping ranges.
assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
// This part is tricky and was non-obvious to me without looking
// at explicit examples (see the tests). The trickiness stems from
// two things: 1) subtracting a range from another range could
// yield two ranges and 2) after subtracting a range, it's possible
// that future ranges can have an impact. The loop below advances
// the `b` ranges until they can't possible impact the current
// range.
//
// For example, if our `a` range is `a-t` and our next three `b`
// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
// subtraction three times before moving on to the next `a` range.
let mut range = self.ranges[a];
while b < other.ranges.len()
&& !range.is_intersection_empty(&other.ranges[b])
{
let old_range = range;
range = match range.difference(&other.ranges[b]) {
(None, None) => {
// We lost the entire range, so move on to the next
// without adding this one.
a += 1;
continue 'LOOP;
}
(Some(range1), None) | (None, Some(range1)) => range1,
(Some(range1), Some(range2)) => {
self.ranges.push(range1);
range2
}
};
// It's possible that the `b` range has more to contribute
// here. In particular, if it is greater than the original
// range, then it might impact the next `a` range *and* it
// has impacted the current `a` range as much as possible,
// so we can quit. We don't bump `b` so that the next `a`
// range can apply it.
if other.ranges[b].upper() > old_range.upper() {
break;
}
// Otherwise, the next `b` range might apply to the current
// `a` range.
b += 1;
}
self.ranges.push(range);
a += 1;
}
while a < drain_end {
let range = self.ranges[a];
self.ranges.push(range);
a += 1;
}
self.ranges.drain(..drain_end);
self.folded = self.folded && other.folded;
}
/// Compute the symmetric difference of the two sets, in place.
///
/// This computes the symmetric difference of two interval sets. This
/// removes all elements in this set that are also in the given set,
/// but also adds all elements from the given set that aren't in this
/// set. That is, the set will contain all elements in either set,
/// but will not contain any elements that are in both sets.
pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
// TODO(burntsushi): Fix this so that it amortizes allocation.
let mut intersection = self.clone();
intersection.intersect(other);
self.union(other);
self.difference(&intersection);
}
/// Negate this interval set.
///
/// For all `x` where `x` is any element, if `x` was in this set, then it
/// will not be in this set after negation.
pub fn negate(&mut self) {
if self.ranges.is_empty() {
let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
self.ranges.push(I::create(min, max));
// The set containing everything must case folded.
self.folded = true;
return;
}
// There should be a way to do this in-place with constant memory,
// but I couldn't figure out a simple way to do it. So just append
// the negation to the end of this range, and then drain it before
// we're done.
let drain_end = self.ranges.len();
// We do checked arithmetic below because of the canonical ordering
// invariant.
if self.ranges[0].lower() > I::Bound::min_value() {
let upper = self.ranges[0].lower().decrement();
self.ranges.push(I::create(I::Bound::min_value(), upper));
}
for i in 1..drain_end {
let lower = self.ranges[i - 1].upper().increment();
let upper = self.ranges[i].lower().decrement();
self.ranges.push(I::create(lower, upper));
}
if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
let lower = self.ranges[drain_end - 1].upper().increment();
self.ranges.push(I::create(lower, I::Bound::max_value()));
}
self.ranges.drain(..drain_end);
// We don't need to update whether this set is folded or not, because
// it is conservatively preserved through negation. Namely, if a set
// is not folded, then it is possible that its negation is folded, for
// example, [^☃]. But we're fine with assuming that the set is not
// folded in that case. (`folded` permits false negatives but not false
// positives.)
//
// But what about when a set is folded, is its negation also
// necessarily folded? Yes. Because if a set is folded, then for every
// character in the set, it necessarily included its equivalence class
// of case folded characters. Negating it in turn means that all
// equivalence classes in the set are negated, and any equivalence
// class that was previously not in the set is now entirely in the set.
}
/// Converts this set into a canonical ordering.
fn canonicalize(&mut self) {
if self.is_canonical() {
return;
}
self.ranges.sort();
assert!(!self.ranges.is_empty());
// Is there a way to do this in-place with constant memory? I couldn't
// figure out a way to do it. So just append the canonicalization to
// the end of this range, and then drain it before we're done.
let drain_end = self.ranges.len();
for oldi in 0..drain_end {
// If we've added at least one new range, then check if we can
// merge this range in the previously added range.
if self.ranges.len() > drain_end {
let (last, rest) = self.ranges.split_last_mut().unwrap();
if let Some(union) = last.union(&rest[oldi]) {
*last = union;
continue;
}
}
let range = self.ranges[oldi];
self.ranges.push(range);
}
self.ranges.drain(..drain_end);
}
/// Returns true if and only if this class is in a canonical ordering.
fn is_canonical(&self) -> bool {
for pair in self.ranges.windows(2) {
if pair[0] >= pair[1] {
return false;
}
if pair[0].is_contiguous(&pair[1]) {
return false;
}
}
true
}
}
/// An iterator over intervals.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
impl<'a, I> Iterator for IntervalSetIter<'a, I> {
type Item = &'a I;
fn next(&mut self) -> Option<&'a I> {
self.0.next()
}
}
pub trait Interval:
Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
{
type Bound: Bound;
fn lower(&self) -> Self::Bound;
fn upper(&self) -> Self::Bound;
fn set_lower(&mut self, bound: Self::Bound);
fn set_upper(&mut self, bound: Self::Bound);
fn case_fold_simple(
&self,
intervals: &mut Vec<Self>,
) -> Result<(), unicode::CaseFoldError>;
/// Create a new interval.
fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
let mut int = Self::default();
if lower <= upper {
int.set_lower(lower);
int.set_upper(upper);
} else {
int.set_lower(upper);
int.set_upper(lower);
}
int
}
/// Union the given overlapping range into this range.
///
/// If the two ranges aren't contiguous, then this returns `None`.
fn union(&self, other: &Self) -> Option<Self> {
if !self.is_contiguous(other) {
return None;
}
let lower = cmp::min(self.lower(), other.lower());
let upper = cmp::max(self.upper(), other.upper());
Some(Self::create(lower, upper))
}
/// Intersect this range with the given range and return the result.
///
/// If the intersection is empty, then this returns `None`.
fn intersect(&self, other: &Self) -> Option<Self> {
let lower = cmp::max(self.lower(), other.lower());
let upper = cmp::min(self.upper(), other.upper());
if lower <= upper {
Some(Self::create(lower, upper))
} else {
None
}
}
/// Subtract the given range from this range and return the resulting
/// ranges.
///
/// If subtraction would result in an empty range, then no ranges are
/// returned.
fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
if self.is_subset(other) {
return (None, None);
}
if self.is_intersection_empty(other) {
return (Some(self.clone()), None);
}
let add_lower = other.lower() > self.lower();
let add_upper = other.upper() < self.upper();
// We know this because !self.is_subset(other) and the ranges have
// a non-empty intersection.
assert!(add_lower || add_upper);
let mut ret = (None, None);
if add_lower {
let upper = other.lower().decrement();
ret.0 = Some(Self::create(self.lower(), upper));
}
if add_upper {
let lower = other.upper().increment();
let range = Self::create(lower, self.upper());
if ret.0.is_none() {
ret.0 = Some(range);
} else {
ret.1 = Some(range);
}
}
ret
}
/// Compute the symmetric difference the given range from this range. This
/// returns the union of the two ranges minus its intersection.
fn symmetric_difference(
&self,
other: &Self,
) -> (Option<Self>, Option<Self>) {
let union = match self.union(other) {
None => return (Some(self.clone()), Some(other.clone())),
Some(union) => union,
};
let intersection = match self.intersect(other) {
None => return (Some(self.clone()), Some(other.clone())),
Some(intersection) => intersection,
};
union.difference(&intersection)
}
/// Returns true if and only if the two ranges are contiguous. Two ranges
/// are contiguous if and only if the ranges are either overlapping or
/// adjacent.
fn is_contiguous(&self, other: &Self) -> bool {
let lower1 = self.lower().as_u32();
let upper1 = self.upper().as_u32();
let lower2 = other.lower().as_u32();
let upper2 = other.upper().as_u32();
cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
}
/// Returns true if and only if the intersection of this range and the
/// other range is empty.
fn is_intersection_empty(&self, other: &Self) -> bool {
let (lower1, upper1) = (self.lower(), self.upper());
let (lower2, upper2) = (other.lower(), other.upper());
cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
}
/// Returns true if and only if this range is a subset of the other range.
fn is_subset(&self, other: &Self) -> bool {
let (lower1, upper1) = (self.lower(), self.upper());
let (lower2, upper2) = (other.lower(), other.upper());
(lower2 <= lower1 && lower1 <= upper2)
&& (lower2 <= upper1 && upper1 <= upper2)
}
}
pub trait Bound:
Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
fn min_value() -> Self;
fn max_value() -> Self;
fn as_u32(self) -> u32;
fn increment(self) -> Self;
fn decrement(self) -> Self;
}
impl Bound for u8 {
fn min_value() -> Self {
u8::MIN
}
fn max_value() -> Self {
u8::MAX
}
fn as_u32(self) -> u32 {
u32::from(self)
}
fn increment(self) -> Self {
self.checked_add(1).unwrap()
}
fn decrement(self) -> Self {
self.checked_sub(1).unwrap()
}
}
impl Bound for char {
fn min_value() -> Self {
'\x00'
}
fn max_value() -> Self {
'\u{10FFFF}'
}
fn as_u32(self) -> u32 {
u32::from(self)
}
fn increment(self) -> Self {
match self {
'\u{D7FF}' => '\u{E000}',
c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
}
}
fn decrement(self) -> Self {
match self {
'\u{E000}' => '\u{D7FF}',
c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
}
}
}
// Tests for interval sets are written in src/hir.rs against the public API.