summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/regexset/bytes.rs
blob: 1220a14662ca09dfc187601ddc2f00aca8957205 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
use alloc::string::String;

use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};

use crate::{bytes::RegexSetBuilder, Error};

/// Match multiple, possibly overlapping, regexes in a single search.
///
/// A regex set corresponds to the union of zero or more regular expressions.
/// That is, a regex set will match a haystack when at least one of its
/// constituent regexes matches. A regex set as it's formulated here provides a
/// touch more power: it will also report *which* regular expressions in the
/// set match. Indeed, this is the key difference between regex sets and a
/// single `Regex` with many alternates, since only one alternate can match at
/// a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the haystack
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the haystack. The key advantage of using a regex set is
/// that it will report the matching regexes using a *single pass through the
/// haystack*. If one has hundreds or thousands of regexes to match repeatedly
/// (like a URL router for a complex web application or a user agent matcher),
/// then a regex set *can* realize huge performance gains.
///
/// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet`
/// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this
/// `RegexSet` is permitted to match invalid UTF-8.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask
/// (1) instead of (2) since the matching engines can stop after the first
/// match is found.
///
/// You cannot directly extract [`Match`][crate::bytes::Match] or
/// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need
/// these operations, the recommended approach is to compile each pattern in
/// the set independently and scan the exact same haystack a second time with
/// those independently compiled patterns:
///
/// ```
/// use regex::bytes::{Regex, RegexSet};
///
/// let patterns = ["foo", "bar"];
/// // Both patterns will match different ranges of this string.
/// let hay = b"barfoo";
///
/// // Compile a set matching any of our patterns.
/// let set = RegexSet::new(patterns).unwrap();
/// // Compile each pattern independently.
/// let regexes: Vec<_> = set
///     .patterns()
///     .iter()
///     .map(|pat| Regex::new(pat).unwrap())
///     .collect();
///
/// // Match against the whole set first and identify the individual
/// // matching patterns.
/// let matches: Vec<&[u8]> = set
///     .matches(hay)
///     .into_iter()
///     // Dereference the match index to get the corresponding
///     // compiled pattern.
///     .map(|index| &regexes[index])
///     // To get match locations or any other info, we then have to search the
///     // exact same haystack again, using our separately-compiled pattern.
///     .map(|re| re.find(hay).unwrap().as_bytes())
///     .collect();
///
/// // Matches arrive in the order the constituent patterns were declared,
/// // not the order they appear in the haystack.
/// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches);
/// ```
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(m * n)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the haystack.
///
/// # Trait implementations
///
/// The `Default` trait is implemented for `RegexSet`. The default value
/// is an empty set. An empty set can also be explicitly constructed via
/// [`RegexSet::empty`].
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
/// ```
/// use regex::bytes::RegexSet;
///
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with a haystack that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with a haystack that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
#[derive(Clone)]
pub struct RegexSet {
    // The multi-pattern regex that every search method on this type
    // delegates to; it also reports how many patterns are in the set.
    pub(crate) meta: meta::Regex,
    // The pattern strings handed back by `RegexSet::patterns`.
    pub(crate) patterns: alloc::sync::Arc<[String]>,
}

impl RegexSet {
    /// Create a new regex set with the given regular expressions.
    ///
    /// This takes an iterator of `S`, where `S` is something that can produce
    /// a `&str`. If any of the strings in the iterator are not valid regular
    /// expressions, then an error is returned.
    ///
    /// # Example
    ///
    /// Create a new regex set from an iterator of strings:
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
    /// assert!(set.is_match(b"foo"));
    /// ```
    pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = S>,
    {
        // Construction (and any resulting error) is handled entirely by the
        // builder; no builder options are changed here.
        let builder = RegexSetBuilder::new(exprs);
        builder.build()
    }

    /// Create a new empty regex set.
    ///
    /// An empty regex set never matches anything.
    ///
    /// This is a convenience function for `RegexSet::new([])`, but doesn't
    /// require one to specify the type of the input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::empty();
    /// assert!(set.is_empty());
    /// // an empty set matches nothing
    /// assert!(!set.is_match(b""));
    /// ```
    pub fn empty() -> RegexSet {
        // The explicit element type is required: an empty array literal
        // gives the compiler nothing to infer the pattern type from.
        let no_patterns: [&str; 0] = [];
        let built = RegexSetBuilder::new(no_patterns).build();
        built.unwrap()
    }

    /// Returns true if and only if one of the regexes in this set matches
    /// the haystack given.
    ///
    /// This method should be preferred if you only need to test whether any
    /// of the regexes in the set should match, but don't care about *which*
    /// regexes matched. This is because the underlying matching engine will
    /// quit immediately after seeing the first match instead of continuing to
    /// find all matches.
    ///
    /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
    /// expression is unanchored by default. That is, if the regex does not
    /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
    /// to match anywhere in the haystack.
    ///
    /// # Example
    ///
    /// Tests whether a set matches somewhere in a haystack:
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
    /// assert!(set.is_match(b"foo"));
    /// assert!(!set.is_match("☃".as_bytes()));
    /// ```
    #[inline]
    pub fn is_match(&self, haystack: &[u8]) -> bool {
        // Searching the whole haystack is a search that starts at offset 0.
        let start = 0;
        self.is_match_at(haystack, start)
    }

    /// Returns true if and only if one of the regexes in this set matches the
    /// haystack given, with the search starting at the offset given.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// # Panics
    ///
    /// This panics when `start >= haystack.len() + 1`.
    ///
    /// # Example
    ///
    /// This example shows the significance of `start`. Namely, consider a
    /// haystack `foobar` and a desire to execute a search starting at offset
    /// `3`. You could search a substring explicitly, but then the look-around
    /// assertions won't work correctly. Instead, you can use this method to
    /// specify the start position of a search.
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
    /// let hay = b"foobar";
    /// // We get a match here, but it's probably not intended.
    /// assert!(set.is_match(&hay[3..]));
    /// // No match because the assertions take the context into account.
    /// assert!(!set.is_match_at(hay, 3));
    /// ```
    #[inline]
    pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
        // The input keeps the full haystack and only narrows the span that
        // is searched, rather than slicing the haystack itself.
        let span = start..haystack.len();
        let input = Input::new(haystack).span(span);
        self.meta.is_match(input)
    }

    /// Returns the set of regexes that match in the given haystack.
    ///
    /// The set returned contains the index of each regex that matches in
    /// the given haystack. The index is in correspondence with the order of
    /// regular expressions given to `RegexSet`'s constructor.
    ///
    /// The set can also be used to iterate over the matched indices. The order
    /// of iteration is always ascending with respect to the matching indices.
    ///
    /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
    /// expression is unanchored by default. That is, if the regex does not
    /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
    /// to match anywhere in the haystack.
    ///
    /// # Example
    ///
    /// Tests which regular expressions match the given haystack:
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([
    ///     r"\w+",
    ///     r"\d+",
    ///     r"\pL+",
    ///     r"foo",
    ///     r"bar",
    ///     r"barfoo",
    ///     r"foobar",
    /// ]).unwrap();
    /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect();
    /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
    ///
    /// // You can also test whether a particular regex matched:
    /// let matches = set.matches(b"foobar");
    /// assert!(!matches.matched(5));
    /// assert!(matches.matched(6));
    /// ```
    #[inline]
    pub fn matches(&self, haystack: &[u8]) -> SetMatches {
        // Equivalent to `matches_at` with the search starting at offset 0.
        let start = 0;
        self.matches_at(haystack, start)
    }

    /// Returns the set of regexes that match in the given haystack.
    ///
    /// The set returned contains the index of each regex that matches in
    /// the given haystack. The index is in correspondence with the order of
    /// regular expressions given to `RegexSet`'s constructor.
    ///
    /// The set can also be used to iterate over the matched indices. The order
    /// of iteration is always ascending with respect to the matching indices.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// # Panics
    ///
    /// This panics when `start >= haystack.len() + 1`.
    ///
    /// # Example
    ///
    /// Tests which regular expressions match the given haystack:
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
    /// let hay = b"foobar";
    /// // We get matches here, but it's probably not intended.
    /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();
    /// assert_eq!(matches, vec![0, 1]);
    /// // No matches because the assertions take the context into account.
    /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
    /// assert_eq!(matches, vec![]);
    /// ```
    #[inline]
    pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches {
        // Search the full haystack, restricted to the span beginning at
        // `start`.
        let span = start..haystack.len();
        let input = Input::new(haystack).span(span);
        // The result set is sized to the number of patterns in this set.
        let mut matched = PatternSet::new(self.meta.pattern_len());
        self.meta.which_overlapping_matches(&input, &mut matched);
        SetMatches(matched)
    }

    /// Returns the same as matches, but starts the search at the given
    /// offset and stores the matches into the slice given.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// `matches` must have a length that is at least the number of regexes
    /// in this set.
    ///
    /// This method returns true if and only if at least one member of
    /// `matches` is true after executing the set against `haystack`.
    #[doc(hidden)]
    #[inline]
    pub fn matches_read_at(
        &self,
        matches: &mut [bool],
        haystack: &[u8],
        start: usize,
    ) -> bool {
        // Going through a temporary PatternSet is wasteful, but the
        // regex-automata API offers no way to record matches directly into
        // an arbitrary &mut [bool]. This method is doc(hidden) and so not
        // part of the public API, although regex-capi currently calls it.
        // One possible fix is for regex-capi to accept a 'SetMatches'
        // (essentially a newtype around PatternSet) instead, since
        // PatternSet itself lives in regex-automata rather than regex.
        let mut found = PatternSet::new(self.meta.pattern_len());
        let mut input = Input::new(haystack);
        input.set_start(start);
        self.meta.which_overlapping_matches(&input, &mut found);
        let any_matched = !found.is_empty();
        // Only the slots of matching patterns are written; all other slots
        // keep whatever value the caller passed in.
        for pid in found.iter() {
            matches[pid] = true;
        }
        any_matched
    }

    /// An alias for `matches_read_at` to preserve backward compatibility.
    ///
    /// The `regex-capi` crate used this method, so to avoid breaking that
    /// crate, we continue to export it as an undocumented API.
    #[doc(hidden)]
    #[inline]
    pub fn read_matches_at(
        &self,
        matches: &mut [bool],
        haystack: &[u8],
        start: usize,
    ) -> bool {
        self.matches_read_at(matches, haystack, start)
    }

    /// Returns the total number of regexes in this set.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// assert_eq!(0, RegexSet::empty().len());
    /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
    /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        self.meta.pattern_len()
    }

    /// Returns `true` if this set contains no regexes.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// assert!(RegexSet::empty().is_empty());
    /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
    /// ```
    #[inline]
    pub fn is_empty(&self) -> bool {
        // A set is empty exactly when its pattern count is zero.
        let pattern_count = self.meta.pattern_len();
        pattern_count == 0
    }

    /// Returns the regex patterns that this regex set was constructed from.
    ///
    /// This function can be used to determine the pattern for a match. The
    /// slice returned has exactly as many patterns as were given to this
    /// regex set, and the order of the slice is the same as the order of the
    /// patterns provided to the set.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new(&[
    ///     r"\w+",
    ///     r"\d+",
    ///     r"\pL+",
    ///     r"foo",
    ///     r"bar",
    ///     r"barfoo",
    ///     r"foobar",
    /// ]).unwrap();
    /// let matches: Vec<_> = set
    ///     .matches(b"foobar")
    ///     .into_iter()
    ///     .map(|index| &set.patterns()[index])
    ///     .collect();
    /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
    /// ```
    #[inline]
    pub fn patterns(&self) -> &[String] {
        // Borrow the slice stored behind the `Arc`; nothing is cloned.
        let shared: &[String] = &self.patterns;
        shared
    }
}

impl Default for RegexSet {
    fn default() -> Self {
        RegexSet::empty()
    }
}

/// A set of matches returned by a regex set.
///
/// Values of this type are constructed by [`RegexSet::matches`] and
/// [`RegexSet::matches_at`].
// This is a newtype around the `PatternSet` filled in by the search.
#[derive(Clone, Debug)]
pub struct SetMatches(PatternSet);

impl SetMatches {
    /// Whether this set contains any matches.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new(&[
    ///     r"[a-z]+@[a-z]+\.(com|org|net)",
    ///     r"[a-z]+\.(com|org|net)",
    /// ]).unwrap();
    /// let matches = set.matches(b"foo@example.com");
    /// assert!(matches.matched_any());
    /// ```
    #[inline]
    pub fn matched_any(&self) -> bool {
        !self.0.is_empty()
    }

    /// Whether the regex at the given index matched.
    ///
    /// The index for a regex is determined by its insertion order upon the
    /// initial construction of a `RegexSet`, starting at `0`.
    ///
    /// # Panics
    ///
    /// If `index` is greater than or equal to the number of regexes in the
    /// original set that produced these matches. Equivalently, when `index`
    /// is greater than or equal to [`SetMatches::len`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([
    ///     r"[a-z]+@[a-z]+\.(com|org|net)",
    ///     r"[a-z]+\.(com|org|net)",
    /// ]).unwrap();
    /// let matches = set.matches(b"example.com");
    /// assert!(!matches.matched(0));
    /// assert!(matches.matched(1));
    /// ```
    #[inline]
    pub fn matched(&self, index: usize) -> bool {
        // Convert the caller's plain index into the pattern ID type that
        // the underlying set is queried with.
        let pid = PatternID::new_unchecked(index);
        self.0.contains(pid)
    }

    /// The total number of regexes in the set that created these matches.
    ///
    /// **WARNING:** This always returns the same value as [`RegexSet::len`].
    /// In particular, it does *not* return the number of elements yielded by
    /// [`SetMatches::iter`]. The only way to determine the total number of
    /// matched regexes is to iterate over them.
    ///
    /// # Example
    ///
    /// Notice that this method returns the total number of regexes in the
    /// original set, and *not* the total number of regexes that matched.
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([
    ///     r"[a-z]+@[a-z]+\.(com|org|net)",
    ///     r"[a-z]+\.(com|org|net)",
    /// ]).unwrap();
    /// let matches = set.matches(b"example.com");
    /// // Total number of patterns that matched.
    /// assert_eq!(1, matches.iter().count());
    /// // Total number of patterns in the set.
    /// assert_eq!(2, matches.len());
    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        self.0.capacity()
    }

    /// Returns an iterator over the indices of the regexes that matched.
    ///
    /// This will always produce matches in ascending order, where the index
    /// yielded corresponds to the index of the regex that matched with respect
    /// to its position when initially building the set.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([
    ///     r"[0-9]",
    ///     r"[a-z]",
    ///     r"[A-Z]",
    ///     r"\p{Greek}",
    /// ]).unwrap();
    /// let hay = "βa1".as_bytes();
    /// let matches: Vec<_> = set.matches(hay).iter().collect();
    /// assert_eq!(matches, vec![0, 1, 3]);
    /// ```
    ///
    /// Note that `SetMatches` also implements the `IntoIterator` trait, so
    /// this method is not always needed. For example:
    ///
    /// ```
    /// use regex::bytes::RegexSet;
    ///
    /// let set = RegexSet::new([
    ///     r"[0-9]",
    ///     r"[a-z]",
    ///     r"[A-Z]",
    ///     r"\p{Greek}",
    /// ]).unwrap();
    /// let hay = "βa1".as_bytes();
    /// let mut matches = vec![];
    /// for index in set.matches(hay) {
    ///     matches.push(index);
    /// }
    /// assert_eq!(matches, vec![0, 1, 3]);
    /// ```
    #[inline]
    pub fn iter(&self) -> SetMatchesIter<'_> {
        // Wrap the pattern set's own borrowing iterator.
        let inner = self.0.iter();
        SetMatchesIter(inner)
    }
}

impl IntoIterator for SetMatches {
    type IntoIter = SetMatchesIntoIter;
    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {
        let it = 0..self.0.capacity();
        SetMatchesIntoIter { patset: self.0, it }
    }
}

impl<'a> IntoIterator for &'a SetMatches {
    type IntoIter = SetMatchesIter<'a>;
    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
///
/// This iterator is created by calling `SetMatches::into_iter` via the
/// `IntoIterator` trait. This is automatically done in `for` loops.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSet;
///
/// let set = RegexSet::new([
///     r"[0-9]",
///     r"[a-z]",
///     r"[A-Z]",
///     r"\p{Greek}",
/// ]).unwrap();
/// let hay = "βa1".as_bytes();
/// let mut matches = vec![];
/// for index in set.matches(hay) {
///     matches.push(index);
/// }
/// assert_eq!(matches, vec![0, 1, 3]);
/// ```
#[derive(Debug)]
pub struct SetMatchesIntoIter {
    // The owned set of pattern IDs that membership is tested against.
    patset: PatternSet,
    // The candidate indices not yet visited from either end; set to
    // `0..patset.capacity()` by `SetMatches::into_iter`.
    it: core::ops::Range<usize>,
}

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Returns the next index, in ascending order, that is a member of the
    /// pattern set, or `None` once the candidate range is exhausted.
    fn next(&mut self) -> Option<usize> {
        loop {
            // `?` ends iteration as soon as no candidate indices remain.
            let id = self.it.next()?;
            // Candidates that are not in the set are skipped.
            if self.patset.contains(PatternID::new_unchecked(id)) {
                return Some(id);
            }
        }
    }

    /// Returns bounds on the number of indices still to be yielded.
    ///
    /// Because `next` skips candidates that are not in the set, anywhere
    /// from zero to all of the remaining candidates may be yielded. Only
    /// the upper bound of the candidate range is therefore valid; the lower
    /// bound must be zero to honor the `Iterator::size_hint` contract.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.it.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    fn next_back(&mut self) -> Option<usize> {
        loop {
            let id = self.it.next_back()?;
            if self.patset.contains(PatternID::new_unchecked(id)) {
                return Some(id);
            }
        }
    }
}

// Marks the iterator as fused. Both `next` and `next_back` return `None`
// only when the inner `core::ops::Range` does, and `Range` is itself fused.
impl core::iter::FusedIterator for SetMatchesIntoIter {}

/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
/// created this iterator.
///
/// This will always produce matches in ascending order, where the index
/// corresponds to the index of the regex that matched with respect to its
/// position when initially building the set.
///
/// This iterator is created by the [`SetMatches::iter`] method.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(PatternSetIter<'a>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        self.0.next().map(|pid| pid.as_usize())
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Returns the next matching pattern ID from the back of the wrapped
    /// iterator, converted to a plain `usize` index.
    fn next_back(&mut self) -> Option<usize> {
        let pid = self.0.next_back()?;
        Some(pid.as_usize())
    }
}

// Marks the iterator as fused.
// NOTE(review): this relies on the wrapped `PatternSetIter` continuing to
// return `None` once exhausted — confirm against regex-automata.
impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}

impl core::fmt::Debug for RegexSet {
    /// Writes `RegexSet(...)`, where `...` is the `Debug` rendering of the
    /// pattern strings returned by `RegexSet::patterns`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let patterns = self.patterns();
        f.write_fmt(format_args!("RegexSet({:?})", patterns))
    }
}