diff options
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs')
-rw-r--r-- | vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs | 321 |
1 files changed, 321 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs b/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs new file mode 100644 index 000000000..ea1c29ff7 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs @@ -0,0 +1,321 @@ +use crate::dfa::automaton::{Automaton, State}; +use crate::MatchError; + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. +#[inline(always)] +pub fn find_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + last_match = Some(at - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p) - 1, + }); + } + last_match = Some(offset(bytes, p) - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match)) +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. +#[inline(always)] +pub fn find_rev<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = end; + while at > start { + at -= 1; + let byte = bytes[at]; + state = dfa.next_state(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at }); + } + last_match = Some(at + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(end); + while p > bytes[start..].as_ptr() { + p = p.sub(1); + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p), + }); + } + last_match = Some(offset(bytes, p) + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_rev(dfa, state, bytes, start)?.or(last_match)) +} + +pub fn find_overlapping_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut State<A::ID>, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = match caller_state.as_option() { + None => init_fwd(dfa, bytes, start, end)?, + Some(id) => { + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. + // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `match_offset` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `match_offset` bytes. + start += dfa.match_offset(); + // Since match_offset could be any arbitrary value and we use + // `start` in pointer arithmetic below, we check that we are still + // in bounds. Otherwise, we could materialize a pointer that is + // more than one past the end point of `bytes`, which is UB. + if start > end { + return Ok(None); + } + (id, None) + } + }; + if last_match.is_some() { + caller_state.set(state); + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + caller_state.set(state); + if dfa.is_dead_state(state) { + return Ok(None); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } else { + return Ok(Some(at - dfa.match_offset())); + } + } + } + /* + // SAFETY: Other than the normal pointer arithmetic happening here, a + // unique aspect of safety for this function is the fact that the caller + // can provide the state that the search routine will start with. If this + // state were invalid, it would be possible to incorrectly index the + // transition table. We however prevent this from happening by guaranteeing + // that State is valid. Namely, callers cannot mutate a State. All they can + // do is create a "start" state or otherwise reuse a previously set state. + // Since callers can't mutate a state, it follows that a previously set + // state can only be retrieved by crate internal functions. Therefore, our + // use of it is safe since this code will only ever set the provided state + // to a valid state. + unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + caller_state.set(state); + return if dfa.is_dead_state(state) { + Ok(None) + } else if dfa.is_quit_state(state) { + Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 }) + } else { + Ok(Some(offset(bytes, p) - dfa.match_offset())) + }; + } + } + } + */ + + let result = eof_fwd(dfa, bytes, end, &mut state); + caller_state.set(state); + result +} + +fn init_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option<usize>), MatchError> { + let state = dfa.start_state_forward(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(start - dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn init_rev<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option<usize>), MatchError> { + let state = dfa.start_state_reverse(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(end + dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn eof_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + end: usize, + state: &mut A::ID, +) -> Result<Option<usize>, MatchError> { + match bytes.get(end) { + Some(&b) => { + *state = dfa.next_state(*state, b); + if dfa.is_match_state(*state) { + Ok(Some(end)) + } else { + Ok(None) + } + } + None => { + *state = dfa.next_eof_state(*state); + if dfa.is_match_state(*state) { + Ok(Some(bytes.len())) + } else { + Ok(None) + } + } + } +} + +fn eof_rev<A: Automaton + ?Sized>( + dfa: &A, + state: A::ID, + bytes: &[u8], + start: usize, +) -> Result<Option<usize>, MatchError> { + if start > 0 { + if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) { + Ok(Some(start)) + } else { + Ok(None) + } + } else { + if dfa.is_match_state(dfa.next_eof_state(state)) { + Ok(Some(0)) + } else { + Ok(None) + } + } +} + +/// Returns the distance between the given pointer and the start of `bytes`. +/// This assumes that the given pointer points to somewhere in the `bytes` +/// slice given. +fn offset(bytes: &[u8], p: *const u8) -> usize { + debug_assert!(bytes.as_ptr() <= p); + debug_assert!(bytes[bytes.len()..].as_ptr() >= p); + ((p as isize) - (bytes.as_ptr() as isize)) as usize +} |