summaryrefslogtreecommitdiffstats
path: root/third_party/rust/rure/src/rure.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/rure/src/rure.rs')
-rw-r--r--third_party/rust/rure/src/rure.rs629
1 files changed, 629 insertions, 0 deletions
diff --git a/third_party/rust/rure/src/rure.rs b/third_party/rust/rure/src/rure.rs
new file mode 100644
index 0000000000..d2e1539ed2
--- /dev/null
+++ b/third_party/rust/rure/src/rure.rs
@@ -0,0 +1,629 @@
+use std::collections::HashMap;
+use std::ffi::{CStr, CString};
+use std::ops::Deref;
+use std::ptr;
+use std::slice;
+use std::str;
+
+use libc::{c_char, size_t};
+use regex::bytes;
+
+use crate::error::{Error, ErrorKind};
+
+const RURE_FLAG_CASEI: u32 = 1 << 0;
+const RURE_FLAG_MULTI: u32 = 1 << 1;
+const RURE_FLAG_DOTNL: u32 = 1 << 2;
+const RURE_FLAG_SWAP_GREED: u32 = 1 << 3;
+const RURE_FLAG_SPACE: u32 = 1 << 4;
+const RURE_FLAG_UNICODE: u32 = 1 << 5;
+const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE;
+
+pub struct Regex {
+ re: bytes::Regex,
+ capture_names: HashMap<String, i32>,
+}
+
+pub struct Options {
+ size_limit: usize,
+ dfa_size_limit: usize,
+}
+
+// The `RegexSet` is not exposed with option support or matching at an
+// arbitrary position with a crate just yet. To circumvent this, we use
+// the `Exec` structure directly.
+pub struct RegexSet {
+ re: bytes::RegexSet,
+}
+
+#[repr(C)]
+pub struct rure_match {
+ pub start: size_t,
+ pub end: size_t,
+}
+
+pub struct Captures(bytes::Locations);
+
+pub struct Iter {
+ re: *const Regex,
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+pub struct IterCaptureNames {
+ capture_names: bytes::CaptureNames<'static>,
+ name_ptrs: Vec<*mut c_char>,
+}
+
+impl Deref for Regex {
+ type Target = bytes::Regex;
+ fn deref(&self) -> &bytes::Regex {
+ &self.re
+ }
+}
+
+impl Deref for RegexSet {
+ type Target = bytes::RegexSet;
+ fn deref(&self) -> &bytes::RegexSet {
+ &self.re
+ }
+}
+
+impl Default for Options {
+ fn default() -> Options {
+ Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) }
+ }
+}
+
+ffi_fn! {
+ fn rure_compile_must(pattern: *const c_char) -> *const Regex {
+ let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
+ let pat = pattern as *const u8;
+ let mut err = Error::new(ErrorKind::None);
+ let re = rure_compile(
+ pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err);
+ if err.is_err() {
+ let _ = writeln!(&mut io::stderr(), "{}", err);
+ let _ = writeln!(
+ &mut io::stderr(), "aborting from rure_compile_must");
+ unsafe { abort() }
+ }
+ re
+ }
+}
+
+ffi_fn! {
+ fn rure_compile(
+ pattern: *const u8,
+ length: size_t,
+ flags: u32,
+ options: *const Options,
+ error: *mut Error,
+ ) -> *const Regex {
+ let pat = unsafe { slice::from_raw_parts(pattern, length) };
+ let pat = match str::from_utf8(pat) {
+ Ok(pat) => pat,
+ Err(err) => {
+ unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Str(err));
+ }
+ return ptr::null();
+ }
+ }
+ };
+ let mut builder = bytes::RegexBuilder::new(pat);
+ if !options.is_null() {
+ let options = unsafe { &*options };
+ builder.size_limit(options.size_limit);
+ builder.dfa_size_limit(options.dfa_size_limit);
+ }
+ builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);
+ builder.multi_line(flags & RURE_FLAG_MULTI > 0);
+ builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0);
+ builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0);
+ builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0);
+ builder.unicode(flags & RURE_FLAG_UNICODE > 0);
+ match builder.build() {
+ Ok(re) => {
+ let mut capture_names = HashMap::new();
+ for (i, name) in re.capture_names().enumerate() {
+ if let Some(name) = name {
+ capture_names.insert(name.to_owned(), i as i32);
+ }
+ }
+ let re = Regex {
+ re: re,
+ capture_names: capture_names,
+ };
+ Box::into_raw(Box::new(re))
+ }
+ Err(err) => {
+ unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Regex(err));
+ }
+ ptr::null()
+ }
+ }
+ }
+ }
+}
+
+ffi_fn! {
+ fn rure_free(re: *const Regex) {
+ unsafe { drop(Box::from_raw(re as *mut Regex)); }
+ }
+}
+
+ffi_fn! {
+ fn rure_is_match(
+ re: *const Regex,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t,
+ ) -> bool {
+ let re = unsafe { &*re };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+ re.is_match_at(haystack, start)
+ }
+}
+
+ffi_fn! {
+ fn rure_find(
+ re: *const Regex,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t,
+ match_info: *mut rure_match,
+ ) -> bool {
+ let re = unsafe { &*re };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+ re.find_at(haystack, start).map(|m| unsafe {
+ if !match_info.is_null() {
+ (*match_info).start = m.start();
+ (*match_info).end = m.end();
+ }
+ }).is_some()
+ }
+}
+
+ffi_fn! {
+ fn rure_find_captures(
+ re: *const Regex,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t,
+ captures: *mut Captures,
+ ) -> bool {
+ let re = unsafe { &*re };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+ let slots = unsafe { &mut (*captures).0 };
+ re.read_captures_at(slots, haystack, start).is_some()
+ }
+}
+
+ffi_fn! {
+ fn rure_shortest_match(
+ re: *const Regex,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t,
+ end: *mut usize,
+ ) -> bool {
+ let re = unsafe { &*re };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+ match re.shortest_match_at(haystack, start) {
+ None => false,
+ Some(i) => {
+ if !end.is_null() {
+ unsafe {
+ *end = i;
+ }
+ }
+ true
+ }
+ }
+ }
+}
+
+ffi_fn! {
+ fn rure_capture_name_index(
+ re: *const Regex,
+ name: *const c_char,
+ ) -> i32 {
+ let re = unsafe { &*re };
+ let name = unsafe { CStr::from_ptr(name) };
+ let name = match name.to_str() {
+ Err(_) => return -1,
+ Ok(name) => name,
+ };
+ re.capture_names.get(name).map(|&i|i).unwrap_or(-1)
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_capture_names_new(
+ re: *const Regex,
+ ) -> *mut IterCaptureNames {
+ let re = unsafe { &*re };
+ Box::into_raw(Box::new(IterCaptureNames {
+ capture_names: re.re.capture_names(),
+ name_ptrs: Vec::new(),
+ }))
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_capture_names_free(it: *mut IterCaptureNames) {
+ unsafe {
+ let it = &mut *it;
+ while let Some(ptr) = it.name_ptrs.pop() {
+ drop(CString::from_raw(ptr));
+ }
+ drop(Box::from_raw(it));
+ }
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_capture_names_next(
+ it: *mut IterCaptureNames,
+ capture_name: *mut *mut c_char,
+ ) -> bool {
+ if capture_name.is_null() {
+ return false;
+ }
+
+ let it = unsafe { &mut *it };
+ let cn = match it.capture_names.next() {
+ // Top-level iterator ran out of capture groups
+ None => return false,
+ Some(val) => {
+ let name = match val {
+ // inner Option didn't have a name
+ None => "",
+ Some(name) => name
+ };
+ name
+ }
+ };
+
+ unsafe {
+ let cs = match CString::new(cn.as_bytes()) {
+ Result::Ok(val) => val,
+ Result::Err(_) => return false
+ };
+ let ptr = cs.into_raw();
+ it.name_ptrs.push(ptr);
+ *capture_name = ptr;
+ }
+ true
+
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_new(
+ re: *const Regex,
+ ) -> *mut Iter {
+ Box::into_raw(Box::new(Iter {
+ re: re,
+ last_end: 0,
+ last_match: None,
+ }))
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_free(it: *mut Iter) {
+ unsafe { drop(Box::from_raw(it)); }
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_next(
+ it: *mut Iter,
+ haystack: *const u8,
+ len: size_t,
+ match_info: *mut rure_match,
+ ) -> bool {
+ let it = unsafe { &mut *it };
+ let re = unsafe { &*it.re };
+ let text = unsafe { slice::from_raw_parts(haystack, len) };
+ if it.last_end > text.len() {
+ return false;
+ }
+ let (s, e) = match re.find_at(text, it.last_end) {
+ None => return false,
+ Some(m) => (m.start(), m.end()),
+ };
+ if s == e {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ it.last_end += 1;
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(e) == it.last_match {
+ return rure_iter_next(it, haystack, len, match_info);
+ }
+ } else {
+ it.last_end = e;
+ }
+ it.last_match = Some(e);
+ if !match_info.is_null() {
+ unsafe {
+ (*match_info).start = s;
+ (*match_info).end = e;
+ }
+ }
+ true
+ }
+}
+
+ffi_fn! {
+ fn rure_iter_next_captures(
+ it: *mut Iter,
+ haystack: *const u8,
+ len: size_t,
+ captures: *mut Captures,
+ ) -> bool {
+ let it = unsafe { &mut *it };
+ let re = unsafe { &*it.re };
+ let slots = unsafe { &mut (*captures).0 };
+ let text = unsafe { slice::from_raw_parts(haystack, len) };
+ if it.last_end > text.len() {
+ return false;
+ }
+ let (s, e) = match re.read_captures_at(slots, text, it.last_end) {
+ None => return false,
+ Some(m) => (m.start(), m.end()),
+ };
+ if s == e {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ it.last_end += 1;
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(e) == it.last_match {
+ return rure_iter_next_captures(it, haystack, len, captures);
+ }
+ } else {
+ it.last_end = e;
+ }
+ it.last_match = Some(e);
+ true
+ }
+}
+
+ffi_fn! {
+ fn rure_captures_new(re: *const Regex) -> *mut Captures {
+ let re = unsafe { &*re };
+ let captures = Captures(re.locations());
+ Box::into_raw(Box::new(captures))
+ }
+}
+
+ffi_fn! {
+ fn rure_captures_free(captures: *const Captures) {
+ unsafe { drop(Box::from_raw(captures as *mut Captures)); }
+ }
+}
+
+ffi_fn! {
+ fn rure_captures_at(
+ captures: *const Captures,
+ i: size_t,
+ match_info: *mut rure_match,
+ ) -> bool {
+ let locs = unsafe { &(*captures).0 };
+ match locs.pos(i) {
+ Some((start, end)) => {
+ if !match_info.is_null() {
+ unsafe {
+ (*match_info).start = start;
+ (*match_info).end = end;
+ }
+ }
+ true
+ }
+ _ => false
+ }
+ }
+}
+
+ffi_fn! {
+ fn rure_captures_len(captures: *const Captures) -> size_t {
+ unsafe { (*captures).0.len() }
+ }
+}
+
+ffi_fn! {
+ fn rure_options_new() -> *mut Options {
+ Box::into_raw(Box::new(Options::default()))
+ }
+}
+
+ffi_fn! {
+ fn rure_options_free(options: *mut Options) {
+ unsafe { drop(Box::from_raw(options)); }
+ }
+}
+
+ffi_fn! {
+ fn rure_options_size_limit(options: *mut Options, limit: size_t) {
+ let options = unsafe { &mut *options };
+ options.size_limit = limit;
+ }
+}
+
+ffi_fn! {
+ fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) {
+ let options = unsafe { &mut *options };
+ options.dfa_size_limit = limit;
+ }
+}
+
+ffi_fn! {
+ fn rure_compile_set(
+ patterns: *const *const u8,
+ patterns_lengths: *const size_t,
+ patterns_count: size_t,
+ flags: u32,
+ options: *const Options,
+ error: *mut Error
+ ) -> *const RegexSet {
+ let (raw_pats, raw_patsl) = unsafe {
+ (
+ slice::from_raw_parts(patterns, patterns_count),
+ slice::from_raw_parts(patterns_lengths, patterns_count)
+ )
+ };
+
+ let mut pats = Vec::with_capacity(patterns_count);
+ for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) {
+ let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) };
+ pats.push(match str::from_utf8(pat) {
+ Ok(pat) => pat,
+ Err(err) => {
+ unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Str(err));
+ }
+ return ptr::null();
+ }
+ }
+ });
+ }
+
+ let mut builder = bytes::RegexSetBuilder::new(pats);
+ if !options.is_null() {
+ let options = unsafe { &*options };
+ builder.size_limit(options.size_limit);
+ builder.dfa_size_limit(options.dfa_size_limit);
+ }
+ builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);
+ builder.multi_line(flags & RURE_FLAG_MULTI > 0);
+ builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0);
+ builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0);
+ builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0);
+ builder.unicode(flags & RURE_FLAG_UNICODE > 0);
+ match builder.build() {
+ Ok(re) => {
+ Box::into_raw(Box::new(RegexSet { re: re }))
+ }
+ Err(err) => {
+ unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Regex(err))
+ }
+ ptr::null()
+ }
+ }
+ }
+ }
+}
+
+ffi_fn! {
+ fn rure_set_free(re: *const RegexSet) {
+ unsafe { drop(Box::from_raw(re as *mut RegexSet)); }
+ }
+}
+
+ffi_fn! {
+ fn rure_set_is_match(
+ re: *const RegexSet,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t
+ ) -> bool {
+ let re = unsafe { &*re };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+ re.is_match_at(haystack, start)
+ }
+}
+
+ffi_fn! {
+ fn rure_set_matches(
+ re: *const RegexSet,
+ haystack: *const u8,
+ len: size_t,
+ start: size_t,
+ matches: *mut bool
+ ) -> bool {
+ let re = unsafe { &*re };
+ let mut matches = unsafe {
+ slice::from_raw_parts_mut(matches, re.len())
+ };
+ let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+
+ // read_matches_at isn't guaranteed to set non-matches to false
+ for item in matches.iter_mut() {
+ *item = false;
+ }
+ re.read_matches_at(&mut matches, haystack, start)
+ }
+}
+
+ffi_fn! {
+ fn rure_set_len(re: *const RegexSet) -> size_t {
+ unsafe { (*re).len() }
+ }
+}
+
+ffi_fn! {
+ fn rure_escape_must(pattern: *const c_char) -> *const c_char {
+ let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
+ let pat = pattern as *const u8;
+ let mut err = Error::new(ErrorKind::None);
+ let esc = rure_escape(pat, len, &mut err);
+ if err.is_err() {
+ let _ = writeln!(&mut io::stderr(), "{}", err);
+ let _ = writeln!(
+ &mut io::stderr(), "aborting from rure_escape_must");
+ unsafe { abort() }
+ }
+ esc
+ }
+}
+
+/// A helper function that implements fallible escaping in a way that returns
+/// an error if escaping failed.
+///
+/// This should ideally be exposed, but it needs API design work. In
+/// particular, this should not return a C string, but a `const uint8_t *`
+/// instead, since it may contain a NUL byte.
+fn rure_escape(
+ pattern: *const u8,
+ length: size_t,
+ error: *mut Error,
+) -> *const c_char {
+ let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
+ let str_pat = match str::from_utf8(pat) {
+ Ok(val) => val,
+ Err(err) => unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Str(err));
+ }
+ return ptr::null();
+ },
+ };
+ let esc_pat = regex::escape(str_pat);
+ let c_esc_pat = match CString::new(esc_pat) {
+ Ok(val) => val,
+ Err(err) => unsafe {
+ if !error.is_null() {
+ *error = Error::new(ErrorKind::Nul(err));
+ }
+ return ptr::null();
+ },
+ };
+ c_esc_pat.into_raw() as *const c_char
+}
+
+ffi_fn! {
+ fn rure_cstring_free(s: *mut c_char) {
+ unsafe { drop(CString::from_raw(s)); }
+ }
+}