use std::collections::HashMap; use std::ffi::{CStr, CString}; use std::ops::Deref; use std::ptr; use std::slice; use std::str; use libc::{c_char, size_t}; use regex::bytes; use crate::error::{Error, ErrorKind}; const RURE_FLAG_CASEI: u32 = 1 << 0; const RURE_FLAG_MULTI: u32 = 1 << 1; const RURE_FLAG_DOTNL: u32 = 1 << 2; const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; const RURE_FLAG_SPACE: u32 = 1 << 4; const RURE_FLAG_UNICODE: u32 = 1 << 5; const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; pub struct Regex { re: bytes::Regex, capture_names: HashMap, } pub struct Options { size_limit: usize, dfa_size_limit: usize, } // The `RegexSet` is not exposed with option support or matching at an // arbitrary position with a crate just yet. To circumvent this, we use // the `Exec` structure directly. pub struct RegexSet { re: bytes::RegexSet, } #[repr(C)] pub struct rure_match { pub start: size_t, pub end: size_t, } pub struct Captures(bytes::Locations); pub struct Iter { re: *const Regex, last_end: usize, last_match: Option, } pub struct IterCaptureNames { capture_names: bytes::CaptureNames<'static>, name_ptrs: Vec<*mut c_char>, } impl Deref for Regex { type Target = bytes::Regex; fn deref(&self) -> &bytes::Regex { &self.re } } impl Deref for RegexSet { type Target = bytes::RegexSet; fn deref(&self) -> &bytes::RegexSet { &self.re } } impl Default for Options { fn default() -> Options { Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) } } } ffi_fn! { fn rure_compile_must(pattern: *const c_char) -> *const Regex { let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; let pat = pattern as *const u8; let mut err = Error::new(ErrorKind::None); let re = rure_compile( pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); if err.is_err() { let _ = writeln!(&mut io::stderr(), "{}", err); let _ = writeln!( &mut io::stderr(), "aborting from rure_compile_must"); unsafe { abort() } } re } } ffi_fn! { fn rure_compile( pattern: *const u8, length: size_t, flags: u32, options: *const Options, error: *mut Error, ) -> *const Regex { let pat = unsafe { slice::from_raw_parts(pattern, length) }; let pat = match str::from_utf8(pat) { Ok(pat) => pat, Err(err) => { unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Str(err)); } return ptr::null(); } } }; let mut builder = bytes::RegexBuilder::new(pat); if !options.is_null() { let options = unsafe { &*options }; builder.size_limit(options.size_limit); builder.dfa_size_limit(options.dfa_size_limit); } builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); builder.multi_line(flags & RURE_FLAG_MULTI > 0); builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); builder.unicode(flags & RURE_FLAG_UNICODE > 0); match builder.build() { Ok(re) => { let mut capture_names = HashMap::new(); for (i, name) in re.capture_names().enumerate() { if let Some(name) = name { capture_names.insert(name.to_owned(), i as i32); } } let re = Regex { re: re, capture_names: capture_names, }; Box::into_raw(Box::new(re)) } Err(err) => { unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Regex(err)); } ptr::null() } } } } } ffi_fn! { fn rure_free(re: *const Regex) { unsafe { drop(Box::from_raw(re as *mut Regex)); } } } ffi_fn! { fn rure_is_match( re: *const Regex, haystack: *const u8, len: size_t, start: size_t, ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; re.is_match_at(haystack, start) } } ffi_fn! { fn rure_find( re: *const Regex, haystack: *const u8, len: size_t, start: size_t, match_info: *mut rure_match, ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; re.find_at(haystack, start).map(|m| unsafe { if !match_info.is_null() { (*match_info).start = m.start(); (*match_info).end = m.end(); } }).is_some() } } ffi_fn! { fn rure_find_captures( re: *const Regex, haystack: *const u8, len: size_t, start: size_t, captures: *mut Captures, ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; let slots = unsafe { &mut (*captures).0 }; re.read_captures_at(slots, haystack, start).is_some() } } ffi_fn! { fn rure_shortest_match( re: *const Regex, haystack: *const u8, len: size_t, start: size_t, end: *mut usize, ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; match re.shortest_match_at(haystack, start) { None => false, Some(i) => { if !end.is_null() { unsafe { *end = i; } } true } } } } ffi_fn! { fn rure_capture_name_index( re: *const Regex, name: *const c_char, ) -> i32 { let re = unsafe { &*re }; let name = unsafe { CStr::from_ptr(name) }; let name = match name.to_str() { Err(_) => return -1, Ok(name) => name, }; re.capture_names.get(name).map(|&i|i).unwrap_or(-1) } } ffi_fn! { fn rure_iter_capture_names_new( re: *const Regex, ) -> *mut IterCaptureNames { let re = unsafe { &*re }; Box::into_raw(Box::new(IterCaptureNames { capture_names: re.re.capture_names(), name_ptrs: Vec::new(), })) } } ffi_fn! { fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { unsafe { let it = &mut *it; while let Some(ptr) = it.name_ptrs.pop() { drop(CString::from_raw(ptr)); } drop(Box::from_raw(it)); } } } ffi_fn! { fn rure_iter_capture_names_next( it: *mut IterCaptureNames, capture_name: *mut *mut c_char, ) -> bool { if capture_name.is_null() { return false; } let it = unsafe { &mut *it }; let cn = match it.capture_names.next() { // Top-level iterator ran out of capture groups None => return false, Some(val) => { let name = match val { // inner Option didn't have a name None => "", Some(name) => name }; name } }; unsafe { let cs = match CString::new(cn.as_bytes()) { Result::Ok(val) => val, Result::Err(_) => return false }; let ptr = cs.into_raw(); it.name_ptrs.push(ptr); *capture_name = ptr; } true } } ffi_fn! { fn rure_iter_new( re: *const Regex, ) -> *mut Iter { Box::into_raw(Box::new(Iter { re: re, last_end: 0, last_match: None, })) } } ffi_fn! { fn rure_iter_free(it: *mut Iter) { unsafe { drop(Box::from_raw(it)); } } } ffi_fn! { fn rure_iter_next( it: *mut Iter, haystack: *const u8, len: size_t, match_info: *mut rure_match, ) -> bool { let it = unsafe { &mut *it }; let re = unsafe { &*it.re }; let text = unsafe { slice::from_raw_parts(haystack, len) }; if it.last_end > text.len() { return false; } let (s, e) = match re.find_at(text, it.last_end) { None => return false, Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. it.last_end += 1; // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(e) == it.last_match { return rure_iter_next(it, haystack, len, match_info); } } else { it.last_end = e; } it.last_match = Some(e); if !match_info.is_null() { unsafe { (*match_info).start = s; (*match_info).end = e; } } true } } ffi_fn! { fn rure_iter_next_captures( it: *mut Iter, haystack: *const u8, len: size_t, captures: *mut Captures, ) -> bool { let it = unsafe { &mut *it }; let re = unsafe { &*it.re }; let slots = unsafe { &mut (*captures).0 }; let text = unsafe { slice::from_raw_parts(haystack, len) }; if it.last_end > text.len() { return false; } let (s, e) = match re.read_captures_at(slots, text, it.last_end) { None => return false, Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. it.last_end += 1; // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(e) == it.last_match { return rure_iter_next_captures(it, haystack, len, captures); } } else { it.last_end = e; } it.last_match = Some(e); true } } ffi_fn! { fn rure_captures_new(re: *const Regex) -> *mut Captures { let re = unsafe { &*re }; let captures = Captures(re.locations()); Box::into_raw(Box::new(captures)) } } ffi_fn! { fn rure_captures_free(captures: *const Captures) { unsafe { drop(Box::from_raw(captures as *mut Captures)); } } } ffi_fn! { fn rure_captures_at( captures: *const Captures, i: size_t, match_info: *mut rure_match, ) -> bool { let locs = unsafe { &(*captures).0 }; match locs.pos(i) { Some((start, end)) => { if !match_info.is_null() { unsafe { (*match_info).start = start; (*match_info).end = end; } } true } _ => false } } } ffi_fn! { fn rure_captures_len(captures: *const Captures) -> size_t { unsafe { (*captures).0.len() } } } ffi_fn! { fn rure_options_new() -> *mut Options { Box::into_raw(Box::new(Options::default())) } } ffi_fn! { fn rure_options_free(options: *mut Options) { unsafe { drop(Box::from_raw(options)); } } } ffi_fn! { fn rure_options_size_limit(options: *mut Options, limit: size_t) { let options = unsafe { &mut *options }; options.size_limit = limit; } } ffi_fn! { fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) { let options = unsafe { &mut *options }; options.dfa_size_limit = limit; } } ffi_fn! { fn rure_compile_set( patterns: *const *const u8, patterns_lengths: *const size_t, patterns_count: size_t, flags: u32, options: *const Options, error: *mut Error ) -> *const RegexSet { let (raw_pats, raw_patsl) = unsafe { ( slice::from_raw_parts(patterns, patterns_count), slice::from_raw_parts(patterns_lengths, patterns_count) ) }; let mut pats = Vec::with_capacity(patterns_count); for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; pats.push(match str::from_utf8(pat) { Ok(pat) => pat, Err(err) => { unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Str(err)); } return ptr::null(); } } }); } let mut builder = bytes::RegexSetBuilder::new(pats); if !options.is_null() { let options = unsafe { &*options }; builder.size_limit(options.size_limit); builder.dfa_size_limit(options.dfa_size_limit); } builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); builder.multi_line(flags & RURE_FLAG_MULTI > 0); builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); builder.unicode(flags & RURE_FLAG_UNICODE > 0); match builder.build() { Ok(re) => { Box::into_raw(Box::new(RegexSet { re: re })) } Err(err) => { unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Regex(err)) } ptr::null() } } } } } ffi_fn! { fn rure_set_free(re: *const RegexSet) { unsafe { drop(Box::from_raw(re as *mut RegexSet)); } } } ffi_fn! { fn rure_set_is_match( re: *const RegexSet, haystack: *const u8, len: size_t, start: size_t ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; re.is_match_at(haystack, start) } } ffi_fn! { fn rure_set_matches( re: *const RegexSet, haystack: *const u8, len: size_t, start: size_t, matches: *mut bool ) -> bool { let re = unsafe { &*re }; let mut matches = unsafe { slice::from_raw_parts_mut(matches, re.len()) }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; // read_matches_at isn't guaranteed to set non-matches to false for item in matches.iter_mut() { *item = false; } re.read_matches_at(&mut matches, haystack, start) } } ffi_fn! { fn rure_set_len(re: *const RegexSet) -> size_t { unsafe { (*re).len() } } } ffi_fn! { fn rure_escape_must(pattern: *const c_char) -> *const c_char { let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; let pat = pattern as *const u8; let mut err = Error::new(ErrorKind::None); let esc = rure_escape(pat, len, &mut err); if err.is_err() { let _ = writeln!(&mut io::stderr(), "{}", err); let _ = writeln!( &mut io::stderr(), "aborting from rure_escape_must"); unsafe { abort() } } esc } } /// A helper function that implements fallible escaping in a way that returns /// an error if escaping failed. /// /// This should ideally be exposed, but it needs API design work. In /// particular, this should not return a C string, but a `const uint8_t *` /// instead, since it may contain a NUL byte. fn rure_escape( pattern: *const u8, length: size_t, error: *mut Error, ) -> *const c_char { let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; let str_pat = match str::from_utf8(pat) { Ok(val) => val, Err(err) => unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Str(err)); } return ptr::null(); }, }; let esc_pat = regex::escape(str_pat); let c_esc_pat = match CString::new(esc_pat) { Ok(val) => val, Err(err) => unsafe { if !error.is_null() { *error = Error::new(ErrorKind::Nul(err)); } return ptr::null(); }, }; c_esc_pat.into_raw() as *const c_char } ffi_fn! { fn rure_cstring_free(s: *mut c_char) { unsafe { drop(CString::from_raw(s)); } } }