summary | refs | log | tree | commit | diff | stats
path: root/sqlglotrs/src
diff options
context:
space:
mode:
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- sqlglotrs/src/settings.rs 9
-rw-r--r-- sqlglotrs/src/tokenizer.rs 85
2 files changed, 64 insertions, 30 deletions
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index c6e76a7..4cacb9b 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -10,6 +10,7 @@ pub struct TokenTypeSettings {
pub break_: TokenType,
pub dcolon: TokenType,
pub heredoc_string: TokenType,
+ pub raw_string: TokenType,
pub hex_string: TokenType,
pub identifier: TokenType,
pub number: TokenType,
@@ -28,6 +29,7 @@ impl TokenTypeSettings {
break_: TokenType,
dcolon: TokenType,
heredoc_string: TokenType,
+ raw_string: TokenType,
hex_string: TokenType,
identifier: TokenType,
number: TokenType,
@@ -42,6 +44,7 @@ impl TokenTypeSettings {
break_,
dcolon,
heredoc_string,
+ raw_string,
hex_string,
identifier,
number,
@@ -151,7 +154,7 @@ impl TokenizerSettings {
#[derive(Clone, Debug)]
#[pyclass]
pub struct TokenizerDialectSettings {
- pub escape_sequences: HashMap<String, String>,
+ pub unescaped_sequences: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
}
@@ -159,11 +162,11 @@ pub struct TokenizerDialectSettings {
impl TokenizerDialectSettings {
#[new]
pub fn new(
- escape_sequences: HashMap<String, String>,
+ unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
) -> Self {
TokenizerDialectSettings {
- escape_sequences,
+ unescaped_sequences,
identifiers_can_start_with_digit,
}
}
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 2c90a65..cdc61d7 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -118,8 +118,27 @@ impl<'a> TokenizerState<'a> {
fn scan(&mut self, until_peek_char: Option<char>) -> Result<(), TokenizerError> {
while self.size > 0 && !self.is_end {
- self.start = self.current;
- self.advance(1)?;
+ let mut current = self.current;
+
+ // Skip spaces here rather than iteratively calling advance() for performance reasons
+ while current < self.size {
+ let ch = self.char_at(current)?;
+
+ if ch == ' ' || ch == '\t' {
+ current += 1;
+ } else {
+ break;
+ }
+ }
+
+ let offset = if current > self.current {
+ current - self.current
+ } else {
+ 1
+ };
+
+ self.start = current;
+ self.advance(offset as isize)?;
if self.current_char == '\0' {
break;
@@ -153,16 +172,12 @@ impl<'a> TokenizerState<'a> {
}
fn advance(&mut self, i: isize) -> Result<(), TokenizerError> {
- let mut i = i;
if Some(&self.token_types.break_) == self.settings.white_space.get(&self.current_char) {
// Ensures we don't count an extra line if we get a \r\n line break sequence.
- if self.current_char == '\r' && self.peek_char == '\n' {
- i = 2;
- self.start += 1;
+ if !(self.current_char == '\r' && self.peek_char == '\n') {
+ self.column = 1;
+ self.line += 1;
}
-
- self.column = 1;
- self.line += 1;
} else {
self.column = self.column.wrapping_add_signed(i);
}
@@ -404,11 +419,19 @@ impl<'a> TokenizerState<'a> {
};
self.advance(1)?;
+
let tag = if self.current_char.to_string() == *end {
String::from("")
} else {
- self.extract_string(end, false)?
+ self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
};
+
+ if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier {
+ self.advance(-(tag.len() as isize))?;
+ self.add(self.token_types.heredoc_string_alternative, None)?;
+ return Ok(true)
+ }
+
(None, *token_type, format!("{}{}{}", start, tag, end))
} else {
(None, *token_type, end.clone())
@@ -418,7 +441,7 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
- let text = self.extract_string(&end, false)?;
+ let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
@@ -561,7 +584,7 @@ impl<'a> TokenizerState<'a> {
fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
self.advance(1)?;
- let text = self.extract_string(identifier_end, true)?;
+ let text = self.extract_string(identifier_end, true, true, true)?;
self.add(self.token_types.identifier, Some(text))
}
@@ -569,6 +592,8 @@ impl<'a> TokenizerState<'a> {
&mut self,
delimiter: &str,
use_identifier_escapes: bool,
+ unescape_sequences: bool,
+ raise_unmatched: bool,
) -> Result<String, TokenizerError> {
let mut text = String::from("");
@@ -578,8 +603,23 @@ impl<'a> TokenizerState<'a> {
} else {
&self.settings.string_escapes
};
-
let peek_char_str = self.peek_char.to_string();
+
+ if unescape_sequences
+ && !self.dialect_settings.unescaped_sequences.is_empty()
+ && !self.peek_char.is_whitespace()
+ && self.settings.string_escapes.contains(&self.current_char)
+ {
+ let sequence_key = format!("{}{}", self.current_char, self.peek_char);
+ if let Some(unescaped_sequence) =
+ self.dialect_settings.unescaped_sequences.get(&sequence_key)
+ {
+ self.advance(2)?;
+ text.push_str(unescaped_sequence);
+ continue;
+ }
+ }
+
if escapes.contains(&self.current_char)
&& (peek_char_str == delimiter || escapes.contains(&self.peek_char))
&& (self.current_char == self.peek_char
@@ -610,26 +650,17 @@ impl<'a> TokenizerState<'a> {
break;
}
if self.is_end {
+ if !raise_unmatched {
+ text.push(self.current_char);
+ return Ok(text)
+ }
+
return self.error_result(format!(
"Missing {} from {}:{}",
delimiter, self.line, self.current
));
}
- if !self.dialect_settings.escape_sequences.is_empty()
- && !self.peek_char.is_whitespace()
- && self.settings.string_escapes.contains(&self.current_char)
- {
- let sequence_key = format!("{}{}", self.current_char, self.peek_char);
- if let Some(escaped_sequence) =
- self.dialect_settings.escape_sequences.get(&sequence_key)
- {
- self.advance(2)?;
- text.push_str(escaped_sequence);
- continue;
- }
- }
-
let current = self.current - 1;
self.advance(1)?;
text.push_str(