diff options
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- | sqlglotrs/src/settings.rs | 9 | ||||
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 85 |
2 files changed, 64 insertions, 30 deletions
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs index c6e76a7..4cacb9b 100644 --- a/sqlglotrs/src/settings.rs +++ b/sqlglotrs/src/settings.rs @@ -10,6 +10,7 @@ pub struct TokenTypeSettings { pub break_: TokenType, pub dcolon: TokenType, pub heredoc_string: TokenType, + pub raw_string: TokenType, pub hex_string: TokenType, pub identifier: TokenType, pub number: TokenType, @@ -28,6 +29,7 @@ impl TokenTypeSettings { break_: TokenType, dcolon: TokenType, heredoc_string: TokenType, + raw_string: TokenType, hex_string: TokenType, identifier: TokenType, number: TokenType, @@ -42,6 +44,7 @@ impl TokenTypeSettings { break_, dcolon, heredoc_string, + raw_string, hex_string, identifier, number, @@ -151,7 +154,7 @@ impl TokenizerSettings { #[derive(Clone, Debug)] #[pyclass] pub struct TokenizerDialectSettings { - pub escape_sequences: HashMap<String, String>, + pub unescaped_sequences: HashMap<String, String>, pub identifiers_can_start_with_digit: bool, } @@ -159,11 +162,11 @@ pub struct TokenizerDialectSettings { impl TokenizerDialectSettings { #[new] pub fn new( - escape_sequences: HashMap<String, String>, + unescaped_sequences: HashMap<String, String>, identifiers_can_start_with_digit: bool, ) -> Self { TokenizerDialectSettings { - escape_sequences, + unescaped_sequences, identifiers_can_start_with_digit, } } diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs index 2c90a65..cdc61d7 100644 --- a/sqlglotrs/src/tokenizer.rs +++ b/sqlglotrs/src/tokenizer.rs @@ -118,8 +118,27 @@ impl<'a> TokenizerState<'a> { fn scan(&mut self, until_peek_char: Option<char>) -> Result<(), TokenizerError> { while self.size > 0 && !self.is_end { - self.start = self.current; - self.advance(1)?; + let mut current = self.current; + + // Skip spaces here rather than iteratively calling advance() for performance reasons + while current < self.size { + let ch = self.char_at(current)?; + + if ch == ' ' || ch == '\t' { + current += 1; + } else { + break; + } + } + + let offset = if current > self.current { + current - self.current + } else { + 1 + }; + + self.start = current; + self.advance(offset as isize)?; if self.current_char == '\0' { break; @@ -153,16 +172,12 @@ impl<'a> TokenizerState<'a> { } fn advance(&mut self, i: isize) -> Result<(), TokenizerError> { - let mut i = i; if Some(&self.token_types.break_) == self.settings.white_space.get(&self.current_char) { // Ensures we don't count an extra line if we get a \r\n line break sequence. - if self.current_char == '\r' && self.peek_char == '\n' { - i = 2; - self.start += 1; + if !(self.current_char == '\r' && self.peek_char == '\n') { + self.column = 1; + self.line += 1; } - - self.column = 1; - self.line += 1; } else { self.column = self.column.wrapping_add_signed(i); } @@ -404,11 +419,19 @@ impl<'a> TokenizerState<'a> { }; self.advance(1)?; + let tag = if self.current_char.to_string() == *end { String::from("") } else { - self.extract_string(end, false)? + self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)? }; + + if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier { + self.advance(-(tag.len() as isize))?; + self.add(self.token_types.heredoc_string_alternative, None)?; + return Ok(true) + } + (None, *token_type, format!("{}{}{}", start, tag, end)) } else { (None, *token_type, end.clone()) @@ -418,7 +441,7 @@ impl<'a> TokenizerState<'a> { }; self.advance(start.len() as isize)?; - let text = self.extract_string(&end, false)?; + let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?; if let Some(b) = base { if u64::from_str_radix(&text, b).is_err() { @@ -561,7 +584,7 @@ impl<'a> TokenizerState<'a> { fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> { self.advance(1)?; - let text = self.extract_string(identifier_end, true)?; + let text = self.extract_string(identifier_end, true, true, true)?; self.add(self.token_types.identifier, Some(text)) } @@ -569,6 +592,8 @@ impl<'a> TokenizerState<'a> { &mut self, delimiter: &str, use_identifier_escapes: bool, + unescape_sequences: bool, + raise_unmatched: bool, ) -> Result<String, TokenizerError> { let mut text = String::from(""); @@ -578,8 +603,23 @@ impl<'a> TokenizerState<'a> { } else { &self.settings.string_escapes }; - let peek_char_str = self.peek_char.to_string(); + + if unescape_sequences + && !self.dialect_settings.unescaped_sequences.is_empty() + && !self.peek_char.is_whitespace() + && self.settings.string_escapes.contains(&self.current_char) + { + let sequence_key = format!("{}{}", self.current_char, self.peek_char); + if let Some(unescaped_sequence) = + self.dialect_settings.unescaped_sequences.get(&sequence_key) + { + self.advance(2)?; + text.push_str(unescaped_sequence); + continue; + } + } + if escapes.contains(&self.current_char) && (peek_char_str == delimiter || escapes.contains(&self.peek_char)) && (self.current_char == self.peek_char @@ -610,26 +650,17 @@ impl<'a> TokenizerState<'a> { break; } if self.is_end { + if !raise_unmatched { + text.push(self.current_char); + return Ok(text) + } + return self.error_result(format!( "Missing {} from {}:{}", delimiter, self.line, self.current )); } - if !self.dialect_settings.escape_sequences.is_empty() - && !self.peek_char.is_whitespace() - && self.settings.string_escapes.contains(&self.current_char) - { - let sequence_key = format!("{}{}", self.current_char, self.peek_char); - if let Some(escaped_sequence) = - self.dialect_settings.escape_sequences.get(&sequence_key) - { - self.advance(2)?; - text.push_str(escaped_sequence); - continue; - } - } - let current = self.current - 1; self.advance(1)?; text.push_str( |