diff options
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- | sqlglotrs/src/settings.rs | 3 | ||||
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 29 |
2 files changed, 25 insertions, 7 deletions
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs index 4cacb9b..9fb8fda 100644 --- a/sqlglotrs/src/settings.rs +++ b/sqlglotrs/src/settings.rs @@ -76,6 +76,7 @@ pub struct TokenizerSettings { pub commands: HashSet<TokenType>, pub command_prefix_tokens: HashSet<TokenType>, pub heredoc_tag_is_identifier: bool, + pub string_escapes_allowed_in_raw_strings: bool, } #[pymethods] @@ -98,6 +99,7 @@ impl TokenizerSettings { commands: HashSet<TokenType>, command_prefix_tokens: HashSet<TokenType>, heredoc_tag_is_identifier: bool, + string_escapes_allowed_in_raw_strings: bool, ) -> Self { let to_char = |v: &String| { if v.len() == 1 { @@ -147,6 +149,7 @@ impl TokenizerSettings { commands, command_prefix_tokens, heredoc_tag_is_identifier, + string_escapes_allowed_in_raw_strings, } } } diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs index 45bbe77..ca5c44b 100644 --- a/sqlglotrs/src/tokenizer.rs +++ b/sqlglotrs/src/tokenizer.rs @@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> { // Skip the comment's start delimiter. self.advance(comment_start_size as isize)?; + let mut comment_count = 1; let comment_end_size = comment_end.len(); - while !self.is_end && self.chars(comment_end_size) != *comment_end { + while !self.is_end { + if self.chars(comment_end_size) == *comment_end { + comment_count -= 1; + if comment_count == 0 { + break; + } + } + self.advance(1)?; + + // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres + if !self.is_end && self.chars(comment_start_size) == *comment_start { + self.advance(comment_start_size as isize)?; + comment_count += 1 + } } let text = self.text(); @@ -410,7 +424,7 @@ impl<'a> TokenizerState<'a> { let tag = if self.current_char.to_string() == *end { String::from("") } else { - self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)? + self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)? }; if !tag.is_empty() @@ -435,7 +449,7 @@ impl<'a> TokenizerState<'a> { }; self.advance(start.len() as isize)?; - let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?; + let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?; if let Some(b) = base { if u64::from_str_radix(&text, b).is_err() { @@ -581,7 +595,7 @@ impl<'a> TokenizerState<'a> { fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> { self.advance(1)?; - let text = self.extract_string(identifier_end, true, true, true)?; + let text = self.extract_string(identifier_end, true, false, true)?; self.add(self.token_types.identifier, Some(text)) } @@ -589,7 +603,7 @@ impl<'a> TokenizerState<'a> { &mut self, delimiter: &str, use_identifier_escapes: bool, - unescape_sequences: bool, + raw_string: bool, raise_unmatched: bool, ) -> Result<String, TokenizerError> { let mut text = String::from(""); @@ -602,7 +616,7 @@ impl<'a> TokenizerState<'a> { }; let peek_char_str = self.peek_char.to_string(); - if unescape_sequences + if !raw_string && !self.dialect_settings.unescaped_sequences.is_empty() && !self.peek_char.is_whitespace() && self.settings.string_escapes.contains(&self.current_char) @@ -617,7 +631,8 @@ impl<'a> TokenizerState<'a> { } } - if escapes.contains(&self.current_char) + if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string) + && escapes.contains(&self.current_char) && (peek_char_str == delimiter || escapes.contains(&self.peek_char)) && (self.current_char == self.peek_char || !self |