summary | refs | log | tree | commit | diff | stats
path: root/sqlglotrs/src
diff options
context:
space:
mode:
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- sqlglotrs/src/settings.rs | 3
-rw-r--r-- sqlglotrs/src/tokenizer.rs | 29
2 files changed, 25 insertions, 7 deletions
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 4cacb9b..9fb8fda 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -76,6 +76,7 @@ pub struct TokenizerSettings {
pub commands: HashSet<TokenType>,
pub command_prefix_tokens: HashSet<TokenType>,
pub heredoc_tag_is_identifier: bool,
+ pub string_escapes_allowed_in_raw_strings: bool,
}
#[pymethods]
@@ -98,6 +99,7 @@ impl TokenizerSettings {
commands: HashSet<TokenType>,
command_prefix_tokens: HashSet<TokenType>,
heredoc_tag_is_identifier: bool,
+ string_escapes_allowed_in_raw_strings: bool,
) -> Self {
let to_char = |v: &String| {
if v.len() == 1 {
@@ -147,6 +149,7 @@ impl TokenizerSettings {
commands,
command_prefix_tokens,
heredoc_tag_is_identifier,
+ string_escapes_allowed_in_raw_strings,
}
}
}
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 45bbe77..ca5c44b 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> {
// Skip the comment's start delimiter.
self.advance(comment_start_size as isize)?;
+ let mut comment_count = 1;
let comment_end_size = comment_end.len();
- while !self.is_end && self.chars(comment_end_size) != *comment_end {
+ while !self.is_end {
+ if self.chars(comment_end_size) == *comment_end {
+ comment_count -= 1;
+ if comment_count == 0 {
+ break;
+ }
+ }
+
self.advance(1)?;
+
+ // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+ if !self.is_end && self.chars(comment_start_size) == *comment_start {
+ self.advance(comment_start_size as isize)?;
+ comment_count += 1
+ }
}
let text = self.text();
@@ -410,7 +424,7 @@ impl<'a> TokenizerState<'a> {
let tag = if self.current_char.to_string() == *end {
String::from("")
} else {
- self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
+ self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)?
};
if !tag.is_empty()
@@ -435,7 +449,7 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
- let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
+ let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
@@ -581,7 +595,7 @@ impl<'a> TokenizerState<'a> {
fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
self.advance(1)?;
- let text = self.extract_string(identifier_end, true, true, true)?;
+ let text = self.extract_string(identifier_end, true, false, true)?;
self.add(self.token_types.identifier, Some(text))
}
@@ -589,7 +603,7 @@ impl<'a> TokenizerState<'a> {
&mut self,
delimiter: &str,
use_identifier_escapes: bool,
- unescape_sequences: bool,
+ raw_string: bool,
raise_unmatched: bool,
) -> Result<String, TokenizerError> {
let mut text = String::from("");
@@ -602,7 +616,7 @@ impl<'a> TokenizerState<'a> {
};
let peek_char_str = self.peek_char.to_string();
- if unescape_sequences
+ if !raw_string
&& !self.dialect_settings.unescaped_sequences.is_empty()
&& !self.peek_char.is_whitespace()
&& self.settings.string_escapes.contains(&self.current_char)
@@ -617,7 +631,8 @@ impl<'a> TokenizerState<'a> {
}
}
- if escapes.contains(&self.current_char)
+ if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string)
+ && escapes.contains(&self.current_char)
&& (peek_char_str == delimiter || escapes.contains(&self.peek_char))
&& (self.current_char == self.peek_char
|| !self