diff options
Diffstat (limited to '')
-rw-r--r-- | sqlglotrs/Cargo.lock | 2 | ||||
-rw-r--r-- | sqlglotrs/Cargo.toml | 3 | ||||
-rw-r--r-- | sqlglotrs/pyproject.toml | 9 | ||||
-rw-r--r-- | sqlglotrs/src/settings.rs | 6 | ||||
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 63 |
5 files changed, 58 insertions, 25 deletions
diff --git a/sqlglotrs/Cargo.lock b/sqlglotrs/Cargo.lock index e9255b7..b95e6b9 100644 --- a/sqlglotrs/Cargo.lock +++ b/sqlglotrs/Cargo.lock @@ -188,7 +188,7 @@ checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" [[package]] name = "sqlglotrs" -version = "0.2.5" +version = "0.2.12" dependencies = [ "pyo3", ] diff --git a/sqlglotrs/Cargo.toml b/sqlglotrs/Cargo.toml index 4c566ee..a3e23df 100644 --- a/sqlglotrs/Cargo.toml +++ b/sqlglotrs/Cargo.toml @@ -1,7 +1,8 @@ [package] name = "sqlglotrs" -version = "0.2.5" +version = "0.2.12" edition = "2021" +license = "MIT" [lib] name = "sqlglotrs" diff --git a/sqlglotrs/pyproject.toml b/sqlglotrs/pyproject.toml index 867cdcc..d84ee91 100644 --- a/sqlglotrs/pyproject.toml +++ b/sqlglotrs/pyproject.toml @@ -4,13 +4,22 @@ build-backend = "maturin" [project] name = "sqlglotrs" +description = "An easily customizable SQL parser and transpiler" requires-python = ">=3.7" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", + "License :: OSI Approved :: MIT License", +] +authors = [ + { name="Toby Mao", email="toby.mao@gmail.com" }, ] dynamic = ["version"] +[project.urls] +Homepage = "https://github.com/tobymao/sqlglot" +Issues = "https://github.com/tobymao/sqlglot/issues" + [tool.maturin] features = ["pyo3/extension-module"] diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs index 4cacb9b..3068fd2 100644 --- a/sqlglotrs/src/settings.rs +++ b/sqlglotrs/src/settings.rs @@ -76,6 +76,8 @@ pub struct TokenizerSettings { pub commands: HashSet<TokenType>, pub command_prefix_tokens: HashSet<TokenType>, pub heredoc_tag_is_identifier: bool, + pub string_escapes_allowed_in_raw_strings: bool, + pub nested_comments: bool, } #[pymethods] @@ -98,6 +100,8 @@ impl TokenizerSettings { commands: HashSet<TokenType>, command_prefix_tokens: HashSet<TokenType>, heredoc_tag_is_identifier: bool, + string_escapes_allowed_in_raw_strings: bool, + nested_comments: bool, ) -> Self { let to_char = |v: &String| { if v.len() == 1 { @@ -147,6 +151,8 @@ impl TokenizerSettings { commands, command_prefix_tokens, heredoc_tag_is_identifier, + string_escapes_allowed_in_raw_strings, + nested_comments, } } } diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs index e79d0e9..6df3bfb 100644 --- a/sqlglotrs/src/tokenizer.rs +++ b/sqlglotrs/src/tokenizer.rs @@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> { // Skip the comment's start delimiter. self.advance(comment_start_size as isize)?; + let mut comment_count = 1; let comment_end_size = comment_end.len(); - while !self.is_end && self.chars(comment_end_size) != *comment_end { + while !self.is_end { + if self.chars(comment_end_size) == *comment_end { + comment_count -= 1; + if comment_count == 0 { + break; + } + } + self.advance(1)?; + + // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres + if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start { + self.advance(comment_start_size as isize)?; + comment_count += 1 + } } let text = self.text(); @@ -405,28 +419,22 @@ impl<'a> TokenizerState<'a> { } else if *token_type == self.token_types.bit_string { (Some(2), *token_type, end.clone()) } else if *token_type == self.token_types.heredoc_string { - if self.settings.heredoc_tag_is_identifier - && !self.is_identifier(self.peek_char) - && self.peek_char.to_string() != *end - { - if self.token_types.heredoc_string_alternative != self.token_types.var { - self.add(self.token_types.heredoc_string_alternative, None)? - } else { - self.scan_var()? - }; - - return Ok(true) - }; - self.advance(1)?; let tag = if self.current_char.to_string() == *end { String::from("") } else { - self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)? + self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)? }; - if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier { + if !tag.is_empty() + && self.settings.heredoc_tag_is_identifier + && (self.is_end || !self.is_identifier(&tag)) + { + if !self.is_end { + self.advance(-1)?; + } + self.advance(-(tag.len() as isize))?; self.add(self.token_types.heredoc_string_alternative, None)?; return Ok(true) @@ -441,7 +449,7 @@ impl<'a> TokenizerState<'a> { }; self.advance(start.len() as isize)?; - let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?; + let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?; if let Some(b) = base { if u64::from_str_radix(&text, b).is_err() { @@ -494,7 +502,7 @@ impl<'a> TokenizerState<'a> { } else if self.peek_char.to_ascii_uppercase() == 'E' && scientific == 0 { scientific += 1; self.advance(1)?; - } else if self.is_identifier(self.peek_char) { + } else if self.is_alphabetic_or_underscore(self.peek_char) { let number_text = self.text(); let mut literal = String::from(""); @@ -587,7 +595,7 @@ impl<'a> TokenizerState<'a> { fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> { self.advance(1)?; - let text = self.extract_string(identifier_end, true, true, true)?; + let text = self.extract_string(identifier_end, true, false, true)?; self.add(self.token_types.identifier, Some(text)) } @@ -595,7 +603,7 @@ impl<'a> TokenizerState<'a> { &mut self, delimiter: &str, use_identifier_escapes: bool, - unescape_sequences: bool, + raw_string: bool, raise_unmatched: bool, ) -> Result<String, TokenizerError> { let mut text = String::from(""); @@ -608,7 +616,7 @@ impl<'a> TokenizerState<'a> { }; let peek_char_str = self.peek_char.to_string(); - if unescape_sequences + if !raw_string && !self.dialect_settings.unescaped_sequences.is_empty() && !self.peek_char.is_whitespace() && self.settings.string_escapes.contains(&self.current_char) @@ -623,7 +631,8 @@ impl<'a> TokenizerState<'a> { } } - if escapes.contains(&self.current_char) + if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string) + && escapes.contains(&self.current_char) && (peek_char_str == delimiter || escapes.contains(&self.peek_char)) && (self.current_char == self.peek_char || !self @@ -676,10 +685,18 @@ impl<'a> TokenizerState<'a> { Ok(text) } - fn is_identifier(&mut self, name: char) -> bool { + fn is_alphabetic_or_underscore(&mut self, name: char) -> bool { name.is_alphabetic() || name == '_' } + fn is_identifier(&mut self, s: &str) -> bool { + s.chars().enumerate().all( + |(i, c)| + if i == 0 { self.is_alphabetic_or_underscore(c) } + else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) } + ) + } + fn extract_value(&mut self) -> Result<String, TokenizerError> { loop { if !self.peek_char.is_whitespace() |