Diffstat:
-rw-r--r--  sqlglotrs/Cargo.lock        |  2
-rw-r--r--  sqlglotrs/Cargo.toml        |  3
-rw-r--r--  sqlglotrs/pyproject.toml    |  9
-rw-r--r--  sqlglotrs/src/settings.rs   |  6
-rw-r--r--  sqlglotrs/src/tokenizer.rs  | 63
5 files changed, 58 insertions(+), 25 deletions(-)
diff --git a/sqlglotrs/Cargo.lock b/sqlglotrs/Cargo.lock
index e9255b7..b95e6b9 100644
--- a/sqlglotrs/Cargo.lock
+++ b/sqlglotrs/Cargo.lock
@@ -188,7 +188,7 @@ checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970"
[[package]]
name = "sqlglotrs"
-version = "0.2.5"
+version = "0.2.12"
dependencies = [
"pyo3",
]
diff --git a/sqlglotrs/Cargo.toml b/sqlglotrs/Cargo.toml
index 4c566ee..a3e23df 100644
--- a/sqlglotrs/Cargo.toml
+++ b/sqlglotrs/Cargo.toml
@@ -1,7 +1,8 @@
[package]
name = "sqlglotrs"
-version = "0.2.5"
+version = "0.2.12"
edition = "2021"
+license = "MIT"
[lib]
name = "sqlglotrs"
diff --git a/sqlglotrs/pyproject.toml b/sqlglotrs/pyproject.toml
index 867cdcc..d84ee91 100644
--- a/sqlglotrs/pyproject.toml
+++ b/sqlglotrs/pyproject.toml
@@ -4,13 +4,22 @@ build-backend = "maturin"
[project]
name = "sqlglotrs"
+description = "An easily customizable SQL parser and transpiler"
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
+ "License :: OSI Approved :: MIT License",
+]
+authors = [
+ { name="Toby Mao", email="toby.mao@gmail.com" },
]
dynamic = ["version"]
+[project.urls]
+Homepage = "https://github.com/tobymao/sqlglot"
+Issues = "https://github.com/tobymao/sqlglot/issues"
+
[tool.maturin]
features = ["pyo3/extension-module"]
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 4cacb9b..3068fd2 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -76,6 +76,8 @@ pub struct TokenizerSettings {
pub commands: HashSet<TokenType>,
pub command_prefix_tokens: HashSet<TokenType>,
pub heredoc_tag_is_identifier: bool,
+ pub string_escapes_allowed_in_raw_strings: bool,
+ pub nested_comments: bool,
}
#[pymethods]
@@ -98,6 +100,8 @@ impl TokenizerSettings {
commands: HashSet<TokenType>,
command_prefix_tokens: HashSet<TokenType>,
heredoc_tag_is_identifier: bool,
+ string_escapes_allowed_in_raw_strings: bool,
+ nested_comments: bool,
) -> Self {
let to_char = |v: &String| {
if v.len() == 1 {
@@ -147,6 +151,8 @@ impl TokenizerSettings {
commands,
command_prefix_tokens,
heredoc_tag_is_identifier,
+ string_escapes_allowed_in_raw_strings,
+ nested_comments,
}
}
}
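
Note: the two new fields are plain per-dialect toggles threaded in through the #[pymethods] constructor above. A minimal sketch of their intent, with illustrative values (hypothetical mini-struct; the real defaults are set on the Python side and are not part of this diff):

    // Hypothetical struct mirroring only the two new fields.
    struct NewTokenizerFlags {
        // When true, "/* a /* b */ c */" scans as a single comment
        // (e.g. databricks, duckdb, postgres, per the tokenizer.rs comment below).
        nested_comments: bool,
        // When true, an escape character inside a raw string is still honored;
        // when false, raw strings take every character literally.
        string_escapes_allowed_in_raw_strings: bool,
    }

    fn main() {
        // Illustrative values only, not actual dialect defaults.
        let flags = NewTokenizerFlags {
            nested_comments: true,
            string_escapes_allowed_in_raw_strings: false,
        };
        assert!(flags.nested_comments && !flags.string_escapes_allowed_in_raw_strings);
    }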
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index e79d0e9..6df3bfb 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> {
// Skip the comment's start delimiter.
self.advance(comment_start_size as isize)?;
+ let mut comment_count = 1;
let comment_end_size = comment_end.len();
- while !self.is_end && self.chars(comment_end_size) != *comment_end {
+ while !self.is_end {
+ if self.chars(comment_end_size) == *comment_end {
+ comment_count -= 1;
+ if comment_count == 0 {
+ break;
+ }
+ }
+
self.advance(1)?;
+
+ // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+ if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+ self.advance(comment_start_size as isize)?;
+ comment_count += 1
+ }
}
let text = self.text();
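
Note: the loop above now tracks comment depth instead of stopping at the first end delimiter. The same depth-counting idea, simplified to a standalone function with hard-coded /* */ delimiters (names and signature are mine, not the crate's):

    // Skip a block comment; `pos` points just past the opening "/*".
    // Returns the index just past the matching "*/", or None if unterminated.
    fn skip_block_comment(src: &str, mut pos: usize, nested: bool) -> Option<usize> {
        let bytes = src.as_bytes();
        let mut depth = 1;
        while pos + 1 < bytes.len() {
            if bytes[pos] == b'*' && bytes[pos + 1] == b'/' {
                depth -= 1;
                pos += 2;
                if depth == 0 {
                    return Some(pos);
                }
            } else if nested && bytes[pos] == b'/' && bytes[pos + 1] == b'*' {
                depth += 1; // a nested opener: require one more closer
                pos += 2;
            } else {
                pos += 1;
            }
        }
        None
    }

    fn main() {
        let sql = "/* a /* b */ c */ SELECT 1";
        assert_eq!(skip_block_comment(sql, 2, true), Some(17));  // whole nested span
        assert_eq!(skip_block_comment(sql, 2, false), Some(12)); // stops at first */
    }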
@@ -405,28 +419,22 @@ impl<'a> TokenizerState<'a> {
} else if *token_type == self.token_types.bit_string {
(Some(2), *token_type, end.clone())
} else if *token_type == self.token_types.heredoc_string {
- if self.settings.heredoc_tag_is_identifier
- && !self.is_identifier(self.peek_char)
- && self.peek_char.to_string() != *end
- {
- if self.token_types.heredoc_string_alternative != self.token_types.var {
- self.add(self.token_types.heredoc_string_alternative, None)?
- } else {
- self.scan_var()?
- };
-
- return Ok(true)
- };
-
self.advance(1)?;
let tag = if self.current_char.to_string() == *end {
String::from("")
} else {
- self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
+ self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)?
};
- if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier {
+ if !tag.is_empty()
+ && self.settings.heredoc_tag_is_identifier
+ && (self.is_end || !self.is_identifier(&tag))
+ {
+ if !self.is_end {
+ self.advance(-1)?;
+ }
+
self.advance(-(tag.len() as isize))?;
self.add(self.token_types.heredoc_string_alternative, None)?;
return Ok(true)
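
Note: the heredoc branch now extracts the tag first and only then decides whether it can open a heredoc, rewinding and falling back when the tag must be an identifier but is not, or when input ran out. A hedged sketch of that decision (function and names are mine, not the crate's API):

    // `tag` is the text scanned after the opening delimiter (e.g. "foo" in $foo$);
    // `reached_end` means input ended before a closing delimiter was found.
    fn classify_heredoc(tag: &str, reached_end: bool, tag_must_be_identifier: bool) -> &'static str {
        let is_identifier = |s: &str| {
            let mut chars = s.chars();
            matches!(chars.next(), Some(c) if c.is_alphabetic() || c == '_')
                && chars.all(|c| c.is_alphabetic() || c == '_' || c.is_ascii_digit())
        };
        if !tag.is_empty() && tag_must_be_identifier && (reached_end || !is_identifier(tag)) {
            // Mirrors the diff: rewind past the tag, emit the alternative token.
            "rewind; emit heredoc_string_alternative"
        } else {
            "scan the heredoc body until the closing tag"
        }
    }

For Postgres-style dollar quoting this means input like "$1" or a dangling "$tag" at end of input now falls back to the alternative token (e.g. a parameter or variable in such dialects) instead of producing a spurious heredoc.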
@@ -441,7 +449,7 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
- let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
+ let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
@@ -494,7 +502,7 @@ impl<'a> TokenizerState<'a> {
} else if self.peek_char.to_ascii_uppercase() == 'E' && scientific == 0 {
scientific += 1;
self.advance(1)?;
- } else if self.is_identifier(self.peek_char) {
+ } else if self.is_alphabetic_or_underscore(self.peek_char) {
let number_text = self.text();
let mut literal = String::from("");
@@ -587,7 +595,7 @@ impl<'a> TokenizerState<'a> {
fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
self.advance(1)?;
- let text = self.extract_string(identifier_end, true, true, true)?;
+ let text = self.extract_string(identifier_end, true, false, true)?;
self.add(self.token_types.identifier, Some(text))
}
@@ -595,7 +603,7 @@ impl<'a> TokenizerState<'a> {
&mut self,
delimiter: &str,
use_identifier_escapes: bool,
- unescape_sequences: bool,
+ raw_string: bool,
raise_unmatched: bool,
) -> Result<String, TokenizerError> {
let mut text = String::from("");
@@ -608,7 +616,7 @@ impl<'a> TokenizerState<'a> {
};
let peek_char_str = self.peek_char.to_string();
- if unescape_sequences
+ if !raw_string
&& !self.dialect_settings.unescaped_sequences.is_empty()
&& !self.peek_char.is_whitespace()
&& self.settings.string_escapes.contains(&self.current_char)
@@ -623,7 +631,8 @@ impl<'a> TokenizerState<'a> {
}
}
- if escapes.contains(&self.current_char)
+ if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string)
+ && escapes.contains(&self.current_char)
&& (peek_char_str == delimiter || escapes.contains(&self.peek_char))
&& (self.current_char == self.peek_char
|| !self
@@ -676,10 +685,18 @@ impl<'a> TokenizerState<'a> {
Ok(text)
}
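
Note: throughout these hunks the third extract_string argument flips meaning from `unescape_sequences` to `raw_string`, and the new setting lets escapes stay active even inside raw strings. A simplified, single-char-delimiter sketch of the combined behavior (the real method also handles dialect unescaped sequences, identifier escapes, and multi-char delimiters):

    // Scan string contents until `delim`, honoring `esc` per the two flags.
    fn extract_string(src: &str, delim: char, esc: char, raw: bool, esc_in_raw: bool) -> Option<String> {
        let mut out = String::new();
        let mut it = src.chars().peekable();
        while let Some(c) = it.next() {
            // Mirrors `(string_escapes_allowed_in_raw_strings || !raw_string)`.
            if (esc_in_raw || !raw) && c == esc {
                if let Some(&next) = it.peek() {
                    if next == delim || next == esc {
                        out.push(next); // keep the escaped char, drop the escape
                        it.next();
                        continue;
                    }
                }
            }
            if c == delim {
                return Some(out); // closing delimiter reached
            }
            out.push(c);
        }
        None // unterminated string
    }

    fn main() {
        // Normal string: backslash escapes the quote.
        assert_eq!(extract_string("a\\'b'", '\'', '\\', false, false), Some("a'b".into()));
        // Raw string, escapes off: backslash is literal, string ends at the quote.
        assert_eq!(extract_string("a\\'b'", '\'', '\\', true, false), Some("a\\".into()));
        // Raw string, but the dialect allows escapes in raw strings.
        assert_eq!(extract_string("a\\'b'", '\'', '\\', true, true), Some("a'b".into()));
    }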
- fn is_identifier(&mut self, name: char) -> bool {
+ fn is_alphabetic_or_underscore(&mut self, name: char) -> bool {
name.is_alphabetic() || name == '_'
}
+ fn is_identifier(&mut self, s: &str) -> bool {
+ s.chars().enumerate().all(
+ |(i, c)|
+ if i == 0 { self.is_alphabetic_or_underscore(c) }
+ else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
+ )
+ }
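
Note: the old single-char is_identifier is renamed to is_alphabetic_or_underscore, and a new whole-string is_identifier backs the heredoc tag check above. A free-function restatement with the assumed behavior spelled out:

    fn is_alphabetic_or_underscore(c: char) -> bool {
        c.is_alphabetic() || c == '_'
    }

    // Same shape as the new method: first char must be alphabetic or '_',
    // later chars may also be digits.
    fn is_identifier(s: &str) -> bool {
        s.chars().enumerate().all(|(i, c)| {
            if i == 0 {
                is_alphabetic_or_underscore(c)
            } else {
                is_alphabetic_or_underscore(c) || c.is_digit(10)
            }
        })
    }

    fn main() {
        assert!(is_identifier("_tag1"));  // digits allowed after the first char
        assert!(!is_identifier("1tag"));  // but may not lead
        assert!(is_identifier(""));       // vacuously true; callers guard with !tag.is_empty()
    }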
+
fn extract_value(&mut self) -> Result<String, TokenizerError> {
loop {
if !self.peek_char.is_whitespace()