5 files changed, 58 insertions, 25 deletions
diff --git a/sqlglotrs/Cargo.lock b/sqlglotrs/Cargo.lock
index e9255b7..b95e6b9 100644
--- a/sqlglotrs/Cargo.lock
+++ b/sqlglotrs/Cargo.lock
@@ -188,7 +188,7 @@ checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970"
 
 [[package]]
 name = "sqlglotrs"
-version = "0.2.5"
+version = "0.2.12"
 dependencies = [
  "pyo3",
 ]
diff --git a/sqlglotrs/Cargo.toml b/sqlglotrs/Cargo.toml
index 4c566ee..a3e23df 100644
--- a/sqlglotrs/Cargo.toml
+++ b/sqlglotrs/Cargo.toml
@@ -1,7 +1,8 @@
 [package]
 name = "sqlglotrs"
-version = "0.2.5"
+version = "0.2.12"
 edition = "2021"
+license = "MIT"
 
 [lib]
 name = "sqlglotrs"
diff --git a/sqlglotrs/pyproject.toml b/sqlglotrs/pyproject.toml
index 867cdcc..d84ee91 100644
--- a/sqlglotrs/pyproject.toml
+++ b/sqlglotrs/pyproject.toml
@@ -4,13 +4,22 @@ build-backend = "maturin"
 
 [project]
 name = "sqlglotrs"
+description = "An easily customizable SQL parser and transpiler"
 requires-python = ">=3.7"
 classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
+    "License :: OSI Approved :: MIT License",
+]
+authors = [
+  { name="Toby Mao", email="toby.mao@gmail.com" },
 ]
 dynamic = ["version"]
 
+[project.urls]
+Homepage = "https://github.com/tobymao/sqlglot"
+Issues = "https://github.com/tobymao/sqlglot/issues"
+
 [tool.maturin]
 features = ["pyo3/extension-module"]
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 4cacb9b..3068fd2 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -76,6 +76,8 @@ pub struct TokenizerSettings {
     pub commands: HashSet<TokenType>,
     pub command_prefix_tokens: HashSet<TokenType>,
     pub heredoc_tag_is_identifier: bool,
+    pub string_escapes_allowed_in_raw_strings: bool,
+    pub nested_comments: bool,
 }
 
 #[pymethods]
@@ -98,6 +100,8 @@ impl TokenizerSettings {
         commands: HashSet<TokenType>,
         command_prefix_tokens: HashSet<TokenType>,
         heredoc_tag_is_identifier: bool,
+        string_escapes_allowed_in_raw_strings: bool,
+        nested_comments: bool,
     ) -> Self {
         let to_char = |v: &String| {
             if v.len() == 1 {
@@ -147,6 +151,8 @@ impl TokenizerSettings {
             commands,
             command_prefix_tokens,
             heredoc_tag_is_identifier,
+            string_escapes_allowed_in_raw_strings,
+            nested_comments,
         }
     }
 }
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index e79d0e9..6df3bfb 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> {
             // Skip the comment's start delimiter.
             self.advance(comment_start_size as isize)?;
 
+            let mut comment_count = 1;
             let comment_end_size = comment_end.len();
 
-            while !self.is_end && self.chars(comment_end_size) != *comment_end {
+            while !self.is_end {
+                if self.chars(comment_end_size) == *comment_end {
+                    comment_count -= 1;
+                    if comment_count == 0 {
+                        break;
+                    }
+                }
+
                 self.advance(1)?;
+
+                // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+                if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+                    self.advance(comment_start_size as isize)?;
+                    comment_count += 1
+                }
             }
 
             let text = self.text();
@@ -405,28 +419,22 @@ impl<'a> TokenizerState<'a> {
             } else if *token_type == self.token_types.bit_string {
                 (Some(2), *token_type, end.clone())
             } else if *token_type == self.token_types.heredoc_string {
-                if self.settings.heredoc_tag_is_identifier
-                    && !self.is_identifier(self.peek_char)
-                    && self.peek_char.to_string() != *end
-                {
-                    if self.token_types.heredoc_string_alternative != self.token_types.var {
-                        self.add(self.token_types.heredoc_string_alternative, None)?
-                    } else {
-                        self.scan_var()?
-                    };
-
-                    return Ok(true)
-                };
-
                 self.advance(1)?;
 
                 let tag = if self.current_char.to_string() == *end {
                     String::from("")
                 } else {
-                    self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
+                    self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)?
                 };
 
-                if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier {
+                if !tag.is_empty()
+                    && self.settings.heredoc_tag_is_identifier
+                    && (self.is_end || !self.is_identifier(&tag))
+                {
+                    if !self.is_end {
+                        self.advance(-1)?;
+                    }
+
                     self.advance(-(tag.len() as isize))?;
                     self.add(self.token_types.heredoc_string_alternative, None)?;
                     return Ok(true)
@@ -441,7 +449,7 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
+        let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
             if u64::from_str_radix(&text, b).is_err() {
@@ -494,7 +502,7 @@ impl<'a> TokenizerState<'a> {
             } else if self.peek_char.to_ascii_uppercase() == 'E' && scientific == 0 {
                 scientific += 1;
                 self.advance(1)?;
-            } else if self.is_identifier(self.peek_char) {
+            } else if self.is_alphabetic_or_underscore(self.peek_char) {
                 let number_text = self.text();
                 let mut literal = String::from("");
 
@@ -587,7 +595,7 @@ impl<'a> TokenizerState<'a> {
 
     fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
         self.advance(1)?;
-        let text = self.extract_string(identifier_end, true, true, true)?;
+        let text = self.extract_string(identifier_end, true, false, true)?; // raw_string = false
         self.add(self.token_types.identifier, Some(text))
     }
 
@@ -595,7 +603,7 @@ impl<'a> TokenizerState<'a> {
         &mut self,
         delimiter: &str,
         use_identifier_escapes: bool,
-        unescape_sequences: bool,
+        raw_string: bool,
         raise_unmatched: bool,
     ) -> Result<String, TokenizerError> {
         let mut text = String::from("");
@@ -608,7 +616,7 @@ impl<'a> TokenizerState<'a> {
             };
             let peek_char_str = self.peek_char.to_string();
 
-            if unescape_sequences
+            if !raw_string
                 && !self.dialect_settings.unescaped_sequences.is_empty()
                 && !self.peek_char.is_whitespace()
                 && self.settings.string_escapes.contains(&self.current_char)
@@ -623,7 +631,8 @@ impl<'a> TokenizerState<'a> {
                 }
             }
 
-            if escapes.contains(&self.current_char)
+            if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string)
+                && escapes.contains(&self.current_char)
                 && (peek_char_str == delimiter || escapes.contains(&self.peek_char))
                 && (self.current_char == self.peek_char
                     || !self
@@ -676,10 +685,18 @@ impl<'a> TokenizerState<'a> {
         Ok(text)
     }
 
-    fn is_identifier(&mut self, name: char) -> bool {
+    /// Returns true when `name` is a Unicode alphabetic character or an underscore.
+    fn is_alphabetic_or_underscore(&self, name: char) -> bool {
         name.is_alphabetic() || name == '_'
     }
 
+    /// Checks `s` is alphabetic/`_` chars plus, after the first char, ASCII digits (true for "").
+    fn is_identifier(&self, s: &str) -> bool {
+        let mut rest = s.chars();
+        rest.next().map_or(true, |c| self.is_alphabetic_or_underscore(c))
+            && rest.all(|c| self.is_alphabetic_or_underscore(c) || c.is_ascii_digit())
+    }
+
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
         loop {
             if !self.peek_char.is_whitespace()