author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2025-01-14 10:04:11 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2025-01-14 10:04:11 +0000
commit    | c71a090d2dcb7b8332b02d5f3c72e4248acf1c19 (patch)
tree      | b742ed8e66fadf6668042e2de714484e6212acaa /sqlglotrs/src
parent    | Releasing debian version 26.0.1-1. (diff)
download  | sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.tar.xz sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.zip
Merging upstream version 26.1.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- | sqlglotrs/src/lib.rs       | 93
-rw-r--r-- | sqlglotrs/src/settings.rs  | 68
-rw-r--r-- | sqlglotrs/src/token.rs     | 61
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 45
4 files changed, 166 insertions, 101 deletions
diff --git a/sqlglotrs/src/lib.rs b/sqlglotrs/src/lib.rs
index e60620a..bb6caf6 100644
--- a/sqlglotrs/src/lib.rs
+++ b/sqlglotrs/src/lib.rs
@@ -1,90 +1,13 @@
 use pyo3::prelude::*;
-use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pymodule, types::PyModule, Bound, PyResult};
+use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use token::Token;
+use tokenizer::Tokenizer;
 
-mod settings;
-mod tokenizer;
-mod trie;
-
-pub use self::settings::{
-    TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
-};
-pub use self::tokenizer::Tokenizer;
-
-#[derive(Debug)]
-#[pyclass]
-pub struct Token {
-    #[pyo3(get, name = "token_type_index")]
-    pub token_type: TokenType,
-    #[pyo3(get, set, name = "token_type")]
-    pub token_type_py: PyObject,
-    #[pyo3(get)]
-    pub text: Py<PyString>,
-    #[pyo3(get)]
-    pub line: usize,
-    #[pyo3(get)]
-    pub col: usize,
-    #[pyo3(get)]
-    pub start: usize,
-    #[pyo3(get)]
-    pub end: usize,
-    #[pyo3(get)]
-    pub comments: Py<PyList>,
-}
-
-impl Token {
-    pub fn new(
-        token_type: TokenType,
-        text: String,
-        line: usize,
-        col: usize,
-        start: usize,
-        end: usize,
-        comments: Vec<String>,
-    ) -> Token {
-        Python::with_gil(|py| Token {
-            token_type,
-            token_type_py: PyNone::get_bound(py).into_py(py),
-            text: PyString::new_bound(py, &text).into_py(py),
-            line,
-            col,
-            start,
-            end,
-            comments: PyList::new_bound(py, &comments).into(),
-        })
-    }
-
-    pub fn append_comments(&self, comments: &mut Vec<String>) {
-        Python::with_gil(|py| {
-            let pylist = self.comments.bind(py);
-            for comment in comments.iter() {
-                if let Err(_) = pylist.append(comment) {
-                    panic!("Failed to append comments to the Python list");
-                }
-            }
-        });
-        // Simulate `Vec::append`.
-        let _ = std::mem::replace(comments, Vec::new());
-    }
-}
-
-#[pymethods]
-impl Token {
-    #[pyo3(name = "__repr__")]
-    fn python_repr(&self) -> PyResult<String> {
-        Python::with_gil(|py| {
-            Ok(format!(
-                "<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
-                self.token_type_py.bind(py).repr()?,
-                self.text.bind(py).repr()?,
-                self.line,
-                self.col,
-                self.start,
-                self.end,
-                self.comments.bind(py).repr()?,
-            ))
-        })
-    }
-}
+pub mod settings;
+pub mod token;
+pub mod tokenizer;
+pub mod trie;
 
 #[pymodule]
 fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 7bc4882..b0c951c 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -1,10 +1,12 @@
-use pyo3::prelude::*;
 use std::collections::{HashMap, HashSet};
 
+use pyo3::prelude::*;
+
 pub type TokenType = u16;
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenTypeSettings {
     pub bit_string: TokenType,
     pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
         heredoc_string_alternative: TokenType,
         hint: TokenType,
     ) -> Self {
-        TokenTypeSettings {
+        let token_type_settings = TokenTypeSettings {
             bit_string,
             break_,
             dcolon,
@@ -56,12 +58,31 @@
             var,
             heredoc_string_alternative,
             hint,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            token_type_settings.write_json_to_string();
         }
+
+        token_type_settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenTypeSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/token_type_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
     }
 }
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerSettings {
     pub white_space: HashMap<char, TokenType>,
     pub single_tokens: HashMap<char, TokenType>,
@@ -141,7 +162,7 @@ impl TokenizerSettings {
         let var_single_tokens_native: HashSet<char> =
            var_single_tokens.iter().map(&to_char).collect();
 
-        TokenizerSettings {
+        let tokenizer_settings = TokenizerSettings {
             white_space: white_space_native,
             single_tokens: single_tokens_native,
             keywords,
@@ -162,15 +183,35 @@
             string_escapes_allowed_in_raw_strings,
             nested_comments,
             hint_start,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            tokenizer_settings.write_json_to_string();
         }
+
+        tokenizer_settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
     }
 }
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerDialectSettings {
     pub unescaped_sequences: HashMap<String, String>,
     pub identifiers_can_start_with_digit: bool,
+    pub numbers_can_be_underscore_separated: bool,
 }
 
 #[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
     pub fn new(
         unescaped_sequences: HashMap<String, String>,
         identifiers_can_start_with_digit: bool,
+        numbers_can_be_underscore_separated: bool,
     ) -> Self {
-        TokenizerDialectSettings {
+        let settings = TokenizerDialectSettings {
             unescaped_sequences,
             identifiers_can_start_with_digit,
+            numbers_can_be_underscore_separated,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            settings.write_json_to_string();
         }
+
+        settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerDialectSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_dialect_settings.json");
+        std::fs::write(path, &json).unwrap();
     }
 }
diff --git a/sqlglotrs/src/token.rs b/sqlglotrs/src/token.rs
new file mode 100644
index 0000000..3352469
--- /dev/null
+++ b/sqlglotrs/src/token.rs
@@ -0,0 +1,61 @@
+use crate::settings::TokenType;
+use pyo3::prelude::PyListMethods;
+use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
+
+#[derive(Debug)]
+#[pyclass]
+pub struct Token {
+    #[pyo3(get, name = "token_type_index")]
+    pub token_type: TokenType,
+    #[pyo3(get, set, name = "token_type")]
+    pub token_type_py: PyObject,
+    #[pyo3(get)]
+    pub text: Py<PyString>,
+    #[pyo3(get)]
+    pub line: usize,
+    #[pyo3(get)]
+    pub col: usize,
+    #[pyo3(get)]
+    pub start: usize,
+    #[pyo3(get)]
+    pub end: usize,
+    #[pyo3(get)]
+    pub comments: Py<PyList>,
+}
+
+impl Token {
+    pub fn new(
+        token_type: TokenType,
+        text: String,
+        line: usize,
+        col: usize,
+        start: usize,
+        end: usize,
+        comments: Vec<String>,
+    ) -> Token {
+        Python::with_gil(|py| Token {
+            token_type,
+            token_type_py: PyNone::get_bound(py).into_py(py),
+            text: PyString::new_bound(py, &text).into_py(py),
+            line,
+            col,
+            start,
+            end,
+            comments: PyList::new_bound(py, &comments).into(),
+        })
+    }
+
+    pub fn append_comments(&self, comments: &mut Vec<String>) {
+        Python::with_gil(|py| {
+            let pylist = self.comments.bind(py);
+            for comment in comments.iter() {
+                if let Err(_) = pylist.append(comment) {
+                    panic!("Failed to append comments to the Python list");
+                }
+            }
+        });
+        // Simulate `Vec::append`.
+        let _ = std::mem::replace(comments, Vec::new());
+    }
+}
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 8228b5a..2ffe45f 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -1,5 +1,6 @@
+use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
-use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
 use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
                 self.advance(1)?;
 
                 // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
-                if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+                if self.settings.nested_comments
+                    && !self.is_end
+                    && self.chars(comment_start_size) == *comment_start
+                {
                     self.advance(comment_start_size as isize)?;
                     comment_count += 1
                 }
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
 
             if comment_start == self.settings.hint_start
                 && self.tokens.last().is_some()
-                && self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
+                && self
+                    .settings
+                    .tokens_preceding_hint
+                    .contains(&self.tokens.last().unwrap().token_type)
+            {
                 self.add(self.token_types.hint, None)?;
             }
 
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
 
                     self.advance(-(tag.len() as isize))?;
                     self.add(self.token_types.heredoc_string_alternative, None)?;
-                    return Ok(true)
+                    return Ok(true);
                 }
 
                 (None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
+        let text =
+            self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
-            if u64::from_str_radix(&text, b).is_err() {
+            if u128::from_str_radix(&text, b).is_err() {
                 return self.error_result(format!(
                     "Numeric string contains invalid characters from {}:{}",
                     self.line, self.start
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
                 )
                 .map(|x| *x);
 
+            let replaced = literal.replace("_", "");
+
             if let Some(unwrapped_token_type) = token_type {
                 self.add(self.token_types.number, Some(number_text))?;
                 self.add(self.token_types.dcolon, Some("::".to_string()))?;
                 self.add(unwrapped_token_type, Some(literal))?;
+            } else if self.dialect_settings.numbers_can_be_underscore_separated
+                && self.is_numeric(&replaced)
+            {
+                self.add(self.token_types.number, Some(number_text + &replaced))?;
             } else if self.dialect_settings.identifiers_can_start_with_digit {
                 self.add(self.token_types.var, None)?;
             } else {
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
             if self.is_end {
                 if !raise_unmatched {
                     text.push(self.current_char);
-                    return Ok(text)
+                    return Ok(text);
                 }
 
                 return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
     }
 
     fn is_identifier(&mut self, s: &str) -> bool {
-        s.chars().enumerate().all(
-            |(i, c)|
-                if i == 0 { self.is_alphabetic_or_underscore(c) }
-                else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
-        )
+        s.chars().enumerate().all(|(i, c)| {
+            if i == 0 {
+                self.is_alphabetic_or_underscore(c)
+            } else {
+                self.is_alphabetic_or_underscore(c) || c.is_digit(10)
+            }
+        })
+    }
+
+    fn is_numeric(&mut self, s: &str) -> bool {
+        s.chars().all(|c| c.is_digit(10))
     }
 
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
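For context on the tokenizer.rs hunk: the new `numbers_can_be_underscore_separated` dialect flag works together with the added `is_numeric` helper, so that a literal such as `1_000` has its underscores stripped and, if the remainder is purely numeric, is emitted as a single NUMBER token (`number_text + &replaced`). The sketch below is a minimal, standalone illustration of that check; `merge_underscore_number` is a hypothetical helper written for illustration, not part of the sqlglotrs API.

```rust
/// Hypothetical standalone helper mirroring the underscore handling added in
/// tokenizer.rs: strip underscores from the trailing literal and, when the
/// dialect flag is set and the remainder is all digits, merge it with the
/// leading digits into a single number-token text.
fn merge_underscore_number(
    number_text: &str,
    literal: &str,
    numbers_can_be_underscore_separated: bool,
) -> Option<String> {
    let replaced = literal.replace('_', "");
    if numbers_can_be_underscore_separated && replaced.chars().all(|c| c.is_ascii_digit()) {
        // Corresponds to `self.add(self.token_types.number, Some(number_text + &replaced))?`.
        Some(format!("{number_text}{replaced}"))
    } else {
        // Otherwise the tokenizer falls through to the identifier/error branches.
        None
    }
}

fn main() {
    // "1_000" is scanned as the digits "1" plus the tail "_000"; with the flag
    // enabled the pieces collapse into the single number "1000".
    assert_eq!(
        merge_underscore_number("1", "_000", true),
        Some("1000".to_string())
    );
    // Without the flag the tail is not merged.
    assert_eq!(merge_underscore_number("1", "_000", false), None);
}
```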