author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2025-01-14 10:04:11 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2025-01-14 10:04:11 +0000
commit    | c71a090d2dcb7b8332b02d5f3c72e4248acf1c19 (patch)
tree      | b742ed8e66fadf6668042e2de714484e6212acaa /sqlglotrs/src
parent    | Releasing debian version 26.0.1-1. (diff)
download  | sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.tar.xz sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.zip
Merging upstream version 26.1.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r-- | sqlglotrs/src/lib.rs       | 93
-rw-r--r-- | sqlglotrs/src/settings.rs  | 68
-rw-r--r-- | sqlglotrs/src/token.rs     | 61
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 45
4 files changed, 166 insertions, 101 deletions
diff --git a/sqlglotrs/src/lib.rs b/sqlglotrs/src/lib.rs
index e60620a..bb6caf6 100644
--- a/sqlglotrs/src/lib.rs
+++ b/sqlglotrs/src/lib.rs
@@ -1,90 +1,13 @@
 use pyo3::prelude::*;
-use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pymodule, types::PyModule, Bound, PyResult};
+use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use token::Token;
+use tokenizer::Tokenizer;
 
-mod settings;
-mod tokenizer;
-mod trie;
-
-pub use self::settings::{
-    TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
-};
-pub use self::tokenizer::Tokenizer;
-
-#[derive(Debug)]
-#[pyclass]
-pub struct Token {
-    #[pyo3(get, name = "token_type_index")]
-    pub token_type: TokenType,
-    #[pyo3(get, set, name = "token_type")]
-    pub token_type_py: PyObject,
-    #[pyo3(get)]
-    pub text: Py<PyString>,
-    #[pyo3(get)]
-    pub line: usize,
-    #[pyo3(get)]
-    pub col: usize,
-    #[pyo3(get)]
-    pub start: usize,
-    #[pyo3(get)]
-    pub end: usize,
-    #[pyo3(get)]
-    pub comments: Py<PyList>,
-}
-
-impl Token {
-    pub fn new(
-        token_type: TokenType,
-        text: String,
-        line: usize,
-        col: usize,
-        start: usize,
-        end: usize,
-        comments: Vec<String>,
-    ) -> Token {
-        Python::with_gil(|py| Token {
-            token_type,
-            token_type_py: PyNone::get_bound(py).into_py(py),
-            text: PyString::new_bound(py, &text).into_py(py),
-            line,
-            col,
-            start,
-            end,
-            comments: PyList::new_bound(py, &comments).into(),
-        })
-    }
-
-    pub fn append_comments(&self, comments: &mut Vec<String>) {
-        Python::with_gil(|py| {
-            let pylist = self.comments.bind(py);
-            for comment in comments.iter() {
-                if let Err(_) = pylist.append(comment) {
-                    panic!("Failed to append comments to the Python list");
-                }
-            }
-        });
-        // Simulate `Vec::append`.
-        let _ = std::mem::replace(comments, Vec::new());
-    }
-}
-
-#[pymethods]
-impl Token {
-    #[pyo3(name = "__repr__")]
-    fn python_repr(&self) -> PyResult<String> {
-        Python::with_gil(|py| {
-            Ok(format!(
-                "<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
-                self.token_type_py.bind(py).repr()?,
-                self.text.bind(py).repr()?,
-                self.line,
-                self.col,
-                self.start,
-                self.end,
-                self.comments.bind(py).repr()?,
-            ))
-        })
-    }
-}
+pub mod settings;
+pub mod token;
+pub mod tokenizer;
+pub mod trie;
 
 #[pymodule]
 fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 7bc4882..b0c951c 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -1,10 +1,12 @@
-use pyo3::prelude::*;
 use std::collections::{HashMap, HashSet};
 
+use pyo3::prelude::*;
+
 pub type TokenType = u16;
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenTypeSettings {
     pub bit_string: TokenType,
     pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
         heredoc_string_alternative: TokenType,
         hint: TokenType,
     ) -> Self {
-        TokenTypeSettings {
+        let token_type_settings = TokenTypeSettings {
             bit_string,
             break_,
             dcolon,
@@ -56,12 +58,31 @@
             var,
             heredoc_string_alternative,
             hint,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            token_type_settings.write_json_to_string();
         }
+
+        token_type_settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenTypeSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/token_type_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
     }
 }
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerSettings {
     pub white_space: HashMap<char, TokenType>,
     pub single_tokens: HashMap<char, TokenType>,
@@ -141,7 +162,7 @@ impl TokenizerSettings {
         let var_single_tokens_native: HashSet<char> =
            var_single_tokens.iter().map(&to_char).collect();
 
-        TokenizerSettings {
+        let tokenizer_settings = TokenizerSettings {
             white_space: white_space_native,
             single_tokens: single_tokens_native,
             keywords,
@@ -162,15 +183,35 @@
             string_escapes_allowed_in_raw_strings,
             nested_comments,
             hint_start,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            tokenizer_settings.write_json_to_string();
         }
+
+        tokenizer_settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
     }
 }
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerDialectSettings {
     pub unescaped_sequences: HashMap<String, String>,
     pub identifiers_can_start_with_digit: bool,
+    pub numbers_can_be_underscore_separated: bool,
 }
 
 #[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
     pub fn new(
         unescaped_sequences: HashMap<String, String>,
         identifiers_can_start_with_digit: bool,
+        numbers_can_be_underscore_separated: bool,
     ) -> Self {
-        TokenizerDialectSettings {
+        let settings = TokenizerDialectSettings {
             unescaped_sequences,
             identifiers_can_start_with_digit,
+            numbers_can_be_underscore_separated,
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            settings.write_json_to_string();
         }
+
+        settings
+    }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerDialectSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_dialect_settings.json");
+        std::fs::write(path, &json).unwrap();
     }
 }
diff --git a/sqlglotrs/src/token.rs b/sqlglotrs/src/token.rs
new file mode 100644
index 0000000..3352469
--- /dev/null
+++ b/sqlglotrs/src/token.rs
@@ -0,0 +1,61 @@
+use crate::settings::TokenType;
+use pyo3::prelude::PyListMethods;
+use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
+
+#[derive(Debug)]
+#[pyclass]
+pub struct Token {
+    #[pyo3(get, name = "token_type_index")]
+    pub token_type: TokenType,
+    #[pyo3(get, set, name = "token_type")]
+    pub token_type_py: PyObject,
+    #[pyo3(get)]
+    pub text: Py<PyString>,
+    #[pyo3(get)]
+    pub line: usize,
+    #[pyo3(get)]
+    pub col: usize,
+    #[pyo3(get)]
+    pub start: usize,
+    #[pyo3(get)]
+    pub end: usize,
+    #[pyo3(get)]
+    pub comments: Py<PyList>,
+}
+
+impl Token {
+    pub fn new(
+        token_type: TokenType,
+        text: String,
+        line: usize,
+        col: usize,
+        start: usize,
+        end: usize,
+        comments: Vec<String>,
+    ) -> Token {
+        Python::with_gil(|py| Token {
+            token_type,
+            token_type_py: PyNone::get_bound(py).into_py(py),
+            text: PyString::new_bound(py, &text).into_py(py),
+            line,
+            col,
+            start,
+            end,
+            comments: PyList::new_bound(py, &comments).into(),
+        })
+    }
+
+    pub fn append_comments(&self, comments: &mut Vec<String>) {
+        Python::with_gil(|py| {
+            let pylist = self.comments.bind(py);
+            for comment in comments.iter() {
+                if let Err(_) = pylist.append(comment) {
+                    panic!("Failed to append comments to the Python list");
+                }
+            }
+        });
+        // Simulate `Vec::append`.
+        let _ = std::mem::replace(comments, Vec::new());
+    }
+}
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 8228b5a..2ffe45f 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -1,5 +1,6 @@
+use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
-use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
 use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
                 self.advance(1)?;
 
                 // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
-                if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+                if self.settings.nested_comments
+                    && !self.is_end
+                    && self.chars(comment_start_size) == *comment_start
+                {
                     self.advance(comment_start_size as isize)?;
                     comment_count += 1
                 }
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
 
             if comment_start == self.settings.hint_start
                 && self.tokens.last().is_some()
-                && self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
+                && self
+                    .settings
+                    .tokens_preceding_hint
+                    .contains(&self.tokens.last().unwrap().token_type)
+            {
                 self.add(self.token_types.hint, None)?;
             }
 
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
 
                     self.advance(-(tag.len() as isize))?;
                     self.add(self.token_types.heredoc_string_alternative, None)?;
-                    return Ok(true)
+                    return Ok(true);
                 }
 
                 (None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
+        let text =
+            self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
-            if u64::from_str_radix(&text, b).is_err() {
+            if u128::from_str_radix(&text, b).is_err() {
                 return self.error_result(format!(
                     "Numeric string contains invalid characters from {}:{}",
                     self.line, self.start
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
                 )
                 .map(|x| *x);
 
+            let replaced = literal.replace("_", "");
+
             if let Some(unwrapped_token_type) = token_type {
                 self.add(self.token_types.number, Some(number_text))?;
                 self.add(self.token_types.dcolon, Some("::".to_string()))?;
                 self.add(unwrapped_token_type, Some(literal))?;
+            } else if self.dialect_settings.numbers_can_be_underscore_separated
+                && self.is_numeric(&replaced)
+            {
+                self.add(self.token_types.number, Some(number_text + &replaced))?;
             } else if self.dialect_settings.identifiers_can_start_with_digit {
                 self.add(self.token_types.var, None)?;
             } else {
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
             if self.is_end {
                 if !raise_unmatched {
                     text.push(self.current_char);
-                    return Ok(text)
+                    return Ok(text);
                 }
 
                 return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
     }
 
     fn is_identifier(&mut self, s: &str) -> bool {
-        s.chars().enumerate().all(
-            |(i, c)|
-                if i == 0 { self.is_alphabetic_or_underscore(c) }
-                else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
-        )
+        s.chars().enumerate().all(|(i, c)| {
+            if i == 0 {
+                self.is_alphabetic_or_underscore(c)
+            } else {
+                self.is_alphabetic_or_underscore(c) || c.is_digit(10)
+            }
+        })
+    }
+
+    fn is_numeric(&mut self, s: &str) -> bool {
+        s.chars().all(|c| c.is_digit(10))
     }
 
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
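For context on the tokenizer.rs hunk: the new `numbers_can_be_underscore_separated` dialect flag works together with the added `is_numeric` helper, so that a literal such as `1_000` has its underscores stripped and, if the remainder is purely numeric, is emitted as a single NUMBER token (`number_text + &replaced`). The sketch below is a minimal, standalone illustration of that check; `merge_underscore_number` is a hypothetical helper written for illustration, not part of the sqlglotrs API.

```rust
/// Hypothetical standalone helper mirroring the underscore handling added in
/// tokenizer.rs: strip underscores from the trailing literal and, when the
/// dialect flag is set and the remainder is all digits, merge it with the
/// leading digits into a single number-token text.
fn merge_underscore_number(
    number_text: &str,
    literal: &str,
    numbers_can_be_underscore_separated: bool,
) -> Option<String> {
    let replaced = literal.replace('_', "");
    if numbers_can_be_underscore_separated && replaced.chars().all(|c| c.is_ascii_digit()) {
        // Corresponds to `self.add(self.token_types.number, Some(number_text + &replaced))?`.
        Some(format!("{number_text}{replaced}"))
    } else {
        // Otherwise the tokenizer falls through to the identifier/error branches.
        None
    }
}

fn main() {
    // "1_000" is scanned as the digits "1" plus the tail "_000"; with the flag
    // enabled the pieces collapse into the single number "1000".
    assert_eq!(
        merge_underscore_number("1", "_000", true),
        Some("1000".to_string())
    );
    // Without the flag the tail is not merged.
    assert_eq!(merge_underscore_number("1", "_000", false), None);
}
```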