author     Daniel Baumann <daniel.baumann@progress-linux.org>  2025-01-14 10:04:11 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2025-01-14 10:04:11 +0000
commit     c71a090d2dcb7b8332b02d5f3c72e4248acf1c19 (patch)
tree       b742ed8e66fadf6668042e2de714484e6212acaa /sqlglotrs/src
parent     Releasing debian version 26.0.1-1. (diff)
download   sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.tar.xz
           sqlglot-c71a090d2dcb7b8332b02d5f3c72e4248acf1c19.zip
Merging upstream version 26.1.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglotrs/src')
-rw-r--r--  sqlglotrs/src/lib.rs        93
-rw-r--r--  sqlglotrs/src/settings.rs   68
-rw-r--r--  sqlglotrs/src/token.rs      61
-rw-r--r--  sqlglotrs/src/tokenizer.rs  45
4 files changed, 166 insertions, 101 deletions
diff --git a/sqlglotrs/src/lib.rs b/sqlglotrs/src/lib.rs
index e60620a..bb6caf6 100644
--- a/sqlglotrs/src/lib.rs
+++ b/sqlglotrs/src/lib.rs
@@ -1,90 +1,13 @@
use pyo3::prelude::*;
-use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pymodule, types::PyModule, Bound, PyResult};
+use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use token::Token;
+use tokenizer::Tokenizer;
-mod settings;
-mod tokenizer;
-mod trie;
-
-pub use self::settings::{
- TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
-};
-pub use self::tokenizer::Tokenizer;
-
-#[derive(Debug)]
-#[pyclass]
-pub struct Token {
- #[pyo3(get, name = "token_type_index")]
- pub token_type: TokenType,
- #[pyo3(get, set, name = "token_type")]
- pub token_type_py: PyObject,
- #[pyo3(get)]
- pub text: Py<PyString>,
- #[pyo3(get)]
- pub line: usize,
- #[pyo3(get)]
- pub col: usize,
- #[pyo3(get)]
- pub start: usize,
- #[pyo3(get)]
- pub end: usize,
- #[pyo3(get)]
- pub comments: Py<PyList>,
-}
-
-impl Token {
- pub fn new(
- token_type: TokenType,
- text: String,
- line: usize,
- col: usize,
- start: usize,
- end: usize,
- comments: Vec<String>,
- ) -> Token {
- Python::with_gil(|py| Token {
- token_type,
- token_type_py: PyNone::get_bound(py).into_py(py),
- text: PyString::new_bound(py, &text).into_py(py),
- line,
- col,
- start,
- end,
- comments: PyList::new_bound(py, &comments).into(),
- })
- }
-
- pub fn append_comments(&self, comments: &mut Vec<String>) {
- Python::with_gil(|py| {
- let pylist = self.comments.bind(py);
- for comment in comments.iter() {
- if let Err(_) = pylist.append(comment) {
- panic!("Failed to append comments to the Python list");
- }
- }
- });
- // Simulate `Vec::append`.
- let _ = std::mem::replace(comments, Vec::new());
- }
-}
-
-#[pymethods]
-impl Token {
- #[pyo3(name = "__repr__")]
- fn python_repr(&self) -> PyResult<String> {
- Python::with_gil(|py| {
- Ok(format!(
- "<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
- self.token_type_py.bind(py).repr()?,
- self.text.bind(py).repr()?,
- self.line,
- self.col,
- self.start,
- self.end,
- self.comments.bind(py).repr()?,
- ))
- })
- }
-}
+pub mod settings;
+pub mod token;
+pub mod tokenizer;
+pub mod trie;
#[pymodule]
fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
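Note: the lib.rs rewrite above moves Token out into its own token module and exposes settings, token, tokenizer and trie as public modules, leaving lib.rs as little more than the PyO3 module registration. The body of the #[pymodule] function is unchanged context and therefore elided from this hunk; the sketch below shows what such a registration typically looks like, and the exact set of registered classes is an assumption rather than something taken from this diff.

    // Hypothetical sketch of the elided #[pymodule] body, using the imports
    // shown at the top of the new lib.rs; not part of this commit's changes.
    #[pymodule]
    fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_class::<Token>()?;
        m.add_class::<TokenTypeSettings>()?;
        m.add_class::<TokenizerSettings>()?;
        m.add_class::<TokenizerDialectSettings>()?;
        m.add_class::<Tokenizer>()?;
        Ok(())
    }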
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 7bc4882..b0c951c 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -1,10 +1,12 @@
-use pyo3::prelude::*;
use std::collections::{HashMap, HashSet};
+use pyo3::prelude::*;
+
pub type TokenType = u16;
#[derive(Clone, Debug)]
#[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
pub bit_string: TokenType,
pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
heredoc_string_alternative: TokenType,
hint: TokenType,
) -> Self {
- TokenTypeSettings {
+ let token_type_settings = TokenTypeSettings {
bit_string,
break_,
dcolon,
@@ -56,12 +58,31 @@ impl TokenTypeSettings {
var,
heredoc_string_alternative,
hint,
+ };
+
+ #[cfg(feature = "profiling")]
+ {
+ token_type_settings.write_json_to_string();
}
+
+ token_type_settings
+ }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenTypeSettings {
+ pub fn write_json_to_string(&self) {
+ let json = serde_json::to_string(self).unwrap();
+ let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+ .join("benches/token_type_settings.json");
+ // Write to file
+ std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
pub white_space: HashMap<char, TokenType>,
pub single_tokens: HashMap<char, TokenType>,
@@ -141,7 +162,7 @@ impl TokenizerSettings {
let var_single_tokens_native: HashSet<char> =
var_single_tokens.iter().map(&to_char).collect();
- TokenizerSettings {
+ let tokenizer_settings = TokenizerSettings {
white_space: white_space_native,
single_tokens: single_tokens_native,
keywords,
@@ -162,15 +183,35 @@ impl TokenizerSettings {
string_escapes_allowed_in_raw_strings,
nested_comments,
hint_start,
+ };
+
+ #[cfg(feature = "profiling")]
+ {
+ tokenizer_settings.write_json_to_string();
}
+
+ tokenizer_settings
+ }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerSettings {
+ pub fn write_json_to_string(&self) {
+ let json = serde_json::to_string(self).unwrap();
+ let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+ .join("benches/tokenizer_settings.json");
+ // Write to file
+ std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
pub unescaped_sequences: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
+ pub numbers_can_be_underscore_separated: bool,
}
#[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
pub fn new(
unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
+ numbers_can_be_underscore_separated: bool,
) -> Self {
- TokenizerDialectSettings {
+ let settings = TokenizerDialectSettings {
unescaped_sequences,
identifiers_can_start_with_digit,
+ numbers_can_be_underscore_separated,
+ };
+
+ #[cfg(feature = "profiling")]
+ {
+ settings.write_json_to_string();
}
+
+ settings
+ }
+}
+
+#[cfg(feature = "profiling")]
+impl TokenizerDialectSettings {
+ pub fn write_json_to_string(&self) {
+ let json = serde_json::to_string(self).unwrap();
+ let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+ .join("benches/tokenizer_dialect_settings.json");
+ std::fs::write(path, &json).unwrap();
}
}
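Note on the profiling feature introduced above: when the crate is built with --features profiling, each settings constructor serializes the freshly built struct to a JSON snapshot under benches/, so Rust-side benchmarks can reconstruct realistic settings without round-tripping through Python. The sketch below shows how a benchmark might load such a snapshot back; the helper name is hypothetical, and it assumes serde_json plus the file written by TokenizerSettings::new.

    // Hypothetical bench helper (requires the `profiling` feature so that
    // TokenizerSettings derives serde::Deserialize, as in the diff above).
    use sqlglotrs::settings::TokenizerSettings;

    fn load_tokenizer_settings() -> TokenizerSettings {
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/tokenizer_settings.json");
        let json = std::fs::read_to_string(&path)
            .expect("snapshot missing: generate it once with the profiling feature enabled");
        serde_json::from_str(&json)
            .expect("snapshot should deserialize back into TokenizerSettings")
    }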
diff --git a/sqlglotrs/src/token.rs b/sqlglotrs/src/token.rs
new file mode 100644
index 0000000..3352469
--- /dev/null
+++ b/sqlglotrs/src/token.rs
@@ -0,0 +1,61 @@
+use crate::settings::TokenType;
+use pyo3::prelude::PyListMethods;
+use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
+
+#[derive(Debug)]
+#[pyclass]
+pub struct Token {
+ #[pyo3(get, name = "token_type_index")]
+ pub token_type: TokenType,
+ #[pyo3(get, set, name = "token_type")]
+ pub token_type_py: PyObject,
+ #[pyo3(get)]
+ pub text: Py<PyString>,
+ #[pyo3(get)]
+ pub line: usize,
+ #[pyo3(get)]
+ pub col: usize,
+ #[pyo3(get)]
+ pub start: usize,
+ #[pyo3(get)]
+ pub end: usize,
+ #[pyo3(get)]
+ pub comments: Py<PyList>,
+}
+
+impl Token {
+ pub fn new(
+ token_type: TokenType,
+ text: String,
+ line: usize,
+ col: usize,
+ start: usize,
+ end: usize,
+ comments: Vec<String>,
+ ) -> Token {
+ Python::with_gil(|py| Token {
+ token_type,
+ token_type_py: PyNone::get_bound(py).into_py(py),
+ text: PyString::new_bound(py, &text).into_py(py),
+ line,
+ col,
+ start,
+ end,
+ comments: PyList::new_bound(py, &comments).into(),
+ })
+ }
+
+ pub fn append_comments(&self, comments: &mut Vec<String>) {
+ Python::with_gil(|py| {
+ let pylist = self.comments.bind(py);
+ for comment in comments.iter() {
+ if let Err(_) = pylist.append(comment) {
+ panic!("Failed to append comments to the Python list");
+ }
+ }
+ });
+ // Simulate `Vec::append`.
+ let _ = std::mem::replace(comments, Vec::new());
+ }
+}
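Note: the new token module is a straight move of Token out of lib.rs. Because the struct stores Py<PyString> and Py<PyList> handles, constructing one from pure Rust needs an initialized Python interpreter. The usage sketch below is not taken from this diff; the TokenType value 1 is a placeholder, since the real indices are supplied by TokenTypeSettings on the Python side.

    // Usage sketch for the relocated Token type; values are illustrative only.
    use sqlglotrs::token::Token;

    fn main() {
        pyo3::prepare_freethreaded_python(); // Token holds Python objects internally
        let token = Token::new(1, "SELECT".to_string(), 1, 6, 0, 5, vec![]);
        let mut comments = vec!["-- leading comment".to_string()];
        token.append_comments(&mut comments);
        assert!(comments.is_empty()); // append_comments drains the Vec, mimicking Vec::append
    }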
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 8228b5a..2ffe45f 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -1,5 +1,6 @@
+use crate::settings::TokenType;
use crate::trie::{Trie, TrieResult};
-use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use pyo3::exceptions::PyException;
use pyo3::prelude::*;
use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
self.advance(1)?;
// Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
- if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+ if self.settings.nested_comments
+ && !self.is_end
+ && self.chars(comment_start_size) == *comment_start
+ {
self.advance(comment_start_size as isize)?;
comment_count += 1
}
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
if comment_start == self.settings.hint_start
&& self.tokens.last().is_some()
- && self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
+ && self
+ .settings
+ .tokens_preceding_hint
+ .contains(&self.tokens.last().unwrap().token_type)
+ {
self.add(self.token_types.hint, None)?;
}
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
self.advance(-(tag.len() as isize))?;
self.add(self.token_types.heredoc_string_alternative, None)?;
- return Ok(true)
+ return Ok(true);
}
(None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
- let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
+ let text =
+ self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
- if u64::from_str_radix(&text, b).is_err() {
+ if u128::from_str_radix(&text, b).is_err() {
return self.error_result(format!(
"Numeric string contains invalid characters from {}:{}",
self.line, self.start
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
)
.map(|x| *x);
+ let replaced = literal.replace("_", "");
+
if let Some(unwrapped_token_type) = token_type {
self.add(self.token_types.number, Some(number_text))?;
self.add(self.token_types.dcolon, Some("::".to_string()))?;
self.add(unwrapped_token_type, Some(literal))?;
+ } else if self.dialect_settings.numbers_can_be_underscore_separated
+ && self.is_numeric(&replaced)
+ {
+ self.add(self.token_types.number, Some(number_text + &replaced))?;
} else if self.dialect_settings.identifiers_can_start_with_digit {
self.add(self.token_types.var, None)?;
} else {
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
if self.is_end {
if !raise_unmatched {
text.push(self.current_char);
- return Ok(text)
+ return Ok(text);
}
return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
}
fn is_identifier(&mut self, s: &str) -> bool {
- s.chars().enumerate().all(
- |(i, c)|
- if i == 0 { self.is_alphabetic_or_underscore(c) }
- else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
- )
+ s.chars().enumerate().all(|(i, c)| {
+ if i == 0 {
+ self.is_alphabetic_or_underscore(c)
+ } else {
+ self.is_alphabetic_or_underscore(c) || c.is_digit(10)
+ }
+ })
+ }
+
+ fn is_numeric(&mut self, s: &str) -> bool {
+ s.chars().all(|c| c.is_digit(10))
}
fn extract_value(&mut self) -> Result<String, TokenizerError> {
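Note on the numbers_can_be_underscore_separated branch added above: when the dialect flag is set, the literal scanned after a run of digits is accepted as part of the number if stripping its underscores leaves only digits, which is what lets inputs like 1_000 tokenize as a single number. The standalone sketch below mirrors that check; the function names are illustrative, not the crate's API, and the real code also prepends the already-scanned leading digits.

    // Illustrative re-statement of the underscore-separated number check.
    fn is_all_digits(s: &str) -> bool {
        // Mirrors TokenizerState::is_numeric above: every char must be a decimal digit.
        s.chars().all(|c| c.is_ascii_digit())
    }

    fn accept_underscore_separated(literal: &str) -> Option<String> {
        let stripped = literal.replace('_', "");
        if is_all_digits(&stripped) {
            Some(stripped) // e.g. "1_000" -> "1000"
        } else {
            None // e.g. "1_abc" falls through to the var / error branches
        }
    }

    fn main() {
        assert_eq!(accept_underscore_separated("1_000_000").as_deref(), Some("1000000"));
        assert_eq!(accept_underscore_separated("1_abc"), None);
    }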