author Daniel Baumann <daniel.baumann@progress-linux.org> 2023-06-16 09:41:18 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2023-06-16 09:41:18 +0000
commit 67578a7602a5be7eb51f324086c8d49bcf8b7498 (patch)
tree 0b7515c922d1c383cea24af5175379cfc8edfd15 /sqlglot/tokens.py
parent Releasing debian version 15.2.0-1. (diff)
Merging upstream version 16.2.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r-- sqlglot/tokens.py | 40
1 file changed, 23 insertions(+), 17 deletions(-)
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index a30ec24..42628b9 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -144,6 +144,7 @@ class TokenType(AutoName):
VARIANT = auto()
OBJECT = auto()
INET = auto()
+ ENUM = auto()
# keywords
ALIAS = auto()
@@ -346,6 +347,7 @@ class Token:
col: The column that the token ends on.
start: The start index of the token.
end: The ending index of the token.
+ comments: The comments to attach to the token.
"""
self.token_type = token_type
self.text = text
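The new comments parameter slots in alongside the other constructor metadata. A minimal sketch of building a Token with attached comments (the defaults for line/col/start/end are an assumption, not shown in this hunk):

from sqlglot.tokens import Token, TokenType

# comments ride along on the token instead of being discarded by the tokenizer
tok = Token(TokenType.SELECT, "SELECT", comments=["leading comment"])
print(tok.token_type, tok.text, tok.comments)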
@@ -391,12 +393,15 @@ class _Tokenizer(type):
klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
- klass._COMMENTS = dict(
- (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
- for comment in klass.COMMENTS
- )
+ klass._COMMENTS = {
+ **dict(
+ (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
+ for comment in klass.COMMENTS
+ ),
+ "{#": "#}", # Ensure Jinja comments are tokenized correctly in all dialects
+ }
- klass.KEYWORD_TRIE = new_trie(
+ klass._KEYWORD_TRIE = new_trie(
key.upper()
for key in (
*klass.KEYWORDS,
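Because the metaclass now injects the "{#"/"#}" pair unconditionally, Jinja-style comments are recognized even by dialects that override COMMENTS. A quick sketch of the observable behavior, using the Tokenizer's public tokenize method:

from sqlglot.tokens import Tokenizer

# "{# ... #}" was removed from Tokenizer.COMMENTS below, but _COMMENTS still
# contains it, so the tokenizer treats it as a comment in every dialect.
tokens = Tokenizer().tokenize("SELECT 1 {# templated #}")
print([(tok.token_type, tok.text) for tok in tokens])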
@@ -456,20 +461,22 @@ class Tokenizer(metaclass=_Tokenizer):
STRING_ESCAPES = ["'"]
VAR_SINGLE_TOKENS: t.Set[str] = set()
+ # Autofilled
+ IDENTIFIERS_CAN_START_WITH_DIGIT: bool = False
+
_COMMENTS: t.Dict[str, str] = {}
_FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
_IDENTIFIERS: t.Dict[str, str] = {}
_IDENTIFIER_ESCAPES: t.Set[str] = set()
_QUOTES: t.Dict[str, str] = {}
_STRING_ESCAPES: t.Set[str] = set()
+ _KEYWORD_TRIE: t.Dict = {}
- KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
+ KEYWORDS: t.Dict[str, TokenType] = {
**{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
**{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
- "{{+": TokenType.BLOCK_START,
- "{{-": TokenType.BLOCK_START,
- "+}}": TokenType.BLOCK_END,
- "-}}": TokenType.BLOCK_END,
+ **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
+ **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
"/*+": TokenType.HINT,
"==": TokenType.EQ,
"::": TokenType.DCOLON,
@@ -594,6 +601,7 @@ class Tokenizer(metaclass=_Tokenizer):
"RECURSIVE": TokenType.RECURSIVE,
"REGEXP": TokenType.RLIKE,
"REPLACE": TokenType.REPLACE,
+ "RETURNING": TokenType.RETURNING,
"REFERENCES": TokenType.REFERENCES,
"RIGHT": TokenType.RIGHT,
"RLIKE": TokenType.RLIKE,
@@ -732,8 +740,7 @@ class Tokenizer(metaclass=_Tokenizer):
NUMERIC_LITERALS: t.Dict[str, str] = {}
ENCODE: t.Optional[str] = None
- COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
- KEYWORD_TRIE: t.Dict = {} # autofilled
+ COMMENTS = ["--", ("/*", "*/")]
__slots__ = (
"sql",
@@ -748,7 +755,6 @@ class Tokenizer(metaclass=_Tokenizer):
"_end",
"_peek",
"_prev_token_line",
- "identifiers_can_start_with_digit",
)
def __init__(self) -> None:
@@ -894,7 +900,7 @@ class Tokenizer(metaclass=_Tokenizer):
char = chars
prev_space = False
skip = False
- trie = self.KEYWORD_TRIE
+ trie = self._KEYWORD_TRIE
single_token = char in self.SINGLE_TOKENS
while chars:
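The renamed _KEYWORD_TRIE is built by new_trie from sqlglot.trie, which marks complete keys with a 0 sentinel; that is the structure this scanning loop walks character by character:

from sqlglot.trie import new_trie

# 0 marks a complete keyword, so "OR" can terminate while "ORDER"
# continues through the same branch.
trie = new_trie(["OR", "ORDER"])
print(trie)  # {'O': {'R': {0: True, 'D': {'E': {'R': {0: True}}}}}}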
@@ -994,7 +1000,7 @@ class Tokenizer(metaclass=_Tokenizer):
self._advance()
elif self._peek == "." and not decimal:
after = self.peek(1)
- if after.isdigit() or not after.strip():
+ if after.isdigit() or not after.isalpha():
decimal = True
self._advance()
else:
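The widened check changes which characters after a dot still count as part of a decimal literal. A standalone illustration of the predicate (not sqlglot's API, just the expression from this hunk):

def dot_continues_number(after: str) -> bool:
    # old: after.isdigit() or not after.strip()   -> digit, whitespace, or EOF only
    # new: after.isdigit() or not after.isalpha() -> any non-letter now qualifies
    return after.isdigit() or not after.isalpha()

print(dot_continues_number("5"))  # True: "1.5" is a decimal
print(dot_continues_number(")"))  # True now, False before: "(1.)" folds the dot in
print(dot_continues_number("x"))  # False: in "1.x" the dot stays separate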
@@ -1013,13 +1019,13 @@ class Tokenizer(metaclass=_Tokenizer):
literal += self._peek.upper()
self._advance()
- token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
+ token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal, ""))
if token_type:
self._add(TokenType.NUMBER, number_text)
self._add(TokenType.DCOLON, "::")
return self._add(token_type, literal)
- elif self.identifiers_can_start_with_digit: # type: ignore
+ elif self.IDENTIFIERS_CAN_START_WITH_DIGIT:
return self._add(TokenType.VAR)
self._add(TokenType.NUMBER, number_text)
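The empty-string default keeps the lookup chain total: NUMERIC_LITERALS.get(literal) could return None, which only type-checked while KEYWORDS accepted Optional[str] keys. A self-contained sketch of the chain (the mapping contents here are illustrative, not a real dialect's):

from sqlglot.tokens import TokenType

NUMERIC_LITERALS = {"L": "BIGINT"}       # e.g. a dialect mapping suffix -> keyword
KEYWORDS = {"BIGINT": TokenType.BIGINT}  # illustrative subset

def literal_token(literal: str):
    # .get(literal, "") never yields None, matching the narrowed Dict[str, TokenType]
    return KEYWORDS.get(NUMERIC_LITERALS.get(literal, ""))

print(literal_token("L"))   # TokenType.BIGINT
print(literal_token("XX"))  # None -> caller falls back to a plain NUMBER token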