From 67578a7602a5be7eb51f324086c8d49bcf8b7498 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 16 Jun 2023 11:41:18 +0200
Subject: Merging upstream version 16.2.1.

Signed-off-by: Daniel Baumann
---
 sqlglot/tokens.py | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index a30ec24..42628b9 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -144,6 +144,7 @@ class TokenType(AutoName):
     VARIANT = auto()
     OBJECT = auto()
     INET = auto()
+    ENUM = auto()
 
     # keywords
     ALIAS = auto()
@@ -346,6 +347,7 @@ class Token:
             col: The column that the token ends on.
             start: The start index of the token.
             end: The ending index of the token.
+            comments: The comments to attach to the token.
         """
         self.token_type = token_type
         self.text = text
@@ -391,12 +393,15 @@ class _Tokenizer(type):
         klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
         klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
 
-        klass._COMMENTS = dict(
-            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
-            for comment in klass.COMMENTS
-        )
+        klass._COMMENTS = {
+            **dict(
+                (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
+                for comment in klass.COMMENTS
+            ),
+            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
+        }
 
-        klass.KEYWORD_TRIE = new_trie(
+        klass._KEYWORD_TRIE = new_trie(
             key.upper()
             for key in (
                 *klass.KEYWORDS,
@@ -456,20 +461,22 @@ class Tokenizer(metaclass=_Tokenizer):
     STRING_ESCAPES = ["'"]
     VAR_SINGLE_TOKENS: t.Set[str] = set()
 
+    # Autofilled
+    IDENTIFIERS_CAN_START_WITH_DIGIT: bool = False
+
     _COMMENTS: t.Dict[str, str] = {}
     _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
     _IDENTIFIERS: t.Dict[str, str] = {}
     _IDENTIFIER_ESCAPES: t.Set[str] = set()
     _QUOTES: t.Dict[str, str] = {}
     _STRING_ESCAPES: t.Set[str] = set()
+    _KEYWORD_TRIE: t.Dict = {}
 
-    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
+    KEYWORDS: t.Dict[str, TokenType] = {
         **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
         **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
-        "{{+": TokenType.BLOCK_START,
-        "{{-": TokenType.BLOCK_START,
-        "+}}": TokenType.BLOCK_END,
-        "-}}": TokenType.BLOCK_END,
+        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
+        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
         "/*+": TokenType.HINT,
         "==": TokenType.EQ,
         "::": TokenType.DCOLON,
@@ -594,6 +601,7 @@ class Tokenizer(metaclass=_Tokenizer):
         "RECURSIVE": TokenType.RECURSIVE,
         "REGEXP": TokenType.RLIKE,
         "REPLACE": TokenType.REPLACE,
+        "RETURNING": TokenType.RETURNING,
         "REFERENCES": TokenType.REFERENCES,
         "RIGHT": TokenType.RIGHT,
         "RLIKE": TokenType.RLIKE,
@@ -732,8 +740,7 @@ class Tokenizer(metaclass=_Tokenizer):
     NUMERIC_LITERALS: t.Dict[str, str] = {}
     ENCODE: t.Optional[str] = None
 
-    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
-    KEYWORD_TRIE: t.Dict = {}  # autofilled
+    COMMENTS = ["--", ("/*", "*/")]
 
     __slots__ = (
         "sql",
@@ -748,7 +755,6 @@ class Tokenizer(metaclass=_Tokenizer):
         "_end",
         "_peek",
        "_prev_token_line",
-        "identifiers_can_start_with_digit",
     )
 
     def __init__(self) -> None:
@@ -894,7 +900,7 @@ class Tokenizer(metaclass=_Tokenizer):
         char = chars
         prev_space = False
         skip = False
-        trie = self.KEYWORD_TRIE
+        trie = self._KEYWORD_TRIE
         single_token = char in self.SINGLE_TOKENS
 
         while chars:
@@ -994,7 +1000,7 @@ class Tokenizer(metaclass=_Tokenizer):
                 self._advance()
             elif self._peek == "." and not decimal:
                 after = self.peek(1)
-                if after.isdigit() or not after.strip():
+                if after.isdigit() or not after.isalpha():
                     decimal = True
                     self._advance()
                 else:
@@ -1013,13 +1019,13 @@ class Tokenizer(metaclass=_Tokenizer):
                 literal += self._peek.upper()
                 self._advance()
 
-            token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
+            token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal, ""))
 
             if token_type:
                 self._add(TokenType.NUMBER, number_text)
                 self._add(TokenType.DCOLON, "::")
                 return self._add(token_type, literal)
-            elif self.identifiers_can_start_with_digit:  # type: ignore
+            elif self.IDENTIFIERS_CAN_START_WITH_DIGIT:
                 return self._add(TokenType.VAR)
 
             self._add(TokenType.NUMBER, number_text)
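The _COMMENTS change above registers the Jinja comment pair in every dialect, not only in dialects that list ("{#", "#}") in COMMENTS. A minimal sketch of the effect, assuming sqlglot 16.2.1 is installed; the query text and printed values are illustrative, and sqlglot attaches same-line comments to the preceding token rather than emitting them as tokens:

import sqlglot

# "{#" / "#}" is now a recognized comment pair in all dialects, so the
# Jinja comment no longer derails tokenization of an otherwise plain query.
tokens = sqlglot.tokenize("SELECT 1 {# templating note #}")

print([token.token_type for token in tokens])  # e.g. [TokenType.SELECT, TokenType.NUMBER]
print(tokens[-1].comments)                     # the comment text, attached to the last token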
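The rewritten BLOCK_START/BLOCK_END entries in KEYWORDS are generated by f-string comprehensions instead of being spelled out. A standalone check, in plain Python with no sqlglot required, that the comprehensions expand to exactly the four literal entries they replace:

# "{{{{" renders as "{{" and "}}}}" as "}}" inside an f-string, so each
# comprehension contributes the two whitespace-control variants of the
# Jinja expression delimiters.
generated = {
    **{f"{{{{{postfix}": "BLOCK_START" for postfix in ("+", "-")},
    **{f"{prefix}}}}}": "BLOCK_END" for prefix in ("+", "-")},
}

assert generated == {
    "{{+": "BLOCK_START",
    "{{-": "BLOCK_START",
    "+}}": "BLOCK_END",
    "-}}": "BLOCK_END",
}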
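The per-instance identifiers_can_start_with_digit flag becomes the class-level IDENTIFIERS_CAN_START_WITH_DIGIT constant consulted by _scan_number, and the NUMERIC_LITERALS lookup now falls back to "" so KEYWORDS.get is never passed None. A hedged example of the dialect-visible behavior; Hive is the upstream dialect that sets the flag, and the expected token types are an assumption about this version's output:

import sqlglot

# In Hive, an identifier may begin with a digit, so "2a" should scan as a
# single VAR token; most other dialects split it into a NUMBER and a VAR.
print([token.token_type for token in sqlglot.tokenize("SELECT 2a", read="hive")])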