From 2228e192dc1a582aa2ae004f20c692f6c7aeb853 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 23 Jan 2023 09:43:00 +0100
Subject: Merging upstream version 10.5.6.

Signed-off-by: Daniel Baumann
---
 sqlglot/tokens.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

(limited to 'sqlglot/tokens.py')

diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index 8e312a7..f12528f 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -82,6 +82,8 @@ class TokenType(AutoName):
     VARCHAR = auto()
     NVARCHAR = auto()
     TEXT = auto()
+    MEDIUMTEXT = auto()
+    LONGTEXT = auto()
     BINARY = auto()
     VARBINARY = auto()
     JSON = auto()
@@ -434,6 +436,8 @@ class Tokenizer(metaclass=_Tokenizer):
 
     ESCAPES = ["'"]
 
+    _ESCAPES: t.Set[str] = set()
+
     KEYWORDS = {
         **{
             f"{key}{postfix}": TokenType.BLOCK_START
@@ -461,6 +465,7 @@ class Tokenizer(metaclass=_Tokenizer):
         "#>>": TokenType.DHASH_ARROW,
         "<->": TokenType.LR_ARROW,
         "ALL": TokenType.ALL,
+        "ALWAYS": TokenType.ALWAYS,
         "AND": TokenType.AND,
         "ANTI": TokenType.ANTI,
         "ANY": TokenType.ANY,
@@ -472,6 +477,7 @@ class Tokenizer(metaclass=_Tokenizer):
         "BETWEEN": TokenType.BETWEEN,
         "BOTH": TokenType.BOTH,
         "BUCKET": TokenType.BUCKET,
+        "BY DEFAULT": TokenType.BY_DEFAULT,
         "CACHE": TokenType.CACHE,
         "UNCACHE": TokenType.UNCACHE,
         "CASE": TokenType.CASE,
@@ -521,9 +527,11 @@ class Tokenizer(metaclass=_Tokenizer):
         "FOREIGN KEY": TokenType.FOREIGN_KEY,
         "FORMAT": TokenType.FORMAT,
         "FROM": TokenType.FROM,
+        "GENERATED": TokenType.GENERATED,
         "GROUP BY": TokenType.GROUP_BY,
         "GROUPING SETS": TokenType.GROUPING_SETS,
         "HAVING": TokenType.HAVING,
+        "IDENTITY": TokenType.IDENTITY,
         "IF": TokenType.IF,
         "ILIKE": TokenType.ILIKE,
         "IMMUTABLE": TokenType.IMMUTABLE,
@@ -746,7 +754,7 @@ class Tokenizer(metaclass=_Tokenizer):
     )
 
     def __init__(self) -> None:
-        self._replace_backslash = "\\" in self._ESCAPES  # type: ignore
+        self._replace_backslash = "\\" in self._ESCAPES
         self.reset()
 
     def reset(self) -> None:
@@ -771,7 +779,10 @@ class Tokenizer(metaclass=_Tokenizer):
         self.reset()
         self.sql = sql
         self.size = len(sql)
+        self._scan()
+        return self.tokens
 
+    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
         while self.size and not self._end:
             self._start = self._current
             self._advance()
@@ -792,7 +803,9 @@ class Tokenizer(metaclass=_Tokenizer):
                 self._scan_identifier(identifier_end)
             else:
                 self._scan_keywords()
-        return self.tokens
+
+            if until and until():
+                break
 
     def _chars(self, size: int) -> str:
         if size == 1:
@@ -832,11 +845,13 @@ class Tokenizer(metaclass=_Tokenizer):
         if token_type in self.COMMANDS and (
             len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
         ):
-            self._start = self._current
-            while not self._end and self._peek != ";":
-                self._advance()
-            if self._start < self._current:
-                self._add(TokenType.STRING)
+            start = self._current
+            tokens = len(self.tokens)
+            self._scan(lambda: self._peek == ";")
+            self.tokens = self.tokens[:tokens]
+            text = self.sql[start : self._current].strip()
+            if text:
+                self._add(TokenType.STRING, text)
 
     def _scan_keywords(self) -> None:
         size = 0
@@ -947,7 +962,8 @@ class Tokenizer(metaclass=_Tokenizer):
             elif self._peek.isidentifier():  # type: ignore
                 number_text = self._text
                 literal = []
-                while self._peek.isidentifier():  # type: ignore
+
+                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:  # type: ignore
                     literal.append(self._peek.upper())  # type: ignore
                     self._advance()
 
@@ -1063,8 +1079,12 @@ class Tokenizer(metaclass=_Tokenizer):
         delim_size = len(delimiter)
 
         while True:
-            if self._char in self._ESCAPES and self._peek == delimiter:  # type: ignore
-                text += delimiter
+            if (
+                self._char in self._ESCAPES
+                and self._peek
+                and (self._peek == delimiter or self._peek in self._ESCAPES)
+            ):
+                text += self._peek
                 self._advance(2)
             else:
                 if self._chars(delim_size) == delimiter:
--
cgit v1.2.3
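
For illustration only (not part of the patch): a minimal sketch of the escape branch in _extract_string that the last hunk rewrites, assuming the default Tokenizer (ESCAPES = ["'"]) and the tokenize() method shown in the hunks above. The rewritten condition additionally accepts an escape character followed by another escape and guards against a missing peek character at end of input; a plain doubled quote continues to collapse into a single STRING token.

# Hypothetical usage sketch; assumes Token exposes token_type/text as used in the patch.
from sqlglot.tokens import Tokenizer, TokenType

tokens = Tokenizer().tokenize("SELECT 'it''s'")

for token in tokens:
    if token.token_type == TokenType.STRING:
        # The escaped quote is folded into one literal: expected text is it's
        print(token.text)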