From 90150543f9314be683d22a16339effd774192f6d Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Thu, 22 Sep 2022 06:31:28 +0200
Subject: Merging upstream version 6.1.1.

Signed-off-by: Daniel Baumann
---
 sqlglot/tokens.py | 184 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 118 insertions(+), 66 deletions(-)

(limited to 'sqlglot/tokens.py')

diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index e4b754d..bd95bc7 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -38,6 +38,7 @@ class TokenType(AutoName):
     DARROW = auto()
     HASH_ARROW = auto()
     DHASH_ARROW = auto()
+    LR_ARROW = auto()
     ANNOTATION = auto()
     DOLLAR = auto()
 
@@ -53,6 +54,7 @@ class TokenType(AutoName):
     TABLE = auto()
     VAR = auto()
     BIT_STRING = auto()
+    HEX_STRING = auto()
 
     # types
     BOOLEAN = auto()
@@ -78,10 +80,17 @@ class TokenType(AutoName):
     UUID = auto()
     GEOGRAPHY = auto()
     NULLABLE = auto()
+    GEOMETRY = auto()
+    HLLSKETCH = auto()
+    SUPER = auto()
+    SERIAL = auto()
+    SMALLSERIAL = auto()
+    BIGSERIAL = auto()
 
     # keywords
     ADD_FILE = auto()
     ALIAS = auto()
+    ALWAYS = auto()
     ALL = auto()
     ALTER = auto()
     ANALYZE = auto()
@@ -92,11 +101,12 @@ class TokenType(AutoName):
     AUTO_INCREMENT = auto()
     BEGIN = auto()
     BETWEEN = auto()
+    BOTH = auto()
     BUCKET = auto()
+    BY_DEFAULT = auto()
     CACHE = auto()
     CALL = auto()
     CASE = auto()
-    CAST = auto()
     CHARACTER_SET = auto()
     CHECK = auto()
     CLUSTER_BY = auto()
@@ -104,7 +114,6 @@ class TokenType(AutoName):
     COMMENT = auto()
     COMMIT = auto()
     CONSTRAINT = auto()
-    CONVERT = auto()
     CREATE = auto()
     CROSS = auto()
     CUBE = auto()
@@ -127,22 +136,24 @@ class TokenType(AutoName):
     EXCEPT = auto()
     EXISTS = auto()
     EXPLAIN = auto()
-    EXTRACT = auto()
     FALSE = auto()
     FETCH = auto()
     FILTER = auto()
     FINAL = auto()
     FIRST = auto()
     FOLLOWING = auto()
+    FOR = auto()
     FOREIGN_KEY = auto()
     FORMAT = auto()
     FULL = auto()
     FUNCTION = auto()
     FROM = auto()
+    GENERATED = auto()
     GROUP_BY = auto()
     GROUPING_SETS = auto()
     HAVING = auto()
     HINT = auto()
+    IDENTITY = auto()
     IF = auto()
     IGNORE_NULLS = auto()
     ILIKE = auto()
@@ -159,12 +170,14 @@ class TokenType(AutoName):
     JOIN = auto()
     LATERAL = auto()
     LAZY = auto()
+    LEADING = auto()
     LEFT = auto()
     LIKE = auto()
     LIMIT = auto()
     LOCATION = auto()
     MAP = auto()
     MOD = auto()
+    NATURAL = auto()
     NEXT = auto()
     NO_ACTION = auto()
     NULL = auto()
@@ -204,8 +217,10 @@ class TokenType(AutoName):
     ROWS = auto()
     SCHEMA_COMMENT = auto()
     SELECT = auto()
+    SEPARATOR = auto()
     SET = auto()
     SHOW = auto()
+    SIMILAR_TO = auto()
     SOME = auto()
     SORT_BY = auto()
     STORED = auto()
@@ -213,12 +228,11 @@ class TokenType(AutoName):
     TABLE_FORMAT = auto()
     TABLE_SAMPLE = auto()
     TEMPORARY = auto()
-    TIME = auto()
     TOP = auto()
     THEN = auto()
     TRUE = auto()
+    TRAILING = auto()
     TRUNCATE = auto()
-    TRY_CAST = auto()
     UNBOUNDED = auto()
     UNCACHE = auto()
     UNION = auto()
@@ -272,35 +286,32 @@ class _Tokenizer(type):
     def __new__(cls, clsname, bases, attrs):
         klass = super().__new__(cls, clsname, bases, attrs)
 
-        klass.QUOTES = dict(
-            (quote, quote) if isinstance(quote, str) else (quote[0], quote[1])
-            for quote in klass.QUOTES
-        )
-
-        klass.IDENTIFIERS = dict(
-            (identifier, identifier)
-            if isinstance(identifier, str)
-            else (identifier[0], identifier[1])
-            for identifier in klass.IDENTIFIERS
-        )
-
-        klass.COMMENTS = dict(
-            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
-            for comment in klass.COMMENTS
+        klass._QUOTES = cls._delimeter_list_to_dict(klass.QUOTES)
+        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
+        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
+        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
+        klass._COMMENTS = dict(
+            (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) for comment in klass.COMMENTS
         )
 
         klass.KEYWORD_TRIE = new_trie(
             key.upper()
             for key, value in {
                 **klass.KEYWORDS,
-                **{comment: TokenType.COMMENT for comment in klass.COMMENTS},
-                **{quote: TokenType.QUOTE for quote in klass.QUOTES},
+                **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
+                **{quote: TokenType.QUOTE for quote in klass._QUOTES},
+                **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
+                **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
             }.items()
             if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )
 
         return klass
 
+    @staticmethod
+    def _delimeter_list_to_dict(list):
+        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
+
 
 class Tokenizer(metaclass=_Tokenizer):
     SINGLE_TOKENS = {
@@ -339,6 +350,10 @@ class Tokenizer(metaclass=_Tokenizer):
 
     QUOTES = ["'"]
 
+    BIT_STRINGS = []
+
+    HEX_STRINGS = []
+
     IDENTIFIERS = ['"']
 
     ESCAPE = "'"
@@ -357,6 +372,7 @@ class Tokenizer(metaclass=_Tokenizer):
         "->>": TokenType.DARROW,
         "#>": TokenType.HASH_ARROW,
         "#>>": TokenType.DHASH_ARROW,
+        "<->": TokenType.LR_ARROW,
         "ADD ARCHIVE": TokenType.ADD_FILE,
         "ADD ARCHIVES": TokenType.ADD_FILE,
         "ADD FILE": TokenType.ADD_FILE,
@@ -374,12 +390,12 @@ class Tokenizer(metaclass=_Tokenizer):
         "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
         "BEGIN": TokenType.BEGIN,
         "BETWEEN": TokenType.BETWEEN,
+        "BOTH": TokenType.BOTH,
         "BUCKET": TokenType.BUCKET,
         "CALL": TokenType.CALL,
         "CACHE": TokenType.CACHE,
         "UNCACHE": TokenType.UNCACHE,
         "CASE": TokenType.CASE,
-        "CAST": TokenType.CAST,
         "CHARACTER SET": TokenType.CHARACTER_SET,
         "CHECK": TokenType.CHECK,
         "CLUSTER BY": TokenType.CLUSTER_BY,
@@ -387,7 +403,6 @@ class Tokenizer(metaclass=_Tokenizer):
         "COMMENT": TokenType.SCHEMA_COMMENT,
         "COMMIT": TokenType.COMMIT,
         "CONSTRAINT": TokenType.CONSTRAINT,
-        "CONVERT": TokenType.CONVERT,
         "CREATE": TokenType.CREATE,
         "CROSS": TokenType.CROSS,
         "CUBE": TokenType.CUBE,
@@ -408,7 +423,6 @@ class Tokenizer(metaclass=_Tokenizer):
         "EXCEPT": TokenType.EXCEPT,
         "EXISTS": TokenType.EXISTS,
         "EXPLAIN": TokenType.EXPLAIN,
-        "EXTRACT": TokenType.EXTRACT,
         "FALSE": TokenType.FALSE,
         "FETCH": TokenType.FETCH,
         "FILTER": TokenType.FILTER,
@@ -437,10 +451,12 @@ class Tokenizer(metaclass=_Tokenizer):
         "JOIN": TokenType.JOIN,
         "LATERAL": TokenType.LATERAL,
         "LAZY": TokenType.LAZY,
+        "LEADING": TokenType.LEADING,
         "LEFT": TokenType.LEFT,
         "LIKE": TokenType.LIKE,
         "LIMIT": TokenType.LIMIT,
         "LOCATION": TokenType.LOCATION,
+        "NATURAL": TokenType.NATURAL,
         "NEXT": TokenType.NEXT,
         "NO ACTION": TokenType.NO_ACTION,
         "NOT": TokenType.NOT,
@@ -490,8 +506,8 @@ class Tokenizer(metaclass=_Tokenizer):
         "TEMPORARY": TokenType.TEMPORARY,
         "THEN": TokenType.THEN,
         "TRUE": TokenType.TRUE,
+        "TRAILING": TokenType.TRAILING,
         "TRUNCATE": TokenType.TRUNCATE,
-        "TRY_CAST": TokenType.TRY_CAST,
         "UNBOUNDED": TokenType.UNBOUNDED,
         "UNION": TokenType.UNION,
         "UNNEST": TokenType.UNNEST,
@@ -626,14 +642,12 @@ class Tokenizer(metaclass=_Tokenizer):
                 break
 
             white_space = self.WHITE_SPACE.get(self._char)
-            identifier_end = self.IDENTIFIERS.get(self._char)
+            identifier_end = self._IDENTIFIERS.get(self._char)
 
             if white_space:
                 if white_space == TokenType.BREAK:
                     self._col = 1
                     self._line += 1
-            elif self._char == "0" and self._peek == "x":
-                self._scan_hex()
             elif self._char.isdigit():
                 self._scan_number()
             elif identifier_end:
@@ -666,9 +680,7 @@ class Tokenizer(metaclass=_Tokenizer):
         text = self._text if text is None else text
         self.tokens.append(Token(token_type, text, self._line, self._col))
 
-        if token_type in self.COMMANDS and (
-            len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
-        ):
+        if token_type in self.COMMANDS and (len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON):
             self._start = self._current
             while not self._end and self._peek != ";":
                 self._advance()
@@ -725,6 +737,8 @@ class Tokenizer(metaclass=_Tokenizer):
 
         if self._scan_string(word):
             return
+        if self._scan_numeric_string(word):
+            return
         if self._scan_comment(word):
             return
 
@@ -732,10 +746,10 @@ class Tokenizer(metaclass=_Tokenizer):
         self._add(self.KEYWORDS[word.upper()])
 
     def _scan_comment(self, comment_start):
-        if comment_start not in self.COMMENTS:
+        if comment_start not in self._COMMENTS:
             return False
 
-        comment_end = self.COMMENTS[comment_start]
+        comment_end = self._COMMENTS[comment_start]
 
         if comment_end:
             comment_end_size = len(comment_end)
@@ -749,15 +763,18 @@ class Tokenizer(metaclass=_Tokenizer):
         return True
 
     def _scan_annotation(self):
-        while (
-            not self._end
-            and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK
-            and self._peek != ","
-        ):
+        while not self._end and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK and self._peek != ",":
             self._advance()
         self._add(TokenType.ANNOTATION, self._text[1:])
 
     def _scan_number(self):
+        if self._char == "0":
+            peek = self._peek.upper()
+            if peek == "B":
+                return self._scan_bits()
+            elif peek == "X":
+                return self._scan_hex()
+
         decimal = False
         scientific = 0
 
@@ -788,57 +805,71 @@ class Tokenizer(metaclass=_Tokenizer):
         else:
             return self._add(TokenType.NUMBER)
 
+    def _scan_bits(self):
+        self._advance()
+        value = self._extract_value()
+        try:
+            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
+
     def _scan_hex(self):
         self._advance()
+        value = self._extract_value()
+        try:
+            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
 
+    def _extract_value(self):
         while True:
             char = self._peek.strip()
             if char and char not in self.SINGLE_TOKENS:
                 self._advance()
             else:
                 break
 
-        try:
-            self._add(TokenType.BIT_STRING, f"{int(self._text, 16):b}")
-        except ValueError:
-            self._add(TokenType.IDENTIFIER)
+
+        return self._text
 
     def _scan_string(self, quote):
-        quote_end = self.QUOTES.get(quote)
+        quote_end = self._QUOTES.get(quote)
         if quote_end is None:
             return False
 
-        text = ""
         self._advance(len(quote))
-        quote_end_size = len(quote_end)
-
-        while True:
-            if self._char == self.ESCAPE and self._peek == quote_end:
-                text += quote
-                self._advance(2)
-            else:
-                if self._chars(quote_end_size) == quote_end:
-                    if quote_end_size > 1:
-                        self._advance(quote_end_size - 1)
-                    break
-
-                if self._end:
-                    raise RuntimeError(
-                        f"Missing {quote} from {self._line}:{self._start}"
-                    )
-                text += self._char
-                self._advance()
+        text = self._extract_string(quote_end)
 
         text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
         text = text.replace("\\\\", "\\") if self.ESCAPE == "\\" else text
         self._add(TokenType.STRING, text)
         return True
 
+    def _scan_numeric_string(self, string_start):
+        if string_start in self._HEX_STRINGS:
+            delimiters = self._HEX_STRINGS
+            token_type = TokenType.HEX_STRING
+            base = 16
+        elif string_start in self._BIT_STRINGS:
+            delimiters = self._BIT_STRINGS
+            token_type = TokenType.BIT_STRING
+            base = 2
+        else:
+            return False
+
+        self._advance(len(string_start))
+        string_end = delimiters.get(string_start)
+        text = self._extract_string(string_end)
+
+        try:
+            self._add(token_type, f"{int(text, base)}")
+        except ValueError:
+            raise RuntimeError(f"Numeric string contains invalid characters from {self._line}:{self._start}")
+        return True
+
     def _scan_identifier(self, identifier_end):
         while self._peek != identifier_end:
             if self._end:
-                raise RuntimeError(
-                    f"Missing {identifier_end} from {self._line}:{self._start}"
-                )
+                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
             self._advance()
         self._advance()
         self._add(TokenType.IDENTIFIER, self._text[1:-1])
@@ -851,3 +882,24 @@ class Tokenizer(metaclass=_Tokenizer):
             else:
                 break
         self._add(self.KEYWORDS.get(self._text.upper(), TokenType.VAR))
+
+    def _extract_string(self, delimiter):
+        text = ""
+        delim_size = len(delimiter)
+
+        while True:
+            if self._char == self.ESCAPE and self._peek == delimiter:
+                text += delimiter
+                self._advance(2)
+            else:
+                if self._chars(delim_size) == delimiter:
+                    if delim_size > 1:
+                        self._advance(delim_size - 1)
+                    break
+
+                if self._end:
+                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
+                text += self._char
+                self._advance()
+
+        return text
--
cgit v1.2.3
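
The BIT_STRINGS and HEX_STRINGS class attributes introduced by this version let a dialect tokenizer declare its own binary and hex literal delimiters: the _Tokenizer metaclass normalizes each list into a start/end dict, folds the start markers into KEYWORD_TRIE, and _scan_keywords then routes a matching start marker through _scan_numeric_string. A minimal sketch of the intended use, assuming the sqlglot 6.x API merged above (MyDialectTokenizer and its delimiter pairs are illustrative, not a dialect shipped with sqlglot):

from sqlglot.tokens import Tokenizer

class MyDialectTokenizer(Tokenizer):
    # Entries may be plain strings or (start, end) pairs; the metaclass
    # normalizes either form with _delimeter_list_to_dict.
    BIT_STRINGS = [("b'", "'"), ("B'", "'")]
    HEX_STRINGS = [("x'", "'"), ("X'", "'")]

# Hypothetical usage: _scan_numeric_string stores the decoded integer as the
# token text, so x'1f' is expected to yield HEX_STRING "31" and b'0101'
# BIT_STRING "5", alongside the ordinary SELECT and COMMA tokens.
for token in MyDialectTokenizer().tokenize("SELECT x'1f', b'0101'"):
    print(token.token_type, token.text)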