Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r--  sqlglot/tokens.py  246
1 file changed, 91 insertions, 155 deletions
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index 5e50b7c..ad329d2 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -51,7 +51,6 @@ class TokenType(AutoName):
     DOLLAR = auto()
     PARAMETER = auto()
     SESSION_PARAMETER = auto()
-    NATIONAL = auto()
     DAMP = auto()

     BLOCK_START = auto()
@@ -72,6 +71,8 @@ class TokenType(AutoName):
     BIT_STRING = auto()
     HEX_STRING = auto()
     BYTE_STRING = auto()
+    NATIONAL_STRING = auto()
+    RAW_STRING = auto()

     # types
     BIT = auto()
@@ -110,6 +111,7 @@ class TokenType(AutoName):
     TIMESTAMPTZ = auto()
     TIMESTAMPLTZ = auto()
     DATETIME = auto()
+    DATETIME64 = auto()
     DATE = auto()
     UUID = auto()
     GEOGRAPHY = auto()
@@ -142,30 +144,22 @@ class TokenType(AutoName):
     ARRAY = auto()
     ASC = auto()
     ASOF = auto()
-    AT_TIME_ZONE = auto()
     AUTO_INCREMENT = auto()
     BEGIN = auto()
     BETWEEN = auto()
-    BOTH = auto()
-    BUCKET = auto()
-    BY_DEFAULT = auto()
     CACHE = auto()
-    CASCADE = auto()
     CASE = auto()
     CHARACTER_SET = auto()
-    CLUSTER_BY = auto()
     COLLATE = auto()
     COMMAND = auto()
     COMMENT = auto()
     COMMIT = auto()
-    COMPOUND = auto()
     CONSTRAINT = auto()
     CREATE = auto()
     CROSS = auto()
     CUBE = auto()
     CURRENT_DATE = auto()
     CURRENT_DATETIME = auto()
-    CURRENT_ROW = auto()
     CURRENT_TIME = auto()
     CURRENT_TIMESTAMP = auto()
     CURRENT_USER = auto()
@@ -174,8 +168,6 @@ class TokenType(AutoName):
     DESC = auto()
     DESCRIBE = auto()
     DISTINCT = auto()
-    DISTINCT_FROM = auto()
-    DISTRIBUTE_BY = auto()
     DIV = auto()
     DROP = auto()
     ELSE = auto()
@@ -189,7 +181,6 @@ class TokenType(AutoName):
     FILTER = auto()
     FINAL = auto()
     FIRST = auto()
-    FOLLOWING = auto()
     FOR = auto()
     FOREIGN_KEY = auto()
     FORMAT = auto()
@@ -203,7 +194,6 @@ class TokenType(AutoName):
     HAVING = auto()
     HINT = auto()
     IF = auto()
-    IGNORE_NULLS = auto()
     ILIKE = auto()
     ILIKE_ANY = auto()
     IN = auto()
@@ -222,36 +212,27 @@ class TokenType(AutoName):
     KEEP = auto()
     LANGUAGE = auto()
     LATERAL = auto()
-    LAZY = auto()
-    LEADING = auto()
     LEFT = auto()
     LIKE = auto()
     LIKE_ANY = auto()
     LIMIT = auto()
-    LOAD_DATA = auto()
-    LOCAL = auto()
+    LOAD = auto()
+    LOCK = auto()
     MAP = auto()
     MATCH_RECOGNIZE = auto()
-    MATERIALIZED = auto()
     MERGE = auto()
     MOD = auto()
     NATURAL = auto()
     NEXT = auto()
     NEXT_VALUE_FOR = auto()
-    NO_ACTION = auto()
     NOTNULL = auto()
     NULL = auto()
-    NULLS_FIRST = auto()
-    NULLS_LAST = auto()
     OFFSET = auto()
     ON = auto()
-    ONLY = auto()
-    OPTIONS = auto()
     ORDER_BY = auto()
     ORDERED = auto()
     ORDINALITY = auto()
     OUTER = auto()
-    OUT_OF = auto()
     OVER = auto()
     OVERLAPS = auto()
     OVERWRITE = auto()
@@ -261,7 +242,6 @@ class TokenType(AutoName):
     PIVOT = auto()
     PLACEHOLDER = auto()
     PRAGMA = auto()
-    PRECEDING = auto()
     PRIMARY_KEY = auto()
     PROCEDURE = auto()
     PROPERTIES = auto()
@@ -271,7 +251,6 @@ class TokenType(AutoName):
     RANGE = auto()
     RECURSIVE = auto()
     REPLACE = auto()
-    RESPECT_NULLS = auto()
     RETURNING = auto()
     REFERENCES = auto()
     RIGHT = auto()
@@ -280,28 +259,23 @@ class TokenType(AutoName):
     ROLLUP = auto()
     ROW = auto()
     ROWS = auto()
-    SEED = auto()
     SELECT = auto()
     SEMI = auto()
     SEPARATOR = auto()
     SERDE_PROPERTIES = auto()
     SET = auto()
+    SETTINGS = auto()
     SHOW = auto()
     SIMILAR_TO = auto()
     SOME = auto()
-    SORTKEY = auto()
-    SORT_BY = auto()
     STRUCT = auto()
     TABLE_SAMPLE = auto()
     TEMPORARY = auto()
     TOP = auto()
     THEN = auto()
-    TRAILING = auto()
     TRUE = auto()
-    UNBOUNDED = auto()
     UNCACHE = auto()
     UNION = auto()
-    UNLOGGED = auto()
     UNNEST = auto()
     UNPIVOT = auto()
     UPDATE = auto()
@@ -314,15 +288,11 @@ class TokenType(AutoName):
     WHERE = auto()
     WINDOW = auto()
     WITH = auto()
-    WITH_TIME_ZONE = auto()
-    WITH_LOCAL_TIME_ZONE = auto()
-    WITHIN_GROUP = auto()
-    WITHOUT_TIME_ZONE = auto()
     UNIQUE = auto()


 class Token:
-    __slots__ = ("token_type", "text", "line", "col", "end", "comments")
+    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")

     @classmethod
     def number(cls, number: int) -> Token:
@@ -350,22 +320,28 @@ class Token:
         text: str,
         line: int = 1,
         col: int = 1,
+        start: int = 0,
         end: int = 0,
         comments: t.List[str] = [],
     ) -> None:
+        """Token initializer.
+
+        Args:
+            token_type: The TokenType Enum.
+            text: The text of the token.
+            line: The line that the token ends on.
+            col: The column that the token ends on.
+            start: The start index of the token.
+            end: The ending index of the token.
+        """
         self.token_type = token_type
         self.text = text
         self.line = line
-        size = len(text)
         self.col = col
-        self.end = end if end else size
+        self.start = start
+        self.end = end
         self.comments = comments

-    @property
-    def start(self) -> int:
-        """Returns the start of the token."""
-        return self.end - len(self.text)
-
     def __repr__(self) -> str:
         attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
         return f"<Token {attributes}>"
@@ -375,15 +351,31 @@ class _Tokenizer(type):
     def __new__(cls, clsname, bases, attrs):
         klass = super().__new__(cls, clsname, bases, attrs)

-        klass._QUOTES = {
-            f"{prefix}{s}": e
-            for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
-            for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
+        def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
+            return dict(
+                (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
+            )
+
+        def _quotes_to_format(
+            token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
+        ) -> t.Dict[str, t.Tuple[str, TokenType]]:
+            return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
+
+        klass._QUOTES = _convert_quotes(klass.QUOTES)
+        klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)
+
+        klass._FORMAT_STRINGS = {
+            **{
+                p + s: (e, TokenType.NATIONAL_STRING)
+                for s, e in klass._QUOTES.items()
+                for p in ("n", "N")
+            },
+            **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
+            **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
+            **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
+            **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
         }
-        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
-        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
-        klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
-        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
+
         klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
         klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
         klass._COMMENTS = dict(
@@ -393,23 +385,17 @@ class _Tokenizer(type):
         klass.KEYWORD_TRIE = new_trie(
             key.upper()
-            for key in {
-                **klass.KEYWORDS,
-                **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
-                **{quote: TokenType.QUOTE for quote in klass._QUOTES},
-                **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
-                **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
-                **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS},
-            }
+            for key in (
+                *klass.KEYWORDS,
+                *klass._COMMENTS,
+                *klass._QUOTES,
+                *klass._FORMAT_STRINGS,
+            )
             if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
         )

         return klass

-    @staticmethod
-    def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
-        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
-

 class Tokenizer(metaclass=_Tokenizer):
     SINGLE_TOKENS = {
@@ -450,6 +436,7 @@ class Tokenizer(metaclass=_Tokenizer):
     BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
     BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
     HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
     IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
     IDENTIFIER_ESCAPES = ['"']
     QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
@@ -457,9 +444,7 @@ class Tokenizer(metaclass=_Tokenizer):
     VAR_SINGLE_TOKENS: t.Set[str] = set()

     _COMMENTS: t.Dict[str, str] = {}
-    _BIT_STRINGS: t.Dict[str, str] = {}
-    _BYTE_STRINGS: t.Dict[str, str] = {}
-    _HEX_STRINGS: t.Dict[str, str] = {}
+    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
    _IDENTIFIERS: t.Dict[str, str] = {}
     _IDENTIFIER_ESCAPES: t.Set[str] = set()
     _QUOTES: t.Dict[str, str] = {}
@@ -495,30 +480,22 @@ class Tokenizer(metaclass=_Tokenizer):
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
-       "AT TIME ZONE": TokenType.AT_TIME_ZONE,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
-       "BOTH": TokenType.BOTH,
-       "BUCKET": TokenType.BUCKET,
-       "BY DEFAULT": TokenType.BY_DEFAULT,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
-       "CASCADE": TokenType.CASCADE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
-       "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
-       "COMPOUND": TokenType.COMPOUND,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
-       "CURRENT ROW": TokenType.CURRENT_ROW,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
@@ -528,8 +505,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
-       "DISTINCT FROM": TokenType.DISTINCT_FROM,
-       "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
@@ -544,18 +519,18 @@ class Tokenizer(metaclass=_Tokenizer):
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
-       "FOLLOWING": TokenType.FOLLOWING,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
+       "GEOGRAPHY": TokenType.GEOGRAPHY,
+       "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "IF": TokenType.IF,
        "ILIKE": TokenType.ILIKE,
-       "IGNORE NULLS": TokenType.IGNORE_NULLS,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
@@ -569,34 +544,25 @@ class Tokenizer(metaclass=_Tokenizer):
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "LATERAL": TokenType.LATERAL,
-       "LAZY": TokenType.LAZY,
-       "LEADING": TokenType.LEADING,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
-       "LOAD DATA": TokenType.LOAD_DATA,
-       "LOCAL": TokenType.LOCAL,
-       "MATERIALIZED": TokenType.MATERIALIZED,
+       "LOAD": TokenType.LOAD,
+       "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
-       "NO ACTION": TokenType.NO_ACTION,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
-       "NULLS FIRST": TokenType.NULLS_FIRST,
-       "NULLS LAST": TokenType.NULLS_LAST,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
-       "ONLY": TokenType.ONLY,
-       "OPTIONS": TokenType.OPTIONS,
        "OR": TokenType.OR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUTER": TokenType.OUTER,
-       "OUT OF": TokenType.OUT_OF,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
@@ -607,7 +573,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
-       "PRECEDING": TokenType.PRECEDING,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "QUALIFY": TokenType.QUALIFY,
@@ -615,7 +580,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "REPLACE": TokenType.REPLACE,
-       "RESPECT NULLS": TokenType.RESPECT_NULLS,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
@@ -624,25 +588,20 @@ class Tokenizer(metaclass=_Tokenizer):
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
-       "SEED": TokenType.SEED,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SET": TokenType.SET,
+       "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
-       "SORTKEY": TokenType.SORTKEY,
-       "SORT BY": TokenType.SORT_BY,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
-       "TRAILING": TokenType.TRAILING,
-       "UNBOUNDED": TokenType.UNBOUNDED,
        "UNION": TokenType.UNION,
-       "UNLOGGED": TokenType.UNLOGGED,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
@@ -656,10 +615,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
-       "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
-       "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
-       "WITHIN GROUP": TokenType.WITHIN_GROUP,
-       "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
@@ -718,15 +673,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "STRUCT": TokenType.STRUCT,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
-       "ALTER AGGREGATE": TokenType.COMMAND,
-       "ALTER DEFAULT": TokenType.COMMAND,
-       "ALTER DOMAIN": TokenType.COMMAND,
-       "ALTER ROLE": TokenType.COMMAND,
-       "ALTER RULE": TokenType.COMMAND,
-       "ALTER SEQUENCE": TokenType.COMMAND,
-       "ALTER TYPE": TokenType.COMMAND,
-       "ALTER USER": TokenType.COMMAND,
-       "ALTER VIEW": TokenType.COMMAND,
        "ANALYZE": TokenType.COMMAND,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
@@ -790,7 +736,7 @@ class Tokenizer(metaclass=_Tokenizer):
         self._start = 0
         self._current = 0
         self._line = 1
-        self._col = 1
+        self._col = 0
         self._comments: t.List[str] = []

         self._char = ""
@@ -803,13 +749,12 @@ class Tokenizer(metaclass=_Tokenizer):
         self.reset()
         self.sql = sql
         self.size = len(sql)
+
         try:
             self._scan()
         except Exception as e:
-            start = self._current - 50
-            end = self._current + 50
-            start = start if start > 0 else 0
-            end = end if end < self.size else self.size - 1
+            start = max(self._current - 50, 0)
+            end = min(self._current + 50, self.size - 1)
             context = self.sql[start:end]
             raise ValueError(f"Error tokenizing '{context}'") from e
@@ -834,17 +779,17 @@ class Tokenizer(metaclass=_Tokenizer):
             if until and until():
                 break

-        if self.tokens:
+        if self.tokens and self._comments:
             self.tokens[-1].comments.extend(self._comments)

     def _chars(self, size: int) -> str:
         if size == 1:
             return self._char
+
         start = self._current - 1
         end = start + size
-        if end <= self.size:
-            return self.sql[start:end]
-        return ""
+
+        return self.sql[start:end] if end <= self.size else ""

     def _advance(self, i: int = 1, alnum: bool = False) -> None:
         if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
@@ -859,6 +804,7 @@ class Tokenizer(metaclass=_Tokenizer):
         self._peek = "" if self._end else self.sql[self._current]

         if alnum and self._char.isalnum():
+            # Here we use local variables instead of attributes for better performance
             _col = self._col
             _current = self._current
             _end = self._end
@@ -885,11 +831,12 @@ class Tokenizer(metaclass=_Tokenizer):
         self.tokens.append(
             Token(
                 token_type,
-                self._text if text is None else text,
-                self._line,
-                self._col,
-                self._current,
-                self._comments,
+                text=self._text if text is None else text,
+                line=self._line,
+                col=self._col,
+                start=self._start,
+                end=self._current - 1,
+                comments=self._comments,
             )
         )
         self._comments = []
@@ -929,6 +876,7 @@ class Tokenizer(metaclass=_Tokenizer):
                 break

             if result == 2:
                 word = chars
+
             size += 1
             end = self._current - 1 + size
@@ -946,6 +894,7 @@ class Tokenizer(metaclass=_Tokenizer):
                 else:
                     skip = True
             else:
+                char = ""
                 chars = " "

         word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
@@ -959,8 +908,6 @@ class Tokenizer(metaclass=_Tokenizer):
         if self._scan_string(word):
             return
-        if self._scan_formatted_string(word):
-            return
         if self._scan_comment(word):
             return
@@ -1004,9 +951,9 @@ class Tokenizer(metaclass=_Tokenizer):
         if self._char == "0":
             peek = self._peek.upper()
             if peek == "B":
-                return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
+                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
             elif peek == "X":
-                return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
+                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)

         decimal = False
         scientific = 0
@@ -1075,37 +1022,24 @@ class Tokenizer(metaclass=_Tokenizer):
         return self._text

-    def _scan_string(self, quote: str) -> bool:
-        quote_end = self._QUOTES.get(quote)
-        if quote_end is None:
-            return False
+    def _scan_string(self, start: str) -> bool:
+        base = None
+        token_type = TokenType.STRING

-        self._advance(len(quote))
-        text = self._extract_string(quote_end)
-        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
-        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
-        return True
+        if start in self._QUOTES:
+            end = self._QUOTES[start]
+        elif start in self._FORMAT_STRINGS:
+            end, token_type = self._FORMAT_STRINGS[start]

-    # X'1234', b'0110', E'\\\\\' etc.
-    def _scan_formatted_string(self, string_start: str) -> bool:
-        if string_start in self._HEX_STRINGS:
-            delimiters = self._HEX_STRINGS
-            token_type = TokenType.HEX_STRING
-            base = 16
-        elif string_start in self._BIT_STRINGS:
-            delimiters = self._BIT_STRINGS
-            token_type = TokenType.BIT_STRING
-            base = 2
-        elif string_start in self._BYTE_STRINGS:
-            delimiters = self._BYTE_STRINGS
-            token_type = TokenType.BYTE_STRING
-            base = None
+            if token_type == TokenType.HEX_STRING:
+                base = 16
+            elif token_type == TokenType.BIT_STRING:
+                base = 2
         else:
             return False

-        self._advance(len(string_start))
-        string_end = delimiters[string_start]
-        text = self._extract_string(string_end)
+        self._advance(len(start))
+        text = self._extract_string(end)

         if base:
             try:
@@ -1114,6 +1048,8 @@ class Tokenizer(metaclass=_Tokenizer):
                 raise RuntimeError(
                     f"Numeric string contains invalid characters from {self._line}:{self._start}"
                 )
+        else:
+            text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text

         self._add(token_type, text)
         return True
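Net effect of the commit, for reference: the separate _BIT_STRINGS/_BYTE_STRINGS/_HEX_STRINGS tables and the dedicated _scan_formatted_string pass are folded into one _FORMAT_STRINGS mapping of prefix to (end delimiter, TokenType), NATIONAL becomes NATIONAL_STRING, new RAW_STRING/DATETIME64/SETTINGS/LOAD/LOCK token types appear, and Token now carries explicit start/end character offsets instead of deriving start from end - len(text). The sketch below is illustrative only and not part of the diff; it assumes the post-commit behaviour exactly as shown above, and the concrete token types and offsets it prints are an expectation, not authoritative output.

    # Illustrative sketch only -- not part of the commit.
    from sqlglot.tokens import Tokenizer

    sql = "SELECT N'abc', 'xyz' FROM t"

    tokenizer = Tokenizer()
    tokenizer.tokenize(sql)

    for token in tokenizer.tokens:
        # start/end are now absolute offsets into the input, so the raw slice
        # (including the N prefix and quotes) can be recovered directly rather
        # than re-derived from col and len(text) as the old start property did.
        print(token.token_type, repr(token.text), repr(sql[token.start : token.end + 1]))

    # The N'...' literal is expected to come back as TokenType.NATIONAL_STRING,
    # routed through the consolidated _FORMAT_STRINGS table together with any
    # bit/hex/byte/raw prefixes a dialect defines via BIT_STRINGS, HEX_STRINGS,
    # BYTE_STRINGS and the new RAW_STRINGS class attribute.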