diff options
Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r-- | sqlglot/tokens.py | 95 |
1 files changed, 50 insertions, 45 deletions
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py index 64c1f92..5e50b7c 100644 --- a/sqlglot/tokens.py +++ b/sqlglot/tokens.py @@ -84,6 +84,10 @@ class TokenType(AutoName): UINT = auto() BIGINT = auto() UBIGINT = auto() + INT128 = auto() + UINT128 = auto() + INT256 = auto() + UINT256 = auto() FLOAT = auto() DOUBLE = auto() DECIMAL = auto() @@ -774,8 +778,6 @@ class Tokenizer(metaclass=_Tokenizer): "_end", "_peek", "_prev_token_line", - "_prev_token_comments", - "_prev_token_type", ) def __init__(self) -> None: @@ -795,8 +797,6 @@ class Tokenizer(metaclass=_Tokenizer): self._end = False self._peek = "" self._prev_token_line = -1 - self._prev_token_comments: t.List[str] = [] - self._prev_token_type: t.Optional[TokenType] = None def tokenize(self, sql: str) -> t.List[Token]: """Returns a list of tokens corresponding to the SQL string `sql`.""" @@ -846,7 +846,7 @@ class Tokenizer(metaclass=_Tokenizer): return self.sql[start:end] return "" - def _advance(self, i: int = 1) -> None: + def _advance(self, i: int = 1, alnum: bool = False) -> None: if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: self._col = 1 self._line += 1 @@ -858,14 +858,30 @@ class Tokenizer(metaclass=_Tokenizer): self._char = self.sql[self._current - 1] self._peek = "" if self._end else self.sql[self._current] + if alnum and self._char.isalnum(): + _col = self._col + _current = self._current + _end = self._end + _peek = self._peek + + while _peek.isalnum(): + _col += 1 + _current += 1 + _end = _current >= self.size + _peek = "" if _end else self.sql[_current] + + self._col = _col + self._current = _current + self._end = _end + self._peek = _peek + self._char = self.sql[_current - 1] + @property def _text(self) -> str: return self.sql[self._start : self._current] def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: self._prev_token_line = self._line - self._prev_token_comments = self._comments - self._prev_token_type = token_type self.tokens.append( Token( token_type, @@ -966,13 +982,13 @@ class Tokenizer(metaclass=_Tokenizer): comment_end_size = len(comment_end) while not self._end and self._chars(comment_end_size) != comment_end: - self._advance() + self._advance(alnum=True) self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) self._advance(comment_end_size - 1) else: while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: - self._advance() + self._advance(alnum=True) self._comments.append(self._text[comment_start_size:]) # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. @@ -988,9 +1004,9 @@ class Tokenizer(metaclass=_Tokenizer): if self._char == "0": peek = self._peek.upper() if peek == "B": - return self._scan_bits() + return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER) elif peek == "X": - return self._scan_hex() + return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER) decimal = False scientific = 0 @@ -1033,7 +1049,9 @@ class Tokenizer(metaclass=_Tokenizer): self._advance() value = self._extract_value() try: - self._add(TokenType.BIT_STRING, f"{int(value, 2)}") + # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier + int(value, 2) + self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b except ValueError: self._add(TokenType.IDENTIFIER) @@ -1041,7 +1059,9 @@ class Tokenizer(metaclass=_Tokenizer): self._advance() value = self._extract_value() try: - self._add(TokenType.HEX_STRING, f"{int(value, 16)}") + # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier + int(value, 16) + self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x except ValueError: self._add(TokenType.IDENTIFIER) @@ -1049,7 +1069,7 @@ class Tokenizer(metaclass=_Tokenizer): while True: char = self._peek.strip() if char and char not in self.SINGLE_TOKENS: - self._advance() + self._advance(alnum=True) else: break @@ -1066,7 +1086,7 @@ class Tokenizer(metaclass=_Tokenizer): self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text) return True - # X'1234, b'0110', E'\\\\\' etc. + # X'1234', b'0110', E'\\\\\' etc. def _scan_formatted_string(self, string_start: str) -> bool: if string_start in self._HEX_STRINGS: delimiters = self._HEX_STRINGS @@ -1087,60 +1107,43 @@ class Tokenizer(metaclass=_Tokenizer): string_end = delimiters[string_start] text = self._extract_string(string_end) - if base is None: - self._add(token_type, text) - else: + if base: try: - self._add(token_type, f"{int(text, base)}") + int(text, base) except: raise RuntimeError( f"Numeric string contains invalid characters from {self._line}:{self._start}" ) + self._add(token_type, text) return True def _scan_identifier(self, identifier_end: str) -> None: - text = "" - identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES - - while True: - if self._end: - raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}") - - self._advance() - if self._char == identifier_end: - if identifier_end_is_escape and self._peek == identifier_end: - text += identifier_end - self._advance() - continue - - break - - text += self._char - + self._advance() + text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES) self._add(TokenType.IDENTIFIER, text) def _scan_var(self) -> None: while True: char = self._peek.strip() if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): - self._advance() + self._advance(alnum=True) else: break + self._add( TokenType.VAR - if self._prev_token_type == TokenType.PARAMETER + if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) ) - def _extract_string(self, delimiter: str) -> str: + def _extract_string(self, delimiter: str, escapes=None) -> str: text = "" delim_size = len(delimiter) + escapes = self._STRING_ESCAPES if escapes is None else escapes while True: - if self._char in self._STRING_ESCAPES and ( - self._peek == delimiter or self._peek in self._STRING_ESCAPES - ): + if self._char in escapes and (self._peek == delimiter or self._peek in escapes): if self._peek == delimiter: text += self._peek else: @@ -1158,7 +1161,9 @@ class Tokenizer(metaclass=_Tokenizer): if self._end: raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}") - text += self._char - self._advance() + + current = self._current - 1 + self._advance(alnum=True) + text += self.sql[current : self._current - 1] return text |