Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r-- sqlglot/tokens.py | 184
1 file changed, 118 insertions(+), 66 deletions(-)
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index e4b754d..bd95bc7 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -38,6 +38,7 @@ class TokenType(AutoName):
DARROW = auto()
HASH_ARROW = auto()
DHASH_ARROW = auto()
+ LR_ARROW = auto()
ANNOTATION = auto()
DOLLAR = auto()
@@ -53,6 +54,7 @@ class TokenType(AutoName):
TABLE = auto()
VAR = auto()
BIT_STRING = auto()
+ HEX_STRING = auto()
# types
BOOLEAN = auto()
@@ -78,10 +80,17 @@ class TokenType(AutoName):
UUID = auto()
GEOGRAPHY = auto()
NULLABLE = auto()
+ GEOMETRY = auto()
+ HLLSKETCH = auto()
+ SUPER = auto()
+ SERIAL = auto()
+ SMALLSERIAL = auto()
+ BIGSERIAL = auto()
# keywords
ADD_FILE = auto()
ALIAS = auto()
+ ALWAYS = auto()
ALL = auto()
ALTER = auto()
ANALYZE = auto()
@@ -92,11 +101,12 @@ class TokenType(AutoName):
AUTO_INCREMENT = auto()
BEGIN = auto()
BETWEEN = auto()
+ BOTH = auto()
BUCKET = auto()
+ BY_DEFAULT = auto()
CACHE = auto()
CALL = auto()
CASE = auto()
- CAST = auto()
CHARACTER_SET = auto()
CHECK = auto()
CLUSTER_BY = auto()
@@ -104,7 +114,6 @@ class TokenType(AutoName):
COMMENT = auto()
COMMIT = auto()
CONSTRAINT = auto()
- CONVERT = auto()
CREATE = auto()
CROSS = auto()
CUBE = auto()
@@ -127,22 +136,24 @@ class TokenType(AutoName):
EXCEPT = auto()
EXISTS = auto()
EXPLAIN = auto()
- EXTRACT = auto()
FALSE = auto()
FETCH = auto()
FILTER = auto()
FINAL = auto()
FIRST = auto()
FOLLOWING = auto()
+ FOR = auto()
FOREIGN_KEY = auto()
FORMAT = auto()
FULL = auto()
FUNCTION = auto()
FROM = auto()
+ GENERATED = auto()
GROUP_BY = auto()
GROUPING_SETS = auto()
HAVING = auto()
HINT = auto()
+ IDENTITY = auto()
IF = auto()
IGNORE_NULLS = auto()
ILIKE = auto()
@@ -159,12 +170,14 @@ class TokenType(AutoName):
JOIN = auto()
LATERAL = auto()
LAZY = auto()
+ LEADING = auto()
LEFT = auto()
LIKE = auto()
LIMIT = auto()
LOCATION = auto()
MAP = auto()
MOD = auto()
+ NATURAL = auto()
NEXT = auto()
NO_ACTION = auto()
NULL = auto()
@@ -204,8 +217,10 @@ class TokenType(AutoName):
ROWS = auto()
SCHEMA_COMMENT = auto()
SELECT = auto()
+ SEPARATOR = auto()
SET = auto()
SHOW = auto()
+ SIMILAR_TO = auto()
SOME = auto()
SORT_BY = auto()
STORED = auto()
@@ -213,12 +228,11 @@ class TokenType(AutoName):
TABLE_FORMAT = auto()
TABLE_SAMPLE = auto()
TEMPORARY = auto()
- TIME = auto()
TOP = auto()
THEN = auto()
TRUE = auto()
+ TRAILING = auto()
TRUNCATE = auto()
- TRY_CAST = auto()
UNBOUNDED = auto()
UNCACHE = auto()
UNION = auto()
@@ -272,35 +286,32 @@ class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)
- klass.QUOTES = dict(
- (quote, quote) if isinstance(quote, str) else (quote[0], quote[1])
- for quote in klass.QUOTES
- )
-
- klass.IDENTIFIERS = dict(
- (identifier, identifier)
- if isinstance(identifier, str)
- else (identifier[0], identifier[1])
- for identifier in klass.IDENTIFIERS
- )
-
- klass.COMMENTS = dict(
- (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
- for comment in klass.COMMENTS
+ klass._QUOTES = cls._delimeter_list_to_dict(klass.QUOTES)
+ klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
+ klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
+ klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
+ klass._COMMENTS = dict(
+ (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) for comment in klass.COMMENTS
)
klass.KEYWORD_TRIE = new_trie(
key.upper()
for key, value in {
**klass.KEYWORDS,
- **{comment: TokenType.COMMENT for comment in klass.COMMENTS},
- **{quote: TokenType.QUOTE for quote in klass.QUOTES},
+ **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
+ **{quote: TokenType.QUOTE for quote in klass._QUOTES},
+ **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
+ **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
}.items()
if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
)
return klass
+ @staticmethod
+ def _delimeter_list_to_dict(list):
+ return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
+
class Tokenizer(metaclass=_Tokenizer):
SINGLE_TOKENS = {
@@ -339,6 +350,10 @@ class Tokenizer(metaclass=_Tokenizer):
QUOTES = ["'"]
+ BIT_STRINGS = []
+
+ HEX_STRINGS = []
+
IDENTIFIERS = ['"']
ESCAPE = "'"
@@ -357,6 +372,7 @@ class Tokenizer(metaclass=_Tokenizer):
"->>": TokenType.DARROW,
"#>": TokenType.HASH_ARROW,
"#>>": TokenType.DHASH_ARROW,
+ "<->": TokenType.LR_ARROW,
"ADD ARCHIVE": TokenType.ADD_FILE,
"ADD ARCHIVES": TokenType.ADD_FILE,
"ADD FILE": TokenType.ADD_FILE,
@@ -374,12 +390,12 @@ class Tokenizer(metaclass=_Tokenizer):
"AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
"BEGIN": TokenType.BEGIN,
"BETWEEN": TokenType.BETWEEN,
+ "BOTH": TokenType.BOTH,
"BUCKET": TokenType.BUCKET,
"CALL": TokenType.CALL,
"CACHE": TokenType.CACHE,
"UNCACHE": TokenType.UNCACHE,
"CASE": TokenType.CASE,
- "CAST": TokenType.CAST,
"CHARACTER SET": TokenType.CHARACTER_SET,
"CHECK": TokenType.CHECK,
"CLUSTER BY": TokenType.CLUSTER_BY,
@@ -387,7 +403,6 @@ class Tokenizer(metaclass=_Tokenizer):
"COMMENT": TokenType.SCHEMA_COMMENT,
"COMMIT": TokenType.COMMIT,
"CONSTRAINT": TokenType.CONSTRAINT,
- "CONVERT": TokenType.CONVERT,
"CREATE": TokenType.CREATE,
"CROSS": TokenType.CROSS,
"CUBE": TokenType.CUBE,
@@ -408,7 +423,6 @@ class Tokenizer(metaclass=_Tokenizer):
"EXCEPT": TokenType.EXCEPT,
"EXISTS": TokenType.EXISTS,
"EXPLAIN": TokenType.EXPLAIN,
- "EXTRACT": TokenType.EXTRACT,
"FALSE": TokenType.FALSE,
"FETCH": TokenType.FETCH,
"FILTER": TokenType.FILTER,
@@ -437,10 +451,12 @@ class Tokenizer(metaclass=_Tokenizer):
"JOIN": TokenType.JOIN,
"LATERAL": TokenType.LATERAL,
"LAZY": TokenType.LAZY,
+ "LEADING": TokenType.LEADING,
"LEFT": TokenType.LEFT,
"LIKE": TokenType.LIKE,
"LIMIT": TokenType.LIMIT,
"LOCATION": TokenType.LOCATION,
+ "NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
@@ -490,8 +506,8 @@ class Tokenizer(metaclass=_Tokenizer):
"TEMPORARY": TokenType.TEMPORARY,
"THEN": TokenType.THEN,
"TRUE": TokenType.TRUE,
+ "TRAILING": TokenType.TRAILING,
"TRUNCATE": TokenType.TRUNCATE,
- "TRY_CAST": TokenType.TRY_CAST,
"UNBOUNDED": TokenType.UNBOUNDED,
"UNION": TokenType.UNION,
"UNNEST": TokenType.UNNEST,
@@ -626,14 +642,12 @@ class Tokenizer(metaclass=_Tokenizer):
break
white_space = self.WHITE_SPACE.get(self._char)
- identifier_end = self.IDENTIFIERS.get(self._char)
+ identifier_end = self._IDENTIFIERS.get(self._char)
if white_space:
if white_space == TokenType.BREAK:
self._col = 1
self._line += 1
- elif self._char == "0" and self._peek == "x":
- self._scan_hex()
elif self._char.isdigit():
self._scan_number()
elif identifier_end:
@@ -666,9 +680,7 @@ class Tokenizer(metaclass=_Tokenizer):
text = self._text if text is None else text
self.tokens.append(Token(token_type, text, self._line, self._col))
- if token_type in self.COMMANDS and (
- len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
- ):
+ if token_type in self.COMMANDS and (len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON):
self._start = self._current
while not self._end and self._peek != ";":
self._advance()
@@ -725,6 +737,8 @@ class Tokenizer(metaclass=_Tokenizer):
if self._scan_string(word):
return
+ if self._scan_numeric_string(word):
+ return
if self._scan_comment(word):
return
@@ -732,10 +746,10 @@ class Tokenizer(metaclass=_Tokenizer):
self._add(self.KEYWORDS[word.upper()])
def _scan_comment(self, comment_start):
- if comment_start not in self.COMMENTS:
+ if comment_start not in self._COMMENTS:
return False
- comment_end = self.COMMENTS[comment_start]
+ comment_end = self._COMMENTS[comment_start]
if comment_end:
comment_end_size = len(comment_end)
@@ -749,15 +763,18 @@ class Tokenizer(metaclass=_Tokenizer):
return True
def _scan_annotation(self):
- while (
- not self._end
- and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK
- and self._peek != ","
- ):
+ while not self._end and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK and self._peek != ",":
self._advance()
self._add(TokenType.ANNOTATION, self._text[1:])
def _scan_number(self):
+ if self._char == "0":
+ peek = self._peek.upper()
+ if peek == "B":
+ return self._scan_bits()
+ elif peek == "X":
+ return self._scan_hex()
+
decimal = False
scientific = 0
@@ -788,57 +805,71 @@ class Tokenizer(metaclass=_Tokenizer):
else:
return self._add(TokenType.NUMBER)
+ def _scan_bits(self):
+ self._advance()
+ value = self._extract_value()
+ try:
+ self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
+ except ValueError:
+ self._add(TokenType.IDENTIFIER)
+
def _scan_hex(self):
self._advance()
+ value = self._extract_value()
+ try:
+ self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
+ except ValueError:
+ self._add(TokenType.IDENTIFIER)
+ def _extract_value(self):
while True:
char = self._peek.strip()
if char and char not in self.SINGLE_TOKENS:
self._advance()
else:
break
- try:
- self._add(TokenType.BIT_STRING, f"{int(self._text, 16):b}")
- except ValueError:
- self._add(TokenType.IDENTIFIER)
+
+ return self._text
def _scan_string(self, quote):
- quote_end = self.QUOTES.get(quote)
+ quote_end = self._QUOTES.get(quote)
if quote_end is None:
return False
- text = ""
self._advance(len(quote))
- quote_end_size = len(quote_end)
-
- while True:
- if self._char == self.ESCAPE and self._peek == quote_end:
- text += quote
- self._advance(2)
- else:
- if self._chars(quote_end_size) == quote_end:
- if quote_end_size > 1:
- self._advance(quote_end_size - 1)
- break
-
- if self._end:
- raise RuntimeError(
- f"Missing {quote} from {self._line}:{self._start}"
- )
- text += self._char
- self._advance()
+ text = self._extract_string(quote_end)
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
text = text.replace("\\\\", "\\") if self.ESCAPE == "\\" else text
self._add(TokenType.STRING, text)
return True
+ def _scan_numeric_string(self, string_start):
+ if string_start in self._HEX_STRINGS:
+ delimiters = self._HEX_STRINGS
+ token_type = TokenType.HEX_STRING
+ base = 16
+ elif string_start in self._BIT_STRINGS:
+ delimiters = self._BIT_STRINGS
+ token_type = TokenType.BIT_STRING
+ base = 2
+ else:
+ return False
+
+ self._advance(len(string_start))
+ string_end = delimiters.get(string_start)
+ text = self._extract_string(string_end)
+
+ try:
+ self._add(token_type, f"{int(text, base)}")
+ except ValueError:
+ raise RuntimeError(f"Numeric string contains invalid characters from {self._line}:{self._start}")
+ return True
+
def _scan_identifier(self, identifier_end):
while self._peek != identifier_end:
if self._end:
- raise RuntimeError(
- f"Missing {identifier_end} from {self._line}:{self._start}"
- )
+ raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
self._advance()
self._advance()
self._add(TokenType.IDENTIFIER, self._text[1:-1])
@@ -851,3 +882,24 @@ class Tokenizer(metaclass=_Tokenizer):
else:
break
self._add(self.KEYWORDS.get(self._text.upper(), TokenType.VAR))
+
+ def _extract_string(self, delimiter):
+ text = ""
+ delim_size = len(delimiter)
+
+ while True:
+ if self._char == self.ESCAPE and self._peek == delimiter:
+ text += delimiter
+ self._advance(2)
+ else:
+ if self._chars(delim_size) == delimiter:
+ if delim_size > 1:
+ self._advance(delim_size - 1)
+ break
+
+ if self._end:
+ raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
+ text += self._char
+ self._advance()
+
+ return text
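
For context, a minimal sketch of how a dialect-level tokenizer could opt into the new BIT_STRINGS / HEX_STRINGS hooks introduced above. The ExampleTokenizer subclass, its delimiter pairs, and the tokenize() call are illustrative assumptions based on the Tokenizer/Token API visible in this file, not definitions taken from this commit.

```python
from sqlglot.tokens import Tokenizer

# Hypothetical dialect tokenizer (not part of this commit) opting into the new
# BIT_STRINGS / HEX_STRINGS class attributes added by the diff above. The
# delimiter pairs are assumptions for illustration; real dialects define their own.
class ExampleTokenizer(Tokenizer):
    BIT_STRINGS = [("b'", "'")]  # b'1010' -> BIT_STRING token with text "10"
    HEX_STRINGS = [("x'", "'")]  # x'1f'   -> HEX_STRING token with text "31"

tokens = ExampleTokenizer().tokenize("SELECT b'1010', x'1f', 0xff")
print([(t.token_type, t.text) for t in tokens])
# Under these assumptions: b'1010' and x'1f' are decoded via _scan_numeric_string,
# while the bare 0xff literal goes through the new _scan_number -> _scan_hex path,
# all yielding decimal text ("10", "31", and "255" respectively).
```

Because the _Tokenizer metaclass rebuilds _BIT_STRINGS, _HEX_STRINGS, and KEYWORD_TRIE for every subclass, declaring the delimiters on the subclass is all that is needed for the prefixes to be recognized during keyword scanning.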