Diffstat (limited to 'sqlglot/tokens.py')
 -rw-r--r--  sqlglot/tokens.py  87
 1 file changed, 55 insertions(+), 32 deletions(-)
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index da9df7d..7f0cb5d 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -135,6 +135,7 @@ class TokenType(AutoName):
LONGBLOB = auto()
TINYBLOB = auto()
TINYTEXT = auto()
+ NAME = auto()
BINARY = auto()
VARBINARY = auto()
JSON = auto()
@@ -290,6 +291,7 @@ class TokenType(AutoName):
LOAD = auto()
LOCK = auto()
MAP = auto()
+ MATCH_CONDITION = auto()
MATCH_RECOGNIZE = auto()
MEMBER_OF = auto()
MERGE = auto()
@@ -317,6 +319,7 @@ class TokenType(AutoName):
PERCENT = auto()
PIVOT = auto()
PLACEHOLDER = auto()
+ POSITIONAL = auto()
PRAGMA = auto()
PREWHERE = auto()
PRIMARY_KEY = auto()
@@ -340,6 +343,7 @@ class TokenType(AutoName):
SELECT = auto()
SEMI = auto()
SEPARATOR = auto()
+ SEQUENCE = auto()
SERDE_PROPERTIES = auto()
SET = auto()
SETTINGS = auto()
@@ -518,6 +522,7 @@ class _Tokenizer(type):
break_=_TOKEN_TYPE_TO_INDEX[TokenType.BREAK],
dcolon=_TOKEN_TYPE_TO_INDEX[TokenType.DCOLON],
heredoc_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEREDOC_STRING],
+ raw_string=_TOKEN_TYPE_TO_INDEX[TokenType.RAW_STRING],
hex_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEX_STRING],
identifier=_TOKEN_TYPE_TO_INDEX[TokenType.IDENTIFIER],
number=_TOKEN_TYPE_TO_INDEX[TokenType.NUMBER],
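
The new raw_string entry is handed to the Rust tokenizer alongside the other core token types. As a hedged sketch of what an index table like _TOKEN_TYPE_TO_INDEX could look like (the name appears above; this construction is an assumption for illustration only):

    # Hypothetical reconstruction, for illustration only: give every
    # TokenType member a stable integer index so the Rust tokenizer can
    # refer to token types by number instead of by Python enum.
    _TOKEN_TYPE_TO_INDEX = {token_type: i for i, token_type in enumerate(TokenType)}
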
@@ -562,8 +567,7 @@ class Tokenizer(metaclass=_Tokenizer):
"~": TokenType.TILDA,
"?": TokenType.PLACEHOLDER,
"@": TokenType.PARAMETER,
- # used for breaking a var like x'y' but nothing else
- # the token type doesn't matter
+ # Used for breaking a var like x'y' but nothing else; the token type doesn't matter
"'": TokenType.QUOTE,
"`": TokenType.IDENTIFIER,
'"': TokenType.IDENTIFIER,
@@ -796,6 +800,7 @@ class Tokenizer(metaclass=_Tokenizer):
"LONG": TokenType.BIGINT,
"BIGINT": TokenType.BIGINT,
"INT8": TokenType.TINYINT,
+ "UINT": TokenType.UINT,
"DEC": TokenType.DECIMAL,
"DECIMAL": TokenType.DECIMAL,
"BIGDECIMAL": TokenType.BIGDECIMAL,
@@ -856,6 +861,7 @@ class Tokenizer(metaclass=_Tokenizer):
"DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
"UNIQUE": TokenType.UNIQUE,
"STRUCT": TokenType.STRUCT,
+ "SEQUENCE": TokenType.SEQUENCE,
"VARIANT": TokenType.VARIANT,
"ALTER": TokenType.ALTER,
"ANALYZE": TokenType.COMMAND,
@@ -888,7 +894,7 @@ class Tokenizer(metaclass=_Tokenizer):
COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
- # handle numeric literals like in hive (3L = BIGINT)
+ # Handle numeric literals like in hive (3L = BIGINT)
NUMERIC_LITERALS: t.Dict[str, str] = {}
COMMENTS = ["--", ("/*", "*/")]
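
NUMERIC_LITERALS pairs a literal suffix with the type it implies, as the Hive comment above suggests. A sketch of the kind of override a dialect tokenizer might declare (values are illustrative; see the Hive dialect for the real table):

    from sqlglot.tokens import Tokenizer

    class HiveLikeTokenizer(Tokenizer):
        # Suffix -> implied type: 3L lexes as a BIGINT literal and
        # 2.5D as a DOUBLE. Illustrative values only.
        NUMERIC_LITERALS = {
            "L": "BIGINT",
            "S": "SMALLINT",
            "Y": "TINYINT",
            "D": "DOUBLE",
            "BD": "DECIMAL",
        }
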
@@ -917,7 +923,7 @@ class Tokenizer(metaclass=_Tokenizer):
if USE_RS_TOKENIZER:
self._rs_dialect_settings = RsTokenizerDialectSettings(
- escape_sequences=self.dialect.ESCAPE_SEQUENCES,
+ unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
)
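
The renamed setting mirrors Dialect.UNESCAPED_SEQUENCES, which the extractor consults further down: keys are the two-character escaped spellings, values are the characters they unescape to. A hedged sketch of a dialect declaring it (values illustrative):

    from sqlglot.dialects.dialect import Dialect

    class BackslashDialect(Dialect):
        # "\\n" (backslash then n, two characters) unescapes to a newline.
        UNESCAPED_SEQUENCES = {
            "\\n": "\n",
            "\\t": "\t",
            "\\\\": "\\",
        }
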
@@ -961,8 +967,7 @@ class Tokenizer(metaclass=_Tokenizer):
while self.size and not self._end:
current = self._current
- # skip spaces inline rather than iteratively call advance()
- # for performance reasons
+ # Skip spaces here rather than iteratively calling advance() for performance reasons
while current < self.size:
char = self.sql[current]
@@ -971,12 +976,10 @@ class Tokenizer(metaclass=_Tokenizer):
else:
break
- n = current - self._current
- self._start = current
- self._advance(n if n > 1 else 1)
+ offset = current - self._current if current > self._current else 1
- if self._char is None:
- break
+ self._start = current
+ self._advance(offset)
if not self._char.isspace():
if self._char.isdigit():
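
The rewritten loop computes one offset instead of the old n/None dance: the length of the skipped run when there is one, otherwise 1, so a single _advance call always makes progress. A standalone model of the arithmetic (not the actual method):

    # Standalone model of the skip arithmetic.
    def space_offset(sql: str, pos: int) -> int:
        current = pos
        while current < len(sql) and sql[current] == " ":
            current += 1
        # Advance over the whole run of spaces, or by 1 if none were found.
        return current - pos if current > pos else 1

    assert space_offset("a   b", 1) == 3
    assert space_offset("ab", 1) == 1
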
@@ -1004,12 +1007,9 @@ class Tokenizer(metaclass=_Tokenizer):
def _advance(self, i: int = 1, alnum: bool = False) -> None:
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
# Ensures we don't count an extra line if we get a \r\n line break sequence
- if self._char == "\r" and self._peek == "\n":
- i = 2
- self._start += 1
-
- self._col = 1
- self._line += 1
+ if not (self._char == "\r" and self._peek == "\n"):
+ self._col = 1
+ self._line += 1
else:
self._col += i
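
The inverted condition preserves the old intent with less state juggling: a \r immediately followed by \n defers the line bump to the \n, so \r\n counts as a single break while a lone \r still counts. A tiny model of the rule:

    # Model of the line-counting rule: "\r" directly before "\n" does not
    # increment the line; the "\n" that follows does.
    def count_lines(text: str) -> int:
        line = 1
        for i, char in enumerate(text):
            if char in "\r\n":
                if not (char == "\r" and text[i + 1 : i + 2] == "\n"):
                    line += 1
        return line

    assert count_lines("a\r\nb") == 2   # one break, not two
    assert count_lines("a\rb\nc") == 3  # lone \r and \n each count
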
@@ -1268,13 +1268,27 @@ class Tokenizer(metaclass=_Tokenizer):
return True
self._advance()
- tag = "" if self._char == end else self._extract_string(end)
+
+ if self._char == end:
+ tag = ""
+ else:
+ tag = self._extract_string(
+ end,
+ unescape_sequences=False,
+ raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
+ )
+
+ if self._end and tag and self.HEREDOC_TAG_IS_IDENTIFIER:
+ self._advance(-len(tag))
+ self._add(self.HEREDOC_STRING_ALTERNATIVE)
+ return True
+
end = f"{start}{tag}{end}"
else:
return False
self._advance(len(start))
- text = self._extract_string(end)
+ text = self._extract_string(end, unescape_sequences=token_type != TokenType.RAW_STRING)
if base:
try:
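
The new branch covers dialects that set HEREDOC_TAG_IS_IDENTIFIER: the tag is extracted without unescaping and without raising on a missing delimiter, and if the input ends before a closing tag the tokenizer rewinds over the tag and emits HEREDOC_STRING_ALTERNATIVE instead of a TokenError. The happy path is ordinary dollar quoting (assuming sqlglot's top-level tokenize helper):

    import sqlglot

    # A properly closed dollar-quoted string still scans as a heredoc.
    for token in sqlglot.tokenize("SELECT $tag$hello$tag$", read="postgres"):
        print(token.token_type, token.text)
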
@@ -1289,7 +1303,7 @@ class Tokenizer(metaclass=_Tokenizer):
def _scan_identifier(self, identifier_end: str) -> None:
self._advance()
- text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
+ text = self._extract_string(identifier_end, escapes=self._IDENTIFIER_ESCAPES)
self._add(TokenType.IDENTIFIER, text)
def _scan_var(self) -> None:
@@ -1306,13 +1320,30 @@ class Tokenizer(metaclass=_Tokenizer):
else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
)
- def _extract_string(self, delimiter: str, escapes=None) -> str:
+ def _extract_string(
+ self,
+ delimiter: str,
+ escapes: t.Optional[t.Set[str]] = None,
+ unescape_sequences: bool = True,
+ raise_unmatched: bool = True,
+ ) -> str:
text = ""
delim_size = len(delimiter)
escapes = self._STRING_ESCAPES if escapes is None else escapes
while True:
if (
+ unescape_sequences
+ and self.dialect.UNESCAPED_SEQUENCES
+ and self._peek
+ and self._char in self.STRING_ESCAPES
+ ):
+ unescaped_sequence = self.dialect.UNESCAPED_SEQUENCES.get(self._char + self._peek)
+ if unescaped_sequence:
+ self._advance(2)
+ text += unescaped_sequence
+ continue
+ if (
self._char in escapes
and (self._peek == delimiter or self._peek in escapes)
and (self._char not in self._QUOTES or self._char == self._peek)
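
Hoisting the unescape check to the top of the loop (it previously ran after the end-of-input check, as the hunk below shows) means sequences are resolved before the escape/delimiter logic, and _scan_string can now disable it for raw strings via unescape_sequences=False. A standalone model of the step (not the actual method):

    # Standalone model of the unescape step.
    UNESCAPED_SEQUENCES = {"\\n": "\n", "\\t": "\t"}
    STRING_ESCAPES = {"\\"}

    def unescape(text: str) -> str:
        out, i = "", 0
        while i < len(text):
            pair = text[i : i + 2]
            if text[i] in STRING_ESCAPES and pair in UNESCAPED_SEQUENCES:
                out += UNESCAPED_SEQUENCES[pair]
                i += 2  # mirrors self._advance(2)
            else:
                out += text[i]
                i += 1
        return out

    assert unescape(r"a\nb") == "a\nb"  # backslash-n collapses to a newline
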
@@ -1333,18 +1364,10 @@ class Tokenizer(metaclass=_Tokenizer):
break
if self._end:
- raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
+ if not raise_unmatched:
+ return text + self._char
- if (
- self.dialect.ESCAPE_SEQUENCES
- and self._peek
- and self._char in self.STRING_ESCAPES
- ):
- escaped_sequence = self.dialect.ESCAPE_SEQUENCES.get(self._char + self._peek)
- if escaped_sequence:
- self._advance(2)
- text += escaped_sequence
- continue
+ raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
current = self._current - 1
self._advance(alnum=True)
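
With raise_unmatched=False the extractor now returns what it has scanned (plus the final character) instead of raising TokenError at end of input, which is what lets the heredoc scanner above probe for a closing tag and then rewind. A condensed model of the two exit paths (not the actual method):

    # Condensed model of the new exit paths.
    def extract(text: str, pos: int, delimiter: str, raise_unmatched: bool = True) -> str:
        end = text.find(delimiter, pos)
        if end == -1:
            if not raise_unmatched:
                return text[pos:]  # best effort: hand back what was scanned
            raise ValueError(f"Missing {delimiter}")
        return text[pos:end]
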