Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r--  sqlglot/tokens.py  246
1 file changed, 91 insertions(+), 155 deletions(-)
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index 5e50b7c..ad329d2 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -51,7 +51,6 @@ class TokenType(AutoName):
DOLLAR = auto()
PARAMETER = auto()
SESSION_PARAMETER = auto()
- NATIONAL = auto()
DAMP = auto()
BLOCK_START = auto()
@@ -72,6 +71,8 @@ class TokenType(AutoName):
BIT_STRING = auto()
HEX_STRING = auto()
BYTE_STRING = auto()
+ NATIONAL_STRING = auto()
+ RAW_STRING = auto()
# types
BIT = auto()
@@ -110,6 +111,7 @@ class TokenType(AutoName):
TIMESTAMPTZ = auto()
TIMESTAMPLTZ = auto()
DATETIME = auto()
+ DATETIME64 = auto()
DATE = auto()
UUID = auto()
GEOGRAPHY = auto()
@@ -142,30 +144,22 @@ class TokenType(AutoName):
ARRAY = auto()
ASC = auto()
ASOF = auto()
- AT_TIME_ZONE = auto()
AUTO_INCREMENT = auto()
BEGIN = auto()
BETWEEN = auto()
- BOTH = auto()
- BUCKET = auto()
- BY_DEFAULT = auto()
CACHE = auto()
- CASCADE = auto()
CASE = auto()
CHARACTER_SET = auto()
- CLUSTER_BY = auto()
COLLATE = auto()
COMMAND = auto()
COMMENT = auto()
COMMIT = auto()
- COMPOUND = auto()
CONSTRAINT = auto()
CREATE = auto()
CROSS = auto()
CUBE = auto()
CURRENT_DATE = auto()
CURRENT_DATETIME = auto()
- CURRENT_ROW = auto()
CURRENT_TIME = auto()
CURRENT_TIMESTAMP = auto()
CURRENT_USER = auto()
@@ -174,8 +168,6 @@ class TokenType(AutoName):
DESC = auto()
DESCRIBE = auto()
DISTINCT = auto()
- DISTINCT_FROM = auto()
- DISTRIBUTE_BY = auto()
DIV = auto()
DROP = auto()
ELSE = auto()
@@ -189,7 +181,6 @@ class TokenType(AutoName):
FILTER = auto()
FINAL = auto()
FIRST = auto()
- FOLLOWING = auto()
FOR = auto()
FOREIGN_KEY = auto()
FORMAT = auto()
@@ -203,7 +194,6 @@ class TokenType(AutoName):
HAVING = auto()
HINT = auto()
IF = auto()
- IGNORE_NULLS = auto()
ILIKE = auto()
ILIKE_ANY = auto()
IN = auto()
@@ -222,36 +212,27 @@ class TokenType(AutoName):
KEEP = auto()
LANGUAGE = auto()
LATERAL = auto()
- LAZY = auto()
- LEADING = auto()
LEFT = auto()
LIKE = auto()
LIKE_ANY = auto()
LIMIT = auto()
- LOAD_DATA = auto()
- LOCAL = auto()
+ LOAD = auto()
+ LOCK = auto()
MAP = auto()
MATCH_RECOGNIZE = auto()
- MATERIALIZED = auto()
MERGE = auto()
MOD = auto()
NATURAL = auto()
NEXT = auto()
NEXT_VALUE_FOR = auto()
- NO_ACTION = auto()
NOTNULL = auto()
NULL = auto()
- NULLS_FIRST = auto()
- NULLS_LAST = auto()
OFFSET = auto()
ON = auto()
- ONLY = auto()
- OPTIONS = auto()
ORDER_BY = auto()
ORDERED = auto()
ORDINALITY = auto()
OUTER = auto()
- OUT_OF = auto()
OVER = auto()
OVERLAPS = auto()
OVERWRITE = auto()
@@ -261,7 +242,6 @@ class TokenType(AutoName):
PIVOT = auto()
PLACEHOLDER = auto()
PRAGMA = auto()
- PRECEDING = auto()
PRIMARY_KEY = auto()
PROCEDURE = auto()
PROPERTIES = auto()
@@ -271,7 +251,6 @@ class TokenType(AutoName):
RANGE = auto()
RECURSIVE = auto()
REPLACE = auto()
- RESPECT_NULLS = auto()
RETURNING = auto()
REFERENCES = auto()
RIGHT = auto()
@@ -280,28 +259,23 @@ class TokenType(AutoName):
ROLLUP = auto()
ROW = auto()
ROWS = auto()
- SEED = auto()
SELECT = auto()
SEMI = auto()
SEPARATOR = auto()
SERDE_PROPERTIES = auto()
SET = auto()
+ SETTINGS = auto()
SHOW = auto()
SIMILAR_TO = auto()
SOME = auto()
- SORTKEY = auto()
- SORT_BY = auto()
STRUCT = auto()
TABLE_SAMPLE = auto()
TEMPORARY = auto()
TOP = auto()
THEN = auto()
- TRAILING = auto()
TRUE = auto()
- UNBOUNDED = auto()
UNCACHE = auto()
UNION = auto()
- UNLOGGED = auto()
UNNEST = auto()
UNPIVOT = auto()
UPDATE = auto()
@@ -314,15 +288,11 @@ class TokenType(AutoName):
WHERE = auto()
WINDOW = auto()
WITH = auto()
- WITH_TIME_ZONE = auto()
- WITH_LOCAL_TIME_ZONE = auto()
- WITHIN_GROUP = auto()
- WITHOUT_TIME_ZONE = auto()
UNIQUE = auto()
class Token:
- __slots__ = ("token_type", "text", "line", "col", "end", "comments")
+ __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
@classmethod
def number(cls, number: int) -> Token:
@@ -350,22 +320,28 @@ class Token:
text: str,
line: int = 1,
col: int = 1,
+ start: int = 0,
end: int = 0,
comments: t.List[str] = [],
) -> None:
+ """Token initializer.
+
+ Args:
+ token_type: The TokenType Enum.
+ text: The text of the token.
+ line: The line that the token ends on.
+ col: The column that the token ends on.
+ start: The start index of the token.
+ end: The ending index of the token.
+ """
self.token_type = token_type
self.text = text
self.line = line
- size = len(text)
self.col = col
- self.end = end if end else size
+ self.start = start
+ self.end = end
self.comments = comments
- @property
- def start(self) -> int:
- """Returns the start of the token."""
- return self.end - len(self.text)
-
def __repr__(self) -> str:
attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
return f"<Token {attributes}>"
@@ -375,15 +351,31 @@ class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)
- klass._QUOTES = {
- f"{prefix}{s}": e
- for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
- for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
+ def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
+ return dict(
+ (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
+ )
+
+ def _quotes_to_format(
+ token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
+ ) -> t.Dict[str, t.Tuple[str, TokenType]]:
+ return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
+
+ klass._QUOTES = _convert_quotes(klass.QUOTES)
+ klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)
+
+ klass._FORMAT_STRINGS = {
+ **{
+ p + s: (e, TokenType.NATIONAL_STRING)
+ for s, e in klass._QUOTES.items()
+ for p in ("n", "N")
+ },
+ **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
+ **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
+ **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
+ **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
}
- klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
- klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
- klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
- klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
+
klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
klass._COMMENTS = dict(
@@ -393,23 +385,17 @@ class _Tokenizer(type):
klass.KEYWORD_TRIE = new_trie(
key.upper()
- for key in {
- **klass.KEYWORDS,
- **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
- **{quote: TokenType.QUOTE for quote in klass._QUOTES},
- **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
- **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
- **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS},
- }
+ for key in (
+ *klass.KEYWORDS,
+ *klass._COMMENTS,
+ *klass._QUOTES,
+ *klass._FORMAT_STRINGS,
+ )
if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
)
return klass
- @staticmethod
- def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
- return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
-
class Tokenizer(metaclass=_Tokenizer):
SINGLE_TOKENS = {
@@ -450,6 +436,7 @@ class Tokenizer(metaclass=_Tokenizer):
BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
+ RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
IDENTIFIER_ESCAPES = ['"']
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
@@ -457,9 +444,7 @@ class Tokenizer(metaclass=_Tokenizer):
VAR_SINGLE_TOKENS: t.Set[str] = set()
_COMMENTS: t.Dict[str, str] = {}
- _BIT_STRINGS: t.Dict[str, str] = {}
- _BYTE_STRINGS: t.Dict[str, str] = {}
- _HEX_STRINGS: t.Dict[str, str] = {}
+ _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
_IDENTIFIERS: t.Dict[str, str] = {}
_IDENTIFIER_ESCAPES: t.Set[str] = set()
_QUOTES: t.Dict[str, str] = {}
@@ -495,30 +480,22 @@ class Tokenizer(metaclass=_Tokenizer):
"ANY": TokenType.ANY,
"ASC": TokenType.ASC,
"AS": TokenType.ALIAS,
- "AT TIME ZONE": TokenType.AT_TIME_ZONE,
"AUTOINCREMENT": TokenType.AUTO_INCREMENT,
"AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
"BEGIN": TokenType.BEGIN,
"BETWEEN": TokenType.BETWEEN,
- "BOTH": TokenType.BOTH,
- "BUCKET": TokenType.BUCKET,
- "BY DEFAULT": TokenType.BY_DEFAULT,
"CACHE": TokenType.CACHE,
"UNCACHE": TokenType.UNCACHE,
"CASE": TokenType.CASE,
- "CASCADE": TokenType.CASCADE,
"CHARACTER SET": TokenType.CHARACTER_SET,
- "CLUSTER BY": TokenType.CLUSTER_BY,
"COLLATE": TokenType.COLLATE,
"COLUMN": TokenType.COLUMN,
"COMMIT": TokenType.COMMIT,
- "COMPOUND": TokenType.COMPOUND,
"CONSTRAINT": TokenType.CONSTRAINT,
"CREATE": TokenType.CREATE,
"CROSS": TokenType.CROSS,
"CUBE": TokenType.CUBE,
"CURRENT_DATE": TokenType.CURRENT_DATE,
- "CURRENT ROW": TokenType.CURRENT_ROW,
"CURRENT_TIME": TokenType.CURRENT_TIME,
"CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
"CURRENT_USER": TokenType.CURRENT_USER,
@@ -528,8 +505,6 @@ class Tokenizer(metaclass=_Tokenizer):
"DESC": TokenType.DESC,
"DESCRIBE": TokenType.DESCRIBE,
"DISTINCT": TokenType.DISTINCT,
- "DISTINCT FROM": TokenType.DISTINCT_FROM,
- "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
"DIV": TokenType.DIV,
"DROP": TokenType.DROP,
"ELSE": TokenType.ELSE,
@@ -544,18 +519,18 @@ class Tokenizer(metaclass=_Tokenizer):
"FIRST": TokenType.FIRST,
"FULL": TokenType.FULL,
"FUNCTION": TokenType.FUNCTION,
- "FOLLOWING": TokenType.FOLLOWING,
"FOR": TokenType.FOR,
"FOREIGN KEY": TokenType.FOREIGN_KEY,
"FORMAT": TokenType.FORMAT,
"FROM": TokenType.FROM,
+ "GEOGRAPHY": TokenType.GEOGRAPHY,
+ "GEOMETRY": TokenType.GEOMETRY,
"GLOB": TokenType.GLOB,
"GROUP BY": TokenType.GROUP_BY,
"GROUPING SETS": TokenType.GROUPING_SETS,
"HAVING": TokenType.HAVING,
"IF": TokenType.IF,
"ILIKE": TokenType.ILIKE,
- "IGNORE NULLS": TokenType.IGNORE_NULLS,
"IN": TokenType.IN,
"INDEX": TokenType.INDEX,
"INET": TokenType.INET,
@@ -569,34 +544,25 @@ class Tokenizer(metaclass=_Tokenizer):
"JOIN": TokenType.JOIN,
"KEEP": TokenType.KEEP,
"LATERAL": TokenType.LATERAL,
- "LAZY": TokenType.LAZY,
- "LEADING": TokenType.LEADING,
"LEFT": TokenType.LEFT,
"LIKE": TokenType.LIKE,
"LIMIT": TokenType.LIMIT,
- "LOAD DATA": TokenType.LOAD_DATA,
- "LOCAL": TokenType.LOCAL,
- "MATERIALIZED": TokenType.MATERIALIZED,
+ "LOAD": TokenType.LOAD,
+ "LOCK": TokenType.LOCK,
"MERGE": TokenType.MERGE,
"NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
- "NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
"NOTNULL": TokenType.NOTNULL,
"NULL": TokenType.NULL,
- "NULLS FIRST": TokenType.NULLS_FIRST,
- "NULLS LAST": TokenType.NULLS_LAST,
"OBJECT": TokenType.OBJECT,
"OFFSET": TokenType.OFFSET,
"ON": TokenType.ON,
- "ONLY": TokenType.ONLY,
- "OPTIONS": TokenType.OPTIONS,
"OR": TokenType.OR,
"ORDER BY": TokenType.ORDER_BY,
"ORDINALITY": TokenType.ORDINALITY,
"OUTER": TokenType.OUTER,
- "OUT OF": TokenType.OUT_OF,
"OVER": TokenType.OVER,
"OVERLAPS": TokenType.OVERLAPS,
"OVERWRITE": TokenType.OVERWRITE,
@@ -607,7 +573,6 @@ class Tokenizer(metaclass=_Tokenizer):
"PERCENT": TokenType.PERCENT,
"PIVOT": TokenType.PIVOT,
"PRAGMA": TokenType.PRAGMA,
- "PRECEDING": TokenType.PRECEDING,
"PRIMARY KEY": TokenType.PRIMARY_KEY,
"PROCEDURE": TokenType.PROCEDURE,
"QUALIFY": TokenType.QUALIFY,
@@ -615,7 +580,6 @@ class Tokenizer(metaclass=_Tokenizer):
"RECURSIVE": TokenType.RECURSIVE,
"REGEXP": TokenType.RLIKE,
"REPLACE": TokenType.REPLACE,
- "RESPECT NULLS": TokenType.RESPECT_NULLS,
"REFERENCES": TokenType.REFERENCES,
"RIGHT": TokenType.RIGHT,
"RLIKE": TokenType.RLIKE,
@@ -624,25 +588,20 @@ class Tokenizer(metaclass=_Tokenizer):
"ROW": TokenType.ROW,
"ROWS": TokenType.ROWS,
"SCHEMA": TokenType.SCHEMA,
- "SEED": TokenType.SEED,
"SELECT": TokenType.SELECT,
"SEMI": TokenType.SEMI,
"SET": TokenType.SET,
+ "SETTINGS": TokenType.SETTINGS,
"SHOW": TokenType.SHOW,
"SIMILAR TO": TokenType.SIMILAR_TO,
"SOME": TokenType.SOME,
- "SORTKEY": TokenType.SORTKEY,
- "SORT BY": TokenType.SORT_BY,
"TABLE": TokenType.TABLE,
"TABLESAMPLE": TokenType.TABLE_SAMPLE,
"TEMP": TokenType.TEMPORARY,
"TEMPORARY": TokenType.TEMPORARY,
"THEN": TokenType.THEN,
"TRUE": TokenType.TRUE,
- "TRAILING": TokenType.TRAILING,
- "UNBOUNDED": TokenType.UNBOUNDED,
"UNION": TokenType.UNION,
- "UNLOGGED": TokenType.UNLOGGED,
"UNNEST": TokenType.UNNEST,
"UNPIVOT": TokenType.UNPIVOT,
"UPDATE": TokenType.UPDATE,
@@ -656,10 +615,6 @@ class Tokenizer(metaclass=_Tokenizer):
"WHERE": TokenType.WHERE,
"WINDOW": TokenType.WINDOW,
"WITH": TokenType.WITH,
- "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
- "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
- "WITHIN GROUP": TokenType.WITHIN_GROUP,
- "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
"APPLY": TokenType.APPLY,
"ARRAY": TokenType.ARRAY,
"BIT": TokenType.BIT,
@@ -718,15 +673,6 @@ class Tokenizer(metaclass=_Tokenizer):
"STRUCT": TokenType.STRUCT,
"VARIANT": TokenType.VARIANT,
"ALTER": TokenType.ALTER,
- "ALTER AGGREGATE": TokenType.COMMAND,
- "ALTER DEFAULT": TokenType.COMMAND,
- "ALTER DOMAIN": TokenType.COMMAND,
- "ALTER ROLE": TokenType.COMMAND,
- "ALTER RULE": TokenType.COMMAND,
- "ALTER SEQUENCE": TokenType.COMMAND,
- "ALTER TYPE": TokenType.COMMAND,
- "ALTER USER": TokenType.COMMAND,
- "ALTER VIEW": TokenType.COMMAND,
"ANALYZE": TokenType.COMMAND,
"CALL": TokenType.COMMAND,
"COMMENT": TokenType.COMMENT,
@@ -790,7 +736,7 @@ class Tokenizer(metaclass=_Tokenizer):
self._start = 0
self._current = 0
self._line = 1
- self._col = 1
+ self._col = 0
self._comments: t.List[str] = []
self._char = ""
@@ -803,13 +749,12 @@ class Tokenizer(metaclass=_Tokenizer):
self.reset()
self.sql = sql
self.size = len(sql)
+
try:
self._scan()
except Exception as e:
- start = self._current - 50
- end = self._current + 50
- start = start if start > 0 else 0
- end = end if end < self.size else self.size - 1
+ start = max(self._current - 50, 0)
+ end = min(self._current + 50, self.size - 1)
context = self.sql[start:end]
raise ValueError(f"Error tokenizing '{context}'") from e
@@ -834,17 +779,17 @@ class Tokenizer(metaclass=_Tokenizer):
if until and until():
break
- if self.tokens:
+ if self.tokens and self._comments:
self.tokens[-1].comments.extend(self._comments)
def _chars(self, size: int) -> str:
if size == 1:
return self._char
+
start = self._current - 1
end = start + size
- if end <= self.size:
- return self.sql[start:end]
- return ""
+
+ return self.sql[start:end] if end <= self.size else ""
def _advance(self, i: int = 1, alnum: bool = False) -> None:
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
@@ -859,6 +804,7 @@ class Tokenizer(metaclass=_Tokenizer):
self._peek = "" if self._end else self.sql[self._current]
if alnum and self._char.isalnum():
+ # Here we use local variables instead of attributes for better performance
_col = self._col
_current = self._current
_end = self._end
@@ -885,11 +831,12 @@ class Tokenizer(metaclass=_Tokenizer):
self.tokens.append(
Token(
token_type,
- self._text if text is None else text,
- self._line,
- self._col,
- self._current,
- self._comments,
+ text=self._text if text is None else text,
+ line=self._line,
+ col=self._col,
+ start=self._start,
+ end=self._current - 1,
+ comments=self._comments,
)
)
self._comments = []
@@ -929,6 +876,7 @@ class Tokenizer(metaclass=_Tokenizer):
break
if result == 2:
word = chars
+
size += 1
end = self._current - 1 + size
@@ -946,6 +894,7 @@ class Tokenizer(metaclass=_Tokenizer):
else:
skip = True
else:
+ char = ""
chars = " "
word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
@@ -959,8 +908,6 @@ class Tokenizer(metaclass=_Tokenizer):
if self._scan_string(word):
return
- if self._scan_formatted_string(word):
- return
if self._scan_comment(word):
return
@@ -1004,9 +951,9 @@ class Tokenizer(metaclass=_Tokenizer):
if self._char == "0":
peek = self._peek.upper()
if peek == "B":
- return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
+ return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
elif peek == "X":
- return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
+ return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)
decimal = False
scientific = 0
@@ -1075,37 +1022,24 @@ class Tokenizer(metaclass=_Tokenizer):
return self._text
- def _scan_string(self, quote: str) -> bool:
- quote_end = self._QUOTES.get(quote)
- if quote_end is None:
- return False
+ def _scan_string(self, start: str) -> bool:
+ base = None
+ token_type = TokenType.STRING
- self._advance(len(quote))
- text = self._extract_string(quote_end)
- text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
- self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
- return True
+ if start in self._QUOTES:
+ end = self._QUOTES[start]
+ elif start in self._FORMAT_STRINGS:
+ end, token_type = self._FORMAT_STRINGS[start]
- # X'1234', b'0110', E'\\\\\' etc.
- def _scan_formatted_string(self, string_start: str) -> bool:
- if string_start in self._HEX_STRINGS:
- delimiters = self._HEX_STRINGS
- token_type = TokenType.HEX_STRING
- base = 16
- elif string_start in self._BIT_STRINGS:
- delimiters = self._BIT_STRINGS
- token_type = TokenType.BIT_STRING
- base = 2
- elif string_start in self._BYTE_STRINGS:
- delimiters = self._BYTE_STRINGS
- token_type = TokenType.BYTE_STRING
- base = None
+ if token_type == TokenType.HEX_STRING:
+ base = 16
+ elif token_type == TokenType.BIT_STRING:
+ base = 2
else:
return False
- self._advance(len(string_start))
- string_end = delimiters[string_start]
- text = self._extract_string(string_end)
+ self._advance(len(start))
+ text = self._extract_string(end)
if base:
try:
@@ -1114,6 +1048,8 @@ class Tokenizer(metaclass=_Tokenizer):
raise RuntimeError(
f"Numeric string contains invalid characters from {self._line}:{self._start}"
)
+ else:
+ text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
self._add(token_type, text)
return True
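
For reference, a minimal standalone sketch (not sqlglot's actual code) of the delimiter dispatch this diff consolidates into _FORMAT_STRINGS: plain quotes stay in _QUOTES, while n/N national strings, bit, byte, hex, and raw strings all collapse into one mapping of start delimiter to an (end delimiter, TokenType) pair, so _scan_string can handle every format in a single lookup instead of the removed _scan_formatted_string chain. The helper names below (build_format_strings, to_format) are hypothetical and only illustrate the mapping.

# Hypothetical sketch of the _FORMAT_STRINGS consolidation shown above; not sqlglot API.
from __future__ import annotations

import typing as t
from enum import Enum, auto


class TokenType(Enum):
    STRING = auto()
    NATIONAL_STRING = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    RAW_STRING = auto()


def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
    # A bare string means the start and end delimiters are identical.
    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)


def build_format_strings(
    quotes: t.List[str | t.Tuple[str, str]],
    bit_strings: t.List[str | t.Tuple[str, str]],
    byte_strings: t.List[str | t.Tuple[str, str]],
    hex_strings: t.List[str | t.Tuple[str, str]],
    raw_strings: t.List[str | t.Tuple[str, str]],
) -> t.Dict[str, t.Tuple[str, TokenType]]:
    def to_format(
        token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
    ) -> t.Dict[str, t.Tuple[str, TokenType]]:
        return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}

    return {
        # National strings reuse the plain quote delimiters with an n/N prefix.
        **{
            p + s: (e, TokenType.NATIONAL_STRING)
            for s, e in _convert_quotes(quotes).items()
            for p in ("n", "N")
        },
        **to_format(TokenType.BIT_STRING, bit_strings),
        **to_format(TokenType.BYTE_STRING, byte_strings),
        **to_format(TokenType.HEX_STRING, hex_strings),
        **to_format(TokenType.RAW_STRING, raw_strings),
    }


if __name__ == "__main__":
    # Roughly mirrors a dialect that defines b'...' bit strings and x'...' hex strings.
    format_strings = build_format_strings(
        quotes=["'"],
        bit_strings=[("b'", "'")],
        byte_strings=[],
        hex_strings=[("x'", "'")],
        raw_strings=[],
    )
    assert format_strings["N'"] == ("'", TokenType.NATIONAL_STRING)
    assert format_strings["x'"] == ("'", TokenType.HEX_STRING)
    print(format_strings)

Under this shape, a scanner only needs one dictionary lookup to learn both the closing delimiter and the token type to emit, which is what lets the diff fold _scan_formatted_string into _scan_string.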