author     Daniel Baumann <daniel.baumann@progress-linux.org>  2022-09-15 16:46:17 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2022-09-15 16:46:17 +0000
commit     28cc22419e32a65fea2d1678400265b8cabc3aff (patch)
tree       ff9ac1991fd48490b21ef6aa9015a347a165e2d9 /sqlglot/tokens.py
parent     Initial commit. (diff)
Adding upstream version 6.0.4. (tag: upstream/6.0.4)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglot/tokens.py')
-rw-r--r--  sqlglot/tokens.py | 853
1 file changed, 853 insertions(+), 0 deletions(-)
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
new file mode 100644
index 0000000..e4b754d
--- /dev/null
+++ b/sqlglot/tokens.py
@@ -0,0 +1,853 @@
+from enum import auto
+
+from sqlglot.helper import AutoName
+from sqlglot.trie import in_trie, new_trie
+
+
+class TokenType(AutoName):
+    L_PAREN = auto()
+    R_PAREN = auto()
+    L_BRACKET = auto()
+    R_BRACKET = auto()
+    L_BRACE = auto()
+    R_BRACE = auto()
+    COMMA = auto()
+    DOT = auto()
+    DASH = auto()
+    PLUS = auto()
+    COLON = auto()
+    DCOLON = auto()
+    SEMICOLON = auto()
+    STAR = auto()
+    SLASH = auto()
+    LT = auto()
+    LTE = auto()
+    GT = auto()
+    GTE = auto()
+    NOT = auto()
+    EQ = auto()
+    NEQ = auto()
+    AND = auto()
+    OR = auto()
+    AMP = auto()
+    DPIPE = auto()
+    PIPE = auto()
+    CARET = auto()
+    TILDA = auto()
+    ARROW = auto()
+    DARROW = auto()
+    HASH_ARROW = auto()
+    DHASH_ARROW = auto()
+    ANNOTATION = auto()
+    DOLLAR = auto()
+
+    SPACE = auto()
+    BREAK = auto()
+
+    STRING = auto()
+    NUMBER = auto()
+    IDENTIFIER = auto()
+    COLUMN = auto()
+    COLUMN_DEF = auto()
+    SCHEMA = auto()
+    TABLE = auto()
+    VAR = auto()
+    BIT_STRING = auto()
+
+    # types
+    BOOLEAN = auto()
+    TINYINT = auto()
+    SMALLINT = auto()
+    INT = auto()
+    BIGINT = auto()
+    FLOAT = auto()
+    DOUBLE = auto()
+    DECIMAL = auto()
+    CHAR = auto()
+    NCHAR = auto()
+    VARCHAR = auto()
+    NVARCHAR = auto()
+    TEXT = auto()
+    BINARY = auto()
+    BYTEA = auto()
+    JSON = auto()
+    TIMESTAMP = auto()
+    TIMESTAMPTZ = auto()
+    DATETIME = auto()
+    DATE = auto()
+    UUID = auto()
+    GEOGRAPHY = auto()
+    NULLABLE = auto()
+
+    # keywords
+    ADD_FILE = auto()
+    ALIAS = auto()
+    ALL = auto()
+    ALTER = auto()
+    ANALYZE = auto()
+    ANY = auto()
+    ARRAY = auto()
+    ASC = auto()
+    AT_TIME_ZONE = auto()
+    AUTO_INCREMENT = auto()
+    BEGIN = auto()
+    BETWEEN = auto()
+    BUCKET = auto()
+    CACHE = auto()
+    CALL = auto()
+    CASE = auto()
+    CAST = auto()
+    CHARACTER_SET = auto()
+    CHECK = auto()
+    CLUSTER_BY = auto()
+    COLLATE = auto()
+    COMMENT = auto()
+    COMMIT = auto()
+    CONSTRAINT = auto()
+    CONVERT = auto()
+    CREATE = auto()
+    CROSS = auto()
+    CUBE = auto()
+    CURRENT_DATE = auto()
+    CURRENT_DATETIME = auto()
+    CURRENT_ROW = auto()
+    CURRENT_TIME = auto()
+    CURRENT_TIMESTAMP = auto()
+    DIV = auto()
+    DEFAULT = auto()
+    DELETE = auto()
+    DESC = auto()
+    DISTINCT = auto()
+    DISTRIBUTE_BY = auto()
+    DROP = auto()
+    ELSE = auto()
+    END = auto()
+    ENGINE = auto()
+    ESCAPE = auto()
+    EXCEPT = auto()
+    EXISTS = auto()
+    EXPLAIN = auto()
+    EXTRACT = auto()
+    FALSE = auto()
+    FETCH = auto()
+    FILTER = auto()
+    FINAL = auto()
+    FIRST = auto()
+    FOLLOWING = auto()
+    FOREIGN_KEY = auto()
+    FORMAT = auto()
+    FULL = auto()
+    FUNCTION = auto()
+    FROM = auto()
+    GROUP_BY = auto()
+    GROUPING_SETS = auto()
+    HAVING = auto()
+    HINT = auto()
+    IF = auto()
+    IGNORE_NULLS = auto()
+    ILIKE = auto()
+    IN = auto()
+    INDEX = auto()
+    INNER = auto()
+    INSERT = auto()
+    INTERSECT = auto()
+    INTERVAL = auto()
+    INTO = auto()
+    INTRODUCER = auto()
+    IS = auto()
+    ISNULL = auto()
+    JOIN = auto()
+    LATERAL = auto()
+    LAZY = auto()
+    LEFT = auto()
+    LIKE = auto()
+    LIMIT = auto()
+    LOCATION = auto()
+    MAP = auto()
+    MOD = auto()
+    NEXT = auto()
+    NO_ACTION = auto()
+    NULL = auto()
+    NULLS_FIRST = auto()
+    NULLS_LAST = auto()
+    OFFSET = auto()
+    ON = auto()
+    ONLY = auto()
+    OPTIMIZE = auto()
+    OPTIONS = auto()
+    ORDER_BY = auto()
+    ORDERED = auto()
+    ORDINALITY = auto()
+    OUTER = auto()
+    OUT_OF = auto()
+    OVER = auto()
+    OVERWRITE = auto()
+    PARTITION = auto()
+    PARTITION_BY = auto()
+    PARTITIONED_BY = auto()
+    PERCENT = auto()
+    PLACEHOLDER = auto()
+    PRECEDING = auto()
+    PRIMARY_KEY = auto()
+    PROPERTIES = auto()
+    QUALIFY = auto()
+    QUOTE = auto()
+    RANGE = auto()
+    RECURSIVE = auto()
+    REPLACE = auto()
+    RESPECT_NULLS = auto()
+    REFERENCES = auto()
+    RIGHT = auto()
+    RLIKE = auto()
+    ROLLUP = auto()
+    ROW = auto()
+    ROWS = auto()
+    SCHEMA_COMMENT = auto()
+    SELECT = auto()
+    SET = auto()
+    SHOW = auto()
+    SOME = auto()
+    SORT_BY = auto()
+    STORED = auto()
+    STRUCT = auto()
+    TABLE_FORMAT = auto()
+    TABLE_SAMPLE = auto()
+    TEMPORARY = auto()
+    TIME = auto()
+    TOP = auto()
+    THEN = auto()
+    TRUE = auto()
+    TRUNCATE = auto()
+    TRY_CAST = auto()
+    UNBOUNDED = auto()
+    UNCACHE = auto()
+    UNION = auto()
+    UNNEST = auto()
+    UPDATE = auto()
+    USE = auto()
+    USING = auto()
+    VALUES = auto()
+    VIEW = auto()
+    WHEN = auto()
+    WHERE = auto()
+    WINDOW = auto()
+    WITH = auto()
+    WITH_TIME_ZONE = auto()
+    WITHIN_GROUP = auto()
+    WITHOUT_TIME_ZONE = auto()
+    UNIQUE = auto()
+
+
+class Token:
+    __slots__ = ("token_type", "text", "line", "col")
+
+    @classmethod
+    def number(cls, number):
+        return cls(TokenType.NUMBER, str(number))
+
+    @classmethod
+    def string(cls, string):
+        return cls(TokenType.STRING, string)
+
+    @classmethod
+    def identifier(cls, identifier):
+        return cls(TokenType.IDENTIFIER, identifier)
+
+    @classmethod
+    def var(cls, var):
+        return cls(TokenType.VAR, var)
+
+    def __init__(self, token_type, text, line=1, col=1):
+        self.token_type = token_type
+        self.text = text
+        self.line = line
+        self.col = max(col - len(text), 1)
+
+    def __repr__(self):
+        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
+        return f"<Token {attributes}>"
+
+
+class _Tokenizer(type):
+    def __new__(cls, clsname, bases, attrs):
+        klass = super().__new__(cls, clsname, bases, attrs)
+
+        klass.QUOTES = dict(
+            (quote, quote) if isinstance(quote, str) else (quote[0], quote[1])
+            for quote in klass.QUOTES
+        )
+
+        klass.IDENTIFIERS = dict(
+            (identifier, identifier)
+            if isinstance(identifier, str)
+            else (identifier[0], identifier[1])
+            for identifier in klass.IDENTIFIERS
+        )
+
+        klass.COMMENTS = dict(
+            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
+            for comment in klass.COMMENTS
+        )
+
+        klass.KEYWORD_TRIE = new_trie(
+            key.upper()
+            for key, value in {
+                **klass.KEYWORDS,
+                **{comment: TokenType.COMMENT for comment in klass.COMMENTS},
+                **{quote: TokenType.QUOTE for quote in klass.QUOTES},
+            }.items()
+            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
+        )
+
+        return klass
+
+
+class Tokenizer(metaclass=_Tokenizer):
+    SINGLE_TOKENS = {
+        "(": TokenType.L_PAREN,
+        ")": TokenType.R_PAREN,
+        "[": TokenType.L_BRACKET,
+        "]": TokenType.R_BRACKET,
+        "{": TokenType.L_BRACE,
+        "}": TokenType.R_BRACE,
+        "&": TokenType.AMP,
+        "^": TokenType.CARET,
+        ":": TokenType.COLON,
+        ",": TokenType.COMMA,
+        ".": TokenType.DOT,
+        "-": TokenType.DASH,
+        "=": TokenType.EQ,
+        ">": TokenType.GT,
+        "<": TokenType.LT,
+        "%": TokenType.MOD,
+        "!": TokenType.NOT,
+        "|": TokenType.PIPE,
+        "+": TokenType.PLUS,
+        ";": TokenType.SEMICOLON,
+        "/": TokenType.SLASH,
+        "*": TokenType.STAR,
+        "~": TokenType.TILDA,
+        "?": TokenType.PLACEHOLDER,
+        "#": TokenType.ANNOTATION,
+        "$": TokenType.DOLLAR,
+        # used for breaking a var like x'y' but nothing else
+        # the token type doesn't matter
+        "'": TokenType.QUOTE,
+        "`": TokenType.IDENTIFIER,
+        '"': TokenType.IDENTIFIER,
+    }
+
+    QUOTES = ["'"]
+
+    IDENTIFIERS = ['"']
+
+    ESCAPE = "'"
+
+    KEYWORDS = {
+        "/*+": TokenType.HINT,
+        "*/": TokenType.HINT,
+        "==": TokenType.EQ,
+        "::": TokenType.DCOLON,
+        "||": TokenType.DPIPE,
+        ">=": TokenType.GTE,
+        "<=": TokenType.LTE,
+        "<>": TokenType.NEQ,
+        "!=": TokenType.NEQ,
+        "->": TokenType.ARROW,
+        "->>": TokenType.DARROW,
+        "#>": TokenType.HASH_ARROW,
+        "#>>": TokenType.DHASH_ARROW,
+        "ADD ARCHIVE": TokenType.ADD_FILE,
+        "ADD ARCHIVES": TokenType.ADD_FILE,
+        "ADD FILE": TokenType.ADD_FILE,
+        "ADD FILES": TokenType.ADD_FILE,
+        "ADD JAR": TokenType.ADD_FILE,
+        "ADD JARS": TokenType.ADD_FILE,
+        "ALL": TokenType.ALL,
+        "ALTER": TokenType.ALTER,
+        "ANALYZE": TokenType.ANALYZE,
+        "AND": TokenType.AND,
+        "ANY": TokenType.ANY,
+        "ASC": TokenType.ASC,
+        "AS": TokenType.ALIAS,
+        "AT TIME ZONE": TokenType.AT_TIME_ZONE,
+        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
+        "BEGIN": TokenType.BEGIN,
+        "BETWEEN": TokenType.BETWEEN,
+        "BUCKET": TokenType.BUCKET,
+        "CALL": TokenType.CALL,
+        "CACHE": TokenType.CACHE,
+        "UNCACHE": TokenType.UNCACHE,
+        "CASE": TokenType.CASE,
+        "CAST": TokenType.CAST,
+        "CHARACTER SET": TokenType.CHARACTER_SET,
+        "CHECK": TokenType.CHECK,
+        "CLUSTER BY": TokenType.CLUSTER_BY,
+        "COLLATE": TokenType.COLLATE,
+        "COMMENT": TokenType.SCHEMA_COMMENT,
+        "COMMIT": TokenType.COMMIT,
+        "CONSTRAINT": TokenType.CONSTRAINT,
+        "CONVERT": TokenType.CONVERT,
+        "CREATE": TokenType.CREATE,
+        "CROSS": TokenType.CROSS,
+        "CUBE": TokenType.CUBE,
+        "CURRENT_DATE": TokenType.CURRENT_DATE,
+        "CURRENT ROW": TokenType.CURRENT_ROW,
+        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
+        "DIV": TokenType.DIV,
+        "DEFAULT": TokenType.DEFAULT,
+        "DELETE": TokenType.DELETE,
+        "DESC": TokenType.DESC,
+        "DISTINCT": TokenType.DISTINCT,
+        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
+        "DROP": TokenType.DROP,
+        "ELSE": TokenType.ELSE,
+        "END": TokenType.END,
+        "ENGINE": TokenType.ENGINE,
+        "ESCAPE": TokenType.ESCAPE,
+        "EXCEPT": TokenType.EXCEPT,
+        "EXISTS": TokenType.EXISTS,
+        "EXPLAIN": TokenType.EXPLAIN,
+        "EXTRACT": TokenType.EXTRACT,
+        "FALSE": TokenType.FALSE,
+        "FETCH": TokenType.FETCH,
+        "FILTER": TokenType.FILTER,
+        "FIRST": TokenType.FIRST,
+        "FULL": TokenType.FULL,
+        "FUNCTION": TokenType.FUNCTION,
+        "FOLLOWING": TokenType.FOLLOWING,
+        "FOREIGN KEY": TokenType.FOREIGN_KEY,
+        "FORMAT": TokenType.FORMAT,
+        "FROM": TokenType.FROM,
+        "GROUP BY": TokenType.GROUP_BY,
+        "GROUPING SETS": TokenType.GROUPING_SETS,
+        "HAVING": TokenType.HAVING,
+        "IF": TokenType.IF,
+        "ILIKE": TokenType.ILIKE,
+        "IGNORE NULLS": TokenType.IGNORE_NULLS,
+        "IN": TokenType.IN,
+        "INDEX": TokenType.INDEX,
+        "INNER": TokenType.INNER,
+        "INSERT": TokenType.INSERT,
+        "INTERVAL": TokenType.INTERVAL,
+        "INTERSECT": TokenType.INTERSECT,
+        "INTO": TokenType.INTO,
+        "IS": TokenType.IS,
+        "ISNULL": TokenType.ISNULL,
+        "JOIN": TokenType.JOIN,
+        "LATERAL": TokenType.LATERAL,
+        "LAZY": TokenType.LAZY,
+        "LEFT": TokenType.LEFT,
+        "LIKE": TokenType.LIKE,
+        "LIMIT": TokenType.LIMIT,
+        "LOCATION": TokenType.LOCATION,
+        "NEXT": TokenType.NEXT,
+        "NO ACTION": TokenType.NO_ACTION,
+        "NOT": TokenType.NOT,
+        "NULL": TokenType.NULL,
+        "NULLS FIRST": TokenType.NULLS_FIRST,
+        "NULLS LAST": TokenType.NULLS_LAST,
+        "OFFSET": TokenType.OFFSET,
+        "ON": TokenType.ON,
+        "ONLY": TokenType.ONLY,
+        "OPTIMIZE": TokenType.OPTIMIZE,
+        "OPTIONS": TokenType.OPTIONS,
+        "OR": TokenType.OR,
+        "ORDER BY": TokenType.ORDER_BY,
+        "ORDINALITY": TokenType.ORDINALITY,
+        "OUTER": TokenType.OUTER,
+        "OUT OF": TokenType.OUT_OF,
+        "OVER": TokenType.OVER,
+        "OVERWRITE": TokenType.OVERWRITE,
+        "PARTITION": TokenType.PARTITION,
+        "PARTITION BY": TokenType.PARTITION_BY,
+        "PARTITIONED BY": TokenType.PARTITIONED_BY,
+        "PERCENT": TokenType.PERCENT,
+        "PRECEDING": TokenType.PRECEDING,
+        "PRIMARY KEY": TokenType.PRIMARY_KEY,
+        "RANGE": TokenType.RANGE,
+        "RECURSIVE": TokenType.RECURSIVE,
+        "REGEXP": TokenType.RLIKE,
+        "REPLACE": TokenType.REPLACE,
+        "RESPECT NULLS": TokenType.RESPECT_NULLS,
+        "REFERENCES": TokenType.REFERENCES,
+        "RIGHT": TokenType.RIGHT,
+        "RLIKE": TokenType.RLIKE,
+        "ROLLUP": TokenType.ROLLUP,
+        "ROW": TokenType.ROW,
+        "ROWS": TokenType.ROWS,
+        "SELECT": TokenType.SELECT,
+        "SET": TokenType.SET,
+        "SHOW": TokenType.SHOW,
+        "SOME": TokenType.SOME,
+        "SORT BY": TokenType.SORT_BY,
+        "STORED": TokenType.STORED,
+        "TABLE": TokenType.TABLE,
+        "TABLE_FORMAT": TokenType.TABLE_FORMAT,
+        "TBLPROPERTIES": TokenType.PROPERTIES,
+        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
+        "TEMP": TokenType.TEMPORARY,
+        "TEMPORARY": TokenType.TEMPORARY,
+        "THEN": TokenType.THEN,
+        "TRUE": TokenType.TRUE,
+        "TRUNCATE": TokenType.TRUNCATE,
+        "TRY_CAST": TokenType.TRY_CAST,
+        "UNBOUNDED": TokenType.UNBOUNDED,
+        "UNION": TokenType.UNION,
+        "UNNEST": TokenType.UNNEST,
+        "UPDATE": TokenType.UPDATE,
+        "USE": TokenType.USE,
+        "USING": TokenType.USING,
+        "VALUES": TokenType.VALUES,
+        "VIEW": TokenType.VIEW,
+        "WHEN": TokenType.WHEN,
+        "WHERE": TokenType.WHERE,
+        "WITH": TokenType.WITH,
+        "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
+        "WITHIN GROUP": TokenType.WITHIN_GROUP,
+        "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
+        "ARRAY": TokenType.ARRAY,
+        "BOOL": TokenType.BOOLEAN,
+        "BOOLEAN": TokenType.BOOLEAN,
+        "BYTE": TokenType.TINYINT,
+        "TINYINT": TokenType.TINYINT,
+        "SHORT": TokenType.SMALLINT,
+        "SMALLINT": TokenType.SMALLINT,
+        "INT2": TokenType.SMALLINT,
+        "INTEGER": TokenType.INT,
+        "INT": TokenType.INT,
+        "INT4": TokenType.INT,
+        "LONG": TokenType.BIGINT,
+        "BIGINT": TokenType.BIGINT,
+        "INT8": TokenType.BIGINT,
+        "DECIMAL": TokenType.DECIMAL,
+        "MAP": TokenType.MAP,
+        "NUMBER": TokenType.DECIMAL,
+        "NUMERIC": TokenType.DECIMAL,
+        "FIXED": TokenType.DECIMAL,
+        "REAL": TokenType.FLOAT,
+        "FLOAT": TokenType.FLOAT,
+        "FLOAT4": TokenType.FLOAT,
+        "FLOAT8": TokenType.DOUBLE,
+        "DOUBLE": TokenType.DOUBLE,
+        "JSON": TokenType.JSON,
+        "CHAR": TokenType.CHAR,
+        "NCHAR": TokenType.NCHAR,
+        "VARCHAR": TokenType.VARCHAR,
+        "VARCHAR2": TokenType.VARCHAR,
+        "NVARCHAR": TokenType.NVARCHAR,
+        "NVARCHAR2": TokenType.NVARCHAR,
+        "STRING": TokenType.TEXT,
+        "TEXT": TokenType.TEXT,
+        "CLOB": TokenType.TEXT,
+        "BINARY": TokenType.BINARY,
+        "BLOB": TokenType.BINARY,
+        "BYTEA": TokenType.BINARY,
+        "TIMESTAMP": TokenType.TIMESTAMP,
+        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
+        "DATE": TokenType.DATE,
+        "DATETIME": TokenType.DATETIME,
+        "UNIQUE": TokenType.UNIQUE,
+        "STRUCT": TokenType.STRUCT,
+    }
+
+    WHITE_SPACE = {
+        " ": TokenType.SPACE,
+        "\t": TokenType.SPACE,
+        "\n": TokenType.BREAK,
+        "\r": TokenType.BREAK,
+        "\r\n": TokenType.BREAK,
+    }
+
+    COMMANDS = {
+        TokenType.ALTER,
+        TokenType.ADD_FILE,
+        TokenType.ANALYZE,
+        TokenType.BEGIN,
+        TokenType.CALL,
+        TokenType.COMMIT,
+        TokenType.EXPLAIN,
+        TokenType.OPTIMIZE,
+        TokenType.SET,
+        TokenType.SHOW,
+        TokenType.TRUNCATE,
+        TokenType.USE,
+    }
+
+    # handle numeric literals like in hive (3L = BIGINT)
+    NUMERIC_LITERALS = {}
+    ENCODE = None
+
+    COMMENTS = ["--", ("/*", "*/")]
+    KEYWORD_TRIE = None  # autofilled
+
+    __slots__ = (
+        "sql",
+        "size",
+        "tokens",
+        "_start",
+        "_current",
+        "_line",
+        "_col",
+        "_char",
+        "_end",
+        "_peek",
+    )
+
+    def __init__(self):
+        """
+        Tokenizer consumes a sql string and produces an array of :class:`~sqlglot.tokens.Token`
+        """
+        self.reset()
+
+    def reset(self):
+        self.sql = ""
+        self.size = 0
+        self.tokens = []
+        self._start = 0
+        self._current = 0
+        self._line = 1
+        self._col = 1
+
+        self._char = None
+        self._end = None
+        self._peek = None
+
+    def tokenize(self, sql):
+        self.reset()
+        self.sql = sql
+        self.size = len(sql)
+
+        while self.size and not self._end:
+            self._start = self._current
+            self._advance()
+
+            if not self._char:
+                break
+
+            white_space = self.WHITE_SPACE.get(self._char)
+            identifier_end = self.IDENTIFIERS.get(self._char)
+
+            if white_space:
+                if white_space == TokenType.BREAK:
+                    self._col = 1
+                    self._line += 1
+            elif self._char == "0" and self._peek == "x":
+                self._scan_hex()
+            elif self._char.isdigit():
+                self._scan_number()
+            elif identifier_end:
+                self._scan_identifier(identifier_end)
+            else:
+                self._scan_keywords()
+        return self.tokens
+
+    def _chars(self, size):
+        if size == 1:
+            return self._char
+        start = self._current - 1
+        end = start + size
+        if end <= self.size:
+            return self.sql[start:end]
+        return ""
+
+    def _advance(self, i=1):
+        self._col += i
+        self._current += i
+        self._end = self._current >= self.size
+        self._char = self.sql[self._current - 1]
+        self._peek = self.sql[self._current] if self._current < self.size else ""
+
+    @property
+    def _text(self):
+        return self.sql[self._start : self._current]
+
+    def _add(self, token_type, text=None):
+        text = self._text if text is None else text
+        self.tokens.append(Token(token_type, text, self._line, self._col))
+
+        if token_type in self.COMMANDS and (
+            len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
+        ):
+            self._start = self._current
+            while not self._end and self._peek != ";":
+                self._advance()
+            if self._start < self._current:
+                self._add(TokenType.STRING)
+
+    def _scan_keywords(self):
+        size = 0
+        word = None
+        chars = self._text
+        char = chars
+        prev_space = False
+        skip = False
+        trie = self.KEYWORD_TRIE
+
+        while chars:
+            if skip:
+                result = 1
+            else:
+                result, trie = in_trie(trie, char.upper())
+
+            if result == 0:
+                break
+            if result == 2:
+                word = chars
+            size += 1
+            end = self._current - 1 + size
+
+            if end < self.size:
+                char = self.sql[end]
+                is_space = char in self.WHITE_SPACE
+
+                if not is_space or not prev_space:
+                    if is_space:
+                        char = " "
+                    chars += char
+                    prev_space = is_space
+                    skip = False
+                else:
+                    skip = True
+            else:
+                chars = None
+
+        if not word:
+            if self._char in self.SINGLE_TOKENS:
+                token = self.SINGLE_TOKENS[self._char]
+                if token == TokenType.ANNOTATION:
+                    self._scan_annotation()
+                    return
+                self._add(token)
+                return
+            self._scan_var()
+            return
+
+        if self._scan_string(word):
+            return
+        if self._scan_comment(word):
+            return
+
+        self._advance(size - 1)
+        self._add(self.KEYWORDS[word.upper()])
+
+    def _scan_comment(self, comment_start):
+        if comment_start not in self.COMMENTS:
+            return False
+
+        comment_end = self.COMMENTS[comment_start]
+
+        if comment_end:
+            comment_end_size = len(comment_end)
+
+            while not self._end and self._chars(comment_end_size) != comment_end:
+                self._advance()
+            self._advance(comment_end_size - 1)
+        else:
+            while not self._end and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK:
+                self._advance()
+        return True
+
+    def _scan_annotation(self):
+        while (
+            not self._end
+            and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK
+            and self._peek != ","
+        ):
+            self._advance()
+        self._add(TokenType.ANNOTATION, self._text[1:])
+
+    def _scan_number(self):
+        decimal = False
+        scientific = 0
+
+        while True:
+            if self._peek.isdigit():
+                self._advance()
+            elif self._peek == "." and not decimal:
+                decimal = True
+                self._advance()
+            elif self._peek in ("-", "+") and scientific == 1:
+                scientific += 1
+                self._advance()
+            elif self._peek.upper() == "E" and not scientific:
+                scientific += 1
+                self._advance()
+            elif self._peek.isalpha():
+                self._add(TokenType.NUMBER)
+                literal = []
+                while self._peek.isalpha():
+                    literal.append(self._peek.upper())
+                    self._advance()
+                literal = "".join(literal)
+                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
+                if token_type:
+                    self._add(TokenType.DCOLON, "::")
+                    return self._add(token_type, literal)
+                return self._advance(-len(literal))
+            else:
+                return self._add(TokenType.NUMBER)
+
+    def _scan_hex(self):
+        self._advance()
+
+        while True:
+            char = self._peek.strip()
+            if char and char not in self.SINGLE_TOKENS:
+                self._advance()
+            else:
+                break
+        try:
+            self._add(TokenType.BIT_STRING, f"{int(self._text, 16):b}")
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
+
+    def _scan_string(self, quote):
+        quote_end = self.QUOTES.get(quote)
+        if quote_end is None:
+            return False
+
+        text = ""
+        self._advance(len(quote))
+        quote_end_size = len(quote_end)
+
+        while True:
+            if self._char == self.ESCAPE and self._peek == quote_end:
+                text += quote
+                self._advance(2)
+            else:
+                if self._chars(quote_end_size) == quote_end:
+                    if quote_end_size > 1:
+                        self._advance(quote_end_size - 1)
+                    break
+
+                if self._end:
+                    raise RuntimeError(
+                        f"Missing {quote} from {self._line}:{self._start}"
+                    )
+                text += self._char
+                self._advance()
+
+        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
+        text = text.replace("\\\\", "\\") if self.ESCAPE == "\\" else text
+        self._add(TokenType.STRING, text)
+        return True
+
+    def _scan_identifier(self, identifier_end):
+        while self._peek != identifier_end:
+            if self._end:
+                raise RuntimeError(
+                    f"Missing {identifier_end} from {self._line}:{self._start}"
+                )
+            self._advance()
+        self._advance()
+        self._add(TokenType.IDENTIFIER, self._text[1:-1])
+
+    def _scan_var(self):
+        while True:
+            char = self._peek.strip()
+            if char and char not in self.SINGLE_TOKENS:
+                self._advance()
+            else:
+                break
+        self._add(self.KEYWORDS.get(self._text.upper(), TokenType.VAR))
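For orientation, a minimal usage sketch of the tokenizer added above (the SQL string is an arbitrary example, not from the commit; this assumes sqlglot 6.0.4 as imported here):

    from sqlglot.tokens import Tokenizer, TokenType

    # tokenize() returns a list of Token objects with token_type, text, line, col
    for token in Tokenizer().tokenize("SELECT a + 1 AS b FROM t"):
        print(token)

    # Multi-word keywords are matched through KEYWORD_TRIE, so "GROUP BY"
    # comes back as a single GROUP_BY token rather than two VAR tokens.
    assert Tokenizer().tokenize("GROUP BY")[0].token_type == TokenType.GROUP_BY

Dialect-specific tokenizers subclass Tokenizer and override class attributes such as QUOTES, IDENTIFIERS, and NUMERIC_LITERALS; the _Tokenizer metaclass then normalizes those settings and rebuilds the keyword trie per subclass.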