sqlglot.tokens
from __future__ import annotations

import typing as t
from enum import auto

from sqlglot.helper import AutoName
from sqlglot.trie import in_trie, new_trie


class TokenType(AutoName):
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DCOLON = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE = auto()
    CARET = auto()
    TILDA = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION_PARAMETER = auto()
    NATIONAL = auto()
    DAMP = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    FLOAT = auto()
    DOUBLE = auto()
    DECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    DATETIME = auto()
    DATE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    UNIQUEIDENTIFIER = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALWAYS = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    AT_TIME_ZONE = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BOTH = auto()
    BUCKET = auto()
    BY_DEFAULT = auto()
    CACHE = auto()
    CASCADE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    COMPOUND = auto()
    CONSTRAINT = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_ROW = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DISTINCT = auto()
    DISTINCT_FROM = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOLLOWING = auto()
    FOR = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GLOB = auto()
    GLOBAL = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IF = auto()
    IGNORE_NULLS = auto()
    ILIKE = auto()
    ILIKE_ANY = auto()
    IN = auto()
    INDEX = auto()
    INNER = auto()
    INSERT = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LAZY = auto()
    LEADING = auto()
    LEFT = auto()
    LIKE = auto()
    LIKE_ANY = auto()
    LIMIT = auto()
    LOAD_DATA = auto()
    LOCAL = auto()
    MAP = auto()
    MATCH_RECOGNIZE = auto()
    MATERIALIZED = auto()
    MERGE = auto()
    MOD = auto()
    NATURAL = auto()
    NEXT = auto()
    NO_ACTION = auto()
    NOTNULL = auto()
    NULL = auto()
    NULLS_FIRST = auto()
    NULLS_LAST = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPTIONS = auto()
    ORDER_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUTER = auto()
    OUT_OF = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    PRAGMA = auto()
    PRECEDING = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    QUALIFY = auto()
    QUOTE = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REPLACE = auto()
    RESPECT_NULLS = auto()
    RETURNING = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SEED = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORTKEY = auto()
    SORT_BY = auto()
    STRUCT = auto()
    TABLE_SAMPLE = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRAILING = auto()
    TRUE = auto()
    UNBOUNDED = auto()
    UNCACHE = auto()
    UNION = auto()
    UNLOGGED = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    WITH_TIME_ZONE = auto()
    WITH_LOCAL_TIME_ZONE = auto()
    WITHIN_GROUP = auto()
    WITHOUT_TIME_ZONE = auto()
    UNIQUE = auto()


class Token:
    __slots__ = ("token_type", "text", "line", "col", "end", "comments")

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        end: int = 0,
        comments: t.List[str] = [],
    ) -> None:
        self.token_type = token_type
        self.text = text
        self.line = line
        size = len(text)
        self.col = col
        self.end = end if end else size
        self.comments = comments

    @property
    def start(self) -> int:
        """Returns the start of the token."""
        return self.end - len(self.text)

    def __repr__(self) -> str:
        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
        return f"<Token {attributes}>"


class _Tokenizer(type):
    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        klass._QUOTES = {
            f"{prefix}{s}": e
            for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
            for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
        }
        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
        klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
        klass._COMMENTS = dict(
            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
            for comment in klass.COMMENTS
        )

        klass.KEYWORD_TRIE = new_trie(
            key.upper()
            for key in {
                **klass.KEYWORDS,
                **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
                **{quote: TokenType.QUOTE for quote in klass._QUOTES},
                **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
                **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
                **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS},
            }
            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )

        return klass

    @staticmethod
    def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)


class Tokenizer(metaclass=_Tokenizer):
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDA,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        # used for breaking a var like x'y' but nothing else
        # the token type doesn't matter
        "'": TokenType.QUOTE,
        "`": TokenType.IDENTIFIER,
        '"': TokenType.IDENTIFIER,
        "#": TokenType.HASH,
    }

    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    IDENTIFIER_ESCAPES = ['"']
    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    STRING_ESCAPES = ["'"]
    VAR_SINGLE_TOKENS: t.Set[str] = set()

    _COMMENTS: t.Dict[str, str] = {}
    _BIT_STRINGS: t.Dict[str, str] = {}
    _BYTE_STRINGS: t.Dict[str, str] = {}
    _HEX_STRINGS: t.Dict[str, str] = {}
    _IDENTIFIERS: t.Dict[str, str] = {}
    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    _QUOTES: t.Dict[str, str] = {}
    _STRING_ESCAPES: t.Set[str] = set()

    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        "{{+": TokenType.BLOCK_START,
        "{{-": TokenType.BLOCK_START,
        "+}}": TokenType.BLOCK_END,
        "-}}": TokenType.BLOCK_END,
        "/*+": TokenType.HINT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "||": TokenType.DPIPE,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "ALL": TokenType.ALL,
        "ALWAYS": TokenType.ALWAYS,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "AT TIME ZONE": TokenType.AT_TIME_ZONE,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "BOTH": TokenType.BOTH,
        "BUCKET": TokenType.BUCKET,
        "BY DEFAULT": TokenType.BY_DEFAULT,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CASCADE": TokenType.CASCADE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "COMPOUND": TokenType.COMPOUND,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT ROW": TokenType.CURRENT_ROW,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTINCT FROM": TokenType.DISTINCT_FROM,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOLLOWING": TokenType.FOLLOWING,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "IF": TokenType.IF,
        "ILIKE": TokenType.ILIKE,
        "IGNORE NULLS": TokenType.IGNORE_NULLS,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "LATERAL": TokenType.LATERAL,
        "LAZY": TokenType.LAZY,
        "LEADING": TokenType.LEADING,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD DATA": TokenType.LOAD_DATA,
        "LOCAL": TokenType.LOCAL,
        "MATERIALIZED": TokenType.MATERIALIZED,
        "MERGE": TokenType.MERGE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NO ACTION": TokenType.NO_ACTION,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "NULLS FIRST": TokenType.NULLS_FIRST,
        "NULLS LAST": TokenType.NULLS_LAST,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "ONLY": TokenType.ONLY,
        "OPTIONS": TokenType.OPTIONS,
        "OR": TokenType.OR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUTER": TokenType.OUTER,
        "OUT OF": TokenType.OUT_OF,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRECEDING": TokenType.PRECEDING,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "REPLACE": TokenType.REPLACE,
        "RESPECT NULLS": TokenType.RESPECT_NULLS,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SEED": TokenType.SEED,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SET": TokenType.SET,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORTKEY": TokenType.SORTKEY,
        "SORT BY": TokenType.SORT_BY,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRAILING": TokenType.TRAILING,
        "UNBOUNDED": TokenType.UNBOUNDED,
        "UNION": TokenType.UNION,
        "UNLOGGED": TokenType.UNLOGGED,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
        "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
        "WITHIN GROUP": TokenType.WITHIN_GROUP,
        "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.BIGINT,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "UNIQUE": TokenType.UNIQUE,
        "STRUCT": TokenType.STRUCT,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ALTER AGGREGATE": TokenType.COMMAND,
        "ALTER DEFAULT": TokenType.COMMAND,
        "ALTER DOMAIN": TokenType.COMMAND,
        "ALTER ROLE": TokenType.COMMAND,
        "ALTER RULE": TokenType.COMMAND,
        "ALTER SEQUENCE": TokenType.COMMAND,
        "ALTER TYPE": TokenType.COMMAND,
        "ALTER USER": TokenType.COMMAND,
        "ALTER VIEW": TokenType.COMMAND,
        "ANALYZE": TokenType.COMMAND,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "COPY": TokenType.COMMAND,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.COMMAND,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "TRUNCATE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
    }

    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
        " ": TokenType.SPACE,
        "\t": TokenType.SPACE,
        "\n": TokenType.BREAK,
        "\r": TokenType.BREAK,
        "\r\n": TokenType.BREAK,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.Dict[str, str] = {}
    ENCODE: t.Optional[str] = None

    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
    KEYWORD_TRIE: t.Dict = {}  # autofilled

    IDENTIFIER_CAN_START_WITH_DIGIT = False

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "_prev_token_comments",
        "_prev_token_type",
    )

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 1
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1
        self._prev_token_comments: t.List[str] = []
        self._prev_token_type: t.Optional[TokenType] = None

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)
        try:
            self._scan()
        except Exception as e:
            start = self._current - 50
            end = self._current + 50
            start = start if start > 0 else 0
            end = end if end < self.size else self.size - 1
            context = self.sql[start:end]
            raise ValueError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
        while self.size and not self._end:
            self._start = self._current
            self._advance()

            if self._char is None:
                break

            if self._char not in self.WHITE_SPACE:
                if self._char.isdigit():
                    self._scan_number()
                elif self._char in self._IDENTIFIERS:
                    self._scan_identifier(self._IDENTIFIERS[self._char])
                else:
                    self._scan_keywords()

            if until and until():
                break

        if self.tokens:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char
        start = self._current - 1
        end = start + size
        if end <= self.size:
            return self.sql[start:end]
        return ""

    def _advance(self, i: int = 1) -> None:
        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
            self._col = 1
            self._line += 1
        else:
            self._col += i

        self._current += i
        self._end = self._current >= self.size
        self._char = self.sql[self._current - 1]
        self._peek = "" if self._end else self.sql[self._current]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line
        self._prev_token_comments = self._comments
        self._prev_token_type = token_type
        self.tokens.append(
            Token(
                token_type,
                self._text if text is None else text,
                self._line,
                self._col,
                self._current,
                self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.COMMANDS
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(lambda: self._peek == ";")
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        size = 0
        word = None
        chars = self._text
        char = chars
        prev_space = False
        skip = False
        trie = self.KEYWORD_TRIE
        single_token = char in self.SINGLE_TOKENS

        while chars:
            if skip:
                result = 1
            else:
                result, trie = in_trie(trie, char.upper())

            if result == 0:
                break
            if result == 2:
                word = chars

            size += 1
            end = self._current - 1 + size

            if end < self.size:
                char = self.sql[end]
                single_token = single_token or char in self.SINGLE_TOKENS
                is_space = char in self.WHITE_SPACE

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                chars = " "

        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word

        if not word:
            if self._char in self.SINGLE_TOKENS:
                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
                return
            self._scan_var()
            return

        if self._scan_string(word):
            return
        if self._scan_formatted_string(word):
            return
        if self._scan_comment(word):
            return

        self._advance(size - 1)
        word = word.upper()
        self._add(self.KEYWORDS[word], text=word)

    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self._COMMENTS:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_end_size = len(comment_end)
            while not self._end and self._chars(comment_end_size) != comment_end:
                self._advance()

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
                self._advance()
            self._comments.append(self._text[comment_start_size:])

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits()
            elif peek == "X":
                return self._scan_hex()

        decimal = False
        scientific = 0

        while True:
            if self._peek.isdigit():
                self._advance()
            elif self._peek == "." and not decimal:
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                scientific += 1
                self._advance()
            elif self._peek.upper() == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
                    literal += self._peek.upper()
                    self._advance()

                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
                    return self._add(TokenType.VAR)

                self._add(TokenType.NUMBER, number_text)
                return self._advance(-len(literal))
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance()
            else:
                break

        return self._text

    def _scan_string(self, quote: str) -> bool:
        quote_end = self._QUOTES.get(quote)
        if quote_end is None:
            return False

        self._advance(len(quote))
        text = self._extract_string(quote_end)
        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
        return True

    # X'1234', b'0110', E'\\\\\' etc.
    def _scan_formatted_string(self, string_start: str) -> bool:
        if string_start in self._HEX_STRINGS:
            delimiters = self._HEX_STRINGS
            token_type = TokenType.HEX_STRING
            base = 16
        elif string_start in self._BIT_STRINGS:
            delimiters = self._BIT_STRINGS
            token_type = TokenType.BIT_STRING
            base = 2
        elif string_start in self._BYTE_STRINGS:
            delimiters = self._BYTE_STRINGS
            token_type = TokenType.BYTE_STRING
            base = None
        else:
            return False

        self._advance(len(string_start))
        string_end = delimiters[string_start]
        text = self._extract_string(string_end)

        if base is None:
            self._add(token_type, text)
        else:
            try:
                self._add(token_type, f"{int(text, base)}")
            except:
                raise RuntimeError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        text = ""
        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES

        while True:
            if self._end:
                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")

            self._advance()
            if self._char == identifier_end:
                if identifier_end_is_escape and self._peek == identifier_end:
                    text += identifier_end
                    self._advance()
                    continue

                break

            text += self._char

        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        while True:
            char = self._peek.strip()
            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
                self._advance()
            else:
                break
        self._add(
            TokenType.VAR
            if self._prev_token_type == TokenType.PARAMETER
            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
        )

    def _extract_string(self, delimiter: str) -> str:
        text = ""
        delim_size = len(delimiter)

        while True:
            if self._char in self._STRING_ESCAPES and (
                self._peek == delimiter or self._peek in self._STRING_ESCAPES
            ):
                if self._peek == delimiter:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
                text += self._char
                self._advance()

        return text
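The public entry point is Tokenizer.tokenize, which turns a SQL string into a flat list of Token objects for the parser to consume. A minimal usage sketch (illustrative only; the expected output below is what the source above should produce for this input):

from sqlglot.tokens import Tokenizer

# Tokenize a statement and inspect each token's type and text.
tokens = Tokenizer().tokenize("SELECT a FROM b")
for token in tokens:
    print(token.token_type, repr(token.text))
# TokenType.SELECT 'SELECT'
# TokenType.VAR 'a'
# TokenType.FROM 'FROM'
# TokenType.VAR 'b'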
class TokenType(AutoName):
An enumeration of every token type the tokenizer can emit: punctuation and operators, literals, data types, and SQL keywords.
Inherited members from enum.Enum: name, value.
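A quick sketch of how these members behave (using only names defined above): because TokenType derives from AutoName, each member's value equals its name, and token types are compared by identity.

from sqlglot.tokens import Tokenizer, TokenType

# AutoName assigns each member its own name as its value.
assert TokenType.SELECT.value == "SELECT"

# Tokens carry a TokenType; compare with `is`, as the tokenizer itself does.
token = Tokenizer().tokenize("SELECT 1")[0]
assert token.token_type is TokenType.SELECT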
class Token:
Token(token_type: sqlglot.tokens.TokenType, text: str, line: int = 1, col: int = 1, end: int = 0, comments: List[str] = [])
number(number: int) -> Token
Returns a NUMBER token with `number` as its text.
string(string: str) -> Token
Returns a STRING token with `string` as its text.
identifier(identifier: str) -> Token
Returns an IDENTIFIER token with `identifier` as its text.

var(var: str) -> Token

Returns a VAR token with `var` as its text.
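A short sketch of the convenience constructors and the derived start property (illustrative only; all names come from the class above):

from sqlglot.tokens import Token, TokenType

num = Token.number(42)           # equivalent to Token(TokenType.NUMBER, "42")
ident = Token.identifier("foo")  # equivalent to Token(TokenType.IDENTIFIER, "foo")

assert num.token_type is TokenType.NUMBER and num.text == "42"
# With the default end=0, `end` falls back to len(text), so `start` is 0.
assert ident.start == ident.end - len(ident.text) == 0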
class Tokenizer:
409class Tokenizer(metaclass=_Tokenizer): 410 SINGLE_TOKENS = { 411 "(": TokenType.L_PAREN, 412 ")": TokenType.R_PAREN, 413 "[": TokenType.L_BRACKET, 414 "]": TokenType.R_BRACKET, 415 "{": TokenType.L_BRACE, 416 "}": TokenType.R_BRACE, 417 "&": TokenType.AMP, 418 "^": TokenType.CARET, 419 ":": TokenType.COLON, 420 ",": TokenType.COMMA, 421 ".": TokenType.DOT, 422 "-": TokenType.DASH, 423 "=": TokenType.EQ, 424 ">": TokenType.GT, 425 "<": TokenType.LT, 426 "%": TokenType.MOD, 427 "!": TokenType.NOT, 428 "|": TokenType.PIPE, 429 "+": TokenType.PLUS, 430 ";": TokenType.SEMICOLON, 431 "/": TokenType.SLASH, 432 "\\": TokenType.BACKSLASH, 433 "*": TokenType.STAR, 434 "~": TokenType.TILDA, 435 "?": TokenType.PLACEHOLDER, 436 "@": TokenType.PARAMETER, 437 # used for breaking a var like x'y' but nothing else 438 # the token type doesn't matter 439 "'": TokenType.QUOTE, 440 "`": TokenType.IDENTIFIER, 441 '"': TokenType.IDENTIFIER, 442 "#": TokenType.HASH, 443 } 444 445 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] 446 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 447 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] 448 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] 449 IDENTIFIER_ESCAPES = ['"'] 450 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] 451 STRING_ESCAPES = ["'"] 452 VAR_SINGLE_TOKENS: t.Set[str] = set() 453 454 _COMMENTS: t.Dict[str, str] = {} 455 _BIT_STRINGS: t.Dict[str, str] = {} 456 _BYTE_STRINGS: t.Dict[str, str] = {} 457 _HEX_STRINGS: t.Dict[str, str] = {} 458 _IDENTIFIERS: t.Dict[str, str] = {} 459 _IDENTIFIER_ESCAPES: t.Set[str] = set() 460 _QUOTES: t.Dict[str, str] = {} 461 _STRING_ESCAPES: t.Set[str] = set() 462 463 KEYWORDS: t.Dict[t.Optional[str], TokenType] = { 464 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 465 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 466 "{{+": TokenType.BLOCK_START, 467 "{{-": TokenType.BLOCK_START, 468 "+}}": TokenType.BLOCK_END, 469 "-}}": TokenType.BLOCK_END, 470 "/*+": TokenType.HINT, 471 "==": TokenType.EQ, 472 "::": TokenType.DCOLON, 473 "||": TokenType.DPIPE, 474 ">=": TokenType.GTE, 475 "<=": TokenType.LTE, 476 "<>": TokenType.NEQ, 477 "!=": TokenType.NEQ, 478 "<=>": TokenType.NULLSAFE_EQ, 479 "->": TokenType.ARROW, 480 "->>": TokenType.DARROW, 481 "=>": TokenType.FARROW, 482 "#>": TokenType.HASH_ARROW, 483 "#>>": TokenType.DHASH_ARROW, 484 "<->": TokenType.LR_ARROW, 485 "&&": TokenType.DAMP, 486 "ALL": TokenType.ALL, 487 "ALWAYS": TokenType.ALWAYS, 488 "AND": TokenType.AND, 489 "ANTI": TokenType.ANTI, 490 "ANY": TokenType.ANY, 491 "ASC": TokenType.ASC, 492 "AS": TokenType.ALIAS, 493 "AT TIME ZONE": TokenType.AT_TIME_ZONE, 494 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 495 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 496 "BEGIN": TokenType.BEGIN, 497 "BETWEEN": TokenType.BETWEEN, 498 "BOTH": TokenType.BOTH, 499 "BUCKET": TokenType.BUCKET, 500 "BY DEFAULT": TokenType.BY_DEFAULT, 501 "CACHE": TokenType.CACHE, 502 "UNCACHE": TokenType.UNCACHE, 503 "CASE": TokenType.CASE, 504 "CASCADE": TokenType.CASCADE, 505 "CHARACTER SET": TokenType.CHARACTER_SET, 506 "CLUSTER BY": TokenType.CLUSTER_BY, 507 "COLLATE": TokenType.COLLATE, 508 "COLUMN": TokenType.COLUMN, 509 "COMMIT": TokenType.COMMIT, 510 "COMPOUND": TokenType.COMPOUND, 511 "CONSTRAINT": TokenType.CONSTRAINT, 512 "CREATE": TokenType.CREATE, 513 "CROSS": TokenType.CROSS, 514 "CUBE": TokenType.CUBE, 515 "CURRENT_DATE": TokenType.CURRENT_DATE, 516 "CURRENT ROW": TokenType.CURRENT_ROW, 517 "CURRENT_TIME": TokenType.CURRENT_TIME, 518 
"CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 519 "CURRENT_USER": TokenType.CURRENT_USER, 520 "DATABASE": TokenType.DATABASE, 521 "DEFAULT": TokenType.DEFAULT, 522 "DELETE": TokenType.DELETE, 523 "DESC": TokenType.DESC, 524 "DESCRIBE": TokenType.DESCRIBE, 525 "DISTINCT": TokenType.DISTINCT, 526 "DISTINCT FROM": TokenType.DISTINCT_FROM, 527 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 528 "DIV": TokenType.DIV, 529 "DROP": TokenType.DROP, 530 "ELSE": TokenType.ELSE, 531 "END": TokenType.END, 532 "ESCAPE": TokenType.ESCAPE, 533 "EXCEPT": TokenType.EXCEPT, 534 "EXECUTE": TokenType.EXECUTE, 535 "EXISTS": TokenType.EXISTS, 536 "FALSE": TokenType.FALSE, 537 "FETCH": TokenType.FETCH, 538 "FILTER": TokenType.FILTER, 539 "FIRST": TokenType.FIRST, 540 "FULL": TokenType.FULL, 541 "FUNCTION": TokenType.FUNCTION, 542 "FOLLOWING": TokenType.FOLLOWING, 543 "FOR": TokenType.FOR, 544 "FOREIGN KEY": TokenType.FOREIGN_KEY, 545 "FORMAT": TokenType.FORMAT, 546 "FROM": TokenType.FROM, 547 "GLOB": TokenType.GLOB, 548 "GROUP BY": TokenType.GROUP_BY, 549 "GROUPING SETS": TokenType.GROUPING_SETS, 550 "HAVING": TokenType.HAVING, 551 "IF": TokenType.IF, 552 "ILIKE": TokenType.ILIKE, 553 "IGNORE NULLS": TokenType.IGNORE_NULLS, 554 "IN": TokenType.IN, 555 "INDEX": TokenType.INDEX, 556 "INET": TokenType.INET, 557 "INNER": TokenType.INNER, 558 "INSERT": TokenType.INSERT, 559 "INTERVAL": TokenType.INTERVAL, 560 "INTERSECT": TokenType.INTERSECT, 561 "INTO": TokenType.INTO, 562 "IS": TokenType.IS, 563 "ISNULL": TokenType.ISNULL, 564 "JOIN": TokenType.JOIN, 565 "LATERAL": TokenType.LATERAL, 566 "LAZY": TokenType.LAZY, 567 "LEADING": TokenType.LEADING, 568 "LEFT": TokenType.LEFT, 569 "LIKE": TokenType.LIKE, 570 "LIMIT": TokenType.LIMIT, 571 "LOAD DATA": TokenType.LOAD_DATA, 572 "LOCAL": TokenType.LOCAL, 573 "MATERIALIZED": TokenType.MATERIALIZED, 574 "MERGE": TokenType.MERGE, 575 "NATURAL": TokenType.NATURAL, 576 "NEXT": TokenType.NEXT, 577 "NO ACTION": TokenType.NO_ACTION, 578 "NOT": TokenType.NOT, 579 "NOTNULL": TokenType.NOTNULL, 580 "NULL": TokenType.NULL, 581 "NULLS FIRST": TokenType.NULLS_FIRST, 582 "NULLS LAST": TokenType.NULLS_LAST, 583 "OBJECT": TokenType.OBJECT, 584 "OFFSET": TokenType.OFFSET, 585 "ON": TokenType.ON, 586 "ONLY": TokenType.ONLY, 587 "OPTIONS": TokenType.OPTIONS, 588 "OR": TokenType.OR, 589 "ORDER BY": TokenType.ORDER_BY, 590 "ORDINALITY": TokenType.ORDINALITY, 591 "OUTER": TokenType.OUTER, 592 "OUT OF": TokenType.OUT_OF, 593 "OVER": TokenType.OVER, 594 "OVERLAPS": TokenType.OVERLAPS, 595 "OVERWRITE": TokenType.OVERWRITE, 596 "PARTITION": TokenType.PARTITION, 597 "PARTITION BY": TokenType.PARTITION_BY, 598 "PARTITIONED BY": TokenType.PARTITION_BY, 599 "PARTITIONED_BY": TokenType.PARTITION_BY, 600 "PERCENT": TokenType.PERCENT, 601 "PIVOT": TokenType.PIVOT, 602 "PRAGMA": TokenType.PRAGMA, 603 "PRECEDING": TokenType.PRECEDING, 604 "PRIMARY KEY": TokenType.PRIMARY_KEY, 605 "PROCEDURE": TokenType.PROCEDURE, 606 "QUALIFY": TokenType.QUALIFY, 607 "RANGE": TokenType.RANGE, 608 "RECURSIVE": TokenType.RECURSIVE, 609 "REGEXP": TokenType.RLIKE, 610 "REPLACE": TokenType.REPLACE, 611 "RESPECT NULLS": TokenType.RESPECT_NULLS, 612 "REFERENCES": TokenType.REFERENCES, 613 "RIGHT": TokenType.RIGHT, 614 "RLIKE": TokenType.RLIKE, 615 "ROLLBACK": TokenType.ROLLBACK, 616 "ROLLUP": TokenType.ROLLUP, 617 "ROW": TokenType.ROW, 618 "ROWS": TokenType.ROWS, 619 "SCHEMA": TokenType.SCHEMA, 620 "SEED": TokenType.SEED, 621 "SELECT": TokenType.SELECT, 622 "SEMI": TokenType.SEMI, 623 "SET": TokenType.SET, 624 "SHOW": TokenType.SHOW, 
625 "SIMILAR TO": TokenType.SIMILAR_TO, 626 "SOME": TokenType.SOME, 627 "SORTKEY": TokenType.SORTKEY, 628 "SORT BY": TokenType.SORT_BY, 629 "TABLE": TokenType.TABLE, 630 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 631 "TEMP": TokenType.TEMPORARY, 632 "TEMPORARY": TokenType.TEMPORARY, 633 "THEN": TokenType.THEN, 634 "TRUE": TokenType.TRUE, 635 "TRAILING": TokenType.TRAILING, 636 "UNBOUNDED": TokenType.UNBOUNDED, 637 "UNION": TokenType.UNION, 638 "UNLOGGED": TokenType.UNLOGGED, 639 "UNNEST": TokenType.UNNEST, 640 "UNPIVOT": TokenType.UNPIVOT, 641 "UPDATE": TokenType.UPDATE, 642 "USE": TokenType.USE, 643 "USING": TokenType.USING, 644 "UUID": TokenType.UUID, 645 "VALUES": TokenType.VALUES, 646 "VIEW": TokenType.VIEW, 647 "VOLATILE": TokenType.VOLATILE, 648 "WHEN": TokenType.WHEN, 649 "WHERE": TokenType.WHERE, 650 "WINDOW": TokenType.WINDOW, 651 "WITH": TokenType.WITH, 652 "WITH TIME ZONE": TokenType.WITH_TIME_ZONE, 653 "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE, 654 "WITHIN GROUP": TokenType.WITHIN_GROUP, 655 "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE, 656 "APPLY": TokenType.APPLY, 657 "ARRAY": TokenType.ARRAY, 658 "BIT": TokenType.BIT, 659 "BOOL": TokenType.BOOLEAN, 660 "BOOLEAN": TokenType.BOOLEAN, 661 "BYTE": TokenType.TINYINT, 662 "TINYINT": TokenType.TINYINT, 663 "SHORT": TokenType.SMALLINT, 664 "SMALLINT": TokenType.SMALLINT, 665 "INT2": TokenType.SMALLINT, 666 "INTEGER": TokenType.INT, 667 "INT": TokenType.INT, 668 "INT4": TokenType.INT, 669 "LONG": TokenType.BIGINT, 670 "BIGINT": TokenType.BIGINT, 671 "INT8": TokenType.BIGINT, 672 "DEC": TokenType.DECIMAL, 673 "DECIMAL": TokenType.DECIMAL, 674 "BIGDECIMAL": TokenType.BIGDECIMAL, 675 "BIGNUMERIC": TokenType.BIGDECIMAL, 676 "MAP": TokenType.MAP, 677 "NULLABLE": TokenType.NULLABLE, 678 "NUMBER": TokenType.DECIMAL, 679 "NUMERIC": TokenType.DECIMAL, 680 "FIXED": TokenType.DECIMAL, 681 "REAL": TokenType.FLOAT, 682 "FLOAT": TokenType.FLOAT, 683 "FLOAT4": TokenType.FLOAT, 684 "FLOAT8": TokenType.DOUBLE, 685 "DOUBLE": TokenType.DOUBLE, 686 "DOUBLE PRECISION": TokenType.DOUBLE, 687 "JSON": TokenType.JSON, 688 "CHAR": TokenType.CHAR, 689 "CHARACTER": TokenType.CHAR, 690 "NCHAR": TokenType.NCHAR, 691 "VARCHAR": TokenType.VARCHAR, 692 "VARCHAR2": TokenType.VARCHAR, 693 "NVARCHAR": TokenType.NVARCHAR, 694 "NVARCHAR2": TokenType.NVARCHAR, 695 "STR": TokenType.TEXT, 696 "STRING": TokenType.TEXT, 697 "TEXT": TokenType.TEXT, 698 "CLOB": TokenType.TEXT, 699 "LONGVARCHAR": TokenType.TEXT, 700 "BINARY": TokenType.BINARY, 701 "BLOB": TokenType.VARBINARY, 702 "BYTEA": TokenType.VARBINARY, 703 "VARBINARY": TokenType.VARBINARY, 704 "TIME": TokenType.TIME, 705 "TIMESTAMP": TokenType.TIMESTAMP, 706 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 707 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 708 "DATE": TokenType.DATE, 709 "DATETIME": TokenType.DATETIME, 710 "UNIQUE": TokenType.UNIQUE, 711 "STRUCT": TokenType.STRUCT, 712 "VARIANT": TokenType.VARIANT, 713 "ALTER": TokenType.ALTER, 714 "ALTER AGGREGATE": TokenType.COMMAND, 715 "ALTER DEFAULT": TokenType.COMMAND, 716 "ALTER DOMAIN": TokenType.COMMAND, 717 "ALTER ROLE": TokenType.COMMAND, 718 "ALTER RULE": TokenType.COMMAND, 719 "ALTER SEQUENCE": TokenType.COMMAND, 720 "ALTER TYPE": TokenType.COMMAND, 721 "ALTER USER": TokenType.COMMAND, 722 "ALTER VIEW": TokenType.COMMAND, 723 "ANALYZE": TokenType.COMMAND, 724 "CALL": TokenType.COMMAND, 725 "COMMENT": TokenType.COMMENT, 726 "COPY": TokenType.COMMAND, 727 "EXPLAIN": TokenType.COMMAND, 728 "GRANT": TokenType.COMMAND, 729 "OPTIMIZE": TokenType.COMMAND, 730 "PREPARE": 
TokenType.COMMAND, 731 "TRUNCATE": TokenType.COMMAND, 732 "VACUUM": TokenType.COMMAND, 733 } 734 735 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { 736 " ": TokenType.SPACE, 737 "\t": TokenType.SPACE, 738 "\n": TokenType.BREAK, 739 "\r": TokenType.BREAK, 740 "\r\n": TokenType.BREAK, 741 } 742 743 COMMANDS = { 744 TokenType.COMMAND, 745 TokenType.EXECUTE, 746 TokenType.FETCH, 747 TokenType.SHOW, 748 } 749 750 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 751 752 # handle numeric literals like in hive (3L = BIGINT) 753 NUMERIC_LITERALS: t.Dict[str, str] = {} 754 ENCODE: t.Optional[str] = None 755 756 COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")] 757 KEYWORD_TRIE: t.Dict = {} # autofilled 758 759 IDENTIFIER_CAN_START_WITH_DIGIT = False 760 761 __slots__ = ( 762 "sql", 763 "size", 764 "tokens", 765 "_start", 766 "_current", 767 "_line", 768 "_col", 769 "_comments", 770 "_char", 771 "_end", 772 "_peek", 773 "_prev_token_line", 774 "_prev_token_comments", 775 "_prev_token_type", 776 ) 777 778 def __init__(self) -> None: 779 self.reset() 780 781 def reset(self) -> None: 782 self.sql = "" 783 self.size = 0 784 self.tokens: t.List[Token] = [] 785 self._start = 0 786 self._current = 0 787 self._line = 1 788 self._col = 1 789 self._comments: t.List[str] = [] 790 791 self._char = "" 792 self._end = False 793 self._peek = "" 794 self._prev_token_line = -1 795 self._prev_token_comments: t.List[str] = [] 796 self._prev_token_type: t.Optional[TokenType] = None 797 798 def tokenize(self, sql: str) -> t.List[Token]: 799 """Returns a list of tokens corresponding to the SQL string `sql`.""" 800 self.reset() 801 self.sql = sql 802 self.size = len(sql) 803 try: 804 self._scan() 805 except Exception as e: 806 start = self._current - 50 807 end = self._current + 50 808 start = start if start > 0 else 0 809 end = end if end < self.size else self.size - 1 810 context = self.sql[start:end] 811 raise ValueError(f"Error tokenizing '{context}'") from e 812 813 return self.tokens 814 815 def _scan(self, until: t.Optional[t.Callable] = None) -> None: 816 while self.size and not self._end: 817 self._start = self._current 818 self._advance() 819 820 if self._char is None: 821 break 822 823 if self._char not in self.WHITE_SPACE: 824 if self._char.isdigit(): 825 self._scan_number() 826 elif self._char in self._IDENTIFIERS: 827 self._scan_identifier(self._IDENTIFIERS[self._char]) 828 else: 829 self._scan_keywords() 830 831 if until and until(): 832 break 833 834 if self.tokens: 835 self.tokens[-1].comments.extend(self._comments) 836 837 def _chars(self, size: int) -> str: 838 if size == 1: 839 return self._char 840 start = self._current - 1 841 end = start + size 842 if end <= self.size: 843 return self.sql[start:end] 844 return "" 845 846 def _advance(self, i: int = 1) -> None: 847 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: 848 self._col = 1 849 self._line += 1 850 else: 851 self._col += i 852 853 self._current += i 854 self._end = self._current >= self.size 855 self._char = self.sql[self._current - 1] 856 self._peek = "" if self._end else self.sql[self._current] 857 858 @property 859 def _text(self) -> str: 860 return self.sql[self._start : self._current] 861 862 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: 863 self._prev_token_line = self._line 864 self._prev_token_comments = self._comments 865 self._prev_token_type = token_type 866 self.tokens.append( 867 Token( 868 token_type, 869 self._text if text is None else text, 870 self._line, 871 self._col, 872 
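    # Illustrative sketch of the public entry point above (output shape only;
    # the exact reprs depend on TokenType's AutoName values and are an
    # assumption here):
    #
    #     >>> tokens = Tokenizer().tokenize("SELECT 1")
    #     >>> [(tok.token_type, tok.text) for tok in tokens]
    #     [(<TokenType.SELECT: 'SELECT'>, 'SELECT'), (<TokenType.NUMBER: 'NUMBER'>, '1')]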
    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
        while self.size and not self._end:
            self._start = self._current
            self._advance()

            if self._char is None:
                break

            if self._char not in self.WHITE_SPACE:
                if self._char.isdigit():
                    self._scan_number()
                elif self._char in self._IDENTIFIERS:
                    self._scan_identifier(self._IDENTIFIERS[self._char])
                else:
                    self._scan_keywords()

            if until and until():
                break

        if self.tokens:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        if end <= self.size:
            return self.sql[start:end]
        return ""

    def _advance(self, i: int = 1) -> None:
        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
            self._col = 1
            self._line += 1
        else:
            self._col += i

        self._current += i
        self._end = self._current >= self.size
        self._char = self.sql[self._current - 1]
        self._peek = "" if self._end else self.sql[self._current]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line
        self._prev_token_comments = self._comments
        self._prev_token_type = token_type
        self.tokens.append(
            Token(
                token_type,
                self._text if text is None else text,
                self._line,
                self._col,
                self._current,
                self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.COMMANDS
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(lambda: self._peek == ";")
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        size = 0
        word = None
        chars = self._text
        char = chars
        prev_space = False
        skip = False
        trie = self.KEYWORD_TRIE
        single_token = char in self.SINGLE_TOKENS

        while chars:
            if skip:
                result = 1
            else:
                result, trie = in_trie(trie, char.upper())

            if result == 0:
                break
            if result == 2:
                word = chars

            size += 1
            end = self._current - 1 + size

            if end < self.size:
                char = self.sql[end]
                single_token = single_token or char in self.SINGLE_TOKENS
                is_space = char in self.WHITE_SPACE

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                chars = " "

        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word

        if not word:
            if self._char in self.SINGLE_TOKENS:
                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
                return
            self._scan_var()
            return

        if self._scan_string(word):
            return
        if self._scan_formatted_string(word):
            return
        if self._scan_comment(word):
            return

        self._advance(size - 1)
        word = word.upper()
        self._add(self.KEYWORDS[word], text=word)
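    # Note on _scan_keywords: runs of whitespace are collapsed to a single space
    # while walking KEYWORD_TRIE, so a multi-word keyword like "GROUP BY" still
    # matches when written as "GROUP   BY" or split across lines.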
    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self._COMMENTS:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_end_size = len(comment_end)
            while not self._end and self._chars(comment_end_size) != comment_end:
                self._advance()

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK:
                self._advance()
            self._comments.append(self._text[comment_start_size:])

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits()
            elif peek == "X":
                return self._scan_hex()

        decimal = False
        scientific = 0

        while True:
            if self._peek.isdigit():
                self._advance()
            elif self._peek == "." and not decimal:
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                scientific += 1
                self._advance()
            elif self._peek.upper() == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
                    literal += self._peek.upper()
                    self._advance()

                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
                    return self._add(TokenType.VAR)

                self._add(TokenType.NUMBER, number_text)
                return self._advance(-len(literal))
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance()
            else:
                break

        return self._text

    def _scan_string(self, quote: str) -> bool:
        quote_end = self._QUOTES.get(quote)
        if quote_end is None:
            return False

        self._advance(len(quote))
        text = self._extract_string(quote_end)
        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
        return True
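    # Example of the default escape behavior (STRING_ESCAPES = ["'"]): the input
    # 'it''s' is read by _extract_string below as a single STRING token whose
    # text is it's, because a quote character escapes itself.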
    # X'1234', b'0110', E'\\\\\' etc.
    def _scan_formatted_string(self, string_start: str) -> bool:
        if string_start in self._HEX_STRINGS:
            delimiters = self._HEX_STRINGS
            token_type = TokenType.HEX_STRING
            base = 16
        elif string_start in self._BIT_STRINGS:
            delimiters = self._BIT_STRINGS
            token_type = TokenType.BIT_STRING
            base = 2
        elif string_start in self._BYTE_STRINGS:
            delimiters = self._BYTE_STRINGS
            token_type = TokenType.BYTE_STRING
            base = None
        else:
            return False

        self._advance(len(string_start))
        string_end = delimiters[string_start]
        text = self._extract_string(string_end)

        if base is None:
            self._add(token_type, text)
        else:
            try:
                self._add(token_type, f"{int(text, base)}")
            except ValueError:
                raise RuntimeError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        text = ""
        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES

        while True:
            if self._end:
                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")

            self._advance()
            if self._char == identifier_end:
                if identifier_end_is_escape and self._peek == identifier_end:
                    text += identifier_end
                    self._advance()
                    continue

                break

            text += self._char

        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        while True:
            char = self._peek.strip()
            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
                self._advance()
            else:
                break

        self._add(
            TokenType.VAR
            if self._prev_token_type == TokenType.PARAMETER
            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
        )

    def _extract_string(self, delimiter: str) -> str:
        text = ""
        delim_size = len(delimiter)

        while True:
            if self._char in self._STRING_ESCAPES and (
                self._peek == delimiter or self._peek in self._STRING_ESCAPES
            ):
                if self._peek == delimiter:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
                text += self._char
                self._advance()

        return text
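A minimal end-to-end sketch of how this class is typically used (a hedged
example, not part of the module source; it relies only on the `tokenize` API
and the `Token` attributes referenced above):

    from sqlglot.tokens import Tokenizer, TokenType

    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize("SELECT x FROM y  -- trailing note")

    # Comments never surface as standalone tokens: a same-line trailing comment
    # is attached to the preceding token's `comments` list (see _scan_comment).
    for token in tokens:
        print(token.token_type, repr(token.text), token.comments)

Dialect tokenizers customize behavior by subclassing and overriding the
class-level settings, which the _Tokenizer metaclass normalizes into the
underscore-prefixed lookup tables; a hypothetical example:

    class BacktickTokenizer(Tokenizer):
        IDENTIFIERS = ["`"]           # MySQL-style quoted identifiers
        STRING_ESCAPES = ["\\", "'"]  # also allow backslash escapes in strings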