From beba715b97dd2349e01dde9b077d2535680ebdca Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 10 May 2023 08:44:58 +0200 Subject: Merging upstream version 12.2.0. Signed-off-by: Daniel Baumann --- docs/sqlglot/tokens.html | 3788 +++++++++++++++++++++++----------------------- 1 file changed, 1913 insertions(+), 1875 deletions(-) (limited to 'docs/sqlglot/tokens.html') diff --git a/docs/sqlglot/tokens.html b/docs/sqlglot/tokens.html index 822a76d..91afd40 100644 --- a/docs/sqlglot/tokens.html +++ b/docs/sqlglot/tokens.html @@ -633,6 +633,9 @@
  • JOIN_MARKER
+  • KEEP
  • LANGUAGE
@@ -684,6 +687,9 @@
  • NEXT
+  • NEXT_VALUE_FOR
  • NO_ACTION
  • @@ -1206,949 +1212,953 @@ 215 ISNULL = auto() 216 JOIN = auto() 217 JOIN_MARKER = auto() - 218 LANGUAGE = auto() - 219 LATERAL = auto() - 220 LAZY = auto() - 221 LEADING = auto() - 222 LEFT = auto() - 223 LIKE = auto() - 224 LIKE_ANY = auto() - 225 LIMIT = auto() - 226 LOAD_DATA = auto() - 227 LOCAL = auto() - 228 MAP = auto() - 229 MATCH_RECOGNIZE = auto() - 230 MATERIALIZED = auto() - 231 MERGE = auto() - 232 MOD = auto() - 233 NATURAL = auto() - 234 NEXT = auto() - 235 NO_ACTION = auto() - 236 NOTNULL = auto() - 237 NULL = auto() - 238 NULLS_FIRST = auto() - 239 NULLS_LAST = auto() - 240 OFFSET = auto() - 241 ON = auto() - 242 ONLY = auto() - 243 OPTIONS = auto() - 244 ORDER_BY = auto() - 245 ORDERED = auto() - 246 ORDINALITY = auto() - 247 OUTER = auto() - 248 OUT_OF = auto() - 249 OVER = auto() - 250 OVERLAPS = auto() - 251 OVERWRITE = auto() - 252 PARTITION = auto() - 253 PARTITION_BY = auto() - 254 PERCENT = auto() - 255 PIVOT = auto() - 256 PLACEHOLDER = auto() - 257 PRAGMA = auto() - 258 PRECEDING = auto() - 259 PRIMARY_KEY = auto() - 260 PROCEDURE = auto() - 261 PROPERTIES = auto() - 262 PSEUDO_TYPE = auto() - 263 QUALIFY = auto() - 264 QUOTE = auto() - 265 RANGE = auto() - 266 RECURSIVE = auto() - 267 REPLACE = auto() - 268 RESPECT_NULLS = auto() - 269 RETURNING = auto() - 270 REFERENCES = auto() - 271 RIGHT = auto() - 272 RLIKE = auto() - 273 ROLLBACK = auto() - 274 ROLLUP = auto() - 275 ROW = auto() - 276 ROWS = auto() - 277 SEED = auto() - 278 SELECT = auto() - 279 SEMI = auto() - 280 SEPARATOR = auto() - 281 SERDE_PROPERTIES = auto() - 282 SET = auto() - 283 SHOW = auto() - 284 SIMILAR_TO = auto() - 285 SOME = auto() - 286 SORTKEY = auto() - 287 SORT_BY = auto() - 288 STRUCT = auto() - 289 TABLE_SAMPLE = auto() - 290 TEMPORARY = auto() - 291 TOP = auto() - 292 THEN = auto() - 293 TRAILING = auto() - 294 TRUE = auto() - 295 UNBOUNDED = auto() - 296 UNCACHE = auto() - 297 UNION = auto() - 298 UNLOGGED = auto() - 299 UNNEST = auto() - 300 UNPIVOT = auto() - 301 UPDATE = auto() - 302 USE = auto() - 303 USING = auto() - 304 VALUES = auto() - 305 VIEW = auto() - 306 VOLATILE = auto() - 307 WHEN = auto() - 308 WHERE = auto() - 309 WINDOW = auto() - 310 WITH = auto() - 311 WITH_TIME_ZONE = auto() - 312 WITH_LOCAL_TIME_ZONE = auto() - 313 WITHIN_GROUP = auto() - 314 WITHOUT_TIME_ZONE = auto() - 315 UNIQUE = auto() - 316 - 317 - 318class Token: - 319 __slots__ = ("token_type", "text", "line", "col", "end", "comments") - 320 - 321 @classmethod - 322 def number(cls, number: int) -> Token: - 323 """Returns a NUMBER token with `number` as its text.""" - 324 return cls(TokenType.NUMBER, str(number)) - 325 - 326 @classmethod - 327 def string(cls, string: str) -> Token: - 328 """Returns a STRING token with `string` as its text.""" - 329 return cls(TokenType.STRING, string) - 330 - 331 @classmethod - 332 def identifier(cls, identifier: str) -> Token: - 333 """Returns an IDENTIFIER token with `identifier` as its text.""" - 334 return cls(TokenType.IDENTIFIER, identifier) - 335 - 336 @classmethod - 337 def var(cls, var: str) -> Token: - 338 """Returns an VAR token with `var` as its text.""" - 339 return cls(TokenType.VAR, var) - 340 - 341 def __init__( - 342 self, - 343 token_type: TokenType, - 344 text: str, - 345 line: int = 1, - 346 col: int = 1, - 347 end: int = 0, - 348 comments: t.List[str] = [], - 349 ) -> None: - 350 self.token_type = token_type - 351 self.text = text - 352 self.line = line - 353 size = len(text) - 354 self.col = col - 355 self.end = end if end else size - 356 
self.comments = comments - 357 - 358 @property - 359 def start(self) -> int: - 360 """Returns the start of the token.""" - 361 return self.end - len(self.text) - 362 - 363 def __repr__(self) -> str: - 364 attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__) - 365 return f"<Token {attributes}>" - 366 - 367 - 368class _Tokenizer(type): - 369 def __new__(cls, clsname, bases, attrs): - 370 klass = super().__new__(cls, clsname, bases, attrs) - 371 - 372 klass._QUOTES = { - 373 f"{prefix}{s}": e - 374 for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items() - 375 for prefix in (("",) if s[0].isalpha() else ("", "n", "N")) - 376 } - 377 klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS) - 378 klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS) - 379 klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS) - 380 klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS) - 381 klass._STRING_ESCAPES = set(klass.STRING_ESCAPES) - 382 klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES) - 383 klass._COMMENTS = dict( - 384 (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) - 385 for comment in klass.COMMENTS - 386 ) - 387 - 388 klass.KEYWORD_TRIE = new_trie( - 389 key.upper() - 390 for key in { - 391 **klass.KEYWORDS, - 392 **{comment: TokenType.COMMENT for comment in klass._COMMENTS}, - 393 **{quote: TokenType.QUOTE for quote in klass._QUOTES}, - 394 **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS}, - 395 **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS}, - 396 **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS}, - 397 } - 398 if " " in key or any(single in key for single in klass.SINGLE_TOKENS) - 399 ) - 400 - 401 return klass + 218 KEEP = auto() + 219 LANGUAGE = auto() + 220 LATERAL = auto() + 221 LAZY = auto() + 222 LEADING = auto() + 223 LEFT = auto() + 224 LIKE = auto() + 225 LIKE_ANY = auto() + 226 LIMIT = auto() + 227 LOAD_DATA = auto() + 228 LOCAL = auto() + 229 MAP = auto() + 230 MATCH_RECOGNIZE = auto() + 231 MATERIALIZED = auto() + 232 MERGE = auto() + 233 MOD = auto() + 234 NATURAL = auto() + 235 NEXT = auto() + 236 NEXT_VALUE_FOR = auto() + 237 NO_ACTION = auto() + 238 NOTNULL = auto() + 239 NULL = auto() + 240 NULLS_FIRST = auto() + 241 NULLS_LAST = auto() + 242 OFFSET = auto() + 243 ON = auto() + 244 ONLY = auto() + 245 OPTIONS = auto() + 246 ORDER_BY = auto() + 247 ORDERED = auto() + 248 ORDINALITY = auto() + 249 OUTER = auto() + 250 OUT_OF = auto() + 251 OVER = auto() + 252 OVERLAPS = auto() + 253 OVERWRITE = auto() + 254 PARTITION = auto() + 255 PARTITION_BY = auto() + 256 PERCENT = auto() + 257 PIVOT = auto() + 258 PLACEHOLDER = auto() + 259 PRAGMA = auto() + 260 PRECEDING = auto() + 261 PRIMARY_KEY = auto() + 262 PROCEDURE = auto() + 263 PROPERTIES = auto() + 264 PSEUDO_TYPE = auto() + 265 QUALIFY = auto() + 266 QUOTE = auto() + 267 RANGE = auto() + 268 RECURSIVE = auto() + 269 REPLACE = auto() + 270 RESPECT_NULLS = auto() + 271 RETURNING = auto() + 272 REFERENCES = auto() + 273 RIGHT = auto() + 274 RLIKE = auto() + 275 ROLLBACK = auto() + 276 ROLLUP = auto() + 277 ROW = auto() + 278 ROWS = auto() + 279 SEED = auto() + 280 SELECT = auto() + 281 SEMI = auto() + 282 SEPARATOR = auto() + 283 SERDE_PROPERTIES = auto() + 284 SET = auto() + 285 SHOW = auto() + 286 SIMILAR_TO = auto() + 287 SOME = auto() + 288 SORTKEY = auto() + 289 SORT_BY = auto() + 290 STRUCT = auto() + 291 TABLE_SAMPLE = auto() + 292 
TEMPORARY = auto() + 293 TOP = auto() + 294 THEN = auto() + 295 TRAILING = auto() + 296 TRUE = auto() + 297 UNBOUNDED = auto() + 298 UNCACHE = auto() + 299 UNION = auto() + 300 UNLOGGED = auto() + 301 UNNEST = auto() + 302 UNPIVOT = auto() + 303 UPDATE = auto() + 304 USE = auto() + 305 USING = auto() + 306 VALUES = auto() + 307 VIEW = auto() + 308 VOLATILE = auto() + 309 WHEN = auto() + 310 WHERE = auto() + 311 WINDOW = auto() + 312 WITH = auto() + 313 WITH_TIME_ZONE = auto() + 314 WITH_LOCAL_TIME_ZONE = auto() + 315 WITHIN_GROUP = auto() + 316 WITHOUT_TIME_ZONE = auto() + 317 UNIQUE = auto() + 318 + 319 + 320class Token: + 321 __slots__ = ("token_type", "text", "line", "col", "end", "comments") + 322 + 323 @classmethod + 324 def number(cls, number: int) -> Token: + 325 """Returns a NUMBER token with `number` as its text.""" + 326 return cls(TokenType.NUMBER, str(number)) + 327 + 328 @classmethod + 329 def string(cls, string: str) -> Token: + 330 """Returns a STRING token with `string` as its text.""" + 331 return cls(TokenType.STRING, string) + 332 + 333 @classmethod + 334 def identifier(cls, identifier: str) -> Token: + 335 """Returns an IDENTIFIER token with `identifier` as its text.""" + 336 return cls(TokenType.IDENTIFIER, identifier) + 337 + 338 @classmethod + 339 def var(cls, var: str) -> Token: + 340 """Returns an VAR token with `var` as its text.""" + 341 return cls(TokenType.VAR, var) + 342 + 343 def __init__( + 344 self, + 345 token_type: TokenType, + 346 text: str, + 347 line: int = 1, + 348 col: int = 1, + 349 end: int = 0, + 350 comments: t.List[str] = [], + 351 ) -> None: + 352 self.token_type = token_type + 353 self.text = text + 354 self.line = line + 355 size = len(text) + 356 self.col = col + 357 self.end = end if end else size + 358 self.comments = comments + 359 + 360 @property + 361 def start(self) -> int: + 362 """Returns the start of the token.""" + 363 return self.end - len(self.text) + 364 + 365 def __repr__(self) -> str: + 366 attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__) + 367 return f"<Token {attributes}>" + 368 + 369 + 370class _Tokenizer(type): + 371 def __new__(cls, clsname, bases, attrs): + 372 klass = super().__new__(cls, clsname, bases, attrs) + 373 + 374 klass._QUOTES = { + 375 f"{prefix}{s}": e + 376 for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items() + 377 for prefix in (("",) if s[0].isalpha() else ("", "n", "N")) + 378 } + 379 klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS) + 380 klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS) + 381 klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS) + 382 klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS) + 383 klass._STRING_ESCAPES = set(klass.STRING_ESCAPES) + 384 klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES) + 385 klass._COMMENTS = dict( + 386 (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) + 387 for comment in klass.COMMENTS + 388 ) + 389 + 390 klass.KEYWORD_TRIE = new_trie( + 391 key.upper() + 392 for key in { + 393 **klass.KEYWORDS, + 394 **{comment: TokenType.COMMENT for comment in klass._COMMENTS}, + 395 **{quote: TokenType.QUOTE for quote in klass._QUOTES}, + 396 **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS}, + 397 **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS}, + 398 **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS}, + 399 } + 400 if " " in key or any(single in key for single 
in klass.SINGLE_TOKENS) + 401 ) 402 - 403 @staticmethod - 404 def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]: - 405 return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list) - 406 - 407 - 408class Tokenizer(metaclass=_Tokenizer): - 409 SINGLE_TOKENS = { - 410 "(": TokenType.L_PAREN, - 411 ")": TokenType.R_PAREN, - 412 "[": TokenType.L_BRACKET, - 413 "]": TokenType.R_BRACKET, - 414 "{": TokenType.L_BRACE, - 415 "}": TokenType.R_BRACE, - 416 "&": TokenType.AMP, - 417 "^": TokenType.CARET, - 418 ":": TokenType.COLON, - 419 ",": TokenType.COMMA, - 420 ".": TokenType.DOT, - 421 "-": TokenType.DASH, - 422 "=": TokenType.EQ, - 423 ">": TokenType.GT, - 424 "<": TokenType.LT, - 425 "%": TokenType.MOD, - 426 "!": TokenType.NOT, - 427 "|": TokenType.PIPE, - 428 "+": TokenType.PLUS, - 429 ";": TokenType.SEMICOLON, - 430 "/": TokenType.SLASH, - 431 "\\": TokenType.BACKSLASH, - 432 "*": TokenType.STAR, - 433 "~": TokenType.TILDA, - 434 "?": TokenType.PLACEHOLDER, - 435 "@": TokenType.PARAMETER, - 436 # used for breaking a var like x'y' but nothing else - 437 # the token type doesn't matter - 438 "'": TokenType.QUOTE, - 439 "`": TokenType.IDENTIFIER, - 440 '"': TokenType.IDENTIFIER, - 441 "#": TokenType.HASH, - 442 } - 443 - 444 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] - 445 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] - 446 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] - 447 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] - 448 IDENTIFIER_ESCAPES = ['"'] - 449 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] - 450 STRING_ESCAPES = ["'"] - 451 VAR_SINGLE_TOKENS: t.Set[str] = set() - 452 - 453 _COMMENTS: t.Dict[str, str] = {} - 454 _BIT_STRINGS: t.Dict[str, str] = {} - 455 _BYTE_STRINGS: t.Dict[str, str] = {} - 456 _HEX_STRINGS: t.Dict[str, str] = {} - 457 _IDENTIFIERS: t.Dict[str, str] = {} - 458 _IDENTIFIER_ESCAPES: t.Set[str] = set() - 459 _QUOTES: t.Dict[str, str] = {} - 460 _STRING_ESCAPES: t.Set[str] = set() - 461 - 462 KEYWORDS: t.Dict[t.Optional[str], TokenType] = { - 463 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, - 464 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, - 465 "{{+": TokenType.BLOCK_START, - 466 "{{-": TokenType.BLOCK_START, - 467 "+}}": TokenType.BLOCK_END, - 468 "-}}": TokenType.BLOCK_END, - 469 "/*+": TokenType.HINT, - 470 "==": TokenType.EQ, - 471 "::": TokenType.DCOLON, - 472 "||": TokenType.DPIPE, - 473 ">=": TokenType.GTE, - 474 "<=": TokenType.LTE, - 475 "<>": TokenType.NEQ, - 476 "!=": TokenType.NEQ, - 477 "<=>": TokenType.NULLSAFE_EQ, - 478 "->": TokenType.ARROW, - 479 "->>": TokenType.DARROW, - 480 "=>": TokenType.FARROW, - 481 "#>": TokenType.HASH_ARROW, - 482 "#>>": TokenType.DHASH_ARROW, - 483 "<->": TokenType.LR_ARROW, - 484 "&&": TokenType.DAMP, - 485 "ALL": TokenType.ALL, - 486 "ALWAYS": TokenType.ALWAYS, - 487 "AND": TokenType.AND, - 488 "ANTI": TokenType.ANTI, - 489 "ANY": TokenType.ANY, - 490 "ASC": TokenType.ASC, - 491 "AS": TokenType.ALIAS, - 492 "AT TIME ZONE": TokenType.AT_TIME_ZONE, - 493 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, - 494 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, - 495 "BEGIN": TokenType.BEGIN, - 496 "BETWEEN": TokenType.BETWEEN, - 497 "BOTH": TokenType.BOTH, - 498 "BUCKET": TokenType.BUCKET, - 499 "BY DEFAULT": TokenType.BY_DEFAULT, - 500 "CACHE": TokenType.CACHE, - 501 "UNCACHE": TokenType.UNCACHE, - 502 "CASE": TokenType.CASE, - 503 "CASCADE": TokenType.CASCADE, - 504 "CHARACTER SET": 
TokenType.CHARACTER_SET, - 505 "CLUSTER BY": TokenType.CLUSTER_BY, - 506 "COLLATE": TokenType.COLLATE, - 507 "COLUMN": TokenType.COLUMN, - 508 "COMMIT": TokenType.COMMIT, - 509 "COMPOUND": TokenType.COMPOUND, - 510 "CONSTRAINT": TokenType.CONSTRAINT, - 511 "CREATE": TokenType.CREATE, - 512 "CROSS": TokenType.CROSS, - 513 "CUBE": TokenType.CUBE, - 514 "CURRENT_DATE": TokenType.CURRENT_DATE, - 515 "CURRENT ROW": TokenType.CURRENT_ROW, - 516 "CURRENT_TIME": TokenType.CURRENT_TIME, - 517 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, - 518 "CURRENT_USER": TokenType.CURRENT_USER, - 519 "DATABASE": TokenType.DATABASE, - 520 "DEFAULT": TokenType.DEFAULT, - 521 "DELETE": TokenType.DELETE, - 522 "DESC": TokenType.DESC, - 523 "DESCRIBE": TokenType.DESCRIBE, - 524 "DISTINCT": TokenType.DISTINCT, - 525 "DISTINCT FROM": TokenType.DISTINCT_FROM, - 526 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, - 527 "DIV": TokenType.DIV, - 528 "DROP": TokenType.DROP, - 529 "ELSE": TokenType.ELSE, - 530 "END": TokenType.END, - 531 "ESCAPE": TokenType.ESCAPE, - 532 "EXCEPT": TokenType.EXCEPT, - 533 "EXECUTE": TokenType.EXECUTE, - 534 "EXISTS": TokenType.EXISTS, - 535 "FALSE": TokenType.FALSE, - 536 "FETCH": TokenType.FETCH, - 537 "FILTER": TokenType.FILTER, - 538 "FIRST": TokenType.FIRST, - 539 "FULL": TokenType.FULL, - 540 "FUNCTION": TokenType.FUNCTION, - 541 "FOLLOWING": TokenType.FOLLOWING, - 542 "FOR": TokenType.FOR, - 543 "FOREIGN KEY": TokenType.FOREIGN_KEY, - 544 "FORMAT": TokenType.FORMAT, - 545 "FROM": TokenType.FROM, - 546 "GLOB": TokenType.GLOB, - 547 "GROUP BY": TokenType.GROUP_BY, - 548 "GROUPING SETS": TokenType.GROUPING_SETS, - 549 "HAVING": TokenType.HAVING, - 550 "IF": TokenType.IF, - 551 "ILIKE": TokenType.ILIKE, - 552 "IGNORE NULLS": TokenType.IGNORE_NULLS, - 553 "IN": TokenType.IN, - 554 "INDEX": TokenType.INDEX, - 555 "INET": TokenType.INET, - 556 "INNER": TokenType.INNER, - 557 "INSERT": TokenType.INSERT, - 558 "INTERVAL": TokenType.INTERVAL, - 559 "INTERSECT": TokenType.INTERSECT, - 560 "INTO": TokenType.INTO, - 561 "IS": TokenType.IS, - 562 "ISNULL": TokenType.ISNULL, - 563 "JOIN": TokenType.JOIN, - 564 "LATERAL": TokenType.LATERAL, - 565 "LAZY": TokenType.LAZY, - 566 "LEADING": TokenType.LEADING, - 567 "LEFT": TokenType.LEFT, - 568 "LIKE": TokenType.LIKE, - 569 "LIMIT": TokenType.LIMIT, - 570 "LOAD DATA": TokenType.LOAD_DATA, - 571 "LOCAL": TokenType.LOCAL, - 572 "MATERIALIZED": TokenType.MATERIALIZED, - 573 "MERGE": TokenType.MERGE, - 574 "NATURAL": TokenType.NATURAL, - 575 "NEXT": TokenType.NEXT, - 576 "NO ACTION": TokenType.NO_ACTION, - 577 "NOT": TokenType.NOT, - 578 "NOTNULL": TokenType.NOTNULL, - 579 "NULL": TokenType.NULL, - 580 "NULLS FIRST": TokenType.NULLS_FIRST, - 581 "NULLS LAST": TokenType.NULLS_LAST, - 582 "OBJECT": TokenType.OBJECT, - 583 "OFFSET": TokenType.OFFSET, - 584 "ON": TokenType.ON, - 585 "ONLY": TokenType.ONLY, - 586 "OPTIONS": TokenType.OPTIONS, - 587 "OR": TokenType.OR, - 588 "ORDER BY": TokenType.ORDER_BY, - 589 "ORDINALITY": TokenType.ORDINALITY, - 590 "OUTER": TokenType.OUTER, - 591 "OUT OF": TokenType.OUT_OF, - 592 "OVER": TokenType.OVER, - 593 "OVERLAPS": TokenType.OVERLAPS, - 594 "OVERWRITE": TokenType.OVERWRITE, - 595 "PARTITION": TokenType.PARTITION, - 596 "PARTITION BY": TokenType.PARTITION_BY, - 597 "PARTITIONED BY": TokenType.PARTITION_BY, - 598 "PARTITIONED_BY": TokenType.PARTITION_BY, - 599 "PERCENT": TokenType.PERCENT, - 600 "PIVOT": TokenType.PIVOT, - 601 "PRAGMA": TokenType.PRAGMA, - 602 "PRECEDING": TokenType.PRECEDING, - 603 "PRIMARY KEY": 
TokenType.PRIMARY_KEY, - 604 "PROCEDURE": TokenType.PROCEDURE, - 605 "QUALIFY": TokenType.QUALIFY, - 606 "RANGE": TokenType.RANGE, - 607 "RECURSIVE": TokenType.RECURSIVE, - 608 "REGEXP": TokenType.RLIKE, - 609 "REPLACE": TokenType.REPLACE, - 610 "RESPECT NULLS": TokenType.RESPECT_NULLS, - 611 "REFERENCES": TokenType.REFERENCES, - 612 "RIGHT": TokenType.RIGHT, - 613 "RLIKE": TokenType.RLIKE, - 614 "ROLLBACK": TokenType.ROLLBACK, - 615 "ROLLUP": TokenType.ROLLUP, - 616 "ROW": TokenType.ROW, - 617 "ROWS": TokenType.ROWS, - 618 "SCHEMA": TokenType.SCHEMA, - 619 "SEED": TokenType.SEED, - 620 "SELECT": TokenType.SELECT, - 621 "SEMI": TokenType.SEMI, - 622 "SET": TokenType.SET, - 623 "SHOW": TokenType.SHOW, - 624 "SIMILAR TO": TokenType.SIMILAR_TO, - 625 "SOME": TokenType.SOME, - 626 "SORTKEY": TokenType.SORTKEY, - 627 "SORT BY": TokenType.SORT_BY, - 628 "TABLE": TokenType.TABLE, - 629 "TABLESAMPLE": TokenType.TABLE_SAMPLE, - 630 "TEMP": TokenType.TEMPORARY, - 631 "TEMPORARY": TokenType.TEMPORARY, - 632 "THEN": TokenType.THEN, - 633 "TRUE": TokenType.TRUE, - 634 "TRAILING": TokenType.TRAILING, - 635 "UNBOUNDED": TokenType.UNBOUNDED, - 636 "UNION": TokenType.UNION, - 637 "UNLOGGED": TokenType.UNLOGGED, - 638 "UNNEST": TokenType.UNNEST, - 639 "UNPIVOT": TokenType.UNPIVOT, - 640 "UPDATE": TokenType.UPDATE, - 641 "USE": TokenType.USE, - 642 "USING": TokenType.USING, - 643 "UUID": TokenType.UUID, - 644 "VALUES": TokenType.VALUES, - 645 "VIEW": TokenType.VIEW, - 646 "VOLATILE": TokenType.VOLATILE, - 647 "WHEN": TokenType.WHEN, - 648 "WHERE": TokenType.WHERE, - 649 "WINDOW": TokenType.WINDOW, - 650 "WITH": TokenType.WITH, - 651 "WITH TIME ZONE": TokenType.WITH_TIME_ZONE, - 652 "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE, - 653 "WITHIN GROUP": TokenType.WITHIN_GROUP, - 654 "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE, - 655 "APPLY": TokenType.APPLY, - 656 "ARRAY": TokenType.ARRAY, - 657 "BIT": TokenType.BIT, - 658 "BOOL": TokenType.BOOLEAN, - 659 "BOOLEAN": TokenType.BOOLEAN, - 660 "BYTE": TokenType.TINYINT, - 661 "TINYINT": TokenType.TINYINT, - 662 "SHORT": TokenType.SMALLINT, - 663 "SMALLINT": TokenType.SMALLINT, - 664 "INT2": TokenType.SMALLINT, - 665 "INTEGER": TokenType.INT, - 666 "INT": TokenType.INT, - 667 "INT4": TokenType.INT, - 668 "LONG": TokenType.BIGINT, - 669 "BIGINT": TokenType.BIGINT, - 670 "INT8": TokenType.BIGINT, - 671 "DEC": TokenType.DECIMAL, - 672 "DECIMAL": TokenType.DECIMAL, - 673 "BIGDECIMAL": TokenType.BIGDECIMAL, - 674 "BIGNUMERIC": TokenType.BIGDECIMAL, - 675 "MAP": TokenType.MAP, - 676 "NULLABLE": TokenType.NULLABLE, - 677 "NUMBER": TokenType.DECIMAL, - 678 "NUMERIC": TokenType.DECIMAL, - 679 "FIXED": TokenType.DECIMAL, - 680 "REAL": TokenType.FLOAT, - 681 "FLOAT": TokenType.FLOAT, - 682 "FLOAT4": TokenType.FLOAT, - 683 "FLOAT8": TokenType.DOUBLE, - 684 "DOUBLE": TokenType.DOUBLE, - 685 "DOUBLE PRECISION": TokenType.DOUBLE, - 686 "JSON": TokenType.JSON, - 687 "CHAR": TokenType.CHAR, - 688 "CHARACTER": TokenType.CHAR, - 689 "NCHAR": TokenType.NCHAR, - 690 "VARCHAR": TokenType.VARCHAR, - 691 "VARCHAR2": TokenType.VARCHAR, - 692 "NVARCHAR": TokenType.NVARCHAR, - 693 "NVARCHAR2": TokenType.NVARCHAR, - 694 "STR": TokenType.TEXT, - 695 "STRING": TokenType.TEXT, - 696 "TEXT": TokenType.TEXT, - 697 "CLOB": TokenType.TEXT, - 698 "LONGVARCHAR": TokenType.TEXT, - 699 "BINARY": TokenType.BINARY, - 700 "BLOB": TokenType.VARBINARY, - 701 "BYTEA": TokenType.VARBINARY, - 702 "VARBINARY": TokenType.VARBINARY, - 703 "TIME": TokenType.TIME, - 704 "TIMESTAMP": TokenType.TIMESTAMP, 
- 705 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, - 706 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, - 707 "DATE": TokenType.DATE, - 708 "DATETIME": TokenType.DATETIME, - 709 "UNIQUE": TokenType.UNIQUE, - 710 "STRUCT": TokenType.STRUCT, - 711 "VARIANT": TokenType.VARIANT, - 712 "ALTER": TokenType.ALTER, - 713 "ALTER AGGREGATE": TokenType.COMMAND, - 714 "ALTER DEFAULT": TokenType.COMMAND, - 715 "ALTER DOMAIN": TokenType.COMMAND, - 716 "ALTER ROLE": TokenType.COMMAND, - 717 "ALTER RULE": TokenType.COMMAND, - 718 "ALTER SEQUENCE": TokenType.COMMAND, - 719 "ALTER TYPE": TokenType.COMMAND, - 720 "ALTER USER": TokenType.COMMAND, - 721 "ALTER VIEW": TokenType.COMMAND, - 722 "ANALYZE": TokenType.COMMAND, - 723 "CALL": TokenType.COMMAND, - 724 "COMMENT": TokenType.COMMENT, - 725 "COPY": TokenType.COMMAND, - 726 "EXPLAIN": TokenType.COMMAND, - 727 "GRANT": TokenType.COMMAND, - 728 "OPTIMIZE": TokenType.COMMAND, - 729 "PREPARE": TokenType.COMMAND, - 730 "TRUNCATE": TokenType.COMMAND, - 731 "VACUUM": TokenType.COMMAND, - 732 } - 733 - 734 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { - 735 " ": TokenType.SPACE, - 736 "\t": TokenType.SPACE, - 737 "\n": TokenType.BREAK, - 738 "\r": TokenType.BREAK, - 739 "\r\n": TokenType.BREAK, - 740 } - 741 - 742 COMMANDS = { - 743 TokenType.COMMAND, - 744 TokenType.EXECUTE, - 745 TokenType.FETCH, - 746 TokenType.SHOW, - 747 } - 748 - 749 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} - 750 - 751 # handle numeric literals like in hive (3L = BIGINT) - 752 NUMERIC_LITERALS: t.Dict[str, str] = {} - 753 ENCODE: t.Optional[str] = None + 403 return klass + 404 + 405 @staticmethod + 406 def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]: + 407 return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list) + 408 + 409 + 410class Tokenizer(metaclass=_Tokenizer): + 411 SINGLE_TOKENS = { + 412 "(": TokenType.L_PAREN, + 413 ")": TokenType.R_PAREN, + 414 "[": TokenType.L_BRACKET, + 415 "]": TokenType.R_BRACKET, + 416 "{": TokenType.L_BRACE, + 417 "}": TokenType.R_BRACE, + 418 "&": TokenType.AMP, + 419 "^": TokenType.CARET, + 420 ":": TokenType.COLON, + 421 ",": TokenType.COMMA, + 422 ".": TokenType.DOT, + 423 "-": TokenType.DASH, + 424 "=": TokenType.EQ, + 425 ">": TokenType.GT, + 426 "<": TokenType.LT, + 427 "%": TokenType.MOD, + 428 "!": TokenType.NOT, + 429 "|": TokenType.PIPE, + 430 "+": TokenType.PLUS, + 431 ";": TokenType.SEMICOLON, + 432 "/": TokenType.SLASH, + 433 "\\": TokenType.BACKSLASH, + 434 "*": TokenType.STAR, + 435 "~": TokenType.TILDA, + 436 "?": TokenType.PLACEHOLDER, + 437 "@": TokenType.PARAMETER, + 438 # used for breaking a var like x'y' but nothing else + 439 # the token type doesn't matter + 440 "'": TokenType.QUOTE, + 441 "`": TokenType.IDENTIFIER, + 442 '"': TokenType.IDENTIFIER, + 443 "#": TokenType.HASH, + 444 } + 445 + 446 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] + 447 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] + 448 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] + 449 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] + 450 IDENTIFIER_ESCAPES = ['"'] + 451 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] + 452 STRING_ESCAPES = ["'"] + 453 VAR_SINGLE_TOKENS: t.Set[str] = set() + 454 + 455 _COMMENTS: t.Dict[str, str] = {} + 456 _BIT_STRINGS: t.Dict[str, str] = {} + 457 _BYTE_STRINGS: t.Dict[str, str] = {} + 458 _HEX_STRINGS: t.Dict[str, str] = {} + 459 _IDENTIFIERS: t.Dict[str, str] = {} + 460 _IDENTIFIER_ESCAPES: t.Set[str] = set() + 461 _QUOTES: t.Dict[str, 
str] = {} + 462 _STRING_ESCAPES: t.Set[str] = set() + 463 + 464 KEYWORDS: t.Dict[t.Optional[str], TokenType] = { + 465 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, + 466 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, + 467 "{{+": TokenType.BLOCK_START, + 468 "{{-": TokenType.BLOCK_START, + 469 "+}}": TokenType.BLOCK_END, + 470 "-}}": TokenType.BLOCK_END, + 471 "/*+": TokenType.HINT, + 472 "==": TokenType.EQ, + 473 "::": TokenType.DCOLON, + 474 "||": TokenType.DPIPE, + 475 ">=": TokenType.GTE, + 476 "<=": TokenType.LTE, + 477 "<>": TokenType.NEQ, + 478 "!=": TokenType.NEQ, + 479 "<=>": TokenType.NULLSAFE_EQ, + 480 "->": TokenType.ARROW, + 481 "->>": TokenType.DARROW, + 482 "=>": TokenType.FARROW, + 483 "#>": TokenType.HASH_ARROW, + 484 "#>>": TokenType.DHASH_ARROW, + 485 "<->": TokenType.LR_ARROW, + 486 "&&": TokenType.DAMP, + 487 "ALL": TokenType.ALL, + 488 "ALWAYS": TokenType.ALWAYS, + 489 "AND": TokenType.AND, + 490 "ANTI": TokenType.ANTI, + 491 "ANY": TokenType.ANY, + 492 "ASC": TokenType.ASC, + 493 "AS": TokenType.ALIAS, + 494 "AT TIME ZONE": TokenType.AT_TIME_ZONE, + 495 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, + 496 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, + 497 "BEGIN": TokenType.BEGIN, + 498 "BETWEEN": TokenType.BETWEEN, + 499 "BOTH": TokenType.BOTH, + 500 "BUCKET": TokenType.BUCKET, + 501 "BY DEFAULT": TokenType.BY_DEFAULT, + 502 "CACHE": TokenType.CACHE, + 503 "UNCACHE": TokenType.UNCACHE, + 504 "CASE": TokenType.CASE, + 505 "CASCADE": TokenType.CASCADE, + 506 "CHARACTER SET": TokenType.CHARACTER_SET, + 507 "CLUSTER BY": TokenType.CLUSTER_BY, + 508 "COLLATE": TokenType.COLLATE, + 509 "COLUMN": TokenType.COLUMN, + 510 "COMMIT": TokenType.COMMIT, + 511 "COMPOUND": TokenType.COMPOUND, + 512 "CONSTRAINT": TokenType.CONSTRAINT, + 513 "CREATE": TokenType.CREATE, + 514 "CROSS": TokenType.CROSS, + 515 "CUBE": TokenType.CUBE, + 516 "CURRENT_DATE": TokenType.CURRENT_DATE, + 517 "CURRENT ROW": TokenType.CURRENT_ROW, + 518 "CURRENT_TIME": TokenType.CURRENT_TIME, + 519 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, + 520 "CURRENT_USER": TokenType.CURRENT_USER, + 521 "DATABASE": TokenType.DATABASE, + 522 "DEFAULT": TokenType.DEFAULT, + 523 "DELETE": TokenType.DELETE, + 524 "DESC": TokenType.DESC, + 525 "DESCRIBE": TokenType.DESCRIBE, + 526 "DISTINCT": TokenType.DISTINCT, + 527 "DISTINCT FROM": TokenType.DISTINCT_FROM, + 528 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, + 529 "DIV": TokenType.DIV, + 530 "DROP": TokenType.DROP, + 531 "ELSE": TokenType.ELSE, + 532 "END": TokenType.END, + 533 "ESCAPE": TokenType.ESCAPE, + 534 "EXCEPT": TokenType.EXCEPT, + 535 "EXECUTE": TokenType.EXECUTE, + 536 "EXISTS": TokenType.EXISTS, + 537 "FALSE": TokenType.FALSE, + 538 "FETCH": TokenType.FETCH, + 539 "FILTER": TokenType.FILTER, + 540 "FIRST": TokenType.FIRST, + 541 "FULL": TokenType.FULL, + 542 "FUNCTION": TokenType.FUNCTION, + 543 "FOLLOWING": TokenType.FOLLOWING, + 544 "FOR": TokenType.FOR, + 545 "FOREIGN KEY": TokenType.FOREIGN_KEY, + 546 "FORMAT": TokenType.FORMAT, + 547 "FROM": TokenType.FROM, + 548 "GLOB": TokenType.GLOB, + 549 "GROUP BY": TokenType.GROUP_BY, + 550 "GROUPING SETS": TokenType.GROUPING_SETS, + 551 "HAVING": TokenType.HAVING, + 552 "IF": TokenType.IF, + 553 "ILIKE": TokenType.ILIKE, + 554 "IGNORE NULLS": TokenType.IGNORE_NULLS, + 555 "IN": TokenType.IN, + 556 "INDEX": TokenType.INDEX, + 557 "INET": TokenType.INET, + 558 "INNER": TokenType.INNER, + 559 "INSERT": TokenType.INSERT, + 560 "INTERVAL": TokenType.INTERVAL, + 561 
"INTERSECT": TokenType.INTERSECT, + 562 "INTO": TokenType.INTO, + 563 "IS": TokenType.IS, + 564 "ISNULL": TokenType.ISNULL, + 565 "JOIN": TokenType.JOIN, + 566 "KEEP": TokenType.KEEP, + 567 "LATERAL": TokenType.LATERAL, + 568 "LAZY": TokenType.LAZY, + 569 "LEADING": TokenType.LEADING, + 570 "LEFT": TokenType.LEFT, + 571 "LIKE": TokenType.LIKE, + 572 "LIMIT": TokenType.LIMIT, + 573 "LOAD DATA": TokenType.LOAD_DATA, + 574 "LOCAL": TokenType.LOCAL, + 575 "MATERIALIZED": TokenType.MATERIALIZED, + 576 "MERGE": TokenType.MERGE, + 577 "NATURAL": TokenType.NATURAL, + 578 "NEXT": TokenType.NEXT, + 579 "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR, + 580 "NO ACTION": TokenType.NO_ACTION, + 581 "NOT": TokenType.NOT, + 582 "NOTNULL": TokenType.NOTNULL, + 583 "NULL": TokenType.NULL, + 584 "NULLS FIRST": TokenType.NULLS_FIRST, + 585 "NULLS LAST": TokenType.NULLS_LAST, + 586 "OBJECT": TokenType.OBJECT, + 587 "OFFSET": TokenType.OFFSET, + 588 "ON": TokenType.ON, + 589 "ONLY": TokenType.ONLY, + 590 "OPTIONS": TokenType.OPTIONS, + 591 "OR": TokenType.OR, + 592 "ORDER BY": TokenType.ORDER_BY, + 593 "ORDINALITY": TokenType.ORDINALITY, + 594 "OUTER": TokenType.OUTER, + 595 "OUT OF": TokenType.OUT_OF, + 596 "OVER": TokenType.OVER, + 597 "OVERLAPS": TokenType.OVERLAPS, + 598 "OVERWRITE": TokenType.OVERWRITE, + 599 "PARTITION": TokenType.PARTITION, + 600 "PARTITION BY": TokenType.PARTITION_BY, + 601 "PARTITIONED BY": TokenType.PARTITION_BY, + 602 "PARTITIONED_BY": TokenType.PARTITION_BY, + 603 "PERCENT": TokenType.PERCENT, + 604 "PIVOT": TokenType.PIVOT, + 605 "PRAGMA": TokenType.PRAGMA, + 606 "PRECEDING": TokenType.PRECEDING, + 607 "PRIMARY KEY": TokenType.PRIMARY_KEY, + 608 "PROCEDURE": TokenType.PROCEDURE, + 609 "QUALIFY": TokenType.QUALIFY, + 610 "RANGE": TokenType.RANGE, + 611 "RECURSIVE": TokenType.RECURSIVE, + 612 "REGEXP": TokenType.RLIKE, + 613 "REPLACE": TokenType.REPLACE, + 614 "RESPECT NULLS": TokenType.RESPECT_NULLS, + 615 "REFERENCES": TokenType.REFERENCES, + 616 "RIGHT": TokenType.RIGHT, + 617 "RLIKE": TokenType.RLIKE, + 618 "ROLLBACK": TokenType.ROLLBACK, + 619 "ROLLUP": TokenType.ROLLUP, + 620 "ROW": TokenType.ROW, + 621 "ROWS": TokenType.ROWS, + 622 "SCHEMA": TokenType.SCHEMA, + 623 "SEED": TokenType.SEED, + 624 "SELECT": TokenType.SELECT, + 625 "SEMI": TokenType.SEMI, + 626 "SET": TokenType.SET, + 627 "SHOW": TokenType.SHOW, + 628 "SIMILAR TO": TokenType.SIMILAR_TO, + 629 "SOME": TokenType.SOME, + 630 "SORTKEY": TokenType.SORTKEY, + 631 "SORT BY": TokenType.SORT_BY, + 632 "TABLE": TokenType.TABLE, + 633 "TABLESAMPLE": TokenType.TABLE_SAMPLE, + 634 "TEMP": TokenType.TEMPORARY, + 635 "TEMPORARY": TokenType.TEMPORARY, + 636 "THEN": TokenType.THEN, + 637 "TRUE": TokenType.TRUE, + 638 "TRAILING": TokenType.TRAILING, + 639 "UNBOUNDED": TokenType.UNBOUNDED, + 640 "UNION": TokenType.UNION, + 641 "UNLOGGED": TokenType.UNLOGGED, + 642 "UNNEST": TokenType.UNNEST, + 643 "UNPIVOT": TokenType.UNPIVOT, + 644 "UPDATE": TokenType.UPDATE, + 645 "USE": TokenType.USE, + 646 "USING": TokenType.USING, + 647 "UUID": TokenType.UUID, + 648 "VALUES": TokenType.VALUES, + 649 "VIEW": TokenType.VIEW, + 650 "VOLATILE": TokenType.VOLATILE, + 651 "WHEN": TokenType.WHEN, + 652 "WHERE": TokenType.WHERE, + 653 "WINDOW": TokenType.WINDOW, + 654 "WITH": TokenType.WITH, + 655 "WITH TIME ZONE": TokenType.WITH_TIME_ZONE, + 656 "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE, + 657 "WITHIN GROUP": TokenType.WITHIN_GROUP, + 658 "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE, + 659 "APPLY": TokenType.APPLY, + 660 "ARRAY": 
TokenType.ARRAY, + 661 "BIT": TokenType.BIT, + 662 "BOOL": TokenType.BOOLEAN, + 663 "BOOLEAN": TokenType.BOOLEAN, + 664 "BYTE": TokenType.TINYINT, + 665 "TINYINT": TokenType.TINYINT, + 666 "SHORT": TokenType.SMALLINT, + 667 "SMALLINT": TokenType.SMALLINT, + 668 "INT2": TokenType.SMALLINT, + 669 "INTEGER": TokenType.INT, + 670 "INT": TokenType.INT, + 671 "INT4": TokenType.INT, + 672 "LONG": TokenType.BIGINT, + 673 "BIGINT": TokenType.BIGINT, + 674 "INT8": TokenType.BIGINT, + 675 "DEC": TokenType.DECIMAL, + 676 "DECIMAL": TokenType.DECIMAL, + 677 "BIGDECIMAL": TokenType.BIGDECIMAL, + 678 "BIGNUMERIC": TokenType.BIGDECIMAL, + 679 "MAP": TokenType.MAP, + 680 "NULLABLE": TokenType.NULLABLE, + 681 "NUMBER": TokenType.DECIMAL, + 682 "NUMERIC": TokenType.DECIMAL, + 683 "FIXED": TokenType.DECIMAL, + 684 "REAL": TokenType.FLOAT, + 685 "FLOAT": TokenType.FLOAT, + 686 "FLOAT4": TokenType.FLOAT, + 687 "FLOAT8": TokenType.DOUBLE, + 688 "DOUBLE": TokenType.DOUBLE, + 689 "DOUBLE PRECISION": TokenType.DOUBLE, + 690 "JSON": TokenType.JSON, + 691 "CHAR": TokenType.CHAR, + 692 "CHARACTER": TokenType.CHAR, + 693 "NCHAR": TokenType.NCHAR, + 694 "VARCHAR": TokenType.VARCHAR, + 695 "VARCHAR2": TokenType.VARCHAR, + 696 "NVARCHAR": TokenType.NVARCHAR, + 697 "NVARCHAR2": TokenType.NVARCHAR, + 698 "STR": TokenType.TEXT, + 699 "STRING": TokenType.TEXT, + 700 "TEXT": TokenType.TEXT, + 701 "CLOB": TokenType.TEXT, + 702 "LONGVARCHAR": TokenType.TEXT, + 703 "BINARY": TokenType.BINARY, + 704 "BLOB": TokenType.VARBINARY, + 705 "BYTEA": TokenType.VARBINARY, + 706 "VARBINARY": TokenType.VARBINARY, + 707 "TIME": TokenType.TIME, + 708 "TIMESTAMP": TokenType.TIMESTAMP, + 709 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, + 710 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, + 711 "DATE": TokenType.DATE, + 712 "DATETIME": TokenType.DATETIME, + 713 "UNIQUE": TokenType.UNIQUE, + 714 "STRUCT": TokenType.STRUCT, + 715 "VARIANT": TokenType.VARIANT, + 716 "ALTER": TokenType.ALTER, + 717 "ALTER AGGREGATE": TokenType.COMMAND, + 718 "ALTER DEFAULT": TokenType.COMMAND, + 719 "ALTER DOMAIN": TokenType.COMMAND, + 720 "ALTER ROLE": TokenType.COMMAND, + 721 "ALTER RULE": TokenType.COMMAND, + 722 "ALTER SEQUENCE": TokenType.COMMAND, + 723 "ALTER TYPE": TokenType.COMMAND, + 724 "ALTER USER": TokenType.COMMAND, + 725 "ALTER VIEW": TokenType.COMMAND, + 726 "ANALYZE": TokenType.COMMAND, + 727 "CALL": TokenType.COMMAND, + 728 "COMMENT": TokenType.COMMENT, + 729 "COPY": TokenType.COMMAND, + 730 "EXPLAIN": TokenType.COMMAND, + 731 "GRANT": TokenType.COMMAND, + 732 "OPTIMIZE": TokenType.COMMAND, + 733 "PREPARE": TokenType.COMMAND, + 734 "TRUNCATE": TokenType.COMMAND, + 735 "VACUUM": TokenType.COMMAND, + 736 } + 737 + 738 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { + 739 " ": TokenType.SPACE, + 740 "\t": TokenType.SPACE, + 741 "\n": TokenType.BREAK, + 742 "\r": TokenType.BREAK, + 743 "\r\n": TokenType.BREAK, + 744 } + 745 + 746 COMMANDS = { + 747 TokenType.COMMAND, + 748 TokenType.EXECUTE, + 749 TokenType.FETCH, + 750 TokenType.SHOW, + 751 } + 752 + 753 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 754 - 755 COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")] - 756 KEYWORD_TRIE: t.Dict = {} # autofilled - 757 - 758 IDENTIFIER_CAN_START_WITH_DIGIT = False - 759 - 760 __slots__ = ( - 761 "sql", - 762 "size", - 763 "tokens", - 764 "_start", - 765 "_current", - 766 "_line", - 767 "_col", - 768 "_comments", - 769 "_char", - 770 "_end", - 771 "_peek", - 772 "_prev_token_line", - 773 "_prev_token_comments", - 774 "_prev_token_type", - 775 ) - 776 - 777 def 
__init__(self) -> None: - 778 self.reset() - 779 - 780 def reset(self) -> None: - 781 self.sql = "" - 782 self.size = 0 - 783 self.tokens: t.List[Token] = [] - 784 self._start = 0 - 785 self._current = 0 - 786 self._line = 1 - 787 self._col = 1 - 788 self._comments: t.List[str] = [] - 789 - 790 self._char = "" - 791 self._end = False - 792 self._peek = "" - 793 self._prev_token_line = -1 - 794 self._prev_token_comments: t.List[str] = [] - 795 self._prev_token_type: t.Optional[TokenType] = None - 796 - 797 def tokenize(self, sql: str) -> t.List[Token]: - 798 """Returns a list of tokens corresponding to the SQL string `sql`.""" - 799 self.reset() - 800 self.sql = sql - 801 self.size = len(sql) - 802 try: - 803 self._scan() - 804 except Exception as e: - 805 start = self._current - 50 - 806 end = self._current + 50 - 807 start = start if start > 0 else 0 - 808 end = end if end < self.size else self.size - 1 - 809 context = self.sql[start:end] - 810 raise ValueError(f"Error tokenizing '{context}'") from e - 811 - 812 return self.tokens - 813 - 814 def _scan(self, until: t.Optional[t.Callable] = None) -> None: - 815 while self.size and not self._end: - 816 self._start = self._current - 817 self._advance() - 818 - 819 if self._char is None: - 820 break - 821 - 822 if self._char not in self.WHITE_SPACE: - 823 if self._char.isdigit(): - 824 self._scan_number() - 825 elif self._char in self._IDENTIFIERS: - 826 self._scan_identifier(self._IDENTIFIERS[self._char]) - 827 else: - 828 self._scan_keywords() - 829 - 830 if until and until(): - 831 break - 832 - 833 if self.tokens: - 834 self.tokens[-1].comments.extend(self._comments) - 835 - 836 def _chars(self, size: int) -> str: - 837 if size == 1: - 838 return self._char - 839 start = self._current - 1 - 840 end = start + size - 841 if end <= self.size: - 842 return self.sql[start:end] - 843 return "" - 844 - 845 def _advance(self, i: int = 1) -> None: - 846 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: - 847 self._col = 1 - 848 self._line += 1 - 849 else: - 850 self._col += i - 851 - 852 self._current += i - 853 self._end = self._current >= self.size - 854 self._char = self.sql[self._current - 1] - 855 self._peek = "" if self._end else self.sql[self._current] - 856 - 857 @property - 858 def _text(self) -> str: - 859 return self.sql[self._start : self._current] + 755 # handle numeric literals like in hive (3L = BIGINT) + 756 NUMERIC_LITERALS: t.Dict[str, str] = {} + 757 ENCODE: t.Optional[str] = None + 758 + 759 COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")] + 760 KEYWORD_TRIE: t.Dict = {} # autofilled + 761 + 762 IDENTIFIER_CAN_START_WITH_DIGIT = False + 763 + 764 __slots__ = ( + 765 "sql", + 766 "size", + 767 "tokens", + 768 "_start", + 769 "_current", + 770 "_line", + 771 "_col", + 772 "_comments", + 773 "_char", + 774 "_end", + 775 "_peek", + 776 "_prev_token_line", + 777 "_prev_token_comments", + 778 "_prev_token_type", + 779 ) + 780 + 781 def __init__(self) -> None: + 782 self.reset() + 783 + 784 def reset(self) -> None: + 785 self.sql = "" + 786 self.size = 0 + 787 self.tokens: t.List[Token] = [] + 788 self._start = 0 + 789 self._current = 0 + 790 self._line = 1 + 791 self._col = 1 + 792 self._comments: t.List[str] = [] + 793 + 794 self._char = "" + 795 self._end = False + 796 self._peek = "" + 797 self._prev_token_line = -1 + 798 self._prev_token_comments: t.List[str] = [] + 799 self._prev_token_type: t.Optional[TokenType] = None + 800 + 801 def tokenize(self, sql: str) -> t.List[Token]: + 802 """Returns a list of tokens corresponding 
to the SQL string `sql`.""" + 803 self.reset() + 804 self.sql = sql + 805 self.size = len(sql) + 806 try: + 807 self._scan() + 808 except Exception as e: + 809 start = self._current - 50 + 810 end = self._current + 50 + 811 start = start if start > 0 else 0 + 812 end = end if end < self.size else self.size - 1 + 813 context = self.sql[start:end] + 814 raise ValueError(f"Error tokenizing '{context}'") from e + 815 + 816 return self.tokens + 817 + 818 def _scan(self, until: t.Optional[t.Callable] = None) -> None: + 819 while self.size and not self._end: + 820 self._start = self._current + 821 self._advance() + 822 + 823 if self._char is None: + 824 break + 825 + 826 if self._char not in self.WHITE_SPACE: + 827 if self._char.isdigit(): + 828 self._scan_number() + 829 elif self._char in self._IDENTIFIERS: + 830 self._scan_identifier(self._IDENTIFIERS[self._char]) + 831 else: + 832 self._scan_keywords() + 833 + 834 if until and until(): + 835 break + 836 + 837 if self.tokens: + 838 self.tokens[-1].comments.extend(self._comments) + 839 + 840 def _chars(self, size: int) -> str: + 841 if size == 1: + 842 return self._char + 843 start = self._current - 1 + 844 end = start + size + 845 if end <= self.size: + 846 return self.sql[start:end] + 847 return "" + 848 + 849 def _advance(self, i: int = 1) -> None: + 850 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: + 851 self._col = 1 + 852 self._line += 1 + 853 else: + 854 self._col += i + 855 + 856 self._current += i + 857 self._end = self._current >= self.size + 858 self._char = self.sql[self._current - 1] + 859 self._peek = "" if self._end else self.sql[self._current] 860 - 861 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: - 862 self._prev_token_line = self._line - 863 self._prev_token_comments = self._comments - 864 self._prev_token_type = token_type - 865 self.tokens.append( - 866 Token( - 867 token_type, - 868 self._text if text is None else text, - 869 self._line, - 870 self._col, - 871 self._current, - 872 self._comments, - 873 ) - 874 ) - 875 self._comments = [] - 876 - 877 # If we have either a semicolon or a begin token before the command's token, we'll parse - 878 # whatever follows the command's token as a string - 879 if ( - 880 token_type in self.COMMANDS - 881 and self._peek != ";" - 882 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) - 883 ): - 884 start = self._current - 885 tokens = len(self.tokens) - 886 self._scan(lambda: self._peek == ";") - 887 self.tokens = self.tokens[:tokens] - 888 text = self.sql[start : self._current].strip() - 889 if text: - 890 self._add(TokenType.STRING, text) - 891 - 892 def _scan_keywords(self) -> None: - 893 size = 0 - 894 word = None - 895 chars = self._text - 896 char = chars - 897 prev_space = False - 898 skip = False - 899 trie = self.KEYWORD_TRIE - 900 single_token = char in self.SINGLE_TOKENS - 901 - 902 while chars: - 903 if skip: - 904 result = 1 - 905 else: - 906 result, trie = in_trie(trie, char.upper()) - 907 - 908 if result == 0: - 909 break - 910 if result == 2: - 911 word = chars - 912 size += 1 - 913 end = self._current - 1 + size - 914 - 915 if end < self.size: - 916 char = self.sql[end] - 917 single_token = single_token or char in self.SINGLE_TOKENS - 918 is_space = char in self.WHITE_SPACE - 919 - 920 if not is_space or not prev_space: - 921 if is_space: - 922 char = " " - 923 chars += char - 924 prev_space = is_space - 925 skip = False - 926 else: - 927 skip = True - 928 else: - 929 chars = " " - 930 - 931 
word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word - 932 - 933 if not word: - 934 if self._char in self.SINGLE_TOKENS: - 935 self._add(self.SINGLE_TOKENS[self._char], text=self._char) - 936 return - 937 self._scan_var() - 938 return - 939 - 940 if self._scan_string(word): - 941 return - 942 if self._scan_formatted_string(word): - 943 return - 944 if self._scan_comment(word): + 861 @property + 862 def _text(self) -> str: + 863 return self.sql[self._start : self._current] + 864 + 865 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: + 866 self._prev_token_line = self._line + 867 self._prev_token_comments = self._comments + 868 self._prev_token_type = token_type + 869 self.tokens.append( + 870 Token( + 871 token_type, + 872 self._text if text is None else text, + 873 self._line, + 874 self._col, + 875 self._current, + 876 self._comments, + 877 ) + 878 ) + 879 self._comments = [] + 880 + 881 # If we have either a semicolon or a begin token before the command's token, we'll parse + 882 # whatever follows the command's token as a string + 883 if ( + 884 token_type in self.COMMANDS + 885 and self._peek != ";" + 886 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) + 887 ): + 888 start = self._current + 889 tokens = len(self.tokens) + 890 self._scan(lambda: self._peek == ";") + 891 self.tokens = self.tokens[:tokens] + 892 text = self.sql[start : self._current].strip() + 893 if text: + 894 self._add(TokenType.STRING, text) + 895 + 896 def _scan_keywords(self) -> None: + 897 size = 0 + 898 word = None + 899 chars = self._text + 900 char = chars + 901 prev_space = False + 902 skip = False + 903 trie = self.KEYWORD_TRIE + 904 single_token = char in self.SINGLE_TOKENS + 905 + 906 while chars: + 907 if skip: + 908 result = 1 + 909 else: + 910 result, trie = in_trie(trie, char.upper()) + 911 + 912 if result == 0: + 913 break + 914 if result == 2: + 915 word = chars + 916 size += 1 + 917 end = self._current - 1 + size + 918 + 919 if end < self.size: + 920 char = self.sql[end] + 921 single_token = single_token or char in self.SINGLE_TOKENS + 922 is_space = char in self.WHITE_SPACE + 923 + 924 if not is_space or not prev_space: + 925 if is_space: + 926 char = " " + 927 chars += char + 928 prev_space = is_space + 929 skip = False + 930 else: + 931 skip = True + 932 else: + 933 chars = " " + 934 + 935 word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word + 936 + 937 if not word: + 938 if self._char in self.SINGLE_TOKENS: + 939 self._add(self.SINGLE_TOKENS[self._char], text=self._char) + 940 return + 941 self._scan_var() + 942 return + 943 + 944 if self._scan_string(word): 945 return - 946 - 947 self._advance(size - 1) - 948 word = word.upper() - 949 self._add(self.KEYWORDS[word], text=word) + 946 if self._scan_formatted_string(word): + 947 return + 948 if self._scan_comment(word): + 949 return 950 - 951 def _scan_comment(self, comment_start: str) -> bool: - 952 if comment_start not in self._COMMENTS: - 953 return False + 951 self._advance(size - 1) + 952 word = word.upper() + 953 self._add(self.KEYWORDS[word], text=word) 954 - 955 comment_start_line = self._line - 956 comment_start_size = len(comment_start) - 957 comment_end = self._COMMENTS[comment_start] + 955 def _scan_comment(self, comment_start: str) -> bool: + 956 if comment_start not in self._COMMENTS: + 957 return False 958 - 959 if comment_end: - 960 # Skip the comment's start delimiter - 961 self._advance(comment_start_size) + 959 
comment_start_line = self._line + 960 comment_start_size = len(comment_start) + 961 comment_end = self._COMMENTS[comment_start] 962 - 963 comment_end_size = len(comment_end) - 964 while not self._end and self._chars(comment_end_size) != comment_end: - 965 self._advance() + 963 if comment_end: + 964 # Skip the comment's start delimiter + 965 self._advance(comment_start_size) 966 - 967 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) - 968 self._advance(comment_end_size - 1) - 969 else: - 970 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: - 971 self._advance() - 972 self._comments.append(self._text[comment_start_size:]) - 973 - 974 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. - 975 # Multiple consecutive comments are preserved by appending them to the current comments list. - 976 if comment_start_line == self._prev_token_line: - 977 self.tokens[-1].comments.extend(self._comments) - 978 self._comments = [] - 979 self._prev_token_line = self._line - 980 - 981 return True - 982 - 983 def _scan_number(self) -> None: - 984 if self._char == "0": - 985 peek = self._peek.upper() - 986 if peek == "B": - 987 return self._scan_bits() - 988 elif peek == "X": - 989 return self._scan_hex() - 990 - 991 decimal = False - 992 scientific = 0 - 993 - 994 while True: - 995 if self._peek.isdigit(): - 996 self._advance() - 997 elif self._peek == "." and not decimal: - 998 decimal = True - 999 self._advance() -1000 elif self._peek in ("-", "+") and scientific == 1: -1001 scientific += 1 -1002 self._advance() -1003 elif self._peek.upper() == "E" and not scientific: -1004 scientific += 1 -1005 self._advance() -1006 elif self._peek.isidentifier(): -1007 number_text = self._text -1008 literal = "" -1009 -1010 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: -1011 literal += self._peek.upper() -1012 self._advance() + 967 comment_end_size = len(comment_end) + 968 while not self._end and self._chars(comment_end_size) != comment_end: + 969 self._advance() + 970 + 971 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) + 972 self._advance(comment_end_size - 1) + 973 else: + 974 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: + 975 self._advance() + 976 self._comments.append(self._text[comment_start_size:]) + 977 + 978 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. + 979 # Multiple consecutive comments are preserved by appending them to the current comments list. + 980 if comment_start_line == self._prev_token_line: + 981 self.tokens[-1].comments.extend(self._comments) + 982 self._comments = [] + 983 self._prev_token_line = self._line + 984 + 985 return True + 986 + 987 def _scan_number(self) -> None: + 988 if self._char == "0": + 989 peek = self._peek.upper() + 990 if peek == "B": + 991 return self._scan_bits() + 992 elif peek == "X": + 993 return self._scan_hex() + 994 + 995 decimal = False + 996 scientific = 0 + 997 + 998 while True: + 999 if self._peek.isdigit(): +1000 self._advance() +1001 elif self._peek == "." 
and not decimal: +1002 decimal = True +1003 self._advance() +1004 elif self._peek in ("-", "+") and scientific == 1: +1005 scientific += 1 +1006 self._advance() +1007 elif self._peek.upper() == "E" and not scientific: +1008 scientific += 1 +1009 self._advance() +1010 elif self._peek.isidentifier(): +1011 number_text = self._text +1012 literal = "" 1013 -1014 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) -1015 -1016 if token_type: -1017 self._add(TokenType.NUMBER, number_text) -1018 self._add(TokenType.DCOLON, "::") -1019 return self._add(token_type, literal) -1020 elif self.IDENTIFIER_CAN_START_WITH_DIGIT: -1021 return self._add(TokenType.VAR) -1022 -1023 self._add(TokenType.NUMBER, number_text) -1024 return self._advance(-len(literal)) -1025 else: -1026 return self._add(TokenType.NUMBER) -1027 -1028 def _scan_bits(self) -> None: -1029 self._advance() -1030 value = self._extract_value() -1031 try: -1032 self._add(TokenType.BIT_STRING, f"{int(value, 2)}") -1033 except ValueError: -1034 self._add(TokenType.IDENTIFIER) -1035 -1036 def _scan_hex(self) -> None: -1037 self._advance() -1038 value = self._extract_value() -1039 try: -1040 self._add(TokenType.HEX_STRING, f"{int(value, 16)}") -1041 except ValueError: -1042 self._add(TokenType.IDENTIFIER) -1043 -1044 def _extract_value(self) -> str: -1045 while True: -1046 char = self._peek.strip() -1047 if char and char not in self.SINGLE_TOKENS: -1048 self._advance() -1049 else: -1050 break -1051 -1052 return self._text -1053 -1054 def _scan_string(self, quote: str) -> bool: -1055 quote_end = self._QUOTES.get(quote) -1056 if quote_end is None: -1057 return False -1058 -1059 self._advance(len(quote)) -1060 text = self._extract_string(quote_end) -1061 text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text -1062 self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text) -1063 return True -1064 -1065 # X'1234, b'0110', E'\\\\\' etc. 
-1066 def _scan_formatted_string(self, string_start: str) -> bool: -1067 if string_start in self._HEX_STRINGS: -1068 delimiters = self._HEX_STRINGS -1069 token_type = TokenType.HEX_STRING -1070 base = 16 -1071 elif string_start in self._BIT_STRINGS: -1072 delimiters = self._BIT_STRINGS -1073 token_type = TokenType.BIT_STRING -1074 base = 2 -1075 elif string_start in self._BYTE_STRINGS: -1076 delimiters = self._BYTE_STRINGS -1077 token_type = TokenType.BYTE_STRING -1078 base = None -1079 else: -1080 return False -1081 -1082 self._advance(len(string_start)) -1083 string_end = delimiters[string_start] -1084 text = self._extract_string(string_end) +1014 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: +1015 literal += self._peek.upper() +1016 self._advance() +1017 +1018 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) +1019 +1020 if token_type: +1021 self._add(TokenType.NUMBER, number_text) +1022 self._add(TokenType.DCOLON, "::") +1023 return self._add(token_type, literal) +1024 elif self.IDENTIFIER_CAN_START_WITH_DIGIT: +1025 return self._add(TokenType.VAR) +1026 +1027 self._add(TokenType.NUMBER, number_text) +1028 return self._advance(-len(literal)) +1029 else: +1030 return self._add(TokenType.NUMBER) +1031 +1032 def _scan_bits(self) -> None: +1033 self._advance() +1034 value = self._extract_value() +1035 try: +1036 self._add(TokenType.BIT_STRING, f"{int(value, 2)}") +1037 except ValueError: +1038 self._add(TokenType.IDENTIFIER) +1039 +1040 def _scan_hex(self) -> None: +1041 self._advance() +1042 value = self._extract_value() +1043 try: +1044 self._add(TokenType.HEX_STRING, f"{int(value, 16)}") +1045 except ValueError: +1046 self._add(TokenType.IDENTIFIER) +1047 +1048 def _extract_value(self) -> str: +1049 while True: +1050 char = self._peek.strip() +1051 if char and char not in self.SINGLE_TOKENS: +1052 self._advance() +1053 else: +1054 break +1055 +1056 return self._text +1057 +1058 def _scan_string(self, quote: str) -> bool: +1059 quote_end = self._QUOTES.get(quote) +1060 if quote_end is None: +1061 return False +1062 +1063 self._advance(len(quote)) +1064 text = self._extract_string(quote_end) +1065 text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text +1066 self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text) +1067 return True +1068 +1069 # X'1234, b'0110', E'\\\\\' etc. 
+1070 def _scan_formatted_string(self, string_start: str) -> bool: +1071 if string_start in self._HEX_STRINGS: +1072 delimiters = self._HEX_STRINGS +1073 token_type = TokenType.HEX_STRING +1074 base = 16 +1075 elif string_start in self._BIT_STRINGS: +1076 delimiters = self._BIT_STRINGS +1077 token_type = TokenType.BIT_STRING +1078 base = 2 +1079 elif string_start in self._BYTE_STRINGS: +1080 delimiters = self._BYTE_STRINGS +1081 token_type = TokenType.BYTE_STRING +1082 base = None +1083 else: +1084 return False 1085 -1086 if base is None: -1087 self._add(token_type, text) -1088 else: -1089 try: -1090 self._add(token_type, f"{int(text, base)}") -1091 except: -1092 raise RuntimeError( -1093 f"Numeric string contains invalid characters from {self._line}:{self._start}" -1094 ) -1095 -1096 return True -1097 -1098 def _scan_identifier(self, identifier_end: str) -> None: -1099 text = "" -1100 identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES +1086 self._advance(len(string_start)) +1087 string_end = delimiters[string_start] +1088 text = self._extract_string(string_end) +1089 +1090 if base is None: +1091 self._add(token_type, text) +1092 else: +1093 try: +1094 self._add(token_type, f"{int(text, base)}") +1095 except: +1096 raise RuntimeError( +1097 f"Numeric string contains invalid characters from {self._line}:{self._start}" +1098 ) +1099 +1100 return True 1101 -1102 while True: -1103 if self._end: -1104 raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}") +1102 def _scan_identifier(self, identifier_end: str) -> None: +1103 text = "" +1104 identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES 1105 -1106 self._advance() -1107 if self._char == identifier_end: -1108 if identifier_end_is_escape and self._peek == identifier_end: -1109 text += identifier_end -1110 self._advance() -1111 continue -1112 -1113 break -1114 -1115 text += self._char +1106 while True: +1107 if self._end: +1108 raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}") +1109 +1110 self._advance() +1111 if self._char == identifier_end: +1112 if identifier_end_is_escape and self._peek == identifier_end: +1113 text += identifier_end +1114 self._advance() +1115 continue 1116 -1117 self._add(TokenType.IDENTIFIER, text) +1117 break 1118 -1119 def _scan_var(self) -> None: -1120 while True: -1121 char = self._peek.strip() -1122 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): -1123 self._advance() -1124 else: -1125 break -1126 self._add( -1127 TokenType.VAR -1128 if self._prev_token_type == TokenType.PARAMETER -1129 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) -1130 ) -1131 -1132 def _extract_string(self, delimiter: str) -> str: -1133 text = "" -1134 delim_size = len(delimiter) +1119 text += self._char +1120 +1121 self._add(TokenType.IDENTIFIER, text) +1122 +1123 def _scan_var(self) -> None: +1124 while True: +1125 char = self._peek.strip() +1126 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): +1127 self._advance() +1128 else: +1129 break +1130 self._add( +1131 TokenType.VAR +1132 if self._prev_token_type == TokenType.PARAMETER +1133 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) +1134 ) 1135 -1136 while True: -1137 if self._char in self._STRING_ESCAPES and ( -1138 self._peek == delimiter or self._peek in self._STRING_ESCAPES -1139 ): -1140 if self._peek == delimiter: -1141 text += self._peek -1142 else: -1143 text += self._char + self._peek -1144 -1145 if self._current 
+ 1 < self.size: -1146 self._advance(2) -1147 else: -1148 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}") -1149 else: -1150 if self._chars(delim_size) == delimiter: -1151 if delim_size > 1: -1152 self._advance(delim_size - 1) -1153 break -1154 -1155 if self._end: -1156 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}") -1157 text += self._char -1158 self._advance() -1159 -1160 return text +1136 def _extract_string(self, delimiter: str) -> str: +1137 text = "" +1138 delim_size = len(delimiter) +1139 +1140 while True: +1141 if self._char in self._STRING_ESCAPES and ( +1142 self._peek == delimiter or self._peek in self._STRING_ESCAPES +1143 ): +1144 if self._peek == delimiter: +1145 text += self._peek +1146 else: +1147 text += self._char + self._peek +1148 +1149 if self._current + 1 < self.size: +1150 self._advance(2) +1151 else: +1152 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}") +1153 else: +1154 if self._chars(delim_size) == delimiter: +1155 if delim_size > 1: +1156 self._advance(delim_size - 1) +1157 break +1158 +1159 if self._end: +1160 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}") +1161 text += self._char +1162 self._advance() +1163 +1164 return text @@ -2372,104 +2382,106 @@ 216 ISNULL = auto() 217 JOIN = auto() 218 JOIN_MARKER = auto() -219 LANGUAGE = auto() -220 LATERAL = auto() -221 LAZY = auto() -222 LEADING = auto() -223 LEFT = auto() -224 LIKE = auto() -225 LIKE_ANY = auto() -226 LIMIT = auto() -227 LOAD_DATA = auto() -228 LOCAL = auto() -229 MAP = auto() -230 MATCH_RECOGNIZE = auto() -231 MATERIALIZED = auto() -232 MERGE = auto() -233 MOD = auto() -234 NATURAL = auto() -235 NEXT = auto() -236 NO_ACTION = auto() -237 NOTNULL = auto() -238 NULL = auto() -239 NULLS_FIRST = auto() -240 NULLS_LAST = auto() -241 OFFSET = auto() -242 ON = auto() -243 ONLY = auto() -244 OPTIONS = auto() -245 ORDER_BY = auto() -246 ORDERED = auto() -247 ORDINALITY = auto() -248 OUTER = auto() -249 OUT_OF = auto() -250 OVER = auto() -251 OVERLAPS = auto() -252 OVERWRITE = auto() -253 PARTITION = auto() -254 PARTITION_BY = auto() -255 PERCENT = auto() -256 PIVOT = auto() -257 PLACEHOLDER = auto() -258 PRAGMA = auto() -259 PRECEDING = auto() -260 PRIMARY_KEY = auto() -261 PROCEDURE = auto() -262 PROPERTIES = auto() -263 PSEUDO_TYPE = auto() -264 QUALIFY = auto() -265 QUOTE = auto() -266 RANGE = auto() -267 RECURSIVE = auto() -268 REPLACE = auto() -269 RESPECT_NULLS = auto() -270 RETURNING = auto() -271 REFERENCES = auto() -272 RIGHT = auto() -273 RLIKE = auto() -274 ROLLBACK = auto() -275 ROLLUP = auto() -276 ROW = auto() -277 ROWS = auto() -278 SEED = auto() -279 SELECT = auto() -280 SEMI = auto() -281 SEPARATOR = auto() -282 SERDE_PROPERTIES = auto() -283 SET = auto() -284 SHOW = auto() -285 SIMILAR_TO = auto() -286 SOME = auto() -287 SORTKEY = auto() -288 SORT_BY = auto() -289 STRUCT = auto() -290 TABLE_SAMPLE = auto() -291 TEMPORARY = auto() -292 TOP = auto() -293 THEN = auto() -294 TRAILING = auto() -295 TRUE = auto() -296 UNBOUNDED = auto() -297 UNCACHE = auto() -298 UNION = auto() -299 UNLOGGED = auto() -300 UNNEST = auto() -301 UNPIVOT = auto() -302 UPDATE = auto() -303 USE = auto() -304 USING = auto() -305 VALUES = auto() -306 VIEW = auto() -307 VOLATILE = auto() -308 WHEN = auto() -309 WHERE = auto() -310 WINDOW = auto() -311 WITH = auto() -312 WITH_TIME_ZONE = auto() -313 WITH_LOCAL_TIME_ZONE = auto() -314 WITHIN_GROUP = auto() -315 WITHOUT_TIME_ZONE = auto() -316 UNIQUE = auto() +219 
KEEP = auto() +220 LANGUAGE = auto() +221 LATERAL = auto() +222 LAZY = auto() +223 LEADING = auto() +224 LEFT = auto() +225 LIKE = auto() +226 LIKE_ANY = auto() +227 LIMIT = auto() +228 LOAD_DATA = auto() +229 LOCAL = auto() +230 MAP = auto() +231 MATCH_RECOGNIZE = auto() +232 MATERIALIZED = auto() +233 MERGE = auto() +234 MOD = auto() +235 NATURAL = auto() +236 NEXT = auto() +237 NEXT_VALUE_FOR = auto() +238 NO_ACTION = auto() +239 NOTNULL = auto() +240 NULL = auto() +241 NULLS_FIRST = auto() +242 NULLS_LAST = auto() +243 OFFSET = auto() +244 ON = auto() +245 ONLY = auto() +246 OPTIONS = auto() +247 ORDER_BY = auto() +248 ORDERED = auto() +249 ORDINALITY = auto() +250 OUTER = auto() +251 OUT_OF = auto() +252 OVER = auto() +253 OVERLAPS = auto() +254 OVERWRITE = auto() +255 PARTITION = auto() +256 PARTITION_BY = auto() +257 PERCENT = auto() +258 PIVOT = auto() +259 PLACEHOLDER = auto() +260 PRAGMA = auto() +261 PRECEDING = auto() +262 PRIMARY_KEY = auto() +263 PROCEDURE = auto() +264 PROPERTIES = auto() +265 PSEUDO_TYPE = auto() +266 QUALIFY = auto() +267 QUOTE = auto() +268 RANGE = auto() +269 RECURSIVE = auto() +270 REPLACE = auto() +271 RESPECT_NULLS = auto() +272 RETURNING = auto() +273 REFERENCES = auto() +274 RIGHT = auto() +275 RLIKE = auto() +276 ROLLBACK = auto() +277 ROLLUP = auto() +278 ROW = auto() +279 ROWS = auto() +280 SEED = auto() +281 SELECT = auto() +282 SEMI = auto() +283 SEPARATOR = auto() +284 SERDE_PROPERTIES = auto() +285 SET = auto() +286 SHOW = auto() +287 SIMILAR_TO = auto() +288 SOME = auto() +289 SORTKEY = auto() +290 SORT_BY = auto() +291 STRUCT = auto() +292 TABLE_SAMPLE = auto() +293 TEMPORARY = auto() +294 TOP = auto() +295 THEN = auto() +296 TRAILING = auto() +297 TRUE = auto() +298 UNBOUNDED = auto() +299 UNCACHE = auto() +300 UNION = auto() +301 UNLOGGED = auto() +302 UNNEST = auto() +303 UNPIVOT = auto() +304 UPDATE = auto() +305 USE = auto() +306 USING = auto() +307 VALUES = auto() +308 VIEW = auto() +309 VOLATILE = auto() +310 WHEN = auto() +311 WHERE = auto() +312 WINDOW = auto() +313 WITH = auto() +314 WITH_TIME_ZONE = auto() +315 WITH_LOCAL_TIME_ZONE = auto() +316 WITHIN_GROUP = auto() +317 WITHOUT_TIME_ZONE = auto() +318 UNIQUE = auto() @@ -4876,6 +4888,18 @@ + +
+    KEEP = <TokenType.KEEP: 'KEEP'>

@@ -5080,6 +5104,18 @@

+    NEXT_VALUE_FOR = <TokenType.NEXT_VALUE_FOR: 'NEXT_VALUE_FOR'>

@@ -6075,54 +6111,54 @@
    -
    319class Token:
    -320    __slots__ = ("token_type", "text", "line", "col", "end", "comments")
    -321
    -322    @classmethod
    -323    def number(cls, number: int) -> Token:
    -324        """Returns a NUMBER token with `number` as its text."""
    -325        return cls(TokenType.NUMBER, str(number))
    -326
    -327    @classmethod
    -328    def string(cls, string: str) -> Token:
    -329        """Returns a STRING token with `string` as its text."""
    -330        return cls(TokenType.STRING, string)
    -331
    -332    @classmethod
    -333    def identifier(cls, identifier: str) -> Token:
    -334        """Returns an IDENTIFIER token with `identifier` as its text."""
    -335        return cls(TokenType.IDENTIFIER, identifier)
    -336
    -337    @classmethod
    -338    def var(cls, var: str) -> Token:
    -339        """Returns a VAR token with `var` as its text."""
    -340        return cls(TokenType.VAR, var)
    -341
    -342    def __init__(
    -343        self,
    -344        token_type: TokenType,
    -345        text: str,
    -346        line: int = 1,
    -347        col: int = 1,
    -348        end: int = 0,
    -349        comments: t.List[str] = [],
    -350    ) -> None:
    -351        self.token_type = token_type
    -352        self.text = text
    -353        self.line = line
    -354        size = len(text)
    -355        self.col = col
    -356        self.end = end if end else size
    -357        self.comments = comments
    -358
    -359    @property
    -360    def start(self) -> int:
    -361        """Returns the start of the token."""
    -362        return self.end - len(self.text)
    -363
    -364    def __repr__(self) -> str:
    -365        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
    -366        return f"<Token {attributes}>"
    +            
    321class Token:
    +322    __slots__ = ("token_type", "text", "line", "col", "end", "comments")
    +323
    +324    @classmethod
    +325    def number(cls, number: int) -> Token:
    +326        """Returns a NUMBER token with `number` as its text."""
    +327        return cls(TokenType.NUMBER, str(number))
    +328
    +329    @classmethod
    +330    def string(cls, string: str) -> Token:
    +331        """Returns a STRING token with `string` as its text."""
    +332        return cls(TokenType.STRING, string)
    +333
    +334    @classmethod
    +335    def identifier(cls, identifier: str) -> Token:
    +336        """Returns an IDENTIFIER token with `identifier` as its text."""
    +337        return cls(TokenType.IDENTIFIER, identifier)
    +338
    +339    @classmethod
    +340    def var(cls, var: str) -> Token:
    +341        """Returns an VAR token with `var` as its text."""
    +342        return cls(TokenType.VAR, var)
    +343
    +344    def __init__(
    +345        self,
    +346        token_type: TokenType,
    +347        text: str,
    +348        line: int = 1,
    +349        col: int = 1,
    +350        end: int = 0,
    +351        comments: t.List[str] = [],
    +352    ) -> None:
    +353        self.token_type = token_type
    +354        self.text = text
    +355        self.line = line
    +356        size = len(text)
    +357        self.col = col
    +358        self.end = end if end else size
    +359        self.comments = comments
    +360
    +361    @property
    +362    def start(self) -> int:
    +363        """Returns the start of the token."""
    +364        return self.end - len(self.text)
    +365
    +366    def __repr__(self) -> str:
    +367        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
    +368        return f"<Token {attributes}>"
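For orientation, here is a minimal usage sketch of the Token helpers shown in the hunk above; it is not part of the patch and relies only on the classmethod constructors, the `end` default of `len(text)`, and the `start` property visible in this listing.

    from sqlglot.tokens import Token, TokenType

    # The factory helpers pick the TokenType and use the argument as the token text.
    num = Token.number(42)            # TokenType.NUMBER, text "42"
    name = Token.identifier("col")    # TokenType.IDENTIFIER, text "col"

    # end defaults to len(text), so start is derived by walking back from end.
    assert num.token_type is TokenType.NUMBER
    assert name.start == name.end - len(name.text)
    print(num)  # <Token token_type: TokenType.NUMBER, text: 42, line: 1, col: 1, end: 2, comments: []>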
     
    @@ -6138,22 +6174,22 @@
    -
    342    def __init__(
    -343        self,
    -344        token_type: TokenType,
    -345        text: str,
    -346        line: int = 1,
    -347        col: int = 1,
    -348        end: int = 0,
    -349        comments: t.List[str] = [],
    -350    ) -> None:
    -351        self.token_type = token_type
    -352        self.text = text
    -353        self.line = line
    -354        size = len(text)
    -355        self.col = col
    -356        self.end = end if end else size
    -357        self.comments = comments
    +            
    344    def __init__(
    +345        self,
    +346        token_type: TokenType,
    +347        text: str,
    +348        line: int = 1,
    +349        col: int = 1,
    +350        end: int = 0,
    +351        comments: t.List[str] = [],
    +352    ) -> None:
    +353        self.token_type = token_type
    +354        self.text = text
    +355        self.line = line
    +356        size = len(text)
    +357        self.col = col
    +358        self.end = end if end else size
    +359        self.comments = comments
     
    @@ -6172,10 +6208,10 @@
    -
    322    @classmethod
    -323    def number(cls, number: int) -> Token:
    -324        """Returns a NUMBER token with `number` as its text."""
    -325        return cls(TokenType.NUMBER, str(number))
    +            
    324    @classmethod
    +325    def number(cls, number: int) -> Token:
    +326        """Returns a NUMBER token with `number` as its text."""
    +327        return cls(TokenType.NUMBER, str(number))
     
    @@ -6196,10 +6232,10 @@
    -
    327    @classmethod
    -328    def string(cls, string: str) -> Token:
    -329        """Returns a STRING token with `string` as its text."""
    -330        return cls(TokenType.STRING, string)
    +            
    329    @classmethod
    +330    def string(cls, string: str) -> Token:
    +331        """Returns a STRING token with `string` as its text."""
    +332        return cls(TokenType.STRING, string)
     
    @@ -6220,10 +6256,10 @@
    -
    332    @classmethod
    -333    def identifier(cls, identifier: str) -> Token:
    -334        """Returns an IDENTIFIER token with `identifier` as its text."""
    -335        return cls(TokenType.IDENTIFIER, identifier)
    +            
    334    @classmethod
    +335    def identifier(cls, identifier: str) -> Token:
    +336        """Returns an IDENTIFIER token with `identifier` as its text."""
    +337        return cls(TokenType.IDENTIFIER, identifier)
     
    @@ -6244,10 +6280,10 @@
    -
    337    @classmethod
    -338    def var(cls, var: str) -> Token:
    -339        """Returns a VAR token with `var` as its text."""
    -340        return cls(TokenType.VAR, var)
    +            
    339    @classmethod
    +340    def var(cls, var: str) -> Token:
    +341        """Returns a VAR token with `var` as its text."""
    +342        return cls(TokenType.VAR, var)
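The Tokenizer class rewritten in the next hunk keeps the same public entry point before and after this change, so a minimal usage sketch (not part of the patch; the SQL text is illustrative only) looks like this:

    from sqlglot.tokens import Tokenizer, TokenType

    tokens = Tokenizer().tokenize("SELECT 1 FROM t")

    # Each entry is a Token carrying its type, matched text and position.
    assert tokens[0].token_type is TokenType.SELECT
    print([(tok.token_type, tok.text) for tok in tokens])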
     
    @@ -6281,759 +6317,761 @@
    -
     409class Tokenizer(metaclass=_Tokenizer):
    - 410    SINGLE_TOKENS = {
    - 411        "(": TokenType.L_PAREN,
    - 412        ")": TokenType.R_PAREN,
    - 413        "[": TokenType.L_BRACKET,
    - 414        "]": TokenType.R_BRACKET,
    - 415        "{": TokenType.L_BRACE,
    - 416        "}": TokenType.R_BRACE,
    - 417        "&": TokenType.AMP,
    - 418        "^": TokenType.CARET,
    - 419        ":": TokenType.COLON,
    - 420        ",": TokenType.COMMA,
    - 421        ".": TokenType.DOT,
    - 422        "-": TokenType.DASH,
    - 423        "=": TokenType.EQ,
    - 424        ">": TokenType.GT,
    - 425        "<": TokenType.LT,
    - 426        "%": TokenType.MOD,
    - 427        "!": TokenType.NOT,
    - 428        "|": TokenType.PIPE,
    - 429        "+": TokenType.PLUS,
    - 430        ";": TokenType.SEMICOLON,
    - 431        "/": TokenType.SLASH,
    - 432        "\\": TokenType.BACKSLASH,
    - 433        "*": TokenType.STAR,
    - 434        "~": TokenType.TILDA,
    - 435        "?": TokenType.PLACEHOLDER,
    - 436        "@": TokenType.PARAMETER,
    - 437        # used for breaking a var like x'y' but nothing else
    - 438        # the token type doesn't matter
    - 439        "'": TokenType.QUOTE,
    - 440        "`": TokenType.IDENTIFIER,
    - 441        '"': TokenType.IDENTIFIER,
    - 442        "#": TokenType.HASH,
    - 443    }
    - 444
    - 445    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    - 446    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    - 447    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    - 448    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    - 449    IDENTIFIER_ESCAPES = ['"']
    - 450    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    - 451    STRING_ESCAPES = ["'"]
    - 452    VAR_SINGLE_TOKENS: t.Set[str] = set()
    - 453
    - 454    _COMMENTS: t.Dict[str, str] = {}
    - 455    _BIT_STRINGS: t.Dict[str, str] = {}
    - 456    _BYTE_STRINGS: t.Dict[str, str] = {}
    - 457    _HEX_STRINGS: t.Dict[str, str] = {}
    - 458    _IDENTIFIERS: t.Dict[str, str] = {}
    - 459    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    - 460    _QUOTES: t.Dict[str, str] = {}
    - 461    _STRING_ESCAPES: t.Set[str] = set()
    - 462
    - 463    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
    - 464        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
    - 465        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
    - 466        "{{+": TokenType.BLOCK_START,
    - 467        "{{-": TokenType.BLOCK_START,
    - 468        "+}}": TokenType.BLOCK_END,
    - 469        "-}}": TokenType.BLOCK_END,
    - 470        "/*+": TokenType.HINT,
    - 471        "==": TokenType.EQ,
    - 472        "::": TokenType.DCOLON,
    - 473        "||": TokenType.DPIPE,
    - 474        ">=": TokenType.GTE,
    - 475        "<=": TokenType.LTE,
    - 476        "<>": TokenType.NEQ,
    - 477        "!=": TokenType.NEQ,
    - 478        "<=>": TokenType.NULLSAFE_EQ,
    - 479        "->": TokenType.ARROW,
    - 480        "->>": TokenType.DARROW,
    - 481        "=>": TokenType.FARROW,
    - 482        "#>": TokenType.HASH_ARROW,
    - 483        "#>>": TokenType.DHASH_ARROW,
    - 484        "<->": TokenType.LR_ARROW,
    - 485        "&&": TokenType.DAMP,
    - 486        "ALL": TokenType.ALL,
    - 487        "ALWAYS": TokenType.ALWAYS,
    - 488        "AND": TokenType.AND,
    - 489        "ANTI": TokenType.ANTI,
    - 490        "ANY": TokenType.ANY,
    - 491        "ASC": TokenType.ASC,
    - 492        "AS": TokenType.ALIAS,
    - 493        "AT TIME ZONE": TokenType.AT_TIME_ZONE,
    - 494        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
    - 495        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
    - 496        "BEGIN": TokenType.BEGIN,
    - 497        "BETWEEN": TokenType.BETWEEN,
    - 498        "BOTH": TokenType.BOTH,
    - 499        "BUCKET": TokenType.BUCKET,
    - 500        "BY DEFAULT": TokenType.BY_DEFAULT,
    - 501        "CACHE": TokenType.CACHE,
    - 502        "UNCACHE": TokenType.UNCACHE,
    - 503        "CASE": TokenType.CASE,
    - 504        "CASCADE": TokenType.CASCADE,
    - 505        "CHARACTER SET": TokenType.CHARACTER_SET,
    - 506        "CLUSTER BY": TokenType.CLUSTER_BY,
    - 507        "COLLATE": TokenType.COLLATE,
    - 508        "COLUMN": TokenType.COLUMN,
    - 509        "COMMIT": TokenType.COMMIT,
    - 510        "COMPOUND": TokenType.COMPOUND,
    - 511        "CONSTRAINT": TokenType.CONSTRAINT,
    - 512        "CREATE": TokenType.CREATE,
    - 513        "CROSS": TokenType.CROSS,
    - 514        "CUBE": TokenType.CUBE,
    - 515        "CURRENT_DATE": TokenType.CURRENT_DATE,
    - 516        "CURRENT ROW": TokenType.CURRENT_ROW,
    - 517        "CURRENT_TIME": TokenType.CURRENT_TIME,
    - 518        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
    - 519        "CURRENT_USER": TokenType.CURRENT_USER,
    - 520        "DATABASE": TokenType.DATABASE,
    - 521        "DEFAULT": TokenType.DEFAULT,
    - 522        "DELETE": TokenType.DELETE,
    - 523        "DESC": TokenType.DESC,
    - 524        "DESCRIBE": TokenType.DESCRIBE,
    - 525        "DISTINCT": TokenType.DISTINCT,
    - 526        "DISTINCT FROM": TokenType.DISTINCT_FROM,
    - 527        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
    - 528        "DIV": TokenType.DIV,
    - 529        "DROP": TokenType.DROP,
    - 530        "ELSE": TokenType.ELSE,
    - 531        "END": TokenType.END,
    - 532        "ESCAPE": TokenType.ESCAPE,
    - 533        "EXCEPT": TokenType.EXCEPT,
    - 534        "EXECUTE": TokenType.EXECUTE,
    - 535        "EXISTS": TokenType.EXISTS,
    - 536        "FALSE": TokenType.FALSE,
    - 537        "FETCH": TokenType.FETCH,
    - 538        "FILTER": TokenType.FILTER,
    - 539        "FIRST": TokenType.FIRST,
    - 540        "FULL": TokenType.FULL,
    - 541        "FUNCTION": TokenType.FUNCTION,
    - 542        "FOLLOWING": TokenType.FOLLOWING,
    - 543        "FOR": TokenType.FOR,
    - 544        "FOREIGN KEY": TokenType.FOREIGN_KEY,
    - 545        "FORMAT": TokenType.FORMAT,
    - 546        "FROM": TokenType.FROM,
    - 547        "GLOB": TokenType.GLOB,
    - 548        "GROUP BY": TokenType.GROUP_BY,
    - 549        "GROUPING SETS": TokenType.GROUPING_SETS,
    - 550        "HAVING": TokenType.HAVING,
    - 551        "IF": TokenType.IF,
    - 552        "ILIKE": TokenType.ILIKE,
    - 553        "IGNORE NULLS": TokenType.IGNORE_NULLS,
    - 554        "IN": TokenType.IN,
    - 555        "INDEX": TokenType.INDEX,
    - 556        "INET": TokenType.INET,
    - 557        "INNER": TokenType.INNER,
    - 558        "INSERT": TokenType.INSERT,
    - 559        "INTERVAL": TokenType.INTERVAL,
    - 560        "INTERSECT": TokenType.INTERSECT,
    - 561        "INTO": TokenType.INTO,
    - 562        "IS": TokenType.IS,
    - 563        "ISNULL": TokenType.ISNULL,
    - 564        "JOIN": TokenType.JOIN,
    - 565        "LATERAL": TokenType.LATERAL,
    - 566        "LAZY": TokenType.LAZY,
    - 567        "LEADING": TokenType.LEADING,
    - 568        "LEFT": TokenType.LEFT,
    - 569        "LIKE": TokenType.LIKE,
    - 570        "LIMIT": TokenType.LIMIT,
    - 571        "LOAD DATA": TokenType.LOAD_DATA,
    - 572        "LOCAL": TokenType.LOCAL,
    - 573        "MATERIALIZED": TokenType.MATERIALIZED,
    - 574        "MERGE": TokenType.MERGE,
    - 575        "NATURAL": TokenType.NATURAL,
    - 576        "NEXT": TokenType.NEXT,
    - 577        "NO ACTION": TokenType.NO_ACTION,
    - 578        "NOT": TokenType.NOT,
    - 579        "NOTNULL": TokenType.NOTNULL,
    - 580        "NULL": TokenType.NULL,
    - 581        "NULLS FIRST": TokenType.NULLS_FIRST,
    - 582        "NULLS LAST": TokenType.NULLS_LAST,
    - 583        "OBJECT": TokenType.OBJECT,
    - 584        "OFFSET": TokenType.OFFSET,
    - 585        "ON": TokenType.ON,
    - 586        "ONLY": TokenType.ONLY,
    - 587        "OPTIONS": TokenType.OPTIONS,
    - 588        "OR": TokenType.OR,
    - 589        "ORDER BY": TokenType.ORDER_BY,
    - 590        "ORDINALITY": TokenType.ORDINALITY,
    - 591        "OUTER": TokenType.OUTER,
    - 592        "OUT OF": TokenType.OUT_OF,
    - 593        "OVER": TokenType.OVER,
    - 594        "OVERLAPS": TokenType.OVERLAPS,
    - 595        "OVERWRITE": TokenType.OVERWRITE,
    - 596        "PARTITION": TokenType.PARTITION,
    - 597        "PARTITION BY": TokenType.PARTITION_BY,
    - 598        "PARTITIONED BY": TokenType.PARTITION_BY,
    - 599        "PARTITIONED_BY": TokenType.PARTITION_BY,
    - 600        "PERCENT": TokenType.PERCENT,
    - 601        "PIVOT": TokenType.PIVOT,
    - 602        "PRAGMA": TokenType.PRAGMA,
    - 603        "PRECEDING": TokenType.PRECEDING,
    - 604        "PRIMARY KEY": TokenType.PRIMARY_KEY,
    - 605        "PROCEDURE": TokenType.PROCEDURE,
    - 606        "QUALIFY": TokenType.QUALIFY,
    - 607        "RANGE": TokenType.RANGE,
    - 608        "RECURSIVE": TokenType.RECURSIVE,
    - 609        "REGEXP": TokenType.RLIKE,
    - 610        "REPLACE": TokenType.REPLACE,
    - 611        "RESPECT NULLS": TokenType.RESPECT_NULLS,
    - 612        "REFERENCES": TokenType.REFERENCES,
    - 613        "RIGHT": TokenType.RIGHT,
    - 614        "RLIKE": TokenType.RLIKE,
    - 615        "ROLLBACK": TokenType.ROLLBACK,
    - 616        "ROLLUP": TokenType.ROLLUP,
    - 617        "ROW": TokenType.ROW,
    - 618        "ROWS": TokenType.ROWS,
    - 619        "SCHEMA": TokenType.SCHEMA,
    - 620        "SEED": TokenType.SEED,
    - 621        "SELECT": TokenType.SELECT,
    - 622        "SEMI": TokenType.SEMI,
    - 623        "SET": TokenType.SET,
    - 624        "SHOW": TokenType.SHOW,
    - 625        "SIMILAR TO": TokenType.SIMILAR_TO,
    - 626        "SOME": TokenType.SOME,
    - 627        "SORTKEY": TokenType.SORTKEY,
    - 628        "SORT BY": TokenType.SORT_BY,
    - 629        "TABLE": TokenType.TABLE,
    - 630        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
    - 631        "TEMP": TokenType.TEMPORARY,
    - 632        "TEMPORARY": TokenType.TEMPORARY,
    - 633        "THEN": TokenType.THEN,
    - 634        "TRUE": TokenType.TRUE,
    - 635        "TRAILING": TokenType.TRAILING,
    - 636        "UNBOUNDED": TokenType.UNBOUNDED,
    - 637        "UNION": TokenType.UNION,
    - 638        "UNLOGGED": TokenType.UNLOGGED,
    - 639        "UNNEST": TokenType.UNNEST,
    - 640        "UNPIVOT": TokenType.UNPIVOT,
    - 641        "UPDATE": TokenType.UPDATE,
    - 642        "USE": TokenType.USE,
    - 643        "USING": TokenType.USING,
    - 644        "UUID": TokenType.UUID,
    - 645        "VALUES": TokenType.VALUES,
    - 646        "VIEW": TokenType.VIEW,
    - 647        "VOLATILE": TokenType.VOLATILE,
    - 648        "WHEN": TokenType.WHEN,
    - 649        "WHERE": TokenType.WHERE,
    - 650        "WINDOW": TokenType.WINDOW,
    - 651        "WITH": TokenType.WITH,
    - 652        "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
    - 653        "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
    - 654        "WITHIN GROUP": TokenType.WITHIN_GROUP,
    - 655        "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
    - 656        "APPLY": TokenType.APPLY,
    - 657        "ARRAY": TokenType.ARRAY,
    - 658        "BIT": TokenType.BIT,
    - 659        "BOOL": TokenType.BOOLEAN,
    - 660        "BOOLEAN": TokenType.BOOLEAN,
    - 661        "BYTE": TokenType.TINYINT,
    - 662        "TINYINT": TokenType.TINYINT,
    - 663        "SHORT": TokenType.SMALLINT,
    - 664        "SMALLINT": TokenType.SMALLINT,
    - 665        "INT2": TokenType.SMALLINT,
    - 666        "INTEGER": TokenType.INT,
    - 667        "INT": TokenType.INT,
    - 668        "INT4": TokenType.INT,
    - 669        "LONG": TokenType.BIGINT,
    - 670        "BIGINT": TokenType.BIGINT,
    - 671        "INT8": TokenType.BIGINT,
    - 672        "DEC": TokenType.DECIMAL,
    - 673        "DECIMAL": TokenType.DECIMAL,
    - 674        "BIGDECIMAL": TokenType.BIGDECIMAL,
    - 675        "BIGNUMERIC": TokenType.BIGDECIMAL,
    - 676        "MAP": TokenType.MAP,
    - 677        "NULLABLE": TokenType.NULLABLE,
    - 678        "NUMBER": TokenType.DECIMAL,
    - 679        "NUMERIC": TokenType.DECIMAL,
    - 680        "FIXED": TokenType.DECIMAL,
    - 681        "REAL": TokenType.FLOAT,
    - 682        "FLOAT": TokenType.FLOAT,
    - 683        "FLOAT4": TokenType.FLOAT,
    - 684        "FLOAT8": TokenType.DOUBLE,
    - 685        "DOUBLE": TokenType.DOUBLE,
    - 686        "DOUBLE PRECISION": TokenType.DOUBLE,
    - 687        "JSON": TokenType.JSON,
    - 688        "CHAR": TokenType.CHAR,
    - 689        "CHARACTER": TokenType.CHAR,
    - 690        "NCHAR": TokenType.NCHAR,
    - 691        "VARCHAR": TokenType.VARCHAR,
    - 692        "VARCHAR2": TokenType.VARCHAR,
    - 693        "NVARCHAR": TokenType.NVARCHAR,
    - 694        "NVARCHAR2": TokenType.NVARCHAR,
    - 695        "STR": TokenType.TEXT,
    - 696        "STRING": TokenType.TEXT,
    - 697        "TEXT": TokenType.TEXT,
    - 698        "CLOB": TokenType.TEXT,
    - 699        "LONGVARCHAR": TokenType.TEXT,
    - 700        "BINARY": TokenType.BINARY,
    - 701        "BLOB": TokenType.VARBINARY,
    - 702        "BYTEA": TokenType.VARBINARY,
    - 703        "VARBINARY": TokenType.VARBINARY,
    - 704        "TIME": TokenType.TIME,
    - 705        "TIMESTAMP": TokenType.TIMESTAMP,
    - 706        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
    - 707        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
    - 708        "DATE": TokenType.DATE,
    - 709        "DATETIME": TokenType.DATETIME,
    - 710        "UNIQUE": TokenType.UNIQUE,
    - 711        "STRUCT": TokenType.STRUCT,
    - 712        "VARIANT": TokenType.VARIANT,
    - 713        "ALTER": TokenType.ALTER,
    - 714        "ALTER AGGREGATE": TokenType.COMMAND,
    - 715        "ALTER DEFAULT": TokenType.COMMAND,
    - 716        "ALTER DOMAIN": TokenType.COMMAND,
    - 717        "ALTER ROLE": TokenType.COMMAND,
    - 718        "ALTER RULE": TokenType.COMMAND,
    - 719        "ALTER SEQUENCE": TokenType.COMMAND,
    - 720        "ALTER TYPE": TokenType.COMMAND,
    - 721        "ALTER USER": TokenType.COMMAND,
    - 722        "ALTER VIEW": TokenType.COMMAND,
    - 723        "ANALYZE": TokenType.COMMAND,
    - 724        "CALL": TokenType.COMMAND,
    - 725        "COMMENT": TokenType.COMMENT,
    - 726        "COPY": TokenType.COMMAND,
    - 727        "EXPLAIN": TokenType.COMMAND,
    - 728        "GRANT": TokenType.COMMAND,
    - 729        "OPTIMIZE": TokenType.COMMAND,
    - 730        "PREPARE": TokenType.COMMAND,
    - 731        "TRUNCATE": TokenType.COMMAND,
    - 732        "VACUUM": TokenType.COMMAND,
    - 733    }
    - 734
    - 735    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
    - 736        " ": TokenType.SPACE,
    - 737        "\t": TokenType.SPACE,
    - 738        "\n": TokenType.BREAK,
    - 739        "\r": TokenType.BREAK,
    - 740        "\r\n": TokenType.BREAK,
    - 741    }
    - 742
    - 743    COMMANDS = {
    - 744        TokenType.COMMAND,
    - 745        TokenType.EXECUTE,
    - 746        TokenType.FETCH,
    - 747        TokenType.SHOW,
    - 748    }
    - 749
    - 750    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
    - 751
    - 752    # handle numeric literals like in hive (3L = BIGINT)
    - 753    NUMERIC_LITERALS: t.Dict[str, str] = {}
    - 754    ENCODE: t.Optional[str] = None
    +            
     411class Tokenizer(metaclass=_Tokenizer):
    + 412    SINGLE_TOKENS = {
    + 413        "(": TokenType.L_PAREN,
    + 414        ")": TokenType.R_PAREN,
    + 415        "[": TokenType.L_BRACKET,
    + 416        "]": TokenType.R_BRACKET,
    + 417        "{": TokenType.L_BRACE,
    + 418        "}": TokenType.R_BRACE,
    + 419        "&": TokenType.AMP,
    + 420        "^": TokenType.CARET,
    + 421        ":": TokenType.COLON,
    + 422        ",": TokenType.COMMA,
    + 423        ".": TokenType.DOT,
    + 424        "-": TokenType.DASH,
    + 425        "=": TokenType.EQ,
    + 426        ">": TokenType.GT,
    + 427        "<": TokenType.LT,
    + 428        "%": TokenType.MOD,
    + 429        "!": TokenType.NOT,
    + 430        "|": TokenType.PIPE,
    + 431        "+": TokenType.PLUS,
    + 432        ";": TokenType.SEMICOLON,
    + 433        "/": TokenType.SLASH,
    + 434        "\\": TokenType.BACKSLASH,
    + 435        "*": TokenType.STAR,
    + 436        "~": TokenType.TILDA,
    + 437        "?": TokenType.PLACEHOLDER,
    + 438        "@": TokenType.PARAMETER,
    + 439        # used for breaking a var like x'y' but nothing else
    + 440        # the token type doesn't matter
    + 441        "'": TokenType.QUOTE,
    + 442        "`": TokenType.IDENTIFIER,
    + 443        '"': TokenType.IDENTIFIER,
    + 444        "#": TokenType.HASH,
    + 445    }
    + 446
    + 447    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    + 448    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    + 449    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    + 450    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    + 451    IDENTIFIER_ESCAPES = ['"']
    + 452    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    + 453    STRING_ESCAPES = ["'"]
    + 454    VAR_SINGLE_TOKENS: t.Set[str] = set()
    + 455
    + 456    _COMMENTS: t.Dict[str, str] = {}
    + 457    _BIT_STRINGS: t.Dict[str, str] = {}
    + 458    _BYTE_STRINGS: t.Dict[str, str] = {}
    + 459    _HEX_STRINGS: t.Dict[str, str] = {}
    + 460    _IDENTIFIERS: t.Dict[str, str] = {}
    + 461    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    + 462    _QUOTES: t.Dict[str, str] = {}
    + 463    _STRING_ESCAPES: t.Set[str] = set()
    + 464
    + 465    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
    + 466        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
    + 467        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
    + 468        "{{+": TokenType.BLOCK_START,
    + 469        "{{-": TokenType.BLOCK_START,
    + 470        "+}}": TokenType.BLOCK_END,
    + 471        "-}}": TokenType.BLOCK_END,
    + 472        "/*+": TokenType.HINT,
    + 473        "==": TokenType.EQ,
    + 474        "::": TokenType.DCOLON,
    + 475        "||": TokenType.DPIPE,
    + 476        ">=": TokenType.GTE,
    + 477        "<=": TokenType.LTE,
    + 478        "<>": TokenType.NEQ,
    + 479        "!=": TokenType.NEQ,
    + 480        "<=>": TokenType.NULLSAFE_EQ,
    + 481        "->": TokenType.ARROW,
    + 482        "->>": TokenType.DARROW,
    + 483        "=>": TokenType.FARROW,
    + 484        "#>": TokenType.HASH_ARROW,
    + 485        "#>>": TokenType.DHASH_ARROW,
    + 486        "<->": TokenType.LR_ARROW,
    + 487        "&&": TokenType.DAMP,
    + 488        "ALL": TokenType.ALL,
    + 489        "ALWAYS": TokenType.ALWAYS,
    + 490        "AND": TokenType.AND,
    + 491        "ANTI": TokenType.ANTI,
    + 492        "ANY": TokenType.ANY,
    + 493        "ASC": TokenType.ASC,
    + 494        "AS": TokenType.ALIAS,
    + 495        "AT TIME ZONE": TokenType.AT_TIME_ZONE,
    + 496        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
    + 497        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
    + 498        "BEGIN": TokenType.BEGIN,
    + 499        "BETWEEN": TokenType.BETWEEN,
    + 500        "BOTH": TokenType.BOTH,
    + 501        "BUCKET": TokenType.BUCKET,
    + 502        "BY DEFAULT": TokenType.BY_DEFAULT,
    + 503        "CACHE": TokenType.CACHE,
    + 504        "UNCACHE": TokenType.UNCACHE,
    + 505        "CASE": TokenType.CASE,
    + 506        "CASCADE": TokenType.CASCADE,
    + 507        "CHARACTER SET": TokenType.CHARACTER_SET,
    + 508        "CLUSTER BY": TokenType.CLUSTER_BY,
    + 509        "COLLATE": TokenType.COLLATE,
    + 510        "COLUMN": TokenType.COLUMN,
    + 511        "COMMIT": TokenType.COMMIT,
    + 512        "COMPOUND": TokenType.COMPOUND,
    + 513        "CONSTRAINT": TokenType.CONSTRAINT,
    + 514        "CREATE": TokenType.CREATE,
    + 515        "CROSS": TokenType.CROSS,
    + 516        "CUBE": TokenType.CUBE,
    + 517        "CURRENT_DATE": TokenType.CURRENT_DATE,
    + 518        "CURRENT ROW": TokenType.CURRENT_ROW,
    + 519        "CURRENT_TIME": TokenType.CURRENT_TIME,
    + 520        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
    + 521        "CURRENT_USER": TokenType.CURRENT_USER,
    + 522        "DATABASE": TokenType.DATABASE,
    + 523        "DEFAULT": TokenType.DEFAULT,
    + 524        "DELETE": TokenType.DELETE,
    + 525        "DESC": TokenType.DESC,
    + 526        "DESCRIBE": TokenType.DESCRIBE,
    + 527        "DISTINCT": TokenType.DISTINCT,
    + 528        "DISTINCT FROM": TokenType.DISTINCT_FROM,
    + 529        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
    + 530        "DIV": TokenType.DIV,
    + 531        "DROP": TokenType.DROP,
    + 532        "ELSE": TokenType.ELSE,
    + 533        "END": TokenType.END,
    + 534        "ESCAPE": TokenType.ESCAPE,
    + 535        "EXCEPT": TokenType.EXCEPT,
    + 536        "EXECUTE": TokenType.EXECUTE,
    + 537        "EXISTS": TokenType.EXISTS,
    + 538        "FALSE": TokenType.FALSE,
    + 539        "FETCH": TokenType.FETCH,
    + 540        "FILTER": TokenType.FILTER,
    + 541        "FIRST": TokenType.FIRST,
    + 542        "FULL": TokenType.FULL,
    + 543        "FUNCTION": TokenType.FUNCTION,
    + 544        "FOLLOWING": TokenType.FOLLOWING,
    + 545        "FOR": TokenType.FOR,
    + 546        "FOREIGN KEY": TokenType.FOREIGN_KEY,
    + 547        "FORMAT": TokenType.FORMAT,
    + 548        "FROM": TokenType.FROM,
    + 549        "GLOB": TokenType.GLOB,
    + 550        "GROUP BY": TokenType.GROUP_BY,
    + 551        "GROUPING SETS": TokenType.GROUPING_SETS,
    + 552        "HAVING": TokenType.HAVING,
    + 553        "IF": TokenType.IF,
    + 554        "ILIKE": TokenType.ILIKE,
    + 555        "IGNORE NULLS": TokenType.IGNORE_NULLS,
    + 556        "IN": TokenType.IN,
    + 557        "INDEX": TokenType.INDEX,
    + 558        "INET": TokenType.INET,
    + 559        "INNER": TokenType.INNER,
    + 560        "INSERT": TokenType.INSERT,
    + 561        "INTERVAL": TokenType.INTERVAL,
    + 562        "INTERSECT": TokenType.INTERSECT,
    + 563        "INTO": TokenType.INTO,
    + 564        "IS": TokenType.IS,
    + 565        "ISNULL": TokenType.ISNULL,
    + 566        "JOIN": TokenType.JOIN,
    + 567        "KEEP": TokenType.KEEP,
    + 568        "LATERAL": TokenType.LATERAL,
    + 569        "LAZY": TokenType.LAZY,
    + 570        "LEADING": TokenType.LEADING,
    + 571        "LEFT": TokenType.LEFT,
    + 572        "LIKE": TokenType.LIKE,
    + 573        "LIMIT": TokenType.LIMIT,
    + 574        "LOAD DATA": TokenType.LOAD_DATA,
    + 575        "LOCAL": TokenType.LOCAL,
    + 576        "MATERIALIZED": TokenType.MATERIALIZED,
    + 577        "MERGE": TokenType.MERGE,
    + 578        "NATURAL": TokenType.NATURAL,
    + 579        "NEXT": TokenType.NEXT,
    + 580        "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
    + 581        "NO ACTION": TokenType.NO_ACTION,
    + 582        "NOT": TokenType.NOT,
    + 583        "NOTNULL": TokenType.NOTNULL,
    + 584        "NULL": TokenType.NULL,
    + 585        "NULLS FIRST": TokenType.NULLS_FIRST,
    + 586        "NULLS LAST": TokenType.NULLS_LAST,
    + 587        "OBJECT": TokenType.OBJECT,
    + 588        "OFFSET": TokenType.OFFSET,
    + 589        "ON": TokenType.ON,
    + 590        "ONLY": TokenType.ONLY,
    + 591        "OPTIONS": TokenType.OPTIONS,
    + 592        "OR": TokenType.OR,
    + 593        "ORDER BY": TokenType.ORDER_BY,
    + 594        "ORDINALITY": TokenType.ORDINALITY,
    + 595        "OUTER": TokenType.OUTER,
    + 596        "OUT OF": TokenType.OUT_OF,
    + 597        "OVER": TokenType.OVER,
    + 598        "OVERLAPS": TokenType.OVERLAPS,
    + 599        "OVERWRITE": TokenType.OVERWRITE,
    + 600        "PARTITION": TokenType.PARTITION,
    + 601        "PARTITION BY": TokenType.PARTITION_BY,
    + 602        "PARTITIONED BY": TokenType.PARTITION_BY,
    + 603        "PARTITIONED_BY": TokenType.PARTITION_BY,
    + 604        "PERCENT": TokenType.PERCENT,
    + 605        "PIVOT": TokenType.PIVOT,
    + 606        "PRAGMA": TokenType.PRAGMA,
    + 607        "PRECEDING": TokenType.PRECEDING,
    + 608        "PRIMARY KEY": TokenType.PRIMARY_KEY,
    + 609        "PROCEDURE": TokenType.PROCEDURE,
    + 610        "QUALIFY": TokenType.QUALIFY,
    + 611        "RANGE": TokenType.RANGE,
    + 612        "RECURSIVE": TokenType.RECURSIVE,
    + 613        "REGEXP": TokenType.RLIKE,
    + 614        "REPLACE": TokenType.REPLACE,
    + 615        "RESPECT NULLS": TokenType.RESPECT_NULLS,
    + 616        "REFERENCES": TokenType.REFERENCES,
    + 617        "RIGHT": TokenType.RIGHT,
    + 618        "RLIKE": TokenType.RLIKE,
    + 619        "ROLLBACK": TokenType.ROLLBACK,
    + 620        "ROLLUP": TokenType.ROLLUP,
    + 621        "ROW": TokenType.ROW,
    + 622        "ROWS": TokenType.ROWS,
    + 623        "SCHEMA": TokenType.SCHEMA,
    + 624        "SEED": TokenType.SEED,
    + 625        "SELECT": TokenType.SELECT,
    + 626        "SEMI": TokenType.SEMI,
    + 627        "SET": TokenType.SET,
    + 628        "SHOW": TokenType.SHOW,
    + 629        "SIMILAR TO": TokenType.SIMILAR_TO,
    + 630        "SOME": TokenType.SOME,
    + 631        "SORTKEY": TokenType.SORTKEY,
    + 632        "SORT BY": TokenType.SORT_BY,
    + 633        "TABLE": TokenType.TABLE,
    + 634        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
    + 635        "TEMP": TokenType.TEMPORARY,
    + 636        "TEMPORARY": TokenType.TEMPORARY,
    + 637        "THEN": TokenType.THEN,
    + 638        "TRUE": TokenType.TRUE,
    + 639        "TRAILING": TokenType.TRAILING,
    + 640        "UNBOUNDED": TokenType.UNBOUNDED,
    + 641        "UNION": TokenType.UNION,
    + 642        "UNLOGGED": TokenType.UNLOGGED,
    + 643        "UNNEST": TokenType.UNNEST,
    + 644        "UNPIVOT": TokenType.UNPIVOT,
    + 645        "UPDATE": TokenType.UPDATE,
    + 646        "USE": TokenType.USE,
    + 647        "USING": TokenType.USING,
    + 648        "UUID": TokenType.UUID,
    + 649        "VALUES": TokenType.VALUES,
    + 650        "VIEW": TokenType.VIEW,
    + 651        "VOLATILE": TokenType.VOLATILE,
    + 652        "WHEN": TokenType.WHEN,
    + 653        "WHERE": TokenType.WHERE,
    + 654        "WINDOW": TokenType.WINDOW,
    + 655        "WITH": TokenType.WITH,
    + 656        "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
    + 657        "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
    + 658        "WITHIN GROUP": TokenType.WITHIN_GROUP,
    + 659        "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
    + 660        "APPLY": TokenType.APPLY,
    + 661        "ARRAY": TokenType.ARRAY,
    + 662        "BIT": TokenType.BIT,
    + 663        "BOOL": TokenType.BOOLEAN,
    + 664        "BOOLEAN": TokenType.BOOLEAN,
    + 665        "BYTE": TokenType.TINYINT,
    + 666        "TINYINT": TokenType.TINYINT,
    + 667        "SHORT": TokenType.SMALLINT,
    + 668        "SMALLINT": TokenType.SMALLINT,
    + 669        "INT2": TokenType.SMALLINT,
    + 670        "INTEGER": TokenType.INT,
    + 671        "INT": TokenType.INT,
    + 672        "INT4": TokenType.INT,
    + 673        "LONG": TokenType.BIGINT,
    + 674        "BIGINT": TokenType.BIGINT,
    + 675        "INT8": TokenType.BIGINT,
    + 676        "DEC": TokenType.DECIMAL,
    + 677        "DECIMAL": TokenType.DECIMAL,
    + 678        "BIGDECIMAL": TokenType.BIGDECIMAL,
    + 679        "BIGNUMERIC": TokenType.BIGDECIMAL,
    + 680        "MAP": TokenType.MAP,
    + 681        "NULLABLE": TokenType.NULLABLE,
    + 682        "NUMBER": TokenType.DECIMAL,
    + 683        "NUMERIC": TokenType.DECIMAL,
    + 684        "FIXED": TokenType.DECIMAL,
    + 685        "REAL": TokenType.FLOAT,
    + 686        "FLOAT": TokenType.FLOAT,
    + 687        "FLOAT4": TokenType.FLOAT,
    + 688        "FLOAT8": TokenType.DOUBLE,
    + 689        "DOUBLE": TokenType.DOUBLE,
    + 690        "DOUBLE PRECISION": TokenType.DOUBLE,
    + 691        "JSON": TokenType.JSON,
    + 692        "CHAR": TokenType.CHAR,
    + 693        "CHARACTER": TokenType.CHAR,
    + 694        "NCHAR": TokenType.NCHAR,
    + 695        "VARCHAR": TokenType.VARCHAR,
    + 696        "VARCHAR2": TokenType.VARCHAR,
    + 697        "NVARCHAR": TokenType.NVARCHAR,
    + 698        "NVARCHAR2": TokenType.NVARCHAR,
    + 699        "STR": TokenType.TEXT,
    + 700        "STRING": TokenType.TEXT,
    + 701        "TEXT": TokenType.TEXT,
    + 702        "CLOB": TokenType.TEXT,
    + 703        "LONGVARCHAR": TokenType.TEXT,
    + 704        "BINARY": TokenType.BINARY,
    + 705        "BLOB": TokenType.VARBINARY,
    + 706        "BYTEA": TokenType.VARBINARY,
    + 707        "VARBINARY": TokenType.VARBINARY,
    + 708        "TIME": TokenType.TIME,
    + 709        "TIMESTAMP": TokenType.TIMESTAMP,
    + 710        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
    + 711        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
    + 712        "DATE": TokenType.DATE,
    + 713        "DATETIME": TokenType.DATETIME,
    + 714        "UNIQUE": TokenType.UNIQUE,
    + 715        "STRUCT": TokenType.STRUCT,
    + 716        "VARIANT": TokenType.VARIANT,
    + 717        "ALTER": TokenType.ALTER,
    + 718        "ALTER AGGREGATE": TokenType.COMMAND,
    + 719        "ALTER DEFAULT": TokenType.COMMAND,
    + 720        "ALTER DOMAIN": TokenType.COMMAND,
    + 721        "ALTER ROLE": TokenType.COMMAND,
    + 722        "ALTER RULE": TokenType.COMMAND,
    + 723        "ALTER SEQUENCE": TokenType.COMMAND,
    + 724        "ALTER TYPE": TokenType.COMMAND,
    + 725        "ALTER USER": TokenType.COMMAND,
    + 726        "ALTER VIEW": TokenType.COMMAND,
    + 727        "ANALYZE": TokenType.COMMAND,
    + 728        "CALL": TokenType.COMMAND,
    + 729        "COMMENT": TokenType.COMMENT,
    + 730        "COPY": TokenType.COMMAND,
    + 731        "EXPLAIN": TokenType.COMMAND,
    + 732        "GRANT": TokenType.COMMAND,
    + 733        "OPTIMIZE": TokenType.COMMAND,
    + 734        "PREPARE": TokenType.COMMAND,
    + 735        "TRUNCATE": TokenType.COMMAND,
    + 736        "VACUUM": TokenType.COMMAND,
    + 737    }
    + 738
    + 739    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
    + 740        " ": TokenType.SPACE,
    + 741        "\t": TokenType.SPACE,
    + 742        "\n": TokenType.BREAK,
    + 743        "\r": TokenType.BREAK,
    + 744        "\r\n": TokenType.BREAK,
    + 745    }
    + 746
    + 747    COMMANDS = {
    + 748        TokenType.COMMAND,
    + 749        TokenType.EXECUTE,
    + 750        TokenType.FETCH,
    + 751        TokenType.SHOW,
    + 752    }
    + 753
    + 754    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
      755
    - 756    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
    - 757    KEYWORD_TRIE: t.Dict = {}  # autofilled
    - 758
    - 759    IDENTIFIER_CAN_START_WITH_DIGIT = False
    - 760
    - 761    __slots__ = (
    - 762        "sql",
    - 763        "size",
    - 764        "tokens",
    - 765        "_start",
    - 766        "_current",
    - 767        "_line",
    - 768        "_col",
    - 769        "_comments",
    - 770        "_char",
    - 771        "_end",
    - 772        "_peek",
    - 773        "_prev_token_line",
    - 774        "_prev_token_comments",
    - 775        "_prev_token_type",
    - 776    )
    - 777
    - 778    def __init__(self) -> None:
    - 779        self.reset()
    - 780
    - 781    def reset(self) -> None:
    - 782        self.sql = ""
    - 783        self.size = 0
    - 784        self.tokens: t.List[Token] = []
    - 785        self._start = 0
    - 786        self._current = 0
    - 787        self._line = 1
    - 788        self._col = 1
    - 789        self._comments: t.List[str] = []
    - 790
    - 791        self._char = ""
    - 792        self._end = False
    - 793        self._peek = ""
    - 794        self._prev_token_line = -1
    - 795        self._prev_token_comments: t.List[str] = []
    - 796        self._prev_token_type: t.Optional[TokenType] = None
    - 797
    - 798    def tokenize(self, sql: str) -> t.List[Token]:
    - 799        """Returns a list of tokens corresponding to the SQL string `sql`."""
    - 800        self.reset()
    - 801        self.sql = sql
    - 802        self.size = len(sql)
    - 803        try:
    - 804            self._scan()
    - 805        except Exception as e:
    - 806            start = self._current - 50
    - 807            end = self._current + 50
    - 808            start = start if start > 0 else 0
    - 809            end = end if end < self.size else self.size - 1
    - 810            context = self.sql[start:end]
    - 811            raise ValueError(f"Error tokenizing '{context}'") from e
    - 812
    - 813        return self.tokens
    - 814
    - 815    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
    - 816        while self.size and not self._end:
    - 817            self._start = self._current
    - 818            self._advance()
    - 819
    - 820            if self._char is None:
    - 821                break
    - 822
    - 823            if self._char not in self.WHITE_SPACE:
    - 824                if self._char.isdigit():
    - 825                    self._scan_number()
    - 826                elif self._char in self._IDENTIFIERS:
    - 827                    self._scan_identifier(self._IDENTIFIERS[self._char])
    - 828                else:
    - 829                    self._scan_keywords()
    - 830
    - 831            if until and until():
    - 832                break
    - 833
    - 834        if self.tokens:
    - 835            self.tokens[-1].comments.extend(self._comments)
    - 836
    - 837    def _chars(self, size: int) -> str:
    - 838        if size == 1:
    - 839            return self._char
    - 840        start = self._current - 1
    - 841        end = start + size
    - 842        if end <= self.size:
    - 843            return self.sql[start:end]
    - 844        return ""
    - 845
    - 846    def _advance(self, i: int = 1) -> None:
    - 847        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
    - 848            self._col = 1
    - 849            self._line += 1
    - 850        else:
    - 851            self._col += i
    - 852
    - 853        self._current += i
    - 854        self._end = self._current >= self.size
    - 855        self._char = self.sql[self._current - 1]
    - 856        self._peek = "" if self._end else self.sql[self._current]
    - 857
    - 858    @property
    - 859    def _text(self) -> str:
    - 860        return self.sql[self._start : self._current]
    + 756    # handle numeric literals like in hive (3L = BIGINT)
    + 757    NUMERIC_LITERALS: t.Dict[str, str] = {}
    + 758    ENCODE: t.Optional[str] = None
    + 759
    + 760    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
    + 761    KEYWORD_TRIE: t.Dict = {}  # autofilled
    + 762
    + 763    IDENTIFIER_CAN_START_WITH_DIGIT = False
    + 764
    + 765    __slots__ = (
    + 766        "sql",
    + 767        "size",
    + 768        "tokens",
    + 769        "_start",
    + 770        "_current",
    + 771        "_line",
    + 772        "_col",
    + 773        "_comments",
    + 774        "_char",
    + 775        "_end",
    + 776        "_peek",
    + 777        "_prev_token_line",
    + 778        "_prev_token_comments",
    + 779        "_prev_token_type",
    + 780    )
    + 781
    + 782    def __init__(self) -> None:
    + 783        self.reset()
    + 784
    + 785    def reset(self) -> None:
    + 786        self.sql = ""
    + 787        self.size = 0
    + 788        self.tokens: t.List[Token] = []
    + 789        self._start = 0
    + 790        self._current = 0
    + 791        self._line = 1
    + 792        self._col = 1
    + 793        self._comments: t.List[str] = []
    + 794
    + 795        self._char = ""
    + 796        self._end = False
    + 797        self._peek = ""
    + 798        self._prev_token_line = -1
    + 799        self._prev_token_comments: t.List[str] = []
    + 800        self._prev_token_type: t.Optional[TokenType] = None
    + 801
    + 802    def tokenize(self, sql: str) -> t.List[Token]:
    + 803        """Returns a list of tokens corresponding to the SQL string `sql`."""
    + 804        self.reset()
    + 805        self.sql = sql
    + 806        self.size = len(sql)
    + 807        try:
    + 808            self._scan()
    + 809        except Exception as e:
    + 810            start = self._current - 50
    + 811            end = self._current + 50
    + 812            start = start if start > 0 else 0
    + 813            end = end if end < self.size else self.size - 1
    + 814            context = self.sql[start:end]
    + 815            raise ValueError(f"Error tokenizing '{context}'") from e
    + 816
    + 817        return self.tokens
    + 818
    + 819    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
    + 820        while self.size and not self._end:
    + 821            self._start = self._current
    + 822            self._advance()
    + 823
    + 824            if self._char is None:
    + 825                break
    + 826
    + 827            if self._char not in self.WHITE_SPACE:
    + 828                if self._char.isdigit():
    + 829                    self._scan_number()
    + 830                elif self._char in self._IDENTIFIERS:
    + 831                    self._scan_identifier(self._IDENTIFIERS[self._char])
    + 832                else:
    + 833                    self._scan_keywords()
    + 834
    + 835            if until and until():
    + 836                break
    + 837
    + 838        if self.tokens:
    + 839            self.tokens[-1].comments.extend(self._comments)
    + 840
    + 841    def _chars(self, size: int) -> str:
    + 842        if size == 1:
    + 843            return self._char
    + 844        start = self._current - 1
    + 845        end = start + size
    + 846        if end <= self.size:
    + 847            return self.sql[start:end]
    + 848        return ""
    + 849
    + 850    def _advance(self, i: int = 1) -> None:
    + 851        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
    + 852            self._col = 1
    + 853            self._line += 1
    + 854        else:
    + 855            self._col += i
    + 856
    + 857        self._current += i
    + 858        self._end = self._current >= self.size
    + 859        self._char = self.sql[self._current - 1]
    + 860        self._peek = "" if self._end else self.sql[self._current]
      861
    - 862    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
    - 863        self._prev_token_line = self._line
    - 864        self._prev_token_comments = self._comments
    - 865        self._prev_token_type = token_type
    - 866        self.tokens.append(
    - 867            Token(
    - 868                token_type,
    - 869                self._text if text is None else text,
    - 870                self._line,
    - 871                self._col,
    - 872                self._current,
    - 873                self._comments,
    - 874            )
    - 875        )
    - 876        self._comments = []
    - 877
    - 878        # If we have either a semicolon or a begin token before the command's token, we'll parse
    - 879        # whatever follows the command's token as a string
    - 880        if (
    - 881            token_type in self.COMMANDS
    - 882            and self._peek != ";"
    - 883            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
    - 884        ):
    - 885            start = self._current
    - 886            tokens = len(self.tokens)
    - 887            self._scan(lambda: self._peek == ";")
    - 888            self.tokens = self.tokens[:tokens]
    - 889            text = self.sql[start : self._current].strip()
    - 890            if text:
    - 891                self._add(TokenType.STRING, text)
    - 892
    - 893    def _scan_keywords(self) -> None:
    - 894        size = 0
    - 895        word = None
    - 896        chars = self._text
    - 897        char = chars
    - 898        prev_space = False
    - 899        skip = False
    - 900        trie = self.KEYWORD_TRIE
    - 901        single_token = char in self.SINGLE_TOKENS
    - 902
    - 903        while chars:
    - 904            if skip:
    - 905                result = 1
    - 906            else:
    - 907                result, trie = in_trie(trie, char.upper())
    - 908
    - 909            if result == 0:
    - 910                break
    - 911            if result == 2:
    - 912                word = chars
    - 913            size += 1
    - 914            end = self._current - 1 + size
    - 915
    - 916            if end < self.size:
    - 917                char = self.sql[end]
    - 918                single_token = single_token or char in self.SINGLE_TOKENS
    - 919                is_space = char in self.WHITE_SPACE
    - 920
    - 921                if not is_space or not prev_space:
    - 922                    if is_space:
    - 923                        char = " "
    - 924                    chars += char
    - 925                    prev_space = is_space
    - 926                    skip = False
    - 927                else:
    - 928                    skip = True
    - 929            else:
    - 930                chars = " "
    - 931
    - 932        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
    - 933
    - 934        if not word:
    - 935            if self._char in self.SINGLE_TOKENS:
    - 936                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
    - 937                return
    - 938            self._scan_var()
    - 939            return
    - 940
    - 941        if self._scan_string(word):
    - 942            return
    - 943        if self._scan_formatted_string(word):
    - 944            return
    - 945        if self._scan_comment(word):
    + 862    @property
    + 863    def _text(self) -> str:
    + 864        return self.sql[self._start : self._current]
    + 865
    + 866    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
    + 867        self._prev_token_line = self._line
    + 868        self._prev_token_comments = self._comments
    + 869        self._prev_token_type = token_type
    + 870        self.tokens.append(
    + 871            Token(
    + 872                token_type,
    + 873                self._text if text is None else text,
    + 874                self._line,
    + 875                self._col,
    + 876                self._current,
    + 877                self._comments,
    + 878            )
    + 879        )
    + 880        self._comments = []
    + 881
    + 882        # If we have either a semicolon or a begin token before the command's token, we'll parse
    + 883        # whatever follows the command's token as a string
    + 884        if (
    + 885            token_type in self.COMMANDS
    + 886            and self._peek != ";"
    + 887            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
    + 888        ):
    + 889            start = self._current
    + 890            tokens = len(self.tokens)
    + 891            self._scan(lambda: self._peek == ";")
    + 892            self.tokens = self.tokens[:tokens]
    + 893            text = self.sql[start : self._current].strip()
    + 894            if text:
    + 895                self._add(TokenType.STRING, text)
    + 896
    + 897    def _scan_keywords(self) -> None:
    + 898        size = 0
    + 899        word = None
    + 900        chars = self._text
    + 901        char = chars
    + 902        prev_space = False
    + 903        skip = False
    + 904        trie = self.KEYWORD_TRIE
    + 905        single_token = char in self.SINGLE_TOKENS
    + 906
    + 907        while chars:
    + 908            if skip:
    + 909                result = 1
    + 910            else:
    + 911                result, trie = in_trie(trie, char.upper())
    + 912
    + 913            if result == 0:
    + 914                break
    + 915            if result == 2:
    + 916                word = chars
    + 917            size += 1
    + 918            end = self._current - 1 + size
    + 919
    + 920            if end < self.size:
    + 921                char = self.sql[end]
    + 922                single_token = single_token or char in self.SINGLE_TOKENS
    + 923                is_space = char in self.WHITE_SPACE
    + 924
    + 925                if not is_space or not prev_space:
    + 926                    if is_space:
    + 927                        char = " "
    + 928                    chars += char
    + 929                    prev_space = is_space
    + 930                    skip = False
    + 931                else:
    + 932                    skip = True
    + 933            else:
    + 934                chars = " "
    + 935
    + 936        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
    + 937
    + 938        if not word:
    + 939            if self._char in self.SINGLE_TOKENS:
    + 940                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
    + 941                return
    + 942            self._scan_var()
    + 943            return
    + 944
    + 945        if self._scan_string(word):
      946            return
    - 947
    - 948        self._advance(size - 1)
    - 949        word = word.upper()
    - 950        self._add(self.KEYWORDS[word], text=word)
    + 947        if self._scan_formatted_string(word):
    + 948            return
    + 949        if self._scan_comment(word):
    + 950            return
      951
    - 952    def _scan_comment(self, comment_start: str) -> bool:
    - 953        if comment_start not in self._COMMENTS:
    - 954            return False
    + 952        self._advance(size - 1)
    + 953        word = word.upper()
    + 954        self._add(self.KEYWORDS[word], text=word)
      955
    - 956        comment_start_line = self._line
    - 957        comment_start_size = len(comment_start)
    - 958        comment_end = self._COMMENTS[comment_start]
    + 956    def _scan_comment(self, comment_start: str) -> bool:
    + 957        if comment_start not in self._COMMENTS:
    + 958            return False
      959
    - 960        if comment_end:
    - 961            # Skip the comment's start delimiter
    - 962            self._advance(comment_start_size)
    + 960        comment_start_line = self._line
    + 961        comment_start_size = len(comment_start)
    + 962        comment_end = self._COMMENTS[comment_start]
      963
    - 964            comment_end_size = len(comment_end)
    - 965            while not self._end and self._chars(comment_end_size) != comment_end:
    - 966                self._advance()
    + 964        if comment_end:
    + 965            # Skip the comment's start delimiter
    + 966            self._advance(comment_start_size)
      967
    - 968            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
    - 969            self._advance(comment_end_size - 1)
    - 970        else:
    - 971            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
    - 972                self._advance()
    - 973            self._comments.append(self._text[comment_start_size:])
    - 974
    - 975        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
    - 976        # Multiple consecutive comments are preserved by appending them to the current comments list.
    - 977        if comment_start_line == self._prev_token_line:
    - 978            self.tokens[-1].comments.extend(self._comments)
    - 979            self._comments = []
    - 980            self._prev_token_line = self._line
    - 981
    - 982        return True
    - 983
    - 984    def _scan_number(self) -> None:
    - 985        if self._char == "0":
    - 986            peek = self._peek.upper()
    - 987            if peek == "B":
    - 988                return self._scan_bits()
    - 989            elif peek == "X":
    - 990                return self._scan_hex()
    - 991
    - 992        decimal = False
    - 993        scientific = 0
    - 994
    - 995        while True:
    - 996            if self._peek.isdigit():
    - 997                self._advance()
    - 998            elif self._peek == "." and not decimal:
    - 999                decimal = True
    -1000                self._advance()
    -1001            elif self._peek in ("-", "+") and scientific == 1:
    -1002                scientific += 1
    -1003                self._advance()
    -1004            elif self._peek.upper() == "E" and not scientific:
    -1005                scientific += 1
    -1006                self._advance()
    -1007            elif self._peek.isidentifier():
    -1008                number_text = self._text
    -1009                literal = ""
    -1010
    -1011                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
    -1012                    literal += self._peek.upper()
    -1013                    self._advance()
    + 968            comment_end_size = len(comment_end)
    + 969            while not self._end and self._chars(comment_end_size) != comment_end:
    + 970                self._advance()
    + 971
    + 972            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
    + 973            self._advance(comment_end_size - 1)
    + 974        else:
    + 975            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
    + 976                self._advance()
    + 977            self._comments.append(self._text[comment_start_size:])
    + 978
    + 979        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
    + 980        # Multiple consecutive comments are preserved by appending them to the current comments list.
    + 981        if comment_start_line == self._prev_token_line:
    + 982            self.tokens[-1].comments.extend(self._comments)
    + 983            self._comments = []
    + 984            self._prev_token_line = self._line
    + 985
    + 986        return True
    + 987
    + 988    def _scan_number(self) -> None:
    + 989        if self._char == "0":
    + 990            peek = self._peek.upper()
    + 991            if peek == "B":
    + 992                return self._scan_bits()
    + 993            elif peek == "X":
    + 994                return self._scan_hex()
    + 995
    + 996        decimal = False
    + 997        scientific = 0
    + 998
    + 999        while True:
    +1000            if self._peek.isdigit():
    +1001                self._advance()
    +1002            elif self._peek == "." and not decimal:
    +1003                decimal = True
    +1004                self._advance()
    +1005            elif self._peek in ("-", "+") and scientific == 1:
    +1006                scientific += 1
    +1007                self._advance()
    +1008            elif self._peek.upper() == "E" and not scientific:
    +1009                scientific += 1
    +1010                self._advance()
    +1011            elif self._peek.isidentifier():
    +1012                number_text = self._text
    +1013                literal = ""
     1014
    -1015                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
    -1016
    -1017                if token_type:
    -1018                    self._add(TokenType.NUMBER, number_text)
    -1019                    self._add(TokenType.DCOLON, "::")
    -1020                    return self._add(token_type, literal)
    -1021                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
    -1022                    return self._add(TokenType.VAR)
    -1023
    -1024                self._add(TokenType.NUMBER, number_text)
    -1025                return self._advance(-len(literal))
    -1026            else:
    -1027                return self._add(TokenType.NUMBER)
    -1028
    -1029    def _scan_bits(self) -> None:
    -1030        self._advance()
    -1031        value = self._extract_value()
    -1032        try:
    -1033            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
    -1034        except ValueError:
    -1035            self._add(TokenType.IDENTIFIER)
    -1036
    -1037    def _scan_hex(self) -> None:
    -1038        self._advance()
    -1039        value = self._extract_value()
    -1040        try:
    -1041            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
    -1042        except ValueError:
    -1043            self._add(TokenType.IDENTIFIER)
    -1044
    -1045    def _extract_value(self) -> str:
    -1046        while True:
    -1047            char = self._peek.strip()
    -1048            if char and char not in self.SINGLE_TOKENS:
    -1049                self._advance()
    -1050            else:
    -1051                break
    -1052
    -1053        return self._text
    -1054
    -1055    def _scan_string(self, quote: str) -> bool:
    -1056        quote_end = self._QUOTES.get(quote)
    -1057        if quote_end is None:
    -1058            return False
    -1059
    -1060        self._advance(len(quote))
    -1061        text = self._extract_string(quote_end)
    -1062        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
    -1063        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
    -1064        return True
    -1065
    -1066    # X'1234, b'0110', E'\\\\\' etc.
    -1067    def _scan_formatted_string(self, string_start: str) -> bool:
    -1068        if string_start in self._HEX_STRINGS:
    -1069            delimiters = self._HEX_STRINGS
    -1070            token_type = TokenType.HEX_STRING
    -1071            base = 16
    -1072        elif string_start in self._BIT_STRINGS:
    -1073            delimiters = self._BIT_STRINGS
    -1074            token_type = TokenType.BIT_STRING
    -1075            base = 2
    -1076        elif string_start in self._BYTE_STRINGS:
    -1077            delimiters = self._BYTE_STRINGS
    -1078            token_type = TokenType.BYTE_STRING
    -1079            base = None
    -1080        else:
    -1081            return False
    -1082
    -1083        self._advance(len(string_start))
    -1084        string_end = delimiters[string_start]
    -1085        text = self._extract_string(string_end)
    +1015                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
    +1016                    literal += self._peek.upper()
    +1017                    self._advance()
    +1018
    +1019                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
    +1020
    +1021                if token_type:
    +1022                    self._add(TokenType.NUMBER, number_text)
    +1023                    self._add(TokenType.DCOLON, "::")
    +1024                    return self._add(token_type, literal)
    +1025                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
    +1026                    return self._add(TokenType.VAR)
    +1027
    +1028                self._add(TokenType.NUMBER, number_text)
    +1029                return self._advance(-len(literal))
    +1030            else:
    +1031                return self._add(TokenType.NUMBER)
    +1032
    +1033    def _scan_bits(self) -> None:
    +1034        self._advance()
    +1035        value = self._extract_value()
    +1036        try:
    +1037            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
    +1038        except ValueError:
    +1039            self._add(TokenType.IDENTIFIER)
    +1040
    +1041    def _scan_hex(self) -> None:
    +1042        self._advance()
    +1043        value = self._extract_value()
    +1044        try:
    +1045            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
    +1046        except ValueError:
    +1047            self._add(TokenType.IDENTIFIER)
    +1048
    +1049    def _extract_value(self) -> str:
    +1050        while True:
    +1051            char = self._peek.strip()
    +1052            if char and char not in self.SINGLE_TOKENS:
    +1053                self._advance()
    +1054            else:
    +1055                break
    +1056
    +1057        return self._text
    +1058
    +1059    def _scan_string(self, quote: str) -> bool:
    +1060        quote_end = self._QUOTES.get(quote)
    +1061        if quote_end is None:
    +1062            return False
    +1063
    +1064        self._advance(len(quote))
    +1065        text = self._extract_string(quote_end)
    +1066        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
    +1067        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
    +1068        return True
    +1069
    +1070    # X'1234, b'0110', E'\\\\\' etc.
    +1071    def _scan_formatted_string(self, string_start: str) -> bool:
    +1072        if string_start in self._HEX_STRINGS:
    +1073            delimiters = self._HEX_STRINGS
    +1074            token_type = TokenType.HEX_STRING
    +1075            base = 16
    +1076        elif string_start in self._BIT_STRINGS:
    +1077            delimiters = self._BIT_STRINGS
    +1078            token_type = TokenType.BIT_STRING
    +1079            base = 2
    +1080        elif string_start in self._BYTE_STRINGS:
    +1081            delimiters = self._BYTE_STRINGS
    +1082            token_type = TokenType.BYTE_STRING
    +1083            base = None
    +1084        else:
    +1085            return False
     1086
    -1087        if base is None:
    -1088            self._add(token_type, text)
    -1089        else:
    -1090            try:
    -1091                self._add(token_type, f"{int(text, base)}")
    -1092            except:
    -1093                raise RuntimeError(
    -1094                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
    -1095                )
    -1096
    -1097        return True
    -1098
    -1099    def _scan_identifier(self, identifier_end: str) -> None:
    -1100        text = ""
    -1101        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES
    +1087        self._advance(len(string_start))
    +1088        string_end = delimiters[string_start]
    +1089        text = self._extract_string(string_end)
    +1090
    +1091        if base is None:
    +1092            self._add(token_type, text)
    +1093        else:
    +1094            try:
    +1095                self._add(token_type, f"{int(text, base)}")
    +1096            except:
    +1097                raise RuntimeError(
    +1098                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
    +1099                )
    +1100
    +1101        return True
     1102
    -1103        while True:
    -1104            if self._end:
    -1105                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
    +1103    def _scan_identifier(self, identifier_end: str) -> None:
    +1104        text = ""
    +1105        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES
     1106
    -1107            self._advance()
    -1108            if self._char == identifier_end:
    -1109                if identifier_end_is_escape and self._peek == identifier_end:
    -1110                    text += identifier_end
    -1111                    self._advance()
    -1112                    continue
    -1113
    -1114                break
    -1115
    -1116            text += self._char
    +1107        while True:
    +1108            if self._end:
    +1109                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
    +1110
    +1111            self._advance()
    +1112            if self._char == identifier_end:
    +1113                if identifier_end_is_escape and self._peek == identifier_end:
    +1114                    text += identifier_end
    +1115                    self._advance()
    +1116                    continue
     1117
    -1118        self._add(TokenType.IDENTIFIER, text)
    +1118                break
     1119
    -1120    def _scan_var(self) -> None:
    -1121        while True:
    -1122            char = self._peek.strip()
    -1123            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
    -1124                self._advance()
    -1125            else:
    -1126                break
    -1127        self._add(
    -1128            TokenType.VAR
    -1129            if self._prev_token_type == TokenType.PARAMETER
    -1130            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
    -1131        )
    -1132
    -1133    def _extract_string(self, delimiter: str) -> str:
    -1134        text = ""
    -1135        delim_size = len(delimiter)
    +1120            text += self._char
    +1121
    +1122        self._add(TokenType.IDENTIFIER, text)
    +1123
    +1124    def _scan_var(self) -> None:
    +1125        while True:
    +1126            char = self._peek.strip()
    +1127            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
    +1128                self._advance()
    +1129            else:
    +1130                break
    +1131        self._add(
    +1132            TokenType.VAR
    +1133            if self._prev_token_type == TokenType.PARAMETER
    +1134            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
    +1135        )
     1136
    -1137        while True:
    -1138            if self._char in self._STRING_ESCAPES and (
    -1139                self._peek == delimiter or self._peek in self._STRING_ESCAPES
    -1140            ):
    -1141                if self._peek == delimiter:
    -1142                    text += self._peek
    -1143                else:
    -1144                    text += self._char + self._peek
    -1145
    -1146                if self._current + 1 < self.size:
    -1147                    self._advance(2)
    -1148                else:
    -1149                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
    -1150            else:
    -1151                if self._chars(delim_size) == delimiter:
    -1152                    if delim_size > 1:
    -1153                        self._advance(delim_size - 1)
    -1154                    break
    -1155
    -1156                if self._end:
    -1157                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
    -1158                text += self._char
    -1159                self._advance()
    -1160
    -1161        return text
    +1137    def _extract_string(self, delimiter: str) -> str:
    +1138        text = ""
    +1139        delim_size = len(delimiter)
    +1140
    +1141        while True:
    +1142            if self._char in self._STRING_ESCAPES and (
    +1143                self._peek == delimiter or self._peek in self._STRING_ESCAPES
    +1144            ):
    +1145                if self._peek == delimiter:
    +1146                    text += self._peek
    +1147                else:
    +1148                    text += self._char + self._peek
    +1149
    +1150                if self._current + 1 < self.size:
    +1151                    self._advance(2)
    +1152                else:
    +1153                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
    +1154            else:
    +1155                if self._chars(delim_size) == delimiter:
    +1156                    if delim_size > 1:
    +1157                        self._advance(delim_size - 1)
    +1158                    break
    +1159
    +1160                if self._end:
    +1161                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
    +1162                text += self._char
    +1163                self._advance()
    +1164
    +1165        return text
     
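(Illustration only, not part of the patch: a minimal sketch of the comment-attachment rule spelled out in _scan_comment above — a comment that starts on the same line as the previous token is appended to that token's comments — assuming the sqlglot package at this version is importable; the input SQL is arbitrary.)

    from sqlglot.tokens import Tokenizer

    # "/* trailing */" begins on the same line as the preceding "1" token, so per the
    # rule above it should end up on that token's `comments` list rather than being
    # buffered for the token that follows.
    tokens = Tokenizer().tokenize("SELECT 1 /* trailing */")
    for token in tokens:
        print(token.token_type, repr(token.text), token.comments)
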
    @@ -7050,22 +7088,22 @@
    -
    781    def reset(self) -> None:
    -782        self.sql = ""
    -783        self.size = 0
    -784        self.tokens: t.List[Token] = []
    -785        self._start = 0
    -786        self._current = 0
    -787        self._line = 1
    -788        self._col = 1
    -789        self._comments: t.List[str] = []
    -790
    -791        self._char = ""
    -792        self._end = False
    -793        self._peek = ""
    -794        self._prev_token_line = -1
    -795        self._prev_token_comments: t.List[str] = []
    -796        self._prev_token_type: t.Optional[TokenType] = None
    +            
    785    def reset(self) -> None:
    +786        self.sql = ""
    +787        self.size = 0
    +788        self.tokens: t.List[Token] = []
    +789        self._start = 0
    +790        self._current = 0
    +791        self._line = 1
    +792        self._col = 1
    +793        self._comments: t.List[str] = []
    +794
    +795        self._char = ""
    +796        self._end = False
    +797        self._peek = ""
    +798        self._prev_token_line = -1
    +799        self._prev_token_comments: t.List[str] = []
    +800        self._prev_token_type: t.Optional[TokenType] = None
     
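(Illustration only, not part of the patch: because tokenize() begins by calling reset(), a single Tokenizer instance can be reused across inputs, and since reset() rebinds self.tokens to a fresh list, earlier results are not mutated. A minimal sketch assuming sqlglot is importable.)

    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()
    first = tokenizer.tokenize("SELECT 1")
    second = tokenizer.tokenize("SELECT 2")  # tokenize() calls reset() internally

    # reset() assigns a brand-new list to self.tokens, so the first run's result
    # is left intact rather than being cleared in place.
    assert first is not second
    assert len(first) == 2 and len(second) == 2  # SELECT keyword + NUMBER literal
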
    @@ -7083,22 +7121,22 @@
    -
    798    def tokenize(self, sql: str) -> t.List[Token]:
    -799        """Returns a list of tokens corresponding to the SQL string `sql`."""
    -800        self.reset()
    -801        self.sql = sql
    -802        self.size = len(sql)
    -803        try:
    -804            self._scan()
    -805        except Exception as e:
    -806            start = self._current - 50
    -807            end = self._current + 50
    -808            start = start if start > 0 else 0
    -809            end = end if end < self.size else self.size - 1
    -810            context = self.sql[start:end]
    -811            raise ValueError(f"Error tokenizing '{context}'") from e
    -812
    -813        return self.tokens
    +            
    802    def tokenize(self, sql: str) -> t.List[Token]:
    +803        """Returns a list of tokens corresponding to the SQL string `sql`."""
    +804        self.reset()
    +805        self.sql = sql
    +806        self.size = len(sql)
    +807        try:
    +808            self._scan()
    +809        except Exception as e:
    +810            start = self._current - 50
    +811            end = self._current + 50
    +812            start = start if start > 0 else 0
    +813            end = end if end < self.size else self.size - 1
    +814            context = self.sql[start:end]
    +815            raise ValueError(f"Error tokenizing '{context}'") from e
    +816
    +817        return self.tokens
     
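(Illustration only, not part of the patch: a hedged sketch of the error wrapping in tokenize() above, which re-raises any scanning failure as a ValueError carrying roughly 50 characters of context around the failure point. The unterminated identifier below is just one way to trigger it, assuming the default tokenizer treats double quotes as identifier delimiters.)

    from sqlglot.tokens import Tokenizer

    try:
        # The closing double quote is missing, so _scan_identifier raises a RuntimeError,
        # which tokenize() catches and re-raises as a ValueError with surrounding context.
        Tokenizer().tokenize('SELECT "unterminated')
    except ValueError as exc:
        print(exc)            # e.g. Error tokenizing 'SELECT "unterminate'
        print(exc.__cause__)  # the original RuntimeError ('Missing " from <line>:<start>')
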
    -- cgit v1.2.3