Merging upstream version 20.3.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-12-19 11:01:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-12-19 11:01:55 +0000
commit: f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5 (patch)
tree: 5dce0fe2a11381761496eb973c20750f44db56d5 /sqlglot/dialects/dialect.py
parent: Releasing debian version 20.1.0-1. (diff)
download: sqlglot-f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5.tar.xz
sqlglot-f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5.zip
1 files changed, 66 insertions, 35 deletions
diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py
index c7cea64..b7eef45 100644
--- a/sqlglot/dialects/dialect.py
+++ b/sqlglot/dialects/dialect.py
@@ -21,11 +21,14 @@ DATE_ADD_OR_SUB = t.Union[exp.DateAdd, exp.TsOrDsAdd, exp.DateSub]
 
 
 class Dialects(str, Enum):
+    """Dialects supported by SQLGlot."""
+
     DIALECT = ""
 
     BIGQUERY = "bigquery"
     CLICKHOUSE = "clickhouse"
     DATABRICKS = "databricks"
+    DORIS = "doris"
     DRILL = "drill"
     DUCKDB = "duckdb"
     HIVE = "hive"
@@ -43,16 +46,22 @@ class Dialects(str, Enum):
     TERADATA = "teradata"
     TRINO = "trino"
     TSQL = "tsql"
-    Doris = "doris"
 
 
 class NormalizationStrategy(str, AutoName):
     """Specifies the strategy according to which identifiers should be normalized."""
 
-    LOWERCASE = auto()  # Unquoted identifiers are lowercased
-    UPPERCASE = auto()  # Unquoted identifiers are uppercased
-    CASE_SENSITIVE = auto()  # Always case-sensitive, regardless of quotes
-    CASE_INSENSITIVE = auto()  # Always case-insensitive, regardless of quotes
+    LOWERCASE = auto()
+    """Unquoted identifiers are lowercased."""
+
+    UPPERCASE = auto()
+    """Unquoted identifiers are uppercased."""
+
+    CASE_SENSITIVE = auto()
+    """Always case-sensitive, regardless of quotes."""
+
+    CASE_INSENSITIVE = auto()
+    """Always case-insensitive, regardless of quotes."""
 
 
 class _Dialect(type):
@@ -117,6 +126,7 @@ class _Dialect(type):
         klass.BIT_START, klass.BIT_END = get_start_end(TokenType.BIT_STRING)
         klass.HEX_START, klass.HEX_END = get_start_end(TokenType.HEX_STRING)
         klass.BYTE_START, klass.BYTE_END = get_start_end(TokenType.BYTE_STRING)
+        klass.UNICODE_START, klass.UNICODE_END = get_start_end(TokenType.UNICODE_STRING)
 
         if enum not in ("", "bigquery"):
             klass.generator_class.SELECT_KINDS = ()
@@ -131,74 +141,84 @@ class _Dialect(type):
 
 
 class Dialect(metaclass=_Dialect):
-    # Determines the base index offset for arrays
     INDEX_OFFSET = 0
+    """Determines the base index offset for arrays."""
+
+    WEEK_OFFSET = 0
+    """Determines the day of the week that `DATE_TRUNC(week)` truncates to. Defaults to 0 (Monday); -1 would be Sunday."""
 
-    # If true unnest table aliases are considered only as column aliases
     UNNEST_COLUMN_ONLY = False
+    """Determines whether or not `UNNEST` table aliases are treated as column aliases."""
 
-    # Determines whether or not the table alias comes after tablesample
     ALIAS_POST_TABLESAMPLE = False
+    """Determines whether or not the table alias comes after tablesample."""
 
-    # Specifies the strategy according to which identifiers should be normalized.
     NORMALIZATION_STRATEGY = NormalizationStrategy.LOWERCASE
+    """Specifies the strategy according to which identifiers should be normalized."""
 
-    # Determines whether or not an unquoted identifier can start with a digit
     IDENTIFIERS_CAN_START_WITH_DIGIT = False
+    """Determines whether or not an unquoted identifier can start with a digit."""
 
-    # Determines whether or not the DPIPE token ('||') is a string concatenation operator
     DPIPE_IS_STRING_CONCAT = True
+    """Determines whether or not the DPIPE token (`||`) is a string concatenation operator."""
 
-    # Determines whether or not CONCAT's arguments must be strings
     STRICT_STRING_CONCAT = False
+    """Determines whether or not `CONCAT`'s arguments must be strings."""
 
-    # Determines whether or not user-defined data types are supported
     SUPPORTS_USER_DEFINED_TYPES = True
+    """Determines whether or not user-defined data types are supported."""
 
-    # Determines whether or not SEMI/ANTI JOINs are supported
     SUPPORTS_SEMI_ANTI_JOIN = True
+    """Determines whether or not `SEMI` or `ANTI` joins are supported."""
 
-    # Determines how function names are going to be normalized
     NORMALIZE_FUNCTIONS: bool | str = "upper"
+    """Determines how function names are going to be normalized."""
 
-    # Determines whether the base comes first in the LOG function
     LOG_BASE_FIRST = True
+    """Determines whether the base comes first in the `LOG` function."""
 
-    # Indicates the default null ordering method to use if not explicitly set
-    # Options are: "nulls_are_small", "nulls_are_large", "nulls_are_last"
     NULL_ORDERING = "nulls_are_small"
+    """
+    Indicates the default `NULL` ordering method to use if not explicitly set.
+    Possible values: `"nulls_are_small"`, `"nulls_are_large"`, `"nulls_are_last"`
+    """
 
-    # Whether the behavior of a / b depends on the types of a and b.
-    # False means a / b is always float division.
-    # True means a / b is integer division if both a and b are integers.
     TYPED_DIVISION = False
+    """
+    Whether the behavior of `a / b` depends on the types of `a` and `b`.
+    False means `a / b` is always float division.
+    True means `a / b` is integer division if both `a` and `b` are integers.
+    """
 
-    # False means 1 / 0 throws an error.
-    # True means 1 / 0 returns null.
     SAFE_DIVISION = False
+    """Determines whether division by zero throws an error (`False`) or returns NULL (`True`)."""
 
-    # A NULL arg in CONCAT yields NULL by default, but in some dialects it yields an empty string
     CONCAT_COALESCE = False
+    """A `NULL` arg in `CONCAT` yields `NULL` by default, but in some dialects it yields an empty string."""
 
     DATE_FORMAT = "'%Y-%m-%d'"
     DATEINT_FORMAT = "'%Y%m%d'"
     TIME_FORMAT = "'%Y-%m-%d %H:%M:%S'"
 
-    # Custom time mappings in which the key represents dialect time format
-    # and the value represents a python time format
     TIME_MAPPING: t.Dict[str, str] = {}
+    """Associates this dialect's time formats with their equivalent Python `strftime` format."""
 
     # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time
     # https://docs.teradata.com/r/Teradata-Database-SQL-Functions-Operators-Expressions-and-Predicates/March-2017/Data-Type-Conversions/Character-to-DATE-Conversion/Forcing-a-FORMAT-on-CAST-for-Converting-Character-to-DATE
-    # special syntax cast(x as date format 'yyyy') defaults to time_mapping
     FORMAT_MAPPING: t.Dict[str, str] = {}
+    """
+    Helper which is used for parsing the special syntax `CAST(x AS DATE FORMAT 'yyyy')`.
+    If empty, the corresponding trie will be constructed off of `TIME_MAPPING`.
+    """
 
-    # Mapping of an unescaped escape sequence to the corresponding character
     ESCAPE_SEQUENCES: t.Dict[str, str] = {}
+    """Mapping of an unescaped escape sequence to the corresponding character."""
 
-    # Columns that are auto-generated by the engine corresponding to this dialect
-    # Such columns may be excluded from SELECT * queries, for example
     PSEUDOCOLUMNS: t.Set[str] = set()
+    """
+    Columns that are auto-generated by the engine corresponding to this dialect.
+    For example, such columns may be excluded from `SELECT *` queries.
+    """
 
     # --- Autofilled ---
 
@@ -221,13 +241,15 @@ class Dialect(metaclass=_Dialect):
     IDENTIFIER_START = '"'
     IDENTIFIER_END = '"'
 
-    # Delimiters for bit, hex and byte literals
+    # Delimiters for bit, hex, byte and unicode literals
     BIT_START: t.Optional[str] = None
     BIT_END: t.Optional[str] = None
     HEX_START: t.Optional[str] = None
     HEX_END: t.Optional[str] = None
     BYTE_START: t.Optional[str] = None
     BYTE_END: t.Optional[str] = None
+    UNICODE_START: t.Optional[str] = None
+    UNICODE_END: t.Optional[str] = None
 
     @classmethod
     def get_or_raise(cls, dialect: DialectType) -> Dialect:
@@ -275,6 +297,7 @@ class Dialect(metaclass=_Dialect):
     def format_time(
         cls, expression: t.Optional[str | exp.Expression]
     ) -> t.Optional[exp.Expression]:
+        """Converts a time format in this dialect to its equivalent Python `strftime` format."""
         if isinstance(expression, str):
             return exp.Literal.string(
                 # the time formats are quoted
@@ -306,9 +329,9 @@ class Dialect(metaclass=_Dialect):
         """
         Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
 
-        For example, an identifier like FoO would be resolved as foo in Postgres, because it
+        For example, an identifier like `FoO` would be resolved as `foo` in Postgres, because it
         lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
-        it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
+        it would resolve it as `FOO`. If it was quoted, it'd need to be treated as case-sensitive,
         and so any normalization would be prohibited in order to avoid "breaking" the identifier.
 
         There are also dialects like Spark, which are case-insensitive even when quotes are
@@ -356,8 +379,8 @@ class Dialect(metaclass=_Dialect):
         Args:
             text: The text to check.
             identify:
-                "always" or `True`: Always returns true.
-                "safe": True if the identifier is case-insensitive.
+                `"always"` or `True`: Always returns `True`.
+                `"safe"`: Only returns `True` if the identifier is case-insensitive.
 
         Returns:
             Whether or not the given text can be identified.
@@ -371,6 +394,14 @@ class Dialect(metaclass=_Dialect):
         return False
 
     def quote_identifier(self, expression: E, identify: bool = True) -> E:
+        """
+        Adds quotes to a given identifier.
+
+        Args:
+            expression: The expression of interest. If it's not an `Identifier`, this method is a no-op.
+            identify: If set to `False`, the quotes will only be added if the identifier is deemed
+                "unsafe", with respect to its characters and this dialect's normalization strategy.
+        """
         if isinstance(expression, exp.Identifier):
             name = expression.this
             expression.set(
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-12-19 11:01:55 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-12-19 11:01:55 +0000
commit	f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5 (patch)
tree	5dce0fe2a11381761496eb973c20750f44db56d5 /sqlglot/dialects/dialect.py
parent	Releasing debian version 20.1.0-1. (diff)
download	sqlglot-f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5.tar.xz sqlglot-f1c2dbe3b17a0d5edffbb65b85b642d0bb2756c5.zip