diff options
Diffstat (limited to 'sqlglot/dialects/hive.py')
-rw-r--r-- | sqlglot/dialects/hive.py | 41 |
1 files changed, 39 insertions, 2 deletions
diff --git a/sqlglot/dialects/hive.py b/sqlglot/dialects/hive.py index fbd626a..650a1e1 100644 --- a/sqlglot/dialects/hive.py +++ b/sqlglot/dialects/hive.py @@ -9,6 +9,7 @@ from sqlglot.dialects.dialect import ( create_with_partitions_sql, format_time_lambda, if_sql, + left_to_substring_sql, locate_to_strposition, max_or_greatest, min_or_least, @@ -17,6 +18,7 @@ from sqlglot.dialects.dialect import ( no_safe_divide_sql, no_trycast_sql, rename_func, + right_to_substring_sql, strposition_to_locate_sql, struct_extract_sql, timestrtotime_sql, @@ -89,7 +91,7 @@ def _json_format_sql(self: generator.Generator, expression: exp.JSONFormat) -> s annotate_types(this) - if this.type.is_type(exp.DataType.Type.JSON): + if this.type.is_type("json"): return self.sql(this) return self.func("TO_JSON", this, expression.args.get("options")) @@ -149,6 +151,7 @@ def _to_date_sql(self: generator.Generator, expression: exp.TsOrDsToDate) -> str class Hive(Dialect): alias_post_tablesample = True + identifiers_can_start_with_digit = True time_mapping = { "y": "%Y", @@ -190,7 +193,6 @@ class Hive(Dialect): IDENTIFIERS = ["`"] STRING_ESCAPES = ["\\"] ENCODE = "utf-8" - IDENTIFIER_CAN_START_WITH_DIGIT = True KEYWORDS = { **tokens.Tokenizer.KEYWORDS, @@ -276,6 +278,39 @@ class Hive(Dialect): "cluster": lambda self: self._parse_sort(exp.Cluster, "CLUSTER", "BY"), } + def _parse_types( + self, check_func: bool = False, schema: bool = False + ) -> t.Optional[exp.Expression]: + """ + Spark (and most likely Hive) treats casts to CHAR(length) and VARCHAR(length) as casts to + STRING in all contexts except for schema definitions. For example, this is in Spark v3.4.0: + + spark-sql (default)> select cast(1234 as varchar(2)); + 23/06/06 15:51:18 WARN CharVarcharUtils: The Spark cast operator does not support + char/varchar type and simply treats them as string type. Please use string type + directly to avoid confusion.
Otherwise, you can set spark.sql.legacy.charVarcharAsString + to true, so that Spark treat them as string type as same as Spark 3.0 and earlier + + 1234 + Time taken: 4.265 seconds, Fetched 1 row(s) + + This shows that Spark doesn't truncate the value into '12', which is inconsistent with + what other dialects (e.g. postgres) do, so we need to drop the length to transpile correctly. + + Reference: https://spark.apache.org/docs/latest/sql-ref-datatypes.html + """ + this = super()._parse_types(check_func=check_func, schema=schema) + + if this and not schema: + return this.transform( + lambda node: node.replace(exp.DataType.build("text")) + if isinstance(node, exp.DataType) and node.is_type("char", "varchar") + else node, + copy=False, + ) + + return this + class Generator(generator.Generator): LIMIT_FETCH = "LIMIT" TABLESAMPLE_WITH_METHOD = False @@ -323,6 +358,7 @@ class Hive(Dialect): exp.JSONExtract: rename_func("GET_JSON_OBJECT"), exp.JSONExtractScalar: rename_func("GET_JSON_OBJECT"), exp.JSONFormat: _json_format_sql, + exp.Left: left_to_substring_sql, exp.Map: var_map_sql, exp.Max: max_or_greatest, exp.Min: min_or_least, @@ -332,6 +368,7 @@ class Hive(Dialect): exp.ApproxQuantile: rename_func("PERCENTILE_APPROX"), exp.RegexpLike: lambda self, e: self.binary(e, "RLIKE"), exp.RegexpSplit: rename_func("SPLIT"), + exp.Right: right_to_substring_sql, exp.SafeDivide: no_safe_divide_sql, exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), exp.SetAgg: rename_func("COLLECT_SET"), |