Diffstat (limited to 'sqlglot/dialects/hive.py')
-rw-r--r--  sqlglot/dialects/hive.py  41
1 file changed, 39 insertions(+), 2 deletions(-)
diff --git a/sqlglot/dialects/hive.py b/sqlglot/dialects/hive.py
index fbd626a..650a1e1 100644
--- a/sqlglot/dialects/hive.py
+++ b/sqlglot/dialects/hive.py
@@ -9,6 +9,7 @@ from sqlglot.dialects.dialect import (
create_with_partitions_sql,
format_time_lambda,
if_sql,
+ left_to_substring_sql,
locate_to_strposition,
max_or_greatest,
min_or_least,
@@ -17,6 +18,7 @@ from sqlglot.dialects.dialect import (
no_safe_divide_sql,
no_trycast_sql,
rename_func,
+ right_to_substring_sql,
strposition_to_locate_sql,
struct_extract_sql,
timestrtotime_sql,
@@ -89,7 +91,7 @@ def _json_format_sql(self: generator.Generator, expression: exp.JSONFormat) -> s
annotate_types(this)
- if this.type.is_type(exp.DataType.Type.JSON):
+ if this.type.is_type("json"):
return self.sql(this)
return self.func("TO_JSON", this, expression.args.get("options"))
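
The is_type helper accepts plain type-name strings as well as exp.DataType.Type members, so the JSON check above can drop the enum reference. A minimal sketch of the string form, assuming a sqlglot build that includes this change:

    from sqlglot import exp

    # Build a JSON data type node and check it by name; the string form is
    # equivalent to passing exp.DataType.Type.JSON here.
    node = exp.DataType.build("json")
    print(node.is_type("json"))  # True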
@@ -149,6 +151,7 @@ def _to_date_sql(self: generator.Generator, expression: exp.TsOrDsToDate) -> str
class Hive(Dialect):
alias_post_tablesample = True
+ identifiers_can_start_with_digit = True
time_mapping = {
"y": "%Y",
@@ -190,7 +193,6 @@ class Hive(Dialect):
IDENTIFIERS = ["`"]
STRING_ESCAPES = ["\\"]
ENCODE = "utf-8"
- IDENTIFIER_CAN_START_WITH_DIGIT = True
KEYWORDS = {
**tokens.Tokenizer.KEYWORDS,
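
With identifiers_can_start_with_digit promoted to a dialect-level setting (replacing the tokenizer constant IDENTIFIER_CAN_START_WITH_DIGIT), digit-leading names parse under the Hive dialect. A minimal sketch, assuming sqlglot is installed; the table name and the quoted output are illustrative:

    import sqlglot

    # Hive allows unquoted identifiers that begin with a digit; dialects that do
    # not are expected to quote them on output.
    sql = sqlglot.transpile("SELECT col FROM 2022_sales", read="hive", write="presto")[0]
    print(sql)  # e.g. SELECT col FROM "2022_sales"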
@@ -276,6 +278,39 @@ class Hive(Dialect):
"cluster": lambda self: self._parse_sort(exp.Cluster, "CLUSTER", "BY"),
}
+ def _parse_types(
+ self, check_func: bool = False, schema: bool = False
+ ) -> t.Optional[exp.Expression]:
+ """
+ Spark (and most likely Hive) treats casts to CHAR(length) and VARCHAR(length) as casts to
+    STRING in all contexts except schema definitions. For example, this is the behavior in Spark v3.4.0:
+
+ spark-sql (default)> select cast(1234 as varchar(2));
+ 23/06/06 15:51:18 WARN CharVarcharUtils: The Spark cast operator does not support
+ char/varchar type and simply treats them as string type. Please use string type
+ directly to avoid confusion. Otherwise, you can set spark.sql.legacy.charVarcharAsString
+ to true, so that Spark treat them as string type as same as Spark 3.0 and earlier
+
+ 1234
+ Time taken: 4.265 seconds, Fetched 1 row(s)
+
+ This shows that Spark doesn't truncate the value into '12', which is inconsistent with
+ what other dialects (e.g. postgres) do, so we need to drop the length to transpile correctly.
+
+ Reference: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
+ """
+ this = super()._parse_types(check_func=check_func, schema=schema)
+
+ if this and not schema:
+ return this.transform(
+ lambda node: node.replace(exp.DataType.build("text"))
+ if isinstance(node, exp.DataType) and node.is_type("char", "varchar")
+ else node,
+ copy=False,
+ )
+
+ return this
+
class Generator(generator.Generator):
LIMIT_FETCH = "LIMIT"
TABLESAMPLE_WITH_METHOD = False
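
A minimal sketch of the resulting cast behavior, assuming a sqlglot build with this patch applied; the exact rendering may differ by version:

    import sqlglot

    # Outside schema definitions, CHAR(n)/VARCHAR(n) casts are widened to STRING,
    # mirroring Spark/Hive semantics (no truncation of 1234 to '12').
    print(sqlglot.transpile("SELECT CAST(1234 AS VARCHAR(2))", read="hive", write="hive")[0])
    # e.g. SELECT CAST(1234 AS STRING)

    # In a schema context the length is kept, per the schema check above.
    print(sqlglot.transpile("CREATE TABLE t (c VARCHAR(2))", read="hive", write="hive")[0])
    # e.g. CREATE TABLE t (c VARCHAR(2))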
@@ -323,6 +358,7 @@ class Hive(Dialect):
exp.JSONExtract: rename_func("GET_JSON_OBJECT"),
exp.JSONExtractScalar: rename_func("GET_JSON_OBJECT"),
exp.JSONFormat: _json_format_sql,
+ exp.Left: left_to_substring_sql,
exp.Map: var_map_sql,
exp.Max: max_or_greatest,
exp.Min: min_or_least,
@@ -332,6 +368,7 @@ class Hive(Dialect):
exp.ApproxQuantile: rename_func("PERCENTILE_APPROX"),
exp.RegexpLike: lambda self, e: self.binary(e, "RLIKE"),
exp.RegexpSplit: rename_func("SPLIT"),
+ exp.Right: right_to_substring_sql,
exp.SafeDivide: no_safe_divide_sql,
exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
exp.SetAgg: rename_func("COLLECT_SET"),
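
A minimal sketch of the new LEFT/RIGHT handling, assuming the default read dialect parses them into exp.Left and exp.Right; the exact SUBSTRING arguments shown are illustrative:

    import sqlglot

    # Hive has no LEFT/RIGHT functions, so left_to_substring_sql and
    # right_to_substring_sql rewrite them as SUBSTRING calls.
    print(sqlglot.transpile("SELECT LEFT(x, 3), RIGHT(x, 3) FROM t", write="hive")[0])
    # e.g. SELECT SUBSTRING(x, 1, 3), SUBSTRING(x, LENGTH(x) - 3 + 1, 3) FROM t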