from tests.dialects.test_dialect import Validator from sqlglot import exp class TestHive(Validator): dialect = "hive" def test_bits(self): self.validate_all( "x & 1", read={ "duckdb": "x & 1", "presto": "BITWISE_AND(x, 1)", "spark": "x & 1", }, write={ "duckdb": "x & 1", "hive": "x & 1", "presto": "BITWISE_AND(x, 1)", "spark": "x & 1", }, ) self.validate_all( "x & 1 > 0", read={ "duckdb": "x & 1 > 0", "presto": "BITWISE_AND(x, 1) > 0", "spark": "x & 1 > 0", }, write={ "duckdb": "x & 1 > 0", "presto": "BITWISE_AND(x, 1) > 0", "hive": "x & 1 > 0", "spark": "x & 1 > 0", }, ) self.validate_all( "~x", read={ "duckdb": "~x", "presto": "BITWISE_NOT(x)", "spark": "~x", }, write={ "duckdb": "~x", "hive": "~x", "presto": "BITWISE_NOT(x)", "spark": "~x", }, ) self.validate_all( "x | 1", read={ "duckdb": "x | 1", "presto": "BITWISE_OR(x, 1)", "spark": "x | 1", }, write={ "duckdb": "x | 1", "hive": "x | 1", "presto": "BITWISE_OR(x, 1)", "spark": "x | 1", }, ) self.validate_all( "x << 1", read={ "spark": "SHIFTLEFT(x, 1)", }, write={ "duckdb": "x << 1", "presto": "BITWISE_ARITHMETIC_SHIFT_LEFT(x, 1)", "hive": "x << 1", "spark": "SHIFTLEFT(x, 1)", }, ) self.validate_all( "x >> 1", read={ "spark": "SHIFTRIGHT(x, 1)", }, write={ "duckdb": "x >> 1", "presto": "BITWISE_ARITHMETIC_SHIFT_RIGHT(x, 1)", "hive": "x >> 1", "spark": "SHIFTRIGHT(x, 1)", }, ) def test_cast(self): self.validate_all( "1s", write={ "duckdb": "TRY_CAST(1 AS SMALLINT)", "presto": "TRY_CAST(1 AS SMALLINT)", "hive": "CAST(1 AS SMALLINT)", "spark": "CAST(1 AS SMALLINT)", }, ) self.validate_all( "1S", write={ "duckdb": "TRY_CAST(1 AS SMALLINT)", "presto": "TRY_CAST(1 AS SMALLINT)", "hive": "CAST(1 AS SMALLINT)", "spark": "CAST(1 AS SMALLINT)", }, ) self.validate_all( "1Y", write={ "duckdb": "TRY_CAST(1 AS TINYINT)", "presto": "TRY_CAST(1 AS TINYINT)", "hive": "CAST(1 AS TINYINT)", "spark": "CAST(1 AS TINYINT)", }, ) self.validate_all( "1L", write={ "duckdb": "TRY_CAST(1 AS BIGINT)", "presto": "TRY_CAST(1 AS BIGINT)", "hive": "CAST(1 AS BIGINT)", "spark": "CAST(1 AS BIGINT)", }, ) self.validate_all( "1.0bd", write={ "duckdb": "TRY_CAST(1.0 AS DECIMAL)", "presto": "TRY_CAST(1.0 AS DECIMAL)", "hive": "CAST(1.0 AS DECIMAL)", "spark": "CAST(1.0 AS DECIMAL)", }, ) self.validate_all( "CAST(1 AS INT)", read={ "presto": "TRY_CAST(1 AS INT)", }, write={ "duckdb": "TRY_CAST(1 AS INT)", "presto": "TRY_CAST(1 AS INTEGER)", "hive": "CAST(1 AS INT)", "spark": "CAST(1 AS INT)", }, ) def test_ddl(self): self.validate_all( "CREATE TABLE x (w STRING) PARTITIONED BY (y INT, z INT)", write={ "duckdb": "CREATE TABLE x (w TEXT)", # Partition columns should exist in table "presto": "CREATE TABLE x (w VARCHAR, y INTEGER, z INTEGER) WITH (PARTITIONED_BY=ARRAY['y', 'z'])", "hive": "CREATE TABLE x (w STRING) PARTITIONED BY (y INT, z INT)", "spark": "CREATE TABLE x (w STRING, y INT, z INT) PARTITIONED BY (y, z)", }, ) self.validate_all( "CREATE TABLE test STORED AS parquet TBLPROPERTIES ('x'='1', 'Z'='2') AS SELECT 1", write={ "duckdb": "CREATE TABLE test AS SELECT 1", "presto": "CREATE TABLE test WITH (FORMAT='PARQUET', x='1', Z='2') AS SELECT 1", "hive": "CREATE TABLE test STORED AS PARQUET TBLPROPERTIES ('x'='1', 'Z'='2') AS SELECT 1", "spark": "CREATE TABLE test USING PARQUET TBLPROPERTIES ('x'='1', 'Z'='2') AS SELECT 1", }, ) self.validate_identity( """CREATE EXTERNAL TABLE x (y INT) ROW FORMAT SERDE 'serde' ROW FORMAT DELIMITED FIELDS TERMINATED BY '1' WITH SERDEPROPERTIES ('input.regex'='')""", ) self.validate_identity( """CREATE EXTERNAL TABLE `my_table` (`a7` ARRAY) ROW FORMAT SERDE 'a' STORED AS INPUTFORMAT 'b' OUTPUTFORMAT 'c' LOCATION 'd' TBLPROPERTIES ('e'='f')""" ) self.validate_identity("ALTER VIEW v1 AS SELECT x, UPPER(s) AS s FROM t2") self.validate_identity("ALTER VIEW v1 (c1, c2) AS SELECT x, UPPER(s) AS s FROM t2") self.validate_identity( "ALTER VIEW v7 (c1 COMMENT 'Comment for c1', c2) AS SELECT t1.c1, t1.c2 FROM t1" ) self.validate_identity("ALTER VIEW db1.v1 RENAME TO db2.v2") self.validate_identity("ALTER VIEW v1 SET TBLPROPERTIES ('tblp1'='1', 'tblp2'='2')") self.validate_identity( "ALTER VIEW v1 UNSET TBLPROPERTIES ('tblp1', 'tblp2')", check_command_warning=True ) def test_lateral_view(self): self.validate_all( "SELECT a, b FROM x LATERAL VIEW EXPLODE(y) t AS a LATERAL VIEW EXPLODE(z) u AS b", write={ "presto": "SELECT a, b FROM x CROSS JOIN UNNEST(y) AS t(a) CROSS JOIN UNNEST(z) AS u(b)", "duckdb": "SELECT a, b FROM x CROSS JOIN UNNEST(y) AS t(a) CROSS JOIN UNNEST(z) AS u(b)", "hive": "SELECT a, b FROM x LATERAL VIEW EXPLODE(y) t AS a LATERAL VIEW EXPLODE(z) u AS b", "spark": "SELECT a, b FROM x LATERAL VIEW EXPLODE(y) t AS a LATERAL VIEW EXPLODE(z) u AS b", }, ) self.validate_all( "SELECT a FROM x LATERAL VIEW EXPLODE(y) t AS a", write={ "presto": "SELECT a FROM x CROSS JOIN UNNEST(y) AS t(a)", "duckdb": "SELECT a FROM x CROSS JOIN UNNEST(y) AS t(a)", "hive": "SELECT a FROM x LATERAL VIEW EXPLODE(y) t AS a", "spark": "SELECT a FROM x LATERAL VIEW EXPLODE(y) t AS a", }, ) self.validate_all( "SELECT a FROM x LATERAL VIEW POSEXPLODE(y) t AS a", write={ "presto": "SELECT a FROM x CROSS JOIN UNNEST(y) WITH ORDINALITY AS t(a)", "hive": "SELECT a FROM x LATERAL VIEW POSEXPLODE(y) t AS a", "spark": "SELECT a FROM x LATERAL VIEW POSEXPLODE(y) t AS a", }, ) self.validate_all( "SELECT a FROM x LATERAL VIEW EXPLODE(ARRAY(y)) t AS a", write={ "presto": "SELECT a FROM x CROSS JOIN UNNEST(ARRAY[y]) AS t(a)", "duckdb": "SELECT a FROM x CROSS JOIN UNNEST([y]) AS t(a)", "hive": "SELECT a FROM x LATERAL VIEW EXPLODE(ARRAY(y)) t AS a", "spark": "SELECT a FROM x LATERAL VIEW EXPLODE(ARRAY(y)) t AS a", }, ) def test_quotes(self): self.validate_all( "'\\''", write={ "duckdb": "''''", "presto": "''''", "hive": "'\\''", "spark": "'\\''", }, ) self.validate_all( "'\"x\"'", write={ "duckdb": "'\"x\"'", "presto": "'\"x\"'", "hive": "'\"x\"'", "spark": "'\"x\"'", }, ) self.validate_all( "\"'x'\"", write={ "duckdb": "'''x'''", "presto": "'''x'''", "hive": "'\\'x\\''", "spark": "'\\'x\\''", }, ) self.validate_all( "'\\\\\\\\a'", read={ "drill": "'\\\\\\\\a'", "duckdb": "'\\\\a'", "presto": "'\\\\a'", }, write={ "drill": "'\\\\\\\\a'", "duckdb": "'\\\\a'", "hive": "'\\\\\\\\a'", "presto": "'\\\\a'", "spark": "'\\\\\\\\a'", }, ) def test_regex(self): self.validate_all( "a RLIKE 'x'", write={ "duckdb": "REGEXP_MATCHES(a, 'x')", "presto": "REGEXP_LIKE(a, 'x')", "hive": "a RLIKE 'x'", "spark": "a RLIKE 'x'", }, ) self.validate_all( "a REGEXP 'x'", write={ "duckdb": "REGEXP_MATCHES(a, 'x')", "presto": "REGEXP_LIKE(a, 'x')", "hive": "a RLIKE 'x'", "spark": "a RLIKE 'x'", }, ) def test_time(self): self.validate_all( "(UNIX_TIMESTAMP(y) - UNIX_TIMESTAMP(x)) * 1000", read={ "presto": "DATE_DIFF('millisecond', x, y)", }, ) self.validate_all( "UNIX_TIMESTAMP(y) - UNIX_TIMESTAMP(x)", read={ "presto": "DATE_DIFF('second', x, y)", }, ) self.validate_all( "(UNIX_TIMESTAMP(y) - UNIX_TIMESTAMP(x)) / 60", read={ "presto": "DATE_DIFF('minute', x, y)", }, ) self.validate_all( "(UNIX_TIMESTAMP(y) - UNIX_TIMESTAMP(x)) / 3600", read={ "presto": "DATE_DIFF('hour', x, y)", }, ) self.validate_all( "DATEDIFF(a, b)", write={ "duckdb": "DATE_DIFF('DAY', CAST(b AS DATE), CAST(a AS DATE))", "presto": "DATE_DIFF('DAY', CAST(CAST(b AS TIMESTAMP) AS DATE), CAST(CAST(a AS TIMESTAMP) AS DATE))", "hive": "DATEDIFF(TO_DATE(a), TO_DATE(b))", "spark": "DATEDIFF(TO_DATE(a), TO_DATE(b))", "": "DATEDIFF(CAST(a AS DATE), CAST(b AS DATE))", }, ) self.validate_all( """from_unixtime(x, "yyyy-MM-dd'T'HH")""", write={ "duckdb": "STRFTIME(TO_TIMESTAMP(x), '%Y-%m-%d''T''%H')", "presto": "DATE_FORMAT(FROM_UNIXTIME(x), '%Y-%m-%d''T''%H')", "hive": "FROM_UNIXTIME(x, 'yyyy-MM-dd\\'T\\'HH')", "spark": "FROM_UNIXTIME(x, 'yyyy-MM-dd\\'T\\'HH')", }, ) self.validate_all( "DATE_FORMAT('2020-01-01', 'yyyy-MM-dd HH:mm:ss')", write={ "bigquery": "FORMAT_DATE('%Y-%m-%d %H:%M:%S', CAST('2020-01-01' AS DATETIME))", "duckdb": "STRFTIME(CAST('2020-01-01' AS TIMESTAMP), '%Y-%m-%d %H:%M:%S')", "presto": "DATE_FORMAT(CAST('2020-01-01' AS TIMESTAMP), '%Y-%m-%d %T')", "hive": "DATE_FORMAT(CAST('2020-01-01' AS TIMESTAMP), 'yyyy-MM-dd HH:mm:ss')", "spark": "DATE_FORMAT(CAST('2020-01-01' AS TIMESTAMP), 'yyyy-MM-dd HH:mm:ss')", }, ) self.validate_all( "DATE_ADD('2020-01-01', 1)", write={ "": "TS_OR_DS_ADD('2020-01-01', 1, DAY)", "bigquery": "DATE_ADD(CAST(CAST('2020-01-01' AS DATETIME) AS DATE), INTERVAL 1 DAY)", "duckdb": "CAST('2020-01-01' AS DATE) + INTERVAL 1 DAY", "hive": "DATE_ADD('2020-01-01', 1)", "presto": "DATE_ADD('DAY', 1, CAST(CAST('2020-01-01' AS TIMESTAMP) AS DATE))", "redshift": "DATEADD(DAY, 1, '2020-01-01')", "snowflake": "DATEADD(DAY, 1, CAST(CAST('2020-01-01' AS TIMESTAMP) AS DATE))", "spark": "DATE_ADD('2020-01-01', 1)", "tsql": "DATEADD(DAY, 1, CAST(CAST('2020-01-01' AS DATETIME2) AS DATE))", }, ) self.validate_all( "DATE_SUB('2020-01-01', 1)", write={ "": "TS_OR_DS_ADD('2020-01-01', 1 * -1, DAY)", "bigquery": "DATE_ADD(CAST(CAST('2020-01-01' AS DATETIME) AS DATE), INTERVAL (1 * -1) DAY)", "duckdb": "CAST('2020-01-01' AS DATE) + INTERVAL (1 * -1) DAY", "hive": "DATE_ADD('2020-01-01', 1 * -1)", "presto": "DATE_ADD('DAY', 1 * -1, CAST(CAST('2020-01-01' AS TIMESTAMP) AS DATE))", "redshift": "DATEADD(DAY, 1 * -1, '2020-01-01')", "snowflake": "DATEADD(DAY, 1 * -1, CAST(CAST('2020-01-01' AS TIMESTAMP) AS DATE))", "spark": "DATE_ADD('2020-01-01', 1 * -1)", "tsql": "DATEADD(DAY, 1 * -1, CAST(CAST('2020-01-01' AS DATETIME2) AS DATE))", }, ) self.validate_all("DATE_ADD('2020-01-01', -1)", read={"": "DATE_SUB('2020-01-01', 1)"}) self.validate_all("DATE_ADD(a, b * -1)", read={"": "DATE_SUB(a, b)"}) self.validate_all( "ADD_MONTHS('2020-01-01', -2)", read={"": "DATE_SUB('2020-01-01', 2, month)"} ) self.validate_all( "DATEDIFF(TO_DATE(y), x)", write={ "duckdb": "DATE_DIFF('DAY', CAST(x AS DATE), CAST(y AS DATE))", "presto": "DATE_DIFF('DAY', CAST(CAST(x AS TIMESTAMP) AS DATE), CAST(CAST(CAST(CAST(y AS TIMESTAMP) AS DATE) AS TIMESTAMP) AS DATE))", "hive": "DATEDIFF(TO_DATE(y), TO_DATE(x))", "spark": "DATEDIFF(TO_DATE(y), TO_DATE(x))", "": "DATEDIFF(CAST(y AS DATE), CAST(x AS DATE))", }, ) self.validate_all( "UNIX_TIMESTAMP(x)", write={ "duckdb": "EPOCH(STRPTIME(x, '%Y-%m-%d %H:%M:%S'))", "presto": "TO_UNIXTIME(COALESCE(TRY(DATE_PARSE(CAST(x AS VARCHAR), '%Y-%m-%d %T')), PARSE_DATETIME(DATE_FORMAT(x, '%Y-%m-%d %T'), 'yyyy-MM-dd HH:mm:ss')))", "hive": "UNIX_TIMESTAMP(x)", "spark": "UNIX_TIMESTAMP(x)", "": "STR_TO_UNIX(x, '%Y-%m-%d %H:%M:%S')", }, ) for unit in ("DAY", "MONTH", "YEAR"): self.validate_all( f"{unit}(x)", write={ "duckdb": f"{unit}(CAST(x AS DATE))", "presto": f"{unit}(CAST(CAST(x AS TIMESTAMP) AS DATE))", "hive": f"{unit}(TO_DATE(x))", "spark": f"{unit}(TO_DATE(x))", }, ) def test_order_by(self): self.validate_all( "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC NULLS LAST, lname", write={ "duckdb": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC, lname NULLS FIRST", "presto": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC, lname NULLS FIRST", "hive": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC NULLS LAST, lname", "spark": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC NULLS LAST, lname", }, ) def test_hive(self): self.validate_identity("SELECT * FROM t WHERE col IN ('stream')") self.validate_identity("SET hiveconf:some_var = 5", check_command_warning=True) self.validate_identity("(VALUES (1 AS a, 2 AS b, 3))") self.validate_identity("SELECT * FROM my_table TIMESTAMP AS OF DATE_ADD(CURRENT_DATE, -1)") self.validate_identity("SELECT * FROM my_table VERSION AS OF DATE_ADD(CURRENT_DATE, -1)") self.validate_identity( "SELECT WEEKOFYEAR('2024-05-22'), DAYOFMONTH('2024-05-22'), DAYOFWEEK('2024-05-22')" ) self.validate_identity( "SELECT ROW() OVER (DISTRIBUTE BY x SORT BY y)", "SELECT ROW() OVER (PARTITION BY x ORDER BY y)", ) self.validate_identity("SELECT transform") self.validate_identity("SELECT * FROM test DISTRIBUTE BY y SORT BY x DESC ORDER BY l") self.validate_identity( "SELECT * FROM test WHERE RAND() <= 0.1 DISTRIBUTE BY RAND() SORT BY RAND()" ) self.validate_identity("(SELECT 1 UNION SELECT 2) DISTRIBUTE BY z") self.validate_identity("(SELECT 1 UNION SELECT 2) DISTRIBUTE BY z SORT BY x") self.validate_identity("(SELECT 1 UNION SELECT 2) CLUSTER BY y DESC") self.validate_identity("SELECT * FROM test CLUSTER BY y") self.validate_identity("(SELECT 1 UNION SELECT 2) SORT BY z") self.validate_identity( "INSERT OVERWRITE TABLE zipcodes PARTITION(state = '0') VALUES (896, 'US', 'TAMPA', 33607)" ) self.validate_identity( "INSERT OVERWRITE TABLE zipcodes PARTITION(state = 0) VALUES (896, 'US', 'TAMPA', 33607)" ) self.validate_identity( "INSERT OVERWRITE DIRECTORY 'x' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY ':' LINES TERMINATED BY '' STORED AS TEXTFILE SELECT * FROM `a`.`b`" ) self.validate_identity( "SELECT a, b, SUM(c) FROM tabl AS t GROUP BY a, b, GROUPING SETS ((a, b), a)" ) self.validate_identity( "SELECT a, b, SUM(c) FROM tabl AS t GROUP BY a, b, GROUPING SETS ((t.a, b), a)" ) self.validate_identity( "SELECT a, b, SUM(c) FROM tabl AS t GROUP BY a, FOO(b), GROUPING SETS ((a, FOO(b)), a)" ) self.validate_identity( "SELECT key, value, GROUPING__ID, COUNT(*) FROM T1 GROUP BY key, value WITH CUBE" ) self.validate_identity( "SELECT key, value, GROUPING__ID, COUNT(*) FROM T1 GROUP BY key, value WITH ROLLUP" ) self.validate_identity( "TRUNCATE TABLE t1 PARTITION(age = 10, name = 'test', address = 'abc')" ) self.validate_all( "SELECT ${hiveconf:some_var}", write={ "hive": "SELECT ${hiveconf:some_var}", "spark": "SELECT ${hiveconf:some_var}", }, ) self.validate_all( "SELECT A.1a AS b FROM test_a AS A", write={ "spark": "SELECT A.1a AS b FROM test_a AS A", }, ) self.validate_all( "SELECT 1_a AS a FROM test_table", write={ "spark": "SELECT 1_a AS a FROM test_table", "trino": 'SELECT "1_a" AS a FROM test_table', }, ) self.validate_all( "SELECT a_b AS 1_a FROM test_table", write={ "spark": "SELECT a_b AS 1_a FROM test_table", }, ) self.validate_all( "SELECT 1a_1a FROM test_a", write={ "spark": "SELECT 1a_1a FROM test_a", }, ) self.validate_all( "SELECT 1a AS 1a_1a FROM test_a", write={ "spark": "SELECT 1a AS 1a_1a FROM test_a", }, ) self.validate_all( "CREATE TABLE test_table (1a STRING)", write={ "spark": "CREATE TABLE test_table (1a STRING)", }, ) self.validate_all( "CREATE TABLE test_table2 (1a_1a STRING)", write={ "spark": "CREATE TABLE test_table2 (1a_1a STRING)", }, ) self.validate_all( "PERCENTILE(x, 0.5)", write={ "duckdb": "QUANTILE(x, 0.5)", "presto": "APPROX_PERCENTILE(x, 0.5)", "hive": "PERCENTILE(x, 0.5)", "spark": "PERCENTILE(x, 0.5)", }, ) self.validate_all( "PERCENTILE_APPROX(x, 0.5)", read={ "hive": "PERCENTILE_APPROX(x, 0.5)", "presto": "APPROX_PERCENTILE(x, 0.5)", "duckdb": "APPROX_QUANTILE(x, 0.5)", "spark": "PERCENTILE_APPROX(x, 0.5)", }, write={ "hive": "PERCENTILE_APPROX(x, 0.5)", "presto": "APPROX_PERCENTILE(x, 0.5)", "duckdb": "APPROX_QUANTILE(x, 0.5)", "spark": "PERCENTILE_APPROX(x, 0.5)", }, ) self.validate_all( "APPROX_COUNT_DISTINCT(a)", write={ "bigquery": "APPROX_COUNT_DISTINCT(a)", "duckdb": "APPROX_COUNT_DISTINCT(a)", "presto": "APPROX_DISTINCT(a)", "hive": "APPROX_COUNT_DISTINCT(a)", "snowflake": "APPROX_COUNT_DISTINCT(a)", "spark": "APPROX_COUNT_DISTINCT(a)", }, ) self.validate_all( "ARRAY_CONTAINS(x, 1)", read={ "duckdb": "LIST_HAS(x, 1)", "snowflake": "ARRAY_CONTAINS(1, x)", }, write={ "duckdb": "ARRAY_CONTAINS(x, 1)", "presto": "CONTAINS(x, 1)", "hive": "ARRAY_CONTAINS(x, 1)", "spark": "ARRAY_CONTAINS(x, 1)", "snowflake": "ARRAY_CONTAINS(1, x)", }, ) self.validate_all( "SIZE(x)", write={ "duckdb": "ARRAY_LENGTH(x)", "presto": "CARDINALITY(x)", "hive": "SIZE(x)", "spark": "SIZE(x)", }, ) self.validate_all( "LOCATE('a', x)", write={ "duckdb": "STRPOS(x, 'a')", "presto": "STRPOS(x, 'a')", "hive": "LOCATE('a', x)", "spark": "LOCATE('a', x)", }, ) self.validate_all( "LOCATE('a', x, 3)", write={ "duckdb": "STRPOS(SUBSTR(x, 3), 'a') + 3 - 1", "presto": "STRPOS(SUBSTR(x, 3), 'a') + 3 - 1", "hive": "LOCATE('a', x, 3)", "spark": "LOCATE('a', x, 3)", }, ) self.validate_all( "INITCAP('new york')", write={ "duckdb": "INITCAP('new york')", "presto": r"REGEXP_REPLACE('new york', '(\w)(\w*)', x -> UPPER(x[1]) || LOWER(x[2]))", "hive": "INITCAP('new york')", "spark": "INITCAP('new york')", }, ) self.validate_all( "SELECT * FROM x.z TABLESAMPLE(10 PERCENT) y", write={ "hive": "SELECT * FROM x.z TABLESAMPLE (10 PERCENT) AS y", "spark": "SELECT * FROM x.z TABLESAMPLE (10 PERCENT) AS y", }, ) self.validate_all( "SELECT SORT_ARRAY(x)", write={ "duckdb": "SELECT ARRAY_SORT(x)", "presto": "SELECT ARRAY_SORT(x)", "hive": "SELECT SORT_ARRAY(x)", "spark": "SELECT SORT_ARRAY(x)", }, ) self.validate_all( "SELECT SORT_ARRAY(x, FALSE)", read={ "duckdb": "SELECT ARRAY_REVERSE_SORT(x)", "spark": "SELECT SORT_ARRAY(x, FALSE)", }, write={ "duckdb": "SELECT ARRAY_REVERSE_SORT(x)", "presto": "SELECT ARRAY_SORT(x, (a, b) -> CASE WHEN a < b THEN 1 WHEN a > b THEN -1 ELSE 0 END)", "hive": "SELECT SORT_ARRAY(x, FALSE)", "spark": "SELECT SORT_ARRAY(x, FALSE)", }, ) self.validate_all( "GET_JSON_OBJECT(x, '$.name')", write={ "presto": "JSON_EXTRACT_SCALAR(x, '$.name')", "hive": "GET_JSON_OBJECT(x, '$.name')", "spark": "GET_JSON_OBJECT(x, '$.name')", }, ) self.validate_all( "MAP(a, b, c, d)", read={ "": "VAR_MAP(a, b, c, d)", "clickhouse": "map(a, b, c, d)", "duckdb": "MAP([a, c], [b, d])", "hive": "MAP(a, b, c, d)", "presto": "MAP(ARRAY[a, c], ARRAY[b, d])", "spark": "MAP(a, b, c, d)", }, write={ "": "MAP(ARRAY(a, c), ARRAY(b, d))", "clickhouse": "map(a, b, c, d)", "duckdb": "MAP([a, c], [b, d])", "presto": "MAP(ARRAY[a, c], ARRAY[b, d])", "hive": "MAP(a, b, c, d)", "spark": "MAP(a, b, c, d)", "snowflake": "OBJECT_CONSTRUCT(a, b, c, d)", }, ) self.validate_all( "MAP(a, b)", write={ "duckdb": "MAP([a], [b])", "presto": "MAP(ARRAY[a], ARRAY[b])", "hive": "MAP(a, b)", "spark": "MAP(a, b)", "snowflake": "OBJECT_CONSTRUCT(a, b)", }, ) self.validate_all( "LOG(10)", write={ "duckdb": "LN(10)", "presto": "LN(10)", "hive": "LN(10)", "spark": "LN(10)", }, ) self.validate_all( 'ds = "2020-01-01"', write={ "duckdb": "ds = '2020-01-01'", "presto": "ds = '2020-01-01'", "hive": "ds = '2020-01-01'", "spark": "ds = '2020-01-01'", }, ) self.validate_all( "ds = \"1''2\"", write={ "duckdb": "ds = '1''''2'", "presto": "ds = '1''''2'", "hive": "ds = '1\\'\\'2'", "spark": "ds = '1\\'\\'2'", }, ) self.validate_all( "x == 1", write={ "duckdb": "x = 1", "presto": "x = 1", "hive": "x = 1", "spark": "x = 1", }, ) self.validate_all( "x div y", write={ "duckdb": "x // y", "presto": "CAST(CAST(x AS DOUBLE) / y AS INTEGER)", "hive": "CAST(x / y AS INT)", "spark": "CAST(x / y AS INT)", }, ) self.validate_all( "COLLECT_LIST(x)", read={ "presto": "ARRAY_AGG(x)", }, write={ "duckdb": "ARRAY_AGG(x) FILTER(WHERE x IS NOT NULL)", "presto": "ARRAY_AGG(x) FILTER(WHERE x IS NOT NULL)", "hive": "COLLECT_LIST(x)", "spark": "COLLECT_LIST(x)", }, ) self.validate_all( "COLLECT_SET(x)", read={ "doris": "COLLECT_SET(x)", "presto": "SET_AGG(x)", "snowflake": "ARRAY_UNIQUE_AGG(x)", }, write={ "doris": "COLLECT_SET(x)", "hive": "COLLECT_SET(x)", "presto": "SET_AGG(x)", "snowflake": "ARRAY_UNIQUE_AGG(x)", "spark": "COLLECT_SET(x)", }, ) self.validate_all( "SELECT * FROM x TABLESAMPLE (1 PERCENT) AS foo", read={ "presto": "SELECT * FROM x AS foo TABLESAMPLE BERNOULLI (1)", "snowflake": "SELECT * FROM x AS foo TABLESAMPLE (1)", }, write={ "hive": "SELECT * FROM x TABLESAMPLE (1 PERCENT) AS foo", "snowflake": "SELECT * FROM x AS foo TABLESAMPLE (1)", "spark": "SELECT * FROM x TABLESAMPLE (1 PERCENT) AS foo", }, ) self.validate_all( "SELECT a, SUM(c) FROM t GROUP BY a, DATE_FORMAT(b, 'yyyy'), GROUPING SETS ((a, DATE_FORMAT(b, 'yyyy')), a)", write={ "hive": "SELECT a, SUM(c) FROM t GROUP BY a, DATE_FORMAT(CAST(b AS TIMESTAMP), 'yyyy'), GROUPING SETS ((a, DATE_FORMAT(CAST(b AS TIMESTAMP), 'yyyy')), a)", }, ) self.validate_all( "SELECT TRUNC(CAST(ds AS TIMESTAMP), 'MONTH') AS mm FROM tbl WHERE ds BETWEEN '2023-10-01' AND '2024-02-29'", read={ "hive": "SELECT TRUNC(CAST(ds AS TIMESTAMP), 'MONTH') AS mm FROM tbl WHERE ds BETWEEN '2023-10-01' AND '2024-02-29'", "presto": "SELECT DATE_TRUNC('MONTH', CAST(ds AS TIMESTAMP)) AS mm FROM tbl WHERE ds BETWEEN '2023-10-01' AND '2024-02-29'", }, write={ "presto": "SELECT DATE_TRUNC('MONTH', TRY_CAST(ds AS TIMESTAMP)) AS mm FROM tbl WHERE ds BETWEEN '2023-10-01' AND '2024-02-29'", }, ) self.validate_all( "REGEXP_EXTRACT('abc', '(a)(b)(c)')", read={ "hive": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "spark2": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "spark": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "databricks": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", }, write={ "hive": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "spark2": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "spark": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "databricks": "REGEXP_EXTRACT('abc', '(a)(b)(c)')", "presto": "REGEXP_EXTRACT('abc', '(a)(b)(c)', 1)", "trino": "REGEXP_EXTRACT('abc', '(a)(b)(c)', 1)", "duckdb": "REGEXP_EXTRACT('abc', '(a)(b)(c)', 1)", }, ) self.validate_identity("EXISTS(col, x -> x % 2 = 0)").assert_is(exp.Exists) self.validate_all( "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", read={ "hive": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", "spark2": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", "spark": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", "databricks": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", }, write={ "spark2": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", "spark": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", "databricks": "SELECT EXISTS(ARRAY(2, 3), x -> x % 2 = 0)", }, ) def test_escapes(self) -> None: self.validate_identity("'\n'", "'\\n'") self.validate_identity("'\\n'") self.validate_identity("'\\\n'", "'\\\\\\n'") self.validate_identity("'\\\\n'") self.validate_identity("''") self.validate_identity("'\\\\'") self.validate_identity("'\\\\z'") def test_data_type(self): self.validate_all( "CAST(a AS BIT)", write={ "hive": "CAST(a AS BOOLEAN)", }, )