sqlglot.dataframe.sql
# Public surface of the sqlglot.dataframe.sql package: re-export the
# PySpark-compatible classes so callers can import everything from one place.
from sqlglot.dataframe.sql.column import Column
from sqlglot.dataframe.sql.dataframe import DataFrame, DataFrameNaFunctions
from sqlglot.dataframe.sql.group import GroupedData
from sqlglot.dataframe.sql.readwriter import DataFrameReader, DataFrameWriter
from sqlglot.dataframe.sql.session import SparkSession
from sqlglot.dataframe.sql.window import Window, WindowSpec

# Names exported by `from sqlglot.dataframe.sql import *`.
__all__ = [
    "SparkSession",
    "DataFrame",
    "GroupedData",
    "Column",
    "DataFrameNaFunctions",
    "Window",
    "WindowSpec",
    "DataFrameReader",
    "DataFrameWriter",
]
class SparkSession:
    """SQL-generating stand-in for ``pyspark.sql.SparkSession``.

    Rather than running Spark jobs, this session builds sqlglot expression
    trees that are later rendered to SQL strings. Identifier bookkeeping
    (branch ids, sequence ids, alias mappings) lives on the class, so it is
    shared across every session instance.
    """

    # NOTE: class-level mutable registries — shared across ALL instances.
    known_ids: t.ClassVar[t.Set[str]] = set()
    known_branch_ids: t.ClassVar[t.Set[str]] = set()
    known_sequence_ids: t.ClassVar[t.Set[str]] = set()
    name_to_sequence_id_mapping: t.ClassVar[t.Dict[str, t.List[str]]] = defaultdict(list)

    def __init__(self):
        # Counter backing `_auto_incrementing_name` (per-instance, starts at 1).
        self.incrementing_id = 1

    def __getattr__(self, name: str) -> SparkSession:
        # Catch-all: unknown attribute access returns the session itself —
        # presumably so PySpark-style builder chains (`.builder.appName(...)`)
        # are harmless no-ops. NOTE(review): this also silences typos; confirm.
        return self

    def __call__(self, *args, **kwargs) -> SparkSession:
        # Same trick for calls, e.g. `spark.builder.getOrCreate()` -> session.
        return self

    @property
    def read(self) -> DataFrameReader:
        """Reader used to create DataFrames from tables."""
        return DataFrameReader(self)

    def table(self, tableName: str) -> DataFrame:
        """Return a DataFrame representing the given table."""
        return self.read.table(tableName)

    def createDataFrame(
        self,
        data: t.Sequence[t.Union[t.Dict[str, ColumnLiterals], t.List[ColumnLiterals], t.Tuple]],
        schema: t.Optional[SchemaInput] = None,
        samplingRatio: t.Optional[float] = None,
        verifySchema: bool = False,
    ) -> DataFrame:
        """Build a DataFrame over an inline VALUES expression.

        `data` is a sequence of rows (dicts, lists, or tuples). `schema` may
        be a StructType, a schema string, or a non-empty list of column-name
        strings. `samplingRatio`/`verifySchema` are accepted for PySpark
        signature compatibility but not supported.

        Raises:
            NotImplementedError: unsupported options or schema form.
            ValueError: when `data` is empty.
        """
        from sqlglot.dataframe.sql.dataframe import DataFrame

        if samplingRatio is not None or verifySchema:
            raise NotImplementedError("Sampling Ratio and Verify Schema are not supported")
        # Fix: an empty list schema previously crashed with IndexError on
        # `schema[0]`; it now raises the intended NotImplementedError.
        if schema is not None and (
            not isinstance(schema, (StructType, str, list))
            or (isinstance(schema, list) and (not schema or not isinstance(schema[0], str)))
        ):
            raise NotImplementedError("Only schema of either list or string of list supported")
        if not data:
            raise ValueError("Must provide data to create into a DataFrame")

        # Map column name -> optional type string. With no schema, dict rows
        # use the first row's keys; positional rows get _1, _2, ...
        column_mapping: t.Dict[str, t.Optional[str]]
        if schema is not None:
            column_mapping = get_column_mapping_from_schema_input(schema)
        elif isinstance(data[0], dict):
            column_mapping = {col_name.strip(): None for col_name in data[0]}
        else:
            column_mapping = {f"_{i}": None for i in range(1, len(data[0]) + 1)}

        # One tuple of literals per input row.
        data_expressions = [
            exp.Tuple(
                expressions=list(
                    map(
                        lambda x: F.lit(x).expression,
                        row if not isinstance(row, dict) else row.values(),
                    )
                )
            )
            for row in data
        ]

        # Cast each column to its declared type when the schema provided one.
        sel_columns = [
            F.col(name).cast(data_type).alias(name).expression
            if data_type is not None
            else F.col(name).expression
            for name, data_type in column_mapping.items()
        ]

        select_kwargs = {
            "expressions": sel_columns,
            "from": exp.From(
                expressions=[
                    exp.Values(
                        expressions=data_expressions,
                        alias=exp.TableAlias(
                            this=exp.to_identifier(self._auto_incrementing_name),
                            columns=[exp.to_identifier(col_name) for col_name in column_mapping],
                        ),
                    ),
                ],
            ),
        }

        sel_expression = exp.Select(**select_kwargs)
        return DataFrame(self, sel_expression)

    def sql(self, sqlQuery: str) -> DataFrame:
        """Parse a Spark-dialect SQL string into a DataFrame.

        SELECTs become plain DataFrames; CREATE/INSERT statements keep the
        DDL/DML container as the output expression and wrap their inner
        SELECT in the DataFrame.
        """
        expression = sqlglot.parse_one(sqlQuery, read="spark")
        if isinstance(expression, exp.Select):
            df = DataFrame(self, expression)
            df = df._convert_leaf_to_cte()
        elif isinstance(expression, (exp.Create, exp.Insert)):
            select_expression = expression.expression.copy()
            if isinstance(expression, exp.Insert):
                # Move the WITH clause onto the inner select before detaching it.
                select_expression.set("with", expression.args.get("with"))
                expression.set("with", None)
            del expression.args["expression"]
            df = DataFrame(self, select_expression, output_expression_container=expression)  # type: ignore
            df = df._convert_leaf_to_cte()
        else:
            raise ValueError(
                "Unknown expression type provided in the SQL. Please create an issue with the SQL."
            )
        return df

    @property
    def _auto_incrementing_name(self) -> str:
        """Return 'a1', 'a2', ... — deterministic aliases for VALUES tables."""
        name = f"a{self.incrementing_id}"
        self.incrementing_id += 1
        return name

    @property
    def _random_name(self) -> str:
        # 'r' prefix keeps the identifier starting with a letter.
        return "r" + uuid.uuid4().hex

    @property
    def _random_branch_id(self) -> str:
        # Renamed local (was `id`) to avoid shadowing the builtin.
        new_id = self._random_id
        self.known_branch_ids.add(new_id)
        return new_id

    @property
    def _random_sequence_id(self) -> str:
        new_id = self._random_id
        self.known_sequence_ids.add(new_id)
        return new_id

    @property
    def _random_id(self) -> str:
        new_id = self._random_name
        self.known_ids.add(new_id)
        return new_id

    @property
    def _join_hint_names(self) -> t.Set[str]:
        """Join hints that map to exp.JoinHint rather than exp.Anonymous."""
        return {"BROADCAST", "MERGE", "SHUFFLE_HASH", "SHUFFLE_REPLICATE_NL"}

    def _add_alias_to_mapping(self, name: str, sequence_id: str):
        """Record that `name` (from DataFrame.alias) refers to `sequence_id`."""
        self.name_to_sequence_id_mapping[name].append(sequence_id)
def createDataFrame(
    self,
    data: t.Sequence[t.Union[t.Dict[str, ColumnLiterals], t.List[ColumnLiterals], t.Tuple]],
    schema: t.Optional[SchemaInput] = None,
    samplingRatio: t.Optional[float] = None,
    verifySchema: bool = False,
) -> DataFrame:
    """Build a DataFrame over an inline VALUES expression.

    `data` is a sequence of rows (dicts, lists, or tuples). `schema` may be a
    StructType, a schema string, or a non-empty list of column-name strings.
    `samplingRatio`/`verifySchema` exist only for PySpark signature
    compatibility and are not supported.

    Raises:
        NotImplementedError: unsupported options or schema form.
        ValueError: when `data` is empty.
    """
    from sqlglot.dataframe.sql.dataframe import DataFrame

    if samplingRatio is not None or verifySchema:
        raise NotImplementedError("Sampling Ratio and Verify Schema are not supported")
    # Fix: `schema=[]` previously crashed with IndexError on `schema[0]`;
    # an empty list now raises the intended NotImplementedError instead.
    if schema is not None and (
        not isinstance(schema, (StructType, str, list))
        or (isinstance(schema, list) and (not schema or not isinstance(schema[0], str)))
    ):
        raise NotImplementedError("Only schema of either list or string of list supported")
    if not data:
        raise ValueError("Must provide data to create into a DataFrame")

    # Map column name -> optional type string. With no schema, dict rows use
    # the first row's keys; positional rows get names _1, _2, ...
    column_mapping: t.Dict[str, t.Optional[str]]
    if schema is not None:
        column_mapping = get_column_mapping_from_schema_input(schema)
    elif isinstance(data[0], dict):
        column_mapping = {col_name.strip(): None for col_name in data[0]}
    else:
        column_mapping = {f"_{i}": None for i in range(1, len(data[0]) + 1)}

    # One tuple of literal expressions per input row.
    data_expressions = [
        exp.Tuple(
            expressions=list(
                map(
                    lambda x: F.lit(x).expression,
                    row if not isinstance(row, dict) else row.values(),
                )
            )
        )
        for row in data
    ]

    # Cast each column to its declared type when the schema provided one.
    sel_columns = [
        F.col(name).cast(data_type).alias(name).expression
        if data_type is not None
        else F.col(name).expression
        for name, data_type in column_mapping.items()
    ]

    select_kwargs = {
        "expressions": sel_columns,
        "from": exp.From(
            expressions=[
                exp.Values(
                    expressions=data_expressions,
                    alias=exp.TableAlias(
                        this=exp.to_identifier(self._auto_incrementing_name),
                        columns=[exp.to_identifier(col_name) for col_name in column_mapping],
                    ),
                ),
            ],
        ),
    }

    sel_expression = exp.Select(**select_kwargs)
    return DataFrame(self, sel_expression)
def sql(self, sqlQuery: str) -> DataFrame:
    """Parse a Spark-dialect SQL string and wrap it in a DataFrame.

    A bare SELECT becomes the DataFrame's expression directly. For CREATE
    and INSERT statements, the inner SELECT is lifted out and the DDL/DML
    wrapper is kept as the output expression container.
    """
    parsed = sqlglot.parse_one(sqlQuery, read="spark")

    if isinstance(parsed, exp.Select):
        return DataFrame(self, parsed)._convert_leaf_to_cte()

    if isinstance(parsed, (exp.Create, exp.Insert)):
        inner_select = parsed.expression.copy()
        if isinstance(parsed, exp.Insert):
            # Move any WITH clause from the statement onto the inner select.
            inner_select.set("with", parsed.args.get("with"))
            parsed.set("with", None)
        del parsed.args["expression"]
        wrapped = DataFrame(self, inner_select, output_expression_container=parsed)  # type: ignore
        return wrapped._convert_leaf_to_cte()

    raise ValueError(
        "Unknown expression type provided in the SQL. Please create an issue with the SQL."
    )
class DataFrame:
    """PySpark-style DataFrame that builds a sqlglot ``exp.Select`` tree.

    Every transformation returns a new DataFrame wrapping an updated
    expression; nothing executes until ``sql()`` renders the expression(s)
    to SQL strings. ``branch_id``/``sequence_id`` are bookkeeping ids used
    to resolve join hints and ``alias()`` calls back to the CTE that
    produced them.
    """

    def __init__(
        self,
        spark: SparkSession,
        expression: exp.Select,
        branch_id: t.Optional[str] = None,
        sequence_id: t.Optional[str] = None,
        last_op: Operation = Operation.INIT,
        pending_hints: t.Optional[t.List[exp.Expression]] = None,
        output_expression_container: t.Optional[OutputExpressionContainer] = None,
        **kwargs,
    ):
        self.spark = spark
        self.expression = expression
        self.branch_id = branch_id or self.spark._random_branch_id
        self.sequence_id = sequence_id or self.spark._random_sequence_id
        self.last_op = last_op
        self.pending_hints = pending_hints or []
        self.output_expression_container = output_expression_container or exp.Select()

    def __getattr__(self, column_name: str) -> Column:
        # Unknown attributes are treated as column references (df.my_col).
        return self[column_name]

    def __getitem__(self, column_name: str) -> Column:
        # Qualify with this DataFrame's branch id so joins can later tell
        # which side a bare column reference came from.
        column_name = f"{self.branch_id}.{column_name}"
        return Column(column_name)

    def __copy__(self):
        return self.copy()

    @property
    def sparkSession(self):
        return self.spark

    @property
    def write(self):
        return DataFrameWriter(self)

    @property
    def latest_cte_name(self) -> str:
        """Name of the most recently added CTE (or the FROM alias if none).

        Raises:
            RuntimeError: when no alias can be found on the FROM clause.
        """
        if not self.expression.ctes:
            from_exp = self.expression.args["from"]
            if from_exp.alias_or_name:
                return from_exp.alias_or_name
            table_alias = from_exp.find(exp.TableAlias)
            if not table_alias:
                raise RuntimeError(
                    f"Could not find an alias name for this expression: {self.expression}"
                )
            return table_alias.alias_or_name
        return self.expression.ctes[-1].alias

    @property
    def pending_join_hints(self):
        return [hint for hint in self.pending_hints if isinstance(hint, exp.JoinHint)]

    @property
    def pending_partition_hints(self):
        return [hint for hint in self.pending_hints if isinstance(hint, exp.Anonymous)]

    @property
    def columns(self) -> t.List[str]:
        return self.expression.named_selects

    @property
    def na(self) -> DataFrameNaFunctions:
        return DataFrameNaFunctions(self)

    def _replace_cte_names_with_hashes(self, expression: exp.Select):
        """Rename every CTE to a hash of its SQL so names are deterministic."""
        replacement_mapping = {}
        for cte in expression.ctes:
            old_name_id = cte.args["alias"].this
            new_hashed_id = exp.to_identifier(
                self._create_hash_from_expression(cte.this), quoted=old_name_id.args["quoted"]
            )
            replacement_mapping[old_name_id] = new_hashed_id
        expression = expression.transform(replace_id_value, replacement_mapping)
        return expression

    def _create_cte_from_expression(
        self,
        expression: exp.Expression,
        branch_id: t.Optional[str] = None,
        sequence_id: t.Optional[str] = None,
        **kwargs,
    ) -> t.Tuple[exp.CTE, str]:
        """Wrap `expression` in a CTE with a fresh random name; return (cte, name)."""
        name = self.spark._random_name
        expression_to_cte = expression.copy()
        expression_to_cte.set("with", None)
        cte = exp.Select().with_(name, as_=expression_to_cte, **kwargs).ctes[0]
        # Tag the CTE so hint resolution can find it again later.
        cte.set("branch_id", branch_id or self.branch_id)
        cte.set("sequence_id", sequence_id or self.sequence_id)
        return cte, name

    @t.overload
    def _ensure_list_of_columns(self, cols: t.Collection[ColumnOrLiteral]) -> t.List[Column]:
        ...

    @t.overload
    def _ensure_list_of_columns(self, cols: ColumnOrLiteral) -> t.List[Column]:
        ...

    def _ensure_list_of_columns(self, cols):
        return Column.ensure_cols(ensure_list(cols))

    def _ensure_and_normalize_cols(self, cols):
        # Normalization mutates the Column expressions in place.
        cols = self._ensure_list_of_columns(cols)
        normalize(self.spark, self.expression, cols)
        return cols

    def _ensure_and_normalize_col(self, col):
        col = Column.ensure_col(col)
        normalize(self.spark, self.expression, col)
        return col

    def _convert_leaf_to_cte(self, sequence_id: t.Optional[str] = None) -> DataFrame:
        """Push the current select into a new trailing CTE and select from it."""
        df = self._resolve_pending_hints()
        sequence_id = sequence_id or df.sequence_id
        expression = df.expression.copy()
        cte_expression, cte_name = df._create_cte_from_expression(
            expression=expression, sequence_id=sequence_id
        )
        new_expression = df._add_ctes_to_expression(
            exp.Select(), expression.ctes + [cte_expression]
        )
        sel_columns = df._get_outer_select_columns(cte_expression)
        new_expression = new_expression.from_(cte_name).select(
            *[x.alias_or_name for x in sel_columns]
        )
        return df.copy(expression=new_expression, sequence_id=sequence_id)

    def _resolve_pending_hints(self) -> DataFrame:
        """Attach queued partition/join hints to the current expression.

        Join hints are matched to CTEs by sequence id (newest CTE first) and
        only applied when that CTE actually participates in a join.
        """
        df = self.copy()
        if not self.pending_hints:
            return df
        expression = df.expression
        hint_expression = expression.args.get("hint") or exp.Hint(expressions=[])
        for hint in df.pending_partition_hints:
            hint_expression.append("expressions", hint)
            df.pending_hints.remove(hint)

        join_aliases = {
            join_table.alias_or_name
            for join_table in get_tables_from_expression_with_join(expression)
        }
        if join_aliases:
            for hint in df.pending_join_hints:
                for sequence_id_expression in hint.expressions:
                    sequence_id_or_name = sequence_id_expression.alias_or_name
                    sequence_ids_to_match = [sequence_id_or_name]
                    if sequence_id_or_name in df.spark.name_to_sequence_id_mapping:
                        sequence_ids_to_match = df.spark.name_to_sequence_id_mapping[
                            sequence_id_or_name
                        ]
                    # Newest CTEs first so the most recent branch wins.
                    matching_ctes = [
                        cte
                        for cte in reversed(expression.ctes)
                        if cte.args["sequence_id"] in sequence_ids_to_match
                    ]
                    for matching_cte in matching_ctes:
                        if matching_cte.alias_or_name in join_aliases:
                            sequence_id_expression.set("this", matching_cte.args["alias"].this)
                            df.pending_hints.remove(hint)
                            break
                hint_expression.append("expressions", hint)
        if hint_expression.expressions:
            expression.set("hint", hint_expression)
        return df

    def _hint(self, hint_name: str, args: t.List[Column]) -> DataFrame:
        """Queue a hint; known join hints become JoinHint, others Anonymous."""
        hint_name = hint_name.upper()
        hint_expression = (
            exp.JoinHint(
                this=hint_name,
                expressions=[exp.to_table(parameter.alias_or_name) for parameter in args],
            )
            if hint_name in JOIN_HINTS
            else exp.Anonymous(
                this=hint_name, expressions=[parameter.expression for parameter in args]
            )
        )
        new_df = self.copy()
        new_df.pending_hints.append(hint_expression)
        return new_df

    def _set_operation(self, klass: t.Callable, other: DataFrame, distinct: bool):
        """Combine self and `other` with a set operation (UNION/INTERSECT/EXCEPT)."""
        other_df = other._convert_leaf_to_cte()
        base_expression = self.expression.copy()
        base_expression = self._add_ctes_to_expression(base_expression, other_df.expression.ctes)
        all_ctes = base_expression.ctes
        # Hoist both sides' CTEs to the top-level WITH of the set operation.
        other_df.expression.set("with", None)
        base_expression.set("with", None)
        operation = klass(this=base_expression, distinct=distinct, expression=other_df.expression)
        operation.set("with", exp.With(expressions=all_ctes))
        return self.copy(expression=operation)._convert_leaf_to_cte()

    def _cache(self, storage_level: str):
        # Mark the latest CTE so sql() emits a CACHE TABLE statement for it.
        df = self._convert_leaf_to_cte()
        df.expression.ctes[-1].set("cache_storage_level", storage_level)
        return df

    @classmethod
    def _add_ctes_to_expression(cls, expression: exp.Select, ctes: t.List[exp.CTE]) -> exp.Select:
        """Return a copy of `expression` with `ctes` merged in (no duplicates by name)."""
        expression = expression.copy()
        with_expression = expression.args.get("with")
        if with_expression:
            existing_ctes = with_expression.expressions
            # Renamed local (was misspelled `existsing_cte_names`).
            existing_cte_names = {x.alias_or_name for x in existing_ctes}
            for cte in ctes:
                if cte.alias_or_name not in existing_cte_names:
                    existing_ctes.append(cte)
        else:
            existing_ctes = ctes
        expression.set("with", exp.With(expressions=existing_ctes))
        return expression

    @classmethod
    def _get_outer_select_columns(cls, item: t.Union[exp.Expression, DataFrame]) -> t.List[Column]:
        """Columns of the outermost SELECT of an expression or DataFrame."""
        expression = item.expression if isinstance(item, DataFrame) else item
        return [Column(x) for x in expression.find(exp.Select).expressions]

    @classmethod
    def _create_hash_from_expression(cls, expression: exp.Select) -> str:
        """Short deterministic name ('t' + crc32 prefix) for an expression."""
        value = expression.sql(dialect="spark").encode("utf-8")
        return f"t{zlib.crc32(value)}"[:6]

    def _get_select_expressions(
        self,
    ) -> t.List[t.Tuple[t.Union[t.Type[exp.Cache], OutputExpressionContainer], exp.Select]]:
        """Split cached CTEs out into standalone (exp.Cache, select) pairs.

        The final pair is the main select tagged with the output container
        type (Select/Create/Insert).
        """
        select_expressions: t.List[
            t.Tuple[t.Union[t.Type[exp.Cache], OutputExpressionContainer], exp.Select]
        ] = []
        main_select_ctes: t.List[exp.CTE] = []
        for cte in self.expression.ctes:
            cache_storage_level = cte.args.get("cache_storage_level")
            if cache_storage_level:
                select_expression = cte.this.copy()
                select_expression.set("with", exp.With(expressions=copy(main_select_ctes)))
                select_expression.set("cte_alias_name", cte.alias_or_name)
                select_expression.set("cache_storage_level", cache_storage_level)
                select_expressions.append((exp.Cache, select_expression))
            else:
                main_select_ctes.append(cte)
        main_select = self.expression.copy()
        if main_select_ctes:
            main_select.set("with", exp.With(expressions=main_select_ctes))
        expression_select_pair = (type(self.output_expression_container), main_select)
        select_expressions.append(expression_select_pair)  # type: ignore
        return select_expressions

    def sql(self, dialect="spark", optimize=True, **kwargs) -> t.List[str]:
        """Render this DataFrame to one or more SQL strings.

        Cached sub-selects produce DROP VIEW + CACHE TABLE statements ahead
        of the main statement; CREATE/INSERT containers are re-wrapped
        around the final select.
        """
        df = self._resolve_pending_hints()
        select_expressions = df._get_select_expressions()
        output_expressions: t.List[t.Union[exp.Select, exp.Cache, exp.Drop]] = []
        replacement_mapping: t.Dict[exp.Identifier, exp.Identifier] = {}
        for expression_type, select_expression in select_expressions:
            # Rewrite references to earlier cache tables before optimizing.
            select_expression = select_expression.transform(replace_id_value, replacement_mapping)
            if optimize:
                select_expression = optimize_func(select_expression)
            select_expression = df._replace_cte_names_with_hashes(select_expression)
            expression: t.Union[exp.Select, exp.Cache, exp.Drop]
            if expression_type == exp.Cache:
                cache_table_name = df._create_hash_from_expression(select_expression)
                cache_table = exp.to_table(cache_table_name)
                original_alias_name = select_expression.args["cte_alias_name"]

                replacement_mapping[exp.to_identifier(original_alias_name)] = exp.to_identifier(  # type: ignore
                    cache_table_name
                )
                # Register the cache table's schema so later optimization passes
                # can resolve its columns.
                sqlglot.schema.add_table(
                    cache_table_name,
                    {
                        sel.alias_or_name: sel.type.sql("spark")
                        for sel in select_expression.expressions
                    },
                )
                cache_storage_level = select_expression.args["cache_storage_level"]
                options = [
                    exp.Literal.string("storageLevel"),
                    exp.Literal.string(cache_storage_level),
                ]
                expression = exp.Cache(
                    this=cache_table, expression=select_expression, lazy=True, options=options
                )
                # We will drop the "view" if it exists before running the cache table
                output_expressions.append(exp.Drop(this=cache_table, exists=True, kind="VIEW"))
            elif expression_type == exp.Create:
                expression = df.output_expression_container.copy()
                expression.set("expression", select_expression)
            elif expression_type == exp.Insert:
                expression = df.output_expression_container.copy()
                select_without_ctes = select_expression.copy()
                select_without_ctes.set("with", None)
                expression.set("expression", select_without_ctes)
                if select_expression.ctes:
                    expression.set("with", exp.With(expressions=select_expression.ctes))
            elif expression_type == exp.Select:
                expression = select_expression
            else:
                raise ValueError(f"Invalid expression type: {expression_type}")
            output_expressions.append(expression)

        return [
            expression.sql(**{"dialect": dialect, **kwargs}) for expression in output_expressions
        ]

    def copy(self, **kwargs) -> DataFrame:
        return DataFrame(**object_to_dict(self, **kwargs))

    @operation(Operation.SELECT)
    def select(self, *cols, **kwargs) -> DataFrame:
        """Project columns, qualifying ambiguous ones when a join is present."""
        cols = self._ensure_and_normalize_cols(cols)
        kwargs["append"] = kwargs.get("append", False)
        if self.expression.args.get("joins"):
            ambiguous_cols = [col for col in cols if not col.column_expression.table]
            if ambiguous_cols:
                join_table_identifiers = [
                    x.this for x in get_tables_from_expression_with_join(self.expression)
                ]
                cte_names_in_join = [x.this for x in join_table_identifiers]
                for ambiguous_col in ambiguous_cols:
                    ctes_with_column = [
                        cte
                        for cte in self.expression.ctes
                        if cte.alias_or_name in cte_names_in_join
                        and ambiguous_col.alias_or_name in cte.this.named_selects
                    ]
                    # If the select column does not specify a table and there is a join
                    # then we assume they are referring to the left table
                    if len(ctes_with_column) > 1:
                        table_identifier = self.expression.args["from"].args["expressions"][0].this
                    else:
                        table_identifier = ctes_with_column[0].args["alias"].this
                    ambiguous_col.expression.set("table", table_identifier)
        return self.copy(
            expression=self.expression.select(*[x.expression for x in cols], **kwargs), **kwargs
        )

    @operation(Operation.NO_OP)
    def alias(self, name: str, **kwargs) -> DataFrame:
        """Register `name` as an alias for this DataFrame's lineage."""
        new_sequence_id = self.spark._random_sequence_id
        df = self.copy()
        for join_hint in df.pending_join_hints:
            for expression in join_hint.expressions:
                if expression.alias_or_name == self.sequence_id:
                    expression.set("this", Column.ensure_col(new_sequence_id).expression)
        df.spark._add_alias_to_mapping(name, new_sequence_id)
        return df._convert_leaf_to_cte(sequence_id=new_sequence_id)

    @operation(Operation.WHERE)
    def where(self, column: t.Union[Column, bool], **kwargs) -> DataFrame:
        col = self._ensure_and_normalize_col(column)
        return self.copy(expression=self.expression.where(col.expression))

    filter = where

    @operation(Operation.GROUP_BY)
    def groupBy(self, *cols, **kwargs) -> GroupedData:
        columns = self._ensure_and_normalize_cols(cols)
        return GroupedData(self, columns, self.last_op)

    @operation(Operation.SELECT)
    def agg(self, *exprs, **kwargs) -> DataFrame:
        cols = self._ensure_and_normalize_cols(exprs)
        return self.groupBy().agg(*cols)

    @operation(Operation.FROM)
    def join(
        self,
        other_df: DataFrame,
        on: t.Union[str, t.List[str], Column, t.List[Column]],
        how: str = "inner",
        **kwargs,
    ) -> DataFrame:
        """Join with `other_df` on columns or a boolean Column expression."""
        other_df = other_df._convert_leaf_to_cte()
        pre_join_self_latest_cte_name = self.latest_cte_name
        columns = self._ensure_and_normalize_cols(on)
        join_type = how.replace("_", " ")
        if isinstance(columns[0].expression, exp.Column):
            # `on` is plain column name(s): equi-join each from left to right.
            join_columns = [
                Column(x).set_table_name(pre_join_self_latest_cte_name) for x in columns
            ]
            join_clause = functools.reduce(
                lambda x, y: x & y,
                [
                    col.copy().set_table_name(pre_join_self_latest_cte_name)
                    == col.copy().set_table_name(other_df.latest_cte_name)
                    for col in columns
                ],
            )
        else:
            # `on` is boolean expression(s): AND them together and alternate
            # the referenced columns between the left and right tables.
            if len(columns) > 1:
                columns = [functools.reduce(lambda x, y: x & y, columns)]
            join_clause = columns[0]
            join_columns = [
                Column(x).set_table_name(pre_join_self_latest_cte_name)
                if i % 2 == 0
                else Column(x).set_table_name(other_df.latest_cte_name)
                for i, x in enumerate(join_clause.expression.find_all(exp.Column))
            ]
        self_columns = [
            column.set_table_name(pre_join_self_latest_cte_name, copy=True)
            for column in self._get_outer_select_columns(self)
        ]
        other_columns = [
            column.set_table_name(other_df.latest_cte_name, copy=True)
            for column in self._get_outer_select_columns(other_df)
        ]
        # De-duplicate by name; join columns take priority, then left, then right.
        column_value_mapping = {
            column.alias_or_name
            if not isinstance(column.expression.this, exp.Star)
            else column.sql(): column
            for column in other_columns + self_columns + join_columns
        }
        all_columns = [
            column_value_mapping[name]
            for name in {x.alias_or_name: None for x in join_columns + self_columns + other_columns}
        ]
        new_df = self.copy(
            expression=self.expression.join(
                other_df.latest_cte_name, on=join_clause.expression, join_type=join_type
            )
        )
        new_df.expression = new_df._add_ctes_to_expression(
            new_df.expression, other_df.expression.ctes
        )
        new_df.pending_hints.extend(other_df.pending_hints)
        # Bypass the @operation wrapper to avoid double bookkeeping.
        new_df = new_df.select.__wrapped__(new_df, *all_columns)
        return new_df

    @operation(Operation.ORDER_BY)
    def orderBy(
        self,
        *cols: t.Union[str, Column],
        ascending: t.Optional[t.Union[t.Any, t.List[t.Any]]] = None,
    ) -> DataFrame:
        """
        This implementation lets any ordered columns take priority over whatever is provided in `ascending`. Spark
        has irregular behavior and can result in runtime errors. Users shouldn't be mixing the two anyways so this
        is unlikely to come up.
        """
        columns = self._ensure_and_normalize_cols(cols)
        pre_ordered_col_indexes = [
            x
            for x in [
                i if isinstance(col.expression, exp.Ordered) else None
                for i, col in enumerate(columns)
            ]
            if x is not None
        ]
        if ascending is None:
            ascending = [True] * len(columns)
        elif not isinstance(ascending, list):
            ascending = [ascending] * len(columns)
        # Coerce each flag to bool (previously used a pointless enumerate).
        ascending = [bool(x) for x in ascending]
        assert len(columns) == len(
            ascending
        ), "The length of items in ascending must equal the number of columns provided"
        col_and_ascending = list(zip(columns, ascending))
        order_by_columns = [
            exp.Ordered(this=col.expression, desc=not asc)
            if i not in pre_ordered_col_indexes
            else columns[i].column_expression
            for i, (col, asc) in enumerate(col_and_ascending)
        ]
        return self.copy(expression=self.expression.order_by(*order_by_columns))

    sort = orderBy

    @operation(Operation.FROM)
    def union(self, other: DataFrame) -> DataFrame:
        return self._set_operation(exp.Union, other, False)

    unionAll = union

    @operation(Operation.FROM)
    def unionByName(self, other: DataFrame, allowMissingColumns: bool = False) -> DataFrame:
        """Union by column name; optionally null-fill columns missing on a side."""
        l_columns = self.columns
        r_columns = other.columns
        if not allowMissingColumns:
            l_expressions = l_columns
            r_expressions = l_columns
        else:
            l_expressions = []
            r_expressions = []
            r_columns_unused = copy(r_columns)
            for l_column in l_columns:
                l_expressions.append(l_column)
                if l_column in r_columns:
                    r_expressions.append(l_column)
                    r_columns_unused.remove(l_column)
                else:
                    r_expressions.append(exp.alias_(exp.Null(), l_column))
            for r_column in r_columns_unused:
                l_expressions.append(exp.alias_(exp.Null(), r_column))
                r_expressions.append(r_column)
        r_df = (
            other.copy()._convert_leaf_to_cte().select(*self._ensure_list_of_columns(r_expressions))
        )
        l_df = self.copy()
        if allowMissingColumns:
            l_df = l_df._convert_leaf_to_cte().select(*self._ensure_list_of_columns(l_expressions))
        return l_df._set_operation(exp.Union, r_df, False)

    @operation(Operation.FROM)
    def intersect(self, other: DataFrame) -> DataFrame:
        return self._set_operation(exp.Intersect, other, True)

    @operation(Operation.FROM)
    def intersectAll(self, other: DataFrame) -> DataFrame:
        return self._set_operation(exp.Intersect, other, False)

    @operation(Operation.FROM)
    def exceptAll(self, other: DataFrame) -> DataFrame:
        return self._set_operation(exp.Except, other, False)

    @operation(Operation.SELECT)
    def distinct(self) -> DataFrame:
        return self.copy(expression=self.expression.distinct())

    @operation(Operation.SELECT)
    def dropDuplicates(self, subset: t.Optional[t.List[str]] = None) -> DataFrame:
        """Drop duplicate rows, optionally considering only `subset` columns."""
        if not subset:
            return self.distinct()
        # Keep row_number() == 1 per partition of the subset columns.
        column_names = ensure_list(subset)
        window = Window.partitionBy(*column_names).orderBy(*column_names)
        return (
            self.copy()
            .withColumn("row_num", F.row_number().over(window))
            .where(F.col("row_num") == F.lit(1))
            .drop("row_num")
        )

    @operation(Operation.FROM)
    def dropna(
        self,
        how: str = "any",
        thresh: t.Optional[int] = None,
        subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None,
    ) -> DataFrame:
        """Drop rows with nulls, mirroring PySpark's how/thresh/subset semantics."""
        minimum_non_null = thresh or 0  # will be determined later if thresh is null
        new_df = self.copy()
        all_columns = self._get_outer_select_columns(new_df.expression)
        if subset:
            null_check_columns = self._ensure_and_normalize_cols(subset)
        else:
            null_check_columns = all_columns
        if thresh is None:
            minimum_num_nulls = 1 if how == "any" else len(null_check_columns)
        else:
            minimum_num_nulls = len(null_check_columns) - minimum_non_null + 1
            if minimum_num_nulls > len(null_check_columns):
                raise RuntimeError(
                    f"The minimum num nulls for dropna must be less than or equal to the number of columns. "
                    f"Minimum num nulls: {minimum_num_nulls}, Num Columns: {len(null_check_columns)}"
                )
        # Count nulls per row and keep rows below the threshold.
        if_null_checks = [
            F.when(column.isNull(), F.lit(1)).otherwise(F.lit(0)) for column in null_check_columns
        ]
        nulls_added_together = functools.reduce(lambda x, y: x + y, if_null_checks)
        num_nulls = nulls_added_together.alias("num_nulls")
        new_df = new_df.select(num_nulls, append=True)
        filtered_df = new_df.where(F.col("num_nulls") < F.lit(minimum_num_nulls))
        final_df = filtered_df.select(*all_columns)
        return final_df

    @operation(Operation.FROM)
    def fillna(
        self,
        value: t.Union[ColumnLiterals],
        subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None,
    ) -> DataFrame:
        """
        Functionality Difference: If you provide a value to replace a null and that type conflicts
        with the type of the column then PySpark will just ignore your replacement.
        This will try to cast them to be the same in some cases. So they won't always match.
        Best to not mix types so make sure replacement is the same type as the column

        Possibility for improvement: Use `typeof` function to get the type of the column
        and check if it matches the type of the value provided. If not then make it null.
        """
        from sqlglot.dataframe.sql.functions import lit

        values = None
        columns = None
        new_df = self.copy()
        all_columns = self._get_outer_select_columns(new_df.expression)
        all_column_mapping = {column.alias_or_name: column for column in all_columns}
        if isinstance(value, dict):
            # dict form: per-column replacement values; keys pick the columns.
            values = list(value.values())
            columns = self._ensure_and_normalize_cols(list(value))
        if not columns:
            columns = self._ensure_and_normalize_cols(subset) if subset else all_columns
        if not values:
            values = [value] * len(columns)
        value_columns = [lit(value) for value in values]

        null_replacement_mapping = {
            column.alias_or_name: (
                F.when(column.isNull(), value).otherwise(column).alias(column.alias_or_name)
            )
            for column, value in zip(columns, value_columns)
        }
        null_replacement_mapping = {**all_column_mapping, **null_replacement_mapping}
        null_replacement_columns = [
            null_replacement_mapping[column.alias_or_name] for column in all_columns
        ]
        new_df = new_df.select(*null_replacement_columns)
        return new_df

    @operation(Operation.FROM)
    def replace(
        self,
        to_replace: t.Union[bool, int, float, str, t.List, t.Dict],
        value: t.Optional[t.Union[bool, int, float, str, t.List]] = None,
        subset: t.Optional[t.Collection[ColumnOrName] | ColumnOrName] = None,
    ) -> DataFrame:
        """Replace `to_replace` values with `value` across `subset` columns."""
        from sqlglot.dataframe.sql.functions import lit

        old_values = None
        new_df = self.copy()
        all_columns = self._get_outer_select_columns(new_df.expression)
        all_column_mapping = {column.alias_or_name: column for column in all_columns}

        columns = self._ensure_and_normalize_cols(subset) if subset else all_columns
        if isinstance(to_replace, dict):
            old_values = list(to_replace)
            new_values = list(to_replace.values())
        # `old_values` is always None here, so the previous `not old_values and`
        # guard was redundant and has been removed.
        elif isinstance(to_replace, list):
            assert isinstance(value, list), "value must be a list since the replacements are a list"
            assert len(to_replace) == len(
                value
            ), "the replacements and values must be the same length"
            old_values = to_replace
            new_values = value
        else:
            old_values = [to_replace] * len(columns)
            new_values = [value] * len(columns)
        old_values = [lit(value) for value in old_values]
        new_values = [lit(value) for value in new_values]

        # Build CASE WHEN chains: one per column, one WHEN per replacement pair.
        replacement_mapping = {}
        for column in columns:
            expression = Column(None)
            for i, (old_value, new_value) in enumerate(zip(old_values, new_values)):
                if i == 0:
                    expression = F.when(column == old_value, new_value)
                else:
                    expression = expression.when(column == old_value, new_value)  # type: ignore
            replacement_mapping[column.alias_or_name] = expression.otherwise(column).alias(
                column.expression.alias_or_name
            )

        replacement_mapping = {**all_column_mapping, **replacement_mapping}
        replacement_columns = [replacement_mapping[column.alias_or_name] for column in all_columns]
        new_df = new_df.select(*replacement_columns)
        return new_df

    @operation(Operation.SELECT)
    def withColumn(self, colName: str, col: Column) -> DataFrame:
        """Add a column, or replace it in place if `colName` already exists."""
        col = self._ensure_and_normalize_col(col)
        existing_col_names = self.expression.named_selects
        existing_col_index = (
            existing_col_names.index(colName) if colName in existing_col_names else None
        )
        # BUG FIX: compare against None — the previous truthiness test treated a
        # match at index 0 as "not present" and appended a duplicate column.
        if existing_col_index is not None:
            expression = self.expression.copy()
            expression.expressions[existing_col_index] = col.expression
            return self.copy(expression=expression)
        return self.copy().select(col.alias(colName), append=True)

    @operation(Operation.SELECT)
    def withColumnRenamed(self, existing: str, new: str) -> DataFrame:
        """Rename column `existing` to `new`.

        Raises:
            ValueError: when no select expression matches `existing`.
        """
        expression = self.expression.copy()
        existing_columns = [
            expression
            for expression in expression.expressions
            if expression.alias_or_name == existing
        ]
        if not existing_columns:
            raise ValueError("Tried to rename a column that doesn't exist")
        for existing_column in existing_columns:
            if isinstance(existing_column, exp.Column):
                existing_column.replace(exp.alias_(existing_column.copy(), new))
            else:
                existing_column.set("alias", exp.to_identifier(new))
        return self.copy(expression=expression)

    @operation(Operation.SELECT)
    def drop(self, *cols: t.Union[str, Column]) -> DataFrame:
        """Return a DataFrame without the given columns."""
        all_columns = self._get_outer_select_columns(self.expression)
        drop_cols = self._ensure_and_normalize_cols(cols)
        new_columns = [
            col
            for col in all_columns
            if col.alias_or_name not in [drop_column.alias_or_name for drop_column in drop_cols]
        ]
        return self.copy().select(*new_columns, append=False)

    @operation(Operation.LIMIT)
    def limit(self, num: int) -> DataFrame:
        return self.copy(expression=self.expression.limit(num))

    @operation(Operation.NO_OP)
    def hint(self, name: str, *parameters: t.Optional[t.Union[str, int]]) -> DataFrame:
        """Queue a query hint; without parameters it targets this DataFrame."""
        parameter_list = ensure_list(parameters)
        parameter_columns = (
            self._ensure_list_of_columns(parameter_list)
            if parameters
            else Column.ensure_cols([self.sequence_id])
        )
        return self._hint(name, parameter_columns)

    @operation(Operation.NO_OP)
    def repartition(
        self, numPartitions: t.Union[int, ColumnOrName], *cols: ColumnOrName
    ) -> DataFrame:
        num_partition_cols = self._ensure_list_of_columns(numPartitions)
        columns = self._ensure_and_normalize_cols(cols)
        args = num_partition_cols + columns
        return self._hint("repartition", args)

    @operation(Operation.NO_OP)
    def coalesce(self, numPartitions: int) -> DataFrame:
        num_partitions = Column.ensure_cols([numPartitions])
        return self._hint("coalesce", num_partitions)

    @operation(Operation.NO_OP)
    def cache(self) -> DataFrame:
        return self._cache(storage_level="MEMORY_AND_DISK")

    @operation(Operation.NO_OP)
    def persist(self, storageLevel: str = "MEMORY_AND_DISK_SER") -> DataFrame:
        """
        Storage Level Options: https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-aux-cache-cache-table.html
        """
        return self._cache(storageLevel)
46 def __init__( 47 self, 48 spark: SparkSession, 49 expression: exp.Select, 50 branch_id: t.Optional[str] = None, 51 sequence_id: t.Optional[str] = None, 52 last_op: Operation = Operation.INIT, 53 pending_hints: t.Optional[t.List[exp.Expression]] = None, 54 output_expression_container: t.Optional[OutputExpressionContainer] = None, 55 **kwargs, 56 ): 57 self.spark = spark 58 self.expression = expression 59 self.branch_id = branch_id or self.spark._random_branch_id 60 self.sequence_id = sequence_id or self.spark._random_sequence_id 61 self.last_op = last_op 62 self.pending_hints = pending_hints or [] 63 self.output_expression_container = output_expression_container or exp.Select()
    def sql(self, dialect="spark", optimize=True, **kwargs) -> t.List[str]:
        """
        Render this DataFrame's lineage as a list of SQL statements.

        Cached selects expand into a DROP VIEW + CACHE TABLE pair; Create/Insert output
        containers wrap the select; plain selects render directly.
        """
        df = self._resolve_pending_hints()
        select_expressions = df._get_select_expressions()
        output_expressions: t.List[t.Union[exp.Select, exp.Cache, exp.Drop]] = []
        # Maps original CTE alias identifiers to their cache-table replacements so
        # later statements reference the cached table instead.
        replacement_mapping: t.Dict[exp.Identifier, exp.Identifier] = {}
        for expression_type, select_expression in select_expressions:
            select_expression = select_expression.transform(replace_id_value, replacement_mapping)
            if optimize:
                select_expression = optimize_func(select_expression)
            select_expression = df._replace_cte_names_with_hashes(select_expression)
            expression: t.Union[exp.Select, exp.Cache, exp.Drop]
            if expression_type == exp.Cache:
                # Cache table name is a content hash so identical plans share a table.
                cache_table_name = df._create_hash_from_expression(select_expression)
                cache_table = exp.to_table(cache_table_name)
                original_alias_name = select_expression.args["cte_alias_name"]

                replacement_mapping[exp.to_identifier(original_alias_name)] = exp.to_identifier(  # type: ignore
                    cache_table_name
                )
                # Register the cached table's schema globally so later statements resolve types.
                sqlglot.schema.add_table(
                    cache_table_name,
                    {
                        expression.alias_or_name: expression.type.sql("spark")
                        for expression in select_expression.expressions
                    },
                )
                cache_storage_level = select_expression.args["cache_storage_level"]
                options = [
                    exp.Literal.string("storageLevel"),
                    exp.Literal.string(cache_storage_level),
                ]
                expression = exp.Cache(
                    this=cache_table, expression=select_expression, lazy=True, options=options
                )
                # We will drop the "view" if it exists before running the cache table
                output_expressions.append(exp.Drop(this=cache_table, exists=True, kind="VIEW"))
            elif expression_type == exp.Create:
                expression = df.output_expression_container.copy()
                expression.set("expression", select_expression)
            elif expression_type == exp.Insert:
                expression = df.output_expression_container.copy()
                # Hoist CTEs above the INSERT: they belong to the statement, not the select.
                select_without_ctes = select_expression.copy()
                select_without_ctes.set("with", None)
                expression.set("expression", select_without_ctes)
                if select_expression.ctes:
                    expression.set("with", exp.With(expressions=select_expression.ctes))
            elif expression_type == exp.Select:
                expression = select_expression
            else:
                raise ValueError(f"Invalid expression type: {expression_type}")
            output_expressions.append(expression)

        return [
            expression.sql(**{"dialect": dialect, **kwargs}) for expression in output_expressions
        ]
354 @operation(Operation.SELECT) 355 def select(self, *cols, **kwargs) -> DataFrame: 356 cols = self._ensure_and_normalize_cols(cols) 357 kwargs["append"] = kwargs.get("append", False) 358 if self.expression.args.get("joins"): 359 ambiguous_cols = [col for col in cols if not col.column_expression.table] 360 if ambiguous_cols: 361 join_table_identifiers = [ 362 x.this for x in get_tables_from_expression_with_join(self.expression) 363 ] 364 cte_names_in_join = [x.this for x in join_table_identifiers] 365 for ambiguous_col in ambiguous_cols: 366 ctes_with_column = [ 367 cte 368 for cte in self.expression.ctes 369 if cte.alias_or_name in cte_names_in_join 370 and ambiguous_col.alias_or_name in cte.this.named_selects 371 ] 372 # If the select column does not specify a table and there is a join 373 # then we assume they are referring to the left table 374 if len(ctes_with_column) > 1: 375 table_identifier = self.expression.args["from"].args["expressions"][0].this 376 else: 377 table_identifier = ctes_with_column[0].args["alias"].this 378 ambiguous_col.expression.set("table", table_identifier) 379 return self.copy( 380 expression=self.expression.select(*[x.expression for x in cols], **kwargs), **kwargs 381 )
383 @operation(Operation.NO_OP) 384 def alias(self, name: str, **kwargs) -> DataFrame: 385 new_sequence_id = self.spark._random_sequence_id 386 df = self.copy() 387 for join_hint in df.pending_join_hints: 388 for expression in join_hint.expressions: 389 if expression.alias_or_name == self.sequence_id: 390 expression.set("this", Column.ensure_col(new_sequence_id).expression) 391 df.spark._add_alias_to_mapping(name, new_sequence_id) 392 return df._convert_leaf_to_cte(sequence_id=new_sequence_id)
    @operation(Operation.FROM)
    def join(
        self,
        other_df: DataFrame,
        on: t.Union[str, t.List[str], Column, t.List[Column]],
        how: str = "inner",
        **kwargs,
    ) -> DataFrame:
        """
        Join this DataFrame with `other_df`.

        `on` may be column name(s) (equi-join on the shared names) or Column
        condition(s). `how` follows PySpark's join-type names; underscores become
        spaces for SQL (e.g. "left_outer" -> "left outer").
        """
        other_df = other_df._convert_leaf_to_cte()
        pre_join_self_latest_cte_name = self.latest_cte_name
        columns = self._ensure_and_normalize_cols(on)
        join_type = how.replace("_", " ")
        if isinstance(columns[0].expression, exp.Column):
            # Name-based join: equate each named column across both CTEs.
            join_columns = [
                Column(x).set_table_name(pre_join_self_latest_cte_name) for x in columns
            ]
            join_clause = functools.reduce(
                lambda x, y: x & y,
                [
                    col.copy().set_table_name(pre_join_self_latest_cte_name)
                    == col.copy().set_table_name(other_df.latest_cte_name)
                    for col in columns
                ],
            )
        else:
            # Condition-based join: AND the conditions together; qualify the referenced
            # columns alternating left/right tables by position.
            if len(columns) > 1:
                columns = [functools.reduce(lambda x, y: x & y, columns)]
            join_clause = columns[0]
            join_columns = [
                Column(x).set_table_name(pre_join_self_latest_cte_name)
                if i % 2 == 0
                else Column(x).set_table_name(other_df.latest_cte_name)
                for i, x in enumerate(join_clause.expression.find_all(exp.Column))
            ]
        self_columns = [
            column.set_table_name(pre_join_self_latest_cte_name, copy=True)
            for column in self._get_outer_select_columns(self)
        ]
        other_columns = [
            column.set_table_name(other_df.latest_cte_name, copy=True)
            for column in self._get_outer_select_columns(other_df)
        ]
        # Later entries win: self's columns shadow other's; join columns shadow both.
        column_value_mapping = {
            column.alias_or_name
            if not isinstance(column.expression.this, exp.Star)
            else column.sql(): column
            for column in other_columns + self_columns + join_columns
        }
        # Dict keys keep first-seen order: join columns, then self, then other.
        all_columns = [
            column_value_mapping[name]
            for name in {x.alias_or_name: None for x in join_columns + self_columns + other_columns}
        ]
        new_df = self.copy(
            expression=self.expression.join(
                other_df.latest_cte_name, on=join_clause.expression, join_type=join_type
            )
        )
        new_df.expression = new_df._add_ctes_to_expression(
            new_df.expression, other_df.expression.ctes
        )
        new_df.pending_hints.extend(other_df.pending_hints)
        # Bypass the @operation wrapper so this select doesn't register as a user op.
        new_df = new_df.select.__wrapped__(new_df, *all_columns)
        return new_df
475 @operation(Operation.ORDER_BY) 476 def orderBy( 477 self, 478 *cols: t.Union[str, Column], 479 ascending: t.Optional[t.Union[t.Any, t.List[t.Any]]] = None, 480 ) -> DataFrame: 481 """ 482 This implementation lets any ordered columns take priority over whatever is provided in `ascending`. Spark 483 has irregular behavior and can result in runtime errors. Users shouldn't be mixing the two anyways so this 484 is unlikely to come up. 485 """ 486 columns = self._ensure_and_normalize_cols(cols) 487 pre_ordered_col_indexes = [ 488 x 489 for x in [ 490 i if isinstance(col.expression, exp.Ordered) else None 491 for i, col in enumerate(columns) 492 ] 493 if x is not None 494 ] 495 if ascending is None: 496 ascending = [True] * len(columns) 497 elif not isinstance(ascending, list): 498 ascending = [ascending] * len(columns) 499 ascending = [bool(x) for i, x in enumerate(ascending)] 500 assert len(columns) == len( 501 ascending 502 ), "The length of items in ascending must equal the number of columns provided" 503 col_and_ascending = list(zip(columns, ascending)) 504 order_by_columns = [ 505 exp.Ordered(this=col.expression, desc=not asc) 506 if i not in pre_ordered_col_indexes 507 else columns[i].column_expression 508 for i, (col, asc) in enumerate(col_and_ascending) 509 ] 510 return self.copy(expression=self.expression.order_by(*order_by_columns))
This implementation lets any ordered columns take priority over whatever is provided in `ascending`. Spark has irregular behavior and can result in runtime errors. Users shouldn't be mixing the two anyway, so this is unlikely to come up.
475 @operation(Operation.ORDER_BY) 476 def orderBy( 477 self, 478 *cols: t.Union[str, Column], 479 ascending: t.Optional[t.Union[t.Any, t.List[t.Any]]] = None, 480 ) -> DataFrame: 481 """ 482 This implementation lets any ordered columns take priority over whatever is provided in `ascending`. Spark 483 has irregular behavior and can result in runtime errors. Users shouldn't be mixing the two anyways so this 484 is unlikely to come up. 485 """ 486 columns = self._ensure_and_normalize_cols(cols) 487 pre_ordered_col_indexes = [ 488 x 489 for x in [ 490 i if isinstance(col.expression, exp.Ordered) else None 491 for i, col in enumerate(columns) 492 ] 493 if x is not None 494 ] 495 if ascending is None: 496 ascending = [True] * len(columns) 497 elif not isinstance(ascending, list): 498 ascending = [ascending] * len(columns) 499 ascending = [bool(x) for i, x in enumerate(ascending)] 500 assert len(columns) == len( 501 ascending 502 ), "The length of items in ascending must equal the number of columns provided" 503 col_and_ascending = list(zip(columns, ascending)) 504 order_by_columns = [ 505 exp.Ordered(this=col.expression, desc=not asc) 506 if i not in pre_ordered_col_indexes 507 else columns[i].column_expression 508 for i, (col, asc) in enumerate(col_and_ascending) 509 ] 510 return self.copy(expression=self.expression.order_by(*order_by_columns))
This implementation lets any ordered columns take priority over whatever is provided in `ascending`. Spark has irregular behavior and can result in runtime errors. Users shouldn't be mixing the two anyway, so this is unlikely to come up.
520 @operation(Operation.FROM) 521 def unionByName(self, other: DataFrame, allowMissingColumns: bool = False): 522 l_columns = self.columns 523 r_columns = other.columns 524 if not allowMissingColumns: 525 l_expressions = l_columns 526 r_expressions = l_columns 527 else: 528 l_expressions = [] 529 r_expressions = [] 530 r_columns_unused = copy(r_columns) 531 for l_column in l_columns: 532 l_expressions.append(l_column) 533 if l_column in r_columns: 534 r_expressions.append(l_column) 535 r_columns_unused.remove(l_column) 536 else: 537 r_expressions.append(exp.alias_(exp.Null(), l_column)) 538 for r_column in r_columns_unused: 539 l_expressions.append(exp.alias_(exp.Null(), r_column)) 540 r_expressions.append(r_column) 541 r_df = ( 542 other.copy()._convert_leaf_to_cte().select(*self._ensure_list_of_columns(r_expressions)) 543 ) 544 l_df = self.copy() 545 if allowMissingColumns: 546 l_df = l_df._convert_leaf_to_cte().select(*self._ensure_list_of_columns(l_expressions)) 547 return l_df._set_operation(exp.Union, r_df, False)
565 @operation(Operation.SELECT) 566 def dropDuplicates(self, subset: t.Optional[t.List[str]] = None): 567 if not subset: 568 return self.distinct() 569 column_names = ensure_list(subset) 570 window = Window.partitionBy(*column_names).orderBy(*column_names) 571 return ( 572 self.copy() 573 .withColumn("row_num", F.row_number().over(window)) 574 .where(F.col("row_num") == F.lit(1)) 575 .drop("row_num") 576 )
578 @operation(Operation.FROM) 579 def dropna( 580 self, 581 how: str = "any", 582 thresh: t.Optional[int] = None, 583 subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None, 584 ) -> DataFrame: 585 minimum_non_null = thresh or 0 # will be determined later if thresh is null 586 new_df = self.copy() 587 all_columns = self._get_outer_select_columns(new_df.expression) 588 if subset: 589 null_check_columns = self._ensure_and_normalize_cols(subset) 590 else: 591 null_check_columns = all_columns 592 if thresh is None: 593 minimum_num_nulls = 1 if how == "any" else len(null_check_columns) 594 else: 595 minimum_num_nulls = len(null_check_columns) - minimum_non_null + 1 596 if minimum_num_nulls > len(null_check_columns): 597 raise RuntimeError( 598 f"The minimum num nulls for dropna must be less than or equal to the number of columns. " 599 f"Minimum num nulls: {minimum_num_nulls}, Num Columns: {len(null_check_columns)}" 600 ) 601 if_null_checks = [ 602 F.when(column.isNull(), F.lit(1)).otherwise(F.lit(0)) for column in null_check_columns 603 ] 604 nulls_added_together = functools.reduce(lambda x, y: x + y, if_null_checks) 605 num_nulls = nulls_added_together.alias("num_nulls") 606 new_df = new_df.select(num_nulls, append=True) 607 filtered_df = new_df.where(F.col("num_nulls") < F.lit(minimum_num_nulls)) 608 final_df = filtered_df.select(*all_columns) 609 return final_df
611 @operation(Operation.FROM) 612 def fillna( 613 self, 614 value: t.Union[ColumnLiterals], 615 subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None, 616 ) -> DataFrame: 617 """ 618 Functionality Difference: If you provide a value to replace a null and that type conflicts 619 with the type of the column then PySpark will just ignore your replacement. 620 This will try to cast them to be the same in some cases. So they won't always match. 621 Best to not mix types so make sure replacement is the same type as the column 622 623 Possibility for improvement: Use `typeof` function to get the type of the column 624 and check if it matches the type of the value provided. If not then make it null. 625 """ 626 from sqlglot.dataframe.sql.functions import lit 627 628 values = None 629 columns = None 630 new_df = self.copy() 631 all_columns = self._get_outer_select_columns(new_df.expression) 632 all_column_mapping = {column.alias_or_name: column for column in all_columns} 633 if isinstance(value, dict): 634 values = list(value.values()) 635 columns = self._ensure_and_normalize_cols(list(value)) 636 if not columns: 637 columns = self._ensure_and_normalize_cols(subset) if subset else all_columns 638 if not values: 639 values = [value] * len(columns) 640 value_columns = [lit(value) for value in values] 641 642 null_replacement_mapping = { 643 column.alias_or_name: ( 644 F.when(column.isNull(), value).otherwise(column).alias(column.alias_or_name) 645 ) 646 for column, value in zip(columns, value_columns) 647 } 648 null_replacement_mapping = {**all_column_mapping, **null_replacement_mapping} 649 null_replacement_columns = [ 650 null_replacement_mapping[column.alias_or_name] for column in all_columns 651 ] 652 new_df = new_df.select(*null_replacement_columns) 653 return new_df
Functionality Difference: If you provide a value to replace a null and that type conflicts with the type of the column then PySpark will just ignore your replacement. This will try to cast them to be the same in some cases. So they won't always match. Best to not mix types so make sure replacement is the same type as the column
Possibility for improvement: use the `typeof` function to get the type of the column and check whether it matches the type of the value provided; if not, make it null.
655 @operation(Operation.FROM) 656 def replace( 657 self, 658 to_replace: t.Union[bool, int, float, str, t.List, t.Dict], 659 value: t.Optional[t.Union[bool, int, float, str, t.List]] = None, 660 subset: t.Optional[t.Collection[ColumnOrName] | ColumnOrName] = None, 661 ) -> DataFrame: 662 from sqlglot.dataframe.sql.functions import lit 663 664 old_values = None 665 new_df = self.copy() 666 all_columns = self._get_outer_select_columns(new_df.expression) 667 all_column_mapping = {column.alias_or_name: column for column in all_columns} 668 669 columns = self._ensure_and_normalize_cols(subset) if subset else all_columns 670 if isinstance(to_replace, dict): 671 old_values = list(to_replace) 672 new_values = list(to_replace.values()) 673 elif not old_values and isinstance(to_replace, list): 674 assert isinstance(value, list), "value must be a list since the replacements are a list" 675 assert len(to_replace) == len( 676 value 677 ), "the replacements and values must be the same length" 678 old_values = to_replace 679 new_values = value 680 else: 681 old_values = [to_replace] * len(columns) 682 new_values = [value] * len(columns) 683 old_values = [lit(value) for value in old_values] 684 new_values = [lit(value) for value in new_values] 685 686 replacement_mapping = {} 687 for column in columns: 688 expression = Column(None) 689 for i, (old_value, new_value) in enumerate(zip(old_values, new_values)): 690 if i == 0: 691 expression = F.when(column == old_value, new_value) 692 else: 693 expression = expression.when(column == old_value, new_value) # type: ignore 694 replacement_mapping[column.alias_or_name] = expression.otherwise(column).alias( 695 column.expression.alias_or_name 696 ) 697 698 replacement_mapping = {**all_column_mapping, **replacement_mapping} 699 replacement_columns = [replacement_mapping[column.alias_or_name] for column in all_columns] 700 new_df = new_df.select(*replacement_columns) 701 return new_df
703 @operation(Operation.SELECT) 704 def withColumn(self, colName: str, col: Column) -> DataFrame: 705 col = self._ensure_and_normalize_col(col) 706 existing_col_names = self.expression.named_selects 707 existing_col_index = ( 708 existing_col_names.index(colName) if colName in existing_col_names else None 709 ) 710 if existing_col_index: 711 expression = self.expression.copy() 712 expression.expressions[existing_col_index] = col.expression 713 return self.copy(expression=expression) 714 return self.copy().select(col.alias(colName), append=True)
716 @operation(Operation.SELECT) 717 def withColumnRenamed(self, existing: str, new: str): 718 expression = self.expression.copy() 719 existing_columns = [ 720 expression 721 for expression in expression.expressions 722 if expression.alias_or_name == existing 723 ] 724 if not existing_columns: 725 raise ValueError("Tried to rename a column that doesn't exist") 726 for existing_column in existing_columns: 727 if isinstance(existing_column, exp.Column): 728 existing_column.replace(exp.alias_(existing_column.copy(), new)) 729 else: 730 existing_column.set("alias", exp.to_identifier(new)) 731 return self.copy(expression=expression)
733 @operation(Operation.SELECT) 734 def drop(self, *cols: t.Union[str, Column]) -> DataFrame: 735 all_columns = self._get_outer_select_columns(self.expression) 736 drop_cols = self._ensure_and_normalize_cols(cols) 737 new_columns = [ 738 col 739 for col in all_columns 740 if col.alias_or_name not in [drop_column.alias_or_name for drop_column in drop_cols] 741 ] 742 return self.copy().select(*new_columns, append=False)
748 @operation(Operation.NO_OP) 749 def hint(self, name: str, *parameters: t.Optional[t.Union[str, int]]) -> DataFrame: 750 parameter_list = ensure_list(parameters) 751 parameter_columns = ( 752 self._ensure_list_of_columns(parameter_list) 753 if parameters 754 else Column.ensure_cols([self.sequence_id]) 755 ) 756 return self._hint(name, parameter_columns)
758 @operation(Operation.NO_OP) 759 def repartition( 760 self, numPartitions: t.Union[int, ColumnOrName], *cols: ColumnOrName 761 ) -> DataFrame: 762 num_partition_cols = self._ensure_list_of_columns(numPartitions) 763 columns = self._ensure_and_normalize_cols(cols) 764 args = num_partition_cols + columns 765 return self._hint("repartition", args)
776 @operation(Operation.NO_OP) 777 def persist(self, storageLevel: str = "MEMORY_AND_DISK_SER") -> DataFrame: 778 """ 779 Storage Level Options: https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-aux-cache-cache-table.html 780 """ 781 return self._cache(storageLevel)
Storage Level Options: https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-aux-cache-cache-table.html
14class GroupedData: 15 def __init__(self, df: DataFrame, group_by_cols: t.List[Column], last_op: Operation): 16 self._df = df.copy() 17 self.spark = df.spark 18 self.last_op = last_op 19 self.group_by_cols = group_by_cols 20 21 def _get_function_applied_columns( 22 self, func_name: str, cols: t.Tuple[str, ...] 23 ) -> t.List[Column]: 24 func_name = func_name.lower() 25 return [getattr(F, func_name)(name).alias(f"{func_name}({name})") for name in cols] 26 27 @operation(Operation.SELECT) 28 def agg(self, *exprs: t.Union[Column, t.Dict[str, str]]) -> DataFrame: 29 columns = ( 30 [Column(f"{agg_func}({column_name})") for column_name, agg_func in exprs[0].items()] 31 if isinstance(exprs[0], dict) 32 else exprs 33 ) 34 cols = self._df._ensure_and_normalize_cols(columns) 35 36 expression = self._df.expression.group_by( 37 *[x.expression for x in self.group_by_cols] 38 ).select(*[x.expression for x in self.group_by_cols + cols], append=False) 39 return self._df.copy(expression=expression) 40 41 def count(self) -> DataFrame: 42 return self.agg(F.count("*").alias("count")) 43 44 def mean(self, *cols: str) -> DataFrame: 45 return self.avg(*cols) 46 47 def avg(self, *cols: str) -> DataFrame: 48 return self.agg(*self._get_function_applied_columns("avg", cols)) 49 50 def max(self, *cols: str) -> DataFrame: 51 return self.agg(*self._get_function_applied_columns("max", cols)) 52 53 def min(self, *cols: str) -> DataFrame: 54 return self.agg(*self._get_function_applied_columns("min", cols)) 55 56 def sum(self, *cols: str) -> DataFrame: 57 return self.agg(*self._get_function_applied_columns("sum", cols)) 58 59 def pivot(self, *cols: str) -> DataFrame: 60 raise NotImplementedError("Sum distinct is not currently implemented")
27 @operation(Operation.SELECT) 28 def agg(self, *exprs: t.Union[Column, t.Dict[str, str]]) -> DataFrame: 29 columns = ( 30 [Column(f"{agg_func}({column_name})") for column_name, agg_func in exprs[0].items()] 31 if isinstance(exprs[0], dict) 32 else exprs 33 ) 34 cols = self._df._ensure_and_normalize_cols(columns) 35 36 expression = self._df.expression.group_by( 37 *[x.expression for x in self.group_by_cols] 38 ).select(*[x.expression for x in self.group_by_cols + cols], append=False) 39 return self._df.copy(expression=expression)
16class Column: 17 def __init__(self, expression: t.Optional[t.Union[ColumnOrLiteral, exp.Expression]]): 18 if isinstance(expression, Column): 19 expression = expression.expression # type: ignore 20 elif expression is None or not isinstance(expression, (str, exp.Expression)): 21 expression = self._lit(expression).expression # type: ignore 22 23 expression = sqlglot.maybe_parse(expression, dialect="spark") 24 if expression is None: 25 raise ValueError(f"Could not parse {expression}") 26 self.expression: exp.Expression = expression 27 28 def __repr__(self): 29 return repr(self.expression) 30 31 def __hash__(self): 32 return hash(self.expression) 33 34 def __eq__(self, other: ColumnOrLiteral) -> Column: # type: ignore 35 return self.binary_op(exp.EQ, other) 36 37 def __ne__(self, other: ColumnOrLiteral) -> Column: # type: ignore 38 return self.binary_op(exp.NEQ, other) 39 40 def __gt__(self, other: ColumnOrLiteral) -> Column: 41 return self.binary_op(exp.GT, other) 42 43 def __ge__(self, other: ColumnOrLiteral) -> Column: 44 return self.binary_op(exp.GTE, other) 45 46 def __lt__(self, other: ColumnOrLiteral) -> Column: 47 return self.binary_op(exp.LT, other) 48 49 def __le__(self, other: ColumnOrLiteral) -> Column: 50 return self.binary_op(exp.LTE, other) 51 52 def __and__(self, other: ColumnOrLiteral) -> Column: 53 return self.binary_op(exp.And, other) 54 55 def __or__(self, other: ColumnOrLiteral) -> Column: 56 return self.binary_op(exp.Or, other) 57 58 def __mod__(self, other: ColumnOrLiteral) -> Column: 59 return self.binary_op(exp.Mod, other) 60 61 def __add__(self, other: ColumnOrLiteral) -> Column: 62 return self.binary_op(exp.Add, other) 63 64 def __sub__(self, other: ColumnOrLiteral) -> Column: 65 return self.binary_op(exp.Sub, other) 66 67 def __mul__(self, other: ColumnOrLiteral) -> Column: 68 return self.binary_op(exp.Mul, other) 69 70 def __truediv__(self, other: ColumnOrLiteral) -> Column: 71 return self.binary_op(exp.Div, other) 72 73 def __div__(self, 
other: ColumnOrLiteral) -> Column: 74 return self.binary_op(exp.Div, other) 75 76 def __neg__(self) -> Column: 77 return self.unary_op(exp.Neg) 78 79 def __radd__(self, other: ColumnOrLiteral) -> Column: 80 return self.inverse_binary_op(exp.Add, other) 81 82 def __rsub__(self, other: ColumnOrLiteral) -> Column: 83 return self.inverse_binary_op(exp.Sub, other) 84 85 def __rmul__(self, other: ColumnOrLiteral) -> Column: 86 return self.inverse_binary_op(exp.Mul, other) 87 88 def __rdiv__(self, other: ColumnOrLiteral) -> Column: 89 return self.inverse_binary_op(exp.Div, other) 90 91 def __rtruediv__(self, other: ColumnOrLiteral) -> Column: 92 return self.inverse_binary_op(exp.Div, other) 93 94 def __rmod__(self, other: ColumnOrLiteral) -> Column: 95 return self.inverse_binary_op(exp.Mod, other) 96 97 def __pow__(self, power: ColumnOrLiteral, modulo=None): 98 return Column(exp.Pow(this=self.expression, expression=Column(power).expression)) 99 100 def __rpow__(self, power: ColumnOrLiteral): 101 return Column(exp.Pow(this=Column(power).expression, expression=self.expression)) 102 103 def __invert__(self): 104 return self.unary_op(exp.Not) 105 106 def __rand__(self, other: ColumnOrLiteral) -> Column: 107 return self.inverse_binary_op(exp.And, other) 108 109 def __ror__(self, other: ColumnOrLiteral) -> Column: 110 return self.inverse_binary_op(exp.Or, other) 111 112 @classmethod 113 def ensure_col(cls, value: t.Optional[t.Union[ColumnOrLiteral, exp.Expression]]): 114 return cls(value) 115 116 @classmethod 117 def ensure_cols(cls, args: t.List[t.Union[ColumnOrLiteral, exp.Expression]]) -> t.List[Column]: 118 return [cls.ensure_col(x) if not isinstance(x, Column) else x for x in args] 119 120 @classmethod 121 def _lit(cls, value: ColumnOrLiteral) -> Column: 122 if isinstance(value, dict): 123 columns = [cls._lit(v).alias(k).expression for k, v in value.items()] 124 return cls(exp.Struct(expressions=columns)) 125 return cls(exp.convert(value)) 126 127 @classmethod 128 def 
invoke_anonymous_function( 129 cls, column: t.Optional[ColumnOrLiteral], func_name: str, *args: t.Optional[ColumnOrLiteral] 130 ) -> Column: 131 columns = [] if column is None else [cls.ensure_col(column)] 132 column_args = [cls.ensure_col(arg) for arg in args] 133 expressions = [x.expression for x in columns + column_args] 134 new_expression = exp.Anonymous(this=func_name.upper(), expressions=expressions) 135 return Column(new_expression) 136 137 @classmethod 138 def invoke_expression_over_column( 139 cls, column: t.Optional[ColumnOrLiteral], callable_expression: t.Callable, **kwargs 140 ) -> Column: 141 ensured_column = None if column is None else cls.ensure_col(column) 142 ensure_expression_values = { 143 k: [Column.ensure_col(x).expression for x in v] 144 if is_iterable(v) 145 else Column.ensure_col(v).expression 146 for k, v in kwargs.items() 147 if v is not None 148 } 149 new_expression = ( 150 callable_expression(**ensure_expression_values) 151 if ensured_column is None 152 else callable_expression( 153 this=ensured_column.column_expression, **ensure_expression_values 154 ) 155 ) 156 return Column(new_expression) 157 158 def binary_op(self, klass: t.Callable, other: ColumnOrLiteral, **kwargs) -> Column: 159 return Column( 160 klass(this=self.column_expression, expression=Column(other).column_expression, **kwargs) 161 ) 162 163 def inverse_binary_op(self, klass: t.Callable, other: ColumnOrLiteral, **kwargs) -> Column: 164 return Column( 165 klass(this=Column(other).column_expression, expression=self.column_expression, **kwargs) 166 ) 167 168 def unary_op(self, klass: t.Callable, **kwargs) -> Column: 169 return Column(klass(this=self.column_expression, **kwargs)) 170 171 @property 172 def is_alias(self): 173 return isinstance(self.expression, exp.Alias) 174 175 @property 176 def is_column(self): 177 return isinstance(self.expression, exp.Column) 178 179 @property 180 def column_expression(self) -> exp.Column: 181 return self.expression.unalias() 182 183 
@property
def alias_or_name(self) -> str:
    # Name used to reference this column in projections (an alias wins over the raw name).
    return self.expression.alias_or_name

@classmethod
def ensure_literal(cls, value) -> Column:
    """Coerce ``value`` into a Column wrapping a literal expression."""
    from sqlglot.dataframe.sql.functions import lit

    if isinstance(value, cls):
        value = value.expression
    if not isinstance(value, exp.Literal):
        return lit(value)
    return Column(value)

def copy(self) -> Column:
    """Return a Column backed by a deep copy of the underlying expression."""
    return Column(self.expression.copy())

def set_table_name(self, table_name: str, copy=False) -> Column:
    """Qualify this column with ``table_name``; mutates in place unless ``copy=True``."""
    expression = self.expression.copy() if copy else self.expression
    expression.set("table", exp.to_identifier(table_name))
    return Column(expression)

def sql(self, **kwargs) -> str:
    """Render this column as Spark SQL (caller kwargs may override the dialect)."""
    return self.expression.sql(**{"dialect": "spark", **kwargs})

def alias(self, name: str) -> Column:
    """Return this column aliased as ``name`` (built on the unaliased expression)."""
    new_expression = exp.alias_(self.column_expression, name)
    return Column(new_expression)

def asc(self) -> Column:
    # Ascending order with nulls first (the flags are set explicitly below).
    new_expression = exp.Ordered(this=self.column_expression, desc=False, nulls_first=True)
    return Column(new_expression)

def desc(self) -> Column:
    # Descending order with nulls last.
    new_expression = exp.Ordered(this=self.column_expression, desc=True, nulls_first=False)
    return Column(new_expression)

# asc() already places nulls first, so it doubles as asc_nulls_first.
asc_nulls_first = asc

def asc_nulls_last(self) -> Column:
    new_expression = exp.Ordered(this=self.column_expression, desc=False, nulls_first=False)
    return Column(new_expression)

def desc_nulls_first(self) -> Column:
    new_expression = exp.Ordered(this=self.column_expression, desc=True, nulls_first=True)
    return Column(new_expression)

# desc() already places nulls last, so it doubles as desc_nulls_last.
desc_nulls_last = desc

def when(self, condition: Column, value: t.Any) -> Column:
    """Append a WHEN/THEN branch to this CASE, or start a new CASE expression."""
    from sqlglot.dataframe.sql.functions import when

    column_with_if = when(condition, value)
    # If self is not already a CASE, this call starts a fresh one.
    if not isinstance(self.expression, exp.Case):
        return column_with_if
    new_column = self.copy()
    new_column.expression.args["ifs"].extend(column_with_if.expression.args["ifs"])
    return new_column

def otherwise(self, value: t.Any) -> Column:
    """Set the ELSE (default) branch of this CASE expression."""
    from sqlglot.dataframe.sql.functions import lit

    true_value = value if isinstance(value, Column) else lit(value)
    new_column = self.copy()
    new_column.expression.set("default", true_value.column_expression)
    return new_column

def isNull(self) -> Column:
    # Renders as `col IS NULL`.
    new_expression = exp.Is(this=self.column_expression, expression=exp.Null())
    return Column(new_expression)

def isNotNull(self) -> Column:
    # Renders as `NOT col IS NULL`.
    new_expression = exp.Not(this=exp.Is(this=self.column_expression, expression=exp.Null()))
    return Column(new_expression)

def cast(self, dataType: t.Union[str, DataType]) -> Column:
    """
    Functionality difference: PySpark's cast accepts a DataType instance;
    sqlglot does not currently replicate those classes, so only a string
    type name (or a DataType reduced to one) is accepted.
    """
    if isinstance(dataType, DataType):
        dataType = dataType.simpleString()
    return Column(exp.cast(self.column_expression, dataType, dialect="spark"))

def startswith(self, value: t.Union[str, Column]) -> Column:
    # Emitted as a bare STARTSWITH() function call via exp.Anonymous.
    value = self._lit(value) if not isinstance(value, Column) else value
    return self.invoke_anonymous_function(self, "STARTSWITH", value)

def endswith(self, value: t.Union[str, Column]) -> Column:
    # Emitted as a bare ENDSWITH() function call via exp.Anonymous.
    value = self._lit(value) if not isinstance(value, Column) else value
    return self.invoke_anonymous_function(self, "ENDSWITH", value)

def rlike(self, regexp: str) -> Column:
    """Regex match predicate (exp.RegexpLike)."""
    return self.invoke_expression_over_column(
        column=self, callable_expression=exp.RegexpLike, expression=self._lit(regexp).expression
    )

def like(self, other: str) -> Column:
    """SQL LIKE predicate."""
    return self.invoke_expression_over_column(
        self, exp.Like, expression=self._lit(other).expression
    )

def ilike(self, other: str) -> Column:
    """Case-insensitive LIKE predicate."""
    return self.invoke_expression_over_column(
        self, exp.ILike, expression=self._lit(other).expression
    )

def substr(self, startPos: t.Union[int, Column], length: t.Union[int, Column]) -> Column:
    """Substring of this column starting at ``startPos`` with ``length`` characters."""
    startPos = self._lit(startPos) if not isinstance(startPos, Column) else startPos
    length = self._lit(length) if not isinstance(length, Column) else length
    return Column.invoke_expression_over_column(
        self, exp.Substring, start=startPos.expression, length=length.expression
    )

def isin(self, *cols: t.Union[ColumnOrLiteral, t.Iterable[ColumnOrLiteral]]):
    """IN predicate; accepts either varargs or a single iterable of values."""
    columns = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
    expressions = [self._lit(x).expression for x in columns]
    return Column.invoke_expression_over_column(self, exp.In, expressions=expressions)  # type: ignore

def between(
    self,
    lowerBound: t.Union[ColumnOrLiteral],
    upperBound: t.Union[ColumnOrLiteral],
) -> Column:
    """BETWEEN predicate; non-Column bounds are converted to literals."""
    lower_bound_exp = (
        self._lit(lowerBound) if not isinstance(lowerBound, Column) else lowerBound
    )
    upper_bound_exp = (
        self._lit(upperBound) if not isinstance(upperBound, Column) else upperBound
    )
    return Column(
        exp.Between(
            this=self.column_expression,
            low=lower_bound_exp.expression,
            high=upper_bound_exp.expression,
        )
    )

def over(self, window: WindowSpec) -> Column:
    """Apply this column (e.g. an aggregate) over the given window specification."""
    window_expression = window.expression.copy()
    window_expression.set("this", self.column_expression)
    return Column(window_expression)
def __init__(self, expression: t.Optional[t.Union[ColumnOrLiteral, exp.Expression]]):
    """Wrap a Column, sqlglot expression, SQL string, or literal as a Column.

    Raises:
        ValueError: if the input cannot be parsed into a sqlglot expression.
    """
    if isinstance(expression, Column):
        expression = expression.expression  # type: ignore
    elif expression is None or not isinstance(expression, (str, exp.Expression)):
        # Raw literals (ints, floats, dicts, ...) are converted via _lit.
        expression = self._lit(expression).expression  # type: ignore

    # Bug fix: keep the pre-parse value for the error message. The original
    # interpolated the post-parse result, which is always None on failure,
    # producing the useless message "Could not parse None".
    parsed = sqlglot.maybe_parse(expression, dialect="spark")
    if parsed is None:
        raise ValueError(f"Could not parse {expression}")
    self.expression: exp.Expression = parsed
@classmethod
def invoke_anonymous_function(
    cls, column: t.Optional[ColumnOrLiteral], func_name: str, *args: t.Optional[ColumnOrLiteral]
) -> Column:
    """Build a pass-through SQL function call (exp.Anonymous) over the operands."""
    operands = list(args) if column is None else [column, *args]
    expressions = [cls.ensure_col(operand).expression for operand in operands]
    return Column(exp.Anonymous(this=func_name.upper(), expressions=expressions))
@classmethod
def invoke_expression_over_column(
    cls, column: t.Optional[ColumnOrLiteral], callable_expression: t.Callable, **kwargs
) -> Column:
    """Instantiate a sqlglot expression class over an optional column plus kwargs.

    None-valued kwargs are dropped; iterable values are converted element-wise.
    """
    expression_kwargs = {}
    for key, value in kwargs.items():
        if value is None:
            continue
        if is_iterable(value):
            expression_kwargs[key] = [Column.ensure_col(item).expression for item in value]
        else:
            expression_kwargs[key] = Column.ensure_col(value).expression

    if column is None:
        return Column(callable_expression(**expression_kwargs))
    this = cls.ensure_col(column).column_expression
    return Column(callable_expression(this=this, **expression_kwargs))
def when(self, condition: Column, value: t.Any) -> Column:
    """Append a WHEN/THEN branch; starts a new CASE if self is not one already."""
    from sqlglot.dataframe.sql.functions import when

    branch = when(condition, value)
    if not isinstance(self.expression, exp.Case):
        # Nothing to extend: the new single-branch CASE is the result.
        return branch
    result = self.copy()
    result.expression.args["ifs"].extend(branch.expression.args["ifs"])
    return result
def cast(self, dataType: t.Union[str, DataType]) -> Column:
    """
    Functionality difference: PySpark's cast accepts a DataType instance;
    sqlglot does not currently replicate those classes, so only a string
    type name (or a DataType reduced to one) is accepted.
    """
    # DataType instances are reduced to their simple string form (e.g. "int").
    if isinstance(dataType, DataType):
        dataType = dataType.simpleString()
    return Column(exp.cast(self.column_expression, dataType, dialect="spark"))
Functionality difference: PySpark's `cast` accepts an instance of a `DataType` class. sqlglot does not currently replicate these classes, so only a string type name is accepted.
def substr(self, startPos: t.Union[int, Column], length: t.Union[int, Column]) -> Column:
    """Substring of this column starting at startPos with the given length."""
    start_col = startPos if isinstance(startPos, Column) else self._lit(startPos)
    length_col = length if isinstance(length, Column) else self._lit(length)
    return Column.invoke_expression_over_column(
        self, exp.Substring, start=start_col.expression, length=length_col.expression
    )
def isin(self, *cols: t.Union[ColumnOrLiteral, t.Iterable[ColumnOrLiteral]]):
    """IN predicate; accepts either varargs or a single iterable, mirroring PySpark.

    Raises:
        ValueError: if called with no values.
    """
    if not cols:
        # Previously an empty call crashed with a bare IndexError on cols[0].
        raise ValueError("isin requires at least one value")
    columns = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
    # Bug fix: pass Column arguments through unchanged and only wrap raw
    # literals. The original ran every value through _lit, which cannot
    # convert a Column instance; this matches the handling in
    # startswith/endswith/substr/between.
    expressions = [
        x.expression if isinstance(x, Column) else self._lit(x).expression for x in columns
    ]
    return Column.invoke_expression_over_column(self, exp.In, expressions=expressions)  # type: ignore
def between(
    self,
    lowerBound: t.Union[ColumnOrLiteral],
    upperBound: t.Union[ColumnOrLiteral],
) -> Column:
    """BETWEEN predicate (inclusive); non-Column bounds become literals."""
    low = lowerBound if isinstance(lowerBound, Column) else self._lit(lowerBound)
    high = upperBound if isinstance(upperBound, Column) else self._lit(upperBound)
    return Column(
        exp.Between(
            this=self.column_expression,
            low=low.expression,
            high=high.expression,
        )
    )
class DataFrameNaFunctions:
    """Thin PySpark-compatible facade that delegates null handling to a DataFrame."""

    def __init__(self, df: DataFrame):
        self.df = df

    def drop(
        self,
        how: str = "any",
        thresh: t.Optional[int] = None,
        subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None,
    ) -> DataFrame:
        """Drop rows containing nulls; delegates to DataFrame.dropna."""
        return self.df.dropna(how=how, thresh=thresh, subset=subset)

    def fill(
        self,
        value: t.Union[int, bool, float, str, t.Dict[str, t.Any]],
        subset: t.Optional[t.Union[str, t.Tuple[str, ...], t.List[str]]] = None,
    ) -> DataFrame:
        """Replace nulls with ``value``; delegates to DataFrame.fillna."""
        return self.df.fillna(value=value, subset=subset)

    def replace(
        self,
        to_replace: t.Union[bool, int, float, str, t.List, t.Dict],
        value: t.Optional[t.Union[bool, int, float, str, t.List]] = None,
        subset: t.Optional[t.Union[str, t.List[str]]] = None,
    ) -> DataFrame:
        """Replace matching values; delegates to DataFrame.replace."""
        return self.df.replace(to_replace=to_replace, value=value, subset=subset)
def replace(
    self,
    to_replace: t.Union[bool, int, float, str, t.List, t.Dict],
    value: t.Optional[t.Union[bool, int, float, str, t.List]] = None,
    subset: t.Optional[t.Union[str, t.List[str]]] = None,
) -> DataFrame:
    """Replace matching values; pure delegation to DataFrame.replace."""
    return self.df.replace(to_replace=to_replace, value=value, subset=subset)
class Window:
    """PySpark-compatible entry points for building window specifications."""

    # Java long range; PySpark uses these values as unbounded-frame sentinels.
    _JAVA_MIN_LONG = -(1 << 63)  # -9223372036854775808
    _JAVA_MAX_LONG = (1 << 63) - 1  # 9223372036854775807
    _PRECEDING_THRESHOLD = max(-sys.maxsize, _JAVA_MIN_LONG)
    _FOLLOWING_THRESHOLD = min(sys.maxsize, _JAVA_MAX_LONG)

    unboundedPreceding: int = _JAVA_MIN_LONG

    unboundedFollowing: int = _JAVA_MAX_LONG

    currentRow: int = 0

    @classmethod
    def partitionBy(cls, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
        """Start a new WindowSpec partitioned by the given columns."""
        return WindowSpec().partitionBy(*cols)

    @classmethod
    def orderBy(cls, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
        """Start a new WindowSpec ordered by the given columns."""
        return WindowSpec().orderBy(*cols)

    @classmethod
    def rowsBetween(cls, start: int, end: int) -> WindowSpec:
        """Start a new WindowSpec with a ROWS frame from start to end."""
        return WindowSpec().rowsBetween(start, end)

    @classmethod
    def rangeBetween(cls, start: int, end: int) -> WindowSpec:
        """Start a new WindowSpec with a RANGE frame from start to end."""
        return WindowSpec().rangeBetween(start, end)
class WindowSpec:
    """Builder for a sqlglot exp.Window expression, mirroring PySpark's WindowSpec."""

    def __init__(self, expression: t.Optional[exp.Expression] = None):
        # Bug fix: the previous signature was `expression: exp.Expression = exp.Window()`,
        # a mutable default evaluated once at definition time and shared by every
        # WindowSpec() call. Build a fresh Window node per instance instead; passing
        # None (or nothing) is backward compatible with the old default.
        self.expression = expression if expression is not None else exp.Window()

    def copy(self):
        """Deep-copy so builder methods never mutate the spec they were called on."""
        return WindowSpec(self.expression.copy())

    def sql(self, **kwargs) -> str:
        """Render the window clause as Spark SQL."""
        return self.expression.sql(dialect="spark", **kwargs)

    def partitionBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
        """Return a new spec with the given partition columns appended."""
        from sqlglot.dataframe.sql.column import Column

        cols = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
        expressions = [Column.ensure_col(x).expression for x in cols]
        window_spec = self.copy()
        partition_by_expressions = window_spec.expression.args.get("partition_by", [])
        partition_by_expressions.extend(expressions)
        window_spec.expression.set("partition_by", partition_by_expressions)
        return window_spec

    def orderBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
        """Return a new spec with the given ordering columns appended."""
        from sqlglot.dataframe.sql.column import Column

        cols = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
        expressions = [Column.ensure_col(x).expression for x in cols]
        window_spec = self.copy()
        if window_spec.expression.args.get("order") is None:
            window_spec.expression.set("order", exp.Order(expressions=[]))
        order_by = window_spec.expression.args["order"].expressions
        order_by.extend(expressions)
        window_spec.expression.args["order"].set("expressions", order_by)
        return window_spec

    def _calc_start_end(
        self, start: int, end: int
    ) -> t.Dict[str, t.Optional[t.Union[str, exp.Expression]]]:
        """Translate PySpark frame offsets into sqlglot WindowSpec kwargs.

        Offsets at/beyond Window.unboundedPreceding/unboundedFollowing map to
        UNBOUNDED; Window.currentRow maps to CURRENT ROW; anything else becomes
        a literal offset with the appropriate PRECEDING/FOLLOWING side.
        """
        kwargs: t.Dict[str, t.Optional[t.Union[str, exp.Expression]]] = {
            "start_side": None,
            "end_side": None,
        }
        if start == Window.currentRow:
            kwargs["start"] = "CURRENT ROW"
        else:
            kwargs = {
                **kwargs,
                **{
                    "start_side": "PRECEDING",
                    "start": "UNBOUNDED"
                    if start <= Window.unboundedPreceding
                    else F.lit(start).expression,
                },
            }
        if end == Window.currentRow:
            kwargs["end"] = "CURRENT ROW"
        else:
            kwargs = {
                **kwargs,
                **{
                    "end_side": "FOLLOWING",
                    "end": "UNBOUNDED"
                    if end >= Window.unboundedFollowing
                    else F.lit(end).expression,
                },
            }
        return kwargs

    def rowsBetween(self, start: int, end: int) -> WindowSpec:
        """Return a new spec with a ROWS frame spanning start to end."""
        window_spec = self.copy()
        spec = self._calc_start_end(start, end)
        spec["kind"] = "ROWS"
        # Merge into any existing frame spec rather than replacing it wholesale.
        window_spec.expression.set(
            "spec",
            exp.WindowSpec(
                **{**window_spec.expression.args.get("spec", exp.WindowSpec()).args, **spec}
            ),
        )
        return window_spec

    def rangeBetween(self, start: int, end: int) -> WindowSpec:
        """Return a new spec with a RANGE frame spanning start to end."""
        window_spec = self.copy()
        spec = self._calc_start_end(start, end)
        spec["kind"] = "RANGE"
        # Merge into any existing frame spec rather than replacing it wholesale.
        window_spec.expression.set(
            "spec",
            exp.WindowSpec(
                **{**window_spec.expression.args.get("spec", exp.WindowSpec()).args, **spec}
            ),
        )
        return window_spec
def partitionBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
    """Return a copy of this spec with the given partition columns appended."""
    from sqlglot.dataframe.sql.column import Column

    # A single list/set/tuple argument is unpacked, mirroring PySpark.
    if isinstance(cols[0], (list, set, tuple)):
        cols = flatten(cols)  # type: ignore
    new_spec = self.copy()
    partition_by = new_spec.expression.args.get("partition_by", [])
    partition_by.extend(Column.ensure_col(col).expression for col in cols)
    new_spec.expression.set("partition_by", partition_by)
    return new_spec
def orderBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
    """Return a copy of this spec with the given ordering columns appended."""
    from sqlglot.dataframe.sql.column import Column

    # A single list/set/tuple argument is unpacked, mirroring PySpark.
    if isinstance(cols[0], (list, set, tuple)):
        cols = flatten(cols)  # type: ignore
    new_spec = self.copy()
    if new_spec.expression.args.get("order") is None:
        new_spec.expression.set("order", exp.Order(expressions=[]))
    order_node = new_spec.expression.args["order"]
    order_expressions = order_node.expressions
    order_expressions.extend(Column.ensure_col(col).expression for col in cols)
    order_node.set("expressions", order_expressions)
    return new_spec
def rowsBetween(self, start: int, end: int) -> WindowSpec:
    """Return a copy of this spec with a ROWS frame between start and end."""
    new_spec = self.copy()
    frame = self._calc_start_end(start, end)
    frame["kind"] = "ROWS"
    # Merge with any existing frame spec instead of discarding it.
    existing_spec = new_spec.expression.args.get("spec", exp.WindowSpec())
    new_spec.expression.set("spec", exp.WindowSpec(**{**existing_spec.args, **frame}))
    return new_spec
def rangeBetween(self, start: int, end: int) -> WindowSpec:
    """Return a copy of this spec with a RANGE frame between start and end."""
    new_spec = self.copy()
    frame = self._calc_start_end(start, end)
    frame["kind"] = "RANGE"
    # Merge with any existing frame spec instead of discarding it.
    existing_spec = new_spec.expression.args.get("spec", exp.WindowSpec())
    new_spec.expression.set("spec", exp.WindowSpec(**{**existing_spec.args, **frame}))
    return new_spec
class DataFrameReader:
    """Minimal PySpark-style reader that builds DataFrames from catalog tables."""

    def __init__(self, spark: SparkSession):
        self.spark = spark

    def table(self, tableName: str) -> DataFrame:
        """Select every known column of ``tableName`` as a new DataFrame."""
        from sqlglot.dataframe.sql.dataframe import DataFrame

        # Registering the table lets the shared sqlglot schema resolve its columns.
        sqlglot.schema.add_table(tableName)
        column_names = sqlglot.schema.column_names(tableName)
        select_expression = exp.Select().from_(tableName).select(*column_names)
        return DataFrame(self.spark, select_expression)
class DataFrameWriter:
    """PySpark-style writer facade: wraps a DataFrame's expression in an
    Insert/Create output container; every builder method returns a new writer."""

    def __init__(
        self,
        df: DataFrame,
        spark: t.Optional[SparkSession] = None,
        mode: t.Optional[str] = None,
        by_name: bool = False,
    ):
        self._df = df
        self._spark = spark or df.spark
        self._mode = mode
        # When True, insertInto matches columns by name instead of position.
        self._by_name = by_name

    def copy(self, **kwargs) -> DataFrameWriter:
        """Clone this writer, overriding any attributes supplied in kwargs.

        Stored attributes are underscore-prefixed; the prefix is stripped so
        they can be fed back into __init__ as keyword arguments.
        """
        return DataFrameWriter(
            **{
                k[1:] if k.startswith("_") else k: v
                for k, v in object_to_dict(self, **kwargs).items()
            }
        )

    def sql(self, **kwargs) -> t.List[str]:
        """Render the pending write as SQL statements (delegates to the DataFrame)."""
        return self._df.sql(**kwargs)

    def mode(self, saveMode: t.Optional[str]) -> DataFrameWriter:
        """Return a copy of this writer with the save mode set (e.g. append/ignore/overwrite)."""
        return self.copy(_mode=saveMode)

    @property
    def byName(self):
        """Return a copy of this writer that matches insert columns by name."""
        return self.copy(by_name=True)

    def insertInto(self, tableName: str, overwrite: t.Optional[bool] = None) -> DataFrameWriter:
        """Stage an INSERT INTO ``tableName`` for this writer's DataFrame."""
        output_expression_container = exp.Insert(
            **{
                "this": exp.to_table(tableName),
                "overwrite": overwrite,
            }
        )
        df = self._df.copy(output_expression_container=output_expression_container)
        if self._by_name:
            # Only visible (insertable) columns participate in by-name matching.
            columns = sqlglot.schema.column_names(tableName, only_visible=True)
            df = df._convert_leaf_to_cte().select(*columns)

        return self.copy(_df=df)

    def saveAsTable(self, name: str, format: t.Optional[str] = None, mode: t.Optional[str] = None):
        """Stage a CREATE TABLE for this writer's DataFrame.

        Modes: "append" delegates to insertInto; "ignore" adds IF NOT EXISTS;
        "overwrite" adds OR REPLACE; any other mode creates a plain table.
        """
        if format is not None:
            raise NotImplementedError("Providing Format in the save as table is not supported")
        # NOTE(review): str(self._mode) turns a None mode into the string "None",
        # which matches none of the branches below — presumably intentional so the
        # comparisons never see None; confirm.
        exists, replace, mode = None, None, mode or str(self._mode)
        if mode == "append":
            return self.insertInto(name)
        if mode == "ignore":
            exists = True
        if mode == "overwrite":
            replace = True
        output_expression_container = exp.Create(
            this=exp.to_table(name),
            kind="TABLE",
            exists=exists,
            replace=replace,
        )
        return self.copy(_df=self._df.copy(output_expression_container=output_expression_container))
def insertInto(self, tableName: str, overwrite: t.Optional[bool] = None) -> DataFrameWriter:
    """Stage an INSERT INTO ``tableName`` for this writer's DataFrame."""
    insert_expression = exp.Insert(this=exp.to_table(tableName), overwrite=overwrite)
    new_df = self._df.copy(output_expression_container=insert_expression)
    if self._by_name:
        # By-name matching projects only the table's visible columns.
        visible_columns = sqlglot.schema.column_names(tableName, only_visible=True)
        new_df = new_df._convert_leaf_to_cte().select(*visible_columns)
    return self.copy(_df=new_df)
def saveAsTable(self, name: str, format: t.Optional[str] = None, mode: t.Optional[str] = None):
    """Stage a CREATE TABLE for this writer's DataFrame, honoring the save mode.

    Modes: "append" delegates to insertInto; "ignore" adds IF NOT EXISTS;
    "overwrite" adds OR REPLACE; any other mode creates a plain table.
    """
    if format is not None:
        raise NotImplementedError("Providing Format in the save as table is not supported")
    exists = None
    replace = None
    mode = mode or str(self._mode)
    if mode == "append":
        # Append is expressed as INSERT INTO rather than CREATE TABLE.
        return self.insertInto(name)
    if mode == "ignore":
        exists = True
    if mode == "overwrite":
        replace = True
    create_expression = exp.Create(
        this=exp.to_table(name),
        kind="TABLE",
        exists=exists,
        replace=replace,
    )
    return self.copy(_df=self._df.copy(output_expression_container=create_expression))