summaryrefslogtreecommitdiffstats
path: root/sqlglot/schema.py
blob: c916330493ffcf4b9a9658dab0c6abd3d10ec483 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import abc

from sqlglot import expressions as exp
from sqlglot.errors import OptimizeError
from sqlglot.helper import csv_reader


class Schema(abc.ABC):
    """Abstract base class for database schemas"""

    @abc.abstractmethod
    def add_table(self, table, column_mapping=None):
        """
        Register or update a table. Some implementing classes may require column information to also be provided

        Args:
            table (sqlglot.expressions.Table|str): Table expression instance or string representing the table
            column_mapping (dict|str|sqlglot.dataframe.sql.types.StructType|list): A column mapping that describes the structure of the table
        """

    @abc.abstractmethod
    def column_names(self, table, only_visible=False):
        """
        Get the column names for a table.
        Args:
            table (sqlglot.expressions.Table): Table expression instance
            only_visible (bool): Whether to include invisible columns
        Returns:
            list[str]: list of column names
        """

    @abc.abstractmethod
    def get_column_type(self, table, column):
        """
        Get the exp.DataType type of a column in the schema.

        Args:
            table (sqlglot.expressions.Table): The source table.
            column (sqlglot.expressions.Column): The target column.
        Returns:
            sqlglot.expressions.DataType.Type: The resulting column type.
        """


class MappingSchema(Schema):
    """
    Schema based on a nested mapping.

    Args:
        schema (dict): Mapping in one of the following forms:
            1. {table: {col: type}}
            2. {db: {table: {col: type}}}
            3. {catalog: {db: {table: {col: type}}}}
            4. None - Tables will be added later
        visible (dict): Optional mapping of which columns in the schema are visible. If not provided, all columns
            are assumed to be visible. The nesting should mirror that of the schema:
            1. {table: set(*cols)}}
            2. {db: {table: set(*cols)}}}
            3. {catalog: {db: {table: set(*cols)}}}}
        dialect (str): The dialect to be used for custom type mappings.
    """

    def __init__(self, schema=None, visible=None, dialect=None):
        self.schema = schema or {}
        self.visible = visible
        self.dialect = dialect
        self._type_mapping_cache = {}
        self.supported_table_args = []
        self.forbidden_table_args = set()
        if self.schema:
            self._initialize_supported_args()

    @classmethod
    def from_mapping_schema(cls, mapping_schema):
        return MappingSchema(
            schema=mapping_schema.schema, visible=mapping_schema.visible, dialect=mapping_schema.dialect
        )

    def copy(self, **kwargs):
        return MappingSchema(**{"schema": self.schema.copy(), **kwargs})

    def add_table(self, table, column_mapping=None):
        """
        Register or update a table. Updates are only performed if a new column mapping is provided.

        Args:
            table (sqlglot.expressions.Table|str): Table expression instance or string representing the table
            column_mapping (dict|str|sqlglot.dataframe.sql.types.StructType|list): A column mapping that describes the structure of the table
        """
        table = exp.to_table(table)
        self._validate_table(table)
        column_mapping = ensure_column_mapping(column_mapping)
        table_args = [table.text(p) for p in self.supported_table_args or self._get_table_args_from_table(table)]
        existing_column_mapping = _nested_get(
            self.schema, *zip(self.supported_table_args, table_args), raise_on_missing=False
        )
        if existing_column_mapping and not column_mapping:
            return
        _nested_set(
            self.schema,
            [table.text(p) for p in self.supported_table_args or self._get_table_args_from_table(table)],
            column_mapping,
        )
        self._initialize_supported_args()

    def _get_table_args_from_table(self, table):
        if table.args.get("catalog") is not None:
            return "catalog", "db", "this"
        if table.args.get("db") is not None:
            return "db", "this"
        return ("this",)

    def _validate_table(self, table):
        if not self.supported_table_args and isinstance(table, exp.Table):
            return
        for forbidden in self.forbidden_table_args:
            if table.text(forbidden):
                raise ValueError(f"Schema doesn't support {forbidden}. Received: {table.sql()}")
        for expected in self.supported_table_args:
            if not table.text(expected):
                raise ValueError(f"Table is expected to have {expected}. Received: {table.sql()} ")

    def column_names(self, table, only_visible=False):
        table = exp.to_table(table)
        if not isinstance(table.this, exp.Identifier):
            return fs_get(table)

        args = tuple(table.text(p) for p in self.supported_table_args)

        for forbidden in self.forbidden_table_args:
            if table.text(forbidden):
                raise ValueError(f"Schema doesn't support {forbidden}. Received: {table.sql()}")

        columns = list(_nested_get(self.schema, *zip(self.supported_table_args, args)))
        if not only_visible or not self.visible:
            return columns

        visible = _nested_get(self.visible, *zip(self.supported_table_args, args))
        return [col for col in columns if col in visible]

    def get_column_type(self, table, column):
        try:
            schema_type = self.schema.get(table.name, {}).get(column.name).upper()
            return self._convert_type(schema_type)
        except:
            raise OptimizeError(f"Failed to get type for column {column.sql()}")

    def _convert_type(self, schema_type):
        """
        Convert a type represented as a string to the corresponding exp.DataType.Type object.
        Args:
            schema_type (str): The type we want to convert.
        Returns:
            sqlglot.expressions.DataType.Type: The resulting expression type.
        """
        if schema_type not in self._type_mapping_cache:
            try:
                self._type_mapping_cache[schema_type] = exp.maybe_parse(
                    schema_type, into=exp.DataType, dialect=self.dialect
                ).this
            except AttributeError:
                raise OptimizeError(f"Failed to convert type {schema_type}")

        return self._type_mapping_cache[schema_type]

    def _initialize_supported_args(self):
        if not self.supported_table_args:
            depth = _dict_depth(self.schema)

            all_args = ["this", "db", "catalog"]
            if not depth or depth == 1:  # {}
                self.supported_table_args = []
            elif 2 <= depth <= 4:
                self.supported_table_args = tuple(reversed(all_args[: depth - 1]))
            else:
                raise OptimizeError(f"Invalid schema shape. Depth: {depth}")

            self.forbidden_table_args = {"catalog", "db", "this"} - set(self.supported_table_args)


def ensure_schema(schema):
    if isinstance(schema, Schema):
        return schema

    return MappingSchema(schema)


def ensure_column_mapping(mapping):
    if isinstance(mapping, dict):
        return mapping
    elif isinstance(mapping, str):
        col_name_type_strs = [x.strip() for x in mapping.split(",")]
        return {
            name_type_str.split(":")[0].strip(): name_type_str.split(":")[1].strip()
            for name_type_str in col_name_type_strs
        }
    # Check if mapping looks like a DataFrame StructType
    elif hasattr(mapping, "simpleString"):
        return {struct_field.name: struct_field.dataType.simpleString() for struct_field in mapping}
    elif isinstance(mapping, list):
        return {x.strip(): None for x in mapping}
    elif mapping is None:
        return {}
    raise ValueError(f"Invalid mapping provided: {type(mapping)}")


def fs_get(table):
    name = table.this.name

    if name.upper() == "READ_CSV":
        with csv_reader(table) as reader:
            return next(reader)

    raise ValueError(f"Cannot read schema for {table}")


def _nested_get(d, *path, raise_on_missing=True):
    """
    Get a value for a nested dictionary.

    Args:
        d (dict): dictionary
        *path (tuple[str, str]): tuples of (name, key)
            `key` is the key in the dictionary to get.
            `name` is a string to use in the error if `key` isn't found.

    Returns:
        The value or None if it doesn't exist
    """
    for name, key in path:
        d = d.get(key)
        if d is None:
            if raise_on_missing:
                name = "table" if name == "this" else name
                raise ValueError(f"Unknown {name}")
            return None
    return d


def _nested_set(d, keys, value):
    """
    In-place set a value for a nested dictionary

    Ex:
        >>> _nested_set({}, ["top_key", "second_key"], "value")
        {'top_key': {'second_key': 'value'}}
        >>> _nested_set({"top_key": {"third_key": "third_value"}}, ["top_key", "second_key"], "value")
        {'top_key': {'third_key': 'third_value', 'second_key': 'value'}}

    d (dict): dictionary
    keys (Iterable[str]): ordered iterable of keys that makeup path to value
    value (Any): The value to set in the dictionary for the given key path
    """
    if not keys:
        return
    if len(keys) == 1:
        d[keys[0]] = value
        return
    subd = d
    for key in keys[:-1]:
        if key not in subd:
            subd = subd.setdefault(key, {})
        else:
            subd = subd[key]
    subd[keys[-1]] = value
    return d


def _dict_depth(d):
    """
    Get the nesting depth of a dictionary.

    For example:
        >>> _dict_depth(None)
        0
        >>> _dict_depth({})
        1
        >>> _dict_depth({"a": "b"})
        1
        >>> _dict_depth({"a": {}})
        2
        >>> _dict_depth({"a": {"b": {}}})
        3

    Args:
        d (dict): dictionary
    Returns:
        int: depth
    """
    try:
        return 1 + _dict_depth(next(iter(d.values())))
    except AttributeError:
        # d doesn't have attribute "values"
        return 0
    except StopIteration:
        # d.values() returns an empty sequence
        return 1