Edit on GitHub

sqlglot.optimizer.scope

  1import itertools
  2from collections import defaultdict
  3from enum import Enum, auto
  4
  5from sqlglot import exp
  6from sqlglot.errors import OptimizeError
  7
  8
  9class ScopeType(Enum):
 10    ROOT = auto()
 11    SUBQUERY = auto()
 12    DERIVED_TABLE = auto()
 13    CTE = auto()
 14    UNION = auto()
 15    UDTF = auto()
 16
 17
 18class Scope:
 19    """
 20    Selection scope.
 21
 22    Attributes:
 23        expression (exp.Select|exp.Union): Root expression of this scope
 24        sources (dict[str, exp.Table|Scope]): Mapping of source name to either
 25            a Table expression or another Scope instance. For example:
 26                SELECT * FROM x                     {"x": Table(this="x")}
 27                SELECT * FROM x AS y                {"y": Table(this="x")}
 28                SELECT * FROM (SELECT ...) AS y     {"y": Scope(...)}
 29        outer_column_list (list[str]): If this is a derived table or CTE, and the outer query
 30            defines a column list of it's alias of this scope, this is that list of columns.
 31            For example:
 32                SELECT * FROM (SELECT ...) AS y(col1, col2)
 33            The inner query would have `["col1", "col2"]` for its `outer_column_list`
 34        parent (Scope): Parent scope
 35        scope_type (ScopeType): Type of this scope, relative to it's parent
 36        subquery_scopes (list[Scope]): List of all child scopes for subqueries
 37        cte_scopes = (list[Scope]) List of all child scopes for CTEs
 38        derived_table_scopes = (list[Scope]) List of all child scopes for derived_tables
 39        union_scopes (list[Scope, Scope]): If this Scope is for a Union expression, this will be
 40            a list of the left and right child scopes.
 41    """
 42
 43    def __init__(
 44        self,
 45        expression,
 46        sources=None,
 47        outer_column_list=None,
 48        parent=None,
 49        scope_type=ScopeType.ROOT,
 50    ):
 51        self.expression = expression
 52        self.sources = sources or {}
 53        self.outer_column_list = outer_column_list or []
 54        self.parent = parent
 55        self.scope_type = scope_type
 56        self.subquery_scopes = []
 57        self.derived_table_scopes = []
 58        self.cte_scopes = []
 59        self.union_scopes = []
 60        self.clear_cache()
 61
 62    def clear_cache(self):
 63        self._collected = False
 64        self._raw_columns = None
 65        self._derived_tables = None
 66        self._tables = None
 67        self._ctes = None
 68        self._subqueries = None
 69        self._selected_sources = None
 70        self._columns = None
 71        self._external_columns = None
 72        self._join_hints = None
 73
 74    def branch(self, expression, scope_type, chain_sources=None, **kwargs):
 75        """Branch from the current scope to a new, inner scope"""
 76        return Scope(
 77            expression=expression.unnest(),
 78            sources={**self.cte_sources, **(chain_sources or {})},
 79            parent=self,
 80            scope_type=scope_type,
 81            **kwargs,
 82        )
 83
 84    def _collect(self):
 85        self._tables = []
 86        self._ctes = []
 87        self._subqueries = []
 88        self._derived_tables = []
 89        self._raw_columns = []
 90        self._join_hints = []
 91
 92        for node, parent, _ in self.walk(bfs=False):
 93            if node is self.expression:
 94                continue
 95            elif isinstance(node, exp.Column) and not isinstance(node.this, exp.Star):
 96                self._raw_columns.append(node)
 97            elif isinstance(node, exp.Table) and not isinstance(node.parent, exp.JoinHint):
 98                self._tables.append(node)
 99            elif isinstance(node, exp.JoinHint):
100                self._join_hints.append(node)
101            elif isinstance(node, exp.UDTF):
102                self._derived_tables.append(node)
103            elif isinstance(node, exp.CTE):
104                self._ctes.append(node)
105            elif isinstance(node, exp.Subquery) and isinstance(parent, (exp.From, exp.Join)):
106                self._derived_tables.append(node)
107            elif isinstance(node, exp.Subqueryable):
108                self._subqueries.append(node)
109
110        self._collected = True
111
112    def _ensure_collected(self):
113        if not self._collected:
114            self._collect()
115
116    def walk(self, bfs=True):
117        return walk_in_scope(self.expression, bfs=bfs)
118
119    def find(self, *expression_types, bfs=True):
120        """
121        Returns the first node in this scope which matches at least one of the specified types.
122
123        This does NOT traverse into subscopes.
124
125        Args:
126            expression_types (type): the expression type(s) to match.
127            bfs (bool): True to use breadth-first search, False to use depth-first.
128
129        Returns:
130            exp.Expression: the node which matches the criteria or None if no node matching
131            the criteria was found.
132        """
133        return next(self.find_all(*expression_types, bfs=bfs), None)
134
135    def find_all(self, *expression_types, bfs=True):
136        """
137        Returns a generator object which visits all nodes in this scope and only yields those that
138        match at least one of the specified expression types.
139
140        This does NOT traverse into subscopes.
141
142        Args:
143            expression_types (type): the expression type(s) to match.
144            bfs (bool): True to use breadth-first search, False to use depth-first.
145
146        Yields:
147            exp.Expression: nodes
148        """
149        for expression, _, _ in self.walk(bfs=bfs):
150            if isinstance(expression, expression_types):
151                yield expression
152
153    def replace(self, old, new):
154        """
155        Replace `old` with `new`.
156
157        This can be used instead of `exp.Expression.replace` to ensure the `Scope` is kept up-to-date.
158
159        Args:
160            old (exp.Expression): old node
161            new (exp.Expression): new node
162        """
163        old.replace(new)
164        self.clear_cache()
165
166    @property
167    def tables(self):
168        """
169        List of tables in this scope.
170
171        Returns:
172            list[exp.Table]: tables
173        """
174        self._ensure_collected()
175        return self._tables
176
177    @property
178    def ctes(self):
179        """
180        List of CTEs in this scope.
181
182        Returns:
183            list[exp.CTE]: ctes
184        """
185        self._ensure_collected()
186        return self._ctes
187
188    @property
189    def derived_tables(self):
190        """
191        List of derived tables in this scope.
192
193        For example:
194            SELECT * FROM (SELECT ...) <- that's a derived table
195
196        Returns:
197            list[exp.Subquery]: derived tables
198        """
199        self._ensure_collected()
200        return self._derived_tables
201
202    @property
203    def subqueries(self):
204        """
205        List of subqueries in this scope.
206
207        For example:
208            SELECT * FROM x WHERE a IN (SELECT ...) <- that's a subquery
209
210        Returns:
211            list[exp.Subqueryable]: subqueries
212        """
213        self._ensure_collected()
214        return self._subqueries
215
216    @property
217    def columns(self):
218        """
219        List of columns in this scope.
220
221        Returns:
222            list[exp.Column]: Column instances in this scope, plus any
223                Columns that reference this scope from correlated subqueries.
224        """
225        if self._columns is None:
226            self._ensure_collected()
227            columns = self._raw_columns
228
229            external_columns = [
230                column for scope in self.subquery_scopes for column in scope.external_columns
231            ]
232
233            named_selects = set(self.expression.named_selects)
234
235            self._columns = []
236            for column in columns + external_columns:
237                ancestor = column.find_ancestor(exp.Qualify, exp.Order, exp.Having, exp.Hint)
238                if (
239                    not ancestor
240                    # Window functions can have an ORDER BY clause
241                    or not isinstance(ancestor.parent, exp.Select)
242                    or column.table
243                    or (column.name not in named_selects and not isinstance(ancestor, exp.Hint))
244                ):
245                    self._columns.append(column)
246
247        return self._columns
248
249    @property
250    def selected_sources(self):
251        """
252        Mapping of nodes and sources that are actually selected from in this scope.
253
254        That is, all tables in a schema are selectable at any point. But a
255        table only becomes a selected source if it's included in a FROM or JOIN clause.
256
257        Returns:
258            dict[str, (exp.Table|exp.Select, exp.Table|Scope)]: selected sources and nodes
259        """
260        if self._selected_sources is None:
261            referenced_names = []
262
263            for table in self.tables:
264                referenced_names.append((table.alias_or_name, table))
265            for derived_table in self.derived_tables:
266                referenced_names.append((derived_table.alias, derived_table.unnest()))
267
268            result = {}
269
270            for name, node in referenced_names:
271                if name in self.sources:
272                    result[name] = (node, self.sources[name])
273
274            self._selected_sources = result
275        return self._selected_sources
276
277    @property
278    def cte_sources(self):
279        """
280        Sources that are CTEs.
281
282        Returns:
283            dict[str, Scope]: Mapping of source alias to Scope
284        """
285        return {
286            alias: scope
287            for alias, scope in self.sources.items()
288            if isinstance(scope, Scope) and scope.is_cte
289        }
290
291    @property
292    def selects(self):
293        """
294        Select expressions of this scope.
295
296        For example, for the following expression:
297            SELECT 1 as a, 2 as b FROM x
298
299        The outputs are the "1 as a" and "2 as b" expressions.
300
301        Returns:
302            list[exp.Expression]: expressions
303        """
304        if isinstance(self.expression, exp.Union):
305            return self.expression.unnest().selects
306        return self.expression.selects
307
308    @property
309    def external_columns(self):
310        """
311        Columns that appear to reference sources in outer scopes.
312
313        Returns:
314            list[exp.Column]: Column instances that don't reference
315                sources in the current scope.
316        """
317        if self._external_columns is None:
318            self._external_columns = [
319                c for c in self.columns if c.table not in self.selected_sources
320            ]
321        return self._external_columns
322
323    @property
324    def unqualified_columns(self):
325        """
326        Unqualified columns in the current scope.
327
328        Returns:
329             list[exp.Column]: Unqualified columns
330        """
331        return [c for c in self.columns if not c.table]
332
333    @property
334    def join_hints(self):
335        """
336        Hints that exist in the scope that reference tables
337
338        Returns:
339            list[exp.JoinHint]: Join hints that are referenced within the scope
340        """
341        if self._join_hints is None:
342            return []
343        return self._join_hints
344
345    def source_columns(self, source_name):
346        """
347        Get all columns in the current scope for a particular source.
348
349        Args:
350            source_name (str): Name of the source
351        Returns:
352            list[exp.Column]: Column instances that reference `source_name`
353        """
354        return [column for column in self.columns if column.table == source_name]
355
356    @property
357    def is_subquery(self):
358        """Determine if this scope is a subquery"""
359        return self.scope_type == ScopeType.SUBQUERY
360
361    @property
362    def is_derived_table(self):
363        """Determine if this scope is a derived table"""
364        return self.scope_type == ScopeType.DERIVED_TABLE
365
366    @property
367    def is_union(self):
368        """Determine if this scope is a union"""
369        return self.scope_type == ScopeType.UNION
370
371    @property
372    def is_cte(self):
373        """Determine if this scope is a common table expression"""
374        return self.scope_type == ScopeType.CTE
375
376    @property
377    def is_root(self):
378        """Determine if this is the root scope"""
379        return self.scope_type == ScopeType.ROOT
380
381    @property
382    def is_udtf(self):
383        """Determine if this scope is a UDTF (User Defined Table Function)"""
384        return self.scope_type == ScopeType.UDTF
385
386    @property
387    def is_correlated_subquery(self):
388        """Determine if this scope is a correlated subquery"""
389        return bool(self.is_subquery and self.external_columns)
390
391    def rename_source(self, old_name, new_name):
392        """Rename a source in this scope"""
393        columns = self.sources.pop(old_name or "", [])
394        self.sources[new_name] = columns
395
396    def add_source(self, name, source):
397        """Add a source to this scope"""
398        self.sources[name] = source
399        self.clear_cache()
400
401    def remove_source(self, name):
402        """Remove a source from this scope"""
403        self.sources.pop(name, None)
404        self.clear_cache()
405
406    def __repr__(self):
407        return f"Scope<{self.expression.sql()}>"
408
409    def traverse(self):
410        """
411        Traverse the scope tree from this node.
412
413        Yields:
414            Scope: scope instances in depth-first-search post-order
415        """
416        for child_scope in itertools.chain(
417            self.cte_scopes, self.union_scopes, self.derived_table_scopes, self.subquery_scopes
418        ):
419            yield from child_scope.traverse()
420        yield self
421
422    def ref_count(self):
423        """
424        Count the number of times each scope in this tree is referenced.
425
426        Returns:
427            dict[int, int]: Mapping of Scope instance ID to reference count
428        """
429        scope_ref_count = defaultdict(lambda: 0)
430
431        for scope in self.traverse():
432            for _, source in scope.selected_sources.values():
433                scope_ref_count[id(source)] += 1
434
435        return scope_ref_count
436
437
438def traverse_scope(expression):
439    """
440    Traverse an expression by it's "scopes".
441
442    "Scope" represents the current context of a Select statement.
443
444    This is helpful for optimizing queries, where we need more information than
445    the expression tree itself. For example, we might care about the source
446    names within a subquery. Returns a list because a generator could result in
447    incomplete properties which is confusing.
448
449    Examples:
450        >>> import sqlglot
451        >>> expression = sqlglot.parse_one("SELECT a FROM (SELECT a FROM x) AS y")
452        >>> scopes = traverse_scope(expression)
453        >>> scopes[0].expression.sql(), list(scopes[0].sources)
454        ('SELECT a FROM x', ['x'])
455        >>> scopes[1].expression.sql(), list(scopes[1].sources)
456        ('SELECT a FROM (SELECT a FROM x) AS y', ['y'])
457
458    Args:
459        expression (exp.Expression): expression to traverse
460    Returns:
461        list[Scope]: scope instances
462    """
463    return list(_traverse_scope(Scope(expression)))
464
465
466def build_scope(expression):
467    """
468    Build a scope tree.
469
470    Args:
471        expression (exp.Expression): expression to build the scope tree for
472    Returns:
473        Scope: root scope
474    """
475    return traverse_scope(expression)[-1]
476
477
478def _traverse_scope(scope):
479    if isinstance(scope.expression, exp.Select):
480        yield from _traverse_select(scope)
481    elif isinstance(scope.expression, exp.Union):
482        yield from _traverse_union(scope)
483    elif isinstance(scope.expression, exp.UDTF):
484        _set_udtf_scope(scope)
485    elif isinstance(scope.expression, exp.Subquery):
486        yield from _traverse_subqueries(scope)
487    else:
488        raise OptimizeError(f"Unexpected expression type: {type(scope.expression)}")
489    yield scope
490
491
492def _traverse_select(scope):
493    yield from _traverse_derived_tables(scope.ctes, scope, ScopeType.CTE)
494    yield from _traverse_derived_tables(scope.derived_tables, scope, ScopeType.DERIVED_TABLE)
495    yield from _traverse_subqueries(scope)
496    _add_table_sources(scope)
497
498
499def _traverse_union(scope):
500    yield from _traverse_derived_tables(scope.ctes, scope, scope_type=ScopeType.CTE)
501
502    # The last scope to be yield should be the top most scope
503    left = None
504    for left in _traverse_scope(scope.branch(scope.expression.left, scope_type=ScopeType.UNION)):
505        yield left
506
507    right = None
508    for right in _traverse_scope(scope.branch(scope.expression.right, scope_type=ScopeType.UNION)):
509        yield right
510
511    scope.union_scopes = [left, right]
512
513
514def _set_udtf_scope(scope):
515    parent = scope.expression.parent
516    from_ = parent.args.get("from")
517
518    if not from_:
519        return
520
521    for table in from_.expressions:
522        if isinstance(table, exp.Table):
523            scope.tables.append(table)
524        elif isinstance(table, exp.Subquery):
525            scope.subqueries.append(table)
526    _add_table_sources(scope)
527    _traverse_subqueries(scope)
528
529
530def _traverse_derived_tables(derived_tables, scope, scope_type):
531    sources = {}
532    is_cte = scope_type == ScopeType.CTE
533
534    for derived_table in derived_tables:
535        recursive_scope = None
536
537        # if the scope is a recursive cte, it must be in the form of
538        # base_case UNION recursive. thus the recursive scope is the first
539        # section of the union.
540        if is_cte and scope.expression.args["with"].recursive:
541            union = derived_table.this
542
543            if isinstance(union, exp.Union):
544                recursive_scope = scope.branch(union.this, scope_type=ScopeType.CTE)
545
546        for child_scope in _traverse_scope(
547            scope.branch(
548                derived_table if isinstance(derived_table, exp.UDTF) else derived_table.this,
549                chain_sources=sources if scope_type == ScopeType.CTE else None,
550                outer_column_list=derived_table.alias_column_names,
551                scope_type=ScopeType.UDTF if isinstance(derived_table, exp.UDTF) else scope_type,
552            )
553        ):
554            yield child_scope
555
556            # Tables without aliases will be set as ""
557            # This shouldn't be a problem once qualify_columns runs, as it adds aliases on everything.
558            # Until then, this means that only a single, unaliased derived table is allowed (rather,
559            # the latest one wins.
560            alias = derived_table.alias
561            sources[alias] = child_scope
562
563            if recursive_scope:
564                child_scope.add_source(alias, recursive_scope)
565
566        # append the final child_scope yielded
567        if is_cte:
568            scope.cte_scopes.append(child_scope)
569        else:
570            scope.derived_table_scopes.append(child_scope)
571
572    scope.sources.update(sources)
573
574
575def _add_table_sources(scope):
576    sources = {}
577    for table in scope.tables:
578        table_name = table.name
579
580        if table.alias:
581            source_name = table.alias
582        else:
583            source_name = table_name
584
585        if table_name in scope.sources:
586            # This is a reference to a parent source (e.g. a CTE), not an actual table.
587            scope.sources[source_name] = scope.sources[table_name]
588        else:
589            sources[source_name] = table
590
591    scope.sources.update(sources)
592
593
594def _traverse_subqueries(scope):
595    for subquery in scope.subqueries:
596        top = None
597        for child_scope in _traverse_scope(scope.branch(subquery, scope_type=ScopeType.SUBQUERY)):
598            yield child_scope
599            top = child_scope
600        scope.subquery_scopes.append(top)
601
602
603def walk_in_scope(expression, bfs=True):
604    """
605    Returns a generator object which visits all nodes in the syntrax tree, stopping at
606    nodes that start child scopes.
607
608    Args:
609        expression (exp.Expression):
610        bfs (bool): if set to True the BFS traversal order will be applied,
611            otherwise the DFS traversal will be used instead.
612
613    Yields:
614        tuple[exp.Expression, Optional[exp.Expression], str]: node, parent, arg key
615    """
616    # We'll use this variable to pass state into the dfs generator.
617    # Whenever we set it to True, we exclude a subtree from traversal.
618    prune = False
619
620    for node, parent, key in expression.walk(bfs=bfs, prune=lambda *_: prune):
621        prune = False
622
623        yield node, parent, key
624
625        if node is expression:
626            continue
627        elif isinstance(node, exp.CTE):
628            prune = True
629        elif isinstance(node, exp.Subquery) and isinstance(parent, (exp.From, exp.Join)):
630            prune = True
631        elif isinstance(node, exp.Subqueryable):
632            prune = True
class ScopeType(enum.Enum):
10class ScopeType(Enum):
11    ROOT = auto()
12    SUBQUERY = auto()
13    DERIVED_TABLE = auto()
14    CTE = auto()
15    UNION = auto()
16    UDTF = auto()

An enumeration.

ROOT = <ScopeType.ROOT: 1>
SUBQUERY = <ScopeType.SUBQUERY: 2>
DERIVED_TABLE = <ScopeType.DERIVED_TABLE: 3>
CTE = <ScopeType.CTE: 4>
UNION = <ScopeType.UNION: 5>
UDTF = <ScopeType.UDTF: 6>
Inherited Members
enum.Enum
name
value
class Scope:
 19class Scope:
 20    """
 21    Selection scope.
 22
 23    Attributes:
 24        expression (exp.Select|exp.Union): Root expression of this scope
 25        sources (dict[str, exp.Table|Scope]): Mapping of source name to either
 26            a Table expression or another Scope instance. For example:
 27                SELECT * FROM x                     {"x": Table(this="x")}
 28                SELECT * FROM x AS y                {"y": Table(this="x")}
 29                SELECT * FROM (SELECT ...) AS y     {"y": Scope(...)}
 30        outer_column_list (list[str]): If this is a derived table or CTE, and the outer query
 31            defines a column list of it's alias of this scope, this is that list of columns.
 32            For example:
 33                SELECT * FROM (SELECT ...) AS y(col1, col2)
 34            The inner query would have `["col1", "col2"]` for its `outer_column_list`
 35        parent (Scope): Parent scope
 36        scope_type (ScopeType): Type of this scope, relative to it's parent
 37        subquery_scopes (list[Scope]): List of all child scopes for subqueries
 38        cte_scopes = (list[Scope]) List of all child scopes for CTEs
 39        derived_table_scopes = (list[Scope]) List of all child scopes for derived_tables
 40        union_scopes (list[Scope, Scope]): If this Scope is for a Union expression, this will be
 41            a list of the left and right child scopes.
 42    """
 43
 44    def __init__(
 45        self,
 46        expression,
 47        sources=None,
 48        outer_column_list=None,
 49        parent=None,
 50        scope_type=ScopeType.ROOT,
 51    ):
 52        self.expression = expression
 53        self.sources = sources or {}
 54        self.outer_column_list = outer_column_list or []
 55        self.parent = parent
 56        self.scope_type = scope_type
 57        self.subquery_scopes = []
 58        self.derived_table_scopes = []
 59        self.cte_scopes = []
 60        self.union_scopes = []
 61        self.clear_cache()
 62
 63    def clear_cache(self):
 64        self._collected = False
 65        self._raw_columns = None
 66        self._derived_tables = None
 67        self._tables = None
 68        self._ctes = None
 69        self._subqueries = None
 70        self._selected_sources = None
 71        self._columns = None
 72        self._external_columns = None
 73        self._join_hints = None
 74
 75    def branch(self, expression, scope_type, chain_sources=None, **kwargs):
 76        """Branch from the current scope to a new, inner scope"""
 77        return Scope(
 78            expression=expression.unnest(),
 79            sources={**self.cte_sources, **(chain_sources or {})},
 80            parent=self,
 81            scope_type=scope_type,
 82            **kwargs,
 83        )
 84
 85    def _collect(self):
 86        self._tables = []
 87        self._ctes = []
 88        self._subqueries = []
 89        self._derived_tables = []
 90        self._raw_columns = []
 91        self._join_hints = []
 92
 93        for node, parent, _ in self.walk(bfs=False):
 94            if node is self.expression:
 95                continue
 96            elif isinstance(node, exp.Column) and not isinstance(node.this, exp.Star):
 97                self._raw_columns.append(node)
 98            elif isinstance(node, exp.Table) and not isinstance(node.parent, exp.JoinHint):
 99                self._tables.append(node)
100            elif isinstance(node, exp.JoinHint):
101                self._join_hints.append(node)
102            elif isinstance(node, exp.UDTF):
103                self._derived_tables.append(node)
104            elif isinstance(node, exp.CTE):
105                self._ctes.append(node)
106            elif isinstance(node, exp.Subquery) and isinstance(parent, (exp.From, exp.Join)):
107                self._derived_tables.append(node)
108            elif isinstance(node, exp.Subqueryable):
109                self._subqueries.append(node)
110
111        self._collected = True
112
113    def _ensure_collected(self):
114        if not self._collected:
115            self._collect()
116
117    def walk(self, bfs=True):
118        return walk_in_scope(self.expression, bfs=bfs)
119
120    def find(self, *expression_types, bfs=True):
121        """
122        Returns the first node in this scope which matches at least one of the specified types.
123
124        This does NOT traverse into subscopes.
125
126        Args:
127            expression_types (type): the expression type(s) to match.
128            bfs (bool): True to use breadth-first search, False to use depth-first.
129
130        Returns:
131            exp.Expression: the node which matches the criteria or None if no node matching
132            the criteria was found.
133        """
134        return next(self.find_all(*expression_types, bfs=bfs), None)
135
136    def find_all(self, *expression_types, bfs=True):
137        """
138        Returns a generator object which visits all nodes in this scope and only yields those that
139        match at least one of the specified expression types.
140
141        This does NOT traverse into subscopes.
142
143        Args:
144            expression_types (type): the expression type(s) to match.
145            bfs (bool): True to use breadth-first search, False to use depth-first.
146
147        Yields:
148            exp.Expression: nodes
149        """
150        for expression, _, _ in self.walk(bfs=bfs):
151            if isinstance(expression, expression_types):
152                yield expression
153
154    def replace(self, old, new):
155        """
156        Replace `old` with `new`.
157
158        This can be used instead of `exp.Expression.replace` to ensure the `Scope` is kept up-to-date.
159
160        Args:
161            old (exp.Expression): old node
162            new (exp.Expression): new node
163        """
164        old.replace(new)
165        self.clear_cache()
166
167    @property
168    def tables(self):
169        """
170        List of tables in this scope.
171
172        Returns:
173            list[exp.Table]: tables
174        """
175        self._ensure_collected()
176        return self._tables
177
178    @property
179    def ctes(self):
180        """
181        List of CTEs in this scope.
182
183        Returns:
184            list[exp.CTE]: ctes
185        """
186        self._ensure_collected()
187        return self._ctes
188
189    @property
190    def derived_tables(self):
191        """
192        List of derived tables in this scope.
193
194        For example:
195            SELECT * FROM (SELECT ...) <- that's a derived table
196
197        Returns:
198            list[exp.Subquery]: derived tables
199        """
200        self._ensure_collected()
201        return self._derived_tables
202
203    @property
204    def subqueries(self):
205        """
206        List of subqueries in this scope.
207
208        For example:
209            SELECT * FROM x WHERE a IN (SELECT ...) <- that's a subquery
210
211        Returns:
212            list[exp.Subqueryable]: subqueries
213        """
214        self._ensure_collected()
215        return self._subqueries
216
217    @property
218    def columns(self):
219        """
220        List of columns in this scope.
221
222        Returns:
223            list[exp.Column]: Column instances in this scope, plus any
224                Columns that reference this scope from correlated subqueries.
225        """
226        if self._columns is None:
227            self._ensure_collected()
228            columns = self._raw_columns
229
230            external_columns = [
231                column for scope in self.subquery_scopes for column in scope.external_columns
232            ]
233
234            named_selects = set(self.expression.named_selects)
235
236            self._columns = []
237            for column in columns + external_columns:
238                ancestor = column.find_ancestor(exp.Qualify, exp.Order, exp.Having, exp.Hint)
239                if (
240                    not ancestor
241                    # Window functions can have an ORDER BY clause
242                    or not isinstance(ancestor.parent, exp.Select)
243                    or column.table
244                    or (column.name not in named_selects and not isinstance(ancestor, exp.Hint))
245                ):
246                    self._columns.append(column)
247
248        return self._columns
249
250    @property
251    def selected_sources(self):
252        """
253        Mapping of nodes and sources that are actually selected from in this scope.
254
255        That is, all tables in a schema are selectable at any point. But a
256        table only becomes a selected source if it's included in a FROM or JOIN clause.
257
258        Returns:
259            dict[str, (exp.Table|exp.Select, exp.Table|Scope)]: selected sources and nodes
260        """
261        if self._selected_sources is None:
262            referenced_names = []
263
264            for table in self.tables:
265                referenced_names.append((table.alias_or_name, table))
266            for derived_table in self.derived_tables:
267                referenced_names.append((derived_table.alias, derived_table.unnest()))
268
269            result = {}
270
271            for name, node in referenced_names:
272                if name in self.sources:
273                    result[name] = (node, self.sources[name])
274
275            self._selected_sources = result
276        return self._selected_sources
277
278    @property
279    def cte_sources(self):
280        """
281        Sources that are CTEs.
282
283        Returns:
284            dict[str, Scope]: Mapping of source alias to Scope
285        """
286        return {
287            alias: scope
288            for alias, scope in self.sources.items()
289            if isinstance(scope, Scope) and scope.is_cte
290        }
291
292    @property
293    def selects(self):
294        """
295        Select expressions of this scope.
296
297        For example, for the following expression:
298            SELECT 1 as a, 2 as b FROM x
299
300        The outputs are the "1 as a" and "2 as b" expressions.
301
302        Returns:
303            list[exp.Expression]: expressions
304        """
305        if isinstance(self.expression, exp.Union):
306            return self.expression.unnest().selects
307        return self.expression.selects
308
309    @property
310    def external_columns(self):
311        """
312        Columns that appear to reference sources in outer scopes.
313
314        Returns:
315            list[exp.Column]: Column instances that don't reference
316                sources in the current scope.
317        """
318        if self._external_columns is None:
319            self._external_columns = [
320                c for c in self.columns if c.table not in self.selected_sources
321            ]
322        return self._external_columns
323
324    @property
325    def unqualified_columns(self):
326        """
327        Unqualified columns in the current scope.
328
329        Returns:
330             list[exp.Column]: Unqualified columns
331        """
332        return [c for c in self.columns if not c.table]
333
334    @property
335    def join_hints(self):
336        """
337        Hints that exist in the scope that reference tables
338
339        Returns:
340            list[exp.JoinHint]: Join hints that are referenced within the scope
341        """
342        if self._join_hints is None:
343            return []
344        return self._join_hints
345
346    def source_columns(self, source_name):
347        """
348        Get all columns in the current scope for a particular source.
349
350        Args:
351            source_name (str): Name of the source
352        Returns:
353            list[exp.Column]: Column instances that reference `source_name`
354        """
355        return [column for column in self.columns if column.table == source_name]
356
357    @property
358    def is_subquery(self):
359        """Determine if this scope is a subquery"""
360        return self.scope_type == ScopeType.SUBQUERY
361
362    @property
363    def is_derived_table(self):
364        """Determine if this scope is a derived table"""
365        return self.scope_type == ScopeType.DERIVED_TABLE
366
367    @property
368    def is_union(self):
369        """Determine if this scope is a union"""
370        return self.scope_type == ScopeType.UNION
371
372    @property
373    def is_cte(self):
374        """Determine if this scope is a common table expression"""
375        return self.scope_type == ScopeType.CTE
376
377    @property
378    def is_root(self):
379        """Determine if this is the root scope"""
380        return self.scope_type == ScopeType.ROOT
381
382    @property
383    def is_udtf(self):
384        """Determine if this scope is a UDTF (User Defined Table Function)"""
385        return self.scope_type == ScopeType.UDTF
386
387    @property
388    def is_correlated_subquery(self):
389        """Determine if this scope is a correlated subquery"""
390        return bool(self.is_subquery and self.external_columns)
391
392    def rename_source(self, old_name, new_name):
393        """Rename a source in this scope"""
394        columns = self.sources.pop(old_name or "", [])
395        self.sources[new_name] = columns
396
397    def add_source(self, name, source):
398        """Add a source to this scope"""
399        self.sources[name] = source
400        self.clear_cache()
401
402    def remove_source(self, name):
403        """Remove a source from this scope"""
404        self.sources.pop(name, None)
405        self.clear_cache()
406
407    def __repr__(self):
408        return f"Scope<{self.expression.sql()}>"
409
410    def traverse(self):
411        """
412        Traverse the scope tree from this node.
413
414        Yields:
415            Scope: scope instances in depth-first-search post-order
416        """
417        for child_scope in itertools.chain(
418            self.cte_scopes, self.union_scopes, self.derived_table_scopes, self.subquery_scopes
419        ):
420            yield from child_scope.traverse()
421        yield self
422
423    def ref_count(self):
424        """
425        Count the number of times each scope in this tree is referenced.
426
427        Returns:
428            dict[int, int]: Mapping of Scope instance ID to reference count
429        """
430        scope_ref_count = defaultdict(lambda: 0)
431
432        for scope in self.traverse():
433            for _, source in scope.selected_sources.values():
434                scope_ref_count[id(source)] += 1
435
436        return scope_ref_count

Selection scope.

Attributes:
  • expression (exp.Select|exp.Union): Root expression of this scope
  • sources (dict[str, exp.Table|Scope]): Mapping of source name to either a Table expression or another Scope instance. For example: SELECT * FROM x {"x": Table(this="x")} SELECT * FROM x AS y {"y": Table(this="x")} SELECT * FROM (SELECT ...) AS y {"y": Scope(...)}
  • outer_column_list (list[str]): If this is a derived table or CTE, and the outer query defines a column list of it's alias of this scope, this is that list of columns. For example: SELECT * FROM (SELECT ...) AS y(col1, col2) The inner query would have ["col1", "col2"] for its outer_column_list
  • parent (Scope): Parent scope
  • scope_type (ScopeType): Type of this scope, relative to it's parent
  • subquery_scopes (list[Scope]): List of all child scopes for subqueries
  • cte_scopes = (list[Scope]) List of all child scopes for CTEs
  • derived_table_scopes = (list[Scope]) List of all child scopes for derived_tables
  • union_scopes (list[Scope, Scope]): If this Scope is for a Union expression, this will be a list of the left and right child scopes.
Scope( expression, sources=None, outer_column_list=None, parent=None, scope_type=<ScopeType.ROOT: 1>)
44    def __init__(
45        self,
46        expression,
47        sources=None,
48        outer_column_list=None,
49        parent=None,
50        scope_type=ScopeType.ROOT,
51    ):
52        self.expression = expression
53        self.sources = sources or {}
54        self.outer_column_list = outer_column_list or []
55        self.parent = parent
56        self.scope_type = scope_type
57        self.subquery_scopes = []
58        self.derived_table_scopes = []
59        self.cte_scopes = []
60        self.union_scopes = []
61        self.clear_cache()
def clear_cache(self):
63    def clear_cache(self):
64        self._collected = False
65        self._raw_columns = None
66        self._derived_tables = None
67        self._tables = None
68        self._ctes = None
69        self._subqueries = None
70        self._selected_sources = None
71        self._columns = None
72        self._external_columns = None
73        self._join_hints = None
def branch(self, expression, scope_type, chain_sources=None, **kwargs):
75    def branch(self, expression, scope_type, chain_sources=None, **kwargs):
76        """Branch from the current scope to a new, inner scope"""
77        return Scope(
78            expression=expression.unnest(),
79            sources={**self.cte_sources, **(chain_sources or {})},
80            parent=self,
81            scope_type=scope_type,
82            **kwargs,
83        )

Branch from the current scope to a new, inner scope

def walk(self, bfs=True):
117    def walk(self, bfs=True):
118        return walk_in_scope(self.expression, bfs=bfs)
def find(self, *expression_types, bfs=True):
120    def find(self, *expression_types, bfs=True):
121        """
122        Returns the first node in this scope which matches at least one of the specified types.
123
124        This does NOT traverse into subscopes.
125
126        Args:
127            expression_types (type): the expression type(s) to match.
128            bfs (bool): True to use breadth-first search, False to use depth-first.
129
130        Returns:
131            exp.Expression: the node which matches the criteria or None if no node matching
132            the criteria was found.
133        """
134        return next(self.find_all(*expression_types, bfs=bfs), None)

Returns the first node in this scope which matches at least one of the specified types.

This does NOT traverse into subscopes.

Arguments:
  • expression_types (type): the expression type(s) to match.
  • bfs (bool): True to use breadth-first search, False to use depth-first.
Returns:

exp.Expression: the node which matches the criteria or None if no node matching the criteria was found.

def find_all(self, *expression_types, bfs=True):
136    def find_all(self, *expression_types, bfs=True):
137        """
138        Returns a generator object which visits all nodes in this scope and only yields those that
139        match at least one of the specified expression types.
140
141        This does NOT traverse into subscopes.
142
143        Args:
144            expression_types (type): the expression type(s) to match.
145            bfs (bool): True to use breadth-first search, False to use depth-first.
146
147        Yields:
148            exp.Expression: nodes
149        """
150        for expression, _, _ in self.walk(bfs=bfs):
151            if isinstance(expression, expression_types):
152                yield expression

Returns a generator object which visits all nodes in this scope and only yields those that match at least one of the specified expression types.

This does NOT traverse into subscopes.

Arguments:
  • expression_types (type): the expression type(s) to match.
  • bfs (bool): True to use breadth-first search, False to use depth-first.
Yields:

exp.Expression: nodes

def replace(self, old, new):
154    def replace(self, old, new):
155        """
156        Replace `old` with `new`.
157
158        This can be used instead of `exp.Expression.replace` to ensure the `Scope` is kept up-to-date.
159
160        Args:
161            old (exp.Expression): old node
162            new (exp.Expression): new node
163        """
164        old.replace(new)
165        self.clear_cache()

Replace old with new.

This can be used instead of exp.Expression.replace to ensure the Scope is kept up-to-date.

Arguments:
  • old (exp.Expression): old node
  • new (exp.Expression): new node
tables

List of tables in this scope.

Returns:

list[exp.Table]: tables

ctes

List of CTEs in this scope.

Returns:

list[exp.CTE]: ctes

derived_tables

List of derived tables in this scope.

For example:

SELECT * FROM (SELECT ...) <- that's a derived table

Returns:

list[exp.Subquery]: derived tables

subqueries

List of subqueries in this scope.

For example:

SELECT * FROM x WHERE a IN (SELECT ...) <- that's a subquery

Returns:

list[exp.Subqueryable]: subqueries

columns

List of columns in this scope.

Returns:

list[exp.Column]: Column instances in this scope, plus any Columns that reference this scope from correlated subqueries.

selected_sources

Mapping of nodes and sources that are actually selected from in this scope.

That is, all tables in a schema are selectable at any point. But a table only becomes a selected source if it's included in a FROM or JOIN clause.

Returns:

dict[str, (exp.Table|exp.Select, exp.Table|Scope)]: selected sources and nodes

cte_sources

Sources that are CTEs.

Returns:

dict[str, Scope]: Mapping of source alias to Scope

selects

Select expressions of this scope.

For example, for the following expression: SELECT 1 as a, 2 as b FROM x

The outputs are the "1 as a" and "2 as b" expressions.

Returns:

list[exp.Expression]: expressions

external_columns

Columns that appear to reference sources in outer scopes.

Returns:

list[exp.Column]: Column instances that don't reference sources in the current scope.

unqualified_columns

Unqualified columns in the current scope.

Returns:

list[exp.Column]: Unqualified columns

join_hints

Hints that exist in the scope that reference tables

Returns:

list[exp.JoinHint]: Join hints that are referenced within the scope

def source_columns(self, source_name):
346    def source_columns(self, source_name):
347        """
348        Get all columns in the current scope for a particular source.
349
350        Args:
351            source_name (str): Name of the source
352        Returns:
353            list[exp.Column]: Column instances that reference `source_name`
354        """
355        return [column for column in self.columns if column.table == source_name]

Get all columns in the current scope for a particular source.

Arguments:
  • source_name (str): Name of the source
Returns:

list[exp.Column]: Column instances that reference source_name

is_subquery

Determine if this scope is a subquery

is_derived_table

Determine if this scope is a derived table

is_union

Determine if this scope is a union

is_cte

Determine if this scope is a common table expression

is_root

Determine if this is the root scope

is_udtf

Determine if this scope is a UDTF (User Defined Table Function)

is_correlated_subquery

Determine if this scope is a correlated subquery

def rename_source(self, old_name, new_name):
392    def rename_source(self, old_name, new_name):
393        """Rename a source in this scope"""
394        columns = self.sources.pop(old_name or "", [])
395        self.sources[new_name] = columns

Rename a source in this scope

def add_source(self, name, source):
397    def add_source(self, name, source):
398        """Add a source to this scope"""
399        self.sources[name] = source
400        self.clear_cache()

Add a source to this scope

def remove_source(self, name):
402    def remove_source(self, name):
403        """Remove a source from this scope"""
404        self.sources.pop(name, None)
405        self.clear_cache()

Remove a source from this scope

def traverse(self):
410    def traverse(self):
411        """
412        Traverse the scope tree from this node.
413
414        Yields:
415            Scope: scope instances in depth-first-search post-order
416        """
417        for child_scope in itertools.chain(
418            self.cte_scopes, self.union_scopes, self.derived_table_scopes, self.subquery_scopes
419        ):
420            yield from child_scope.traverse()
421        yield self

Traverse the scope tree from this node.

Yields:

Scope: scope instances in depth-first-search post-order

def ref_count(self):
423    def ref_count(self):
424        """
425        Count the number of times each scope in this tree is referenced.
426
427        Returns:
428            dict[int, int]: Mapping of Scope instance ID to reference count
429        """
430        scope_ref_count = defaultdict(lambda: 0)
431
432        for scope in self.traverse():
433            for _, source in scope.selected_sources.values():
434                scope_ref_count[id(source)] += 1
435
436        return scope_ref_count

Count the number of times each scope in this tree is referenced.

Returns:

dict[int, int]: Mapping of Scope instance ID to reference count

def traverse_scope(expression):
439def traverse_scope(expression):
440    """
441    Traverse an expression by it's "scopes".
442
443    "Scope" represents the current context of a Select statement.
444
445    This is helpful for optimizing queries, where we need more information than
446    the expression tree itself. For example, we might care about the source
447    names within a subquery. Returns a list because a generator could result in
448    incomplete properties which is confusing.
449
450    Examples:
451        >>> import sqlglot
452        >>> expression = sqlglot.parse_one("SELECT a FROM (SELECT a FROM x) AS y")
453        >>> scopes = traverse_scope(expression)
454        >>> scopes[0].expression.sql(), list(scopes[0].sources)
455        ('SELECT a FROM x', ['x'])
456        >>> scopes[1].expression.sql(), list(scopes[1].sources)
457        ('SELECT a FROM (SELECT a FROM x) AS y', ['y'])
458
459    Args:
460        expression (exp.Expression): expression to traverse
461    Returns:
462        list[Scope]: scope instances
463    """
464    return list(_traverse_scope(Scope(expression)))

Traverse an expression by it's "scopes".

"Scope" represents the current context of a Select statement.

This is helpful for optimizing queries, where we need more information than the expression tree itself. For example, we might care about the source names within a subquery. Returns a list because a generator could result in incomplete properties which is confusing.

Examples:
>>> import sqlglot
>>> expression = sqlglot.parse_one("SELECT a FROM (SELECT a FROM x) AS y")
>>> scopes = traverse_scope(expression)
>>> scopes[0].expression.sql(), list(scopes[0].sources)
('SELECT a FROM x', ['x'])
>>> scopes[1].expression.sql(), list(scopes[1].sources)
('SELECT a FROM (SELECT a FROM x) AS y', ['y'])
Arguments:
  • expression (exp.Expression): expression to traverse
Returns:

list[Scope]: scope instances

def build_scope(expression):
467def build_scope(expression):
468    """
469    Build a scope tree.
470
471    Args:
472        expression (exp.Expression): expression to build the scope tree for
473    Returns:
474        Scope: root scope
475    """
476    return traverse_scope(expression)[-1]

Build a scope tree.

Arguments:
  • expression (exp.Expression): expression to build the scope tree for
Returns:

Scope: root scope

def walk_in_scope(expression, bfs=True):
604def walk_in_scope(expression, bfs=True):
605    """
606    Returns a generator object which visits all nodes in the syntrax tree, stopping at
607    nodes that start child scopes.
608
609    Args:
610        expression (exp.Expression):
611        bfs (bool): if set to True the BFS traversal order will be applied,
612            otherwise the DFS traversal will be used instead.
613
614    Yields:
615        tuple[exp.Expression, Optional[exp.Expression], str]: node, parent, arg key
616    """
617    # We'll use this variable to pass state into the dfs generator.
618    # Whenever we set it to True, we exclude a subtree from traversal.
619    prune = False
620
621    for node, parent, key in expression.walk(bfs=bfs, prune=lambda *_: prune):
622        prune = False
623
624        yield node, parent, key
625
626        if node is expression:
627            continue
628        elif isinstance(node, exp.CTE):
629            prune = True
630        elif isinstance(node, exp.Subquery) and isinstance(parent, (exp.From, exp.Join)):
631            prune = True
632        elif isinstance(node, exp.Subqueryable):
633            prune = True

Returns a generator object which visits all nodes in the syntrax tree, stopping at nodes that start child scopes.

Arguments:
  • expression (exp.Expression):
  • bfs (bool): if set to True the BFS traversal order will be applied, otherwise the DFS traversal will be used instead.
Yields:

tuple[exp.Expression, Optional[exp.Expression], str]: node, parent, arg key