"""Filter directory-walk results using gitignore-style rules."""

import re
import os
import itertools
from collections import defaultdict
from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Pattern, Tuple, TypeVar,
                    Union, cast)


T = TypeVar('T')

# A compiled rule as returned by fnmatch_translate: (match against name only?, regexp).
_Rule = Tuple[bool, Pattern[bytes]]

end_space = re.compile(r"([^\\]\s)*$")


def fnmatch_translate(pat: bytes) -> Tuple[bool, Pattern[bytes]]:
    """Translate a gitignore-style glob into a compiled bytes regexp.

    Supported syntax: ``*`` and ``?`` (never matching ``/``), ``**`` as a
    whole path component, ``[...]`` sequences (negated with ``!`` or ``^``)
    and backslash escapes.  A trailing ``/`` makes the regexp accept either
    the end of the input or a following ``/``.

    :param pat: The pattern to translate.
    :returns: ``(name_pattern, regexp)``.  ``name_pattern`` is True when the
        pattern contains no ``/`` (ignoring one leading and one trailing
        ``/``); the regexp is then anchored with ``^`` and is meant to be
        matched against a single path component.  Otherwise the regexp is
        meant to be matched against a ``/``-separated path and, unless the
        pattern started with ``/``, may match below any leading directories.
    :raises ValueError: If the pattern is malformed (trailing backslash,
        unterminated or nested ``[``, negated empty sequence, ``**`` that is
        not a whole component) or the generated regexp does not compile.
    """
    parts: List[bytes] = []
    seq: Optional[int] = None
    i = 0
    any_char = b"[^/]"
    if pat[0:1] == b"/":
        parts.append(b"^")
        pat = pat[1:]
    else:
        # By default match the entire path up to a /
        # but if / doesn't appear in the pattern we will mark it as
        # a name pattern and just produce a pattern that matches against
        # the filename
        parts.append(b"^(?:.*/)?")

    name_pattern = True
    if pat[-1:] == b"/":
        # If the last character is / match this directory or any subdirectory
        pat = pat[:-1]
        suffix = b"(?:/|$)"
    else:
        suffix = b"$"

    while i < len(pat):
        c = pat[i:i+1]
        if c == b"\\":
            if i >= len(pat) - 1:
                raise ValueError("trailing backslash in pattern %r" % (pat,))
            # The escaped character is always taken literally
            i += 1
            parts.append(re.escape(pat[i:i+1]))
        elif seq is not None:
            # TODO: this doesn't really handle invalid sequences in the right way
            if c == b"]":
                seq = None
                if parts[-1] == b"[":
                    # Empty sequence "[]": drop it from the output entirely
                    parts.pop()
                elif parts[-1] == b"^" and parts[-2] == b"[":
                    raise ValueError("negated empty sequence in pattern %r" % (pat,))
                else:
                    parts.append(c)
            elif c == b"-":
                parts.append(c)
            elif c == b"[":
                raise ValueError("nested '[' in pattern %r" % (pat,))
            else:
                parts.append(re.escape(c))
        elif c == b"[":
            parts.append(b"[")
            if i < len(pat) - 1 and pat[i+1:i+2] in (b"!", b"^"):
                parts.append(b"^")
                i += 1
            seq = i
        elif c == b"*":
            if i < len(pat) - 1 and pat[i+1:i+2] == b"*":
                # "**" is only valid as a complete path component
                if i > 0 and pat[i-1:i] != b"/":
                    raise ValueError("'**' must follow '/' in pattern %r" % (pat,))
                parts.append(b".*")
                i += 1
                if i < len(pat) - 1 and pat[i+1:i+2] != b"/":
                    raise ValueError("'**' must be followed by '/' in pattern %r" % (pat,))
            else:
                parts.append(any_char + b"*")
        elif c == b"?":
            parts.append(any_char)
        elif c == b"/":
            # A separator outside a sequence means the pattern describes a path
            name_pattern = False
            parts.append(c)
        else:
            parts.append(re.escape(c))
        i += 1

    if name_pattern:
        # Name patterns are matched against a single component, so the
        # "any leading directories" prefix is replaced by a plain anchor
        parts[0] = b"^"

    if seq is not None:
        raise ValueError("unterminated '[' in pattern %r" % (pat,))
    parts.append(suffix)
    try:
        return name_pattern, re.compile(b"".join(parts))
    except re.error as err:
        raise ValueError("invalid pattern %r: %s" % (pat, err)) from err


# Regexp matching rules that have to be converted to patterns: anything with a
# glob metacharacter, or a backslash that escapes a following character.  A
# lone trailing backslash escapes nothing and is left to the literal path.
pattern_re = re.compile(br".*(?:[\*\[\?]|\\.)")


def parse_line(line: bytes) -> Optional[Tuple[bool, bool, bool, Union[Tuple[bytes, ...], Tuple[bool, Pattern[bytes]]]]]:
    """Parse one line of a gitignore-style file.

    :param line: The raw line; trailing whitespace (including the line
        ending) is stripped before parsing.
    :returns: None for blank lines and ``#`` comments, otherwise
        ``(invert, dir_only, literal, rule)`` where ``invert`` is True for
        ``!`` rules, ``dir_only`` is True when the rule ended with ``/``, and
        ``rule`` is either the line split once on its last ``/`` (when
        ``literal`` is True, giving ``(name,)`` or ``(dir, name)``) or the
        result of :func:`fnmatch_translate` (when ``literal`` is False).
    :raises ValueError: Propagated from :func:`fnmatch_translate` for
        malformed patterns.
    """
    line = line.rstrip()
    if not line or line[0:1] == b"#":
        return None

    invert = line[0:1] == b"!"
    if invert:
        line = line[1:]

    dir_only = line[-1:] == b"/"
    if dir_only:
        line = line[:-1]

    # Could make a special case for **/foo, but we don't have any patterns like that
    pattern: Union[Tuple[bytes, ...], Tuple[bool, Pattern[bytes]]]
    if not invert and not pattern_re.match(line):
        literal = True
        pattern = tuple(line.rsplit(b"/", 1))
    else:
        # Inverted rules are always compiled, even when they contain no
        # wildcards, because they are applied with a regexp match
        pattern = fnmatch_translate(line)
        literal = False

    return invert, dir_only, literal, pattern


class PathFilter:
    """Filter ``(dirpath, dirnames, filenames)`` tuples using ignore rules.

    Rules are read from ``<root>/.gitignore`` (when it exists) followed by
    ``extras``.  Literal rules are stored in dictionaries keyed by directory
    and name; wildcard rules are stored as lists of compiled regexps.  Each
    stored rule carries the list of ``!`` rules that were read after it.
    """

    def __init__(self,
                 root: bytes,
                 extras: Optional[List[bytes]] = None,
                 cache: Optional[MutableMapping[bytes, bool]] = None) -> None:
        """Load the rules.

        :param root: Directory containing the ``.gitignore`` to read; a falsy
            value means no file is read.
        :param extras: Additional rule lines, processed after the file.
        :param cache: Optional mapping from path to "is ignored", consulted
            and updated by :meth:`filter`.
        """
        # Initialise all state up front so that filter() can be called
        # directly even on a trivial filter.
        # {directory or None: {name: [later "!" rules]}}
        self.literals_file: Dict[Optional[bytes], Dict[bytes, List[_Rule]]] = defaultdict(dict)
        self.literals_dir: Dict[Optional[bytes], Dict[bytes, List[_Rule]]] = defaultdict(dict)
        # [(rule, [later "!" rules])]
        self.patterns_file: List[Tuple[_Rule, List[_Rule]]] = []
        self.patterns_dir: List[Tuple[_Rule, List[_Rule]]] = []
        self.cache: MutableMapping[bytes, bool] = {} if cache is None else cache

        ignore_path: Optional[bytes] = os.path.join(root, b".gitignore") if root else None

        # With neither a root nor extra rules there is nothing to filter
        self.trivial = not ignore_path and not extras
        if self.trivial:
            return

        if extras is None:
            extras = []
        if ignore_path is not None and not os.path.exists(ignore_path):
            ignore_path = None
        self._read_ignore(ignore_path, extras)

    def _read_ignore(self, ignore_path: Optional[bytes], extras: List[bytes]) -> None:
        """Read rules from ``ignore_path`` (if not None), then from ``extras``."""
        if ignore_path is not None:
            with open(ignore_path, "rb") as f:
                for line in f:
                    self._read_line(line)
        for line in extras:
            self._read_line(line)

    def _read_line(self, line: bytes) -> None:
        """Parse one rule line and record it in the rule tables."""
        parsed = parse_line(line)
        if not parsed:
            return
        invert, dir_only, literal, rule = parsed

        if invert:
            # For exclude rules, we attach the rules to all preceding patterns, so
            # that we can match patterns out of order and check if they were later
            # overridden by an exclude rule
            assert not literal
            exclude = cast(_Rule, rule)
            literal_tables = [self.literals_dir]
            pattern_lists = [self.patterns_dir]
            if not dir_only:
                literal_tables.append(self.literals_file)
                pattern_lists.append(self.patterns_file)
            attached: Iterable[Tuple[Any, List[_Rule]]] = itertools.chain(
                itertools.chain.from_iterable(
                    by_name.items()
                    for table in literal_tables
                    for by_name in table.values()),
                itertools.chain.from_iterable(pattern_lists))
            for _, excludes in attached:
                excludes.append(exclude)
        elif literal:
            parts = cast(Tuple[bytes, ...], rule)
            dir_name: Optional[bytes]
            if len(parts) == 1:
                # No "/" in the rule: it applies in every directory
                dir_name, name = None, parts[0]
            else:
                dir_name, name = parts
            # Re-adding a literal starts again with no exclusions attached
            self.literals_dir[dir_name][name] = []
            if not dir_only:
                self.literals_file[dir_name][name] = []
        else:
            compiled = cast(_Rule, rule)
            self.patterns_dir.append((compiled, []))
            if not dir_only:
                self.patterns_file.append((compiled, []))

    @staticmethod
    def _is_reincluded(excludes: List[_Rule], name: bytes, path: bytes) -> bool:
        """Return True if any of the attached ``!`` rules matches the item."""
        return any(rule.match(name if name_only else path)
                   for name_only, rule in excludes)

    def _is_ignored(self,
                    name: bytes,
                    path: bytes,
                    rule_dirs: Tuple[Optional[bytes], bytes],
                    literals: Dict[Optional[bytes], Dict[bytes, List[_Rule]]],
                    patterns: List[Tuple[_Rule, List[_Rule]]]) -> bool:
        """Decide whether one directory entry is ignored.

        :param name: The entry's own name.
        :param path: The entry's ``/``-separated path as built by
            :meth:`filter`; for directories it ends with ``/``.
        :param rule_dirs: The keys to try in ``literals``: None for rules that
            apply everywhere, then the current directory.
        :param literals: Literal rule table (file or directory variant).
        :param patterns: Wildcard rule list (file or directory variant).
        """
        # Literal rules first: a dictionary lookup per candidate directory
        for rule_dir in rule_dirs:
            by_name = literals.get(rule_dir)
            if by_name is None:
                continue
            excludes = by_name.get(name)
            if excludes is not None and not self._is_reincluded(excludes, name, path):
                return True

        for (component_only, pattern), excludes in patterns:
            if (pattern.match(name if component_only else path) and
                    not self._is_reincluded(excludes, name, path)):
                return True
        return False

    def filter(self,
               iterator: Iterable[Tuple[bytes, List[Tuple[bytes, T]], List[Tuple[bytes, T]]]]
               ) -> Iterable[Tuple[bytes, List[Tuple[bytes, T]], List[Tuple[bytes, T]]]]:
        """Yield the walk tuples from ``iterator`` with ignored entries removed.

        Each directory and file entry is a tuple whose first element is the
        name.  The ``dirnames`` list of every tuple is modified in place to
        contain only the kept directories.  Decisions are stored in
        ``self.cache`` keyed by path (directories with a trailing ``/``).

        :raises AssertionError: If a directory named ``.git`` survives the
            filtering.
        """
        path_sep = os.path.sep.encode()
        for dirpath, dirnames, filenames in iterator:
            orig_dirpath = dirpath
            if path_sep != b"/":
                # Rules always use "/" as the separator
                dirpath = dirpath.replace(path_sep, b"/")
            rule_dirs = (None, dirpath if dirpath != b"." else b"")

            keep_dirs: List[Tuple[bytes, T]] = []
            keep_files: List[Tuple[bytes, T]] = []

            for iter_items, literals, patterns, target, suffix in [
                    (dirnames, self.literals_dir, self.patterns_dir, keep_dirs, b"/"),
                    (filenames, self.literals_file, self.patterns_file, keep_files, b"")]:
                for item in iter_items:
                    name = item[0]
                    if dirpath:
                        path = b"%s/%s" % (dirpath, name) + suffix
                    else:
                        path = name + suffix

                    if path in self.cache:
                        ignored = self.cache[path]
                    else:
                        ignored = self._is_ignored(name, path, rule_dirs, literals, patterns)
                        self.cache[path] = ignored
                    if not ignored:
                        target.append(item)

            dirnames[:] = keep_dirs
            assert not any(b".git" == name for name, _ in dirnames)
            yield orig_dirpath, dirnames, keep_files

    def __call__(self,
                 iterator: Iterable[Tuple[bytes, List[Tuple[bytes, T]], List[Tuple[bytes, T]]]]
                 ) -> Iterable[Tuple[bytes, List[Tuple[bytes, T]], List[Tuple[bytes, T]]]]:
        """Return ``iterator`` unchanged for a trivial filter, else filter it."""
        if self.trivial:
            return iterator
        return self.filter(iterator)


def has_ignore(dirpath: bytes) -> bool:
    """Return True if ``dirpath`` contains a ``.gitignore`` entry."""
    return os.path.exists(os.path.join(dirpath, b".gitignore"))