diff options
Diffstat (limited to 'gitlint-core/gitlint/git.py')
-rw-r--r-- | gitlint-core/gitlint/git.py | 510 |
1 files changed, 510 insertions, 0 deletions
"""Git access layer for gitlint (gitlint-core/gitlint/git.py).

Contains a small wrapper around the ``git`` command line (`_git`), a few helper functions that read
repository settings, and the data classes gitlint uses to describe commits: `GitCommitMessage`,
`GitChangedFileStats`, `GitCommit` (plus its local and staged variants) and `GitContext`.
"""
import logging
import os
from pathlib import Path

import arrow

from gitlint import shell as sh
from gitlint.cache import PropertyCache, cache
from gitlint.exception import GitlintError

# import exceptions separately, this makes it a little easier to mock them out in the unit tests
from gitlint.shell import CommandNotFound, ErrorReturnCode

# For now, the git date format we use is fixed, but technically this format is determined by `git config log.date`
# We should fix this at some point :-)
GIT_TIMEFORMAT = "YYYY-MM-DD HH:mm:ss Z"

LOG = logging.getLogger(__name__)


class GitContextError(GitlintError):
    """Exception indicating there is an issue with the git context"""


class GitNotInstalledError(GitContextError):
    """Raised by `_git` when the `git` executable cannot be found."""

    def __init__(self):
        super().__init__(
            "'git' command not found. You need to install git to use gitlint on a local repository. "
            "See https://git-scm.com/book/en/v2/Getting-Started-Installing-Git on how to install git."
        )


class GitExitCodeError(GitContextError):
    """Raised by `_git` when a git command fails for a reason that is not handled more specifically.

    The failed command and its stderr output are kept on the `command` and `stderr` attributes.
    """

    def __init__(self, command, stderr):
        self.command = command
        self.stderr = stderr
        super().__init__(f"An error occurred while executing '{command}': {stderr}")


def _git(*command_parts, **kwargs):
    """Convenience function for running git commands.
    Automatically deals with exceptions and unicode.

    :param command_parts: Arguments passed to `git`, e.g. `_git("log", "-1")`.
    :param kwargs: Extra options forwarded to `sh.git` (e.g. `_cwd`, `_ok_code`). `_tty_out` defaults to False.
    :return: The command output as `str`, or the raw result object when the command finished with a non-zero exit
             code without raising (i.e. the caller allowed that exit code).
    :raises GitNotInstalledError: if the git executable is not found.
    :raises GitContextError: if `_cwd` is not a git repository, or the current branch has no commits.
    :raises GitExitCodeError: for any other failing git command.
    """
    git_kwargs = {"_tty_out": False}
    git_kwargs.update(kwargs)
    try:
        LOG.debug(command_parts)
        result = sh.git(*command_parts, **git_kwargs)
        # If we reach this point and the result has an exit_code that is larger than 0, this means that we didn't
        # get an exception (which is the default sh behavior for non-zero exit codes) and so the user is expecting
        # a non-zero exit code -> just return the entire result
        if hasattr(result, "exit_code") and result.exit_code > 0:
            return result
        return str(result)
    except CommandNotFound as e:
        raise GitNotInstalledError from e
    except ErrorReturnCode as e:  # Something went wrong while executing the git command
        # NOTE(review): stderr is matched against bytes literals below, so it is presumably `bytes` here - confirm
        # against gitlint.shell.
        error_msg = e.stderr.strip()
        error_msg_lower = error_msg.lower()
        if "_cwd" in git_kwargs and b"not a git repository" in error_msg_lower:
            raise GitContextError(f"{git_kwargs['_cwd']} is not a git repository.") from e

        if (
            b"does not have any commits yet" in error_msg_lower
            or b"ambiguous argument 'head': unknown revision" in error_msg_lower
        ):
            msg = "Current branch has no commits. Gitlint requires at least one commit to function."
            raise GitContextError(msg) from e

        raise GitExitCodeError(e.full_cmd, error_msg) from e


def git_version():
    """Determine the git version installed on this host by calling git --version"""
    return _git("--version").replace("\n", "")


def git_commentchar(repository_path=None):
    """Shortcut for retrieving comment char from git config"""
    commentchar = _git("config", "--get", "core.commentchar", _cwd=repository_path, _ok_code=[0, 1])
    # git will return an exit code of 1 if it can't find a config value, in this case we fall-back to # as commentchar
    if hasattr(commentchar, "exit_code") and commentchar.exit_code == 1:
        commentchar = "#"
    return commentchar.replace("\n", "")


def git_hooks_dir(repository_path):
    """Determine hooks directory for a given target dir"""
    hooks_dir = _git("rev-parse", "--git-path", "hooks", _cwd=repository_path)
    hooks_dir = hooks_dir.replace("\n", "")
    return os.path.realpath(os.path.join(repository_path, hooks_dir))


def _parse_git_changed_file_stats(changed_files_stats_raw):
    """Parse the output of git diff --numstat and return a dict of:
    dict[filename: GitChangedFileStats(filename, additions, deletions)]

    Each non-blank line is expected to look like `<additions> <deletions> <path>`. Everything after the second
    field is treated as the path, so paths that contain whitespace are kept intact.
    """
    changed_files_stats = {}
    for line in changed_files_stats_raw.split("\n"):
        # Skip blank lines (git terminates its output with a newline, which yields a trailing empty entry)
        if not line.strip():
            continue

        # Split off only the two count columns; the remainder of the line is the file path
        line_stats = line.split(None, 2)

        # If the file is binary, numstat will show "-"
        # See https://git-scm.com/docs/git-diff#Documentation/git-diff.txt---numstat
        additions = int(line_stats[0]) if line_stats[0] != "-" else None
        deletions = int(line_stats[1]) if line_stats[1] != "-" else None

        changed_file_stat = GitChangedFileStats(line_stats[2], additions, deletions)
        changed_files_stats[line_stats[2]] = changed_file_stat

    return changed_files_stats


class GitCommitMessage:
    """Class representing a git commit message.
    A commit message consists of the following:
    - context: The `GitContext` this commit message is part of
    - original: The actual commit message as returned by `git log`
    - full: original, but stripped of any comments
    - title: the first line of full
    - body: all lines following the title
    """

    def __init__(self, context, original=None, full=None, title=None, body=None):
        self.context = context
        self.original = original
        self.full = full
        self.title = title
        self.body = body

    @staticmethod
    def from_full_message(context, commit_msg_str):
        """Parses a full git commit message by parsing a given string into the different parts of a commit message"""
        all_lines = commit_msg_str.splitlines()
        # Everything from the "scissors" line onwards is ignored
        cutline = f"{context.commentchar} ------------------------ >8 ------------------------"
        try:
            cutline_index = all_lines.index(cutline)
        except ValueError:
            # No cutline present: slicing with None below keeps all lines
            cutline_index = None
        lines = [line for line in all_lines[:cutline_index] if not line.startswith(context.commentchar)]
        full = "\n".join(lines)
        title = lines[0] if lines else ""
        body = lines[1:] if len(lines) > 1 else []
        return GitCommitMessage(context=context, original=commit_msg_str, full=full, title=title, body=body)

    def __str__(self):
        return self.full

    def __eq__(self, other):
        return (
            isinstance(other, GitCommitMessage)
            and self.original == other.original
            and self.full == other.full
            and self.title == other.title
            and self.body == other.body
        )


class GitChangedFileStats:
    """Class representing the stats for a changed file in git"""

    def __init__(self, filepath, additions, deletions):
        self.filepath = Path(filepath)
        self.additions = additions
        self.deletions = deletions

    def __eq__(self, other):
        return (
            isinstance(other, GitChangedFileStats)
            and self.filepath == other.filepath
            and self.additions == other.additions
            and self.deletions == other.deletions
        )

    def __str__(self) -> str:
        return f"{self.filepath}: {self.additions} additions, {self.deletions} deletions"


class GitCommit:
    """Class representing a git commit.
    A commit consists of: context, message, author name, author email, date, list of parent commit shas,
    list of changed files, list of branch names.
    In the context of gitlint, only the git context and commit message are required.
    """

    def __init__(
        self,
        context,
        message,
        sha=None,
        date=None,
        author_name=None,
        author_email=None,
        parents=None,
        changed_files_stats=None,
        branches=None,
    ):
        self.context = context
        self.message = message
        self.sha = sha
        self.date = date
        self.author_name = author_name
        self.author_email = author_email
        self.parents = parents or []  # parent commit hashes
        self.changed_files_stats = changed_files_stats or {}
        self.branches = branches or []

    @property
    def is_merge_commit(self):
        return self.message.title.startswith("Merge")

    @property
    def is_fixup_commit(self):
        return self.message.title.startswith("fixup!")

    @property
    def is_squash_commit(self):
        return self.message.title.startswith("squash!")

    @property
    def is_fixup_amend_commit(self):
        return self.message.title.startswith("amend!")

    @property
    def is_revert_commit(self):
        return self.message.title.startswith("Revert")

    @property
    def changed_files(self):
        return list(self.changed_files_stats.keys())

    def __str__(self):
        date_str = arrow.get(self.date).format(GIT_TIMEFORMAT) if self.date else None

        # NOTE(review): the spacing inside the string literals of this method is reproduced as seen in the reviewed
        # text, which may have collapsed runs of spaces - verify against the upstream file.
        if len(self.changed_files_stats) > 0:
            changed_files_stats_str = "\n " + "\n ".join([str(stats) for stats in self.changed_files_stats.values()])
        else:
            changed_files_stats_str = " {}"

        return (
            f"--- Commit Message ----\n{self.message}\n"
            "--- Meta info ---------\n"
            f"Author: {self.author_name} <{self.author_email}>\n"
            f"Date: {date_str}\n"
            f"is-merge-commit: {self.is_merge_commit}\n"
            f"is-fixup-commit: {self.is_fixup_commit}\n"
            f"is-fixup-amend-commit: {self.is_fixup_amend_commit}\n"
            f"is-squash-commit: {self.is_squash_commit}\n"
            f"is-revert-commit: {self.is_revert_commit}\n"
            f"Parents: {self.parents}\n"
            f"Branches: {self.branches}\n"
            f"Changed Files: {self.changed_files}\n"
            f"Changed Files Stats:{changed_files_stats_str}\n"
            "-----------------------"
        )

    def __eq__(self, other):
        # skip checking the context as context refers back to this obj, this will trigger a cyclic dependency
        return (
            isinstance(other, GitCommit)
            and self.message == other.message
            and self.sha == other.sha
            and self.author_name == other.author_name
            and self.author_email == other.author_email
            and self.date == other.date
            and self.parents == other.parents
            and self.is_merge_commit == other.is_merge_commit
            and self.is_fixup_commit == other.is_fixup_commit
            and self.is_fixup_amend_commit == other.is_fixup_amend_commit
            and self.is_squash_commit == other.is_squash_commit
            and self.is_revert_commit == other.is_revert_commit
            and self.changed_files == other.changed_files
            and self.changed_files_stats == other.changed_files_stats
            and self.branches == other.branches
        )


class LocalGitCommit(GitCommit, PropertyCache):
    """Class representing a git commit that exists in the local git repository.
    This class uses lazy loading: it defers reading information from the local git repository until the associated
    property is accessed for the first time. Properties are then cached for subsequent access.

    This approach ensures that we don't do 'expensive' git calls when certain properties are not actually used.
    In addition, reading the required info when it's needed rather than up front avoids adding delay during gitlint
    startup time and reduces gitlint's memory footprint.
    """

    def __init__(self, context, sha):
        # GitCommit.__init__ is intentionally not called: the attributes it would assign are read-only,
        # lazily computed properties on this class.
        PropertyCache.__init__(self)
        self.context = context
        self.sha = sha

    def _log(self):
        """Does a call to `git log` to determine a bunch of information about the commit."""
        # First output line: author name, author email, author date and parent hashes separated by NUL bytes;
        # the remaining lines are the raw commit message body (%B).
        long_format = "--pretty=%aN%x00%aE%x00%ai%x00%P%n%B"
        raw_commit = _git("log", self.sha, "-1", long_format, _cwd=self.context.repository_path).split("\n")

        (name, email, date, parents), commit_msg = raw_commit[0].split("\x00"), "\n".join(raw_commit[1:])

        commit_parents = [] if parents == "" else parents.split(" ")
        commit_is_merge_commit = len(commit_parents) > 1

        # "YYYY-MM-DD HH:mm:ss Z" -> ISO 8601-like format
        # Use arrow for datetime parsing, because apparently python is quirky around ISO-8601 dates:
        # http://stackoverflow.com/a/30696682/381010
        commit_date = arrow.get(date, GIT_TIMEFORMAT).datetime

        # Create Git commit object with the retrieved info
        commit_msg_obj = GitCommitMessage.from_full_message(self.context, commit_msg)

        self._cache.update(
            {
                "message": commit_msg_obj,
                "author_name": name,
                "author_email": email,
                "date": commit_date,
                "parents": commit_parents,
                "is_merge_commit": commit_is_merge_commit,
            }
        )

    @property
    def message(self):
        return self._try_cache("message", self._log)

    @property
    def author_name(self):
        return self._try_cache("author_name", self._log)

    @property
    def author_email(self):
        return self._try_cache("author_email", self._log)

    @property
    def date(self):
        return self._try_cache("date", self._log)

    @property
    def parents(self):
        return self._try_cache("parents", self._log)

    @property
    def branches(self):
        def cache_branches():
            # We have to parse 'git branch --contains <sha>' instead of 'git for-each-ref' to be compatible with
            # git versions < 2.7.0
            # https://stackoverflow.com/questions/45173979/can-i-force-git-branch-contains-tag-to-not-print-the-asterisk
            branches = _git("branch", "--contains", self.sha, _cwd=self.context.repository_path).split("\n")

            # This means that we need to remove any leading * that indicates the current branch. Note that we can
            # safely do this since git branches cannot contain '*' anywhere, so if we find an '*' we know it's output
            # from the git CLI and not part of the branch name. See https://git-scm.com/docs/git-check-ref-format
            # We also drop the last empty line from the output.
            self._cache["branches"] = [branch.replace("*", "").strip() for branch in branches[:-1]]

        return self._try_cache("branches", cache_branches)

    @property
    def is_merge_commit(self):
        return self._try_cache("is_merge_commit", self._log)

    @property
    def changed_files_stats(self):
        def cache_changed_files_stats():
            changed_files_stats_raw = _git(
                "diff-tree", "--no-commit-id", "--numstat", "-r", "--root", self.sha, _cwd=self.context.repository_path
            )
            self._cache["changed_files_stats"] = _parse_git_changed_file_stats(changed_files_stats_raw)

        return self._try_cache("changed_files_stats", cache_changed_files_stats)


class StagedLocalGitCommit(GitCommit, PropertyCache):
    """Class representing a git commit that has been staged, but not committed.

    Other than the commit message itself (and changed files), a lot of information is actually not known at staging
    time, since the commit hasn't happened yet. However, we can make educated guesses based on existing repository
    information.
    """

    def __init__(self, context, commit_message):
        PropertyCache.__init__(self)
        self.context = context
        self.message = commit_message
        self.sha = None
        self.parents = []  # Not really possible to determine before a commit

    @property
    @cache
    def author_name(self):
        try:
            return _git("config", "--get", "user.name", _cwd=self.context.repository_path).strip()
        except GitExitCodeError as e:
            raise GitContextError("Missing git configuration: please set user.name") from e

    @property
    @cache
    def author_email(self):
        try:
            return _git("config", "--get", "user.email", _cwd=self.context.repository_path).strip()
        except GitExitCodeError as e:
            raise GitContextError("Missing git configuration: please set user.email") from e

    @property
    @cache
    def date(self):
        # We don't know the actual commit date yet, but we make a pragmatic trade-off here by providing the current date
        # We get current date from arrow, reformat in git date format, then re-interpret it as a date.
        # This ensure we capture the same precision and timezone information that git does.
        return arrow.get(arrow.now().format(GIT_TIMEFORMAT), GIT_TIMEFORMAT).datetime

    @property
    @cache
    def branches(self):
        # We don't know the branch this commit will be part of yet, but we're pragmatic here and just return the
        # current branch, as for all intents and purposes, this will be what the user is looking for.
        return [self.context.current_branch]

    @property
    def changed_files_stats(self):
        def cache_changed_files_stats():
            changed_files_stats_raw = _git("diff", "--staged", "--numstat", "-r", _cwd=self.context.repository_path)
            self._cache["changed_files_stats"] = _parse_git_changed_file_stats(changed_files_stats_raw)

        return self._try_cache("changed_files_stats", cache_changed_files_stats)


class GitContext(PropertyCache):
    """Class representing the git context in which gitlint is operating: a data object storing information about
    the git repository that gitlint is linting.
    """

    def __init__(self, repository_path=None):
        PropertyCache.__init__(self)
        self.commits = []
        self.repository_path = repository_path

    @property
    @cache
    def commentchar(self):
        return git_commentchar(self.repository_path)

    @property
    @cache
    def current_branch(self):
        try:
            current_branch = _git("rev-parse", "--abbrev-ref", "HEAD", _cwd=self.repository_path).strip()
        except GitContextError:
            # Maybe there is no commit. Try another way to get current branch (need Git 2.22+)
            current_branch = _git("branch", "--show-current", _cwd=self.repository_path).strip()
        return current_branch

    @staticmethod
    def from_commit_msg(commit_msg_str):
        """Determines git context based on a commit message.
        :param commit_msg_str: Full git commit message.
        """
        context = GitContext()
        commit_msg_obj = GitCommitMessage.from_full_message(context, commit_msg_str)
        commit = GitCommit(context, commit_msg_obj)
        context.commits.append(commit)
        return context

    @staticmethod
    def from_staged_commit(commit_msg_str, repository_path):
        """Determines git context based on a commit message that is a staged commit for a local git repository.
        :param commit_msg_str: Full git commit message.
        :param repository_path: Path to the git repository to retrieve the context from
        """
        context = GitContext(repository_path=repository_path)
        commit_msg_obj = GitCommitMessage.from_full_message(context, commit_msg_str)
        commit = StagedLocalGitCommit(context, commit_msg_obj)
        context.commits.append(commit)
        return context

    @staticmethod
    def from_local_repository(repository_path, refspec=None, commit_hashes=None):
        """Retrieves the git context from a local git repository.
        :param repository_path: Path to the git repository to retrieve the context from
        :param refspec: The commit(s) to retrieve (mutually exclusive with `commit_hashes`)
        :param commit_hashes: Hashes of the commits to retrieve (mutually exclusive with `refspec`)
        """

        context = GitContext(repository_path=repository_path)

        if refspec:
            sha_list = _git("rev-list", refspec, _cwd=repository_path).split()
        elif commit_hashes:  # One or more commit hashes, just pass it to `git log -1`
            # Even though we have already been passed the commit hash, we ask git to retrieve this hash and
            # return it to us. This way we verify that the passed hash is a valid hash for the target repo and we
            # also convert it to the full hash format (we might have been passed a short hash).
            sha_list = [
                _git("log", "-1", commit_hash, "--pretty=%H", _cwd=repository_path).replace("\n", "")
                for commit_hash in commit_hashes
            ]
        else:  # If no refspec is defined, fallback to the last commit on the current branch
            # We tried many things here e.g.: defaulting to e.g. HEAD or HEAD^... (incl. dealing with
            # repos that only have a single commit - HEAD^... doesn't work there), but then we still get into
            # problems with e.g. merge commits. Easiest solution is just taking the SHA from `git log -1`.
            sha_list = [_git("log", "-1", "--pretty=%H", _cwd=repository_path).replace("\n", "")]

        context.commits.extend(LocalGitCommit(context, sha) for sha in sha_list)

        return context

    def __eq__(self, other):
        return (
            isinstance(other, GitContext)
            and self.commits == other.commits
            and self.repository_path == other.repository_path
            and self.commentchar == other.commentchar
            and self.current_branch == other.current_branch
        )