Skip to content

Commit 7628617

Browse files
Matthew Rothenbergmroth
Matthew Rothenberg
authored andcommitted
add git find-reviewers tool
this makes sense as an essential part of of the "git workflow", so it behoves to move it into these tools and out of the webapp specific version.
1 parent 3de109d commit 7628617

File tree

2 files changed

+289
-4
lines changed

2 files changed

+289
-4
lines changed

README.md

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
# khan academy git-workflow
22

3-
Collection of scripts in order to enable the [git workflow][git-at-ka]
3+
> Collection of scripts used to enable the [git workflow][git-at-ka]
44
at Khan Academy. (see also: [arcanist])
55

66
[git-at-ka]: https://khanacademy.org/r/git-at-ka
77
[arcanist]: https://github.com/khan/arcanist
88

9-
## Tools
109

1110
#### git deploy-branch
12-
Creates a remote deploy branch for use with GitHub-style deploys.
11+
Creates a remote _deploy branch_ for use with GitHub-style deploys.
1312

1413
For GitHub-style deploys, all work must branch off a deploy branch.
1514

1615
#### git review-branch
1716
Creates a new local branch ensuring that it's not based off master.
18-
Such a branch is called a 'review branch'.
17+
Such a branch is called a _review branch_.
1918

2019
All review branches should be based off a deploy branch.
2120

2221
#### git recursive-grep
2322
Runs git grep recursively through submodules, showing file paths
2423
relative to cwd.
24+
25+
#### git find-reviewers
26+
Find the best reviewer(s) for a given changeset. The idea is that if one user
27+
has modified all the lines you are editing, they are a good candidate to review
28+
your change.

bin/git-find-reviewers

+281
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
#!/usr/bin/env python
2+
3+
"""Find the best reviewer(s) for a given changeset.
4+
5+
This works under both git and mercurial.
6+
7+
1) Runs 'hg diff <argv>' (so by default, diffs all currently edited files).
8+
or 'git diff <argv>'.
9+
2) Analyzes the diff to find out exact lines that have changed.
10+
3) Runs 'hg/git blame' to figure out who last modified those lines.
11+
4) For each file, prints the usernames who last-modified any of the
12+
diffed lines in the file, along with how many of the lines they
13+
modified.
14+
15+
The idea is that if one user has modified all the lines you are
16+
editing, they are a good candidate to review your change.
17+
"""
18+
19+
import os
20+
import re
21+
import sys
22+
import subprocess
23+
24+
25+
# The line we care about in the diff output is
26+
# @@ -<startline>,<numlines> ...
27+
# or @@ -<startline> ... # in which <numlines> is taken to be 1
28+
# (Everything else is header or diff content, which we ignore.)
29+
_DIFFLINE_RE = re.compile('^@@ -(\d+)(?:,(\d+))? ')
30+
31+
_NEWFILE_RE = re.compile('^--- (.*)')
32+
33+
34+
class Mercurial(object):
35+
def __init__(self, ui, repo):
36+
self.ui = ui
37+
self.repo = repo
38+
39+
def write(self, msg):
40+
self.ui.write(msg)
41+
42+
def find_wholefile_lines(self, files, revision='.'):
43+
"""Return a map from abspath -> set-of-all-linumbers in the file."""
44+
ctx = self.repo[revision] # state of repository base revision
45+
m = ctx.match(files, None, None, 'relpath')
46+
all_files = ctx.walk(m)
47+
48+
all_lines = {}
49+
for abspath in all_files:
50+
before_text = ctx.filectx(abspath).data()
51+
num_lines = before_text.count('\n')
52+
all_lines[abspath] = set(range(1, num_lines + 1))
53+
54+
return all_lines
55+
56+
def find_modified_lines(self, files, revision='.'):
57+
"""Return a map from abspath -> set-of-linenumbers changed."""
58+
import mercurial.mdiff
59+
60+
ctx = self.repo[revision] # state of repository base revision
61+
edit_ctx = self.repo[None] # current working state ('.' + local edits)
62+
63+
# Find the files that have modifications.
64+
m = ctx.match(files, None, None, 'relpath')
65+
# Only count files that have been edited from tip.
66+
modified = self.repo.status(ctx, None, match=m)[0]
67+
68+
modified_lines = {}
69+
diffopts = mercurial.mdiff.diffopts(context=0, nodates=True)
70+
for abspath in modified:
71+
before_text = ctx.filectx(abspath).data()
72+
after_text = edit_ctx.filectx(abspath).data()
73+
diff_text = mercurial.mdiff.unidiff(
74+
before_text, None, after_text, None,
75+
abspath, abspath, opts=diffopts)
76+
# Look at the '@@ -<startline>,<numlines> ...' diffline to
77+
# find what lines in the input file were changed.
78+
modified_lines.setdefault(abspath, set())
79+
for line in diff_text.splitlines():
80+
m = _DIFFLINE_RE.match(line)
81+
if m:
82+
startline, n = int(m.group(1)), int(m.group(2) or '1')
83+
modified_lines[abspath].update(range(startline,
84+
startline + n))
85+
86+
return modified_lines
87+
88+
def get_annotation_info(self, abspaths, revision='.'):
89+
"""Return a map abspath -> list-of-author-names.
90+
91+
retval[filename][i] says who wrote the i-th line of the file.
92+
Line numbers start at 1, so retval[filename][0] is always None.
93+
"""
94+
retval = {}
95+
user_to_shortuser = {}
96+
97+
ctx = self.repo[revision] # state of repository base revision
98+
for abspath in abspaths:
99+
retval[abspath] = [None]
100+
anno_lines = ctx[abspath].annotate(follow=True)
101+
for anno_line in anno_lines:
102+
modifier = anno_line[0].user()
103+
if modifier not in user_to_shortuser:
104+
user_to_shortuser[modifier] = self.ui.shortuser(modifier)
105+
retval[abspath].append(user_to_shortuser[modifier])
106+
107+
return retval
108+
109+
110+
class Git(object):
111+
def write(self, msg):
112+
sys.stdout.write(msg)
113+
114+
def find_wholefile_lines(self, files, revision='HEAD'):
115+
"""Return a map from abspath -> set-of-all-linumbers in the file."""
116+
all_lines = {}
117+
for f in files:
118+
before_text = subprocess.check_output(['git', 'show',
119+
'%s:%s' % (revision, f)])
120+
num_lines = before_text.count('\n')
121+
all_lines[os.path.abspath(f)] = set(range(1, num_lines + 1))
122+
return all_lines
123+
124+
def find_modified_lines(self, files, revision='HEAD'):
125+
"""Return a map from abspath -> set-of-linenumbers changed."""
126+
modified_lines = {}
127+
128+
# Only count Deleted and Modified files.
129+
diff_output = subprocess.check_output(['git', 'diff', '-U0',
130+
'--diff-filter=DM',
131+
'--no-ext-diff', '--no-prefix',
132+
revision, '--'] + files)
133+
abspath = None
134+
for line in diff_output.splitlines():
135+
m = _NEWFILE_RE.match(line)
136+
if m:
137+
abspath = os.path.abspath(m.group(1))
138+
modified_lines[abspath] = set()
139+
else:
140+
m = _DIFFLINE_RE.match(line)
141+
if m:
142+
assert abspath, line # filename comes before diff info
143+
startline, n = int(m.group(1)), int(m.group(2) or '1')
144+
modified_lines[abspath].update(range(startline,
145+
startline + n))
146+
147+
return modified_lines
148+
149+
def get_annotation_info(self, abspaths, revision='HEAD'):
150+
"""Return a map abspath -> list-of-author-nqames.
151+
152+
retval[filename][i] says who wrote the i-th line of the file.
153+
Line numbers start at 1, so retval[filename][0] is always None.
154+
"""
155+
retval = {}
156+
author_re = re.compile(r'author-mail <([^>]*)>')
157+
158+
for abspath in abspaths:
159+
retval[abspath] = [None]
160+
# TODO(csilvers): also specify -C?
161+
blame_output = subprocess.check_output(['git', 'blame', '-M',
162+
'--line-porcelain',
163+
revision, '--', abspath])
164+
for line in blame_output.splitlines():
165+
m = author_re.match(line)
166+
if m:
167+
author = m.group(1)
168+
# Just to make the common-case output prettier.
169+
if author.endswith('@khanacademy.org'):
170+
author = author[:-len('@khanacademy.org')]
171+
retval[abspath].append(author)
172+
173+
return retval
174+
175+
176+
def findreviewers(vcs, files, revision=None, num_reviewers=3,
177+
whole_file=False, output_per_file=False, ignore=[]):
178+
"""Find the best reviewer(s) for a given changeset.
179+
180+
Examines the current changes in this file, and runs 'hg blame' or
181+
'git blame' (depending on 'vcs') to find who last edited those
182+
same lines. Collates and returns this information, including how
183+
many lines each person is responsible for.
184+
185+
Arguments:
186+
vcs: either a Mercurial or a Git instance, from this file.
187+
files: a list of filenames to find reviewer information for
188+
revision: what revision to diff against when looking for
189+
reviewers (typically '.' for mercurial or 'HEAD' for git).
190+
num_reviewers: the number of reviewers to suggest for each file.
191+
3 is a reasonable value.
192+
whole_file: if True, return reviewer information for the input
193+
files as a whole, not just for the diff vs 'revision'. This
194+
is useful when you want to know who is 'most responsible' for
195+
a file.
196+
output_per_file: if True, instead of printing the best reviewers
197+
for the set of input files as a whole, prints a separate list
198+
of best reviewers for each file in the input.
199+
ignore: a set/list of revisions to ignore when finding blame info.
200+
TODO(csilvers): implement this.
201+
"""
202+
# revision has to be a kwarg because of how findreviewers() is
203+
# called, but it's actually required.
204+
assert revision, 'revision argument cannot be None!'
205+
206+
if whole_file:
207+
modified_lines = vcs.find_wholefile_lines(files, revision)
208+
else:
209+
modified_lines = vcs.find_modified_lines(files, revision)
210+
211+
annotation_info = vcs.get_annotation_info(modified_lines.keys(), revision)
212+
213+
if output_per_file:
214+
# filename -> {author: num_lines, ...}
215+
num_lines_per_author = {abspath: {} for abspath in modified_lines}
216+
else:
217+
# None -> {author: num_lines, ...}
218+
num_lines_per_author = {None: {}}
219+
220+
for abspath in modified_lines:
221+
for linenum in modified_lines[abspath]:
222+
author = annotation_info[abspath][linenum]
223+
if output_per_file:
224+
num_lines_per_author[abspath].setdefault(author, 0)
225+
num_lines_per_author[abspath][author] += 1
226+
else:
227+
# Just store global info
228+
num_lines_per_author[None].setdefault(author, 0)
229+
num_lines_per_author[None][author] += 1
230+
231+
# Print the information out.
232+
for abspath in sorted(num_lines_per_author.iterkeys()):
233+
if abspath:
234+
vcs.write('\n--- %s\n' % abspath)
235+
236+
reviewers = num_lines_per_author[abspath].items()
237+
reviewers.sort(key=lambda (_, num_lines): num_lines, reverse=True)
238+
total_lines = sum(num_lines_per_author[abspath].itervalues())
239+
240+
for (reviewer, reviewer_num_lines) in reviewers[:num_reviewers]:
241+
vcs.write('%s: %s lines (%.1f%%)\n'
242+
% (reviewer, reviewer_num_lines,
243+
reviewer_num_lines * 100.0 / total_lines))
244+
245+
246+
# How hg uses this script: via the cmdtable hook.
247+
cmdtable = {
248+
'findreviewers':
249+
(lambda ui, repo, *files, **opts: (
250+
findreviewers(Mercurial(ui, repo), files, **opts)),
251+
[('f', 'output-per-file', None, 'Print results per input file'),
252+
('n', 'num-reviewers', 3, 'How many reviewers to show'),
253+
('w', 'whole-file', None, 'Calculate reviewers based on entire file'),
254+
('i', 'ignore', [], 'TODO: Revisions to ignore when annotating'),
255+
('r', 'revision', '.', 'Revision to use as base'),
256+
],
257+
'[-f] [-n #] [-w] [-i <commit_id> ...] [FILE...]')
258+
}
259+
260+
261+
# How git uses this script: via __main__
262+
if __name__ == '__main__':
263+
import argparse
264+
parser = argparse.ArgumentParser(description=__doc__)
265+
266+
parser.add_argument('files', nargs='*')
267+
for (shortname, longname, default, help) in cmdtable['findreviewers'][1]:
268+
if default is None:
269+
parser.add_argument('--%s' % longname, '-%s' % shortname,
270+
help=help, default=default,
271+
action='store_true')
272+
else:
273+
parser.add_argument('--%s' % longname, '-%s' % shortname,
274+
help=help, default=default,
275+
type=type(default))
276+
# The one place we differ from the mercurial defaults.
277+
parser.set_defaults(revision='HEAD')
278+
279+
args = parser.parse_args()
280+
281+
findreviewers(Git(), **args.__dict__)

0 commit comments

Comments
 (0)