grep: add --jobs support

Use multiprocessing to run in parallel.  When operating on multiple
projects, this can greatly speed things up.  Across 1000 repos, it
goes from ~40sec to ~16sec with the default -j8.

The output processing does not appear to be a significant
bottleneck -- it accounts for <1sec out of the ~16sec runtime.  Thus
we leave it in the main thread to simplify the code.

Change-Id: I750b72c7711b0c5d26e65d480738fbaac3a69971
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/297984
Reviewed-by: Chris Mcdonald <cjmcdonald@google.com>
Tested-by: Mike Frysinger <vapier@google.com>
This commit is contained in:
Mike Frysinger 2021-02-24 12:50:30 -05:00
parent bec4fe8aa3
commit d246d1fee7

View File

@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import functools
import multiprocessing
import sys import sys
from color import Coloring from color import Coloring
from command import PagedCommand from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE
from error import GitError from error import GitError
from git_command import GitCommand from git_command import GitCommand
@ -61,6 +63,7 @@ contain a line that matches both expressions:
repo grep --all-match -e NODE -e Unexpected repo grep --all-match -e NODE -e Unexpected
""" """
PARALLEL_JOBS = DEFAULT_LOCAL_JOBS
@staticmethod @staticmethod
def _carry_option(_option, opt_str, value, parser): def _carry_option(_option, opt_str, value, parser):
@ -80,6 +83,7 @@ contain a line that matches both expressions:
pt.append(value) pt.append(value)
def _Options(self, p): def _Options(self, p):
super()._Options(p)
g = p.add_option_group('Sources') g = p.add_option_group('Sources')
g.add_option('--cached', g.add_option('--cached',
action='callback', callback=self._carry_option, action='callback', callback=self._carry_option,
@ -152,6 +156,72 @@ contain a line that matches both expressions:
action='callback', callback=self._carry_option, action='callback', callback=self._carry_option,
help='Show only file names not containing matching lines') help='Show only file names not containing matching lines')
def _ExecuteOne(self, cmd_argv, project):
    """Run the prepared git command in a single project.

    Returns a 4-tuple ``(project, rc, stdout, stderr)``.  When constructing
    the GitCommand raises GitError, ``rc`` is -1, ``stdout`` is None and
    ``stderr`` carries the text of the error; otherwise the values come from
    the finished command (``Wait()``, ``stdout``, ``stderr``).
    """
    try:
        git = GitCommand(
            project,
            cmd_argv,
            bare=False,
            capture_stdout=True,
            capture_stderr=True,
        )
    except GitError as err:
        # A negative return code tells the result consumer that the command
        # could not be run at all.
        return (project, -1, None, str(err))

    exit_code = git.Wait()
    return (project, exit_code, git.stdout, git.stderr)
@staticmethod
def _ProcessResults(out, full_name, have_rev, results):
    """Render per-project results and summarise what happened.

    Args:
      out: Output helper providing project(), write(), fail() and nl().
      full_name: Whether matched lines get prefixed with the project path.
      have_rev: Whether each matched line starts with a "<rev>:" prefix.
      results: Iterable of (project, rc, stdout, stderr) tuples.

    Returns:
      A (git_failed, bad_rev, have_match) tuple of booleans.
    """
    git_failed = bad_rev = have_match = False

    def report_failure(project, text, strip):
        # Banner naming the project, followed by the error text.
        out.project('--- project %s ---' % project.relpath)
        out.nl()
        out.fail('%s', text.strip() if strip else text)
        out.nl()

    for project, rc, stdout, stderr in results:
        # Negative return code: the command itself failed.
        if rc < 0:
            git_failed = True
            report_failure(project, stderr, False)
            continue

        # Non-zero return code: no results for this project.
        if rc:
            if stderr:
                if have_rev and 'fatal: ambiguous argument' in stderr:
                    bad_rev = True
                else:
                    report_failure(project, stderr, True)
            continue

        have_match = True

        # Drop the final element so no blank line is emitted.
        matched = stdout.split('\n')[:-1]

        if not full_name:
            for entry in matched:
                print(entry)
        elif have_rev:
            for entry in matched:
                rev, rest = entry.split(':', 1)
                out.write("%s", rev)
                out.write(':')
                out.project(project.relpath)
                out.write('/')
                out.write("%s", rest)
                out.nl()
        else:
            for entry in matched:
                out.project(project.relpath)
                out.write('/')
                out.write("%s", entry)
                out.nl()

    return (git_failed, bad_rev, have_match)
def Execute(self, opt, args): def Execute(self, opt, args):
out = GrepColoring(self.manifest.manifestProject.config) out = GrepColoring(self.manifest.manifestProject.config)
@ -183,62 +253,18 @@ contain a line that matches both expressions:
cmd_argv.extend(opt.revision) cmd_argv.extend(opt.revision)
cmd_argv.append('--') cmd_argv.append('--')
git_failed = False process_results = functools.partial(
bad_rev = False self._ProcessResults, out, full_name, have_rev)
have_match = False # NB: Multiprocessing is heavy, so don't spin it up for one job.
if len(projects) == 1 or opt.jobs == 1:
for project in projects: git_failed, bad_rev, have_match = process_results(
try: self._ExecuteOne(cmd_argv, x) for x in projects)
p = GitCommand(project, else:
cmd_argv, with multiprocessing.Pool(opt.jobs) as pool:
bare=False, results = pool.imap(
capture_stdout=True, functools.partial(self._ExecuteOne, cmd_argv), projects,
capture_stderr=True) chunksize=WORKER_BATCH_SIZE)
except GitError as e: git_failed, bad_rev, have_match = process_results(results)
git_failed = True
out.project('--- project %s ---' % project.relpath)
out.nl()
out.fail('%s', str(e))
out.nl()
continue
if p.Wait() != 0:
# no results
#
if p.stderr:
if have_rev and 'fatal: ambiguous argument' in p.stderr:
bad_rev = True
else:
out.project('--- project %s ---' % project.relpath)
out.nl()
out.fail('%s', p.stderr.strip())
out.nl()
continue
have_match = True
# We cut the last element, to avoid a blank line.
#
r = p.stdout.split('\n')
r = r[0:-1]
if have_rev and full_name:
for line in r:
rev, line = line.split(':', 1)
out.write("%s", rev)
out.write(':')
out.project(project.relpath)
out.write('/')
out.write("%s", line)
out.nl()
elif full_name:
for line in r:
out.project(project.relpath)
out.write('/')
out.write("%s", line)
out.nl()
else:
for line in r:
print(line)
if git_failed: if git_failed:
sys.exit(1) sys.exit(1)