sync: switch local checkout to multiprocessing

This avoids GIL limitations of using threads for parallel processing.
In a CrOS checkout with ~1000 repos, the nop case goes from ~6 sec down
to ~4 sec with -j8.  Not a big deal, but it shows that this actually
works to speed things up, unlike the threading model.

This reworks the checkout logic to return results for processing in the
main thread instead of leaving every thread to do its own processing.

Bug: https://crbug.com/gerrit/12389
Change-Id: I143e5e3f7158e83ea67e2d14e5552153a874248a
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/298063
Reviewed-by: Chris Mcdonald <cjmcdonald@google.com>
Tested-by: Mike Frysinger <vapier@google.com>
This commit is contained in:
Mike Frysinger 2021-02-23 20:48:04 -05:00
parent 8dbc07aced
commit ebf04a4404

View File

@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import functools
import http.cookiejar as cookielib import http.cookiejar as cookielib
import io import io
import json import json
import multiprocessing
import netrc import netrc
from optparse import SUPPRESS_HELP from optparse import SUPPRESS_HELP
import os import os
@ -56,7 +58,7 @@ import git_superproject
import gitc_utils import gitc_utils
from project import Project from project import Project
from project import RemoteSpec from project import RemoteSpec
from command import Command, MirrorSafeCommand from command import Command, MirrorSafeCommand, WORKER_BATCH_SIZE
from error import RepoChangedException, GitError, ManifestParseError from error import RepoChangedException, GitError, ManifestParseError
import platform_utils import platform_utils
from project import SyncBuffer from project import SyncBuffer
@ -71,10 +73,6 @@ class _FetchError(Exception):
"""Internal error thrown in _FetchHelper() when we don't want stack trace.""" """Internal error thrown in _FetchHelper() when we don't want stack trace."""
class _CheckoutError(Exception):
"""Internal error thrown in _CheckoutOne() when we don't want stack trace."""
class Sync(Command, MirrorSafeCommand): class Sync(Command, MirrorSafeCommand):
jobs = 1 jobs = 1
common = True common = True
@ -457,149 +455,80 @@ later is required to fix a server side protocol bug.
return fetched return fetched
def _CheckoutWorker(self, opt, sem, project, *args, **kwargs): def _CheckoutOne(self, opt, project):
"""Main function of the fetch threads.
Delegates most of the work to _CheckoutOne.
Args:
opt: Program options returned from optparse. See _Options().
projects: Projects to fetch.
sem: We'll release() this semaphore when we exit so that another thread
can be started up.
*args, **kwargs: Remaining arguments to pass to _CheckoutOne. See the
_CheckoutOne docstring for details.
"""
try:
return self._CheckoutOne(opt, project, *args, **kwargs)
finally:
sem.release()
def _CheckoutOne(self, opt, project, lock, pm, err_event, err_results):
"""Checkout work tree for one project """Checkout work tree for one project
Args: Args:
opt: Program options returned from optparse. See _Options(). opt: Program options returned from optparse. See _Options().
project: Project object for the project to checkout. project: Project object for the project to checkout.
lock: Lock for accessing objects that are shared amongst multiple
_CheckoutWorker() threads.
pm: Instance of a Project object. We will call pm.update() (with our
lock held).
err_event: We'll set this event in the case of an error (after printing
out info about the error).
err_results: A list of strings, paths to git repos where checkout
failed.
Returns: Returns:
Whether the fetch was successful. Whether the fetch was successful.
""" """
# We'll set to true once we've locked the lock.
did_lock = False
# Encapsulate everything in a try/except/finally so that:
# - We always set err_event in the case of an exception.
# - We always make sure we unlock the lock if we locked it.
start = time.time() start = time.time()
syncbuf = SyncBuffer(self.manifest.manifestProject.config, syncbuf = SyncBuffer(self.manifest.manifestProject.config,
detach_head=opt.detach_head) detach_head=opt.detach_head)
success = False success = False
with lock:
pm.start(project.name)
try:
try: try:
project.Sync_LocalHalf(syncbuf, force_sync=opt.force_sync) project.Sync_LocalHalf(syncbuf, force_sync=opt.force_sync)
# Lock around all the rest of the code, since printing, updating a set
# and Progress.update() are not thread safe.
lock.acquire()
success = syncbuf.Finish() success = syncbuf.Finish()
did_lock = True
if not success:
err_event.set()
print('error: Cannot checkout %s' % (project.name),
file=sys.stderr)
raise _CheckoutError()
except _CheckoutError:
pass
except Exception as e: except Exception as e:
print('error: Cannot checkout %s: %s: %s' % print('error: Cannot checkout %s: %s: %s' %
(project.name, type(e).__name__, str(e)), (project.name, type(e).__name__, str(e)),
file=sys.stderr) file=sys.stderr)
err_event.set()
raise raise
finally:
if not did_lock:
lock.acquire()
if not success: if not success:
err_results.append(project.relpath) print('error: Cannot checkout %s' % (project.name), file=sys.stderr)
pm.finish(project.name)
lock.release()
finish = time.time() finish = time.time()
self.event_log.AddSync(project, event_log.TASK_SYNC_LOCAL, return (success, project, start, finish)
start, finish, success)
return success def _Checkout(self, all_projects, opt, err_results):
def _Checkout(self, all_projects, opt, err_event, err_results):
"""Checkout projects listed in all_projects """Checkout projects listed in all_projects
Args: Args:
all_projects: List of all projects that should be checked out. all_projects: List of all projects that should be checked out.
opt: Program options returned from optparse. See _Options(). opt: Program options returned from optparse. See _Options().
err_event: We'll set this event in the case of an error (after printing err_results: A list of strings, paths to git repos where checkout failed.
out info about the error).
err_results: A list of strings, paths to git repos where checkout
failed.
""" """
ret = True
# Perform checkouts in multiple threads when we are using partial clone. # Only checkout projects with worktrees.
# Without partial clone, all needed git objects are already downloaded, all_projects = [x for x in all_projects if x.worktree]
# in this situation it's better to use only one process because the checkout
# would be mostly disk I/O; with partial clone, the objects are only
# downloaded when demanded (at checkout time), which is similar to the
# Sync_NetworkHalf case and parallelism would be helpful.
if self.manifest.CloneFilter:
syncjobs = self.jobs
else:
syncjobs = 1
lock = _threading.Lock()
pm = Progress('Checking out', len(all_projects)) pm = Progress('Checking out', len(all_projects))
threads = set() def _ProcessResults(results):
sem = _threading.Semaphore(syncjobs) for (success, project, start, finish) in results:
self.event_log.AddSync(project, event_log.TASK_SYNC_LOCAL,
for project in all_projects: start, finish, success)
# Check for any errors before running any more tasks. # Check for any errors before running any more tasks.
# ...we'll let existing threads finish, though. # ...we'll let existing threads finish, though.
if err_event.is_set() and opt.fail_fast: if not success:
break err_results.append(project.relpath)
if opt.fail_fast:
return False
pm.update(msg=project.name)
return True
sem.acquire() # NB: Multiprocessing is heavy, so don't spin it up for one job.
if project.worktree: if len(all_projects) == 1 or opt.jobs == 1:
kwargs = dict(opt=opt, if not _ProcessResults(self._CheckoutOne(opt, x) for x in all_projects):
sem=sem, ret = False
project=project,
lock=lock,
pm=pm,
err_event=err_event,
err_results=err_results)
if syncjobs > 1:
t = _threading.Thread(target=self._CheckoutWorker,
kwargs=kwargs)
# Ensure that Ctrl-C will not freeze the repo process.
t.daemon = True
threads.add(t)
t.start()
else: else:
self._CheckoutWorker(**kwargs) with multiprocessing.Pool(opt.jobs) as pool:
results = pool.imap_unordered(
for t in threads: functools.partial(self._CheckoutOne, opt),
t.join() all_projects,
chunksize=WORKER_BATCH_SIZE)
if not _ProcessResults(results):
ret = False
pool.close()
pm.end() pm.end()
return ret
def _GCProjects(self, projects, opt, err_event): def _GCProjects(self, projects, opt, err_event):
gc_gitdirs = {} gc_gitdirs = {}
for project in projects: for project in projects:
@ -946,7 +875,6 @@ later is required to fix a server side protocol bug.
err_network_sync = False err_network_sync = False
err_update_projects = False err_update_projects = False
err_checkout = False
self._fetch_times = _FetchTimes(self.manifest) self._fetch_times = _FetchTimes(self.manifest)
if not opt.local_only: if not opt.local_only:
@ -1011,10 +939,10 @@ later is required to fix a server side protocol bug.
sys.exit(1) sys.exit(1)
err_results = [] err_results = []
self._Checkout(all_projects, opt, err_event, err_results)
if err_event.is_set():
err_checkout = True
# NB: We don't exit here because this is the last step. # NB: We don't exit here because this is the last step.
err_checkout = not self._Checkout(all_projects, opt, err_results)
if err_checkout:
err_event.set()
# If there's a notice that's supposed to print at the end of the sync, print # If there's a notice that's supposed to print at the end of the sync, print
# it now... # it now...