From 454fdaf1191c87e5c770ab865a911e10e600e178 Mon Sep 17 00:00:00 2001 From: Josip Sokcevic Date: Mon, 7 Oct 2024 17:33:38 +0000 Subject: [PATCH] sync: Always use WORKER_BATCH_SIZE With 551285fa35ccd0836513e9cf64ee8d3372e5e3f4, the comment about number of workers no longer stands - dict is shared among multiprocesses and real time information is available. Using 2.7k projects as the baseline, using chunk size of 4 takes close to 5 minutes. A chunk size of 32 takes this down to 40s - a reduction of rougly 8 times which matches the increase. R=gavinmak@google.com Bug: b/371638995 Change-Id: Ida5fd8f7abc44b3b82c02aa0f7f7ae01dff5eb07 Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/438523 Commit-Queue: Josip Sokcevic Tested-by: Josip Sokcevic Reviewed-by: Gavin Mak --- project.py | 19 +++++++++---------- subcmds/sync.py | 27 ++++++++++----------------- tests/test_subcmds_sync.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/project.py b/project.py index 88dd747b..1d8ab556 100644 --- a/project.py +++ b/project.py @@ -2396,26 +2396,25 @@ class Project: try: # if revision (sha or tag) is not present then following function # throws an error. + revs = [f"{self.revisionExpr}^0"] + upstream_rev = None + if self.upstream: + upstream_rev = self.GetRemote().ToLocal(self.upstream) + revs.append(upstream_rev) + self.bare_git.rev_list( "-1", "--missing=allow-any", - "%s^0" % self.revisionExpr, + *revs, "--", log_as_error=False, ) + if self.upstream: - rev = self.GetRemote().ToLocal(self.upstream) - self.bare_git.rev_list( - "-1", - "--missing=allow-any", - "%s^0" % rev, - "--", - log_as_error=False, - ) self.bare_git.merge_base( "--is-ancestor", self.revisionExpr, - rev, + upstream_rev, log_as_error=False, ) return True diff --git a/subcmds/sync.py b/subcmds/sync.py index 0ae59f55..bebe18b9 100644 --- a/subcmds/sync.py +++ b/subcmds/sync.py @@ -131,6 +131,11 @@ def _SafeCheckoutOrder(checkouts: List[Project]) -> List[List[Project]]: return res +def _chunksize(projects: int, jobs: int) -> int: + """Calculate chunk size for the given number of projects and jobs.""" + return min(max(1, projects // jobs), WORKER_BATCH_SIZE) + + class _FetchOneResult(NamedTuple): """_FetchOne return value. @@ -819,7 +824,6 @@ later is required to fix a server side protocol bug. def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): ret = True - jobs = opt.jobs_network fetched = set() remote_fetched = set() pm = Progress( @@ -849,6 +853,8 @@ later is required to fix a server side protocol bug. objdir_project_map.setdefault(project.objdir, []).append(project) projects_list = list(objdir_project_map.values()) + jobs = min(opt.jobs_network, len(projects_list)) + def _ProcessResults(results_sets): ret = True for results in results_sets: @@ -888,35 +894,22 @@ later is required to fix a server side protocol bug. Sync.ssh_proxy = None # NB: Multiprocessing is heavy, so don't spin it up for one job. - if len(projects_list) == 1 or jobs == 1: + if jobs == 1: self._FetchInitChild(ssh_proxy) if not _ProcessResults( self._FetchProjectList(opt, x) for x in projects_list ): ret = False else: - # Favor throughput over responsiveness when quiet. It seems that - # imap() will yield results in batches relative to chunksize, so - # even as the children finish a sync, we won't see the result until - # one child finishes ~chunksize jobs. When using a large --jobs - # with large chunksize, this can be jarring as there will be a large - # initial delay where repo looks like it isn't doing anything and - # sits at 0%, but then suddenly completes a lot of jobs all at once. - # Since this code is more network bound, we can accept a bit more - # CPU overhead with a smaller chunksize so that the user sees more - # immediate & continuous feedback. - if opt.quiet: - chunksize = WORKER_BATCH_SIZE - else: + if not opt.quiet: pm.update(inc=0, msg="warming up") - chunksize = 4 with multiprocessing.Pool( jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) ) as pool: results = pool.imap_unordered( functools.partial(self._FetchProjectList, opt), projects_list, - chunksize=chunksize, + chunksize=_chunksize(len(projects_list), jobs), ) if not _ProcessResults(results): ret = False diff --git a/tests/test_subcmds_sync.py b/tests/test_subcmds_sync.py index 8dde687c..b871317c 100644 --- a/tests/test_subcmds_sync.py +++ b/tests/test_subcmds_sync.py @@ -355,6 +355,30 @@ class SafeCheckoutOrder(unittest.TestCase): ) +class Chunksize(unittest.TestCase): + """Tests for _chunksize.""" + + def test_single_project(self): + """Single project.""" + self.assertEqual(sync._chunksize(1, 1), 1) + + def test_low_project_count(self): + """Multiple projects, low number of projects to sync.""" + self.assertEqual(sync._chunksize(10, 1), 10) + self.assertEqual(sync._chunksize(10, 2), 5) + self.assertEqual(sync._chunksize(10, 4), 2) + self.assertEqual(sync._chunksize(10, 8), 1) + self.assertEqual(sync._chunksize(10, 16), 1) + + def test_high_project_count(self): + """Multiple projects, high number of projects to sync.""" + self.assertEqual(sync._chunksize(2800, 1), 32) + self.assertEqual(sync._chunksize(2800, 16), 32) + self.assertEqual(sync._chunksize(2800, 32), 32) + self.assertEqual(sync._chunksize(2800, 64), 32) + self.assertEqual(sync._chunksize(2800, 128), 21) + + class GetPreciousObjectsState(unittest.TestCase): """Tests for _GetPreciousObjectsState."""