diff options
| author | Josip Sokcevic <sokcevic@chromium.org> | 2024-10-07 17:33:38 +0000 |
|---|---|---|
| committer | LUCI <gerrit-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2024-10-07 18:44:19 +0000 |
| commit | 454fdaf1191c87e5c770ab865a911e10e600e178 (patch) | |
| tree | a20af3e4f05b48f28d30346648ab9fa1be7f4f64 /subcmds/sync.py | |
| parent | f7f9dd4deb3b92bf175a0411dac60e7b6fdd9cfa (diff) | |
| download | git-repo-454fdaf1191c87e5c770ab865a911e10e600e178.tar.gz | |
sync: Always use WORKER_BATCH_SIZEv2.48
With 551285fa35ccd0836513e9cf64ee8d3372e5e3f4, the comment about number
of workers no longer stands - dict is shared among multiprocesses and
real time information is available.
Using 2.7k projects as the baseline, using chunk size of 4 takes close
to 5 minutes. A chunk size of 32 takes this down to 40s - a reduction of
roughly 8 times, which matches the increase.
R=gavinmak@google.com
Bug: b/371638995
Change-Id: Ida5fd8f7abc44b3b82c02aa0f7f7ae01dff5eb07
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/438523
Commit-Queue: Josip Sokcevic <sokcevic@google.com>
Tested-by: Josip Sokcevic <sokcevic@google.com>
Reviewed-by: Gavin Mak <gavinmak@google.com>
Diffstat (limited to 'subcmds/sync.py')
| -rw-r--r-- | subcmds/sync.py | 27 |
1 file changed, 10 insertions, 17 deletions
diff --git a/subcmds/sync.py b/subcmds/sync.py index 0ae59f55..bebe18b9 100644 --- a/subcmds/sync.py +++ b/subcmds/sync.py | |||
| @@ -131,6 +131,11 @@ def _SafeCheckoutOrder(checkouts: List[Project]) -> List[List[Project]]: | |||
| 131 | return res | 131 | return res |
| 132 | 132 | ||
| 133 | 133 | ||
| 134 | def _chunksize(projects: int, jobs: int) -> int: | ||
| 135 | """Calculate chunk size for the given number of projects and jobs.""" | ||
| 136 | return min(max(1, projects // jobs), WORKER_BATCH_SIZE) | ||
| 137 | |||
| 138 | |||
| 134 | class _FetchOneResult(NamedTuple): | 139 | class _FetchOneResult(NamedTuple): |
| 135 | """_FetchOne return value. | 140 | """_FetchOne return value. |
| 136 | 141 | ||
| @@ -819,7 +824,6 @@ later is required to fix a server side protocol bug. | |||
| 819 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): | 824 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): |
| 820 | ret = True | 825 | ret = True |
| 821 | 826 | ||
| 822 | jobs = opt.jobs_network | ||
| 823 | fetched = set() | 827 | fetched = set() |
| 824 | remote_fetched = set() | 828 | remote_fetched = set() |
| 825 | pm = Progress( | 829 | pm = Progress( |
| @@ -849,6 +853,8 @@ later is required to fix a server side protocol bug. | |||
| 849 | objdir_project_map.setdefault(project.objdir, []).append(project) | 853 | objdir_project_map.setdefault(project.objdir, []).append(project) |
| 850 | projects_list = list(objdir_project_map.values()) | 854 | projects_list = list(objdir_project_map.values()) |
| 851 | 855 | ||
| 856 | jobs = min(opt.jobs_network, len(projects_list)) | ||
| 857 | |||
| 852 | def _ProcessResults(results_sets): | 858 | def _ProcessResults(results_sets): |
| 853 | ret = True | 859 | ret = True |
| 854 | for results in results_sets: | 860 | for results in results_sets: |
| @@ -888,35 +894,22 @@ later is required to fix a server side protocol bug. | |||
| 888 | Sync.ssh_proxy = None | 894 | Sync.ssh_proxy = None |
| 889 | 895 | ||
| 890 | # NB: Multiprocessing is heavy, so don't spin it up for one job. | 896 | # NB: Multiprocessing is heavy, so don't spin it up for one job. |
| 891 | if len(projects_list) == 1 or jobs == 1: | 897 | if jobs == 1: |
| 892 | self._FetchInitChild(ssh_proxy) | 898 | self._FetchInitChild(ssh_proxy) |
| 893 | if not _ProcessResults( | 899 | if not _ProcessResults( |
| 894 | self._FetchProjectList(opt, x) for x in projects_list | 900 | self._FetchProjectList(opt, x) for x in projects_list |
| 895 | ): | 901 | ): |
| 896 | ret = False | 902 | ret = False |
| 897 | else: | 903 | else: |
| 898 | # Favor throughput over responsiveness when quiet. It seems that | 904 | if not opt.quiet: |
| 899 | # imap() will yield results in batches relative to chunksize, so | ||
| 900 | # even as the children finish a sync, we won't see the result until | ||
| 901 | # one child finishes ~chunksize jobs. When using a large --jobs | ||
| 902 | # with large chunksize, this can be jarring as there will be a large | ||
| 903 | # initial delay where repo looks like it isn't doing anything and | ||
| 904 | # sits at 0%, but then suddenly completes a lot of jobs all at once. | ||
| 905 | # Since this code is more network bound, we can accept a bit more | ||
| 906 | # CPU overhead with a smaller chunksize so that the user sees more | ||
| 907 | # immediate & continuous feedback. | ||
| 908 | if opt.quiet: | ||
| 909 | chunksize = WORKER_BATCH_SIZE | ||
| 910 | else: | ||
| 911 | pm.update(inc=0, msg="warming up") | 905 | pm.update(inc=0, msg="warming up") |
| 912 | chunksize = 4 | ||
| 913 | with multiprocessing.Pool( | 906 | with multiprocessing.Pool( |
| 914 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) | 907 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) |
| 915 | ) as pool: | 908 | ) as pool: |
| 916 | results = pool.imap_unordered( | 909 | results = pool.imap_unordered( |
| 917 | functools.partial(self._FetchProjectList, opt), | 910 | functools.partial(self._FetchProjectList, opt), |
| 918 | projects_list, | 911 | projects_list, |
| 919 | chunksize=chunksize, | 912 | chunksize=_chunksize(len(projects_list), jobs), |
| 920 | ) | 913 | ) |
| 921 | if not _ProcessResults(results): | 914 | if not _ProcessResults(results): |
| 922 | ret = False | 915 | ret = False |
