diff options
| author | Mike Frysinger <vapier@google.com> | 2021-02-24 12:50:30 -0500 |
|---|---|---|
| committer | Mike Frysinger <vapier@google.com> | 2021-04-01 14:43:19 +0000 |
| commit | d246d1fee7f42f2526a20a96597c8f01eda31433 (patch) | |
| tree | c61356504d2c6011346183171e3ed87dc4d04529 | |
| parent | bec4fe8aa39cdf9d1a67bfba8a31b3826f9ff197 (diff) | |
| download | git-repo-d246d1fee7f42f2526a20a96597c8f01eda31433.tar.gz | |
grep: add --jobs support
Use multiprocessing to run in parallel. When operating on multiple
projects, this can greatly speed things up. Across 1000 repos, it
goes from ~40sec to ~16sec with the default -j8.
The output processing does not appear to be a significant
bottleneck -- it accounts for <1sec out of the ~16sec runtime. Thus we
leave it in the main thread to simplify the code.
Change-Id: I750b72c7711b0c5d26e65d480738fbaac3a69971
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/297984
Reviewed-by: Chris Mcdonald <cjmcdonald@google.com>
Tested-by: Mike Frysinger <vapier@google.com>
| -rw-r--r-- | subcmds/grep.py | 118 |
1 file changed, 72 insertions, 46 deletions
diff --git a/subcmds/grep.py b/subcmds/grep.py index c16d4185..49feaf6b 100644 --- a/subcmds/grep.py +++ b/subcmds/grep.py | |||
| @@ -12,10 +12,12 @@ | |||
| 12 | # See the License for the specific language governing permissions and | 12 | # See the License for the specific language governing permissions and |
| 13 | # limitations under the License. | 13 | # limitations under the License. |
| 14 | 14 | ||
| 15 | import functools | ||
| 16 | import multiprocessing | ||
| 15 | import sys | 17 | import sys |
| 16 | 18 | ||
| 17 | from color import Coloring | 19 | from color import Coloring |
| 18 | from command import PagedCommand | 20 | from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE |
| 19 | from error import GitError | 21 | from error import GitError |
| 20 | from git_command import GitCommand | 22 | from git_command import GitCommand |
| 21 | 23 | ||
| @@ -61,6 +63,7 @@ contain a line that matches both expressions: | |||
| 61 | repo grep --all-match -e NODE -e Unexpected | 63 | repo grep --all-match -e NODE -e Unexpected |
| 62 | 64 | ||
| 63 | """ | 65 | """ |
| 66 | PARALLEL_JOBS = DEFAULT_LOCAL_JOBS | ||
| 64 | 67 | ||
| 65 | @staticmethod | 68 | @staticmethod |
| 66 | def _carry_option(_option, opt_str, value, parser): | 69 | def _carry_option(_option, opt_str, value, parser): |
| @@ -80,6 +83,7 @@ contain a line that matches both expressions: | |||
| 80 | pt.append(value) | 83 | pt.append(value) |
| 81 | 84 | ||
| 82 | def _Options(self, p): | 85 | def _Options(self, p): |
| 86 | super()._Options(p) | ||
| 83 | g = p.add_option_group('Sources') | 87 | g = p.add_option_group('Sources') |
| 84 | g.add_option('--cached', | 88 | g.add_option('--cached', |
| 85 | action='callback', callback=self._carry_option, | 89 | action='callback', callback=self._carry_option, |
| @@ -152,73 +156,49 @@ contain a line that matches both expressions: | |||
| 152 | action='callback', callback=self._carry_option, | 156 | action='callback', callback=self._carry_option, |
| 153 | help='Show only file names not containing matching lines') | 157 | help='Show only file names not containing matching lines') |
| 154 | 158 | ||
| 155 | def Execute(self, opt, args): | 159 | def _ExecuteOne(self, cmd_argv, project): |
| 156 | out = GrepColoring(self.manifest.manifestProject.config) | 160 | """Process one project.""" |
| 157 | 161 | try: | |
| 158 | cmd_argv = ['grep'] | 162 | p = GitCommand(project, |
| 159 | if out.is_on: | 163 | cmd_argv, |
| 160 | cmd_argv.append('--color') | 164 | bare=False, |
| 161 | cmd_argv.extend(getattr(opt, 'cmd_argv', [])) | 165 | capture_stdout=True, |
| 162 | 166 | capture_stderr=True) | |
| 163 | if '-e' not in cmd_argv: | 167 | except GitError as e: |
| 164 | if not args: | 168 | return (project, -1, None, str(e)) |
| 165 | self.Usage() | ||
| 166 | cmd_argv.append('-e') | ||
| 167 | cmd_argv.append(args[0]) | ||
| 168 | args = args[1:] | ||
| 169 | |||
| 170 | projects = self.GetProjects(args) | ||
| 171 | 169 | ||
| 172 | full_name = False | 170 | return (project, p.Wait(), p.stdout, p.stderr) |
| 173 | if len(projects) > 1: | ||
| 174 | cmd_argv.append('--full-name') | ||
| 175 | full_name = True | ||
| 176 | |||
| 177 | have_rev = False | ||
| 178 | if opt.revision: | ||
| 179 | if '--cached' in cmd_argv: | ||
| 180 | print('fatal: cannot combine --cached and --revision', file=sys.stderr) | ||
| 181 | sys.exit(1) | ||
| 182 | have_rev = True | ||
| 183 | cmd_argv.extend(opt.revision) | ||
| 184 | cmd_argv.append('--') | ||
| 185 | 171 | ||
| 172 | @staticmethod | ||
| 173 | def _ProcessResults(out, full_name, have_rev, results): | ||
| 186 | git_failed = False | 174 | git_failed = False |
| 187 | bad_rev = False | 175 | bad_rev = False |
| 188 | have_match = False | 176 | have_match = False |
| 189 | 177 | ||
| 190 | for project in projects: | 178 | for project, rc, stdout, stderr in results: |
| 191 | try: | 179 | if rc < 0: |
| 192 | p = GitCommand(project, | ||
| 193 | cmd_argv, | ||
| 194 | bare=False, | ||
| 195 | capture_stdout=True, | ||
| 196 | capture_stderr=True) | ||
| 197 | except GitError as e: | ||
| 198 | git_failed = True | 180 | git_failed = True |
| 199 | out.project('--- project %s ---' % project.relpath) | 181 | out.project('--- project %s ---' % project.relpath) |
| 200 | out.nl() | 182 | out.nl() |
| 201 | out.fail('%s', str(e)) | 183 | out.fail('%s', stderr) |
| 202 | out.nl() | 184 | out.nl() |
| 203 | continue | 185 | continue |
| 204 | 186 | ||
| 205 | if p.Wait() != 0: | 187 | if rc: |
| 206 | # no results | 188 | # no results |
| 207 | # | 189 | if stderr: |
| 208 | if p.stderr: | 190 | if have_rev and 'fatal: ambiguous argument' in stderr: |
| 209 | if have_rev and 'fatal: ambiguous argument' in p.stderr: | ||
| 210 | bad_rev = True | 191 | bad_rev = True |
| 211 | else: | 192 | else: |
| 212 | out.project('--- project %s ---' % project.relpath) | 193 | out.project('--- project %s ---' % project.relpath) |
| 213 | out.nl() | 194 | out.nl() |
| 214 | out.fail('%s', p.stderr.strip()) | 195 | out.fail('%s', stderr.strip()) |
| 215 | out.nl() | 196 | out.nl() |
| 216 | continue | 197 | continue |
| 217 | have_match = True | 198 | have_match = True |
| 218 | 199 | ||
| 219 | # We cut the last element, to avoid a blank line. | 200 | # We cut the last element, to avoid a blank line. |
| 220 | # | 201 | r = stdout.split('\n') |
| 221 | r = p.stdout.split('\n') | ||
| 222 | r = r[0:-1] | 202 | r = r[0:-1] |
| 223 | 203 | ||
| 224 | if have_rev and full_name: | 204 | if have_rev and full_name: |
| @@ -240,6 +220,52 @@ contain a line that matches both expressions: | |||
| 240 | for line in r: | 220 | for line in r: |
| 241 | print(line) | 221 | print(line) |
| 242 | 222 | ||
| 223 | return (git_failed, bad_rev, have_match) | ||
| 224 | |||
| 225 | def Execute(self, opt, args): | ||
| 226 | out = GrepColoring(self.manifest.manifestProject.config) | ||
| 227 | |||
| 228 | cmd_argv = ['grep'] | ||
| 229 | if out.is_on: | ||
| 230 | cmd_argv.append('--color') | ||
| 231 | cmd_argv.extend(getattr(opt, 'cmd_argv', [])) | ||
| 232 | |||
| 233 | if '-e' not in cmd_argv: | ||
| 234 | if not args: | ||
| 235 | self.Usage() | ||
| 236 | cmd_argv.append('-e') | ||
| 237 | cmd_argv.append(args[0]) | ||
| 238 | args = args[1:] | ||
| 239 | |||
| 240 | projects = self.GetProjects(args) | ||
| 241 | |||
| 242 | full_name = False | ||
| 243 | if len(projects) > 1: | ||
| 244 | cmd_argv.append('--full-name') | ||
| 245 | full_name = True | ||
| 246 | |||
| 247 | have_rev = False | ||
| 248 | if opt.revision: | ||
| 249 | if '--cached' in cmd_argv: | ||
| 250 | print('fatal: cannot combine --cached and --revision', file=sys.stderr) | ||
| 251 | sys.exit(1) | ||
| 252 | have_rev = True | ||
| 253 | cmd_argv.extend(opt.revision) | ||
| 254 | cmd_argv.append('--') | ||
| 255 | |||
| 256 | process_results = functools.partial( | ||
| 257 | self._ProcessResults, out, full_name, have_rev) | ||
| 258 | # NB: Multiprocessing is heavy, so don't spin it up for one job. | ||
| 259 | if len(projects) == 1 or opt.jobs == 1: | ||
| 260 | git_failed, bad_rev, have_match = process_results( | ||
| 261 | self._ExecuteOne(cmd_argv, x) for x in projects) | ||
| 262 | else: | ||
| 263 | with multiprocessing.Pool(opt.jobs) as pool: | ||
| 264 | results = pool.imap( | ||
| 265 | functools.partial(self._ExecuteOne, cmd_argv), projects, | ||
| 266 | chunksize=WORKER_BATCH_SIZE) | ||
| 267 | git_failed, bad_rev, have_match = process_results(results) | ||
| 268 | |||
| 243 | if git_failed: | 269 | if git_failed: |
| 244 | sys.exit(1) | 270 | sys.exit(1) |
| 245 | elif have_match: | 271 | elif have_match: |
