diff options
Diffstat (limited to 'meta/recipes-extended/grep/grep-2.5.1a/grep-egrep-fgrep-Fix-LSB-NG-cases.patch')
-rw-r--r-- | meta/recipes-extended/grep/grep-2.5.1a/grep-egrep-fgrep-Fix-LSB-NG-cases.patch | 1342 |
1 files changed, 1342 insertions, 0 deletions
diff --git a/meta/recipes-extended/grep/grep-2.5.1a/grep-egrep-fgrep-Fix-LSB-NG-cases.patch b/meta/recipes-extended/grep/grep-2.5.1a/grep-egrep-fgrep-Fix-LSB-NG-cases.patch new file mode 100644 index 0000000000..327ee56402 --- /dev/null +++ b/meta/recipes-extended/grep/grep-2.5.1a/grep-egrep-fgrep-Fix-LSB-NG-cases.patch | |||
@@ -0,0 +1,1342 @@ | |||
1 | From c884dd12ec062569335702848fc5f29f436c28fa Mon Sep 17 00:00:00 2001 | ||
2 | From: Li xin <lixin.fnst@cn.fujitsu.com> | ||
3 | Date: Mon, 25 May 2015 10:15:57 +0900 | ||
4 | Subject: [PATCH] grep egrep fgrep: Fix LSB NG cases. | ||
5 | |||
6 | The LSB core test requires that grep, egrep and fgrep | ||
7 | perform pattern matching without regard to case when | ||
8 | the -i option is specified. | ||
9 | |||
10 | Upstream-Status: Backport | ||
11 | |||
12 | Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com> | ||
13 | --- | ||
14 | lib/posix/regex.h | 4 + | ||
15 | src/dfa.c | 22 +- | ||
16 | src/grep.c | 96 ++++--- | ||
17 | src/search.c | 833 +++++++++++++++++++++++++++++++++++++++++++++--------- | ||
18 | 4 files changed, 768 insertions(+), 187 deletions(-) | ||
19 | |||
20 | diff --git a/lib/posix/regex.h b/lib/posix/regex.h | ||
21 | index 63c2fef..7bb2b0e 100644 | ||
22 | --- a/lib/posix/regex.h | ||
23 | +++ b/lib/posix/regex.h | ||
24 | @@ -109,6 +109,10 @@ typedef unsigned long int reg_syntax_t; | ||
25 | If not set, \{, \}, {, and } are literals. */ | ||
26 | #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) | ||
27 | |||
28 | +/* If this bit is set, then ignore case when matching. | ||
29 | + If not set, then case is significant. */ | ||
30 | +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) | ||
31 | + | ||
32 | /* If this bit is set, +, ? and | aren't recognized as operators. | ||
33 | If not set, they are. */ | ||
34 | #define RE_LIMITED_OPS (RE_INTERVALS << 1) | ||
35 | diff --git a/src/dfa.c b/src/dfa.c | ||
36 | index 590bfa7..27c876a 100644 | ||
37 | --- a/src/dfa.c | ||
38 | +++ b/src/dfa.c | ||
39 | @@ -414,7 +414,7 @@ update_mb_len_index (unsigned char const *p, int len) | ||
40 | |||
41 | /* This function fetch a wide character, and update cur_mb_len, | ||
42 | used only if the current locale is a multibyte environment. */ | ||
43 | -static wchar_t | ||
44 | +static wint_t | ||
45 | fetch_wc (char const *eoferr) | ||
46 | { | ||
47 | wchar_t wc; | ||
48 | @@ -423,7 +423,7 @@ fetch_wc (char const *eoferr) | ||
49 | if (eoferr != 0) | ||
50 | dfaerror (eoferr); | ||
51 | else | ||
52 | - return -1; | ||
53 | + return WEOF; | ||
54 | } | ||
55 | |||
56 | cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); | ||
57 | @@ -459,7 +459,7 @@ fetch_wc (char const *eoferr) | ||
58 | static void | ||
59 | parse_bracket_exp_mb () | ||
60 | { | ||
61 | - wchar_t wc, wc1, wc2; | ||
62 | + wint_t wc, wc1, wc2; | ||
63 | |||
64 | /* Work area to build a mb_char_classes. */ | ||
65 | struct mb_char_classes *work_mbc; | ||
66 | @@ -496,7 +496,7 @@ parse_bracket_exp_mb () | ||
67 | work_mbc->invert = 0; | ||
68 | do | ||
69 | { | ||
70 | - wc1 = -1; /* mark wc1 is not initialized". */ | ||
71 | + wc1 = WEOF; /* Mark wc1 as not initialized. */ | ||
72 | |||
73 | /* Note that if we're looking at some other [:...:] construct, | ||
74 | we just treat it as a bunch of ordinary characters. We can do | ||
75 | @@ -586,7 +586,7 @@ parse_bracket_exp_mb () | ||
76 | work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; | ||
77 | } | ||
78 | } | ||
79 | - wc = -1; | ||
80 | + wc1 = wc = WEOF; | ||
81 | } | ||
82 | else | ||
83 | /* We treat '[' as a normal character here. */ | ||
84 | @@ -600,7 +600,7 @@ parse_bracket_exp_mb () | ||
85 | wc = fetch_wc(("Unbalanced [")); | ||
86 | } | ||
87 | |||
88 | - if (wc1 == -1) | ||
89 | + if (wc1 == WEOF) | ||
90 | wc1 = fetch_wc(_("Unbalanced [")); | ||
91 | |||
92 | if (wc1 == L'-') | ||
93 | @@ -630,17 +630,17 @@ parse_bracket_exp_mb () | ||
94 | } | ||
95 | REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, | ||
96 | range_sts_al, work_mbc->nranges + 1); | ||
97 | - work_mbc->range_sts[work_mbc->nranges] = wc; | ||
98 | + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; | ||
99 | REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, | ||
100 | range_ends_al, work_mbc->nranges + 1); | ||
101 | - work_mbc->range_ends[work_mbc->nranges++] = wc2; | ||
102 | + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; | ||
103 | } | ||
104 | - else if (wc != -1) | ||
105 | + else if (wc != WEOF) | ||
106 | /* build normal characters. */ | ||
107 | { | ||
108 | REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, | ||
109 | work_mbc->nchars + 1); | ||
110 | - work_mbc->chars[work_mbc->nchars++] = wc; | ||
111 | + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; | ||
112 | } | ||
113 | } | ||
114 | while ((wc = wc1) != L']'); | ||
115 | @@ -2552,6 +2552,8 @@ match_mb_charset (struct dfa *d, int s, position pos, int index) | ||
116 | } | ||
117 | |||
118 | /* match with a character? */ | ||
119 | + if (case_fold) | ||
120 | + wc = towlower (wc); | ||
121 | for (i = 0; i<work_mbc->nchars; i++) | ||
122 | { | ||
123 | if (wc == work_mbc->chars[i]) | ||
124 | diff --git a/src/grep.c b/src/grep.c | ||
125 | index 2fb2fac..3fd4b47 100644 | ||
126 | --- a/src/grep.c | ||
127 | +++ b/src/grep.c | ||
128 | @@ -30,6 +30,12 @@ | ||
129 | # include <sys/time.h> | ||
130 | # include <sys/resource.h> | ||
131 | #endif | ||
132 | +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC | ||
133 | +/* We can handle multibyte string. */ | ||
134 | +# define MBS_SUPPORT | ||
135 | +# include <wchar.h> | ||
136 | +# include <wctype.h> | ||
137 | +#endif | ||
138 | #include <stdio.h> | ||
139 | #include "system.h" | ||
140 | #include "getopt.h" | ||
141 | @@ -255,19 +261,6 @@ reset (int fd, char const *file, struct stats *stats) | ||
142 | bufbeg[-1] = eolbyte; | ||
143 | bufdesc = fd; | ||
144 | |||
145 | - if (fstat (fd, &stats->stat) != 0) | ||
146 | - { | ||
147 | - error (0, errno, "fstat"); | ||
148 | - return 0; | ||
149 | - } | ||
150 | - if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) | ||
151 | - return 0; | ||
152 | -#ifndef DJGPP | ||
153 | - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode))) | ||
154 | -#else | ||
155 | - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) | ||
156 | -#endif | ||
157 | - return 0; | ||
158 | if (S_ISREG (stats->stat.st_mode)) | ||
159 | { | ||
160 | if (file) | ||
161 | @@ -558,33 +551,6 @@ prline (char const *beg, char const *lim, int sep) | ||
162 | { | ||
163 | size_t match_size; | ||
164 | size_t match_offset; | ||
165 | - if(match_icase) | ||
166 | - { | ||
167 | - /* Yuck, this is tricky */ | ||
168 | - char *buf = (char*) xmalloc (lim - beg); | ||
169 | - char *ibeg = buf; | ||
170 | - char *ilim = ibeg + (lim - beg); | ||
171 | - int i; | ||
172 | - for (i = 0; i < lim - beg; i++) | ||
173 | - ibeg[i] = tolower (beg[i]); | ||
174 | - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) | ||
175 | - != (size_t) -1) | ||
176 | - { | ||
177 | - char const *b = beg + match_offset; | ||
178 | - if (b == lim) | ||
179 | - break; | ||
180 | - fwrite (beg, sizeof (char), match_offset, stdout); | ||
181 | - printf ("\33[%sm", grep_color); | ||
182 | - fwrite (b, sizeof (char), match_size, stdout); | ||
183 | - fputs ("\33[00m", stdout); | ||
184 | - beg = b + match_size; | ||
185 | - ibeg = ibeg + match_offset + match_size; | ||
186 | - } | ||
187 | - fwrite (beg, 1, lim - beg, stdout); | ||
188 | - free (buf); | ||
189 | - lastout = lim; | ||
190 | - return; | ||
191 | - } | ||
192 | while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) | ||
193 | != (size_t) -1) | ||
194 | { | ||
195 | @@ -601,6 +567,7 @@ prline (char const *beg, char const *lim, int sep) | ||
196 | fputs ("\33[00m", stdout); | ||
197 | beg = b + match_size; | ||
198 | } | ||
199 | + fputs ("\33[K", stdout); | ||
200 | } | ||
201 | fwrite (beg, 1, lim - beg, stdout); | ||
202 | if (ferror (stdout)) | ||
203 | @@ -623,7 +590,7 @@ prpending (char const *lim) | ||
204 | size_t match_size; | ||
205 | --pending; | ||
206 | if (outleft | ||
207 | - || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1) | ||
208 | + || (((*execute) (lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1) | ||
209 | == !out_invert)) | ||
210 | prline (lastout, nl + 1, '-'); | ||
211 | else | ||
212 | @@ -895,6 +862,19 @@ grepfile (char const *file, struct stats *stats) | ||
213 | } | ||
214 | else | ||
215 | { | ||
216 | + if (stat (file, &stats->stat) != 0) | ||
217 | + { | ||
218 | + suppressible_error (file, errno); | ||
219 | + return 1; | ||
220 | + } | ||
221 | + if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) | ||
222 | + return 1; | ||
223 | +#ifndef DJGPP | ||
224 | + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode) || S_ISFIFO(stats->stat.st_mode))) | ||
225 | +#else | ||
226 | + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) | ||
227 | +#endif | ||
228 | + return 1; | ||
229 | while ((desc = open (file, O_RDONLY)) < 0 && errno == EINTR) | ||
230 | continue; | ||
231 | |||
232 | @@ -1681,9 +1661,6 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) | ||
233 | out_invert ^= 1; | ||
234 | match_lines = match_words = 0; | ||
235 | } | ||
236 | - else | ||
237 | - /* Strip trailing newline. */ | ||
238 | - --keycc; | ||
239 | } | ||
240 | else | ||
241 | if (optind < argc) | ||
242 | @@ -1697,6 +1674,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) | ||
243 | if (!install_matcher (matcher) && !install_matcher ("default")) | ||
244 | abort (); | ||
245 | |||
246 | +#ifdef MBS_SUPPORT | ||
247 | + if (MB_CUR_MAX != 1 && match_icase) | ||
248 | + { | ||
249 | + wchar_t wc; | ||
250 | + mbstate_t cur_state, prev_state; | ||
251 | + int i, len = strlen(keys); | ||
252 | + | ||
253 | + memset(&cur_state, 0, sizeof(mbstate_t)); | ||
254 | + for (i = 0; i <= len ;) | ||
255 | + { | ||
256 | + size_t mbclen; | ||
257 | + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); | ||
258 | + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | ||
259 | + { | ||
260 | + /* An invalid sequence, or a truncated multibyte character. | ||
261 | + We treat it as a singlebyte character. */ | ||
262 | + mbclen = 1; | ||
263 | + } | ||
264 | + else | ||
265 | + { | ||
266 | + if (iswupper((wint_t)wc)) | ||
267 | + { | ||
268 | + wc = towlower((wint_t)wc); | ||
269 | + wcrtomb(keys + i, wc, &cur_state); | ||
270 | + } | ||
271 | + } | ||
272 | + i += mbclen; | ||
273 | + } | ||
274 | + } | ||
275 | +#endif /* MBS_SUPPORT */ | ||
276 | + | ||
277 | (*compile)(keys, keycc); | ||
278 | |||
279 | if ((argc - optind > 1 && !no_filenames) || with_filenames) | ||
280 | diff --git a/src/search.c b/src/search.c | ||
281 | index 7bd233f..3c6a485 100644 | ||
282 | --- a/src/search.c | ||
283 | +++ b/src/search.c | ||
284 | @@ -18,9 +18,13 @@ | ||
285 | |||
286 | /* Written August 1992 by Mike Haertel. */ | ||
287 | |||
288 | +#ifndef _GNU_SOURCE | ||
289 | +# define _GNU_SOURCE 1 | ||
290 | +#endif | ||
291 | #ifdef HAVE_CONFIG_H | ||
292 | # include <config.h> | ||
293 | #endif | ||
294 | +#include <assert.h> | ||
295 | #include <sys/types.h> | ||
296 | #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC | ||
297 | /* We can handle multibyte string. */ | ||
298 | @@ -31,7 +35,7 @@ | ||
299 | |||
300 | #include "system.h" | ||
301 | #include "grep.h" | ||
302 | -#include "regex.h" | ||
303 | +#include <regex.h> | ||
304 | #include "dfa.h" | ||
305 | #include "kwset.h" | ||
306 | #include "error.h" | ||
307 | @@ -39,6 +43,9 @@ | ||
308 | #ifdef HAVE_LIBPCRE | ||
309 | # include <pcre.h> | ||
310 | #endif | ||
311 | +#ifdef HAVE_LANGINFO_CODESET | ||
312 | +# include <langinfo.h> | ||
313 | +#endif | ||
314 | |||
315 | #define NCHAR (UCHAR_MAX + 1) | ||
316 | |||
317 | @@ -70,9 +77,10 @@ static kwset_t kwset; | ||
318 | call the regexp matcher at all. */ | ||
319 | static int kwset_exact_matches; | ||
320 | |||
321 | -#if defined(MBS_SUPPORT) | ||
322 | -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); | ||
323 | -#endif | ||
324 | +/* UTF-8 encoding allows some optimizations that we can't otherwise | ||
325 | + assume in a multibyte encoding. */ | ||
326 | +static int using_utf8; | ||
327 | + | ||
328 | static void kwsinit PARAMS ((void)); | ||
329 | static void kwsmusts PARAMS ((void)); | ||
330 | static void Gcompile PARAMS ((char const *, size_t)); | ||
331 | @@ -84,6 +92,15 @@ static void Pcompile PARAMS ((char const *, size_t )); | ||
332 | static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); | ||
333 | |||
334 | void | ||
335 | +check_utf8 (void) | ||
336 | +{ | ||
337 | +#ifdef HAVE_LANGINFO_CODESET | ||
338 | + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) | ||
339 | + using_utf8 = 1; | ||
340 | +#endif | ||
341 | +} | ||
342 | + | ||
343 | +void | ||
344 | dfaerror (char const *mesg) | ||
345 | { | ||
346 | error (2, 0, mesg); | ||
347 | @@ -141,38 +158,6 @@ kwsmusts (void) | ||
348 | } | ||
349 | } | ||
350 | |||
351 | -#ifdef MBS_SUPPORT | ||
352 | -/* This function allocate the array which correspond to "buf". | ||
353 | - Then this check multibyte string and mark on the positions which | ||
354 | - are not singlebyte character nor the first byte of a multibyte | ||
355 | - character. Caller must free the array. */ | ||
356 | -static char* | ||
357 | -check_multibyte_string(char const *buf, size_t size) | ||
358 | -{ | ||
359 | - char *mb_properties = malloc(size); | ||
360 | - mbstate_t cur_state; | ||
361 | - int i; | ||
362 | - memset(&cur_state, 0, sizeof(mbstate_t)); | ||
363 | - memset(mb_properties, 0, sizeof(char)*size); | ||
364 | - for (i = 0; i < size ;) | ||
365 | - { | ||
366 | - size_t mbclen; | ||
367 | - mbclen = mbrlen(buf + i, size - i, &cur_state); | ||
368 | - | ||
369 | - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | ||
370 | - { | ||
371 | - /* An invalid sequence, or a truncated multibyte character. | ||
372 | - We treat it as a singlebyte character. */ | ||
373 | - mbclen = 1; | ||
374 | - } | ||
375 | - mb_properties[i] = mbclen; | ||
376 | - i += mbclen; | ||
377 | - } | ||
378 | - | ||
379 | - return mb_properties; | ||
380 | -} | ||
381 | -#endif | ||
382 | - | ||
383 | static void | ||
384 | Gcompile (char const *pattern, size_t size) | ||
385 | { | ||
386 | @@ -181,7 +166,8 @@ Gcompile (char const *pattern, size_t size) | ||
387 | size_t total = size; | ||
388 | char const *motif = pattern; | ||
389 | |||
390 | - re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); | ||
391 | + check_utf8 (); | ||
392 | + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); | ||
393 | dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); | ||
394 | |||
395 | /* For GNU regex compiler we have to pass the patterns separately to detect | ||
396 | @@ -218,6 +204,10 @@ Gcompile (char const *pattern, size_t size) | ||
397 | motif = sep; | ||
398 | } while (sep && total != 0); | ||
399 | |||
400 | + /* Strip trailing newline. */ | ||
401 | + if (size && pattern[size - 1] == '\n') | ||
402 | + size--; | ||
403 | + | ||
404 | /* In the match_words and match_lines cases, we use a different pattern | ||
405 | for the DFA matcher that will quickly throw out cases that won't work. | ||
406 | Then if DFA succeeds we do some hairy stuff using the regex matcher | ||
407 | @@ -233,7 +223,7 @@ Gcompile (char const *pattern, size_t size) | ||
408 | static char const line_end[] = "\\)$"; | ||
409 | static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; | ||
410 | static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; | ||
411 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | ||
412 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); | ||
413 | size_t i; | ||
414 | strcpy (n, match_lines ? line_beg : word_beg); | ||
415 | i = strlen (n); | ||
416 | @@ -257,14 +247,15 @@ Ecompile (char const *pattern, size_t size) | ||
417 | size_t total = size; | ||
418 | char const *motif = pattern; | ||
419 | |||
420 | + check_utf8 (); | ||
421 | if (strcmp (matcher, "awk") == 0) | ||
422 | { | ||
423 | - re_set_syntax (RE_SYNTAX_AWK); | ||
424 | + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); | ||
425 | dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); | ||
426 | } | ||
427 | else | ||
428 | { | ||
429 | - re_set_syntax (RE_SYNTAX_POSIX_EGREP); | ||
430 | + re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); | ||
431 | dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); | ||
432 | } | ||
433 | |||
434 | @@ -301,6 +292,10 @@ Ecompile (char const *pattern, size_t size) | ||
435 | motif = sep; | ||
436 | } while (sep && total != 0); | ||
437 | |||
438 | + /* Strip trailing newline. */ | ||
439 | + if (size && pattern[size - 1] == '\n') | ||
440 | + size--; | ||
441 | + | ||
442 | /* In the match_words and match_lines cases, we use a different pattern | ||
443 | for the DFA matcher that will quickly throw out cases that won't work. | ||
444 | Then if DFA succeeds we do some hairy stuff using the regex matcher | ||
445 | @@ -316,7 +311,7 @@ Ecompile (char const *pattern, size_t size) | ||
446 | static char const line_end[] = ")$"; | ||
447 | static char const word_beg[] = "(^|[^[:alnum:]_])("; | ||
448 | static char const word_end[] = ")([^[:alnum:]_]|$)"; | ||
449 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); | ||
450 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); | ||
451 | size_t i; | ||
452 | strcpy (n, match_lines ? line_beg : word_beg); | ||
453 | i = strlen(n); | ||
454 | @@ -339,15 +334,34 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
455 | char eol = eolbyte; | ||
456 | int backref, start, len; | ||
457 | struct kwsmatch kwsm; | ||
458 | - size_t i; | ||
459 | + size_t i, ret_val; | ||
460 | + static int use_dfa; | ||
461 | + static int use_dfa_checked = 0; | ||
462 | #ifdef MBS_SUPPORT | ||
463 | - char *mb_properties = NULL; | ||
464 | + int mb_cur_max = MB_CUR_MAX; | ||
465 | + mbstate_t mbs; | ||
466 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
467 | #endif /* MBS_SUPPORT */ | ||
468 | |||
469 | + if (!use_dfa_checked) | ||
470 | + { | ||
471 | + char *grep_use_dfa = getenv ("GREP_USE_DFA"); | ||
472 | + if (!grep_use_dfa) | ||
473 | + { | ||
474 | #ifdef MBS_SUPPORT | ||
475 | - if (MB_CUR_MAX > 1 && kwset) | ||
476 | - mb_properties = check_multibyte_string(buf, size); | ||
477 | + /* Turn off DFA when processing multibyte input. */ | ||
478 | + use_dfa = (MB_CUR_MAX == 1); | ||
479 | +#else | ||
480 | + use_dfa = 1; | ||
481 | #endif /* MBS_SUPPORT */ | ||
482 | + } | ||
483 | + else | ||
484 | + { | ||
485 | + use_dfa = atoi (grep_use_dfa); | ||
486 | + } | ||
487 | + | ||
488 | + use_dfa_checked = 1; | ||
489 | + } | ||
490 | |||
491 | buflim = buf + size; | ||
492 | |||
493 | @@ -358,47 +372,120 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
494 | if (kwset) | ||
495 | { | ||
496 | /* Find a possible match using the KWset matcher. */ | ||
497 | - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | ||
498 | +#ifdef MBS_SUPPORT | ||
499 | + size_t bytes_left = 0; | ||
500 | +#endif /* MBS_SUPPORT */ | ||
501 | + size_t offset; | ||
502 | +#ifdef MBS_SUPPORT | ||
503 | + /* kwsexec doesn't work with match_icase and multibyte input. */ | ||
504 | + if (match_icase && mb_cur_max > 1) | ||
505 | + /* Avoid kwset */ | ||
506 | + offset = 0; | ||
507 | + else | ||
508 | +#endif /* MBS_SUPPORT */ | ||
509 | + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | ||
510 | if (offset == (size_t) -1) | ||
511 | - { | ||
512 | + goto failure; | ||
513 | #ifdef MBS_SUPPORT | ||
514 | - if (MB_CUR_MAX > 1) | ||
515 | - free(mb_properties); | ||
516 | -#endif | ||
517 | - return (size_t)-1; | ||
518 | + if (mb_cur_max > 1 && !using_utf8) | ||
519 | + { | ||
520 | + bytes_left = offset; | ||
521 | + while (bytes_left) | ||
522 | + { | ||
523 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | ||
524 | + if (mlen == (size_t) -1 || mlen == 0) | ||
525 | + { | ||
526 | + /* Invalid sequence or null character: treat as single-byte. */ | ||
527 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
528 | + beg++; | ||
529 | + bytes_left--; | ||
530 | + continue; | ||
531 | + } | ||
532 | + | ||
533 | + if (mlen == (size_t) -2) | ||
534 | + /* Offset points inside multibyte character: | ||
535 | + * no good. */ | ||
536 | + break; | ||
537 | + | ||
538 | + beg += mlen; | ||
539 | + bytes_left -= mlen; | ||
540 | + } | ||
541 | } | ||
542 | + else | ||
543 | +#endif /* MBS_SUPPORT */ | ||
544 | beg += offset; | ||
545 | /* Narrow down to the line containing the candidate, and | ||
546 | run it through DFA. */ | ||
547 | end = memchr(beg, eol, buflim - beg); | ||
548 | end++; | ||
549 | #ifdef MBS_SUPPORT | ||
550 | - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) | ||
551 | + if (mb_cur_max > 1 && bytes_left) | ||
552 | continue; | ||
553 | -#endif | ||
554 | +#endif /* MBS_SUPPORT */ | ||
555 | while (beg > buf && beg[-1] != eol) | ||
556 | --beg; | ||
557 | - if (kwsm.index < kwset_exact_matches) | ||
558 | - goto success; | ||
559 | - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | ||
560 | + if ( | ||
561 | +#ifdef MBS_SUPPORT | ||
562 | + !(match_icase && mb_cur_max > 1) && | ||
563 | +#endif /* MBS_SUPPORT */ | ||
564 | + (kwsm.index < kwset_exact_matches)) | ||
565 | + goto success_in_beg_and_end; | ||
566 | + if (use_dfa && | ||
567 | + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | ||
568 | continue; | ||
569 | } | ||
570 | else | ||
571 | { | ||
572 | /* No good fixed strings; start with DFA. */ | ||
573 | - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); | ||
574 | +#ifdef MBS_SUPPORT | ||
575 | + size_t bytes_left = 0; | ||
576 | +#endif /* MBS_SUPPORT */ | ||
577 | + size_t offset = 0; | ||
578 | + if (use_dfa) | ||
579 | + offset = dfaexec (&dfa, beg, buflim - beg, &backref); | ||
580 | if (offset == (size_t) -1) | ||
581 | break; | ||
582 | /* Narrow down to the line we've found. */ | ||
583 | +#ifdef MBS_SUPPORT | ||
584 | + if (mb_cur_max > 1 && !using_utf8) | ||
585 | + { | ||
586 | + bytes_left = offset; | ||
587 | + while (bytes_left) | ||
588 | + { | ||
589 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | ||
590 | + if (mlen == (size_t) -1 || mlen == 0) | ||
591 | + { | ||
592 | + /* Invalid sequence or null character: treat as single-byte. */ | ||
593 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
594 | + beg++; | ||
595 | + bytes_left--; | ||
596 | + continue; | ||
597 | + } | ||
598 | + | ||
599 | + if (mlen == (size_t) -2) | ||
600 | + /* Offset points inside multibyte character: | ||
601 | + * no good. */ | ||
602 | + break; | ||
603 | + | ||
604 | + beg += mlen; | ||
605 | + bytes_left -= mlen; | ||
606 | + } | ||
607 | + } | ||
608 | + else | ||
609 | +#endif /* MBS_SUPPORT */ | ||
610 | beg += offset; | ||
611 | end = memchr (beg, eol, buflim - beg); | ||
612 | end++; | ||
613 | +#ifdef MBS_SUPPORT | ||
614 | + if (mb_cur_max > 1 && bytes_left) | ||
615 | + continue; | ||
616 | +#endif /* MBS_SUPPORT */ | ||
617 | while (beg > buf && beg[-1] != eol) | ||
618 | --beg; | ||
619 | } | ||
620 | /* Successful, no backreferences encountered! */ | ||
621 | - if (!backref) | ||
622 | - goto success; | ||
623 | + if (use_dfa && !backref) | ||
624 | + goto success_in_beg_and_end; | ||
625 | } | ||
626 | else | ||
627 | end = beg + size; | ||
628 | @@ -413,14 +500,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
629 | end - beg - 1, &(patterns[i].regs)))) | ||
630 | { | ||
631 | len = patterns[i].regs.end[0] - start; | ||
632 | - if (exact) | ||
633 | - { | ||
634 | - *match_size = len; | ||
635 | - return start; | ||
636 | - } | ||
637 | + if (exact && !match_words) | ||
638 | + goto success_in_start_and_len; | ||
639 | if ((!match_lines && !match_words) | ||
640 | || (match_lines && len == end - beg - 1)) | ||
641 | - goto success; | ||
642 | + goto success_in_beg_and_end; | ||
643 | /* If -w, check if the match aligns with word boundaries. | ||
644 | We do this iteratively because: | ||
645 | (a) the line may contain more than one occurence of the | ||
646 | @@ -431,10 +515,114 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
647 | if (match_words) | ||
648 | while (start >= 0) | ||
649 | { | ||
650 | - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) | ||
651 | - && (len == end - beg - 1 | ||
652 | - || !WCHAR ((unsigned char) beg[start + len]))) | ||
653 | - goto success; | ||
654 | + int lword_match = 0; | ||
655 | + if (start == 0) | ||
656 | + lword_match = 1; | ||
657 | + else | ||
658 | + { | ||
659 | + assert (start > 0); | ||
660 | +#ifdef MBS_SUPPORT | ||
661 | + if (mb_cur_max > 1) | ||
662 | + { | ||
663 | + const char *s; | ||
664 | + size_t mr; | ||
665 | + wchar_t pwc; | ||
666 | + | ||
667 | + /* Locate the start of the multibyte character | ||
668 | + before the match position (== beg + start). */ | ||
669 | + if (using_utf8) | ||
670 | + { | ||
671 | + /* UTF-8 is a special case: scan backwards | ||
672 | + until we find a 7-bit character or a | ||
673 | + lead byte. */ | ||
674 | + s = beg + start - 1; | ||
675 | + while (s > buf | ||
676 | + && (unsigned char) *s >= 0x80 | ||
677 | + && (unsigned char) *s <= 0xbf) | ||
678 | + --s; | ||
679 | + } | ||
680 | + else | ||
681 | + { | ||
682 | + /* Scan forwards to find the start of the | ||
683 | + last complete character before the | ||
684 | + match position. */ | ||
685 | + size_t bytes_left = start - 1; | ||
686 | + s = beg; | ||
687 | + while (bytes_left > 0) | ||
688 | + { | ||
689 | + mr = mbrlen (s, bytes_left, &mbs); | ||
690 | + if (mr == (size_t) -1 || mr == 0) | ||
691 | + { | ||
692 | + memset (&mbs, '\0', sizeof (mbs)); | ||
693 | + s++; | ||
694 | + bytes_left--; | ||
695 | + continue; | ||
696 | + } | ||
697 | + if (mr == (size_t) -2) | ||
698 | + { | ||
699 | + memset (&mbs, '\0', sizeof (mbs)); | ||
700 | + break; | ||
701 | + } | ||
702 | + s += mr; | ||
703 | + bytes_left -= mr; | ||
704 | + } | ||
705 | + } | ||
706 | + mr = mbrtowc (&pwc, s, beg + start - s, &mbs); | ||
707 | + if (mr == (size_t) -2 || mr == (size_t) -1 || | ||
708 | + mr == 0) | ||
709 | + { | ||
710 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
711 | + lword_match = 1; | ||
712 | + } | ||
713 | + else if (!(iswalnum (pwc) || pwc == L'_') | ||
714 | + && mr == beg + start - s) | ||
715 | + lword_match = 1; | ||
716 | + } | ||
717 | + else | ||
718 | +#endif /* MBS_SUPPORT */ | ||
719 | + if (!WCHAR ((unsigned char) beg[start - 1])) | ||
720 | + lword_match = 1; | ||
721 | + } | ||
722 | + | ||
723 | + if (lword_match) | ||
724 | + { | ||
725 | + int rword_match = 0; | ||
726 | + if (start + len == end - beg - 1) | ||
727 | + rword_match = 1; | ||
728 | + else | ||
729 | + { | ||
730 | +#ifdef MBS_SUPPORT | ||
731 | + if (mb_cur_max > 1) | ||
732 | + { | ||
733 | + wchar_t nwc; | ||
734 | + int mr; | ||
735 | + | ||
736 | + mr = mbtowc (&nwc, beg + start + len, | ||
737 | + end - beg - start - len - 1); | ||
738 | + if (mr <= 0) | ||
739 | + { | ||
740 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
741 | + rword_match = 1; | ||
742 | + } | ||
743 | + else if (!iswalnum (nwc) && nwc != L'_') | ||
744 | + rword_match = 1; | ||
745 | + } | ||
746 | + else | ||
747 | +#endif /* MBS_SUPPORT */ | ||
748 | + if (!WCHAR ((unsigned char) beg[start + len])) | ||
749 | + rword_match = 1; | ||
750 | + } | ||
751 | + | ||
752 | + if (rword_match) | ||
753 | + { | ||
754 | + if (!exact) | ||
755 | + /* Returns the whole line. */ | ||
756 | + goto success_in_beg_and_end; | ||
757 | + else | ||
758 | + /* Returns just this word match. */ | ||
759 | + goto success_in_start_and_len; | ||
760 | + } | ||
761 | + } | ||
762 | if (len > 0) | ||
763 | { | ||
764 | /* Try a shorter length anchored at the same place. */ | ||
765 | @@ -461,26 +649,154 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
766 | } | ||
767 | } /* for Regex patterns. */ | ||
768 | } /* for (beg = end ..) */ | ||
769 | -#ifdef MBS_SUPPORT | ||
770 | - if (MB_CUR_MAX > 1 && mb_properties) | ||
771 | - free (mb_properties); | ||
772 | -#endif /* MBS_SUPPORT */ | ||
773 | + | ||
774 | + failure: | ||
775 | return (size_t) -1; | ||
776 | |||
777 | - success: | ||
778 | -#ifdef MBS_SUPPORT | ||
779 | - if (MB_CUR_MAX > 1 && mb_properties) | ||
780 | - free (mb_properties); | ||
781 | -#endif /* MBS_SUPPORT */ | ||
782 | - *match_size = end - beg; | ||
783 | - return beg - buf; | ||
784 | + success_in_beg_and_end: | ||
785 | + len = end - beg; | ||
786 | + start = beg - buf; | ||
787 | + /* FALLTHROUGH */ | ||
788 | + | ||
789 | + success_in_start_and_len: | ||
790 | + *match_size = len; | ||
791 | + return start; | ||
792 | } | ||
793 | |||
794 | +#ifdef MBS_SUPPORT | ||
795 | +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ | ||
796 | +static struct | ||
797 | +{ | ||
798 | + wchar_t **patterns; | ||
799 | + size_t count, maxlen; | ||
800 | + unsigned char *match; | ||
801 | +} Fimb; | ||
802 | +#endif | ||
803 | + | ||
804 | static void | ||
805 | Fcompile (char const *pattern, size_t size) | ||
806 | { | ||
807 | + int mb_cur_max = MB_CUR_MAX; | ||
808 | char const *beg, *lim, *err; | ||
809 | |||
810 | + check_utf8 (); | ||
811 | +#ifdef MBS_SUPPORT | ||
812 | + /* Support -F -i for UTF-8 input. */ | ||
813 | + if (match_icase && mb_cur_max > 1) | ||
814 | + { | ||
815 | + mbstate_t mbs; | ||
816 | + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); | ||
817 | + const char *patternend = pattern; | ||
818 | + size_t wcsize; | ||
819 | + kwset_t fimb_kwset = NULL; | ||
820 | + char *starts = NULL; | ||
821 | + wchar_t *wcbeg, *wclim; | ||
822 | + size_t allocated = 0; | ||
823 | + | ||
824 | + memset (&mbs, '\0', sizeof (mbs)); | ||
825 | +# ifdef __GNU_LIBRARY__ | ||
826 | + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); | ||
827 | + if (patternend != pattern + size) | ||
828 | + wcsize = (size_t) -1; | ||
829 | +# else | ||
830 | + { | ||
831 | + char *patterncopy = xmalloc (size + 1); | ||
832 | + | ||
833 | + memcpy (patterncopy, pattern, size); | ||
834 | + patterncopy[size] = '\0'; | ||
835 | + patternend = patterncopy; | ||
836 | + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); | ||
837 | + if (patternend != patterncopy + size) | ||
838 | + wcsize = (size_t) -1; | ||
839 | + free (patterncopy); | ||
840 | + } | ||
841 | +# endif | ||
842 | + if (wcsize + 2 <= 2) | ||
843 | + { | ||
844 | +fimb_fail: | ||
845 | + free (wcpattern); | ||
846 | + free (starts); | ||
847 | + if (fimb_kwset) | ||
848 | + kwsfree (fimb_kwset); | ||
849 | + free (Fimb.patterns); | ||
850 | + Fimb.patterns = NULL; | ||
851 | + } | ||
852 | + else | ||
853 | + { | ||
854 | + if (!(fimb_kwset = kwsalloc (NULL))) | ||
855 | + error (2, 0, _("memory exhausted")); | ||
856 | + | ||
857 | + starts = xmalloc (mb_cur_max * 3); | ||
858 | + wcbeg = wcpattern; | ||
859 | + do | ||
860 | + { | ||
861 | + int i; | ||
862 | + size_t wclen; | ||
863 | + | ||
864 | + if (Fimb.count >= allocated) | ||
865 | + { | ||
866 | + if (allocated == 0) | ||
867 | + allocated = 128; | ||
868 | + else | ||
869 | + allocated *= 2; | ||
870 | + Fimb.patterns = xrealloc (Fimb.patterns, | ||
871 | + sizeof (wchar_t *) * allocated); | ||
872 | + } | ||
873 | + Fimb.patterns[Fimb.count++] = wcbeg; | ||
874 | + for (wclim = wcbeg; | ||
875 | + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) | ||
876 | + *wclim = towlower (*wclim); | ||
877 | + *wclim = L'\0'; | ||
878 | + wclen = wclim - wcbeg; | ||
879 | + if (wclen > Fimb.maxlen) | ||
880 | + Fimb.maxlen = wclen; | ||
881 | + if (wclen > 3) | ||
882 | + wclen = 3; | ||
883 | + if (wclen == 0) | ||
884 | + { | ||
885 | + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) | ||
886 | + error (2, 0, err); | ||
887 | + } | ||
888 | + else | ||
889 | + for (i = 0; i < (1 << wclen); i++) | ||
890 | + { | ||
891 | + char *p = starts; | ||
892 | + int j, k; | ||
893 | + | ||
894 | + for (j = 0; j < wclen; ++j) | ||
895 | + { | ||
896 | + wchar_t wc = wcbeg[j]; | ||
897 | + if (i & (1 << j)) | ||
898 | + { | ||
899 | + wc = towupper (wc); | ||
900 | + if (wc == wcbeg[j]) | ||
901 | + continue; | ||
902 | + } | ||
903 | + k = wctomb (p, wc); | ||
904 | + if (k <= 0) | ||
905 | + goto fimb_fail; | ||
906 | + p += k; | ||
907 | + } | ||
908 | + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) | ||
909 | + error (2, 0, err); | ||
910 | + } | ||
911 | + if (wclim < wcpattern + wcsize) | ||
912 | + ++wclim; | ||
913 | + wcbeg = wclim; | ||
914 | + } | ||
915 | + while (wcbeg < wcpattern + wcsize); | ||
916 | + f_i_multibyte = 1; | ||
917 | + kwset = fimb_kwset; | ||
918 | + free (starts); | ||
919 | + Fimb.match = xmalloc (Fimb.count); | ||
920 | + if ((err = kwsprep (kwset)) != 0) | ||
921 | + error (2, 0, err); | ||
922 | + return; | ||
923 | + } | ||
924 | + } | ||
925 | +#endif /* MBS_SUPPORT */ | ||
926 | + | ||
927 | + | ||
928 | kwsinit (); | ||
929 | beg = pattern; | ||
930 | do | ||
931 | @@ -499,6 +815,76 @@ Fcompile (char const *pattern, size_t size) | ||
932 | error (2, 0, err); | ||
933 | } | ||
934 | |||
935 | +#ifdef MBS_SUPPORT | ||
936 | +static int | ||
937 | +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) /* Check whether any Fimb pattern matches, ignoring case, at the start of buf. */ | ||
938 | +{ /* Returns 0 and stores the matched byte length in *plen on a match; returns -1 when no pattern matches. */ | ||
939 | + size_t len, letter, i; /* len: bytes of buf consumed so far; letter: wide-character index into each pattern. */ | ||
940 | + int ret = -1; /* Stays -1 until some pattern has been matched completely. */ | ||
941 | + mbstate_t mbs; | ||
942 | + wchar_t wc; | ||
943 | + int patterns_left; /* Nonzero while at least one pattern is still a candidate. */ | ||
944 | + /* Only meaningful in the -F -i multibyte mode. */ | ||
945 | + assert (match_icase && f_i_multibyte == 1); | ||
946 | + assert (MB_CUR_MAX > 1); | ||
947 | + | ||
948 | + memset (&mbs, '\0', sizeof (mbs)); | ||
949 | + memset (Fimb.match, '\1', Fimb.count); /* Mark every pattern as still a candidate. */ | ||
950 | + letter = len = 0; | ||
951 | + patterns_left = 1; | ||
952 | + while (patterns_left && len <= size) /* len == size gets one extra pass so a pattern ending exactly at the buffer end is detected. */ | ||
953 | + { | ||
954 | + size_t c; /* Byte length of the character decoded in this pass. */ | ||
955 | + | ||
956 | + patterns_left = 0; /* Set again below if any candidate survives this character. */ | ||
957 | + if (len < size) | ||
958 | + { | ||
959 | + c = mbrtowc (&wc, buf + len, size - len, &mbs); | ||
960 | + if (c + 2 <= 2) /* True when c is 0, (size_t) -1 or (size_t) -2: stop and report what was found so far. */ | ||
961 | + return ret; | ||
962 | + | ||
963 | + wc = towlower (wc); /* Fcompile lowercases the patterns, so fold the input the same way. */ | ||
964 | + } | ||
965 | + else | ||
966 | + { | ||
967 | + c = 1; /* End of buffer: compare against a NUL wide character instead of real input. */ | ||
968 | + wc = L'\0'; | ||
969 | + } | ||
970 | + | ||
971 | + for (i = 0; i < Fimb.count; i++) | ||
972 | + { | ||
973 | + if (Fimb.match[i]) | ||
974 | + { | ||
975 | + if (Fimb.patterns[i][letter] == L'\0') | ||
976 | + { | ||
977 | + /* Reached the end of pattern i: found a match. */ | ||
978 | + *plen = len; | ||
979 | + if (!exact && !match_words) | ||
980 | + return 0; | ||
981 | + else | ||
982 | + { | ||
983 | + /* For -w or exact, keep going to look for the longest match. */ | ||
984 | + ret = 0; | ||
985 | + Fimb.match[i] = '\0'; /* Retire this pattern; its length is already recorded in *plen. */ | ||
986 | + continue; | ||
987 | + } | ||
988 | + } | ||
989 | + | ||
990 | + if (Fimb.patterns[i][letter] == wc) | ||
991 | + patterns_left = 1; | ||
992 | + else | ||
993 | + Fimb.match[i] = '\0'; /* Mismatch at this position: drop the pattern. */ | ||
994 | + } | ||
995 | + } | ||
996 | + | ||
997 | + len += c; | ||
998 | + letter++; | ||
999 | + } | ||
1000 | + | ||
1001 | + return ret; | ||
1002 | +} | ||
1003 | +#endif /* MBS_SUPPORT */ | ||
1004 | + | ||
1005 | static size_t | ||
1006 | Fexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
1007 | { | ||
1008 | @@ -506,88 +892,268 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
1009 | register size_t len; | ||
1010 | char eol = eolbyte; | ||
1011 | struct kwsmatch kwsmatch; | ||
1012 | + size_t ret_val; | ||
1013 | #ifdef MBS_SUPPORT | ||
1014 | - char *mb_properties; | ||
1015 | - if (MB_CUR_MAX > 1) | ||
1016 | - mb_properties = check_multibyte_string (buf, size); | ||
1017 | + int mb_cur_max = MB_CUR_MAX; | ||
1018 | + mbstate_t mbs; | ||
1019 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1020 | + const char *last_char = NULL; | ||
1021 | #endif /* MBS_SUPPORT */ | ||
1022 | |||
1023 | - for (beg = buf; beg <= buf + size; ++beg) | ||
1024 | + for (beg = buf; beg < buf + size; ++beg) | ||
1025 | { | ||
1026 | - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | ||
1027 | + size_t offset; | ||
1028 | + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | ||
1029 | + | ||
1030 | if (offset == (size_t) -1) | ||
1031 | - { | ||
1032 | + goto failure; | ||
1033 | #ifdef MBS_SUPPORT | ||
1034 | - if (MB_CUR_MAX > 1) | ||
1035 | - free(mb_properties); | ||
1036 | -#endif /* MBS_SUPPORT */ | ||
1037 | - return offset; | ||
1038 | + if (mb_cur_max > 1 && !using_utf8) | ||
1039 | + { | ||
1040 | + size_t bytes_left = offset; | ||
1041 | + while (bytes_left) | ||
1042 | + { | ||
1043 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | ||
1044 | + | ||
1045 | + last_char = beg; | ||
1046 | + if (mlen == (size_t) -1 || mlen == 0) | ||
1047 | + { | ||
1048 | + /* Invalid sequence or null byte: treat as single-byte. */ | ||
1049 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1050 | + beg++; | ||
1051 | + bytes_left--; | ||
1052 | + continue; | ||
1053 | + } | ||
1054 | + | ||
1055 | + if (mlen == (size_t) -2) | ||
1056 | + /* Offset points inside multibyte character: no good. */ | ||
1057 | + break; | ||
1058 | + | ||
1059 | + beg += mlen; | ||
1060 | + bytes_left -= mlen; | ||
1061 | + } | ||
1062 | + | ||
1063 | + if (bytes_left) | ||
1064 | + continue; | ||
1065 | } | ||
1066 | -#ifdef MBS_SUPPORT | ||
1067 | - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) | ||
1068 | - continue; /* It is a part of multibyte character. */ | ||
1069 | + else | ||
1070 | #endif /* MBS_SUPPORT */ | ||
1071 | beg += offset; | ||
1072 | - len = kwsmatch.size[0]; | ||
1073 | - if (exact) | ||
1074 | - { | ||
1075 | - *match_size = len; | ||
1076 | #ifdef MBS_SUPPORT | ||
1077 | - if (MB_CUR_MAX > 1) | ||
1078 | - free (mb_properties); | ||
1079 | + /* For f_i_multibyte, the string at beg now matches first 3 chars of | ||
1080 | + one of the search strings (less if there are shorter search strings). | ||
1081 | + See if this is a real match. */ | ||
1082 | + if (f_i_multibyte | ||
1083 | + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) | ||
1084 | + goto next_char; | ||
1085 | #endif /* MBS_SUPPORT */ | ||
1086 | - return beg - buf; | ||
1087 | - } | ||
1088 | + len = kwsmatch.size[0]; | ||
1089 | + if (exact && !match_words) | ||
1090 | + goto success_in_beg_and_len; | ||
1091 | if (match_lines) | ||
1092 | { | ||
1093 | if (beg > buf && beg[-1] != eol) | ||
1094 | - continue; | ||
1095 | + goto next_char; | ||
1096 | if (beg + len < buf + size && beg[len] != eol) | ||
1097 | - continue; | ||
1098 | + goto next_char; | ||
1099 | goto success; | ||
1100 | } | ||
1101 | else if (match_words) | ||
1102 | - for (try = beg; len; ) | ||
1103 | - { | ||
1104 | - if (try > buf && WCHAR((unsigned char) try[-1])) | ||
1105 | - break; | ||
1106 | - if (try + len < buf + size && WCHAR((unsigned char) try[len])) | ||
1107 | - { | ||
1108 | - offset = kwsexec (kwset, beg, --len, &kwsmatch); | ||
1109 | - if (offset == (size_t) -1) | ||
1110 | - { | ||
1111 | + { | ||
1112 | + while (len) | ||
1113 | + { | ||
1114 | + int word_match = 0; | ||
1115 | + if (beg > buf) | ||
1116 | + { | ||
1117 | #ifdef MBS_SUPPORT | ||
1118 | - if (MB_CUR_MAX > 1) | ||
1119 | - free (mb_properties); | ||
1120 | + if (mb_cur_max > 1) | ||
1121 | + { | ||
1122 | + const char *s; | ||
1123 | + int mr; | ||
1124 | + wchar_t pwc; | ||
1125 | + | ||
1126 | + if (using_utf8) | ||
1127 | + { | ||
1128 | + s = beg - 1; | ||
1129 | + while (s > buf | ||
1130 | + && (unsigned char) *s >= 0x80 | ||
1131 | + && (unsigned char) *s <= 0xbf) | ||
1132 | + --s; | ||
1133 | + } | ||
1134 | + else | ||
1135 | + s = last_char; | ||
1136 | + mr = mbtowc (&pwc, s, beg - s); | ||
1137 | + if (mr <= 0) | ||
1138 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1139 | + else if ((iswalnum (pwc) || pwc == L'_') | ||
1140 | + && mr == (int) (beg - s)) | ||
1141 | + goto next_char; | ||
1142 | + } | ||
1143 | + else | ||
1144 | #endif /* MBS_SUPPORT */ | ||
1145 | - return offset; | ||
1146 | - } | ||
1147 | - try = beg + offset; | ||
1148 | - len = kwsmatch.size[0]; | ||
1149 | - } | ||
1150 | - else | ||
1151 | - goto success; | ||
1152 | - } | ||
1153 | + if (WCHAR ((unsigned char) beg[-1])) | ||
1154 | + goto next_char; | ||
1155 | + } | ||
1156 | +#ifdef MBS_SUPPORT | ||
1157 | + if (mb_cur_max > 1) | ||
1158 | + { | ||
1159 | + wchar_t nwc; | ||
1160 | + int mr; | ||
1161 | + | ||
1162 | + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); | ||
1163 | + if (mr <= 0) | ||
1164 | + { | ||
1165 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1166 | + word_match = 1; | ||
1167 | + } | ||
1168 | + else if (!iswalnum (nwc) && nwc != L'_') | ||
1169 | + word_match = 1; | ||
1170 | + } | ||
1171 | + else | ||
1172 | +#endif /* MBS_SUPPORT */ | ||
1173 | + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) | ||
1174 | + word_match = 1; | ||
1175 | + if (word_match) | ||
1176 | + { | ||
1177 | + if (!exact) | ||
1178 | + /* Returns the whole line now we know there's a word match. */ | ||
1179 | + goto success; | ||
1180 | + else | ||
1181 | + /* Returns just this word match. */ | ||
1182 | + goto success_in_beg_and_len; | ||
1183 | + } | ||
1184 | + if (len > 0) | ||
1185 | + { | ||
1186 | + /* Try a shorter length anchored at the same place. */ | ||
1187 | + --len; | ||
1188 | + offset = kwsexec (kwset, beg, len, &kwsmatch); | ||
1189 | + | ||
1190 | + if (offset == -1) | ||
1191 | + goto next_char; /* Try a different anchor. */ | ||
1192 | +#ifdef MBS_SUPPORT | ||
1193 | + if (mb_cur_max > 1 && !using_utf8) | ||
1194 | + { | ||
1195 | + size_t bytes_left = offset; | ||
1196 | + while (bytes_left) | ||
1197 | + { | ||
1198 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | ||
1199 | + | ||
1200 | + last_char = beg; | ||
1201 | + if (mlen == (size_t) -1 || mlen == 0) | ||
1202 | + { | ||
1203 | + /* Invalid sequence or null byte: treat as single-byte. */ | ||
1204 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1205 | + beg++; | ||
1206 | + bytes_left--; | ||
1207 | + continue; | ||
1208 | + } | ||
1209 | + | ||
1210 | + if (mlen == (size_t) -2) | ||
1211 | + { | ||
1212 | + /* Offset points inside multibyte character: | ||
1213 | + * no good. */ | ||
1214 | + break; | ||
1215 | + } | ||
1216 | + | ||
1217 | + beg += mlen; | ||
1218 | + bytes_left -= mlen; | ||
1219 | + } | ||
1220 | + | ||
1221 | + if (bytes_left) | ||
1222 | + { | ||
1223 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1224 | + goto next_char; /* Try a different anchor. */ | ||
1225 | + } | ||
1226 | + } | ||
1227 | + else | ||
1228 | +#endif /* MBS_SUPPORT */ | ||
1229 | + beg += offset; | ||
1230 | +#ifdef MBS_SUPPORT | ||
1231 | + /* The string at beg now matches first 3 chars of one of | ||
1232 | + the search strings (less if there are shorter search | ||
1233 | + strings). See if this is a real match. */ | ||
1234 | + if (f_i_multibyte | ||
1235 | + && Fimbexec (beg, len - offset, &kwsmatch.size[0], | ||
1236 | + exact)) | ||
1237 | + goto next_char; | ||
1238 | +#endif /* MBS_SUPPORT */ | ||
1239 | + len = kwsmatch.size[0]; | ||
1240 | + } | ||
1241 | + } | ||
1242 | + } | ||
1243 | else | ||
1244 | goto success; | ||
1245 | - } | ||
1246 | - | ||
1247 | +next_char:; | ||
1248 | #ifdef MBS_SUPPORT | ||
1249 | - if (MB_CUR_MAX > 1) | ||
1250 | - free (mb_properties); | ||
1251 | + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled | ||
1252 | + by ++beg above. */ | ||
1253 | + if (mb_cur_max > 1) | ||
1254 | + { | ||
1255 | + if (using_utf8) | ||
1256 | + { | ||
1257 | + unsigned char c = *beg; | ||
1258 | + if (c >= 0xc2) | ||
1259 | + { | ||
1260 | + if (c < 0xe0) | ||
1261 | + ++beg; | ||
1262 | + else if (c < 0xf0) | ||
1263 | + beg += 2; | ||
1264 | + else if (c < 0xf8) | ||
1265 | + beg += 3; | ||
1266 | + else if (c < 0xfc) | ||
1267 | + beg += 4; | ||
1268 | + else if (c < 0xfe) | ||
1269 | + beg += 5; | ||
1270 | + } | ||
1271 | + } | ||
1272 | + else | ||
1273 | + { | ||
1274 | + size_t l = mbrlen (beg, buf + size - beg, &mbs); | ||
1275 | + | ||
1276 | + last_char = beg; | ||
1277 | + if (l + 2 >= 2) | ||
1278 | + beg += l - 1; | ||
1279 | + else | ||
1280 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1281 | + } | ||
1282 | + } | ||
1283 | #endif /* MBS_SUPPORT */ | ||
1284 | + } | ||
1285 | + | ||
1286 | + failure: | ||
1287 | return -1; | ||
1288 | |||
1289 | success: | ||
1290 | +#ifdef MBS_SUPPORT | ||
1291 | + if (mb_cur_max > 1 && !using_utf8) | ||
1292 | + { | ||
1293 | + end = beg + len; | ||
1294 | + while (end < buf + size) | ||
1295 | + { | ||
1296 | + size_t mlen = mbrlen (end, buf + size - end, &mbs); | ||
1297 | + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) | ||
1298 | + { | ||
1299 | + memset (&mbs, '\0', sizeof (mbstate_t)); | ||
1300 | + mlen = 1; | ||
1301 | + } | ||
1302 | + if (mlen == 1 && *end == eol) | ||
1303 | + break; | ||
1304 | + | ||
1305 | + end += mlen; | ||
1306 | + } | ||
1307 | + } | ||
1308 | + else | ||
1309 | +#endif /* MBS_SUPPORT */ | ||
1310 | end = memchr (beg + len, eol, (buf + size) - (beg + len)); | ||
1311 | + | ||
1312 | end++; | ||
1313 | while (buf < beg && beg[-1] != eol) | ||
1314 | --beg; | ||
1315 | - *match_size = end - beg; | ||
1316 | -#ifdef MBS_SUPPORT | ||
1317 | - if (MB_CUR_MAX > 1) | ||
1318 | - free (mb_properties); | ||
1319 | -#endif /* MBS_SUPPORT */ | ||
1320 | + len = end - beg; | ||
1321 | + /* FALLTHROUGH */ | ||
1322 | + | ||
1323 | + success_in_beg_and_len: | ||
1324 | + *match_size = len; | ||
1325 | return beg - buf; | ||
1326 | } | ||
1327 | |||
1328 | @@ -701,8 +1267,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int exact) | ||
1329 | char eol = eolbyte; | ||
1330 | if (!exact) | ||
1331 | { | ||
1332 | - end = memchr (end, eol, buflim - end); | ||
1333 | - end++; | ||
1334 | + while (end < buflim) | ||
1335 | + if (*end++ == eol) | ||
1336 | + break; | ||
1337 | while (buf < beg && beg[-1] != eol) | ||
1338 | --beg; | ||
1339 | } | ||
1340 | -- | ||
1341 | 1.8.4.2 | ||
1342 | |||