summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpgowda <pgowda.cve@gmail.com>2021-12-20 01:50:09 -0800
committerRichard Purdie <richard.purdie@linuxfoundation.org>2021-12-22 23:11:45 +0000
commit3cb504cebafcbf6490c049efb5acd8e2fa4e95ec (patch)
treeed7967cbbae8652020d3b4533fc04e2722c364fe
parent3503555a8b3595ec6ab4d836176ca5309f32912c (diff)
downloadpoky-3cb504cebafcbf6490c049efb5acd8e2fa4e95ec.tar.gz
binutils: CVE-2021-42574
Upstream-Status: Backport [https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=b3aa80b45c4f46029efeb204bb9f2d2c4278a0e5] [RP: Merge uint -> unsigned int change] (From OE-Core rev: fa242a41f3436f1d73eabee335573c1801bf7888) Signed-off-by: pgowda <pgowda.cve@gmail.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
-rw-r--r--meta/recipes-devtools/binutils/binutils-2.37.inc1
-rw-r--r--meta/recipes-devtools/binutils/binutils/0001-CVE-2021-42574.patch2001
2 files changed, 2002 insertions, 0 deletions
diff --git a/meta/recipes-devtools/binutils/binutils-2.37.inc b/meta/recipes-devtools/binutils/binutils-2.37.inc
index fca4a80ad2..043f7f8235 100644
--- a/meta/recipes-devtools/binutils/binutils-2.37.inc
+++ b/meta/recipes-devtools/binutils/binutils-2.37.inc
@@ -33,5 +33,6 @@ SRC_URI = "\
33 file://0016-Check-for-clang-before-checking-gcc-version.patch \ 33 file://0016-Check-for-clang-before-checking-gcc-version.patch \
34 file://0017-bfd-Close-the-file-descriptor-if-there-is-no-archive.patch \ 34 file://0017-bfd-Close-the-file-descriptor-if-there-is-no-archive.patch \
35 file://0001-elf-Discard-input-.note.gnu.build-id-sections.patch \ 35 file://0001-elf-Discard-input-.note.gnu.build-id-sections.patch \
36 file://0001-CVE-2021-42574.patch \
36" 37"
37S = "${WORKDIR}/git" 38S = "${WORKDIR}/git"
diff --git a/meta/recipes-devtools/binutils/binutils/0001-CVE-2021-42574.patch b/meta/recipes-devtools/binutils/binutils/0001-CVE-2021-42574.patch
new file mode 100644
index 0000000000..0622ae389e
--- /dev/null
+++ b/meta/recipes-devtools/binutils/binutils/0001-CVE-2021-42574.patch
@@ -0,0 +1,2001 @@
1From b3aa80b45c4f46029efeb204bb9f2d2c4278a0e5 Mon Sep 17 00:00:00 2001
2From: Nick Clifton <nickc@redhat.com>
3Date: Tue, 9 Nov 2021 13:25:42 +0000
4Subject: [PATCH] Add --unicode option to control how unicode characters are
5 handled by display tools.
6
7 * nm.c: Add --unicode option to control how unicode characters are
8 handled.
9 * objdump.c: Likewise.
10 * readelf.c: Likewise.
11 * strings.c: Likewise.
12 * binutils.texi: Document the new feature.
13 * NEWS: Document the new feature.
14 * testsuite/binutils-all/unicode.exp: New file.
15 * testsuite/binutils-all/nm.hex.unicode
16 * testsuite/binutils-all/strings.escape.unicode
17 * testsuite/binutils-all/objdump.highlight.unicode
18 * testsuite/binutils-all/readelf.invalid.unicode
19
20CVE: CVE-2021-42574
21Upstream-Status: Backport [https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=b3aa80b45c4f46029efeb204bb9f2d2c4278a0e5]
22
23RP: Added tweak uint -> unsigned int partial backport of
24https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=795588aec4f894206863c938bd6d716895886009
25
26Signed-off-by: pgowda <pgowda.cve@gmail.com>
27---
28 binutils/ChangeLog | 15 +
29 binutils/NEWS | 9 +
30 binutils/doc/binutils.texi | 78 ++++
31 binutils/nm.c | 228 ++++++++++-
32 binutils/objdump.c | 235 ++++++++++--
33 binutils/readelf.c | 190 +++++++++-
34 binutils/strings.c | 757 ++++++++++++++++++++++++++++++++++---
35 7 files changed, 1409 insertions(+), 103 deletions(-)
36
37diff --git a/binutils/ChangeLog b/binutils/ChangeLog
38--- a/binutils/ChangeLog 2021-12-19 19:00:27.038540406 -0800
39+++ b/binutils/ChangeLog 2021-12-19 19:28:42.733565078 -0800
40@@ -1,3 +1,18 @@
41+2021-11-09 Nick Clifton <nickc@redhat.com>
42+
43+ * nm.c: Add --unicode option to control how unicode characters are
44+ handled.
45+ * objdump.c: Likewise.
46+ * readelf.c: Likewise.
47+ * strings.c: Likewise.
48+ * binutils.texi: Document the new feature.
49+ * NEWS: Document the new feature.
50+ * testsuite/binutils-all/unicode.exp: New file.
51+ * testsuite/binutils-all/nm.hex.unicode
52+ * testsuite/binutils-all/strings.escape.unicode
53+ * testsuite/binutils-all/objdump.highlight.unicode
54+ * testsuite/binutils-all/readelf.invalid.unicode
55+
56 2021-07-16 Nick Clifton <nickc@redhat.com>
57
58 * po/sv.po: Updated Swedish translation.
59diff --git a/binutils/doc/binutils.texi b/binutils/doc/binutils.texi
60--- a/binutils/doc/binutils.texi 2021-12-19 19:00:27.042540338 -0800
61+++ b/binutils/doc/binutils.texi 2021-12-19 19:27:56.526354667 -0800
62@@ -812,6 +812,7 @@ nm [@option{-A}|@option{-o}|@option{--pr
63 [@option{-s}|@option{--print-armap}]
64 [@option{-t} @var{radix}|@option{--radix=}@var{radix}]
65 [@option{-u}|@option{--undefined-only}]
66+ [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
67 [@option{-V}|@option{--version}]
68 [@option{-X 32_64}]
69 [@option{--defined-only}]
70@@ -1132,6 +1133,21 @@ Use @var{radix} as the radix for printin
71 @cindex undefined symbols
72 Display only undefined symbols (those external to each object file).
73
74+@item -U @var{[d|i|l|e|x|h]}
75+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
76+Controls the display of UTF-8 encoded multibyte characters in strings.
77+The default (@option{--unicode=default}) is to give them no special
78+treatment. The @option{--unicode=locale} option displays the sequence
79+in the current locale, which may or may not support them. The options
80+@option{--unicode=hex} and @option{--unicode=invalid} display them as
81+hex byte sequences enclosed by either angle brackets or curly braces.
82+
83+The @option{--unicode=escape} option displays them as escape sequences
84+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
85+them as escape sequences highlighted in red (if supported by the
86+output device). The colouring is intended to draw attention to the
87+presence of unicode sequences where they might not be expected.
88+
89 @item -V
90 @itemx --version
91 Show the version number of @command{nm} and exit.
92@@ -2247,6 +2263,7 @@ objdump [@option{-a}|@option{--archive-h
93 [@option{--prefix-strip=}@var{level}]
94 [@option{--insn-width=}@var{width}]
95 [@option{--visualize-jumps[=color|=extended-color|=off]}
96+ [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
97 [@option{-V}|@option{--version}]
98 [@option{-H}|@option{--help}]
99 @var{objfile}@dots{}
100@@ -2921,6 +2938,21 @@ When displaying symbols include those wh
101 special in some way and which would not normally be of interest to the
102 user.
103
104+@item -U @var{[d|i|l|e|x|h]}
105+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
106+Controls the display of UTF-8 encoded multibyte characters in strings.
107+The default (@option{--unicode=default}) is to give them no special
108+treatment. The @option{--unicode=locale} option displays the sequence
109+in the current locale, which may or may not support them. The options
110+@option{--unicode=hex} and @option{--unicode=invalid} display them as
111+hex byte sequences enclosed by either angle brackets or curly braces.
112+
113+The @option{--unicode=escape} option displays them as escape sequences
114+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
115+them as escape sequences highlighted in red (if supported by the
116+output device). The colouring is intended to draw attention to the
117+presence of unicode sequences where they might not be expected.
118+
119 @item -V
120 @itemx --version
121 Print the version number of @command{objdump} and exit.
122@@ -3197,6 +3229,7 @@ strings [@option{-afovV}] [@option{-}@va
123 [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}]
124 [@option{-t} @var{radix}] [@option{--radix=}@var{radix}]
125 [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}]
126+ [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
127 [@option{-}] [@option{--all}] [@option{--print-file-name}]
128 [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}]
129 [@option{-w}] [@option{--include-all-whitespace}]
130@@ -3288,6 +3321,28 @@ single-8-bit-byte characters, @samp{b} =
131 littleendian. Useful for finding wide character strings. (@samp{l}
132 and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings).
133
134+@item -U @var{[d|i|l|e|x|h]}
135+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
136+Controls the display of UTF-8 encoded multibyte characters in strings.
137+The default (@option{--unicode=default}) is to give them no special
138+treatment, and instead rely upon the setting of the
139+@option{--encoding} option. The other values for this option
140+automatically enable @option{--encoding=S}.
141+
142+The @option{--unicode=invalid} option treats them as non-graphic
143+characters and hence not part of a valid string. All the remaining
144+options treat them as valid string characters.
145+
146+The @option{--unicode=locale} option displays them in the current
147+locale, which may or may not support UTF-8 encoding. The
148+@option{--unicode=hex} option displays them as hex byte sequences
149+enclosed between @var{<>} characters. The @option{--unicode=escape}
150+option displays them as escape sequences (@var{\uxxxx}) and the
151+@option{--unicode=highlight} option displays them as escape sequences
152+highlighted in red (if supported by the output device). The colouring
153+is intended to draw attention to the presence of unicode sequences
154+where they might not be expected.
155+
156 @item -T @var{bfdname}
157 @itemx --target=@var{bfdname}
158 @cindex object code format
159@@ -4796,6 +4851,7 @@ readelf [@option{-a}|@option{--all}]
160 [@option{--demangle@var{=style}}|@option{--no-demangle}]
161 [@option{--quiet}]
162 [@option{--recurse-limit}|@option{--no-recurse-limit}]
163+ [@option{-U} @var{method}|@option{--unicode=}@var{method}]
164 [@option{-n}|@option{--notes}]
165 [@option{-r}|@option{--relocs}]
166 [@option{-u}|@option{--unwind}]
167@@ -4962,6 +5018,28 @@ necessary in order to demangle truly com
168 that if the recursion limit is disabled then stack exhaustion is
169 possible and any bug reports about such an event will be rejected.
170
171+@item -U @var{[d|i|l|e|x|h]}
172+@itemx --unicode=[default|invalid|locale|escape|hex|highlight]
173+Controls the display of non-ASCII characters in identifier names.
174+The default (@option{--unicode=locale} or @option{--unicode=default}) is
175+to treat them as multibyte characters and display them in the current
176+locale. All other versions of this option treat the bytes as UTF-8
177+encoded values and attempt to interpret them. If they cannot be
178+interpreted or if the @option{--unicode=invalid} option is used then
179+they are displayed as a sequence of hex bytes, enclosed in curly
180+brace characters.
181+
182+Using the @option{--unicode=escape} option will display the characters
183+as unicode escape sequences (@var{\uxxxx}). Using the
184+@option{--unicode=hex} will display the characters as hex byte
185+sequences enclosed between angle brackets.
186+
187+Using the @option{--unicode=highlight} will display the characters as
188+unicode escape sequences but it will also highlight them in red,
189+assuming that colouring is supported by the output device. The
190+colouring is intended to draw attention to the presence of unicode
191+sequences when they might not be expected.
192+
193 @item -e
194 @itemx --headers
195 Display all the headers in the file. Equivalent to @option{-h -l -S}.
196diff --git a/binutils/NEWS b/binutils/NEWS
197--- a/binutils/NEWS 2021-12-19 19:00:27.038540406 -0800
198+++ b/binutils/NEWS 2021-12-19 19:30:04.764162972 -0800
199@@ -1,5 +1,14 @@
200 -*- text -*-
201
202+* Tools which display symbols or strings (readelf, strings, nm, objdump)
203+ have a new command line option which controls how unicode characters are
204+ handled. By default they are treated as normal for the tool. Using
205+ --unicode=locale will display them according to the current locale.
206+ Using --unicode=hex will display them as hex byte values, whilst
207+ --unicode=escape will display them as escape sequences. In addition
208+ using --unicode=highlight will display them as unicode escape sequences
209+ highlighted in red (if supported by the output device).
210+
211 Changes in 2.37:
212
213 * The readelf tool has a new command line option which can be used to specify
214diff --git a/binutils/nm.c b/binutils/nm.c
215--- a/binutils/nm.c 2021-12-19 19:00:27.046540270 -0800
216+++ b/binutils/nm.c 2021-12-19 19:36:34.797491555 -0800
217@@ -38,6 +38,11 @@
218 #include "bucomm.h"
219 #include "plugin-api.h"
220 #include "plugin.h"
221+#include "safe-ctype.h"
222+
223+#ifndef streq
224+#define streq(a,b) (strcmp ((a),(b)) == 0)
225+#endif
226
227 /* When sorting by size, we use this structure to hold the size and a
228 pointer to the minisymbol. */
229@@ -216,6 +221,18 @@ static const char *plugin_target = NULL;
230 static bfd *lineno_cache_bfd;
231 static bfd *lineno_cache_rel_bfd;
232
233+typedef enum unicode_display_type
234+{
235+ unicode_default = 0,
236+ unicode_locale,
237+ unicode_escape,
238+ unicode_hex,
239+ unicode_highlight,
240+ unicode_invalid
241+} unicode_display_type;
242+
243+static unicode_display_type unicode_display = unicode_default;
244+
245 enum long_option_values
246 {
247 OPTION_TARGET = 200,
248@@ -260,6 +277,7 @@ static struct option long_options[] =
249 {"target", required_argument, 0, OPTION_TARGET},
250 {"defined-only", no_argument, &defined_only, 1},
251 {"undefined-only", no_argument, &undefined_only, 1},
252+ {"unicode", required_argument, NULL, 'U'},
253 {"version", no_argument, &show_version, 1},
254 {"with-symbol-versions", no_argument, &with_symbol_versions, 1},
255 {"without-symbol-versions", no_argument, &with_symbol_versions, 0},
256@@ -313,6 +331,8 @@ usage (FILE *stream, int status)
257 -t, --radix=RADIX Use RADIX for printing symbol values\n\
258 --target=BFDNAME Specify the target object format as BFDNAME\n\
259 -u, --undefined-only Display only undefined symbols\n\
260+ -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
261+ --unicode={default|show|invalid|hex|escape|highlight}\n\
262 --with-symbol-versions Display version strings after symbol names\n\
263 -X 32_64 (ignored)\n\
264 @FILE Read options from FILE\n\
265@@ -432,6 +452,187 @@ get_coff_symbol_type (const struct inter
266 return bufp;
267 }
268
269+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
270+ The conversion format is controlled by the unicode_display variable.
271+ Returns the number of characters added to OUT.
272+ Returns the number of bytes consumed from IN in CONSUMED.
273+ Always consumes at least one byte and displays at least one character. */
274+
275+static unsigned int
276+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
277+{
278+ char * orig_out = out;
279+ unsigned int nchars = 0;
280+ unsigned int j;
281+
282+ if (unicode_display == unicode_default)
283+ goto invalid;
284+
285+ if (in[0] < 0xc0)
286+ goto invalid;
287+
288+ if ((in[1] & 0xc0) != 0x80)
289+ goto invalid;
290+
291+ if ((in[0] & 0x20) == 0)
292+ {
293+ nchars = 2;
294+ goto valid;
295+ }
296+
297+ if ((in[2] & 0xc0) != 0x80)
298+ goto invalid;
299+
300+ if ((in[0] & 0x10) == 0)
301+ {
302+ nchars = 3;
303+ goto valid;
304+ }
305+
306+ if ((in[3] & 0xc0) != 0x80)
307+ goto invalid;
308+
309+ nchars = 4;
310+
311+ valid:
312+ switch (unicode_display)
313+ {
314+ case unicode_locale:
315+ /* Copy the bytes into the output buffer as is. */
316+ memcpy (out, in, nchars);
317+ out += nchars;
318+ break;
319+
320+ case unicode_invalid:
321+ case unicode_hex:
322+ out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
323+ out += sprintf (out, "0x");
324+ for (j = 0; j < nchars; j++)
325+ out += sprintf (out, "%02x", in [j]);
326+ out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
327+ break;
328+
329+ case unicode_highlight:
330+ if (isatty (1))
331+ out += sprintf (out, "\x1B[31;47m"); /* Red. */
332+ /* Fall through. */
333+ case unicode_escape:
334+ switch (nchars)
335+ {
336+ case 2:
337+ out += sprintf (out, "\\u%02x%02x",
338+ ((in[0] & 0x1c) >> 2),
339+ ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
340+ break;
341+
342+ case 3:
343+ out += sprintf (out, "\\u%02x%02x",
344+ ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
345+ ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
346+ break;
347+
348+ case 4:
349+ out += sprintf (out, "\\u%02x%02x%02x",
350+ ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
351+ ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
352+ ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
353+ break;
354+ default:
355+ /* URG. */
356+ break;
357+ }
358+
359+ if (unicode_display == unicode_highlight && isatty (1))
360+ out += sprintf (out, "\033[0m"); /* Default colour. */
361+ break;
362+
363+ default:
364+ /* URG */
365+ break;
366+ }
367+
368+ * consumed = nchars;
369+ return out - orig_out;
370+
371+ invalid:
372+ /* Not a valid UTF-8 sequence. */
373+ *out = *in;
374+ * consumed = 1;
375+ return 1;
376+}
377+
378+/* Convert any UTF-8 encoded characters in NAME into the form specified by
379+ unicode_display. Also converts control characters. Returns a static
380+ buffer if conversion was necessary.
381+ Code stolen from objdump.c:sanitize_string(). */
382+
383+static const char *
384+convert_utf8 (const char * in)
385+{
386+ static char * buffer = NULL;
387+ static size_t buffer_len = 0;
388+ const char * original = in;
389+ char * out;
390+
391+ /* Paranoia. */
392+ if (in == NULL)
393+ return "";
394+
395+ /* See if any conversion is necessary.
396+ In the majority of cases it will not be needed. */
397+ do
398+ {
399+ unsigned char c = *in++;
400+
401+ if (c == 0)
402+ return original;
403+
404+ if (ISCNTRL (c))
405+ break;
406+
407+ if (unicode_display != unicode_default && c >= 0xc0)
408+ break;
409+ }
410+ while (1);
411+
412+ /* Copy the input, translating as needed. */
413+ in = original;
414+ if (buffer_len < (strlen (in) * 9))
415+ {
416+ free ((void *) buffer);
417+ buffer_len = strlen (in) * 9;
418+ buffer = xmalloc (buffer_len + 1);
419+ }
420+
421+ out = buffer;
422+ do
423+ {
424+ unsigned char c = *in++;
425+
426+ if (c == 0)
427+ break;
428+
429+ if (ISCNTRL (c))
430+ {
431+ *out++ = '^';
432+ *out++ = c + 0x40;
433+ }
434+ else if (unicode_display != unicode_default && c >= 0xc0)
435+ {
436+ unsigned int num_consumed;
437+
438+ out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
439+ in += num_consumed - 1;
440+ }
441+ else
442+ *out++ = c;
443+ }
444+ while (1);
445+
446+ *out = 0;
447+ return buffer;
448+}
449+
450 /* Print symbol name NAME, read from ABFD, with printf format FORM,
451 demangling it if requested. */
452
453@@ -444,6 +645,7 @@ print_symname (const char *form, struct
454
455 if (name == NULL)
456 name = info->sinfo->name;
457+
458 if (!with_symbol_versions
459 && bfd_get_flavour (abfd) == bfd_target_elf_flavour)
460 {
461@@ -451,6 +653,7 @@ print_symname (const char *form, struct
462 if (atver)
463 *atver = 0;
464 }
465+
466 if (do_demangle && *name)
467 {
468 alloc = bfd_demangle (abfd, name, demangle_flags);
469@@ -458,6 +661,11 @@ print_symname (const char *form, struct
470 name = alloc;
471 }
472
473+ if (unicode_display != unicode_default)
474+ {
475+ name = convert_utf8 (name);
476+ }
477+
478 if (info != NULL && info->elfinfo && with_symbol_versions)
479 {
480 const char *version_string;
481@@ -1807,7 +2015,7 @@ main (int argc, char **argv)
482 fatal (_("fatal error: libbfd ABI mismatch"));
483 set_default_bfd_target ();
484
485- while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uvVvX:",
486+ while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uU:vVvX:",
487 long_options, (int *) 0)) != EOF)
488 {
489 switch (c)
490@@ -1900,6 +2108,24 @@ main (int argc, char **argv)
491 case 'u':
492 undefined_only = 1;
493 break;
494+
495+ case 'U':
496+ if (streq (optarg, "default") || streq (optarg, "d"))
497+ unicode_display = unicode_default;
498+ else if (streq (optarg, "locale") || streq (optarg, "l"))
499+ unicode_display = unicode_locale;
500+ else if (streq (optarg, "escape") || streq (optarg, "e"))
501+ unicode_display = unicode_escape;
502+ else if (streq (optarg, "invalid") || streq (optarg, "i"))
503+ unicode_display = unicode_invalid;
504+ else if (streq (optarg, "hex") || streq (optarg, "x"))
505+ unicode_display = unicode_hex;
506+ else if (streq (optarg, "highlight") || streq (optarg, "h"))
507+ unicode_display = unicode_highlight;
508+ else
509+ fatal (_("invalid argument to -U/--unicode: %s"), optarg);
510+ break;
511+
512 case 'V':
513 show_version = 1;
514 break;
515diff --git a/binutils/objdump.c b/binutils/objdump.c
516--- a/binutils/objdump.c 2021-12-19 19:00:27.046540270 -0800
517+++ b/binutils/objdump.c 2021-12-19 19:43:09.438736729 -0800
518@@ -204,6 +204,18 @@ static const struct objdump_private_desc
519
520 /* The list of detected jumps inside a function. */
521 static struct jump_info *detected_jumps = NULL;
522+
523+typedef enum unicode_display_type
524+{
525+ unicode_default = 0,
526+ unicode_locale,
527+ unicode_escape,
528+ unicode_hex,
529+ unicode_highlight,
530+ unicode_invalid
531+} unicode_display_type;
532+
533+static unicode_display_type unicode_display = unicode_default;
534
535 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
536 static void
537@@ -330,6 +342,9 @@ usage (FILE *stream, int status)
538 fprintf (stream, _("\
539 -w, --wide Format output for more than 80 columns\n"));
540 fprintf (stream, _("\
541+ -U[d|l|i|x|e|h] Controls the display of UTF-8 unicode characters\n\
542+ --unicode=[default|locale|invalid|hex|escape|highlight]\n"));
543+ fprintf (stream, _("\
544 -z, --disassemble-zeroes Do not skip blocks of zeroes when disassembling\n"));
545 fprintf (stream, _("\
546 --start-address=ADDR Only process data whose address is >= ADDR\n"));
547@@ -420,17 +435,23 @@ static struct option long_options[]=
548 {
549 {"adjust-vma", required_argument, NULL, OPTION_ADJUST_VMA},
550 {"all-headers", no_argument, NULL, 'x'},
551- {"private-headers", no_argument, NULL, 'p'},
552- {"private", required_argument, NULL, 'P'},
553 {"architecture", required_argument, NULL, 'm'},
554 {"archive-headers", no_argument, NULL, 'a'},
555+#ifdef ENABLE_LIBCTF
556+ {"ctf", required_argument, NULL, OPTION_CTF},
557+ {"ctf-parent", required_argument, NULL, OPTION_CTF_PARENT},
558+#endif
559 {"debugging", no_argument, NULL, 'g'},
560 {"debugging-tags", no_argument, NULL, 'e'},
561 {"demangle", optional_argument, NULL, 'C'},
562 {"disassemble", optional_argument, NULL, 'd'},
563 {"disassemble-all", no_argument, NULL, 'D'},
564- {"disassembler-options", required_argument, NULL, 'M'},
565 {"disassemble-zeroes", no_argument, NULL, 'z'},
566+ {"disassembler-options", required_argument, NULL, 'M'},
567+ {"dwarf", optional_argument, NULL, OPTION_DWARF},
568+ {"dwarf-check", no_argument, 0, OPTION_DWARF_CHECK},
569+ {"dwarf-depth", required_argument, 0, OPTION_DWARF_DEPTH},
570+ {"dwarf-start", required_argument, 0, OPTION_DWARF_START},
571 {"dynamic-reloc", no_argument, NULL, 'R'},
572 {"dynamic-syms", no_argument, NULL, 'T'},
573 {"endian", required_argument, NULL, OPTION_ENDIAN},
574@@ -440,16 +461,23 @@ static struct option long_options[]=
575 {"full-contents", no_argument, NULL, 's'},
576 {"headers", no_argument, NULL, 'h'},
577 {"help", no_argument, NULL, 'H'},
578+ {"include", required_argument, NULL, 'I'},
579 {"info", no_argument, NULL, 'i'},
580+ {"inlines", no_argument, 0, OPTION_INLINES},
581+ {"insn-width", required_argument, NULL, OPTION_INSN_WIDTH},
582 {"line-numbers", no_argument, NULL, 'l'},
583- {"no-show-raw-insn", no_argument, &show_raw_insn, -1},
584 {"no-addresses", no_argument, &no_addresses, 1},
585- {"process-links", no_argument, &process_links, true},
586+ {"no-recurse-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT},
587+ {"no-recursion-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT},
588+ {"no-show-raw-insn", no_argument, &show_raw_insn, -1},
589+ {"prefix", required_argument, NULL, OPTION_PREFIX},
590 {"prefix-addresses", no_argument, &prefix_addresses, 1},
591+ {"prefix-strip", required_argument, NULL, OPTION_PREFIX_STRIP},
592+ {"private", required_argument, NULL, 'P'},
593+ {"private-headers", no_argument, NULL, 'p'},
594+ {"process-links", no_argument, &process_links, true},
595 {"recurse-limit", no_argument, NULL, OPTION_RECURSE_LIMIT},
596 {"recursion-limit", no_argument, NULL, OPTION_RECURSE_LIMIT},
597- {"no-recurse-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT},
598- {"no-recursion-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT},
599 {"reloc", no_argument, NULL, 'r'},
600 {"section", required_argument, NULL, 'j'},
601 {"section-headers", no_argument, NULL, 'h'},
602@@ -457,28 +485,16 @@ static struct option long_options[]=
603 {"source", no_argument, NULL, 'S'},
604 {"source-comment", optional_argument, NULL, OPTION_SOURCE_COMMENT},
605 {"special-syms", no_argument, &dump_special_syms, 1},
606- {"include", required_argument, NULL, 'I'},
607- {"dwarf", optional_argument, NULL, OPTION_DWARF},
608-#ifdef ENABLE_LIBCTF
609- {"ctf", required_argument, NULL, OPTION_CTF},
610- {"ctf-parent", required_argument, NULL, OPTION_CTF_PARENT},
611-#endif
612 {"stabs", no_argument, NULL, 'G'},
613 {"start-address", required_argument, NULL, OPTION_START_ADDRESS},
614 {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS},
615 {"syms", no_argument, NULL, 't'},
616 {"target", required_argument, NULL, 'b'},
617+ {"unicode", required_argument, NULL, 'U'},
618 {"version", no_argument, NULL, 'V'},
619- {"wide", no_argument, NULL, 'w'},
620- {"prefix", required_argument, NULL, OPTION_PREFIX},
621- {"prefix-strip", required_argument, NULL, OPTION_PREFIX_STRIP},
622- {"insn-width", required_argument, NULL, OPTION_INSN_WIDTH},
623- {"dwarf-depth", required_argument, 0, OPTION_DWARF_DEPTH},
624- {"dwarf-start", required_argument, 0, OPTION_DWARF_START},
625- {"dwarf-check", no_argument, 0, OPTION_DWARF_CHECK},
626- {"inlines", no_argument, 0, OPTION_INLINES},
627 {"visualize-jumps", optional_argument, 0, OPTION_VISUALIZE_JUMPS},
628- {0, no_argument, 0, 0}
629+ {"wide", no_argument, NULL, 'w'},
630+ {NULL, no_argument, NULL, 0}
631 };
632
633 static void
634@@ -488,9 +504,121 @@ nonfatal (const char *msg)
635 exit_status = 1;
636 }
637
638+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
639+ The conversion format is controlled by the unicode_display variable.
640+ Returns the number of characters added to OUT.
641+ Returns the number of bytes consumed from IN in CONSUMED.
642+ Always consumes at least one byte and displays at least one character. */
643+
644+static unsigned int
645+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
646+{
647+ char * orig_out = out;
648+ unsigned int nchars = 0;
649+ unsigned int j;
650+
651+ if (unicode_display == unicode_default)
652+ goto invalid;
653+
654+ if (in[0] < 0xc0)
655+ goto invalid;
656+
657+ if ((in[1] & 0xc0) != 0x80)
658+ goto invalid;
659+
660+ if ((in[0] & 0x20) == 0)
661+ {
662+ nchars = 2;
663+ goto valid;
664+ }
665+
666+ if ((in[2] & 0xc0) != 0x80)
667+ goto invalid;
668+
669+ if ((in[0] & 0x10) == 0)
670+ {
671+ nchars = 3;
672+ goto valid;
673+ }
674+
675+ if ((in[3] & 0xc0) != 0x80)
676+ goto invalid;
677+
678+ nchars = 4;
679+
680+ valid:
681+ switch (unicode_display)
682+ {
683+ case unicode_locale:
684+ /* Copy the bytes into the output buffer as is. */
685+ memcpy (out, in, nchars);
686+ out += nchars;
687+ break;
688+
689+ case unicode_invalid:
690+ case unicode_hex:
691+ out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
692+ out += sprintf (out, "0x");
693+ for (j = 0; j < nchars; j++)
694+ out += sprintf (out, "%02x", in [j]);
695+ out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
696+ break;
697+
698+ case unicode_highlight:
699+ if (isatty (1))
700+ out += sprintf (out, "\x1B[31;47m"); /* Red. */
701+ /* Fall through. */
702+ case unicode_escape:
703+ switch (nchars)
704+ {
705+ case 2:
706+ out += sprintf (out, "\\u%02x%02x",
707+ ((in[0] & 0x1c) >> 2),
708+ ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
709+ break;
710+
711+ case 3:
712+ out += sprintf (out, "\\u%02x%02x",
713+ ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
714+ ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
715+ break;
716+
717+ case 4:
718+ out += sprintf (out, "\\u%02x%02x%02x",
719+ ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
720+ ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
721+ ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
722+ break;
723+ default:
724+ /* URG. */
725+ break;
726+ }
727+
728+ if (unicode_display == unicode_highlight && isatty (1))
729+ out += sprintf (out, "\033[0m"); /* Default colour. */
730+ break;
731+
732+ default:
733+ /* URG */
734+ break;
735+ }
736+
737+ * consumed = nchars;
738+ return out - orig_out;
739+
740+ invalid:
741+ /* Not a valid UTF-8 sequence. */
742+ *out = *in;
743+ * consumed = 1;
744+ return 1;
745+}
746+
747 /* Returns a version of IN with any control characters
748 replaced by escape sequences. Uses a static buffer
749- if necessary. */
750+ if necessary.
751+
752+ If unicode display is enabled, then also handles the
753+ conversion of unicode characters. */
754
755 static const char *
756 sanitize_string (const char * in)
757@@ -508,40 +636,50 @@ sanitize_string (const char * in)
758 of cases it will not be needed. */
759 do
760 {
761- char c = *in++;
762+ unsigned char c = *in++;
763
764 if (c == 0)
765 return original;
766
767 if (ISCNTRL (c))
768 break;
769+
770+ if (unicode_display != unicode_default && c >= 0xc0)
771+ break;
772 }
773 while (1);
774
775 /* Copy the input, translating as needed. */
776 in = original;
777- if (buffer_len < (strlen (in) * 2))
778+ if (buffer_len < (strlen (in) * 9))
779 {
780 free ((void *) buffer);
781- buffer_len = strlen (in) * 2;
782+ buffer_len = strlen (in) * 9;
783 buffer = xmalloc (buffer_len + 1);
784 }
785
786 out = buffer;
787 do
788 {
789- char c = *in++;
790+ unsigned char c = *in++;
791
792 if (c == 0)
793 break;
794
795- if (!ISCNTRL (c))
796- *out++ = c;
797- else
798+ if (ISCNTRL (c))
799 {
800 *out++ = '^';
801 *out++ = c + 0x40;
802 }
803+ else if (unicode_display != unicode_default && c >= 0xc0)
804+ {
805+ unsigned int num_consumed;
806+
807+ out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
808+ in += num_consumed - 1;
809+ }
810+ else
811+ *out++ = c;
812 }
813 while (1);
814
815@@ -4529,6 +4667,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED
816 free (alloc);
817 }
818 }
819+ else if (unicode_display != unicode_default
820+ && name != NULL && *name != '\0')
821+ {
822+ const char * sanitized_name;
823+
824+ /* If we want to sanitize the name, we do it here, and
825+ temporarily clobber it while calling bfd_print_symbol.
826+ FIXME: This is a gross hack. */
827+ sanitized_name = sanitize_string (name);
828+ if (sanitized_name != name)
829+ (*current)->name = sanitized_name;
830+ else
831+ sanitized_name = NULL;
832+ bfd_print_symbol (cur_bfd, stdout, *current,
833+ bfd_print_symbol_all);
834+ if (sanitized_name != NULL)
835+ (*current)->name = name;
836+ }
837 else
838 bfd_print_symbol (cur_bfd, stdout, *current,
839 bfd_print_symbol_all);
840@@ -5212,7 +5368,7 @@ main (int argc, char **argv)
841 set_default_bfd_target ();
842
843 while ((c = getopt_long (argc, argv,
844- "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::",
845+ "CDE:FGHI:LM:P:RSTU:VW::ab:defghij:lm:prstvwxz",
846 long_options, (int *) 0))
847 != EOF)
848 {
849@@ -5495,6 +5651,23 @@ main (int argc, char **argv)
850 seenflag = true;
851 break;
852
853+ case 'U':
854+ if (streq (optarg, "default") || streq (optarg, "d"))
855+ unicode_display = unicode_default;
856+ else if (streq (optarg, "locale") || streq (optarg, "l"))
857+ unicode_display = unicode_locale;
858+ else if (streq (optarg, "escape") || streq (optarg, "e"))
859+ unicode_display = unicode_escape;
860+ else if (streq (optarg, "invalid") || streq (optarg, "i"))
861+ unicode_display = unicode_invalid;
862+ else if (streq (optarg, "hex") || streq (optarg, "x"))
863+ unicode_display = unicode_hex;
864+ else if (streq (optarg, "highlight") || streq (optarg, "h"))
865+ unicode_display = unicode_highlight;
866+ else
867+ fatal (_("invalid argument to -U/--unicode: %s"), optarg);
868+ break;
869+
870 case 'H':
871 usage (stdout, 0);
872 /* No need to set seenflag or to break - usage() does not return. */
873diff --git a/binutils/readelf.c b/binutils/readelf.c
874--- a/binutils/readelf.c 2021-12-19 19:00:27.058540065 -0800
875+++ b/binutils/readelf.c 2021-12-19 19:27:56.538354462 -0800
876@@ -328,6 +328,19 @@ typedef enum print_mode
877 }
878 print_mode;
879
880+typedef enum unicode_display_type /* Values selectable with -U/--unicode. */
881+{
882+ unicode_default = 0, /* Initial setting; print_symbol skips its UTF-8 handling, as for unicode_locale. */
883+ unicode_locale, /* Display as determined by the current locale. */
884+ unicode_escape, /* Display as \u escape sequences. */
885+ unicode_hex, /* Display as "<0x...>" hex byte sequences. */
886+ unicode_highlight, /* As unicode_escape, but coloured when stdout is a tty. */
887+ unicode_invalid /* Treat as invalid and display as "{0x...}". */
888+} unicode_display_type;
889+
890+static unicode_display_type unicode_display = unicode_default;
891+
892+
893 /* Versioned symbol info. */
894 enum versioned_symbol_info
895 {
896@@ -632,11 +645,18 @@ print_symbol (signed int width, const ch
897 if (c == 0)
898 break;
899
900- /* Do not print control characters directly as they can affect terminal
901- settings. Such characters usually appear in the names generated
902- by the assembler for local labels. */
903- if (ISCNTRL (c))
904+ if (ISPRINT (c))
905+ {
906+ putchar (c);
907+ width_remaining --;
908+ num_printed ++;
909+ }
910+ else if (ISCNTRL (c))
911 {
912+ /* Do not print control characters directly as they can affect terminal
913+ settings. Such characters usually appear in the names generated
914+ by the assembler for local labels. */
915+
916 if (width_remaining < 2)
917 break;
918
919@@ -644,11 +664,137 @@ print_symbol (signed int width, const ch
920 width_remaining -= 2;
921 num_printed += 2;
922 }
923- else if (ISPRINT (c))
924+ else if (c == 0x7f)
925 {
926- putchar (c);
927- width_remaining --;
928- num_printed ++;
929+ if (width_remaining < 5)
930+ break;
931+ printf ("<DEL>");
932+ width_remaining -= 5;
933+ num_printed += 5;
934+ }
935+ else if (unicode_display != unicode_locale
936+ && unicode_display != unicode_default)
937+ {
938+ /* Display unicode characters as something else. */
939+ unsigned char bytes[4];
940+ bool is_utf8;
941+ unsigned int nbytes;
942+
943+ bytes[0] = c;
944+
945+ if (bytes[0] < 0xc0)
946+ {
947+ nbytes = 1;
948+ is_utf8 = false;
949+ }
950+ else
951+ {
952+ bytes[1] = *symbol++;
953+
954+ if ((bytes[1] & 0xc0) != 0x80)
955+ {
956+ is_utf8 = false;
957+ /* Do not consume this character. It may only
958+ be the first byte in the sequence that was
959+ corrupt. */
960+ --symbol;
961+ nbytes = 1;
962+ }
963+ else if ((bytes[0] & 0x20) == 0)
964+ {
965+ is_utf8 = true;
966+ nbytes = 2;
967+ }
968+ else
969+ {
970+ bytes[2] = *symbol++;
971+
972+ if ((bytes[2] & 0xc0) != 0x80)
973+ {
974+ is_utf8 = false;
975+ symbol -= 2;
976+ nbytes = 1;
977+ }
978+ else if ((bytes[0] & 0x10) == 0)
979+ {
980+ is_utf8 = true;
981+ nbytes = 3;
982+ }
983+ else
984+ {
985+ bytes[3] = *symbol++;
986+
987+ nbytes = 4;
988+
989+ if ((bytes[3] & 0xc0) != 0x80)
990+ {
991+ is_utf8 = false;
992+ symbol -= 3;
993+ nbytes = 1;
994+ }
995+ else
996+ is_utf8 = true;
997+ }
998+ }
999+ }
1000+
1001+ if (unicode_display == unicode_invalid)
1002+ is_utf8 = false;
1003+
1004+ if (unicode_display == unicode_hex || ! is_utf8)
1005+ {
1006+ unsigned int i;
1007+
1008+ if (width_remaining < (nbytes * 2) + 2)
1009+ break;
1010+
1011+ putchar (is_utf8 ? '<' : '{');
1012+ printf ("0x");
1013+ for (i = 0; i < nbytes; i++)
1014+ printf ("%02x", bytes[i]);
1015+ putchar (is_utf8 ? '>' : '}');
1016+ }
1017+ else
1018+ {
1019+ if (unicode_display == unicode_highlight && isatty (1))
1020+ printf ("\x1B[31;47m"); /* Red. */
1021+
1022+ switch (nbytes)
1023+ {
1024+ case 2:
1025+ if (width_remaining < 6)
1026+ break;
1027+ printf ("\\u%02x%02x",
1028+ (bytes[0] & 0x1c) >> 2,
1029+ ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f));
1030+ break;
1031+ case 3:
1032+ if (width_remaining < 6)
1033+ break;
1034+ printf ("\\u%02x%02x",
1035+ ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2),
1036+ ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f));
1037+ break;
1038+ case 4:
1039+ if (width_remaining < 8)
1040+ break;
1041+ printf ("\\u%02x%02x%02x",
1042+ ((bytes[0] & 0x07) << 6) | ((bytes[1] & 0x3c) >> 2),
1043+ ((bytes[1] & 0x03) << 6) | ((bytes[2] & 0x3c) >> 2),
1044+ ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
1045+
1046+ break;
1047+ default:
1048+ /* URG. */
1049+ break;
1050+ }
1051+
1052+ if (unicode_display == unicode_highlight && isatty (1))
1053+ printf ("\033[0m"); /* Default colour. */
1054+ }
1055+
1056+ if (bytes[nbytes - 1] == 0)
1057+ break;
1058 }
1059 else
1060 {
1061@@ -4668,6 +4814,7 @@ static struct option options[] =
1062 {"syms", no_argument, 0, 's'},
1063 {"silent-truncation",no_argument, 0, 'T'},
1064 {"section-details", no_argument, 0, 't'},
1065+ {"unicode", required_argument, NULL, 'U'},
1066 {"unwind", no_argument, 0, 'u'},
1067 {"version-info", no_argument, 0, 'V'},
1068 {"version", no_argument, 0, 'v'},
1069@@ -4744,6 +4891,12 @@ usage (FILE * stream)
1070 fprintf (stream, _("\
1071 --no-recurse-limit Disable a demangling recursion limit\n"));
1072 fprintf (stream, _("\
1073+ -U[dlexhi] --unicode=[default|locale|escape|hex|highlight|invalid]\n\
1074+ Display unicode characters as determined by the current locale\n\
1075+ (default), escape sequences, \"<hex sequences>\", highlighted\n\
1076+ escape sequences, or treat them as invalid and display as\n\
1077+ \"{hex sequences}\"\n"));
1078+ fprintf (stream, _("\
1079 -n --notes Display the core notes (if present)\n"));
1080 fprintf (stream, _("\
1081 -r --relocs Display the relocations (if present)\n"));
1082@@ -4928,7 +5081,7 @@ parse_args (struct dump_data *dumpdata,
1083 usage (stderr);
1084
1085 while ((c = getopt_long
1086- (argc, argv, "ACDHILNPR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
1087+ (argc, argv, "ACDHILNPR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
1088 {
1089 switch (c)
1090 {
1091@@ -5130,6 +5283,25 @@ parse_args (struct dump_data *dumpdata,
1092 /* Ignored for backward compatibility. */
1093 break;
1094
1095+ case 'U':
1096+ if (optarg == NULL)
1097+ error (_("Missing arg to -U/--unicode")); /* Can this happen ? */
1098+ else if (streq (optarg, "default") || streq (optarg, "d"))
1099+ unicode_display = unicode_default;
1100+ else if (streq (optarg, "locale") || streq (optarg, "l"))
1101+ unicode_display = unicode_locale;
1102+ else if (streq (optarg, "escape") || streq (optarg, "e"))
1103+ unicode_display = unicode_escape;
1104+ else if (streq (optarg, "invalid") || streq (optarg, "i"))
1105+ unicode_display = unicode_invalid;
1106+ else if (streq (optarg, "hex") || streq (optarg, "x"))
1107+ unicode_display = unicode_hex;
1108+ else if (streq (optarg, "highlight") || streq (optarg, "h"))
1109+ unicode_display = unicode_highlight;
1110+ else
1111+ error (_("invalid argument to -U/--unicode: %s"), optarg);
1112+ break;
1113+
1114 case OPTION_SYM_BASE:
1115 sym_base = 0;
1116 if (optarg != NULL)
1117diff --git a/binutils/strings.c b/binutils/strings.c
1118--- a/binutils/strings.c 2021-12-19 19:00:27.058540065 -0800
1119+++ b/binutils/strings.c 2021-12-19 19:48:26.205313218 -0800
1120@@ -55,6 +55,19 @@
1121 -T {bfdname}
1122 Specify a non-default object file format.
1123
1124+ --unicode={default|locale|invalid|hex|escape|highlight}
1125+ -U {d|l|i|x|e|h}
1126+ Determine how to handle UTF-8 unicode characters. The default
1127+ is no special treatment. All other versions of this option
1128+ only apply if the encoding is valid and enabling the option
1129+ implies --encoding=S.
1130+ The 'locale' option displays the characters according to the
1131+ current locale. The 'invalid' option treats them as
1132+ non-string characters. The 'hex' option displays them as hex
1133+ byte sequences. The 'escape' option displays them as escape
1134+ sequences and the 'highlight' option displays them as
1135+ coloured escape sequences.
1136+
1137 --output-separator=sep_string
1138 -s sep_string String used to separate parsed strings in output.
1139 Default is newline.
1140@@ -76,6 +89,22 @@
1141 #include "safe-ctype.h"
1142 #include "bucomm.h"
1143
1144+#ifndef streq
1145+#define streq(a,b) (strcmp ((a),(b)) == 0)
1146+#endif
1147+
1148+typedef enum unicode_display_type /* Settings for the -U/--unicode option. */
1149+{
1150+ unicode_default = 0, /* No special treatment of UTF-8. */
1151+ unicode_locale, /* Display according to the current locale. */
1152+ unicode_escape, /* Display as escape sequences. */
1153+ unicode_hex, /* Display as hex byte sequences. */
1154+ unicode_highlight, /* Display as coloured escape sequences. */
1155+ unicode_invalid /* Treat as non-string characters. */
1156+} unicode_display_type;
1157+
1158+static unicode_display_type unicode_display = unicode_default;
1159+
1160 #define STRING_ISGRAPHIC(c) \
1161 ( (c) >= 0 \
1162 && (c) <= 255 \
1163@@ -94,7 +123,7 @@ extern int errno;
1164 static int address_radix;
1165
1166 /* Minimum length of sequence of graphic chars to trigger output. */
1167-static int string_min;
1168+static unsigned int string_min;
1169
1170 /* Whether or not we include all whitespace as a graphic char. */
1171 static bool include_all_whitespace;
1172@@ -121,21 +150,22 @@ static char *output_separator;
1173 static struct option long_options[] =
1174 {
1175 {"all", no_argument, NULL, 'a'},
1176+ {"bytes", required_argument, NULL, 'n'},
1177 {"data", no_argument, NULL, 'd'},
1178+ {"encoding", required_argument, NULL, 'e'},
1179+ {"help", no_argument, NULL, 'h'},
1180+ {"include-all-whitespace", no_argument, NULL, 'w'},
1181+ {"output-separator", required_argument, NULL, 's'},
1182 {"print-file-name", no_argument, NULL, 'f'},
1183- {"bytes", required_argument, NULL, 'n'},
1184 {"radix", required_argument, NULL, 't'},
1185- {"include-all-whitespace", no_argument, NULL, 'w'},
1186- {"encoding", required_argument, NULL, 'e'},
1187 {"target", required_argument, NULL, 'T'},
1188- {"output-separator", required_argument, NULL, 's'},
1189- {"help", no_argument, NULL, 'h'},
1190+ {"unicode", required_argument, NULL, 'U'},
1191 {"version", no_argument, NULL, 'v'},
1192 {NULL, 0, NULL, 0}
1193 };
1194
1195 static bool strings_file (char *);
1196-static void print_strings (const char *, FILE *, file_ptr, int, int, char *);
1197+static void print_strings (const char *, FILE *, file_ptr, int, char *);
1198 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
1199
1200 int main (int, char **);
1201@@ -171,7 +201,7 @@ main (int argc, char **argv)
1202 encoding = 's';
1203 output_separator = NULL;
1204
1205- while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789",
1206+ while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
1207 long_options, (int *) 0)) != EOF)
1208 {
1209 switch (optc)
1210@@ -244,6 +274,23 @@ main (int argc, char **argv)
1211 output_separator = optarg;
1212 break;
1213
1214+ case 'U':
1215+ if (streq (optarg, "default") || streq (optarg, "d"))
1216+ unicode_display = unicode_default;
1217+ else if (streq (optarg, "locale") || streq (optarg, "l"))
1218+ unicode_display = unicode_locale;
1219+ else if (streq (optarg, "escape") || streq (optarg, "e"))
1220+ unicode_display = unicode_escape;
1221+ else if (streq (optarg, "invalid") || streq (optarg, "i"))
1222+ unicode_display = unicode_invalid;
1223+ else if (streq (optarg, "hex") || streq (optarg, "x"))
1224+ unicode_display = unicode_hex;
1225+ else if (streq (optarg, "highlight") || streq (optarg, "h"))
1226+ unicode_display = unicode_highlight;
1227+ else
1228+ fatal (_("invalid argument to -U/--unicode: %s"), optarg);
1229+ break;
1230+
1231 case 'V':
1232 case 'v':
1233 print_version ("strings");
1234@@ -258,6 +305,9 @@ main (int argc, char **argv)
1235 }
1236 }
1237
1238+ if (unicode_display != unicode_default)
1239+ encoding = 'S';
1240+
1241 if (numeric_opt != 0)
1242 {
1243 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
1244@@ -293,14 +343,14 @@ main (int argc, char **argv)
1245 {
1246 datasection_only = false;
1247 SET_BINARY (fileno (stdin));
1248- print_strings ("{standard input}", stdin, 0, 0, 0, (char *) NULL);
1249+ print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
1250 files_given = true;
1251 }
1252 else
1253 {
1254 for (; optind < argc; ++optind)
1255 {
1256- if (strcmp (argv[optind], "-") == 0)
1257+ if (streq (argv[optind], "-"))
1258 datasection_only = false;
1259 else
1260 {
1261@@ -342,7 +392,7 @@ strings_a_section (bfd *abfd, asection *
1262 }
1263
1264 *got_a_section = true;
1265- print_strings (filename, NULL, sect->filepos, 0, sectsize, (char *) mem);
1266+ print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
1267 free (mem);
1268 }
1269
1270@@ -427,7 +477,7 @@ strings_file (char *file)
1271 return false;
1272 }
1273
1274- print_strings (file, stream, (file_ptr) 0, 0, 0, (char *) 0);
1275+ print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
1276
1277 if (fclose (stream) == EOF)
1278 {
1279@@ -551,6 +601,626 @@ unget_part_char (long c, file_ptr *addre
1280 }
1281 }
1282 }
1283+/* Print "FILENAME: " if print_filenames is set, then ADDRESS if print_addresses is set, using address_radix (8, 10 or 16). */
1284+static void
1285+print_filename_and_address (const char * filename, file_ptr address)
1286+{
1287+ if (print_filenames)
1288+ printf ("%s: ", filename);
1289+
1290+ if (! print_addresses)
1291+ return;
1292+
1293+ switch (address_radix) /* No default case: any other radix prints no address. */
1294+ {
1295+ case 8:
1296+ if (sizeof (address) > sizeof (long)) /* file_ptr is wider than long, so print via unsigned long long. */
1297+ {
1298+#ifndef __MSVCRT__
1299+ printf ("%7llo ", (unsigned long long) address);
1300+#else
1301+ printf ("%7I64o ", (unsigned long long) address); /* MSVCRT build: I64 length modifier instead of ll. */
1302+#endif
1303+ }
1304+ else
1305+ printf ("%7lo ", (unsigned long) address);
1306+ break;
1307+
1308+ case 10:
1309+ if (sizeof (address) > sizeof (long))
1310+ {
1311+#ifndef __MSVCRT__
1312+ printf ("%7llu ", (unsigned long long) address);
1313+#else
1314+ printf ("%7I64d ", (unsigned long long) address);
1315+#endif
1316+ }
1317+ else
1318+ printf ("%7ld ", (long) address); /* Signed here, unlike the wide variant above. */
1319+ break;
1320+
1321+ case 16:
1322+ if (sizeof (address) > sizeof (long))
1323+ {
1324+#ifndef __MSVCRT__
1325+ printf ("%7llx ", (unsigned long long) address);
1326+#else
1327+ printf ("%7I64x ", (unsigned long long) address);
1328+#endif
1329+ }
1330+ else
1331+ printf ("%7lx ", (unsigned long) address);
1332+ break;
1333+ }
1334+}
1335+
1336+/* Return the length in bytes (2, 3 or 4) of the UTF-8 sequence starting at
1337+ BUFFER, or zero if the first BUFLEN bytes do not hold a complete one. */
1338+
1339+static unsigned int
1340+is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
1341+{
1342+ if (buffer[0] < 0xc0) /* ASCII or a continuation byte; read before BUFLEN is tested, so assumes BUFLEN >= 1. */
1343+ return 0;
1344+
1345+ if (buflen < 2)
1346+ return 0;
1347+
1348+ if ((buffer[1] & 0xc0) != 0x80) /* Continuation bytes must look like 10xxxxxx. */
1349+ return 0;
1350+
1351+ if ((buffer[0] & 0x20) == 0) /* 110xxxxx: two byte sequence. */
1352+ return 2;
1353+
1354+ if (buflen < 3)
1355+ return 0;
1356+
1357+ if ((buffer[2] & 0xc0) != 0x80)
1358+ return 0;
1359+
1360+ if ((buffer[0] & 0x10) == 0) /* 1110xxxx: three byte sequence. */
1361+ return 3;
1362+
1363+ if (buflen < 4)
1364+ return 0;
1365+
1366+ if ((buffer[3] & 0xc0) != 0x80)
1367+ return 0;
1368+
1369+ return 4; /* NOTE(review): any lead byte >= 0xf0 gets here, including 0xf8-0xff; overlong forms are not rejected either. */
1370+}
1371+
1372+/* Display a UTF-8 encoded character in BUFFER according to the setting
1373+ of unicode_display. The character is known to be valid, so BUFFER holds
1374+ 2, 3 or 4 bytes. Returns the number of bytes consumed. */
1375+
1376+static unsigned int
1377+display_utf8_char (const unsigned char * buffer)
1378+{
1379+ unsigned int j;
1380+ unsigned int utf8_len;
1381+
1382+ switch (buffer[0] & 0x30) /* Bits 5 and 4 of the lead byte give the sequence length. */
1383+ {
1384+ case 0x00:
1385+ case 0x10:
1386+ utf8_len = 2;
1387+ break;
1388+ case 0x20:
1389+ utf8_len = 3;
1390+ break;
1391+ default:
1392+ utf8_len = 4;
1393+ }
1394+
1395+ switch (unicode_display)
1396+ {
1397+ default:
1398+ fprintf (stderr, "ICE: unexpected unicode display type\n");
1399+ break;
1400+
1401+ case unicode_escape:
1402+ case unicode_highlight:
1403+ if (unicode_display == unicode_highlight && isatty (1))
1404+ printf ("\x1B[31;47m"); /* Red. */
1405+
1406+ switch (utf8_len)
1407+ {
1408+ case 2: /* 11 bit code point: 110xxxxx 10xxxxxx. */
1409+ printf ("\\u%02x%02x",
1410+ ((buffer[0] & 0x1c) >> 2),
1411+ ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
1412+ break;
1413+
1414+ case 3: /* 16 bit code point: 1110xxxx 10xxxxxx 10xxxxxx. */
1415+ printf ("\\u%02x%02x",
1416+ ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
1417+ ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
1418+ break;
1419+
1420+ case 4: /* 21 bit code point: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, printed as bits 20-16, 15-8 and 7-0. */
1421+ printf ("\\u%02x%02x%02x",
1422+ ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
1423+ ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
1424+ ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
1425+ break;
1426+ default:
1427+ /* URG. */
1428+ break;
1429+ }
1430+
1431+ if (unicode_display == unicode_highlight && isatty (1))
1432+ printf ("\033[0m"); /* Default colour. */
1433+ break;
1434+
1435+ case unicode_hex:
1436+ putchar ('<');
1437+ printf ("0x");
1438+ for (j = 0; j < utf8_len; j++)
1439+ printf ("%02x", buffer [j]);
1440+ putchar ('>');
1441+ break;
1442+
1443+ case unicode_locale: /* Emit every byte of the sequence; BUFFER need not be NUL terminated. */
1444+ printf ("%.*s", (int) utf8_len, (const char *) buffer);
1445+ break;
1446+ }
1447+
1448+ return utf8_len;
1449+}
1450+
1451+/* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
1452+ according to the setting of the unicode_display variable. The buffer
1453+ contains BUFLEN bytes. Only runs of at least string_min characters are
1454+ shown, each followed by output_separator or, if that is unset, a newline.
1455+ Display the characters as if they started at ADDRESS and are contained in
1456+ FILENAME. */
1457+
1458+static void
1459+print_unicode_buffer (const char * filename,
1460+ file_ptr address,
1461+ const unsigned char * buffer,
1462+ unsigned long buflen)
1463+{
1464+ /* Paranoia checks... */
1465+ if (filename == NULL
1466+ || buffer == NULL
1467+ || unicode_display == unicode_default
1468+ || encoding != 'S'
1469+ || encoding_bytes != 1)
1470+ {
1471+ fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
1472+ return;
1473+ }
1474+
1475+ if (buflen == 0)
1476+ return;
1477+
1478+ /* We must only display strings that are at least string_min *characters*
1479+ long. So we scan the buffer in two stages. First we locate the start
1480+ of a potential string. Then we walk along it until we have found
1481+ string_min characters. Then we go back to the start point and start
1482+ displaying characters according to the unicode_display setting. */
1483+
1484+ unsigned long start_point = 0;
1485+ unsigned long i = 0;
1486+ unsigned int char_len = 1;
1487+ unsigned int num_found = 0;
1488+
1489+ for (i = 0; i < buflen; i += char_len)
1490+ {
1491+ int c = buffer[i];
1492+
1493+ char_len = 1;
1494+
1495+ /* Find the first potential character of a string. */
1496+ if (! STRING_ISGRAPHIC (c))
1497+ {
1498+ num_found = 0;
1499+ continue;
1500+ }
1501+
1502+ if (c > 126)
1503+ {
1504+ if (c < 0xc0) /* 0x7f-0xbf cannot start a UTF-8 sequence. */
1505+ {
1506+ num_found = 0;
1507+ continue;
1508+ }
1509+
1510+ if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
1511+ {
1512+ char_len = 1; /* Step over just the bad lead byte. */
1513+ num_found = 0;
1514+ continue;
1515+ }
1516+
1517+ if (unicode_display == unicode_invalid)
1518+ {
1519+ /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
1520+ num_found = 0;
1521+ continue;
1522+ }
1523+ }
1524+
1525+ if (num_found == 0)
1526+ /* We have found a potential starting point for a string. */
1527+ start_point = i;
1528+
1529+ ++ num_found;
1530+
1531+ if (num_found >= string_min)
1532+ break;
1533+ }
1534+
1535+ if (num_found < string_min) /* Buffer exhausted without a long enough run. */
1536+ return;
1537+
1538+ print_filename_and_address (filename, address + start_point);
1539+
1540+ /* We have found string_min characters. Display them and any
1541+ more that follow. */
1542+ for (i = start_point; i < buflen; i += char_len)
1543+ {
1544+ int c = buffer[i];
1545+
1546+ char_len = 1;
1547+
1548+ if (! STRING_ISGRAPHIC (c))
1549+ break;
1550+ else if (c < 127)
1551+ putchar (c);
1552+ else if (! is_valid_utf8 (buffer + i, buflen - i))
1553+ break;
1554+ else if (unicode_display == unicode_invalid)
1555+ break;
1556+ else
1557+ char_len = display_utf8_char (buffer + i);
1558+ }
1559+
1560+ if (output_separator)
1561+ fputs (output_separator, stdout);
1562+ else
1563+ putchar ('\n');
1564+
1565+ /* FIXME: Using tail recursion here is lazy programming... */
1566+ print_unicode_buffer (filename, address + i, buffer + i, buflen - i); /* Resume at the byte that ended the string; stops once buflen - i is 0. */
1567+}
1568+/* Return the next input byte: the last entry pushed onto PUTBACK if *NUM_PUTBACK is non-zero, otherwise one read from STREAM (possibly EOF). */
1569+static int
1570+get_unicode_byte (FILE * stream,
1571+ unsigned char * putback,
1572+ unsigned int * num_putback,
1573+ unsigned int * num_read)
1574+{
1575+ if (* num_putback > 0)
1576+ {
1577+ * num_putback = * num_putback - 1; /* Pop: the most recently pushed byte comes back first. */
1578+ return putback [* num_putback];
1579+ }
1580+
1581+ * num_read = * num_read + 1; /* Counts stream reads only (including one that returns EOF), not replayed bytes. */
1582+
1583+#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
1584+ return getc_unlocked (stream);
1585+#else
1586+ return getc (stream);
1587+#endif
1588+}
1589+
1590+/* Helper function for print_unicode_stream. Finds and prints one string read from STREAM, then recurses for the rest.
1591+ PUTBACK_BUF/NUM_PUTBACK hold bytes to be re-read (last pushed is read first); PRINT_BUF stages the first string_min characters. */
1592+static void
1593+print_unicode_stream_body (const char * filename,
1594+ file_ptr address,
1595+ FILE * stream,
1596+ unsigned char * putback_buf,
1597+ unsigned int num_putback,
1598+ unsigned char * print_buf)
1599+{
1600+ /* It would be nice if we could just read the stream into a buffer
1601+ and then process it with print_unicode_buffer. But the input
1602+ might be huge or it might be time-locked (eg stdin). So instead
1603+ we go one byte at a time... */
1604+
1605+ file_ptr start_point = 0;
1606+ unsigned int num_read = 0;
1607+ unsigned int num_chars = 0;
1608+ unsigned int num_print = 0;
1609+ int c = 0;
1610+
1611+ /* Find a series of string_min characters. Put them into print_buf. */
1612+ do
1613+ {
1614+ if (num_chars >= string_min)
1615+ break;
1616+
1617+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1618+ if (c == EOF)
1619+ break;
1620+
1621+ if (! STRING_ISGRAPHIC (c))
1622+ {
1623+ num_chars = num_print = 0;
1624+ continue;
1625+ }
1626+
1627+ if (num_chars == 0)
1628+ start_point = num_read - 1; /* NOTE(review): num_read only counts bytes taken from STREAM, not replayed putback bytes. */
1629+
1630+ if (c < 127)
1631+ {
1632+ print_buf[num_print] = c;
1633+ num_chars ++;
1634+ num_print ++;
1635+ continue;
1636+ }
1637+
1638+ if (c < 0xc0) /* 0x7f-0xbf cannot start a UTF-8 sequence. */
1639+ {
1640+ num_chars = num_print = 0;
1641+ continue;
1642+ }
1643+
1644+ /* We *might* have a UTF-8 sequence. Time to start peeking. */
1645+ unsigned char utf8[4];
1646+
1647+ utf8[0] = c;
1648+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1649+ if (c == EOF)
1650+ break;
1651+ utf8[1] = c;
1652+
1653+ if ((utf8[1] & 0xc0) != 0x80)
1654+ {
1655+ /* Invalid UTF-8. */
1656+ putback_buf[num_putback++] = utf8[1];
1657+ num_chars = num_print = 0;
1658+ continue;
1659+ }
1660+ else if ((utf8[0] & 0x20) == 0)
1661+ {
1662+ /* A valid 2-byte UTF-8 encoding. */
1663+ if (unicode_display == unicode_invalid)
1664+ {
1665+ putback_buf[num_putback++] = utf8[1];
1666+ num_chars = num_print = 0;
1667+ }
1668+ else
1669+ {
1670+ print_buf[num_print ++] = utf8[0];
1671+ print_buf[num_print ++] = utf8[1];
1672+ num_chars ++;
1673+ }
1674+ continue;
1675+ }
1676+
1677+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1678+ if (c == EOF)
1679+ break;
1680+ utf8[2] = c;
1681+
1682+ if ((utf8[2] & 0xc0) != 0x80)
1683+ {
1684+ /* Invalid UTF-8. */
1685+ putback_buf[num_putback++] = utf8[2];
1686+ putback_buf[num_putback++] = utf8[1];
1687+ num_chars = num_print = 0;
1688+ continue;
1689+ }
1690+ else if ((utf8[0] & 0x10) == 0)
1691+ {
1692+ /* A valid 3-byte UTF-8 encoding. */
1693+ if (unicode_display == unicode_invalid)
1694+ {
1695+ putback_buf[num_putback++] = utf8[2];
1696+ putback_buf[num_putback++] = utf8[1];
1697+ num_chars = num_print = 0;
1698+ }
1699+ else
1700+ {
1701+ print_buf[num_print ++] = utf8[0];
1702+ print_buf[num_print ++] = utf8[1];
1703+ print_buf[num_print ++] = utf8[2];
1704+ num_chars ++;
1705+ }
1706+ continue;
1707+ }
1708+
1709+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1710+ if (c == EOF)
1711+ break;
1712+ utf8[3] = c;
1713+
1714+ if ((utf8[3] & 0xc0) != 0x80)
1715+ {
1716+ /* Invalid UTF-8. */
1717+ putback_buf[num_putback++] = utf8[3];
1718+ putback_buf[num_putback++] = utf8[2];
1719+ putback_buf[num_putback++] = utf8[1];
1720+ num_chars = num_print = 0;
1721+ }
1722+ /* We have a valid 4-byte UTF-8 encoding. */
1723+ else if (unicode_display == unicode_invalid)
1724+ {
1725+ putback_buf[num_putback++] = utf8[3]; /* Push in reverse so the bytes are re-read in stream order. */
1726+ putback_buf[num_putback++] = utf8[2];
1727+ putback_buf[num_putback++] = utf8[1];
1728+ num_chars = num_print = 0;
1729+ }
1730+ else
1731+ {
1732+ print_buf[num_print ++] = utf8[0];
1733+ print_buf[num_print ++] = utf8[1];
1734+ print_buf[num_print ++] = utf8[2];
1735+ print_buf[num_print ++] = utf8[3];
1736+ num_chars ++;
1737+ }
1738+ }
1739+ while (1);
1740+
1741+ if (num_chars >= string_min)
1742+ {
1743+ /* We know that we have string_min valid characters in print_buf,
1744+ and there may be more to come in the stream. Start displaying
1745+ them. */
1746+
1747+ print_filename_and_address (filename, address + start_point);
1748+
1749+ unsigned int i;
1750+ for (i = 0; i < num_print;)
1751+ {
1752+ if (print_buf[i] < 127)
1753+ putchar (print_buf[i++]);
1754+ else
1755+ i += display_utf8_char (print_buf + i);
1756+ }
1757+
1758+ /* OK so now we have to start reading unchecked bytes. */
1759+
1760+ /* Display further characters until the string ends. */
1761+ do
1762+ {
1763+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1764+ if (c == EOF)
1765+ break;
1766+
1767+ if (! STRING_ISGRAPHIC (c))
1768+ break;
1769+
1770+ if (c < 127)
1771+ {
1772+ putchar (c);
1773+ continue;
1774+ }
1775+
1776+ if (c < 0xc0)
1777+ break;
1778+
1779+ /* We *might* have a UTF-8 sequence. Time to start peeking. */
1780+ unsigned char utf8[4];
1781+
1782+ utf8[0] = c;
1783+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1784+ if (c == EOF)
1785+ break;
1786+ utf8[1] = c;
1787+
1788+ if ((utf8[1] & 0xc0) != 0x80)
1789+ {
1790+ /* Invalid UTF-8. */
1791+ putback_buf[num_putback++] = utf8[1];
1792+ break;
1793+ }
1794+ else if ((utf8[0] & 0x20) == 0)
1795+ {
1796+ /* Valid 2-byte UTF-8. */
1797+ if (unicode_display == unicode_invalid)
1798+ {
1799+ putback_buf[num_putback++] = utf8[1];
1800+ break;
1801+ }
1802+ else
1803+ {
1804+ (void) display_utf8_char (utf8);
1805+ continue;
1806+ }
1807+ }
1808+
1809+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1810+ if (c == EOF)
1811+ break;
1812+ utf8[2] = c;
1813+
1814+ if ((utf8[2] & 0xc0) != 0x80)
1815+ {
1816+ /* Invalid UTF-8. */
1817+ putback_buf[num_putback++] = utf8[2];
1818+ putback_buf[num_putback++] = utf8[1];
1819+ break;
1820+ }
1821+ else if ((utf8[0] & 0x10) == 0)
1822+ {
1823+ /* Valid 3-byte UTF-8. */
1824+ if (unicode_display == unicode_invalid)
1825+ {
1826+ putback_buf[num_putback++] = utf8[2];
1827+ putback_buf[num_putback++] = utf8[1];
1828+ break;
1829+ }
1830+ else
1831+ {
1832+ (void) display_utf8_char (utf8);
1833+ continue;
1834+ }
1835+ }
1836+
1837+ c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1838+ if (c == EOF)
1839+ break;
1840+ utf8[3] = c;
1841+
1842+ if ((utf8[3] & 0xc0) != 0x80)
1843+ {
1844+ /* Invalid UTF-8. */
1845+ putback_buf[num_putback++] = utf8[3];
1846+ putback_buf[num_putback++] = utf8[2];
1847+ putback_buf[num_putback++] = utf8[1];
1848+ break;
1849+ }
1850+ else if (unicode_display == unicode_invalid)
1851+ {
1852+ putback_buf[num_putback++] = utf8[3];
1853+ putback_buf[num_putback++] = utf8[2];
1854+ putback_buf[num_putback++] = utf8[1];
1855+ break;
1856+ }
1857+ else
1858+ /* A valid 4-byte UTF-8 encoding. */
1859+ (void) display_utf8_char (utf8);
1860+ }
1861+ while (1);
1862+
1863+ if (output_separator)
1864+ fputs (output_separator, stdout);
1865+ else
1866+ putchar ('\n');
1867+ }
1868+
1869+ if (c != EOF)
1870+ /* FIXME: Using tail recursion here is lazy, but it works. */
1871+ print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1872+}
1873+
1874+/* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1875+ encountered according to the setting of the unicode_display variable.
1876+ The stream is positioned at ADDRESS and is attached to FILENAME. */
1877+
1878+static void
1879+print_unicode_stream (const char * filename,
1880+ file_ptr address,
1881+ FILE * stream)
1882+{
1883+ /* Paranoia checks... */
1884+ if (filename == NULL
1885+ || stream == NULL
1886+ || unicode_display == unicode_default
1887+ || encoding != 'S'
1888+ || encoding_bytes != 1)
1889+ {
1890+ fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1891+ return;
1892+ }
1893+
1894+ /* Allocate space for string_min 4-byte utf-8 characters. */
1895+ unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1896+ /* We should never have to put back more than 4 bytes. */
1897+ unsigned char putback_buf[5];
1898+ unsigned int num_putback = 0;
1899+
1900+ print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf); /* Does all the scanning and printing; both buffers are scratch space for it. */
1901+ free (print_buf);
1902+}
1903
1904 /* Find the strings in file FILENAME, read from STREAM.
1905 Assume that STREAM is positioned so that the next byte read
1906@@ -566,20 +1236,29 @@ unget_part_char (long c, file_ptr *addre
1907
1908 static void
1909 print_strings (const char *filename, FILE *stream, file_ptr address,
1910- int stop_point, int magiccount, char *magic)
1911+ int magiccount, char *magic)
1912 {
1913+ if (unicode_display != unicode_default)
1914+ {
1915+ if (magic != NULL)
1916+ print_unicode_buffer (filename, address,
1917+ (const unsigned char *) magic, magiccount);
1918+
1919+ if (stream != NULL)
1920+ print_unicode_stream (filename, address, stream);
1921+ return;
1922+ }
1923+
1924 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1925
1926 while (1)
1927 {
1928 file_ptr start;
1929- int i;
1930+ unsigned int i;
1931 long c;
1932
1933 /* See if the next `string_min' chars are all graphic chars. */
1934 tryline:
1935- if (stop_point && address >= stop_point)
1936- break;
1937 start = address;
1938 for (i = 0; i < string_min; i++)
1939 {
1940@@ -601,51 +1280,7 @@ print_strings (const char *filename, FIL
1941
1942 /* We found a run of `string_min' graphic characters. Print up
1943 to the next non-graphic character. */
1944-
1945- if (print_filenames)
1946- printf ("%s: ", filename);
1947- if (print_addresses)
1948- switch (address_radix)
1949- {
1950- case 8:
1951- if (sizeof (start) > sizeof (long))
1952- {
1953-#ifndef __MSVCRT__
1954- printf ("%7llo ", (unsigned long long) start);
1955-#else
1956- printf ("%7I64o ", (unsigned long long) start);
1957-#endif
1958- }
1959- else
1960- printf ("%7lo ", (unsigned long) start);
1961- break;
1962-
1963- case 10:
1964- if (sizeof (start) > sizeof (long))
1965- {
1966-#ifndef __MSVCRT__
1967- printf ("%7llu ", (unsigned long long) start);
1968-#else
1969- printf ("%7I64d ", (unsigned long long) start);
1970-#endif
1971- }
1972- else
1973- printf ("%7ld ", (long) start);
1974- break;
1975-
1976- case 16:
1977- if (sizeof (start) > sizeof (long))
1978- {
1979-#ifndef __MSVCRT__
1980- printf ("%7llx ", (unsigned long long) start);
1981-#else
1982- printf ("%7I64x ", (unsigned long long) start);
1983-#endif
1984- }
1985- else
1986- printf ("%7lx ", (unsigned long) start);
1987- break;
1988- }
1989+ print_filename_and_address (filename, start);
1990
1991 buf[i] = '\0';
1992 fputs (buf, stdout);
1993@@ -697,6 +1332,8 @@ usage (FILE *stream, int status)
1994 -T --target=<BFDNAME> Specify the binary file format\n\
1995 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1996 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1997+ --unicode={default|show|invalid|hex|escape|highlight}\n\
1998+ -u {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1999 -s --output-separator=<string> String used to separate strings in output.\n\
2000 @<file> Read options from <file>\n\
2001 -h --help Display this information\n\