summaryrefslogtreecommitdiffstats
path: root/meta/recipes-devtools/gcc
diff options
context:
space:
mode:
author: pgowda <pgowda.cve@gmail.com> 2022-01-04 20:57:50 -0800
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2022-01-14 09:34:04 +0000
commit: e31641c608bdbc1d04f516d2ac11b60a14746b85 (patch)
tree: 3f829bdd886a58143b678e0f2f4ccc48d0acfe46 /meta/recipes-devtools/gcc
parent: 88ca7b3d7eb7c68079a16fbf2a2dbc1056f7680e (diff)
download: poky-e31641c608bdbc1d04f516d2ac11b60a14746b85.tar.gz
gcc: Fix CVE-2021-42574
Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=004bb936d6d5f177af26ad4905595e843d5665a5]
Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e]
Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=51c500269bf53749b107807d84271385fad35628]
Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=1a7f2c0774129750fdf73e9f1b78f0ce983c9ab3]
Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=bef32d4a28595e933f24fef378cf052a30b674a7]
(From OE-Core rev: d0f4614e2c6e9090a0c45052c36d0c7f3215de10)
Signed-off-by: pgowda <pgowda.cve@gmail.com>
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'meta/recipes-devtools/gcc')
-rw-r--r-- meta/recipes-devtools/gcc/gcc-10.2.inc 5
-rw-r--r-- meta/recipes-devtools/gcc/gcc/0001-CVE-2021-42574.patch 2906
-rw-r--r-- meta/recipes-devtools/gcc/gcc/0002-CVE-2021-42574.patch 2270
-rw-r--r-- meta/recipes-devtools/gcc/gcc/0003-CVE-2021-42574.patch 1724
-rw-r--r-- meta/recipes-devtools/gcc/gcc/0004-CVE-2021-42574.patch 138
-rw-r--r-- meta/recipes-devtools/gcc/gcc/0005-CVE-2021-42574.patch 575
6 files changed, 7618 insertions, 0 deletions
diff --git a/meta/recipes-devtools/gcc/gcc-10.2.inc b/meta/recipes-devtools/gcc/gcc-10.2.inc
index 89158258d7..656c43258c 100644
--- a/meta/recipes-devtools/gcc/gcc-10.2.inc
+++ b/meta/recipes-devtools/gcc/gcc-10.2.inc
@@ -75,6 +75,11 @@ SRC_URI = "\
75 file://0003-CVE-2021-35465.patch \ 75 file://0003-CVE-2021-35465.patch \
76 file://0004-CVE-2021-35465.patch \ 76 file://0004-CVE-2021-35465.patch \
77 file://0039-arm64-neoverse-n2-support.patch \ 77 file://0039-arm64-neoverse-n2-support.patch \
78 file://0001-CVE-2021-42574.patch \
79 file://0002-CVE-2021-42574.patch \
80 file://0003-CVE-2021-42574.patch \
81 file://0004-CVE-2021-42574.patch \
82 file://0005-CVE-2021-42574.patch \
78" 83"
79SRC_URI[sha256sum] = "b8dd4368bb9c7f0b98188317ee0254dd8cc99d1e3a18d0ff146c855fe16c1d8c" 84SRC_URI[sha256sum] = "b8dd4368bb9c7f0b98188317ee0254dd8cc99d1e3a18d0ff146c855fe16c1d8c"
80 85
diff --git a/meta/recipes-devtools/gcc/gcc/0001-CVE-2021-42574.patch b/meta/recipes-devtools/gcc/gcc/0001-CVE-2021-42574.patch
new file mode 100644
index 0000000000..e0f4f7d32f
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0001-CVE-2021-42574.patch
@@ -0,0 +1,2906 @@
1From 004bb936d6d5f177af26ad4905595e843d5665a5 Mon Sep 17 00:00:00 2001
2From: Lewis Hyatt <lhyatt@gmail.com>
3Date: Tue, 14 Jul 2020 12:05:56 -0400
4Subject: [PATCH] diagnostics: Support conversion of tabs to spaces [PR49973]
5 [PR86904]
6
7Supports conversion of tabs to spaces when outputting diagnostics. Also
8adds -fdiagnostics-column-unit and -fdiagnostics-column-origin options to
9control how the column number is output, thereby resolving the two PRs.
10
11gcc/c-family/ChangeLog:
12
13 PR other/86904
14 * c-indentation.c (should_warn_for_misleading_indentation): Get
15 global tabstop from the new source.
16 * c-opts.c (c_common_handle_option): Remove handling of -ftabstop, which
17 is now a common option.
18 * c.opt: Likewise.
19
20gcc/ChangeLog:
21
22 PR preprocessor/49973
23 PR other/86904
24 * common.opt: Handle -ftabstop here instead of in c-family
25 options. Add -fdiagnostics-column-unit= and
26 -fdiagnostics-column-origin= options.
27 * opts.c (common_handle_option): Handle the new options.
28 * diagnostic-format-json.cc (json_from_expanded_location): Add
29 diagnostic_context argument. Use it to convert column numbers as per
30 the new options.
31 (json_from_location_range): Likewise.
32 (json_from_fixit_hint): Likewise.
33 (json_end_diagnostic): Pass the new context argument to helper
34 functions above. Add "column-origin" field to the output.
35 (test_unknown_location): Add the new context argument to calls to
36 helper functions.
37 (test_bad_endpoints): Likewise.
38 * diagnostic-show-locus.c
39 (exploc_with_display_col::exploc_with_display_col): Support
40 tabstop parameter.
41 (layout_point::layout_point): Make use of class
42 exploc_with_display_col.
43 (layout_range::layout_range): Likewise.
44 (struct line_bounds): Clarify that the units are now always
45 display columns. Rename members accordingly. Add constructor.
46 (layout::print_source_line): Add support for tab expansion.
47 (make_range): Adapt to class layout_range changes.
48 (layout::maybe_add_location_range): Likewise.
49 (layout::layout): Adapt to class exploc_with_display_col changes.
50 (layout::calculate_x_offset_display): Support tabstop parameter.
51 (layout::print_annotation_line): Adapt to struct line_bounds changes.
52 (layout::print_line): Likewise.
53 (line_label::line_label): Add diagnostic_context argument.
54 (get_affected_range): Likewise.
55 (get_printed_columns): Likewise.
56 (layout::print_any_labels): Adapt to struct line_label changes.
57 (class correction): Add m_tabstop member.
58 (correction::correction): Add tabstop argument.
59 (correction::compute_display_cols): Use m_tabstop.
60 (class line_corrections): Add m_context member.
61 (line_corrections::line_corrections): Add diagnostic_context argument.
62 (line_corrections::add_hint): Use m_context to handle tabstops.
63 (layout::print_trailing_fixits): Adapt to class line_corrections
64 changes.
65 (test_layout_x_offset_display_utf8): Support tabstop parameter.
66 (test_layout_x_offset_display_tab): New selftest.
67 (test_one_liner_colorized_utf8): Likewise.
68 (test_tab_expansion): Likewise.
69 (test_diagnostic_show_locus_one_liner_utf8): Call the new tests.
70 (diagnostic_show_locus_c_tests): Likewise.
71 (test_overlapped_fixit_printing): Adapt to helper class and
72 function changes.
73 (test_overlapped_fixit_printing_utf8): Likewise.
74 (test_overlapped_fixit_printing_2): Likewise.
75 * diagnostic.h (enum diagnostics_column_unit): New enum.
76 (struct diagnostic_context): Add members for the new options.
77 (diagnostic_converted_column): Declare.
78 (json_from_expanded_location): Add new context argument.
79 * diagnostic.c (diagnostic_initialize): Initialize new members.
80 (diagnostic_converted_column): New function.
81 (maybe_line_and_column): Be willing to output a column of 0.
82 (diagnostic_get_location_text): Convert column number as per the new
83 options.
84 (diagnostic_report_current_module): Likewise.
85 (assert_location_text): Add origin and column_unit arguments for
86 testing the new functionality.
87 (test_diagnostic_get_location_text): Test the new functionality.
88 * doc/invoke.texi: Document the new options and behavior.
89 * input.h (location_compute_display_column): Add tabstop argument.
90 * input.c (location_compute_display_column): Likewise.
91 (test_cpp_utf8): Add selftests for tab expansion.
92 * tree-diagnostic-path.cc (default_tree_make_json_for_path): Pass the
93 new context argument to json_from_expanded_location().
94
95libcpp/ChangeLog:
96
97 PR preprocessor/49973
98 PR other/86904
99 * include/cpplib.h (struct cpp_options): Removed support for -ftabstop,
100 which is now handled by diagnostic_context.
101 (class cpp_display_width_computation): New class.
102 (cpp_byte_column_to_display_column): Add optional tabstop argument.
103 (cpp_display_width): Likewise.
104 (cpp_display_column_to_byte_column): Likewise.
105 * charset.c
106 (cpp_display_width_computation::cpp_display_width_computation): New
107 function.
108 (cpp_display_width_computation::advance_display_cols): Likewise.
109 (compute_next_display_width): Removed and implemented this
110 functionality in a new function...
111 (cpp_display_width_computation::process_next_codepoint): ...here.
112 (cpp_byte_column_to_display_column): Added tabstop argument.
113 Reimplemented in terms of class cpp_display_width_computation.
114 (cpp_display_column_to_byte_column): Likewise.
115 * init.c (cpp_create_reader): Remove handling of -ftabstop, which is now
116 handled by diagnostic_context.
117
118gcc/testsuite/ChangeLog:
119
120 PR preprocessor/49973
121 PR other/86904
122 * c-c++-common/Wmisleading-indentation-3.c: Adjust expected output
123 for new defaults.
124 * c-c++-common/Wmisleading-indentation.c: Likewise.
125 * c-c++-common/diagnostic-format-json-1.c: Likewise.
126 * c-c++-common/diagnostic-format-json-2.c: Likewise.
127 * c-c++-common/diagnostic-format-json-3.c: Likewise.
128 * c-c++-common/diagnostic-format-json-4.c: Likewise.
129 * c-c++-common/diagnostic-format-json-5.c: Likewise.
130 * c-c++-common/missing-close-symbol.c: Likewise.
131 * g++.dg/diagnostic/bad-binary-ops.C: Likewise.
132 * g++.dg/parse/error4.C: Likewise.
133 * g++.old-deja/g++.brendan/crash11.C: Likewise.
134 * g++.old-deja/g++.pt/overload2.C: Likewise.
135 * g++.old-deja/g++.robertl/eb109.C: Likewise.
136 * gcc.dg/analyzer/malloc-paths-9.c: Likewise.
137 * gcc.dg/bad-binary-ops.c: Likewise.
138 * gcc.dg/format/branch-1.c: Likewise.
139 * gcc.dg/format/pr79210.c: Likewise.
140 * gcc.dg/plugin/diagnostic-test-expressions-1.c: Likewise.
141 * gcc.dg/plugin/diagnostic-test-string-literals-1.c: Likewise.
142 * gcc.dg/redecl-4.c: Likewise.
143 * gfortran.dg/diagnostic-format-json-1.F90: Likewise.
144 * gfortran.dg/diagnostic-format-json-2.F90: Likewise.
145 * gfortran.dg/diagnostic-format-json-3.F90: Likewise.
146 * go.dg/arrayclear.go: Add a comment explaining why adding a
147 comment was necessary to work around a dejagnu bug.
148 * c-c++-common/diagnostic-units-1.c: New test.
149 * c-c++-common/diagnostic-units-2.c: New test.
150 * c-c++-common/diagnostic-units-3.c: New test.
151 * c-c++-common/diagnostic-units-4.c: New test.
152 * c-c++-common/diagnostic-units-5.c: New test.
153 * c-c++-common/diagnostic-units-6.c: New test.
154 * c-c++-common/diagnostic-units-7.c: New test.
155 * c-c++-common/diagnostic-units-8.c: New test.
156
157CVE: CVE-2021-42574
158Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=004bb936d6d5f177af26ad4905595e843d5665a5]
159Signed-off-by: Pgowda <pgowda.cve@gmail.com>
160---
161 gcc/c-family/c-indentation.c | 5 +-
162 gcc/c-family/c-opts.c | 6 -
163 gcc/c-family/c.opt | 4 -
164 gcc/common.opt | 21 +
165 gcc/diagnostic-format-json.cc | 55 +-
166 gcc/diagnostic-show-locus.c | 504 +++++++++++++-----
167 gcc/diagnostic.c | 113 +++-
168 gcc/diagnostic.h | 28 +-
169 gcc/doc/invoke.texi | 68 ++-
170 gcc/input.c | 72 ++-
171 gcc/input.h | 4 +-
172 gcc/opts.c | 14 +
173 .../c-c++-common/Wmisleading-indentation-3.c | 12 +-
174 .../c-c++-common/Wmisleading-indentation.c | 6 +-
175 .../c-c++-common/diagnostic-format-json-1.c | 5 +
176 .../c-c++-common/diagnostic-format-json-2.c | 5 +
177 .../c-c++-common/diagnostic-format-json-3.c | 5 +
178 .../c-c++-common/diagnostic-format-json-4.c | 9 +
179 .../c-c++-common/diagnostic-format-json-5.c | 9 +
180 .../c-c++-common/diagnostic-units-1.c | 28 +
181 .../c-c++-common/diagnostic-units-2.c | 28 +
182 .../c-c++-common/diagnostic-units-3.c | 28 +
183 .../c-c++-common/diagnostic-units-4.c | 28 +
184 .../c-c++-common/diagnostic-units-5.c | 28 +
185 .../c-c++-common/diagnostic-units-6.c | 28 +
186 .../c-c++-common/diagnostic-units-7.c | 28 +
187 .../c-c++-common/diagnostic-units-8.c | 28 +
188 .../c-c++-common/missing-close-symbol.c | 6 +-
189 .../g++.dg/diagnostic/bad-binary-ops.C | 8 +-
190 gcc/testsuite/g++.dg/parse/error4.C | 2 +-
191 .../g++.old-deja/g++.brendan/crash11.C | 4 +-
192 gcc/testsuite/g++.old-deja/g++.pt/overload2.C | 2 +-
193 .../g++.old-deja/g++.robertl/eb109.C | 4 +-
194 .../gcc.dg/analyzer/malloc-paths-9.c | 2 +-
195 gcc/testsuite/gcc.dg/bad-binary-ops.c | 8 +-
196 gcc/testsuite/gcc.dg/format/branch-1.c | 2 +-
197 gcc/testsuite/gcc.dg/format/pr79210.c | 2 +-
198 .../plugin/diagnostic-test-expressions-1.c | 16 +-
199 .../diagnostic-test-string-literals-1.c | 4 +-
200 gcc/testsuite/gcc.dg/redecl-4.c | 2 +-
201 .../gfortran.dg/diagnostic-format-json-1.F90 | 5 +
202 .../gfortran.dg/diagnostic-format-json-2.F90 | 5 +
203 .../gfortran.dg/diagnostic-format-json-3.F90 | 5 +
204 gcc/testsuite/go.dg/arrayclear.go | 3 +
205 gcc/tree-diagnostic-path.cc | 5 +-
206 libcpp/charset.c | 98 ++--
207 libcpp/include/cpplib.h | 40 +-
208 libcpp/init.c | 1 -
209 48 files changed, 1106 insertions(+), 287 deletions(-)
210 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-1.c
211 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-2.c
212 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-3.c
213 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-4.c
214 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-5.c
215 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-6.c
216 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-7.c
217 create mode 100644 gcc/testsuite/c-c++-common/diagnostic-units-8.c
218
219diff --git a/gcc/c-family/c-indentation.c b/gcc/c-family/c-indentation.c
220--- a/gcc/c-family/c-indentation.c 2020-07-22 23:35:17.296384022 -0700
221+++ b/gcc/c-family/c-indentation.c 2021-12-25 01:20:53.475636694 -0800
222@@ -24,8 +24,7 @@ along with GCC; see the file COPYING3.
223 #include "c-common.h"
224 #include "c-indentation.h"
225 #include "selftest.h"
226-
227-extern cpp_options *cpp_opts;
228+#include "diagnostic.h"
229
230 /* Round up VIS_COLUMN to nearest tab stop. */
231
232@@ -294,7 +293,7 @@ should_warn_for_misleading_indentation (
233 expanded_location next_stmt_exploc = expand_location (next_stmt_loc);
234 expanded_location guard_exploc = expand_location (guard_loc);
235
236- const unsigned int tab_width = cpp_opts->tabstop;
237+ const unsigned int tab_width = global_dc->tabstop;
238
239 /* They must be in the same file. */
240 if (next_stmt_exploc.file != body_exploc.file)
241diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
242--- a/gcc/c-family/c.opt 2021-12-24 20:23:42.816809230 -0800
243+++ b/gcc/c-family/c.opt 2021-12-25 01:20:53.475636694 -0800
244@@ -1876,10 +1876,6 @@ Enum(strong_eval_order) String(some) Val
245 EnumValue
246 Enum(strong_eval_order) String(all) Value(2)
247
248-ftabstop=
249-C ObjC C++ ObjC++ Joined RejectNegative UInteger
250--ftabstop=<number> Distance between tab stops for column reporting.
251-
252 ftemplate-backtrace-limit=
253 C++ ObjC++ Joined RejectNegative UInteger Var(template_backtrace_limit) Init(10)
254 Set the maximum number of template instantiation notes for a single warning or error.
255diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c
256--- a/gcc/c-family/c-opts.c 2021-12-24 20:23:44.824774786 -0800
257+++ b/gcc/c-family/c-opts.c 2021-12-25 01:20:53.475636694 -0800
258@@ -504,12 +504,6 @@ c_common_handle_option (size_t scode, co
259 cpp_opts->track_macro_expansion = 2;
260 break;
261
262- case OPT_ftabstop_:
263- /* It is documented that we silently ignore silly values. */
264- if (value >= 1 && value <= 100)
265- cpp_opts->tabstop = value;
266- break;
267-
268 case OPT_fexec_charset_:
269 cpp_opts->narrow_charset = arg;
270 break;
271diff --git a/gcc/common.opt b/gcc/common.opt
272--- a/gcc/common.opt 2021-12-24 20:23:42.480814993 -0800
273+++ b/gcc/common.opt 2021-12-25 01:20:53.475636694 -0800
274@@ -1325,6 +1325,14 @@ Enum(diagnostic_url_rule) String(always)
275 EnumValue
276 Enum(diagnostic_url_rule) String(auto) Value(DIAGNOSTICS_URL_AUTO)
277
278+fdiagnostics-column-unit=
279+Common Joined RejectNegative Enum(diagnostics_column_unit)
280+-fdiagnostics-column-unit=[display|byte] Select whether column numbers are output as display columns (default) or raw bytes.
281+
282+fdiagnostics-column-origin=
283+Common Joined RejectNegative UInteger
284+-fdiagnostics-column-origin=<number> Set the number of the first column. The default is 1-based as per GNU style, but some utilities may expect 0-based, for example.
285+
286 fdiagnostics-format=
287 Common Joined RejectNegative Enum(diagnostics_output_format)
288 -fdiagnostics-format=[text|json] Select output format.
289@@ -1334,6 +1342,15 @@ SourceInclude
290 diagnostic.h
291
292 Enum
293+Name(diagnostics_column_unit) Type(int)
294+
295+EnumValue
296+Enum(diagnostics_column_unit) String(display) Value(DIAGNOSTICS_COLUMN_UNIT_DISPLAY)
297+
298+EnumValue
299+Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE)
300+
301+Enum
302 Name(diagnostics_output_format) Type(int)
303
304 EnumValue
305@@ -1362,6 +1379,10 @@ fdiagnostics-path-format=
306 Common Joined RejectNegative Var(flag_diagnostics_path_format) Enum(diagnostic_path_format) Init(DPF_INLINE_EVENTS)
307 Specify how to print any control-flow path associated with a diagnostic.
308
309+ftabstop=
310+Common Joined RejectNegative UInteger
311+-ftabstop=<number> Distance between tab stops for column reporting.
312+
313 Enum
314 Name(diagnostic_path_format) Type(int)
315
316diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c
317--- a/gcc/diagnostic.c 2020-07-22 23:35:17.556386887 -0700
318+++ b/gcc/diagnostic.c 2021-12-25 01:23:41.300841207 -0800
319@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.
320 #include "selftest.h"
321 #include "selftest-diagnostic.h"
322 #include "opts.h"
323+#include "cpplib.h"
324
325 #ifdef HAVE_TERMIOS_H
326 # include <termios.h>
327@@ -219,6 +220,9 @@ diagnostic_initialize (diagnostic_contex
328 context->min_margin_width = 0;
329 context->show_ruler_p = false;
330 context->parseable_fixits_p = false;
331+ context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY;
332+ context->column_origin = 1;
333+ context->tabstop = 8;
334 context->edit_context_ptr = NULL;
335 context->diagnostic_group_nesting_depth = 0;
336 context->diagnostic_group_emission_count = 0;
337@@ -353,8 +357,51 @@ diagnostic_get_color_for_kind (diagnosti
338 return diagnostic_kind_color[kind];
339 }
340
341+/* Given an expanded_location, convert the column (which is in 1-based bytes)
342+ to the requested units, without converting the origin.
343+ Return -1 if the column is invalid (<= 0). */
344+
345+static int
346+convert_column_unit (enum diagnostics_column_unit column_unit,
347+ int tabstop,
348+ expanded_location s)
349+{
350+ if (s.column <= 0)
351+ return -1;
352+
353+ switch (column_unit)
354+ {
355+ default:
356+ gcc_unreachable ();
357+
358+ case DIAGNOSTICS_COLUMN_UNIT_DISPLAY:
359+ {
360+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
361+ return location_compute_display_column (s, policy);
362+ }
363+
364+ case DIAGNOSTICS_COLUMN_UNIT_BYTE:
365+ return s.column;
366+ }
367+}
368+
369+/* Given an expanded_location, convert the column (which is in 1-based bytes)
370+ to the requested units and origin. Return -1 if the column is
371+ invalid (<= 0). */
372+int
373+diagnostic_converted_column (diagnostic_context *context, expanded_location s)
374+{
375+ int one_based_col
376+ = convert_column_unit (context->column_unit, context->tabstop, s);
377+ if (one_based_col <= 0)
378+ return -1;
379+ return one_based_col + (context->column_origin - 1);
380+}
381+
382 /* Return a formatted line and column ':%line:%column'. Elided if
383- zero. The result is a statically allocated buffer. */
384+ line == 0 or col < 0. (A column of 0 may be valid due to the
385+ -fdiagnostics-column-origin option.)
386+ The result is a statically allocated buffer. */
387
388 static const char *
389 maybe_line_and_column (int line, int col)
390@@ -363,8 +410,9 @@ maybe_line_and_column (int line, int col
391
392 if (line)
393 {
394- size_t l = snprintf (result, sizeof (result),
395- col ? ":%d:%d" : ":%d", line, col);
396+ size_t l
397+ = snprintf (result, sizeof (result),
398+ col >= 0 ? ":%d:%d" : ":%d", line, col);
399 gcc_checking_assert (l < sizeof (result));
400 }
401 else
402@@ -383,8 +431,14 @@ diagnostic_get_location_text (diagnostic
403 const char *locus_cs = colorize_start (pp_show_color (pp), "locus");
404 const char *locus_ce = colorize_stop (pp_show_color (pp));
405 const char *file = s.file ? s.file : progname;
406- int line = strcmp (file, N_("<built-in>")) ? s.line : 0;
407- int col = context->show_column ? s.column : 0;
408+ int line = 0;
409+ int col = -1;
410+ if (strcmp (file, N_("<built-in>")))
411+ {
412+ line = s.line;
413+ if (context->show_column)
414+ col = diagnostic_converted_column (context, s);
415+ }
416
417 const char *line_col = maybe_line_and_column (line, col);
418 return build_message_string ("%s%s%s:%s", locus_cs, file,
419@@ -650,14 +704,20 @@ diagnostic_report_current_module (diagno
420 if (! MAIN_FILE_P (map))
421 {
422 bool first = true;
423+ expanded_location s = {};
424 do
425 {
426 where = linemap_included_from (map);
427 map = linemap_included_from_linemap (line_table, map);
428- const char *line_col
429- = maybe_line_and_column (SOURCE_LINE (map, where),
430- first && context->show_column
431- ? SOURCE_COLUMN (map, where) : 0);
432+ s.file = LINEMAP_FILE (map);
433+ s.line = SOURCE_LINE (map, where);
434+ int col = -1;
435+ if (first && context->show_column)
436+ {
437+ s.column = SOURCE_COLUMN (map, where);
438+ col = diagnostic_converted_column (context, s);
439+ }
440+ const char *line_col = maybe_line_and_column (s.line, col);
441 static const char *const msgs[] =
442 {
443 N_("In file included from"),
444@@ -666,7 +726,7 @@ diagnostic_report_current_module (diagno
445 unsigned index = !first;
446 pp_verbatim (context->printer, "%s%s %r%s%s%R",
447 first ? "" : ",\n", _(msgs[index]),
448- "locus", LINEMAP_FILE (map), line_col);
449+ "locus", s.file, line_col);
450 first = false;
451 }
452 while (! MAIN_FILE_P (map));
453@@ -2042,10 +2102,15 @@ test_print_parseable_fixits_replace ()
454 static void
455 assert_location_text (const char *expected_loc_text,
456 const char *filename, int line, int column,
457- bool show_column)
458+ bool show_column,
459+ int origin = 1,
460+ enum diagnostics_column_unit column_unit
461+ = DIAGNOSTICS_COLUMN_UNIT_BYTE)
462 {
463 test_diagnostic_context dc;
464 dc.show_column = show_column;
465+ dc.column_unit = column_unit;
466+ dc.column_origin = origin;
467
468 expanded_location xloc;
469 xloc.file = filename;
470@@ -2069,7 +2134,10 @@ test_diagnostic_get_location_text ()
471 assert_location_text ("PROGNAME:", NULL, 0, 0, true);
472 assert_location_text ("<built-in>:", "<built-in>", 42, 10, true);
473 assert_location_text ("foo.c:42:10:", "foo.c", 42, 10, true);
474- assert_location_text ("foo.c:42:", "foo.c", 42, 0, true);
475+ assert_location_text ("foo.c:42:9:", "foo.c", 42, 10, true, 0);
476+ assert_location_text ("foo.c:42:1010:", "foo.c", 42, 10, true, 1001);
477+ for (int origin = 0; origin != 2; ++origin)
478+ assert_location_text ("foo.c:42:", "foo.c", 42, 0, true, origin);
479 assert_location_text ("foo.c:", "foo.c", 0, 10, true);
480 assert_location_text ("foo.c:42:", "foo.c", 42, 10, false);
481 assert_location_text ("foo.c:", "foo.c", 0, 10, false);
482@@ -2077,6 +2145,41 @@ test_diagnostic_get_location_text ()
483 maybe_line_and_column (INT_MAX, INT_MAX);
484 maybe_line_and_column (INT_MIN, INT_MIN);
485
486+ {
487+ /* In order to test display columns vs byte columns, we need to create a
488+ file for location_get_source_line() to read. */
489+
490+ const char *const content = "smile \xf0\x9f\x98\x82\n";
491+ const int line_bytes = strlen (content) - 1;
492+ const int def_tabstop = 8;
493+ const int display_width = cpp_display_width (content, line_bytes,
494+ def_tabstop);
495+ ASSERT_EQ (line_bytes - 2, display_width);
496+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
497+ const char *const fname = tmp.get_filename ();
498+ const int buf_len = strlen (fname) + 16;
499+ char *const expected = XNEWVEC (char, buf_len);
500+
501+ snprintf (expected, buf_len, "%s:1:%d:", fname, line_bytes);
502+ assert_location_text (expected, fname, 1, line_bytes, true,
503+ 1, DIAGNOSTICS_COLUMN_UNIT_BYTE);
504+
505+ snprintf (expected, buf_len, "%s:1:%d:", fname, line_bytes - 1);
506+ assert_location_text (expected, fname, 1, line_bytes, true,
507+ 0, DIAGNOSTICS_COLUMN_UNIT_BYTE);
508+
509+ snprintf (expected, buf_len, "%s:1:%d:", fname, display_width);
510+ assert_location_text (expected, fname, 1, line_bytes, true,
511+ 1, DIAGNOSTICS_COLUMN_UNIT_DISPLAY);
512+
513+ snprintf (expected, buf_len, "%s:1:%d:", fname, display_width - 1);
514+ assert_location_text (expected, fname, 1, line_bytes, true,
515+ 0, DIAGNOSTICS_COLUMN_UNIT_DISPLAY);
516+
517+ XDELETEVEC (expected);
518+ }
519+
520+
521 progname = old_progname;
522 }
523
524diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc
525--- a/gcc/diagnostic-format-json.cc 2020-07-22 23:35:17.556386887 -0700
526+++ b/gcc/diagnostic-format-json.cc 2021-12-25 01:20:53.475636694 -0800
527@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.
528 #include "system.h"
529 #include "coretypes.h"
530 #include "diagnostic.h"
531+#include "selftest-diagnostic.h"
532 #include "diagnostic-metadata.h"
533 #include "json.h"
534 #include "selftest.h"
535@@ -43,21 +44,43 @@ static json::array *cur_children_array;
536 /* Generate a JSON object for LOC. */
537
538 json::value *
539-json_from_expanded_location (location_t loc)
540+json_from_expanded_location (diagnostic_context *context, location_t loc)
541 {
542 expanded_location exploc = expand_location (loc);
543 json::object *result = new json::object ();
544 if (exploc.file)
545 result->set ("file", new json::string (exploc.file));
546 result->set ("line", new json::integer_number (exploc.line));
547- result->set ("column", new json::integer_number (exploc.column));
548+
549+ const enum diagnostics_column_unit orig_unit = context->column_unit;
550+ struct
551+ {
552+ const char *name;
553+ enum diagnostics_column_unit unit;
554+ } column_fields[] = {
555+ {"display-column", DIAGNOSTICS_COLUMN_UNIT_DISPLAY},
556+ {"byte-column", DIAGNOSTICS_COLUMN_UNIT_BYTE}
557+ };
558+ int the_column = INT_MIN;
559+ for (int i = 0; i != sizeof column_fields / sizeof (*column_fields); ++i)
560+ {
561+ context->column_unit = column_fields[i].unit;
562+ const int col = diagnostic_converted_column (context, exploc);
563+ result->set (column_fields[i].name, new json::integer_number (col));
564+ if (column_fields[i].unit == orig_unit)
565+ the_column = col;
566+ }
567+ gcc_assert (the_column != INT_MIN);
568+ result->set ("column", new json::integer_number (the_column));
569+ context->column_unit = orig_unit;
570 return result;
571 }
572
573 /* Generate a JSON object for LOC_RANGE. */
574
575 static json::object *
576-json_from_location_range (const location_range *loc_range, unsigned range_idx)
577+json_from_location_range (diagnostic_context *context,
578+ const location_range *loc_range, unsigned range_idx)
579 {
580 location_t caret_loc = get_pure_location (loc_range->m_loc);
581
582@@ -68,13 +91,13 @@ json_from_location_range (const location
583 location_t finish_loc = get_finish (loc_range->m_loc);
584
585 json::object *result = new json::object ();
586- result->set ("caret", json_from_expanded_location (caret_loc));
587+ result->set ("caret", json_from_expanded_location (context, caret_loc));
588 if (start_loc != caret_loc
589 && start_loc != UNKNOWN_LOCATION)
590- result->set ("start", json_from_expanded_location (start_loc));
591+ result->set ("start", json_from_expanded_location (context, start_loc));
592 if (finish_loc != caret_loc
593 && finish_loc != UNKNOWN_LOCATION)
594- result->set ("finish", json_from_expanded_location (finish_loc));
595+ result->set ("finish", json_from_expanded_location (context, finish_loc));
596
597 if (loc_range->m_label)
598 {
599@@ -91,14 +114,14 @@ json_from_location_range (const location
600 /* Generate a JSON object for HINT. */
601
602 static json::object *
603-json_from_fixit_hint (const fixit_hint *hint)
604+json_from_fixit_hint (diagnostic_context *context, const fixit_hint *hint)
605 {
606 json::object *fixit_obj = new json::object ();
607
608 location_t start_loc = hint->get_start_loc ();
609- fixit_obj->set ("start", json_from_expanded_location (start_loc));
610+ fixit_obj->set ("start", json_from_expanded_location (context, start_loc));
611 location_t next_loc = hint->get_next_loc ();
612- fixit_obj->set ("next", json_from_expanded_location (next_loc));
613+ fixit_obj->set ("next", json_from_expanded_location (context, next_loc));
614 fixit_obj->set ("string", new json::string (hint->get_string ()));
615
616 return fixit_obj;
617@@ -190,11 +213,13 @@ json_end_diagnostic (diagnostic_context
618 else
619 {
620 /* Otherwise, make diag_obj be the top-level object within the group;
621- add a "children" array. */
622+ add a "children" array and record the column origin. */
623 toplevel_array->append (diag_obj);
624 cur_group = diag_obj;
625 cur_children_array = new json::array ();
626 diag_obj->set ("children", cur_children_array);
627+ diag_obj->set ("column-origin",
628+ new json::integer_number (context->column_origin));
629 }
630
631 const rich_location *richloc = diagnostic->richloc;
632@@ -205,7 +230,7 @@ json_end_diagnostic (diagnostic_context
633 for (unsigned int i = 0; i < richloc->get_num_locations (); i++)
634 {
635 const location_range *loc_range = richloc->get_range (i);
636- json::object *loc_obj = json_from_location_range (loc_range, i);
637+ json::object *loc_obj = json_from_location_range (context, loc_range, i);
638 if (loc_obj)
639 loc_array->append (loc_obj);
640 }
641@@ -217,7 +242,7 @@ json_end_diagnostic (diagnostic_context
642 for (unsigned int i = 0; i < richloc->get_num_fixit_hints (); i++)
643 {
644 const fixit_hint *hint = richloc->get_fixit_hint (i);
645- json::object *fixit_obj = json_from_fixit_hint (hint);
646+ json::object *fixit_obj = json_from_fixit_hint (context, hint);
647 fixit_array->append (fixit_obj);
648 }
649 }
650@@ -320,7 +345,8 @@ namespace selftest {
651 static void
652 test_unknown_location ()
653 {
654- delete json_from_expanded_location (UNKNOWN_LOCATION);
655+ test_diagnostic_context dc;
656+ delete json_from_expanded_location (&dc, UNKNOWN_LOCATION);
657 }
658
659 /* Verify that we gracefully handle attempts to serialize bad
660@@ -338,7 +364,8 @@ test_bad_endpoints ()
661 loc_range.m_range_display_kind = SHOW_RANGE_WITH_CARET;
662 loc_range.m_label = NULL;
663
664- json::object *obj = json_from_location_range (&loc_range, 0);
665+ test_diagnostic_context dc;
666+ json::object *obj = json_from_location_range (&dc, &loc_range, 0);
667 /* We should have a "caret" value, but no "start" or "finish" values. */
668 ASSERT_TRUE (obj != NULL);
669 ASSERT_TRUE (obj->get ("caret") != NULL);
670diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h
671--- a/gcc/diagnostic.h 2020-07-22 23:35:17.556386887 -0700
672+++ b/gcc/diagnostic.h 2021-12-25 01:20:53.479636627 -0800
673@@ -24,6 +24,20 @@ along with GCC; see the file COPYING3.
674 #include "pretty-print.h"
675 #include "diagnostic-core.h"
676
677+/* An enum for controlling what units to use for the column number
678+ when diagnostics are output, used by the -fdiagnostics-column-unit option.
679+ Tabs will be expanded or not according to the value of -ftabstop. The origin
680+ (default 1) is controlled by -fdiagnostics-column-origin. */
681+
682+enum diagnostics_column_unit
683+{
684+ /* The default from GCC 11 onwards: display columns. */
685+ DIAGNOSTICS_COLUMN_UNIT_DISPLAY,
686+
687+ /* The behavior in GCC 10 and earlier: simple bytes. */
688+ DIAGNOSTICS_COLUMN_UNIT_BYTE
689+};
690+
691 /* Enum for overriding the standard output format. */
692
693 enum diagnostics_output_format
694@@ -280,6 +294,15 @@ struct diagnostic_context
695 rest of the diagnostic. */
696 bool parseable_fixits_p;
697
698+ /* What units to use when outputting the column number. */
699+ enum diagnostics_column_unit column_unit;
700+
701+ /* The origin for the column number (1-based or 0-based typically). */
702+ int column_origin;
703+
704+ /* The size of the tabstop for tab expansion. */
705+ int tabstop;
706+
707 /* If non-NULL, an edit_context to which fix-it hints should be
708 applied, for generating patches. */
709 edit_context *edit_context_ptr;
710@@ -458,6 +481,8 @@ diagnostic_same_line (const diagnostic_c
711 }
712
713 extern const char *diagnostic_get_color_for_kind (diagnostic_t kind);
714+extern int diagnostic_converted_column (diagnostic_context *context,
715+ expanded_location s);
716
717 /* Pure text formatting support functions. */
718 extern char *file_name_as_prefix (diagnostic_context *, const char *);
719@@ -470,6 +495,7 @@ extern void diagnostic_output_format_ini
720 /* Compute the number of digits in the decimal representation of an integer. */
721 extern int num_digits (int);
722
723-extern json::value *json_from_expanded_location (location_t loc);
724+extern json::value *json_from_expanded_location (diagnostic_context *context,
725+ location_t loc);
726
727 #endif /* ! GCC_DIAGNOSTIC_H */
728diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c
729--- a/gcc/diagnostic-show-locus.c 2020-07-22 23:35:17.556386887 -0700
730+++ b/gcc/diagnostic-show-locus.c 2021-12-25 01:20:53.479636627 -0800
731@@ -175,9 +175,10 @@ enum column_unit {
732 class exploc_with_display_col : public expanded_location
733 {
734 public:
735- exploc_with_display_col (const expanded_location &exploc)
736+ exploc_with_display_col (const expanded_location &exploc, int tabstop)
737 : expanded_location (exploc),
738- m_display_col (location_compute_display_column (exploc)) {}
739+ m_display_col (location_compute_display_column (exploc, tabstop))
740+ {}
741
742 int m_display_col;
743 };
744@@ -189,11 +190,11 @@ class exploc_with_display_col : public e
745 class layout_point
746 {
747 public:
748- layout_point (const expanded_location &exploc)
749+ layout_point (const exploc_with_display_col &exploc)
750 : m_line (exploc.line)
751 {
752 m_columns[CU_BYTES] = exploc.column;
753- m_columns[CU_DISPLAY_COLS] = location_compute_display_column (exploc);
754+ m_columns[CU_DISPLAY_COLS] = exploc.m_display_col;
755 }
756
757 linenum_type m_line;
758@@ -205,10 +206,10 @@ class layout_point
759 class layout_range
760 {
761 public:
762- layout_range (const expanded_location *start_exploc,
763- const expanded_location *finish_exploc,
764+ layout_range (const exploc_with_display_col &start_exploc,
765+ const exploc_with_display_col &finish_exploc,
766 enum range_display_kind range_display_kind,
767- const expanded_location *caret_exploc,
768+ const exploc_with_display_col &caret_exploc,
769 unsigned original_idx,
770 const range_label *label);
771
772@@ -226,22 +227,18 @@ class layout_range
773
774 /* A struct for use by layout::print_source_line for telling
775 layout::print_annotation_line the extents of the source line that
776- it printed, so that underlines can be clipped appropriately. */
777+ it printed, so that underlines can be clipped appropriately. Units
778+ are 1-based display columns. */
779
780 struct line_bounds
781 {
782- int m_first_non_ws;
783- int m_last_non_ws;
784+ int m_first_non_ws_disp_col;
785+ int m_last_non_ws_disp_col;
786
787- void convert_to_display_cols (char_span line)
788+ line_bounds ()
789 {
790- m_first_non_ws = cpp_byte_column_to_display_column (line.get_buffer (),
791- line.length (),
792- m_first_non_ws);
793-
794- m_last_non_ws = cpp_byte_column_to_display_column (line.get_buffer (),
795- line.length (),
796- m_last_non_ws);
797+ m_first_non_ws_disp_col = INT_MAX;
798+ m_last_non_ws_disp_col = 0;
799 }
800 };
801
802@@ -351,8 +348,8 @@ class layout
803 private:
804 bool will_show_line_p (linenum_type row) const;
805 void print_leading_fixits (linenum_type row);
806- void print_source_line (linenum_type row, const char *line, int line_bytes,
807- line_bounds *lbounds_out);
808+ line_bounds print_source_line (linenum_type row, const char *line,
809+ int line_bytes);
810 bool should_print_annotation_line_p (linenum_type row) const;
811 void start_annotation_line (char margin_char = ' ') const;
812 void print_annotation_line (linenum_type row, const line_bounds lbounds);
813@@ -513,16 +510,16 @@ colorizer::get_color_by_name (const char
814 Initialize various layout_point fields from expanded_location
815 equivalents; we've already filtered on file. */
816
817-layout_range::layout_range (const expanded_location *start_exploc,
818- const expanded_location *finish_exploc,
819+layout_range::layout_range (const exploc_with_display_col &start_exploc,
820+ const exploc_with_display_col &finish_exploc,
821 enum range_display_kind range_display_kind,
822- const expanded_location *caret_exploc,
823+ const exploc_with_display_col &caret_exploc,
824 unsigned original_idx,
825 const range_label *label)
826-: m_start (*start_exploc),
827- m_finish (*finish_exploc),
828+: m_start (start_exploc),
829+ m_finish (finish_exploc),
830 m_range_display_kind (range_display_kind),
831- m_caret (*caret_exploc),
832+ m_caret (caret_exploc),
833 m_original_idx (original_idx),
834 m_label (label)
835 {
836@@ -646,6 +643,9 @@ layout_range::intersects_line_p (linenum
837
838 #if CHECKING_P
839
840+/* Default for when we don't care what the tab expansion is set to. */
841+static const int def_tabstop = 8;
842+
843 /* Create some expanded locations for testing layout_range. The filename
844 member of the explocs is set to the empty string. This member will only be
845 inspected by the calls to location_compute_display_column() made from the
846@@ -662,8 +662,11 @@ make_range (int start_line, int start_co
847 = {"", start_line, start_col, NULL, false};
848 const expanded_location finish_exploc
849 = {"", end_line, end_col, NULL, false};
850- return layout_range (&start_exploc, &finish_exploc, SHOW_RANGE_WITHOUT_CARET,
851- &start_exploc, 0, NULL);
852+ return layout_range (exploc_with_display_col (start_exploc, def_tabstop),
853+ exploc_with_display_col (finish_exploc, def_tabstop),
854+ SHOW_RANGE_WITHOUT_CARET,
855+ exploc_with_display_col (start_exploc, def_tabstop),
856+ 0, NULL);
857 }
858
859 /* Selftests for layout_range::contains_point and
860@@ -964,7 +967,7 @@ layout::layout (diagnostic_context * con
861 : m_context (context),
862 m_pp (context->printer),
863 m_primary_loc (richloc->get_range (0)->m_loc),
864- m_exploc (richloc->get_expanded_location (0)),
865+ m_exploc (richloc->get_expanded_location (0), context->tabstop),
866 m_colorizer (context, diagnostic_kind),
867 m_colorize_source_p (context->colorize_source_p),
868 m_show_labels_p (context->show_labels_p),
869@@ -1060,7 +1063,10 @@ layout::maybe_add_location_range (const
870
871 /* Everything is now known to be in the correct source file,
872 but it may require further sanitization. */
873- layout_range ri (&start, &finish, loc_range->m_range_display_kind, &caret,
874+ layout_range ri (exploc_with_display_col (start, m_context->tabstop),
875+ exploc_with_display_col (finish, m_context->tabstop),
876+ loc_range->m_range_display_kind,
877+ exploc_with_display_col (caret, m_context->tabstop),
878 original_idx, loc_range->m_label);
879
880 /* If we have a range that finishes before it starts (perhaps
881@@ -1394,7 +1400,7 @@ layout::calculate_x_offset_display ()
882 = get_line_bytes_without_trailing_whitespace (line.get_buffer (),
883 line.length ());
884 int eol_display_column
885- = cpp_display_width (line.get_buffer (), line_bytes);
886+ = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop);
887 if (caret_display_column > eol_display_column
888 || !caret_display_column)
889 {
890@@ -1445,16 +1451,13 @@ layout::calculate_x_offset_display ()
891 }
892
893 /* Print line ROW of source code, potentially colorized at any ranges, and
894- populate *LBOUNDS_OUT.
895- LINE is the source line (not necessarily 0-terminated) and LINE_BYTES
896- is its length in bytes.
897- This function deals only with byte offsets, not display columns, so
898- m_x_offset_display must be converted from display to byte units. In
899- particular, LINE_BYTES and LBOUNDS_OUT are in bytes. */
900+ return the line bounds. LINE is the source line (not necessarily
901+ 0-terminated) and LINE_BYTES is its length in bytes. In order to handle both
902+ colorization and tab expansion, this function tracks the line position in
903+ both byte and display column units. */
904
905-void
906-layout::print_source_line (linenum_type row, const char *line, int line_bytes,
907- line_bounds *lbounds_out)
908+line_bounds
909+layout::print_source_line (linenum_type row, const char *line, int line_bytes)
910 {
911 m_colorizer.set_normal_text ();
912
913@@ -1469,30 +1472,29 @@ layout::print_source_line (linenum_type
914 else
915 pp_space (m_pp);
916
917- /* We will stop printing the source line at any trailing whitespace, and start
918- printing it as per m_x_offset_display. */
919+ /* We will stop printing the source line at any trailing whitespace. */
920 line_bytes = get_line_bytes_without_trailing_whitespace (line,
921 line_bytes);
922- int x_offset_bytes = 0;
923- if (m_x_offset_display)
924- {
925- x_offset_bytes = cpp_display_column_to_byte_column (line, line_bytes,
926- m_x_offset_display);
927- /* In case the leading portion of the line that will be skipped over ends
928- with a character with wcwidth > 1, then it is possible we skipped too
929- much, so account for that by padding with spaces. */
930- const int overage
931- = cpp_byte_column_to_display_column (line, line_bytes, x_offset_bytes)
932- - m_x_offset_display;
933- for (int column = 0; column < overage; ++column)
934- pp_space (m_pp);
935- line += x_offset_bytes;
936- }
937
938- /* Print the line. */
939- int first_non_ws = INT_MAX;
940- int last_non_ws = 0;
941- for (int col_byte = 1 + x_offset_bytes; col_byte <= line_bytes; col_byte++)
942+ /* This object helps to keep track of which display column we are at, which is
943+ necessary for computing the line bounds in display units, for doing
944+ tab expansion, and for implementing m_x_offset_display. */
945+ cpp_display_width_computation dw (line, line_bytes, m_context->tabstop);
946+
947+ /* Skip the first m_x_offset_display display columns. In case the leading
948+ portion that will be skipped ends with a character with wcwidth > 1, then
949+ it is possible we skipped too much, so account for that by padding with
950+ spaces. Note that this does the right thing too in case a tab was the last
951+ character to be skipped over; the tab is effectively replaced by the
952+ correct number of trailing spaces needed to offset by the desired number of
953+ display columns. */
954+ for (int skipped_display_cols = dw.advance_display_cols (m_x_offset_display);
955+ skipped_display_cols > m_x_offset_display; --skipped_display_cols)
956+ pp_space (m_pp);
957+
958+ /* Print the line and compute the line_bounds. */
959+ line_bounds lbounds;
960+ while (!dw.done ())
961 {
962 /* Assuming colorization is enabled for the caret and underline
963 characters, we may also colorize the associated characters
964@@ -1510,7 +1512,8 @@ layout::print_source_line (linenum_type
965 {
966 bool in_range_p;
967 point_state state;
968- in_range_p = get_state_at_point (row, col_byte,
969+ const int start_byte_col = dw.bytes_processed () + 1;
970+ in_range_p = get_state_at_point (row, start_byte_col,
971 0, INT_MAX,
972 CU_BYTES,
973 &state);
974@@ -1519,22 +1522,44 @@ layout::print_source_line (linenum_type
975 else
976 m_colorizer.set_normal_text ();
977 }
978- char c = *line;
979- if (c == '\0' || c == '\t' || c == '\r')
980- c = ' ';
981- if (c != ' ')
982+
983+ /* Get the display width of the next character to be output, expanding
984+ tabs and replacing some control bytes with spaces as necessary. */
985+ const char *c = dw.next_byte ();
986+ const int start_disp_col = dw.display_cols_processed () + 1;
987+ const int this_display_width = dw.process_next_codepoint ();
988+ if (*c == '\t')
989+ {
990+ /* The returned display width is the number of spaces into which the
991+ tab should be expanded. */
992+ for (int i = 0; i != this_display_width; ++i)
993+ pp_space (m_pp);
994+ continue;
995+ }
996+ if (*c == '\0' || *c == '\r')
997 {
998- last_non_ws = col_byte;
999- if (first_non_ws == INT_MAX)
1000- first_non_ws = col_byte;
1001+ /* cpp_wcwidth() promises to return 1 for all control bytes, and we
1002+ want to output these as a single space too, so this case is
1003+ actually the same as the '\t' case. */
1004+ gcc_assert (this_display_width == 1);
1005+ pp_space (m_pp);
1006+ continue;
1007 }
1008- pp_character (m_pp, c);
1009- line++;
1010+
1011+ /* We have a (possibly multibyte) character to output; update the line
1012+ bounds if it is not whitespace. */
1013+ if (*c != ' ')
1014+ {
1015+ lbounds.m_last_non_ws_disp_col = dw.display_cols_processed ();
1016+ if (lbounds.m_first_non_ws_disp_col == INT_MAX)
1017+ lbounds.m_first_non_ws_disp_col = start_disp_col;
1018+ }
1019+
1020+ /* Output the character. */
1021+ while (c != dw.next_byte ()) pp_character (m_pp, *c++);
1022 }
1023 print_newline ();
1024-
1025- lbounds_out->m_first_non_ws = first_non_ws;
1026- lbounds_out->m_last_non_ws = last_non_ws;
1027+ return lbounds;
1028 }
1029
1030 /* Determine if we should print an annotation line for ROW.
1031@@ -1576,14 +1601,13 @@ layout::start_annotation_line (char marg
1032 }
1033
1034 /* Print a line consisting of the caret/underlines for the given
1035- source line. This function works with display columns, rather than byte
1036- counts; in particular, LBOUNDS should be in display column units. */
1037+ source line. */
1038
1039 void
1040 layout::print_annotation_line (linenum_type row, const line_bounds lbounds)
1041 {
1042 int x_bound = get_x_bound_for_row (row, m_exploc.m_display_col,
1043- lbounds.m_last_non_ws);
1044+ lbounds.m_last_non_ws_disp_col);
1045
1046 start_annotation_line ();
1047 pp_space (m_pp);
1048@@ -1593,8 +1617,8 @@ layout::print_annotation_line (linenum_t
1049 bool in_range_p;
1050 point_state state;
1051 in_range_p = get_state_at_point (row, column,
1052- lbounds.m_first_non_ws,
1053- lbounds.m_last_non_ws,
1054+ lbounds.m_first_non_ws_disp_col,
1055+ lbounds.m_last_non_ws_disp_col,
1056 CU_DISPLAY_COLS,
1057 &state);
1058 if (in_range_p)
1059@@ -1631,12 +1655,14 @@ layout::print_annotation_line (linenum_t
1060 class line_label
1061 {
1062 public:
1063- line_label (int state_idx, int column, label_text text)
1064+ line_label (diagnostic_context *context, int state_idx, int column,
1065+ label_text text)
1066 : m_state_idx (state_idx), m_column (column),
1067 m_text (text), m_label_line (0), m_has_vbar (true)
1068 {
1069 const int bytes = strlen (text.m_buffer);
1070- m_display_width = cpp_display_width (text.m_buffer, bytes);
1071+ m_display_width
1072+ = cpp_display_width (text.m_buffer, bytes, context->tabstop);
1073 }
1074
1075 /* Sorting is primarily by column, then by state index. */
1076@@ -1696,7 +1722,7 @@ layout::print_any_labels (linenum_type r
1077 if (text.m_buffer == NULL)
1078 continue;
1079
1080- labels.safe_push (line_label (i, disp_col, text));
1081+ labels.safe_push (line_label (m_context, i, disp_col, text));
1082 }
1083 }
1084
1085@@ -1976,7 +2002,8 @@ public:
1086
1087 /* Get the range of bytes or display columns that HINT would affect. */
1088 static column_range
1089-get_affected_range (const fixit_hint *hint, enum column_unit col_unit)
1090+get_affected_range (diagnostic_context *context,
1091+ const fixit_hint *hint, enum column_unit col_unit)
1092 {
1093 expanded_location exploc_start = expand_location (hint->get_start_loc ());
1094 expanded_location exploc_finish = expand_location (hint->get_next_loc ());
1095@@ -1986,11 +2013,13 @@ get_affected_range (const fixit_hint *hi
1096 int finish_column;
1097 if (col_unit == CU_DISPLAY_COLS)
1098 {
1099- start_column = location_compute_display_column (exploc_start);
1100+ start_column
1101+ = location_compute_display_column (exploc_start, context->tabstop);
1102 if (hint->insertion_p ())
1103 finish_column = start_column - 1;
1104 else
1105- finish_column = location_compute_display_column (exploc_finish);
1106+ finish_column
1107+ = location_compute_display_column (exploc_finish, context->tabstop);
1108 }
1109 else
1110 {
1111@@ -2003,12 +2032,12 @@ get_affected_range (const fixit_hint *hi
1112 /* Get the range of display columns that would be printed for HINT. */
1113
1114 static column_range
1115-get_printed_columns (const fixit_hint *hint)
1116+get_printed_columns (diagnostic_context *context, const fixit_hint *hint)
1117 {
1118 expanded_location exploc = expand_location (hint->get_start_loc ());
1119- int start_column = location_compute_display_column (exploc);
1120- int hint_width = cpp_display_width (hint->get_string (),
1121- hint->get_length ());
1122+ int start_column = location_compute_display_column (exploc, context->tabstop);
1123+ int hint_width = cpp_display_width (hint->get_string (), hint->get_length (),
1124+ context->tabstop);
1125 int final_hint_column = start_column + hint_width - 1;
1126 if (hint->insertion_p ())
1127 {
1128@@ -2018,7 +2047,8 @@ get_printed_columns (const fixit_hint *h
1129 {
1130 exploc = expand_location (hint->get_next_loc ());
1131 --exploc.column;
1132- int finish_column = location_compute_display_column (exploc);
1133+ int finish_column
1134+ = location_compute_display_column (exploc, context->tabstop);
1135 return column_range (start_column,
1136 MAX (finish_column, final_hint_column));
1137 }
1138@@ -2035,12 +2065,14 @@ public:
1139 correction (column_range affected_bytes,
1140 column_range affected_columns,
1141 column_range printed_columns,
1142- const char *new_text, size_t new_text_len)
1143+ const char *new_text, size_t new_text_len,
1144+ int tabstop)
1145 : m_affected_bytes (affected_bytes),
1146 m_affected_columns (affected_columns),
1147 m_printed_columns (printed_columns),
1148 m_text (xstrdup (new_text)),
1149 m_byte_length (new_text_len),
1150+ m_tabstop (tabstop),
1151 m_alloc_sz (new_text_len + 1)
1152 {
1153 compute_display_cols ();
1154@@ -2058,7 +2090,7 @@ public:
1155
1156 void compute_display_cols ()
1157 {
1158- m_display_cols = cpp_display_width (m_text, m_byte_length);
1159+ m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop);
1160 }
1161
1162 void overwrite (int dst_offset, const char_span &src_span)
1163@@ -2086,6 +2118,7 @@ public:
1164 char *m_text;
1165 size_t m_byte_length; /* Not including null-terminator. */
1166 int m_display_cols;
1167+ int m_tabstop;
1168 size_t m_alloc_sz;
1169 };
1170
1171@@ -2121,13 +2154,15 @@ correction::ensure_terminated ()
1172 class line_corrections
1173 {
1174 public:
1175- line_corrections (const char *filename, linenum_type row)
1176- : m_filename (filename), m_row (row)
1177+ line_corrections (diagnostic_context *context, const char *filename,
1178+ linenum_type row)
1179+ : m_context (context), m_filename (filename), m_row (row)
1180 {}
1181 ~line_corrections ();
1182
1183 void add_hint (const fixit_hint *hint);
1184
1185+ diagnostic_context *m_context;
1186 const char *m_filename;
1187 linenum_type m_row;
1188 auto_vec <correction *> m_corrections;
1189@@ -2173,9 +2208,10 @@ source_line::source_line (const char *fi
1190 void
1191 line_corrections::add_hint (const fixit_hint *hint)
1192 {
1193- column_range affected_bytes = get_affected_range (hint, CU_BYTES);
1194- column_range affected_columns = get_affected_range (hint, CU_DISPLAY_COLS);
1195- column_range printed_columns = get_printed_columns (hint);
1196+ column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES);
1197+ column_range affected_columns = get_affected_range (m_context, hint,
1198+ CU_DISPLAY_COLS);
1199+ column_range printed_columns = get_printed_columns (m_context, hint);
1200
1201 /* Potentially consolidate. */
1202 if (!m_corrections.is_empty ())
1203@@ -2243,7 +2279,8 @@ line_corrections::add_hint (const fixit_
1204 affected_columns,
1205 printed_columns,
1206 hint->get_string (),
1207- hint->get_length ()));
1208+ hint->get_length (),
1209+ m_context->tabstop));
1210 }
1211
1212 /* If there are any fixit hints on source line ROW, print them.
1213@@ -2257,7 +2294,7 @@ layout::print_trailing_fixits (linenum_t
1214 {
1215 /* Build a list of correction instances for the line,
1216 potentially consolidating hints (for the sake of readability). */
1217- line_corrections corrections (m_exploc.file, row);
1218+ line_corrections corrections (m_context, m_exploc.file, row);
1219 for (unsigned int i = 0; i < m_fixit_hints.length (); i++)
1220 {
1221 const fixit_hint *hint = m_fixit_hints[i];
1222@@ -2499,15 +2536,11 @@ layout::print_line (linenum_type row)
1223 if (!line)
1224 return;
1225
1226- line_bounds lbounds;
1227 print_leading_fixits (row);
1228- print_source_line (row, line.get_buffer (), line.length (), &lbounds);
1229+ const line_bounds lbounds
1230+ = print_source_line (row, line.get_buffer (), line.length ());
1231 if (should_print_annotation_line_p (row))
1232- {
1233- if (lbounds.m_first_non_ws != INT_MAX)
1234- lbounds.convert_to_display_cols (line);
1235- print_annotation_line (row, lbounds);
1236- }
1237+ print_annotation_line (row, lbounds);
1238 if (m_show_labels_p)
1239 print_any_labels (row);
1240 print_trailing_fixits (row);
1241@@ -2670,9 +2703,11 @@ test_layout_x_offset_display_utf8 (const
1242
1243 char_span lspan = location_get_source_line (tmp.get_filename (), 1);
1244 ASSERT_EQ (line_display_cols,
1245- cpp_display_width (lspan.get_buffer (), lspan.length ()));
1246+ cpp_display_width (lspan.get_buffer (), lspan.length (),
1247+ def_tabstop));
1248 ASSERT_EQ (line_display_cols,
1249- location_compute_display_column (expand_location (line_end)));
1250+ location_compute_display_column (expand_location (line_end),
1251+ def_tabstop));
1252 ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1),
1253 "\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8));
1254
1255@@ -2774,6 +2809,111 @@ test_layout_x_offset_display_utf8 (const
1256
1257 }
1258
1259+static void
1260+test_layout_x_offset_display_tab (const line_table_case &case_)
1261+{
1262+ const char *content
1263+ = "This line is very long, so that we can use it to test the logic for "
1264+ "clipping long lines. Also this: `\t' is a tab that occupies 1 byte and "
1265+ "a variable number of display columns, starting at column #103.\n";
1266+
1267+ /* Number of bytes in the line, subtracting one to remove the newline. */
1268+ const int line_bytes = strlen (content) - 1;
1269+
1270+ /* The column where the tab begins. Byte or display is the same as there are
1271+ no multibyte characters earlier on the line. */
1272+ const int tab_col = 103;
1273+
1274+ /* Effective extra size of the tab beyond what a single space would have taken
1275+ up, indexed by tabstop. */
1276+ static const int num_tabstops = 11;
1277+ int extra_width[num_tabstops];
1278+ for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
1279+ {
1280+ const int this_tab_size = tabstop - (tab_col - 1) % tabstop;
1281+ extra_width[tabstop] = this_tab_size - 1;
1282+ }
1283+ /* Example of this calculation: if tabstop is 10, the tab starting at column
1284+ #103 has to expand into 8 spaces, covering columns 103-110, so that the
1285+ next character is at column #111. So it takes up 7 more columns than
1286+ a space would have taken up. */
1287+ ASSERT_EQ (7, extra_width[10]);
1288+
1289+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
1290+ line_table_test ltt (case_);
1291+
1292+ linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1293+
1294+ location_t line_end = linemap_position_for_column (line_table, line_bytes);
1295+
1296+ /* Don't attempt to run the tests if column data might be unavailable. */
1297+ if (line_end > LINE_MAP_MAX_LOCATION_WITH_COLS)
1298+ return;
1299+
1300+ /* Check that cpp_display_width handles the tabs as expected. */
1301+ char_span lspan = location_get_source_line (tmp.get_filename (), 1);
1302+ ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1)));
1303+ for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
1304+ {
1305+ ASSERT_EQ (line_bytes + extra_width[tabstop],
1306+ cpp_display_width (lspan.get_buffer (), lspan.length (),
1307+ tabstop));
1308+ ASSERT_EQ (line_bytes + extra_width[tabstop],
1309+ location_compute_display_column (expand_location (line_end),
1310+ tabstop));
1311+ }
1312+
1313+ /* Check that the tab is expanded to the expected number of spaces. */
1314+ rich_location richloc (line_table,
1315+ linemap_position_for_column (line_table,
1316+ tab_col + 1));
1317+ for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
1318+ {
1319+ test_diagnostic_context dc;
1320+ dc.tabstop = tabstop;
1321+ layout test_layout (&dc, &richloc, DK_ERROR);
1322+ test_layout.print_line (1);
1323+ const char *out = pp_formatted_text (dc.printer);
1324+ ASSERT_EQ (NULL, strchr (out, '\t'));
1325+ const char *left_quote = strchr (out, '`');
1326+ const char *right_quote = strchr (out, '\'');
1327+ ASSERT_NE (NULL, left_quote);
1328+ ASSERT_NE (NULL, right_quote);
1329+ ASSERT_EQ (right_quote - left_quote, extra_width[tabstop] + 2);
1330+ }
1331+
1332+ /* Check that the line is offset properly and that the tab is broken up
1333+ into the expected number of spaces when it is the last character skipped
1334+ over. */
1335+ for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
1336+ {
1337+ test_diagnostic_context dc;
1338+ dc.tabstop = tabstop;
1339+ static const int small_width = 24;
1340+ dc.caret_max_width = small_width - 4;
1341+ dc.min_margin_width = test_left_margin - test_linenum_sep + 1;
1342+ dc.show_line_numbers_p = true;
1343+ layout test_layout (&dc, &richloc, DK_ERROR);
1344+ test_layout.print_line (1);
1345+
1346+ /* We have arranged things so that two columns will be printed before
1347+ the caret. If the tab results in more than one space, this should
1348+ produce two spaces in the output; otherwise, it will be a single space
1349+ preceded by the opening quote before the tab character. */
1350+ const char *output1
1351+ = " 1 | ' is a tab that occupies 1 byte and a variable number of "
1352+ "display columns, starting at column #103.\n"
1353+ " | ^\n\n";
1354+ const char *output2
1355+ = " 1 | ` ' is a tab that occupies 1 byte and a variable number of "
1356+ "display columns, starting at column #103.\n"
1357+ " | ^\n\n";
1358+ const char *expected_output = (extra_width[tabstop] ? output1 : output2);
1359+ ASSERT_STREQ (expected_output, pp_formatted_text (dc.printer));
1360+ }
1361+}
1362+
1363+
1364 /* Verify that diagnostic_show_locus works sanely on UNKNOWN_LOCATION. */
1365
1366 static void
1367@@ -3854,6 +3994,27 @@ test_one_liner_labels_utf8 ()
1368 }
1369 }
1370
1371+/* Make sure that colorization codes don't interrupt a multibyte
1372+ sequence, which would corrupt it. */
1373+static void
1374+test_one_liner_colorized_utf8 ()
1375+{
1376+ test_diagnostic_context dc;
1377+ dc.colorize_source_p = true;
1378+ diagnostic_color_init (&dc, DIAGNOSTICS_COLOR_YES);
1379+ const location_t pi = linemap_position_for_column (line_table, 12);
1380+ rich_location richloc (line_table, pi);
1381+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1382+
1383+ /* In order to avoid having the test depend on exactly how the colorization
1384+ was effected, just confirm there are two pi characters in the output. */
1385+ const char *result = pp_formatted_text (dc.printer);
1386+ const char *null_term = result + strlen (result);
1387+ const char *first_pi = strstr (result, "\xcf\x80");
1388+ ASSERT_TRUE (first_pi && first_pi <= null_term - 2);
1389+ ASSERT_STR_CONTAINS (first_pi + 2, "\xcf\x80");
1390+}
1391+
1392 /* Run the various one-liner tests. */
1393
1394 static void
1395@@ -3884,8 +4045,10 @@ test_diagnostic_show_locus_one_liner_utf
1396 ASSERT_EQ (31, LOCATION_COLUMN (line_end));
1397
1398 char_span lspan = location_get_source_line (tmp.get_filename (), 1);
1399- ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length ()));
1400- ASSERT_EQ (25, location_compute_display_column (expand_location (line_end)));
1401+ ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (),
1402+ def_tabstop));
1403+ ASSERT_EQ (25, location_compute_display_column (expand_location (line_end),
1404+ def_tabstop));
1405
1406 test_one_liner_simple_caret_utf8 ();
1407 test_one_liner_caret_and_range_utf8 ();
1408@@ -3900,6 +4063,7 @@ test_diagnostic_show_locus_one_liner_utf
1409 test_one_liner_many_fixits_1_utf8 ();
1410 test_one_liner_many_fixits_2_utf8 ();
1411 test_one_liner_labels_utf8 ();
1412+ test_one_liner_colorized_utf8 ();
1413 }
1414
1415 /* Verify that gcc_rich_location::add_location_if_nearby works. */
1416@@ -4272,25 +4436,28 @@ test_overlapped_fixit_printing (const li
1417 /* Unit-test the line_corrections machinery. */
1418 ASSERT_EQ (3, richloc.get_num_fixit_hints ());
1419 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1420- ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, CU_BYTES));
1421 ASSERT_EQ (column_range (12, 12),
1422- get_affected_range (hint_0, CU_DISPLAY_COLS));
1423- ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0));
1424+ get_affected_range (&dc, hint_0, CU_BYTES));
1425+ ASSERT_EQ (column_range (12, 12),
1426+ get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
1427+ ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
1428 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1429- ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, CU_BYTES));
1430 ASSERT_EQ (column_range (18, 18),
1431- get_affected_range (hint_1, CU_DISPLAY_COLS));
1432- ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1));
1433+ get_affected_range (&dc, hint_1, CU_BYTES));
1434+ ASSERT_EQ (column_range (18, 18),
1435+ get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
1436+ ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
1437 const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
1438- ASSERT_EQ (column_range (29, 28), get_affected_range (hint_2, CU_BYTES));
1439 ASSERT_EQ (column_range (29, 28),
1440- get_affected_range (hint_2, CU_DISPLAY_COLS));
1441- ASSERT_EQ (column_range (29, 29), get_printed_columns (hint_2));
1442+ get_affected_range (&dc, hint_2, CU_BYTES));
1443+ ASSERT_EQ (column_range (29, 28),
1444+ get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
1445+ ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2));
1446
1447 /* Add each hint in turn to a line_corrections instance,
1448 and verify that they are consolidated into one correction instance
1449 as expected. */
1450- line_corrections lc (tmp.get_filename (), 1);
1451+ line_corrections lc (&dc, tmp.get_filename (), 1);
1452
1453 /* The first replace hint by itself. */
1454 lc.add_hint (hint_0);
1455@@ -4484,25 +4651,28 @@ test_overlapped_fixit_printing_utf8 (con
1456 /* Unit-test the line_corrections machinery. */
1457 ASSERT_EQ (3, richloc.get_num_fixit_hints ());
1458 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1459- ASSERT_EQ (column_range (14, 14), get_affected_range (hint_0, CU_BYTES));
1460+ ASSERT_EQ (column_range (14, 14),
1461+ get_affected_range (&dc, hint_0, CU_BYTES));
1462 ASSERT_EQ (column_range (12, 12),
1463- get_affected_range (hint_0, CU_DISPLAY_COLS));
1464- ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0));
1465+ get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
1466+ ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
1467 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1468- ASSERT_EQ (column_range (22, 22), get_affected_range (hint_1, CU_BYTES));
1469+ ASSERT_EQ (column_range (22, 22),
1470+ get_affected_range (&dc, hint_1, CU_BYTES));
1471 ASSERT_EQ (column_range (18, 18),
1472- get_affected_range (hint_1, CU_DISPLAY_COLS));
1473- ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1));
1474+ get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
1475+ ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
1476 const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
1477- ASSERT_EQ (column_range (35, 34), get_affected_range (hint_2, CU_BYTES));
1478+ ASSERT_EQ (column_range (35, 34),
1479+ get_affected_range (&dc, hint_2, CU_BYTES));
1480 ASSERT_EQ (column_range (30, 29),
1481- get_affected_range (hint_2, CU_DISPLAY_COLS));
1482- ASSERT_EQ (column_range (30, 30), get_printed_columns (hint_2));
1483+ get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
1484+ ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2));
1485
1486 /* Add each hint in turn to a line_corrections instance,
1487 and verify that they are consolidated into one correction instance
1488 as expected. */
1489- line_corrections lc (tmp.get_filename (), 1);
1490+ line_corrections lc (&dc, tmp.get_filename (), 1);
1491
1492 /* The first replace hint by itself. */
1493 lc.add_hint (hint_0);
1494@@ -4689,6 +4859,8 @@ test_overlapped_fixit_printing_2 (const
1495
1496 /* Two insertions, in the wrong order. */
1497 {
1498+ test_diagnostic_context dc;
1499+
1500 rich_location richloc (line_table, col_20);
1501 richloc.add_fixit_insert_before (col_23, "{");
1502 richloc.add_fixit_insert_before (col_21, "}");
1503@@ -4696,14 +4868,15 @@ test_overlapped_fixit_printing_2 (const
1504 /* These fixits should be accepted; they can't be consolidated. */
1505 ASSERT_EQ (2, richloc.get_num_fixit_hints ());
1506 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1507- ASSERT_EQ (column_range (23, 22), get_affected_range (hint_0, CU_BYTES));
1508- ASSERT_EQ (column_range (23, 23), get_printed_columns (hint_0));
1509+ ASSERT_EQ (column_range (23, 22),
1510+ get_affected_range (&dc, hint_0, CU_BYTES));
1511+ ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0));
1512 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1513- ASSERT_EQ (column_range (21, 20), get_affected_range (hint_1, CU_BYTES));
1514- ASSERT_EQ (column_range (21, 21), get_printed_columns (hint_1));
1515+ ASSERT_EQ (column_range (21, 20),
1516+ get_affected_range (&dc, hint_1, CU_BYTES));
1517+ ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1));
1518
1519 /* Verify that they're printed correctly. */
1520- test_diagnostic_context dc;
1521 diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1522 ASSERT_STREQ (" int a5[][0][0] = { 1, 2 };\n"
1523 " ^\n"
1524@@ -4955,6 +5128,65 @@ test_fixit_deletion_affecting_newline (c
1525 pp_formatted_text (dc.printer));
1526 }
1527
1528+static void
1529+test_tab_expansion (const line_table_case &case_)
1530+{
1531+ /* Create a tempfile and write some text to it. This example uses a tabstop
1532+ of 8, as the column numbers attempt to indicate:
1533+
1534+ .....................000.01111111111.22222333333 display
1535+ .....................123.90123456789.56789012345 columns */
1536+ const char *content = " \t This: `\t' is a tab.\n";
1537+ /* ....................000 00000011111 11111222222 byte
1538+ ....................123 45678901234 56789012345 columns */
1539+
1540+ const int tabstop = 8;
1541+ const int first_non_ws_byte_col = 7;
1542+ const int right_quote_byte_col = 15;
1543+ const int last_byte_col = 25;
1544+ ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop));
1545+
1546+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
1547+ line_table_test ltt (case_);
1548+ linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1549+
1550+ /* Don't attempt to run the tests if column data might be unavailable. */
1551+ location_t line_end = linemap_position_for_column (line_table, last_byte_col);
1552+ if (line_end > LINE_MAP_MAX_LOCATION_WITH_COLS)
1553+ return;
1554+
1555+ /* Check that the leading whitespace with mixed tabs and spaces is expanded
1556+ into 11 spaces. Recall that print_line() also puts one space before
1557+ everything. */
1558+ {
1559+ test_diagnostic_context dc;
1560+ dc.tabstop = tabstop;
1561+ rich_location richloc (line_table,
1562+ linemap_position_for_column (line_table,
1563+ first_non_ws_byte_col));
1564+ layout test_layout (&dc, &richloc, DK_ERROR);
1565+ test_layout.print_line (1);
1566+ ASSERT_STREQ (" This: ` ' is a tab.\n"
1567+ " ^\n",
1568+ pp_formatted_text (dc.printer));
1569+ }
1570+
1571+ /* Confirm the display width was tracked correctly across the internal tab
1572+ as well. */
1573+ {
1574+ test_diagnostic_context dc;
1575+ dc.tabstop = tabstop;
1576+ rich_location richloc (line_table,
1577+ linemap_position_for_column (line_table,
1578+ right_quote_byte_col));
1579+ layout test_layout (&dc, &richloc, DK_ERROR);
1580+ test_layout.print_line (1);
1581+ ASSERT_STREQ (" This: ` ' is a tab.\n"
1582+ " ^\n",
1583+ pp_formatted_text (dc.printer));
1584+ }
1585+}
1586+
1587 /* Verify that line numbers are correctly printed for the case of
1588 a multiline range in which the width of the line numbers changes
1589 (e.g. from "9" to "10"). */
1590@@ -5012,6 +5244,7 @@ diagnostic_show_locus_c_tests ()
1591 test_layout_range_for_multiple_lines ();
1592
1593 for_each_line_table_case (test_layout_x_offset_display_utf8);
1594+ for_each_line_table_case (test_layout_x_offset_display_tab);
1595
1596 test_get_line_bytes_without_trailing_whitespace ();
1597
1598@@ -5029,6 +5262,7 @@ diagnostic_show_locus_c_tests ()
1599 for_each_line_table_case (test_fixit_insert_containing_newline_2);
1600 for_each_line_table_case (test_fixit_replace_containing_newline);
1601 for_each_line_table_case (test_fixit_deletion_affecting_newline);
1602+ for_each_line_table_case (test_tab_expansion);
1603
1604 test_line_numbers_multiline_range ();
1605 }
1606diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
1607--- a/gcc/doc/invoke.texi 2021-12-24 20:23:46.876739587 -0800
1608+++ b/gcc/doc/invoke.texi 2021-12-25 01:20:53.487636494 -0800
1609@@ -293,7 +293,9 @@ Objective-C and Objective-C++ Dialects}.
1610 -fdiagnostics-show-template-tree -fno-elide-type @gol
1611 -fdiagnostics-path-format=@r{[}none@r{|}separate-events@r{|}inline-events@r{]} @gol
1612 -fdiagnostics-show-path-depths @gol
1613--fno-show-column}
1614+-fno-show-column @gol
1615+-fdiagnostics-column-unit=@r{[}display@r{|}byte@r{]} @gol
1616+-fdiagnostics-column-origin=@var{origin}}
1617
1618 @item Warning Options
1619 @xref{Warning Options,,Options to Request or Suppress Warnings}.
1620@@ -4424,6 +4426,31 @@ Do not print column numbers in diagnosti
1621 diagnostics are being scanned by a program that does not understand the
1622 column numbers, such as @command{dejagnu}.
1623
1624+@item -fdiagnostics-column-unit=@var{UNIT}
1625+@opindex fdiagnostics-column-unit
1626+Select the units for the column number. This affects traditional diagnostics
1627+(in the absence of @option{-fno-show-column}), as well as JSON format
1628+diagnostics if requested.
1629+
1630+The default @var{UNIT}, @samp{display}, considers the number of display
1631+columns occupied by each character. This may be larger than the number
1632+of bytes required to encode the character, in the case of tab
1633+characters, or it may be smaller, in the case of multibyte characters.
1634+For example, the character ``GREEK SMALL LETTER PI (U+03C0)'' occupies one
1635+display column, and its UTF-8 encoding requires two bytes; the character
1636+``SLIGHTLY SMILING FACE (U+1F642)'' occupies two display columns, and
1637+its UTF-8 encoding requires four bytes.
1638+
1639+Setting @var{UNIT} to @samp{byte} changes the column number to the raw byte
1640+count in all cases, as was traditionally output by GCC prior to version 11.1.0.
1641+
1642+@item -fdiagnostics-column-origin=@var{ORIGIN}
1643+@opindex fdiagnostics-column-origin
1644+Select the origin for column numbers, i.e. the column number assigned to the
1645+first column. The default value of 1 corresponds to traditional GCC
1646+behavior and to the GNU style guide. Some utilities may perform better with an
1647+origin of 0; any non-negative value may be specified.
1648+
1649 @item -fdiagnostics-format=@var{FORMAT}
1650 @opindex fdiagnostics-format
1651 Select a different format for printing diagnostics.
1652@@ -4459,11 +4486,15 @@ might be printed in JSON form (after for
1653 "locations": [
1654 @{
1655 "caret": @{
1656+ "display-column": 3,
1657+ "byte-column": 3,
1658 "column": 3,
1659 "file": "misleading-indentation.c",
1660 "line": 15
1661 @},
1662 "finish": @{
1663+ "display-column": 4,
1664+ "byte-column": 4,
1665 "column": 4,
1666 "file": "misleading-indentation.c",
1667 "line": 15
1668@@ -4479,6 +4510,8 @@ might be printed in JSON form (after for
1669 "locations": [
1670 @{
1671 "caret": @{
1672+ "display-column": 5,
1673+ "byte-column": 5,
1674 "column": 5,
1675 "file": "misleading-indentation.c",
1676 "line": 17
1677@@ -4488,6 +4521,7 @@ might be printed in JSON form (after for
1678 "message": "...this statement, but the latter is @dots{}"
1679 @}
1680 ]
1681+ "column-origin": 1,
1682 @},
1683 @dots{}
1684 ]
1685@@ -4500,10 +4534,34 @@ A diagnostic has a @code{kind}. If this
1686 an @code{option} key describing the command-line option controlling the
1687 warning.
1688
1689-A diagnostic can contain zero or more locations. Each location has up
1690-to three positions within it: a @code{caret} position and optional
1691-@code{start} and @code{finish} positions. A location can also have
1692-an optional @code{label} string. For example, this error:
1693+A diagnostic can contain zero or more locations. Each location has an
1694+optional @code{label} string and up to three positions within it: a
1695+@code{caret} position and optional @code{start} and @code{finish} positions.
1696+A position is described by a @code{file} name, a @code{line} number, and
1697+three numbers indicating a column position:
1698+@itemize @bullet
1699+
1700+@item
1701+@code{display-column} counts display columns, accounting for tabs and
1702+multibyte characters.
1703+
1704+@item
1705+@code{byte-column} counts raw bytes.
1706+
1707+@item
1708+@code{column} is equal to one of
1709+the previous two, as dictated by the @option{-fdiagnostics-column-unit}
1710+option.
1711+
1712+@end itemize
1713+All three columns are relative to the origin specified by
1714+@option{-fdiagnostics-column-origin}, which is typically equal to 1 but may
1715+be set, for instance, to 0 for compatibility with other utilities that
1716+number columns from 0. The column origin is recorded in the JSON output in
1717+the @code{column-origin} tag. In the remaining examples below, the extra
1718+column number outputs have been omitted for brevity.
1719+
1720+For example, this error:
1721
1722 @smallexample
1723 bad-binary-ops.c:64:23: error: invalid operands to binary + (have 'S' @{aka
1724diff --git a/gcc/input.c b/gcc/input.c
1725--- a/gcc/input.c 2020-07-22 23:35:17.664388078 -0700
1726+++ b/gcc/input.c 2021-12-25 01:20:53.487636494 -0800
1727@@ -913,7 +913,7 @@ make_location (location_t caret, source_
1728 source line in order to calculate the display width. If that cannot be done
1729 for any reason, then returns the byte column as a fallback. */
1730 int
1731-location_compute_display_column (expanded_location exploc)
1732+location_compute_display_column (expanded_location exploc, int tabstop)
1733 {
1734 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1735 return exploc.column;
1736@@ -921,7 +921,7 @@ location_compute_display_column (expande
1737 /* If line is NULL, this function returns exploc.column which is the
1738 desired fallback. */
1739 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1740- exploc.column);
1741+ exploc.column, tabstop);
1742 }
1743
1744 /* Dump statistics to stderr about the memory usage of the line_table
1745@@ -3608,33 +3608,46 @@ test_line_offset_overflow ()
1746
1747 void test_cpp_utf8 ()
1748 {
1749+ const int def_tabstop = 8;
1750 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
1751 {
1752- int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
1753+ int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
1754 ASSERT_EQ (8, w_bad);
1755- int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6);
1756- ASSERT_EQ (6, w_ctrl);
1757+ int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
1758+ ASSERT_EQ (5, w_ctrl);
1759 }
1760
1761 /* Verify that wcwidth of valid UTF-8 is as expected. */
1762 {
1763- const int w_pi = cpp_display_width ("\xcf\x80", 2);
1764+ const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
1765 ASSERT_EQ (1, w_pi);
1766- const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4);
1767+ const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
1768 ASSERT_EQ (2, w_emoji);
1769- const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2);
1770+ const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
1771+ def_tabstop);
1772 ASSERT_EQ (1, w_umlaut_precomposed);
1773- const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3);
1774+ const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
1775+ def_tabstop);
1776 ASSERT_EQ (1, w_umlaut_combining);
1777- const int w_han = cpp_display_width ("\xe4\xb8\xba", 3);
1778+ const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
1779 ASSERT_EQ (2, w_han);
1780- const int w_ascii = cpp_display_width ("GCC", 3);
1781+ const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
1782 ASSERT_EQ (3, w_ascii);
1783 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
1784- "\x9f! \xe4\xb8\xba y\xcc\x88", 24);
1785+ "\x9f! \xe4\xb8\xba y\xcc\x88",
1786+ 24, def_tabstop);
1787 ASSERT_EQ (18, w_mixed);
1788 }
1789
1790+ /* Verify that display width properly expands tabs. */
1791+ {
1792+ const char *tstr = "\tabc\td";
1793+ ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
1794+ ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
1795+ ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
1796+ ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
1797+ }
1798+
1799 /* Verify that cpp_byte_column_to_display_column can go past the end,
1800 and similar edge cases. */
1801 {
1802@@ -3645,10 +3658,13 @@ void test_cpp_utf8 ()
1803 /* 111122223456
1804 Byte columns. */
1805
1806- ASSERT_EQ (5, cpp_display_width (str, 6));
1807- ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106));
1808- ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
1809- ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
1810+ ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
1811+ ASSERT_EQ (105,
1812+ cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
1813+ ASSERT_EQ (10000,
1814+ cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
1815+ ASSERT_EQ (0,
1816+ cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
1817 }
1818
1819 /* Verify that cpp_display_column_to_byte_column can go past the end,
1820@@ -3662,21 +3678,25 @@ void test_cpp_utf8 ()
1821 /* 000000000000000000000000000000000111111
1822 111122223333444456666777788889999012345
1823 Byte columns. */
1824- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2));
1825- ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11));
1826- ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111));
1827- ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
1828- ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));
1829+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
1830+ ASSERT_EQ (15,
1831+ cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
1832+ ASSERT_EQ (115,
1833+ cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
1834+ ASSERT_EQ (10000,
1835+ cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
1836+ ASSERT_EQ (0,
1837+ cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
1838
1839 /* Verify that we do not interrupt a UTF-8 sequence. */
1840- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1));
1841+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
1842
1843 for (int byte_col = 1; byte_col <= 15; ++byte_col)
1844 {
1845- const int disp_col = cpp_byte_column_to_display_column (str, 15,
1846- byte_col);
1847- const int byte_col2 = cpp_display_column_to_byte_column (str, 15,
1848- disp_col);
1849+ const int disp_col
1850+ = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
1851+ const int byte_col2
1852+ = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
1853
1854 /* If we ask for the display column in the middle of a UTF-8
1855 sequence, it will return the length of the partial sequence,
1856diff --git a/gcc/input.h b/gcc/input.h
1857--- a/gcc/input.h 2020-07-22 23:35:17.664388078 -0700
1858+++ b/gcc/input.h 2021-12-25 01:20:53.487636494 -0800
1859@@ -38,7 +38,9 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER
1860
1861 extern bool is_location_from_builtin_token (location_t);
1862 extern expanded_location expand_location (location_t);
1863-extern int location_compute_display_column (expanded_location);
1864+
1865+extern int location_compute_display_column (expanded_location exploc,
1866+ int tabstop);
1867
1868 /* A class capturing the bounds of a buffer, to allow for run-time
1869 bounds-checking in a checked build. */
1870diff --git a/gcc/opts.c b/gcc/opts.c
1871--- a/gcc/opts.c 2020-07-22 23:35:17.708388562 -0700
1872+++ b/gcc/opts.c 2021-12-25 01:20:53.487636494 -0800
1873@@ -2439,6 +2439,14 @@ common_handle_option (struct gcc_options
1874 dc->parseable_fixits_p = value;
1875 break;
1876
1877+ case OPT_fdiagnostics_column_unit_:
1878+ dc->column_unit = (enum diagnostics_column_unit)value;
1879+ break;
1880+
1881+ case OPT_fdiagnostics_column_origin_:
1882+ dc->column_origin = value;
1883+ break;
1884+
1885 case OPT_fdiagnostics_show_cwe:
1886 dc->show_cwe = value;
1887 break;
1888@@ -2825,6 +2833,12 @@ common_handle_option (struct gcc_options
1889 check_alignment_argument (loc, arg, "functions");
1890 break;
1891
1892+ case OPT_ftabstop_:
1893+ /* It is documented that we silently ignore silly values. */
1894+ if (value >= 1 && value <= 100)
1895+ dc->tabstop = value;
1896+ break;
1897+
1898 default:
1899 /* If the flag was handled in a standard way, assume the lack of
1900 processing here is intentional. */
1901diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c
1902--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2020-07-22 23:35:17.908390765 -0700
1903+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-25 01:20:53.487636494 -0800
1904@@ -8,17 +8,22 @@
1905 We can't rely on any ordering of the keys. */
1906
1907 /* { dg-regexp "\"kind\": \"error\"" } */
1908+/* { dg-regexp "\"column-origin\": 1" } */
1909 /* { dg-regexp "\"message\": \"#error message\"" } */
1910
1911 /* { dg-regexp "\"caret\": \{" } */
1912 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-1.c\"" } */
1913 /* { dg-regexp "\"line\": 4" } */
1914 /* { dg-regexp "\"column\": 2" } */
1915+/* { dg-regexp "\"display-column\": 2" } */
1916+/* { dg-regexp "\"byte-column\": 2" } */
1917
1918 /* { dg-regexp "\"finish\": \{" } */
1919 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-1.c\"" } */
1920 /* { dg-regexp "\"line\": 4" } */
1921 /* { dg-regexp "\"column\": 6" } */
1922+/* { dg-regexp "\"display-column\": 6" } */
1923+/* { dg-regexp "\"byte-column\": 6" } */
1924
1925 /* { dg-regexp "\"locations\": \[\[\{\}, \]*\]" } */
1926 /* { dg-regexp "\"children\": \[\[\]\[\]\]" } */
1927diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c
1928--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2020-07-22 23:35:17.908390765 -0700
1929+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-25 01:20:53.487636494 -0800
1930@@ -8,6 +8,7 @@
1931 We can't rely on any ordering of the keys. */
1932
1933 /* { dg-regexp "\"kind\": \"warning\"" } */
1934+/* { dg-regexp "\"column-origin\": 1" } */
1935 /* { dg-regexp "\"message\": \"#warning message\"" } */
1936 /* { dg-regexp "\"option\": \"-Wcpp\"" } */
1937 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
1938@@ -16,11 +17,15 @@
1939 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-2.c\"" } */
1940 /* { dg-regexp "\"line\": 4" } */
1941 /* { dg-regexp "\"column\": 2" } */
1942+/* { dg-regexp "\"display-column\": 2" } */
1943+/* { dg-regexp "\"byte-column\": 2" } */
1944
1945 /* { dg-regexp "\"finish\": \{" } */
1946 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-2.c\"" } */
1947 /* { dg-regexp "\"line\": 4" } */
1948 /* { dg-regexp "\"column\": 8" } */
1949+/* { dg-regexp "\"display-column\": 8" } */
1950+/* { dg-regexp "\"byte-column\": 8" } */
1951
1952 /* { dg-regexp "\"locations\": \[\[\{\}, \]*\]" } */
1953 /* { dg-regexp "\"children\": \[\[\]\[\]\]" } */
1954diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c
1955--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2020-07-22 23:35:17.908390765 -0700
1956+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-25 01:20:53.487636494 -0800
1957@@ -8,6 +8,7 @@
1958 We can't rely on any ordering of the keys. */
1959
1960 /* { dg-regexp "\"kind\": \"error\"" } */
1961+/* { dg-regexp "\"column-origin\": 1" } */
1962 /* { dg-regexp "\"message\": \"#warning message\"" } */
1963 /* { dg-regexp "\"option\": \"-Werror=cpp\"" } */
1964 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
1965@@ -16,11 +17,15 @@
1966 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-3.c\"" } */
1967 /* { dg-regexp "\"line\": 4" } */
1968 /* { dg-regexp "\"column\": 2" } */
1969+/* { dg-regexp "\"display-column\": 2" } */
1970+/* { dg-regexp "\"byte-column\": 2" } */
1971
1972 /* { dg-regexp "\"finish\": \{" } */
1973 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-3.c\"" } */
1974 /* { dg-regexp "\"line\": 4" } */
1975 /* { dg-regexp "\"column\": 8" } */
1976+/* { dg-regexp "\"display-column\": 8" } */
1977+/* { dg-regexp "\"byte-column\": 8" } */
1978
1979 /* { dg-regexp "\"locations\": \[\[\{\}, \]*\]" } */
1980 /* { dg-regexp "\"children\": \[\[\]\[\]\]" } */
1981diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c
1982--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2020-07-22 23:35:17.908390765 -0700
1983+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-25 01:20:53.487636494 -0800
1984@@ -24,15 +24,20 @@ int test (void)
1985 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
1986 /* { dg-regexp "\"line\": 8" } */
1987 /* { dg-regexp "\"column\": 5" } */
1988+/* { dg-regexp "\"display-column\": 5" } */
1989+/* { dg-regexp "\"byte-column\": 5" } */
1990
1991 /* { dg-regexp "\"finish\": \{" } */
1992 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
1993 /* { dg-regexp "\"line\": 8" } */
1994 /* { dg-regexp "\"column\": 10" } */
1995+/* { dg-regexp "\"display-column\": 10" } */
1996+/* { dg-regexp "\"byte-column\": 10" } */
1997
1998 /* The outer diagnostic. */
1999
2000 /* { dg-regexp "\"kind\": \"warning\"" } */
2001+/* { dg-regexp "\"column-origin\": 1" } */
2002 /* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */
2003 /* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */
2004 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */
2005@@ -41,11 +46,15 @@ int test (void)
2006 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
2007 /* { dg-regexp "\"line\": 6" } */
2008 /* { dg-regexp "\"column\": 3" } */
2009+/* { dg-regexp "\"display-column\": 3" } */
2010+/* { dg-regexp "\"byte-column\": 3" } */
2011
2012 /* { dg-regexp "\"finish\": \{" } */
2013 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
2014 /* { dg-regexp "\"line\": 6" } */
2015 /* { dg-regexp "\"column\": 4" } */
2016+/* { dg-regexp "\"display-column\": 4" } */
2017+/* { dg-regexp "\"byte-column\": 4" } */
2018
2019 /* More from the nested diagnostic (we can't guarantee what order the
2020 "file" keys are consumed). */
2021diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c
2022--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2020-07-22 23:35:17.908390765 -0700
2023+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-25 01:20:53.487636494 -0800
2024@@ -13,6 +13,7 @@ int test (struct s *ptr)
2025 We can't rely on any ordering of the keys. */
2026
2027 /* { dg-regexp "\"kind\": \"error\"" } */
2028+/* { dg-regexp "\"column-origin\": 1" } */
2029 /* { dg-regexp "\"message\": \".*\"" } */
2030
2031 /* Verify fix-it hints. */
2032@@ -23,11 +24,15 @@ int test (struct s *ptr)
2033 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-5.c\"" } */
2034 /* { dg-regexp "\"line\": 8" } */
2035 /* { dg-regexp "\"column\": 15" } */
2036+/* { dg-regexp "\"display-column\": 15" } */
2037+/* { dg-regexp "\"byte-column\": 15" } */
2038
2039 /* { dg-regexp "\"next\": \{" } */
2040 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-5.c\"" } */
2041 /* { dg-regexp "\"line\": 8" } */
2042 /* { dg-regexp "\"column\": 21" } */
2043+/* { dg-regexp "\"display-column\": 21" } */
2044+/* { dg-regexp "\"byte-column\": 21" } */
2045
2046 /* { dg-regexp "\"fixits\": \[\[\{\}, \]*\]" } */
2047
2048@@ -35,11 +40,15 @@ int test (struct s *ptr)
2049 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-5.c\"" } */
2050 /* { dg-regexp "\"line\": 8" } */
2051 /* { dg-regexp "\"column\": 15" } */
2052+/* { dg-regexp "\"display-column\": 15" } */
2053+/* { dg-regexp "\"byte-column\": 15" } */
2054
2055 /* { dg-regexp "\"finish\": \{" } */
2056 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-5.c\"" } */
2057 /* { dg-regexp "\"line\": 8" } */
2058 /* { dg-regexp "\"column\": 20" } */
2059+/* { dg-regexp "\"display-column\": 20" } */
2060+/* { dg-regexp "\"byte-column\": 20" } */
2061
2062 /* { dg-regexp "\"locations\": \[\[\{\}, \]*\]" } */
2063 /* { dg-regexp "\"children\": \[\[\]\[\]\]" } */
2064diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-1.c b/gcc/testsuite/c-c++-common/diagnostic-units-1.c
2065--- a/gcc/testsuite/c-c++-common/diagnostic-units-1.c 1969-12-31 16:00:00.000000000 -0800
2066+++ b/gcc/testsuite/c-c++-common/diagnostic-units-1.c 2021-12-25 01:20:53.487636494 -0800
2067@@ -0,0 +1,28 @@
2068+/* { dg-do compile } */
2069+/* { dg-additional-options "-fdiagnostics-column-unit=byte -fshow-column -fdiagnostics-show-caret -Wmultichar" } */
2070+
2071+/* column units: bytes (via arg)
2072+ column origin: 1 (via default)
2073+ tabstop: 8 (via default) */
2074+
2075+/* This line starts with a tab. */
2076+ int c1 = 'c1'; /* { dg-warning "11: multi-character character constant" } */
2077+/* { dg-begin-multiline-output "" }
2078+ int c1 = 'c1';
2079+ ^~~~
2080+ { dg-end-multiline-output "" } */
2081+
2082+/* This line starts with <tabstop> spaces. */
2083+ int c2 = 'c2'; /* { dg-warning "18: multi-character character constant" } */
2084+/* { dg-begin-multiline-output "" }
2085+ int c2 = 'c2';
2086+ ^~~~
2087+ { dg-end-multiline-output "" } */
2088+
2089+/* This line starts with <tabstop> spaces and has an internal tab after
2090+ a space. */
2091+ int c3 = 'c3'; /* { dg-warning "19: multi-character character constant" } */
2092+/* { dg-begin-multiline-output "" }
2093+ int c3 = 'c3';
2094+ ^~~~
2095+ { dg-end-multiline-output "" } */
2096diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-2.c b/gcc/testsuite/c-c++-common/diagnostic-units-2.c
2097--- a/gcc/testsuite/c-c++-common/diagnostic-units-2.c 1969-12-31 16:00:00.000000000 -0800
2098+++ b/gcc/testsuite/c-c++-common/diagnostic-units-2.c 2021-12-25 01:20:53.487636494 -0800
2099@@ -0,0 +1,28 @@
2100+/* { dg-do compile } */
2101+/* { dg-additional-options "-fdiagnostics-column-unit=display -fshow-column -fdiagnostics-show-caret -Wmultichar" } */
2102+
2103+/* column units: display (via arg)
2104+ column origin: 1 (via default)
2105+ tabstop: 8 (via default) */
2106+
2107+/* This line starts with a tab. */
2108+ int c1 = 'c1'; /* { dg-warning "18: multi-character character constant" } */
2109+/* { dg-begin-multiline-output "" }
2110+ int c1 = 'c1';
2111+ ^~~~
2112+ { dg-end-multiline-output "" } */
2113+
2114+/* This line starts with <tabstop> spaces. */
2115+ int c2 = 'c2'; /* { dg-warning "18: multi-character character constant" } */
2116+/* { dg-begin-multiline-output "" }
2117+ int c2 = 'c2';
2118+ ^~~~
2119+ { dg-end-multiline-output "" } */
2120+
2121+/* This line starts with <tabstop> spaces and has an internal tab after
2122+ a space. */
2123+ int c3 = 'c3'; /* { dg-warning "25: multi-character character constant" } */
2124+/* { dg-begin-multiline-output "" }
2125+ int c3 = 'c3';
2126+ ^~~~
2127+ { dg-end-multiline-output "" } */
2128diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-3.c b/gcc/testsuite/c-c++-common/diagnostic-units-3.c
2129--- a/gcc/testsuite/c-c++-common/diagnostic-units-3.c 1969-12-31 16:00:00.000000000 -0800
2130+++ b/gcc/testsuite/c-c++-common/diagnostic-units-3.c 2021-12-25 01:20:53.487636494 -0800
2131@@ -0,0 +1,28 @@
2132+/* { dg-do compile } */
2133+/* { dg-additional-options "-fdiagnostics-column-unit=byte -fshow-column -fdiagnostics-show-caret -ftabstop=200 -Wmultichar" } */
2134+
2135+/* column units: bytes (via arg)
2136+ column origin: 1 (via default)
2137+ tabstop: 8 (via fallback from overly large argument) */
2138+
2139+/* This line starts with a tab. */
2140+ int c1 = 'c1'; /* { dg-warning "11: multi-character character constant" } */
2141+/* { dg-begin-multiline-output "" }
2142+ int c1 = 'c1';
2143+ ^~~~
2144+ { dg-end-multiline-output "" } */
2145+
2146+/* This line starts with <tabstop> spaces. */
2147+ int c2 = 'c2'; /* { dg-warning "18: multi-character character constant" } */
2148+/* { dg-begin-multiline-output "" }
2149+ int c2 = 'c2';
2150+ ^~~~
2151+ { dg-end-multiline-output "" } */
2152+
2153+/* This line starts with <tabstop> spaces and has an internal tab after
2154+ a space. */
2155+ int c3 = 'c3'; /* { dg-warning "19: multi-character character constant" } */
2156+/* { dg-begin-multiline-output "" }
2157+ int c3 = 'c3';
2158+ ^~~~
2159+ { dg-end-multiline-output "" } */
2160diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-4.c b/gcc/testsuite/c-c++-common/diagnostic-units-4.c
2161--- a/gcc/testsuite/c-c++-common/diagnostic-units-4.c 1969-12-31 16:00:00.000000000 -0800
2162+++ b/gcc/testsuite/c-c++-common/diagnostic-units-4.c 2021-12-25 01:20:53.487636494 -0800
2163@@ -0,0 +1,28 @@
2164+/* { dg-do compile } */
2165+/* { dg-additional-options "-fdiagnostics-column-unit=byte -fshow-column -fdiagnostics-show-caret -fdiagnostics-column-origin=0 -Wmultichar" } */
2166+
2167+/* column units: bytes (via arg)
2168+ column origin: 0 (via arg)
2169+ tabstop: 8 (via default) */
2170+
2171+/* This line starts with a tab. */
2172+ int c1 = 'c1'; /* { dg-warning "10: multi-character character constant" } */
2173+/* { dg-begin-multiline-output "" }
2174+ int c1 = 'c1';
2175+ ^~~~
2176+ { dg-end-multiline-output "" } */
2177+
2178+/* This line starts with <tabstop> spaces. */
2179+ int c2 = 'c2'; /* { dg-warning "17: multi-character character constant" } */
2180+/* { dg-begin-multiline-output "" }
2181+ int c2 = 'c2';
2182+ ^~~~
2183+ { dg-end-multiline-output "" } */
2184+
2185+/* This line starts with <tabstop> spaces and has an internal tab after
2186+ a space. */
2187+ int c3 = 'c3'; /* { dg-warning "18: multi-character character constant" } */
2188+/* { dg-begin-multiline-output "" }
2189+ int c3 = 'c3';
2190+ ^~~~
2191+ { dg-end-multiline-output "" } */
2192diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-5.c b/gcc/testsuite/c-c++-common/diagnostic-units-5.c
2193--- a/gcc/testsuite/c-c++-common/diagnostic-units-5.c 1969-12-31 16:00:00.000000000 -0800
2194+++ b/gcc/testsuite/c-c++-common/diagnostic-units-5.c 2021-12-25 01:20:53.491636427 -0800
2195@@ -0,0 +1,28 @@
2196+/* { dg-do compile } */
2197+/* { dg-additional-options "-fdiagnostics-column-unit=display -fshow-column -fdiagnostics-show-caret -fdiagnostics-column-origin=0 -Wmultichar" } */
2198+
2199+/* column units: display (via arg)
2200+ column origin: 0 (via arg)
2201+ tabstop: 8 (via default) */
2202+
2203+/* This line starts with a tab. */
2204+ int c1 = 'c1'; /* { dg-warning "17: multi-character character constant" } */
2205+/* { dg-begin-multiline-output "" }
2206+ int c1 = 'c1';
2207+ ^~~~
2208+ { dg-end-multiline-output "" } */
2209+
2210+/* This line starts with <tabstop> spaces. */
2211+ int c2 = 'c2'; /* { dg-warning "17: multi-character character constant" } */
2212+/* { dg-begin-multiline-output "" }
2213+ int c2 = 'c2';
2214+ ^~~~
2215+ { dg-end-multiline-output "" } */
2216+
2217+/* This line starts with <tabstop> spaces and has an internal tab after
2218+ a space. */
2219+ int c3 = 'c3'; /* { dg-warning "24: multi-character character constant" } */
2220+/* { dg-begin-multiline-output "" }
2221+ int c3 = 'c3';
2222+ ^~~~
2223+ { dg-end-multiline-output "" } */
2224diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-6.c b/gcc/testsuite/c-c++-common/diagnostic-units-6.c
2225--- a/gcc/testsuite/c-c++-common/diagnostic-units-6.c 1969-12-31 16:00:00.000000000 -0800
2226+++ b/gcc/testsuite/c-c++-common/diagnostic-units-6.c 2021-12-25 01:20:53.491636427 -0800
2227@@ -0,0 +1,28 @@
2228+/* { dg-do compile } */
2229+/* { dg-additional-options "-fdiagnostics-column-unit=byte -fshow-column -fdiagnostics-show-caret -fdiagnostics-column-origin=100 -Wmultichar" } */
2230+
2231+/* column units: bytes (via arg)
2232+ column origin: 100 (via arg)
2233+ tabstop: 8 (via default) */
2234+
2235+/* This line starts with a tab. */
2236+ int c1 = 'c1'; /* { dg-warning "110: multi-character character constant" } */
2237+/* { dg-begin-multiline-output "" }
2238+ int c1 = 'c1';
2239+ ^~~~
2240+ { dg-end-multiline-output "" } */
2241+
2242+/* This line starts with <tabstop> spaces. */
2243+ int c2 = 'c2'; /* { dg-warning "117: multi-character character constant" } */
2244+/* { dg-begin-multiline-output "" }
2245+ int c2 = 'c2';
2246+ ^~~~
2247+ { dg-end-multiline-output "" } */
2248+
2249+/* This line starts with <tabstop> spaces and has an internal tab after
2250+ a space. */
2251+ int c3 = 'c3'; /* { dg-warning "118: multi-character character constant" } */
2252+/* { dg-begin-multiline-output "" }
2253+ int c3 = 'c3';
2254+ ^~~~
2255+ { dg-end-multiline-output "" } */
2256diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-7.c b/gcc/testsuite/c-c++-common/diagnostic-units-7.c
2257--- a/gcc/testsuite/c-c++-common/diagnostic-units-7.c 1969-12-31 16:00:00.000000000 -0800
2258+++ b/gcc/testsuite/c-c++-common/diagnostic-units-7.c 2021-12-25 01:20:53.491636427 -0800
2259@@ -0,0 +1,28 @@
2260+/* { dg-do compile } */
2261+/* { dg-additional-options "-fdiagnostics-column-unit=byte -fshow-column -fdiagnostics-show-caret -ftabstop=9 -Wmultichar" } */
2262+
2263+/* column units: bytes (via arg)
2264+ column origin: 1 (via default)
2265+ tabstop: 9 (via arg) */
2266+
2267+/* This line starts with a tab. */
2268+ int c1 = 'c1'; /* { dg-warning "11: multi-character character constant" } */
2269+/* { dg-begin-multiline-output "" }
2270+ int c1 = 'c1';
2271+ ^~~~
2272+ { dg-end-multiline-output "" } */
2273+
2274+/* This line starts with <tabstop> spaces. */
2275+ int c2 = 'c2'; /* { dg-warning "19: multi-character character constant" } */
2276+/* { dg-begin-multiline-output "" }
2277+ int c2 = 'c2';
2278+ ^~~~
2279+ { dg-end-multiline-output "" } */
2280+
2281+/* This line starts with <tabstop> spaces and has an internal tab after
2282+ a space. */
2283+ int c3 = 'c3'; /* { dg-warning "20: multi-character character constant" } */
2284+/* { dg-begin-multiline-output "" }
2285+ int c3 = 'c3';
2286+ ^~~~
2287+ { dg-end-multiline-output "" } */
2288diff --git a/gcc/testsuite/c-c++-common/diagnostic-units-8.c b/gcc/testsuite/c-c++-common/diagnostic-units-8.c
2289--- a/gcc/testsuite/c-c++-common/diagnostic-units-8.c 1969-12-31 16:00:00.000000000 -0800
2290+++ b/gcc/testsuite/c-c++-common/diagnostic-units-8.c 2021-12-25 01:20:53.491636427 -0800
2291@@ -0,0 +1,28 @@
2292+/* { dg-do compile } */
2293+/* { dg-additional-options "-fshow-column -fdiagnostics-show-caret -ftabstop=9 -Wmultichar" } */
2294+
2295+/* column units: display (via default)
2296+ column origin: 1 (via default)
2297+ tabstop: 9 (via arg) */
2298+
2299+/* This line starts with a tab. */
2300+ int c1 = 'c1'; /* { dg-warning "19: multi-character character constant" } */
2301+/* { dg-begin-multiline-output "" }
2302+ int c1 = 'c1';
2303+ ^~~~
2304+ { dg-end-multiline-output "" } */
2305+
2306+/* This line starts with <tabstop> spaces. */
2307+ int c2 = 'c2'; /* { dg-warning "19: multi-character character constant" } */
2308+/* { dg-begin-multiline-output "" }
2309+ int c2 = 'c2';
2310+ ^~~~
2311+ { dg-end-multiline-output "" } */
2312+
2313+/* This line starts with <tabstop> spaces and has an internal tab after
2314+ a space. */
2315+ int c3 = 'c3'; /* { dg-warning "28: multi-character character constant" } */
2316+/* { dg-begin-multiline-output "" }
2317+ int c3 = 'c3';
2318+ ^~~~
2319+ { dg-end-multiline-output "" } */
2320diff --git a/gcc/testsuite/c-c++-common/missing-close-symbol.c b/gcc/testsuite/c-c++-common/missing-close-symbol.c
2321--- a/gcc/testsuite/c-c++-common/missing-close-symbol.c 2020-07-22 23:35:17.912390810 -0700
2322+++ b/gcc/testsuite/c-c++-common/missing-close-symbol.c 2021-12-25 01:20:53.491636427 -0800
2323@@ -24,9 +24,9 @@ void test_static_assert_different_line (
2324 _Static_assert(sizeof(int) >= sizeof(char), /* { dg-message "to match this '\\('" } */
2325 "msg"; /* { dg-error "expected '\\)' before ';' token" } */
2326 /* { dg-begin-multiline-output "" }
2327- "msg";
2328- ^
2329- )
2330+ "msg";
2331+ ^
2332+ )
2333 { dg-end-multiline-output "" } */
2334 /* { dg-begin-multiline-output "" }
2335 _Static_assert(sizeof(int) >= sizeof(char),
2336diff --git a/gcc/testsuite/c-c++-common/Wmisleading-indentation-3.c b/gcc/testsuite/c-c++-common/Wmisleading-indentation-3.c
2337--- a/gcc/testsuite/c-c++-common/Wmisleading-indentation-3.c 2020-07-22 23:35:17.904390722 -0700
2338+++ b/gcc/testsuite/c-c++-common/Wmisleading-indentation-3.c 2021-12-25 01:20:53.487636494 -0800
2339@@ -36,20 +36,20 @@ int fn_6 (int a, int b, int c)
2340 /* ... */
2341 if ((err = foo (a)) != 0)
2342 goto fail;
2343- if ((err = foo (b)) != 0) /* { dg-message "2: this 'if' clause does not guard..." } */
2344+ if ((err = foo (b)) != 0) /* { dg-message "9: this 'if' clause does not guard..." } */
2345 goto fail;
2346- goto fail; /* { dg-message "3: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2347+ goto fail; /* { dg-message "17: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2348 if ((err = foo (c)) != 0)
2349 goto fail;
2350 /* ... */
2351
2352 /* { dg-begin-multiline-output "" }
2353- if ((err = foo (b)) != 0)
2354- ^~
2355+ if ((err = foo (b)) != 0)
2356+ ^~
2357 { dg-end-multiline-output "" } */
2358 /* { dg-begin-multiline-output "" }
2359- goto fail;
2360- ^~~~
2361+ goto fail;
2362+ ^~~~
2363 { dg-end-multiline-output "" } */
2364
2365 fail:
2366diff --git a/gcc/testsuite/c-c++-common/Wmisleading-indentation.c b/gcc/testsuite/c-c++-common/Wmisleading-indentation.c
2367--- a/gcc/testsuite/c-c++-common/Wmisleading-indentation.c 2020-07-22 23:35:17.904390722 -0700
2368+++ b/gcc/testsuite/c-c++-common/Wmisleading-indentation.c 2021-12-25 01:20:53.487636494 -0800
2369@@ -65,9 +65,9 @@ int fn_6 (int a, int b, int c)
2370 /* ... */
2371 if ((err = foo (a)) != 0)
2372 goto fail;
2373- if ((err = foo (b)) != 0) /* { dg-message "2: this 'if' clause does not guard..." } */
2374+ if ((err = foo (b)) != 0) /* { dg-message "9: this 'if' clause does not guard..." } */
2375 goto fail;
2376- goto fail; /* { dg-message "3: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2377+ goto fail; /* { dg-message "17: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2378 if ((err = foo (c)) != 0)
2379 goto fail;
2380 /* ... */
2381@@ -178,7 +178,7 @@ void fn_16_tabs (void)
2382 while (flagA)
2383 if (flagB) /* { dg-message "7: this 'if' clause does not guard..." } */
2384 foo (0);
2385- foo (1);/* { dg-message "2: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2386+ foo (1);/* { dg-message "9: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'" } */
2387 }
2388
2389 void fn_17_spaces (void)
2390diff --git a/gcc/testsuite/gcc.dg/analyzer/malloc-paths-9.c b/gcc/testsuite/gcc.dg/analyzer/malloc-paths-9.c
2391--- a/gcc/testsuite/gcc.dg/analyzer/malloc-paths-9.c 2020-07-22 23:35:18.124393144 -0700
2392+++ b/gcc/testsuite/gcc.dg/analyzer/malloc-paths-9.c 2021-12-25 01:20:53.491636427 -0800
2393@@ -288,7 +288,7 @@ int test_3 (int x, int y)
2394 | | ~~~~~~~~~~
2395 | | |
2396 | | (4) ...to here
2397- | NN | to dereference it above
2398+ | NN | to dereference it above
2399 | NN | return *ptr;
2400 | | ~~~~
2401 | | |
2402diff --git a/gcc/testsuite/gcc.dg/bad-binary-ops.c b/gcc/testsuite/gcc.dg/bad-binary-ops.c
2403--- a/gcc/testsuite/gcc.dg/bad-binary-ops.c 2020-07-22 23:35:18.128393190 -0700
2404+++ b/gcc/testsuite/gcc.dg/bad-binary-ops.c 2021-12-25 01:20:53.491636427 -0800
2405@@ -35,10 +35,10 @@ int test_2 (void)
2406 ~~~~~~~~~~~~~~~~
2407 |
2408 struct s
2409- + some_other_function ());
2410- ^ ~~~~~~~~~~~~~~~~~~~~~~
2411- |
2412- struct t
2413+ + some_other_function ());
2414+ ^ ~~~~~~~~~~~~~~~~~~~~~~
2415+ |
2416+ struct t
2417 { dg-end-multiline-output "" } */
2418 }
2419
2420diff --git a/gcc/testsuite/gcc.dg/format/branch-1.c b/gcc/testsuite/gcc.dg/format/branch-1.c
2421--- a/gcc/testsuite/gcc.dg/format/branch-1.c 2020-07-22 23:35:18.152393454 -0700
2422+++ b/gcc/testsuite/gcc.dg/format/branch-1.c 2021-12-25 01:20:53.491636427 -0800
2423@@ -10,7 +10,7 @@ foo (long l, int nfoo)
2424 {
2425 printf ((nfoo > 1) ? "%d foos" : "%d foo", nfoo);
2426 printf ((l > 1) ? "%d foos" /* { dg-warning "23:int" "wrong type in conditional expr" } */
2427- : "%d foo", l); /* { dg-warning "16:int" "wrong type in conditional expr" } */
2428+ : "%d foo", l); /* { dg-warning "23:int" "wrong type in conditional expr" } */
2429 printf ((l > 1) ? "%ld foos" : "%d foo", l); /* { dg-warning "36:int" "wrong type in conditional expr" } */
2430 printf ((l > 1) ? "%d foos" : "%ld foo", l); /* { dg-warning "23:int" "wrong type in conditional expr" } */
2431 /* Should allow one case to have extra arguments. */
2432diff --git a/gcc/testsuite/gcc.dg/format/pr79210.c b/gcc/testsuite/gcc.dg/format/pr79210.c
2433--- a/gcc/testsuite/gcc.dg/format/pr79210.c 2020-07-22 23:35:18.152393454 -0700
2434+++ b/gcc/testsuite/gcc.dg/format/pr79210.c 2021-12-25 01:20:53.491636427 -0800
2435@@ -20,4 +20,4 @@ LPFC_VPORT_ATTR_R(peer_port_login,
2436 "Allow peer ports on the same physical port to login to each "
2437 "other.");
2438
2439-/* { dg-warning "6: format .%d. expects argument of type .int., but argument 4 has type .unsigned int. " "" { target *-*-* } .-12 } */
2440+/* { dg-warning "20: format .%d. expects argument of type .int., but argument 4 has type .unsigned int. " "" { target *-*-* } .-12 } */
2441diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-expressions-1.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-expressions-1.c
2442--- a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-expressions-1.c 2020-07-22 23:35:18.172393674 -0700
2443+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-expressions-1.c 2021-12-25 01:20:53.491636427 -0800
2444@@ -540,15 +540,15 @@ void test_builtin_types_compatible_p (un
2445 __emit_expression_range (0,
2446 f (i) + __builtin_types_compatible_p (long, int)); /* { dg-warning "range" } */
2447 /* { dg-begin-multiline-output "" }
2448- f (i) + __builtin_types_compatible_p (long, int));
2449- ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2450+ f (i) + __builtin_types_compatible_p (long, int));
2451+ ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2452 { dg-end-multiline-output "" } */
2453
2454 __emit_expression_range (0,
2455 __builtin_types_compatible_p (long, int) + f (i)); /* { dg-warning "range" } */
2456 /* { dg-begin-multiline-output "" }
2457- __builtin_types_compatible_p (long, int) + f (i));
2458- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~
2459+ __builtin_types_compatible_p (long, int) + f (i));
2460+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~
2461 { dg-end-multiline-output "" } */
2462 }
2463
2464@@ -671,8 +671,8 @@ void test_multiple_ordinary_maps (void)
2465 /* { dg-begin-multiline-output "" }
2466 __emit_expression_range (0, foo (0,
2467 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2468- "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"));
2469- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2470+ "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"));
2471+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2472 { dg-end-multiline-output "" } */
2473
2474 /* Another expression that transitions between ordinary maps; this
2475@@ -685,8 +685,8 @@ void test_multiple_ordinary_maps (void)
2476 /* { dg-begin-multiline-output "" }
2477 __emit_expression_range (0, foo (0, "01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567
890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789",
2478 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2479- 0));
2480- ~~
2481+ 0));
2482+ ~~
2483 { dg-end-multiline-output "" } */
2484 }
2485
2486diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c
2487--- a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c 2020-07-22 23:35:18.172393674 -0700
2488+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c 2021-12-25 01:20:53.491636427 -0800
2489@@ -335,11 +335,11 @@ pr87652 (const char *stem, int counter)
2490 /* { dg-error "unable to read substring location: unable to read source line" "" { target c } 329 } */
2491 /* { dg-error "unable to read substring location: failed to get ordinary maps" "" { target c++ } 329 } */
2492 /* { dg-begin-multiline-output "" }
2493- __emit_string_literal_range(__FILE__":%5d: " format, \
2494+ __emit_string_literal_range(__FILE__":%5d: " format, \
2495 ^~~~~~~~
2496 { dg-end-multiline-output "" { target c } } */
2497 /* { dg-begin-multiline-output "" }
2498- __emit_string_literal_range(__FILE__":%5d: " format, \
2499+ __emit_string_literal_range(__FILE__":%5d: " format, \
2500 ^
2501 { dg-end-multiline-output "" { target c++ } } */
2502
2503diff --git a/gcc/testsuite/gcc.dg/redecl-4.c b/gcc/testsuite/gcc.dg/redecl-4.c
2504--- a/gcc/testsuite/gcc.dg/redecl-4.c 2020-07-22 23:35:18.192393895 -0700
2505+++ b/gcc/testsuite/gcc.dg/redecl-4.c 2021-12-25 01:20:53.491636427 -0800
2506@@ -15,7 +15,7 @@ f (void)
2507 /* Should get format warnings even though the built-in declaration
2508 isn't "visible". */
2509 printf (
2510- "%s", 1); /* { dg-warning "8:format" } */
2511+ "%s", 1); /* { dg-warning "15:format" } */
2512 /* The type of strcmp here should have no prototype. */
2513 if (0)
2514 strcmp (1);
2515diff --git a/gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C b/gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C
2516--- a/gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C 2020-07-22 23:35:17.972391472 -0700
2517+++ b/gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C 2021-12-25 01:20:53.491636427 -0800
2518@@ -33,10 +33,10 @@ int test_2 (void)
2519 ~~~~~~~~~~~~~~~~
2520 |
2521 s
2522- + some_other_function ());
2523- ^ ~~~~~~~~~~~~~~~~~~~~~~
2524- |
2525- t
2526+ + some_other_function ());
2527+ ^ ~~~~~~~~~~~~~~~~~~~~~~
2528+ |
2529+ t
2530 { dg-end-multiline-output "" } */
2531 }
2532
2533diff --git a/gcc/testsuite/g++.dg/parse/error4.C b/gcc/testsuite/g++.dg/parse/error4.C
2534--- a/gcc/testsuite/g++.dg/parse/error4.C 2020-07-22 23:35:18.012391910 -0700
2535+++ b/gcc/testsuite/g++.dg/parse/error4.C 2021-12-25 01:20:53.491636427 -0800
2536@@ -7,4 +7,4 @@ struct X {
2537 int);
2538 };
2539
2540-// { dg-error "4:'itn' has not been declared" "" { target *-*-* } 6 }
2541+// { dg-error "18:'itn' has not been declared" "" { target *-*-* } 6 }
2542diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90
2543--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2020-07-22 23:35:18.512397420 -0700
2544+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-25 01:20:53.491636427 -0800
2545@@ -8,17 +8,22 @@
2546 ! We can't rely on any ordering of the keys.
2547
2548 ! { dg-regexp "\"kind\": \"error\"" }
2549+! { dg-regexp "\"column-origin\": 1" }
2550 ! { dg-regexp "\"message\": \"#error message\"" }
2551
2552 ! { dg-regexp "\"caret\": \{" }
2553 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-1.F90\"" }
2554 ! { dg-regexp "\"line\": 4" }
2555 ! { dg-regexp "\"column\": 2" }
2556+! { dg-regexp "\"display-column\": 2" }
2557+! { dg-regexp "\"byte-column\": 2" }
2558
2559 ! { dg-regexp "\"finish\": \{" }
2560 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-1.F90\"" }
2561 ! { dg-regexp "\"line\": 4" }
2562 ! { dg-regexp "\"column\": 6" }
2563+! { dg-regexp "\"display-column\": 6" }
2564+! { dg-regexp "\"byte-column\": 6" }
2565
2566 ! { dg-regexp "\"locations\": \[\[\{\}, \]*\]" }
2567 ! { dg-regexp "\"children\": \[\[\]\[\]\]" }
2568diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90
2569--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2020-07-22 23:35:18.512397420 -0700
2570+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-25 01:20:53.491636427 -0800
2571@@ -8,6 +8,7 @@
2572 ! We can't rely on any ordering of the keys.
2573
2574 ! { dg-regexp "\"kind\": \"warning\"" }
2575+! { dg-regexp "\"column-origin\": 1" }
2576 ! { dg-regexp "\"message\": \"#warning message\"" }
2577 ! { dg-regexp "\"option\": \"-Wcpp\"" }
2578 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
2579@@ -16,11 +17,15 @@
2580 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-2.F90\"" }
2581 ! { dg-regexp "\"line\": 4" }
2582 ! { dg-regexp "\"column\": 2" }
2583+! { dg-regexp "\"display-column\": 2" }
2584+! { dg-regexp "\"byte-column\": 2" }
2585
2586 ! { dg-regexp "\"finish\": \{" }
2587 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-2.F90\"" }
2588 ! { dg-regexp "\"line\": 4" }
2589 ! { dg-regexp "\"column\": 8" }
2590+! { dg-regexp "\"display-column\": 8" }
2591+! { dg-regexp "\"byte-column\": 8" }
2592
2593 ! { dg-regexp "\"locations\": \[\[\{\}, \]*\]" }
2594 ! { dg-regexp "\"children\": \[\[\]\[\]\]" }
2595diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90
2596--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2020-07-22 23:35:18.512397420 -0700
2597+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-25 01:20:53.491636427 -0800
2598@@ -8,6 +8,7 @@
2599 ! We can't rely on any ordering of the keys.
2600
2601 ! { dg-regexp "\"kind\": \"error\"" }
2602+! { dg-regexp "\"column-origin\": 1" }
2603 ! { dg-regexp "\"message\": \"#warning message\"" }
2604 ! { dg-regexp "\"option\": \"-Werror=cpp\"" }
2605 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
2606@@ -16,11 +17,15 @@
2607 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-3.F90\"" }
2608 ! { dg-regexp "\"line\": 4" }
2609 ! { dg-regexp "\"column\": 2" }
2610+! { dg-regexp "\"display-column\": 2" }
2611+! { dg-regexp "\"byte-column\": 2" }
2612
2613 ! { dg-regexp "\"finish\": \{" }
2614 ! { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-3.F90\"" }
2615 ! { dg-regexp "\"line\": 4" }
2616 ! { dg-regexp "\"column\": 8" }
2617+! { dg-regexp "\"display-column\": 8" }
2618+! { dg-regexp "\"byte-column\": 8" }
2619
2620 ! { dg-regexp "\"locations\": \[\[\{\}, \]*\]" }
2621 ! { dg-regexp "\"children\": \[\[\]\[\]\]" }
2622diff --git a/gcc/testsuite/go.dg/arrayclear.go b/gcc/testsuite/go.dg/arrayclear.go
2623--- a/gcc/testsuite/go.dg/arrayclear.go 2020-07-22 23:35:18.588398257 -0700
2624+++ b/gcc/testsuite/go.dg/arrayclear.go 2021-12-25 01:20:53.491636427 -0800
2625@@ -1,5 +1,8 @@
2626 // { dg-do compile }
2627 // { dg-options "-fgo-debug-optimization" }
2628+// This comment is necessary to work around a dejagnu bug. Otherwise, the
2629+// column of the second error message would equal the row of the first one, and
2630+// since the errors are also identical, dejagnu is not able to distinguish them.
2631
2632 package p
2633
2634diff --git a/gcc/testsuite/g++.old-deja/g++.brendan/crash11.C b/gcc/testsuite/g++.old-deja/g++.brendan/crash11.C
2635--- a/gcc/testsuite/g++.old-deja/g++.brendan/crash11.C 2020-07-22 23:35:18.048392308 -0700
2636+++ b/gcc/testsuite/g++.old-deja/g++.brendan/crash11.C 2021-12-25 01:20:53.491636427 -0800
2637@@ -9,13 +9,13 @@ class A {
2638 int h;
2639 A() { i=10; j=20; }
2640 virtual void f1() { printf("i=%d j=%d\n",i,j); }
2641- friend virtual void f2() { printf("i=%d j=%d\n",i,j); } // { dg-error "9:virtual functions cannot be friends" }
2642+ friend virtual void f2() { printf("i=%d j=%d\n",i,j); } // { dg-error "16:virtual functions cannot be friends" }
2643 };
2644
2645 class B : public A {
2646 public:
2647 virtual void f1() { printf("i=%d j=%d\n",i,j); }// { dg-error "" } member.*// ERROR - member.*
2648- friend virtual void f2() { printf("i=%d j=%d\n",i,j); } // { dg-error "9:virtual functions cannot be friends" }
2649+ friend virtual void f2() { printf("i=%d j=%d\n",i,j); } // { dg-error "16:virtual functions cannot be friends" }
2650 // { dg-error "private" "" { target *-*-* } .-1 }
2651 };
2652
2653diff --git a/gcc/testsuite/g++.old-deja/g++.pt/overload2.C b/gcc/testsuite/g++.old-deja/g++.pt/overload2.C
2654--- a/gcc/testsuite/g++.old-deja/g++.pt/overload2.C 2020-07-22 23:35:18.072392572 -0700
2655+++ b/gcc/testsuite/g++.old-deja/g++.pt/overload2.C 2021-12-25 01:20:53.491636427 -0800
2656@@ -12,5 +12,5 @@ int
2657 main()
2658 {
2659 C<char*> c;
2660- char* p = Z(c.O); //{ dg-error "13:'Z' was not declared" } ambiguous c.O
2661+ char* p = Z(c.O); //{ dg-error "29:'Z' was not declared" } ambiguous c.O
2662 }
2663diff --git a/gcc/testsuite/g++.old-deja/g++.robertl/eb109.C b/gcc/testsuite/g++.old-deja/g++.robertl/eb109.C
2664--- a/gcc/testsuite/g++.old-deja/g++.robertl/eb109.C 2020-07-22 23:35:18.076392617 -0700
2665+++ b/gcc/testsuite/g++.old-deja/g++.robertl/eb109.C 2021-12-25 01:20:53.491636427 -0800
2666@@ -48,8 +48,8 @@ ostream& operator<<(ostream& os, Graph<V
2667
2668 // The compiler does not like this line!!!!!!
2669 typename Graph<VertexType, EdgeType>::Successor::iterator
2670- startN = G[i].second.begin(), // { dg-error "14:no match" } no index operator
2671- endN = G[i].second.end(); // { dg-error "14:no match" } no index operator
2672+ startN = G[i].second.begin(), // { dg-error "21:no match" } no index operator
2673+ endN = G[i].second.end(); // { dg-error "21:no match" } no index operator
2674
2675 while(startN != endN)
2676 {
2677diff --git a/gcc/tree-diagnostic-path.cc b/gcc/tree-diagnostic-path.cc
2678--- a/gcc/tree-diagnostic-path.cc 2020-07-22 23:35:18.628398698 -0700
2679+++ b/gcc/tree-diagnostic-path.cc 2021-12-25 01:20:53.491636427 -0800
2680@@ -493,7 +493,7 @@ default_tree_diagnostic_path_printer (di
2681 doesn't have access to trees (for m_fndecl). */
2682
2683 json::value *
2684-default_tree_make_json_for_path (diagnostic_context *,
2685+default_tree_make_json_for_path (diagnostic_context *context,
2686 const diagnostic_path *path)
2687 {
2688 json::array *path_array = new json::array ();
2689@@ -504,7 +504,8 @@ default_tree_make_json_for_path (diagnos
2690 json::object *event_obj = new json::object ();
2691 if (event.get_location ())
2692 event_obj->set ("location",
2693- json_from_expanded_location (event.get_location ()));
2694+ json_from_expanded_location (context,
2695+ event.get_location ()));
2696 label_text event_text (event.get_desc (false));
2697 event_obj->set ("description", new json::string (event_text.m_buffer));
2698 event_text.maybe_free ();
2699diff --git a/libcpp/charset.c b/libcpp/charset.c
2700--- a/libcpp/charset.c 2020-07-22 23:35:18.712399623 -0700
2701+++ b/libcpp/charset.c 2021-12-25 01:20:53.491636427 -0800
2702@@ -2276,49 +2276,90 @@ cpp_string_location_reader::get_next ()
2703 return result;
2704 }
2705
2706-/* Helper for cpp_byte_column_to_display_column and its inverse. Given a
2707- pointer to a UTF-8-encoded character, compute its display width. *INBUFP
2708- points on entry to the start of the UTF-8 encoding of the character, and
2709- is updated to point just after the last byte of the encoding. *INBYTESLEFTP
2710- contains on entry the remaining size of the buffer into which *INBUFP
2711- points, and this is also updated accordingly. If *INBUFP does not
2712+cpp_display_width_computation::
2713+cpp_display_width_computation (const char *data, int data_length, int tabstop) :
2714+ m_begin (data),
2715+ m_next (m_begin),
2716+ m_bytes_left (data_length),
2717+ m_tabstop (tabstop),
2718+ m_display_cols (0)
2719+{
2720+ gcc_assert (m_tabstop > 0);
2721+}
2722+
2723+
2724+/* The main implementation function for class cpp_display_width_computation.
2725+ m_next points on entry to the start of the UTF-8 encoding of the next
2726+ character, and is updated to point just after the last byte of the encoding.
2727+ m_bytes_left contains on entry the remaining size of the buffer into which
2728+ m_next points, and this is also updated accordingly. If m_next does not
2729 point to a valid UTF-8-encoded sequence, then it will be treated as a single
2730- byte with display width 1. */
2731+ byte with display width 1. m_display_cols is the current display column,
2732+ relative to which tab stops should be expanded. Returns the display width of
2733+ the codepoint just processed. */
2734
2735-static inline int
2736-compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp)
2737+int
2738+cpp_display_width_computation::process_next_codepoint ()
2739 {
2740 cppchar_t c;
2741- if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0)
2742+ int next_width;
2743+
2744+ if (*m_next == '\t')
2745+ {
2746+ ++m_next;
2747+ --m_bytes_left;
2748+ next_width = m_tabstop - (m_display_cols % m_tabstop);
2749+ }
2750+ else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c)
2751+ != 0)
2752 {
2753 /* Input is not convertible to UTF-8. This could be fine, e.g. in a
2754 string literal, so don't complain. Just treat it as if it has a width
2755 of one. */
2756- ++*inbufp;
2757- --*inbytesleftp;
2758- return 1;
2759+ ++m_next;
2760+ --m_bytes_left;
2761+ next_width = 1;
2762+ }
2763+ else
2764+ {
2765+ /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */
2766+ next_width = cpp_wcwidth (c);
2767 }
2768
2769- /* one_utf8_to_cppchar() has updated inbufp and inbytesleftp for us. */
2770- return cpp_wcwidth (c);
2771+ m_display_cols += next_width;
2772+ return next_width;
2773+}
2774+
2775+/* Utility to advance the byte stream by the minimum amount needed to consume
2776+ N display columns. Returns the number of display columns that were
2777+ actually skipped. This could be less than N, if there was not enough data,
2778+ or more than N, if the last character to be skipped had a sufficiently large
2779+ display width. */
2780+int
2781+cpp_display_width_computation::advance_display_cols (int n)
2782+{
2783+ const int start = m_display_cols;
2784+ const int target = start + n;
2785+ while (m_display_cols < target && !done ())
2786+ process_next_codepoint ();
2787+ return m_display_cols - start;
2788 }
2789
2790 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute
2791 how many display columns are occupied by the first COLUMN bytes. COLUMN
2792 may exceed DATA_LENGTH, in which case the phantom bytes at the end are
2793- treated as if they have display width 1. */
2794+ treated as if they have display width 1. Tabs are expanded to the next tab
2795+ stop, relative to the start of DATA. */
2796
2797 int
2798 cpp_byte_column_to_display_column (const char *data, int data_length,
2799- int column)
2800+ int column, int tabstop)
2801 {
2802- int display_col = 0;
2803- const uchar *udata = (const uchar *) data;
2804 const int offset = MAX (0, column - data_length);
2805- size_t inbytesleft = column - offset;
2806- while (inbytesleft)
2807- display_col += compute_next_display_width (&udata, &inbytesleft);
2808- return display_col + offset;
2809+ cpp_display_width_computation dw (data, column - offset, tabstop);
2810+ while (!dw.done ())
2811+ dw.process_next_codepoint ();
2812+ return dw.display_cols_processed () + offset;
2813 }
2814
2815 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute
2816@@ -2328,14 +2369,11 @@ cpp_byte_column_to_display_column (const
2817
2818 int
2819 cpp_display_column_to_byte_column (const char *data, int data_length,
2820- int display_col)
2821+ int display_col, int tabstop)
2822 {
2823- int column = 0;
2824- const uchar *udata = (const uchar *) data;
2825- size_t inbytesleft = data_length;
2826- while (column < display_col && inbytesleft)
2827- column += compute_next_display_width (&udata, &inbytesleft);
2828- return data_length - inbytesleft + MAX (0, display_col - column);
2829+ cpp_display_width_computation dw (data, data_length, tabstop);
2830+ const int avail_display = dw.advance_display_cols (display_col);
2831+ return dw.bytes_processed () + MAX (0, display_col - avail_display);
2832 }
2833
2834 /* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc,
2835diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
2836--- a/libcpp/include/cpplib.h 2020-07-22 23:35:18.712399623 -0700
2837+++ b/libcpp/include/cpplib.h 2021-12-25 01:20:53.491636427 -0800
2838@@ -312,9 +312,6 @@ enum cpp_normalize_level {
2839 carries all the options visible to the command line. */
2840 struct cpp_options
2841 {
2842- /* Characters between tab stops. */
2843- unsigned int tabstop;
2844-
2845 /* The language we're preprocessing. */
2846 enum c_lang lang;
2847
2848@@ -1322,14 +1319,43 @@ extern const char * cpp_get_userdef_suff
2849 (const cpp_token *);
2850
2851 /* In charset.c */
2852+
2853+/* A class to manage the state while converting a UTF-8 sequence to cppchar_t
2854+ and computing the display width one character at a time. */
2855+class cpp_display_width_computation {
2856+ public:
2857+ cpp_display_width_computation (const char *data, int data_length,
2858+ int tabstop);
2859+ const char *next_byte () const { return m_next; }
2860+ int bytes_processed () const { return m_next - m_begin; }
2861+ int bytes_left () const { return m_bytes_left; }
2862+ bool done () const { return !bytes_left (); }
2863+ int display_cols_processed () const { return m_display_cols; }
2864+
2865+ int process_next_codepoint ();
2866+ int advance_display_cols (int n);
2867+
2868+ private:
2869+ const char *const m_begin;
2870+ const char *m_next;
2871+ size_t m_bytes_left;
2872+ const int m_tabstop;
2873+ int m_display_cols;
2874+};
2875+
2876+/* Convenience functions that are simple use cases for class
2877+ cpp_display_width_computation. Tab characters will be expanded to spaces
2878+ as determined by TABSTOP. */
2879 int cpp_byte_column_to_display_column (const char *data, int data_length,
2880- int column);
2881-inline int cpp_display_width (const char *data, int data_length)
2882+ int column, int tabstop);
2883+inline int cpp_display_width (const char *data, int data_length,
2884+ int tabstop)
2885 {
2886- return cpp_byte_column_to_display_column (data, data_length, data_length);
2887+ return cpp_byte_column_to_display_column (data, data_length, data_length,
2888+ tabstop);
2889 }
2890 int cpp_display_column_to_byte_column (const char *data, int data_length,
2891- int display_col);
2892+ int display_col, int tabstop);
2893 int cpp_wcwidth (cppchar_t c);
2894
2895 #endif /* ! LIBCPP_CPPLIB_H */
2896diff --git a/libcpp/init.c b/libcpp/init.c
2897--- a/libcpp/init.c 2020-07-22 23:35:18.712399623 -0700
2898+++ b/libcpp/init.c 2021-12-25 01:20:53.491636427 -0800
2899@@ -190,7 +190,6 @@ cpp_create_reader (enum c_lang lang, cpp
2900 CPP_OPTION (pfile, discard_comments) = 1;
2901 CPP_OPTION (pfile, discard_comments_in_macro_exp) = 1;
2902 CPP_OPTION (pfile, max_include_depth) = 200;
2903- CPP_OPTION (pfile, tabstop) = 8;
2904 CPP_OPTION (pfile, operator_names) = 1;
2905 CPP_OPTION (pfile, warn_trigraphs) = 2;
2906 CPP_OPTION (pfile, warn_endif_labels) = 1;
diff --git a/meta/recipes-devtools/gcc/gcc/0002-CVE-2021-42574.patch b/meta/recipes-devtools/gcc/gcc/0002-CVE-2021-42574.patch
new file mode 100644
index 0000000000..5b1896ed69
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0002-CVE-2021-42574.patch
@@ -0,0 +1,2270 @@
1From bd5e882cf6e0def3dd1bc106075d59a303fe0d1e Mon Sep 17 00:00:00 2001
2From: David Malcolm <dmalcolm@redhat.com>
3Date: Mon, 18 Oct 2021 18:55:31 -0400
4Subject: [PATCH] diagnostics: escape non-ASCII source bytes for certain
5 diagnostics
6MIME-Version: 1.0
7Content-Type: text/plain; charset=utf8
8Content-Transfer-Encoding: 8bit
9
10This patch adds support to GCC's diagnostic subsystem for escaping certain
11bytes and Unicode characters when quoting source code.
12
13Specifically, this patch adds a new flag rich_location::m_escape_on_output
14which is a hint from a diagnostic that non-ASCII bytes in the pertinent
15lines of the user's source code should be escaped when printed.
16
17The patch sets this for the following diagnostics:
18- when complaining about stray bytes in the program (when these
19are non-printable)
20- when complaining about "null character(s) ignored"
21- for -Wnormalized= (and generate source ranges for such warnings)
22
23The escaping is controlled by a new option:
24 -fdiagnostics-escape-format=[unicode|bytes]
25
26For example, consider a diagnostic involving a source line containing the
27string "before" followed by the Unicode character U+03C0 ("GREEK SMALL
28LETTER PI", with UTF-8 encoding 0xCF 0x80) followed by the byte 0xBF
29(a stray UTF-8 trailing byte), followed by the string "after", where the
30diagnostic highlights the U+03C0 character.
31
32By default, this line will be printed verbatim to the user when
33reporting a diagnostic at it, as:
34
35 beforeπXafter
36 ^
37
38(using X for the stray byte to avoid putting invalid UTF-8 in this
39commit message)
40
41If the diagnostic sets the "escape" flag, it will be printed as:
42
43 before<U+03C0><BF>after
44 ^~~~~~~~
45
46with -fdiagnostics-escape-format=unicode (the default), or as:
47
48 before<CF><80><BF>after
49 ^~~~~~~~
50
51if the user supplies -fdiagnostics-escape-format=bytes.
52
53This only affects how the source is printed; it does not affect
54how column numbers are printed (as per -fdiagnostics-column-unit=
55and -fdiagnostics-column-origin=).
56
57gcc/c-family/ChangeLog:
58 * c-lex.c (c_lex_with_flags): When complaining about non-printable
59 CPP_OTHER tokens, set the "escape on output" flag.
60
61gcc/ChangeLog:
62 * common.opt (fdiagnostics-escape-format=): New.
63 (diagnostics_escape_format): New enum.
64 (DIAGNOSTICS_ESCAPE_FORMAT_UNICODE): New enum value.
65 (DIAGNOSTICS_ESCAPE_FORMAT_BYTES): Likewise.
66 * diagnostic-format-json.cc (json_end_diagnostic): Add
67 "escape-source" attribute.
68 * diagnostic-show-locus.c
69 (exploc_with_display_col::exploc_with_display_col): Replace
70 "tabstop" param with a cpp_char_column_policy and add an "aspect"
71 param. Use these to compute m_display_col accordingly.
72 (struct char_display_policy): New struct.
73 (layout::m_policy): New field.
74 (layout::m_escape_on_output): New field.
75 (def_policy): New function.
76 (make_range): Update for changes to exploc_with_display_col ctor.
77 (default_print_decoded_ch): New.
78 (width_per_escaped_byte): New.
79 (escape_as_bytes_width): New.
80 (escape_as_bytes_print): New.
81 (escape_as_unicode_width): New.
82 (escape_as_unicode_print): New.
83 (make_policy): New.
84 (layout::layout): Initialize new fields. Update m_exploc ctor
85 call for above change to ctor.
86 (layout::maybe_add_location_range): Update for changes to
87 exploc_with_display_col ctor.
88 (layout::calculate_x_offset_display): Update for change to
89 cpp_display_width.
90 (layout::print_source_line): Pass policy
91 to cpp_display_width_computation. Capture cpp_decoded_char when
92 calling process_next_codepoint. Move printing of source code to
93 m_policy.m_print_cb.
94 (line_label::line_label): Pass in policy rather than context.
95 (layout::print_any_labels): Update for change to line_label ctor.
96 (get_affected_range): Pass in policy rather than context, updating
97 calls to location_compute_display_column accordingly.
98 (get_printed_columns): Likewise, also for cpp_display_width.
99 (correction::correction): Pass in policy rather than tabstop.
100 (correction::compute_display_cols): Pass m_policy rather than
101 m_tabstop to cpp_display_width.
102 (correction::m_tabstop): Replace with...
103 (correction::m_policy): ...this.
104 (line_corrections::line_corrections): Pass in policy rather than
105 context.
106 (line_corrections::m_context): Replace with...
107 (line_corrections::m_policy): ...this.
108 (line_corrections::add_hint): Update to use m_policy rather than
109 m_context.
110 (line_corrections::add_hint): Likewise.
111 (layout::print_trailing_fixits): Likewise.
112 (selftest::test_display_widths): New.
113 (selftest::test_layout_x_offset_display_utf8): Update to use
114 policy rather than tabstop.
115 (selftest::test_one_liner_labels_utf8): Add test of escaping
116 source lines.
117 (selftest::test_diagnostic_show_locus_one_liner_utf8): Update to
118 use policy rather than tabstop.
119 (selftest::test_overlapped_fixit_printing): Likewise.
120 (selftest::test_overlapped_fixit_printing_utf8): Likewise.
121 (selftest::test_overlapped_fixit_printing_2): Likewise.
122 (selftest::test_tab_expansion): Likewise.
123 (selftest::test_escaping_bytes_1): New.
124 (selftest::test_escaping_bytes_2): New.
125 (selftest::diagnostic_show_locus_c_tests): Call the new tests.
126 * diagnostic.c (diagnostic_initialize): Initialize
127 context->escape_format.
128 (convert_column_unit): Update to use default character width policy.
129 (selftest::test_diagnostic_get_location_text): Likewise.
130 * diagnostic.h (enum diagnostics_escape_format): New enum.
131 (diagnostic_context::escape_format): New field.
132 * doc/invoke.texi (-fdiagnostics-escape-format=): New option.
133 (-fdiagnostics-format=): Add "escape-source" attribute to examples
134 of JSON output, and document it.
135 * input.c (location_compute_display_column): Pass in "policy"
136 rather than "tabstop", passing to
137 cpp_byte_column_to_display_column.
138 (selftest::test_cpp_utf8): Update to use cpp_char_column_policy.
139 * input.h (class cpp_char_column_policy): New forward decl.
140 (location_compute_display_column): Pass in "policy" rather than
141 "tabstop".
142 * opts.c (common_handle_option): Handle
143 OPT_fdiagnostics_escape_format_.
144 * selftest.c (temp_source_file::temp_source_file): New ctor
145 overload taking a size_t.
146 * selftest.h (temp_source_file::temp_source_file): Likewise.
147
148gcc/testsuite/ChangeLog:
149 * c-c++-common/diagnostic-format-json-1.c: Add regexp to consume
150 "escape-source" attribute.
151 * c-c++-common/diagnostic-format-json-2.c: Likewise.
152 * c-c++-common/diagnostic-format-json-3.c: Likewise.
153 * c-c++-common/diagnostic-format-json-4.c: Likewise, twice.
154 * c-c++-common/diagnostic-format-json-5.c: Likewise.
155 * gcc.dg/cpp/warn-normalized-4-bytes.c: New test.
156 * gcc.dg/cpp/warn-normalized-4-unicode.c: New test.
157 * gcc.dg/encoding-issues-bytes.c: New test.
158 * gcc.dg/encoding-issues-unicode.c: New test.
159 * gfortran.dg/diagnostic-format-json-1.F90: Add regexp to consume
160 "escape-source" attribute.
161 * gfortran.dg/diagnostic-format-json-2.F90: Likewise.
162 * gfortran.dg/diagnostic-format-json-3.F90: Likewise.
163
164libcpp/ChangeLog:
165 * charset.c (convert_escape): Use encoding_rich_location when
166 complaining about nonprintable unknown escape sequences.
167 (cpp_display_width_computation::cpp_display_width_computation):
168 Pass in policy rather than tabstop.
169 (cpp_display_width_computation::process_next_codepoint): Add "out"
170 param and populate *out if non-NULL.
171 (cpp_display_width_computation::advance_display_cols): Pass NULL
172 to process_next_codepoint.
173 (cpp_byte_column_to_display_column): Pass in policy rather than
174 tabstop. Pass NULL to process_next_codepoint.
175 (cpp_display_column_to_byte_column): Pass in policy rather than
176 tabstop.
177 * errors.c (cpp_diagnostic_get_current_location): New function,
178 splitting out the logic from...
179 (cpp_diagnostic): ...here.
180 (cpp_warning_at): New function.
181 (cpp_pedwarning_at): New function.
182 * include/cpplib.h (cpp_warning_at): New decl for rich_location.
183 (cpp_pedwarning_at): Likewise.
184 (struct cpp_decoded_char): New.
185 (struct cpp_char_column_policy): New.
186 (cpp_display_width_computation::cpp_display_width_computation):
187 Replace "tabstop" param with "policy".
188 (cpp_display_width_computation::process_next_codepoint): Add "out"
189 param.
190 (cpp_display_width_computation::m_tabstop): Replace with...
191 (cpp_display_width_computation::m_policy): ...this.
192 (cpp_byte_column_to_display_column): Replace "tabstop" param with
193 "policy".
194 (cpp_display_width): Likewise.
195 (cpp_display_column_to_byte_column): Likewise.
196 * include/line-map.h (rich_location::escape_on_output_p): New.
197 (rich_location::set_escape_on_output): New.
198 (rich_location::m_escape_on_output): New.
199 * internal.h (cpp_diagnostic_get_current_location): New decl.
200 (class encoding_rich_location): New.
201 * lex.c (skip_whitespace): Use encoding_rich_location when
202 complaining about null characters.
203 (warn_about_normalization): Generate a source range when
204 complaining about improperly normalized tokens, rather than just a
205 point, and use encoding_rich_location so that the source code
206 is escaped on printing.
207 * line-map.c (rich_location::rich_location): Initialize
208 m_escape_on_output.
209
210Signed-off-by: David Malcolm <dmalcolm@redhat.com>
211
212CVE: CVE-2021-42574
213Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e]
214Signed-off-by: Pgowda <pgowda.cve@gmail.com>
215
216---
217 gcc/c-family/c-lex.c | 6 +-
218 gcc/common.opt | 13 +
219 gcc/diagnostic-format-json.cc | 3 +
220 gcc/diagnostic-show-locus.c | 580 +++++++++++++++---
221 gcc/diagnostic.c | 10 +-
222 gcc/diagnostic.h | 18 +
223 gcc/doc/invoke.texi | 43 +-
224 gcc/input.c | 62 +-
225 gcc/input.h | 7 +-
226 gcc/opts.c | 4 +
227 gcc/selftest.c | 15 +
228 gcc/selftest.h | 2 +
229 .../c-c++-common/diagnostic-format-json-1.c | 1 +
230 .../c-c++-common/diagnostic-format-json-2.c | 1 +
231 .../c-c++-common/diagnostic-format-json-3.c | 1 +
232 .../c-c++-common/diagnostic-format-json-4.c | 2 +
233 .../c-c++-common/diagnostic-format-json-5.c | 1 +
234 .../gcc.dg/cpp/warn-normalized-4-bytes.c | 21 +
235 .../gcc.dg/cpp/warn-normalized-4-unicode.c | 19 +
236 gcc/testsuite/gcc.dg/encoding-issues-bytes.c | Bin 0 -> 595 bytes
237 .../gcc.dg/encoding-issues-unicode.c | Bin 0 -> 613 bytes
238 .../gfortran.dg/diagnostic-format-json-1.F90 | 1 +
239 .../gfortran.dg/diagnostic-format-json-2.F90 | 1 +
240 .../gfortran.dg/diagnostic-format-json-3.F90 | 1 +
241 libcpp/charset.c | 63 +-
242 libcpp/errors.c | 82 ++-
243 libcpp/include/cpplib.h | 76 ++-
244 libcpp/include/line-map.h | 13 +
245 libcpp/internal.h | 23 +
246 libcpp/lex.c | 38 +-
247 libcpp/line-map.c | 3 +-
248 31 files changed, 942 insertions(+), 168 deletions(-)
249 create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c
250 create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c
251 create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-bytes.c
252 create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-unicode.c
253
254diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
255--- a/gcc/c-family/c-lex.c 2020-07-22 23:35:17.296384022 -0700
256+++ b/gcc/c-family/c-lex.c 2021-12-25 01:30:50.669689023 -0800
257@@ -587,7 +587,11 @@ c_lex_with_flags (tree *value, location_
258 else if (ISGRAPH (c))
259 error_at (*loc, "stray %qc in program", (int) c);
260 else
261- error_at (*loc, "stray %<\\%o%> in program", (int) c);
262+ {
263+ rich_location rich_loc (line_table, *loc);
264+ rich_loc.set_escape_on_output (true);
265+ error_at (&rich_loc, "stray %<\\%o%> in program", (int) c);
266+ }
267 }
268 goto retry;
269
270diff --git a/gcc/common.opt b/gcc/common.opt
271--- a/gcc/common.opt 2021-12-25 01:29:12.915317374 -0800
272+++ b/gcc/common.opt 2021-12-25 01:30:50.669689023 -0800
273@@ -1337,6 +1337,10 @@ fdiagnostics-format=
274 Common Joined RejectNegative Enum(diagnostics_output_format)
275 -fdiagnostics-format=[text|json] Select output format.
276
277+fdiagnostics-escape-format=
278+Common Joined RejectNegative Enum(diagnostics_escape_format)
279+-fdiagnostics-escape-format=[unicode|bytes] Select how to escape non-printable-ASCII bytes in the source for diagnostics that suggest it.
280+
281 ; Required for these enum values.
282 SourceInclude
283 diagnostic.h
284@@ -1351,6 +1355,15 @@ EnumValue
285 Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE)
286
287 Enum
288+Name(diagnostics_escape_format) Type(int)
289+
290+EnumValue
291+Enum(diagnostics_escape_format) String(unicode) Value(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE)
292+
293+EnumValue
294+Enum(diagnostics_escape_format) String(bytes) Value(DIAGNOSTICS_ESCAPE_FORMAT_BYTES)
295+
296+Enum
297 Name(diagnostics_output_format) Type(int)
298
299 EnumValue
300diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c
301--- a/gcc/diagnostic.c 2021-12-25 01:29:12.915317374 -0800
302+++ b/gcc/diagnostic.c 2021-12-25 01:30:50.669689023 -0800
303@@ -223,6 +223,7 @@ diagnostic_initialize (diagnostic_contex
304 context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY;
305 context->column_origin = 1;
306 context->tabstop = 8;
307+ context->escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
308 context->edit_context_ptr = NULL;
309 context->diagnostic_group_nesting_depth = 0;
310 context->diagnostic_group_emission_count = 0;
311@@ -2152,8 +2153,8 @@ test_diagnostic_get_location_text ()
312 const char *const content = "smile \xf0\x9f\x98\x82\n";
313 const int line_bytes = strlen (content) - 1;
314 const int def_tabstop = 8;
315- const int display_width = cpp_display_width (content, line_bytes,
316- def_tabstop);
317+ const cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
318+ const int display_width = cpp_display_width (content, line_bytes, policy);
319 ASSERT_EQ (line_bytes - 2, display_width);
320 temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
321 const char *const fname = tmp.get_filename ();
322diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc
323--- a/gcc/diagnostic-format-json.cc 2021-12-25 01:29:12.915317374 -0800
324+++ b/gcc/diagnostic-format-json.cc 2021-12-25 01:30:50.669689023 -0800
325@@ -264,6 +264,9 @@ json_end_diagnostic (diagnostic_context
326 json::value *path_value = context->make_json_for_path (context, path);
327 diag_obj->set ("path", path_value);
328 }
329+
330+ diag_obj->set ("escape-source",
331+ new json::literal (richloc->escape_on_output_p ()));
332 }
333
334 /* No-op implementation of "begin_group_cb" for JSON output. */
335diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h
336--- a/gcc/diagnostic.h 2021-12-25 01:29:12.919317307 -0800
337+++ b/gcc/diagnostic.h 2021-12-25 01:30:50.669689023 -0800
338@@ -38,6 +38,20 @@ enum diagnostics_column_unit
339 DIAGNOSTICS_COLUMN_UNIT_BYTE
340 };
341
342+/* An enum for controlling how to print non-ASCII characters/bytes when
343+ a diagnostic suggests escaping the source code on output. */
344+
345+enum diagnostics_escape_format
346+{
347+ /* Escape non-ASCII Unicode characters in the form <U+XXXX> and
348+ non-UTF-8 bytes in the form <XX>. */
349+ DIAGNOSTICS_ESCAPE_FORMAT_UNICODE,
350+
351+ /* Escape non-ASCII bytes in the form <XX> (thus showing the underlying
352+ encoding of non-ASCII Unicode characters). */
353+ DIAGNOSTICS_ESCAPE_FORMAT_BYTES
354+};
355+
356 /* Enum for overriding the standard output format. */
357
358 enum diagnostics_output_format
359@@ -303,6 +317,10 @@ struct diagnostic_context
360 /* The size of the tabstop for tab expansion. */
361 int tabstop;
362
363+ /* How should non-ASCII/non-printable bytes be escaped when
364+ a diagnostic suggests escaping the source code on output. */
365+ enum diagnostics_escape_format escape_format;
366+
367 /* If non-NULL, an edit_context to which fix-it hints should be
368 applied, for generating patches. */
369 edit_context *edit_context_ptr;
370diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c
371--- a/gcc/diagnostic-show-locus.c 2021-12-25 01:29:12.919317307 -0800
372+++ b/gcc/diagnostic-show-locus.c 2021-12-25 01:30:50.673688956 -0800
373@@ -175,10 +175,26 @@ enum column_unit {
374 class exploc_with_display_col : public expanded_location
375 {
376 public:
377- exploc_with_display_col (const expanded_location &exploc, int tabstop)
378- : expanded_location (exploc),
379- m_display_col (location_compute_display_column (exploc, tabstop))
380- {}
381+ exploc_with_display_col (const expanded_location &exploc,
382+ const cpp_char_column_policy &policy,
383+ enum location_aspect aspect)
384+ : expanded_location (exploc),
385+ m_display_col (location_compute_display_column (exploc, policy))
386+ {
387+ if (exploc.column > 0)
388+ {
389+ /* m_display_col is now the final column of the byte.
390+ If escaping has happened, we may want the first column instead. */
391+ if (aspect != LOCATION_ASPECT_FINISH)
392+ {
393+ expanded_location prev_exploc (exploc);
394+ prev_exploc.column--;
395+ int prev_display_col
396+ = (location_compute_display_column (prev_exploc, policy));
397+ m_display_col = prev_display_col + 1;
398+ }
399+ }
400+ }
401
402 int m_display_col;
403 };
404@@ -313,6 +329,31 @@ test_line_span ()
405
406 #endif /* #if CHECKING_P */
407
408+/* A bundle of information containing how to print unicode
409+ characters and bytes when quoting source code.
410+
411+ Provides a unified place to support escaping some subset
412+ of characters to some format.
413+
414+ Extends char_column_policy; printing is split out to avoid
415+ libcpp having to know about pretty_printer. */
416+
417+struct char_display_policy : public cpp_char_column_policy
418+{
419+ public:
420+ char_display_policy (int tabstop,
421+ int (*width_cb) (cppchar_t c),
422+ void (*print_cb) (pretty_printer *pp,
423+ const cpp_decoded_char &cp))
424+ : cpp_char_column_policy (tabstop, width_cb),
425+ m_print_cb (print_cb)
426+ {
427+ }
428+
429+ void (*m_print_cb) (pretty_printer *pp,
430+ const cpp_decoded_char &cp);
431+};
432+
433 /* A class to control the overall layout when printing a diagnostic.
434
435 The layout is determined within the constructor.
436@@ -345,6 +386,8 @@ class layout
437
438 void print_line (linenum_type row);
439
440+ void on_bad_codepoint (const char *ptr, cppchar_t ch, size_t ch_sz);
441+
442 private:
443 bool will_show_line_p (linenum_type row) const;
444 void print_leading_fixits (linenum_type row);
445@@ -386,6 +429,7 @@ class layout
446 private:
447 diagnostic_context *m_context;
448 pretty_printer *m_pp;
449+ char_display_policy m_policy;
450 location_t m_primary_loc;
451 exploc_with_display_col m_exploc;
452 colorizer m_colorizer;
453@@ -398,6 +442,7 @@ class layout
454 auto_vec <line_span> m_line_spans;
455 int m_linenum_width;
456 int m_x_offset_display;
457+ bool m_escape_on_output;
458 };
459
460 /* Implementation of "class colorizer". */
461@@ -646,6 +691,11 @@ layout_range::intersects_line_p (linenum
462 /* Default for when we don't care what the tab expansion is set to. */
463 static const int def_tabstop = 8;
464
465+static cpp_char_column_policy def_policy ()
466+{
467+ return cpp_char_column_policy (8, cpp_wcwidth);
468+}
469+
470 /* Create some expanded locations for testing layout_range. The filename
471 member of the explocs is set to the empty string. This member will only be
472 inspected by the calls to location_compute_display_column() made from the
473@@ -662,10 +712,13 @@ make_range (int start_line, int start_co
474 = {"", start_line, start_col, NULL, false};
475 const expanded_location finish_exploc
476 = {"", end_line, end_col, NULL, false};
477- return layout_range (exploc_with_display_col (start_exploc, def_tabstop),
478- exploc_with_display_col (finish_exploc, def_tabstop),
479+ return layout_range (exploc_with_display_col (start_exploc, def_policy (),
480+ LOCATION_ASPECT_START),
481+ exploc_with_display_col (finish_exploc, def_policy (),
482+ LOCATION_ASPECT_FINISH),
483 SHOW_RANGE_WITHOUT_CARET,
484- exploc_with_display_col (start_exploc, def_tabstop),
485+ exploc_with_display_col (start_exploc, def_policy (),
486+ LOCATION_ASPECT_CARET),
487 0, NULL);
488 }
489
490@@ -950,6 +1003,164 @@ fixit_cmp (const void *p_a, const void *
491 return hint_a->get_start_loc () - hint_b->get_start_loc ();
492 }
493
494+/* Callbacks for use when not escaping the source. */
495+
496+/* The default callback for char_column_policy::m_width_cb is cpp_wcwidth. */
497+
498+/* Callback for char_display_policy::m_print_cb for printing source chars
499+ when not escaping the source. */
500+
501+static void
502+default_print_decoded_ch (pretty_printer *pp,
503+ const cpp_decoded_char &decoded_ch)
504+{
505+ for (const char *ptr = decoded_ch.m_start_byte;
506+ ptr != decoded_ch.m_next_byte; ptr++)
507+ {
508+ if (*ptr == '\0' || *ptr == '\r')
509+ {
510+ pp_space (pp);
511+ continue;
512+ }
513+
514+ pp_character (pp, *ptr);
515+ }
516+}
517+
518+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
519+
520+static const int width_per_escaped_byte = 4;
521+
522+/* Callback for char_column_policy::m_width_cb for determining the
523+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
524+
525+static int
526+escape_as_bytes_width (cppchar_t ch)
527+{
528+ if (ch < 0x80 && ISPRINT (ch))
529+ return cpp_wcwidth (ch);
530+ else
531+ {
532+ if (ch <= 0x7F) return 1 * width_per_escaped_byte;
533+ if (ch <= 0x7FF) return 2 * width_per_escaped_byte;
534+ if (ch <= 0xFFFF) return 3 * width_per_escaped_byte;
535+ return 4 * width_per_escaped_byte;
536+ }
537+}
538+
539+/* Callback for char_display_policy::m_print_cb for printing source chars
540+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
541+
542+static void
543+escape_as_bytes_print (pretty_printer *pp,
544+ const cpp_decoded_char &decoded_ch)
545+{
546+ if (!decoded_ch.m_valid_ch)
547+ {
548+ for (const char *iter = decoded_ch.m_start_byte;
549+ iter != decoded_ch.m_next_byte; ++iter)
550+ {
551+ char buf[16];
552+ sprintf (buf, "<%02x>", (unsigned char)*iter);
553+ pp_string (pp, buf);
554+ }
555+ return;
556+ }
557+
558+ cppchar_t ch = decoded_ch.m_ch;
559+ if (ch < 0x80 && ISPRINT (ch))
560+ pp_character (pp, ch);
561+ else
562+ {
563+ for (const char *iter = decoded_ch.m_start_byte;
564+ iter < decoded_ch.m_next_byte; ++iter)
565+ {
566+ char buf[16];
567+ sprintf (buf, "<%02x>", (unsigned char)*iter);
568+ pp_string (pp, buf);
569+ }
570+ }
571+}
572+
573+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
574+
575+/* Callback for char_column_policy::m_width_cb for determining the
576+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
577+
578+static int
579+escape_as_unicode_width (cppchar_t ch)
580+{
581+ if (ch < 0x80 && ISPRINT (ch))
582+ return cpp_wcwidth (ch);
583+ else
584+ {
585+ // Width of "<U+%04x>"
586+ if (ch > 0xfffff)
587+ return 10;
588+ else if (ch > 0xffff)
589+ return 9;
590+ else
591+ return 8;
592+ }
593+}
594+
595+/* Callback for char_display_policy::m_print_cb for printing source chars
596+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
597+
598+static void
599+escape_as_unicode_print (pretty_printer *pp,
600+ const cpp_decoded_char &decoded_ch)
601+{
602+ if (!decoded_ch.m_valid_ch)
603+ {
604+ escape_as_bytes_print (pp, decoded_ch);
605+ return;
606+ }
607+
608+ cppchar_t ch = decoded_ch.m_ch;
609+ if (ch < 0x80 && ISPRINT (ch))
610+ pp_character (pp, ch);
611+ else
612+ {
613+ char buf[16];
614+ sprintf (buf, "<U+%04X>", ch);
615+ pp_string (pp, buf);
616+ }
617+}
618+
619+/* Populate a char_display_policy based on DC and RICHLOC. */
620+
621+static char_display_policy
622+make_policy (const diagnostic_context &dc,
623+ const rich_location &richloc)
624+{
625+ /* The default is to not escape non-ASCII bytes. */
626+ char_display_policy result
627+ (dc.tabstop, cpp_wcwidth, default_print_decoded_ch);
628+
629+ /* If the diagnostic suggests escaping non-ASCII bytes, then
630+ use policy from user-supplied options. */
631+ if (richloc.escape_on_output_p ())
632+ {
633+ result.m_undecoded_byte_width = width_per_escaped_byte;
634+ switch (dc.escape_format)
635+ {
636+ default:
637+ gcc_unreachable ();
638+ case DIAGNOSTICS_ESCAPE_FORMAT_UNICODE:
639+ result.m_width_cb = escape_as_unicode_width;
640+ result.m_print_cb = escape_as_unicode_print;
641+ break;
642+ case DIAGNOSTICS_ESCAPE_FORMAT_BYTES:
643+ result.m_width_cb = escape_as_bytes_width;
644+ result.m_print_cb = escape_as_bytes_print;
645+ break;
646+ }
647+ }
648+
649+ return result;
650+}
651+
652 /* Implementation of class layout. */
653
654 /* Constructor for class layout.
655@@ -966,8 +1177,10 @@ layout::layout (diagnostic_context * con
656 diagnostic_t diagnostic_kind)
657 : m_context (context),
658 m_pp (context->printer),
659+ m_policy (make_policy (*context, *richloc)),
660 m_primary_loc (richloc->get_range (0)->m_loc),
661- m_exploc (richloc->get_expanded_location (0), context->tabstop),
662+ m_exploc (richloc->get_expanded_location (0), m_policy,
663+ LOCATION_ASPECT_CARET),
664 m_colorizer (context, diagnostic_kind),
665 m_colorize_source_p (context->colorize_source_p),
666 m_show_labels_p (context->show_labels_p),
667@@ -977,7 +1190,8 @@ layout::layout (diagnostic_context * con
668 m_fixit_hints (richloc->get_num_fixit_hints ()),
669 m_line_spans (1 + richloc->get_num_locations ()),
670 m_linenum_width (0),
671- m_x_offset_display (0)
672+ m_x_offset_display (0),
673+ m_escape_on_output (richloc->escape_on_output_p ())
674 {
675 for (unsigned int idx = 0; idx < richloc->get_num_locations (); idx++)
676 {
677@@ -1063,10 +1277,13 @@ layout::maybe_add_location_range (const
678
679 /* Everything is now known to be in the correct source file,
680 but it may require further sanitization. */
681- layout_range ri (exploc_with_display_col (start, m_context->tabstop),
682- exploc_with_display_col (finish, m_context->tabstop),
683+ layout_range ri (exploc_with_display_col (start, m_policy,
684+ LOCATION_ASPECT_START),
685+ exploc_with_display_col (finish, m_policy,
686+ LOCATION_ASPECT_FINISH),
687 loc_range->m_range_display_kind,
688- exploc_with_display_col (caret, m_context->tabstop),
689+ exploc_with_display_col (caret, m_policy,
690+ LOCATION_ASPECT_CARET),
691 original_idx, loc_range->m_label);
692
693 /* If we have a range that finishes before it starts (perhaps
694@@ -1400,7 +1617,7 @@ layout::calculate_x_offset_display ()
695 = get_line_bytes_without_trailing_whitespace (line.get_buffer (),
696 line.length ());
697 int eol_display_column
698- = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop);
699+ = cpp_display_width (line.get_buffer (), line_bytes, m_policy);
700 if (caret_display_column > eol_display_column
701 || !caret_display_column)
702 {
703@@ -1479,7 +1696,7 @@ layout::print_source_line (linenum_type
704 /* This object helps to keep track of which display column we are at, which is
705 necessary for computing the line bounds in display units, for doing
706 tab expansion, and for implementing m_x_offset_display. */
707- cpp_display_width_computation dw (line, line_bytes, m_context->tabstop);
708+ cpp_display_width_computation dw (line, line_bytes, m_policy);
709
710 /* Skip the first m_x_offset_display display columns. In case the leading
711 portion that will be skipped ends with a character with wcwidth > 1, then
712@@ -1527,7 +1744,8 @@ layout::print_source_line (linenum_type
713 tabs and replacing some control bytes with spaces as necessary. */
714 const char *c = dw.next_byte ();
715 const int start_disp_col = dw.display_cols_processed () + 1;
716- const int this_display_width = dw.process_next_codepoint ();
717+ cpp_decoded_char cp;
718+ const int this_display_width = dw.process_next_codepoint (&cp);
719 if (*c == '\t')
720 {
721 /* The returned display width is the number of spaces into which the
722@@ -1536,15 +1754,6 @@ layout::print_source_line (linenum_type
723 pp_space (m_pp);
724 continue;
725 }
726- if (*c == '\0' || *c == '\r')
727- {
728- /* cpp_wcwidth() promises to return 1 for all control bytes, and we
729- want to output these as a single space too, so this case is
730- actually the same as the '\t' case. */
731- gcc_assert (this_display_width == 1);
732- pp_space (m_pp);
733- continue;
734- }
735
736 /* We have a (possibly multibyte) character to output; update the line
737 bounds if it is not whitespace. */
738@@ -1556,7 +1765,8 @@ layout::print_source_line (linenum_type
739 }
740
741 /* Output the character. */
742- while (c != dw.next_byte ()) pp_character (m_pp, *c++);
743+ m_policy.m_print_cb (m_pp, cp);
744+ c = dw.next_byte ();
745 }
746 print_newline ();
747 return lbounds;
748@@ -1655,14 +1865,14 @@ layout::print_annotation_line (linenum_t
749 class line_label
750 {
751 public:
752- line_label (diagnostic_context *context, int state_idx, int column,
753+ line_label (const cpp_char_column_policy &policy,
754+ int state_idx, int column,
755 label_text text)
756 : m_state_idx (state_idx), m_column (column),
757 m_text (text), m_label_line (0), m_has_vbar (true)
758 {
759 const int bytes = strlen (text.m_buffer);
760- m_display_width
761- = cpp_display_width (text.m_buffer, bytes, context->tabstop);
762+ m_display_width = cpp_display_width (text.m_buffer, bytes, policy);
763 }
764
765 /* Sorting is primarily by column, then by state index. */
766@@ -1722,7 +1932,7 @@ layout::print_any_labels (linenum_type r
767 if (text.m_buffer == NULL)
768 continue;
769
770- labels.safe_push (line_label (m_context, i, disp_col, text));
771+ labels.safe_push (line_label (m_policy, i, disp_col, text));
772 }
773 }
774
775@@ -2002,7 +2212,7 @@ public:
776
777 /* Get the range of bytes or display columns that HINT would affect. */
778 static column_range
779-get_affected_range (diagnostic_context *context,
780+get_affected_range (const cpp_char_column_policy &policy,
781 const fixit_hint *hint, enum column_unit col_unit)
782 {
783 expanded_location exploc_start = expand_location (hint->get_start_loc ());
784@@ -2013,13 +2223,11 @@ get_affected_range (diagnostic_context *
785 int finish_column;
786 if (col_unit == CU_DISPLAY_COLS)
787 {
788- start_column
789- = location_compute_display_column (exploc_start, context->tabstop);
790+ start_column = location_compute_display_column (exploc_start, policy);
791 if (hint->insertion_p ())
792 finish_column = start_column - 1;
793 else
794- finish_column
795- = location_compute_display_column (exploc_finish, context->tabstop);
796+ finish_column = location_compute_display_column (exploc_finish, policy);
797 }
798 else
799 {
800@@ -2032,12 +2240,13 @@ get_affected_range (diagnostic_context *
801 /* Get the range of display columns that would be printed for HINT. */
802
803 static column_range
804-get_printed_columns (diagnostic_context *context, const fixit_hint *hint)
805+get_printed_columns (const cpp_char_column_policy &policy,
806+ const fixit_hint *hint)
807 {
808 expanded_location exploc = expand_location (hint->get_start_loc ());
809- int start_column = location_compute_display_column (exploc, context->tabstop);
810+ int start_column = location_compute_display_column (exploc, policy);
811 int hint_width = cpp_display_width (hint->get_string (), hint->get_length (),
812- context->tabstop);
813+ policy);
814 int final_hint_column = start_column + hint_width - 1;
815 if (hint->insertion_p ())
816 {
817@@ -2047,8 +2256,7 @@ get_printed_columns (diagnostic_context
818 {
819 exploc = expand_location (hint->get_next_loc ());
820 --exploc.column;
821- int finish_column
822- = location_compute_display_column (exploc, context->tabstop);
823+ int finish_column = location_compute_display_column (exploc, policy);
824 return column_range (start_column,
825 MAX (finish_column, final_hint_column));
826 }
827@@ -2066,13 +2274,13 @@ public:
828 column_range affected_columns,
829 column_range printed_columns,
830 const char *new_text, size_t new_text_len,
831- int tabstop)
832+ const cpp_char_column_policy &policy)
833 : m_affected_bytes (affected_bytes),
834 m_affected_columns (affected_columns),
835 m_printed_columns (printed_columns),
836 m_text (xstrdup (new_text)),
837 m_byte_length (new_text_len),
838- m_tabstop (tabstop),
839+ m_policy (policy),
840 m_alloc_sz (new_text_len + 1)
841 {
842 compute_display_cols ();
843@@ -2090,7 +2298,7 @@ public:
844
845 void compute_display_cols ()
846 {
847- m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop);
848+ m_display_cols = cpp_display_width (m_text, m_byte_length, m_policy);
849 }
850
851 void overwrite (int dst_offset, const char_span &src_span)
852@@ -2118,7 +2326,7 @@ public:
853 char *m_text;
854 size_t m_byte_length; /* Not including null-terminator. */
855 int m_display_cols;
856- int m_tabstop;
857+ const cpp_char_column_policy &m_policy;
858 size_t m_alloc_sz;
859 };
860
861@@ -2154,15 +2362,16 @@ correction::ensure_terminated ()
862 class line_corrections
863 {
864 public:
865- line_corrections (diagnostic_context *context, const char *filename,
866+ line_corrections (const char_display_policy &policy,
867+ const char *filename,
868 linenum_type row)
869- : m_context (context), m_filename (filename), m_row (row)
870+ : m_policy (policy), m_filename (filename), m_row (row)
871 {}
872 ~line_corrections ();
873
874 void add_hint (const fixit_hint *hint);
875
876- diagnostic_context *m_context;
877+ const char_display_policy &m_policy;
878 const char *m_filename;
879 linenum_type m_row;
880 auto_vec <correction *> m_corrections;
881@@ -2208,10 +2417,10 @@ source_line::source_line (const char *fi
882 void
883 line_corrections::add_hint (const fixit_hint *hint)
884 {
885- column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES);
886- column_range affected_columns = get_affected_range (m_context, hint,
887+ column_range affected_bytes = get_affected_range (m_policy, hint, CU_BYTES);
888+ column_range affected_columns = get_affected_range (m_policy, hint,
889 CU_DISPLAY_COLS);
890- column_range printed_columns = get_printed_columns (m_context, hint);
891+ column_range printed_columns = get_printed_columns (m_policy, hint);
892
893 /* Potentially consolidate. */
894 if (!m_corrections.is_empty ())
895@@ -2280,7 +2489,7 @@ line_corrections::add_hint (const fixit_
896 printed_columns,
897 hint->get_string (),
898 hint->get_length (),
899- m_context->tabstop));
900+ m_policy));
901 }
902
903 /* If there are any fixit hints on source line ROW, print them.
904@@ -2294,7 +2503,7 @@ layout::print_trailing_fixits (linenum_t
905 {
906 /* Build a list of correction instances for the line,
907 potentially consolidating hints (for the sake of readability). */
908- line_corrections corrections (m_context, m_exploc.file, row);
909+ line_corrections corrections (m_policy, m_exploc.file, row);
910 for (unsigned int i = 0; i < m_fixit_hints.length (); i++)
911 {
912 const fixit_hint *hint = m_fixit_hints[i];
913@@ -2635,6 +2844,59 @@ namespace selftest {
914
915 /* Selftests for diagnostic_show_locus. */
916
917+/* Verify that cpp_display_width correctly handles escaping. */
918+
919+static void
920+test_display_widths ()
921+{
922+ gcc_rich_location richloc (UNKNOWN_LOCATION);
923+
924+ /* U+03C0 "GREEK SMALL LETTER PI". */
925+ const char *pi = "\xCF\x80";
926+ /* U+1F642 "SLIGHTLY SMILING FACE". */
927+ const char *emoji = "\xF0\x9F\x99\x82";
928+ /* Stray trailing byte of a UTF-8 character. */
929+ const char *stray = "\xBF";
930+ /* U+10FFFF. */
931+ const char *max_codepoint = "\xF4\x8F\xBF\xBF";
932+
933+ /* No escaping. */
934+ {
935+ test_diagnostic_context dc;
936+ char_display_policy policy (make_policy (dc, richloc));
937+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 1);
938+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 2);
939+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 1);
940+ /* Don't check width of U+10FFFF; it's in a private use plane. */
941+ }
942+
943+ richloc.set_escape_on_output (true);
944+
945+ {
946+ test_diagnostic_context dc;
947+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
948+ char_display_policy policy (make_policy (dc, richloc));
949+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8);
950+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 9);
951+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4);
952+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint),
953+ policy),
954+ strlen ("<U+10FFFF>"));
955+ }
956+
957+ {
958+ test_diagnostic_context dc;
959+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
960+ char_display_policy policy (make_policy (dc, richloc));
961+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8);
962+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 16);
963+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4);
964+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint),
965+ policy),
966+ 16);
967+ }
968+}
969+
970 /* For precise tests of the layout, make clear where the source line will
971 start. test_left_margin sets the total byte count from the left side of the
972 screen to the start of source lines, after the line number and the separator,
973@@ -2704,10 +2966,10 @@ test_layout_x_offset_display_utf8 (const
974 char_span lspan = location_get_source_line (tmp.get_filename (), 1);
975 ASSERT_EQ (line_display_cols,
976 cpp_display_width (lspan.get_buffer (), lspan.length (),
977- def_tabstop));
978+ def_policy ()));
979 ASSERT_EQ (line_display_cols,
980 location_compute_display_column (expand_location (line_end),
981- def_tabstop));
982+ def_policy ()));
983 ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1),
984 "\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8));
985
986@@ -2855,12 +3117,13 @@ test_layout_x_offset_display_tab (const
987 ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1)));
988 for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
989 {
990+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
991 ASSERT_EQ (line_bytes + extra_width[tabstop],
992 cpp_display_width (lspan.get_buffer (), lspan.length (),
993- tabstop));
994+ policy));
995 ASSERT_EQ (line_bytes + extra_width[tabstop],
996 location_compute_display_column (expand_location (line_end),
997- tabstop));
998+ policy));
999 }
1000
1001 /* Check that the tab is expanded to the expected number of spaces. */
1002@@ -3992,6 +4255,43 @@ test_one_liner_labels_utf8 ()
1003 " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n",
1004 pp_formatted_text (dc.printer));
1005 }
1006+
1007+ /* Example of escaping the source lines. */
1008+ {
1009+ text_range_label label0 ("label 0\xf0\x9f\x98\x82");
1010+ text_range_label label1 ("label 1\xcf\x80");
1011+ text_range_label label2 ("label 2\xcf\x80");
1012+ gcc_rich_location richloc (foo, &label0);
1013+ richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1);
1014+ richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2);
1015+ richloc.set_escape_on_output (true);
1016+
1017+ {
1018+ test_diagnostic_context dc;
1019+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
1020+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1021+ ASSERT_STREQ (" <U+1F602>_foo = <U+03C0>_bar.<U+1F602>_field<U+03C0>;\n"
1022+ " ^~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~\n"
1023+ " | | |\n"
1024+ " | | label 2\xcf\x80\n"
1025+ " | label 1\xcf\x80\n"
1026+ " label 0\xf0\x9f\x98\x82\n",
1027+ pp_formatted_text (dc.printer));
1028+ }
1029+ {
1030+ test_diagnostic_context dc;
1031+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
1032+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1033+ ASSERT_STREQ
1034+ (" <f0><9f><98><82>_foo = <cf><80>_bar.<f0><9f><98><82>_field<cf><80>;\n"
1035+ " ^~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
1036+ " | | |\n"
1037+ " | | label 2\xcf\x80\n"
1038+ " | label 1\xcf\x80\n"
1039+ " label 0\xf0\x9f\x98\x82\n",
1040+ pp_formatted_text (dc.printer));
1041+ }
1042+ }
1043 }
1044
1045 /* Make sure that colorization codes don't interrupt a multibyte
1046@@ -4046,9 +4346,9 @@ test_diagnostic_show_locus_one_liner_utf
1047
1048 char_span lspan = location_get_source_line (tmp.get_filename (), 1);
1049 ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (),
1050- def_tabstop));
1051+ def_policy ()));
1052 ASSERT_EQ (25, location_compute_display_column (expand_location (line_end),
1053- def_tabstop));
1054+ def_policy ()));
1055
1056 test_one_liner_simple_caret_utf8 ();
1057 test_one_liner_caret_and_range_utf8 ();
1058@@ -4434,30 +4734,31 @@ test_overlapped_fixit_printing (const li
1059 pp_formatted_text (dc.printer));
1060
1061 /* Unit-test the line_corrections machinery. */
1062+ char_display_policy policy (make_policy (dc, richloc));
1063 ASSERT_EQ (3, richloc.get_num_fixit_hints ());
1064 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1065 ASSERT_EQ (column_range (12, 12),
1066- get_affected_range (&dc, hint_0, CU_BYTES));
1067+ get_affected_range (policy, hint_0, CU_BYTES));
1068 ASSERT_EQ (column_range (12, 12),
1069- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
1070- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
1071+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS));
1072+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0));
1073 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1074 ASSERT_EQ (column_range (18, 18),
1075- get_affected_range (&dc, hint_1, CU_BYTES));
1076+ get_affected_range (policy, hint_1, CU_BYTES));
1077 ASSERT_EQ (column_range (18, 18),
1078- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
1079- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
1080+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS));
1081+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1));
1082 const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
1083 ASSERT_EQ (column_range (29, 28),
1084- get_affected_range (&dc, hint_2, CU_BYTES));
1085+ get_affected_range (policy, hint_2, CU_BYTES));
1086 ASSERT_EQ (column_range (29, 28),
1087- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
1088- ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2));
1089+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS));
1090+ ASSERT_EQ (column_range (29, 29), get_printed_columns (policy, hint_2));
1091
1092 /* Add each hint in turn to a line_corrections instance,
1093 and verify that they are consolidated into one correction instance
1094 as expected. */
1095- line_corrections lc (&dc, tmp.get_filename (), 1);
1096+ line_corrections lc (policy, tmp.get_filename (), 1);
1097
1098 /* The first replace hint by itself. */
1099 lc.add_hint (hint_0);
1100@@ -4649,30 +4950,31 @@ test_overlapped_fixit_printing_utf8 (con
1101 pp_formatted_text (dc.printer));
1102
1103 /* Unit-test the line_corrections machinery. */
1104+ char_display_policy policy (make_policy (dc, richloc));
1105 ASSERT_EQ (3, richloc.get_num_fixit_hints ());
1106 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1107 ASSERT_EQ (column_range (14, 14),
1108- get_affected_range (&dc, hint_0, CU_BYTES));
1109+ get_affected_range (policy, hint_0, CU_BYTES));
1110 ASSERT_EQ (column_range (12, 12),
1111- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
1112- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
1113+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS));
1114+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0));
1115 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1116 ASSERT_EQ (column_range (22, 22),
1117- get_affected_range (&dc, hint_1, CU_BYTES));
1118+ get_affected_range (policy, hint_1, CU_BYTES));
1119 ASSERT_EQ (column_range (18, 18),
1120- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
1121- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
1122+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS));
1123+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1));
1124 const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
1125 ASSERT_EQ (column_range (35, 34),
1126- get_affected_range (&dc, hint_2, CU_BYTES));
1127+ get_affected_range (policy, hint_2, CU_BYTES));
1128 ASSERT_EQ (column_range (30, 29),
1129- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
1130- ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2));
1131+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS));
1132+ ASSERT_EQ (column_range (30, 30), get_printed_columns (policy, hint_2));
1133
1134 /* Add each hint in turn to a line_corrections instance,
1135 and verify that they are consolidated into one correction instance
1136 as expected. */
1137- line_corrections lc (&dc, tmp.get_filename (), 1);
1138+ line_corrections lc (policy, tmp.get_filename (), 1);
1139
1140 /* The first replace hint by itself. */
1141 lc.add_hint (hint_0);
1142@@ -4866,15 +5168,16 @@ test_overlapped_fixit_printing_2 (const
1143 richloc.add_fixit_insert_before (col_21, "}");
1144
1145 /* These fixits should be accepted; they can't be consolidated. */
1146+ char_display_policy policy (make_policy (dc, richloc));
1147 ASSERT_EQ (2, richloc.get_num_fixit_hints ());
1148 const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
1149 ASSERT_EQ (column_range (23, 22),
1150- get_affected_range (&dc, hint_0, CU_BYTES));
1151- ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0));
1152+ get_affected_range (policy, hint_0, CU_BYTES));
1153+ ASSERT_EQ (column_range (23, 23), get_printed_columns (policy, hint_0));
1154 const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
1155 ASSERT_EQ (column_range (21, 20),
1156- get_affected_range (&dc, hint_1, CU_BYTES));
1157- ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1));
1158+ get_affected_range (policy, hint_1, CU_BYTES));
1159+ ASSERT_EQ (column_range (21, 21), get_printed_columns (policy, hint_1));
1160
1161 /* Verify that they're printed correctly. */
1162 diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1163@@ -5141,10 +5444,11 @@ test_tab_expansion (const line_table_cas
1164 ....................123 45678901234 56789012345 columns */
1165
1166 const int tabstop = 8;
1167+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
1168 const int first_non_ws_byte_col = 7;
1169 const int right_quote_byte_col = 15;
1170 const int last_byte_col = 25;
1171- ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop));
1172+ ASSERT_EQ (35, cpp_display_width (content, last_byte_col, policy));
1173
1174 temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
1175 line_table_test ltt (case_);
1176@@ -5187,6 +5491,114 @@ test_tab_expansion (const line_table_cas
1177 }
1178 }
1179
1180+/* Verify that the escaping machinery can cope with a variety of different
1181+ invalid bytes. */
1182+
1183+static void
1184+test_escaping_bytes_1 (const line_table_case &case_)
1185+{
1186+ const char content[] = "before\0\1\2\3\r\x80\xff""after\n";
1187+ const size_t sz = sizeof (content);
1188+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz);
1189+ line_table_test ltt (case_);
1190+ const line_map_ordinary *ord_map = linemap_check_ordinary
1191+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0));
1192+ linemap_line_start (line_table, 1, 100);
1193+
1194+ location_t finish
1195+ = linemap_position_for_line_and_column (line_table, ord_map, 1,
1196+ strlen (content));
1197+
1198+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS)
1199+ return;
1200+
1201+ /* Locations of the NUL and \r bytes. */
1202+ location_t nul_loc
1203+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 7);
1204+ location_t r_loc
1205+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 11);
1206+ gcc_rich_location richloc (nul_loc);
1207+ richloc.add_range (r_loc);
1208+
1209+ {
1210+ test_diagnostic_context dc;
1211+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1212+ ASSERT_STREQ (" before \1\2\3 \x80\xff""after\n"
1213+ " ^ ~\n",
1214+ pp_formatted_text (dc.printer));
1215+ }
1216+ richloc.set_escape_on_output (true);
1217+ {
1218+ test_diagnostic_context dc;
1219+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
1220+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1221+ ASSERT_STREQ
1222+ (" before<U+0000><U+0001><U+0002><U+0003><U+000D><80><ff>after\n"
1223+ " ^~~~~~~~ ~~~~~~~~\n",
1224+ pp_formatted_text (dc.printer));
1225+ }
1226+ {
1227+ test_diagnostic_context dc;
1228+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
1229+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1230+ ASSERT_STREQ (" before<00><01><02><03><0d><80><ff>after\n"
1231+ " ^~~~ ~~~~\n",
1232+ pp_formatted_text (dc.printer));
1233+ }
1234+}
1235+
1236+/* As above, but verify that we handle the initial byte of a line
1237+ correctly. */
1238+
1239+static void
1240+test_escaping_bytes_2 (const line_table_case &case_)
1241+{
1242+ const char content[] = "\0after\n";
1243+ const size_t sz = sizeof (content);
1244+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz);
1245+ line_table_test ltt (case_);
1246+ const line_map_ordinary *ord_map = linemap_check_ordinary
1247+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0));
1248+ linemap_line_start (line_table, 1, 100);
1249+
1250+ location_t finish
1251+ = linemap_position_for_line_and_column (line_table, ord_map, 1,
1252+ strlen (content));
1253+
1254+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS)
1255+ return;
1256+
1257+ /* Location of the NUL byte. */
1258+ location_t nul_loc
1259+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 1);
1260+ gcc_rich_location richloc (nul_loc);
1261+
1262+ {
1263+ test_diagnostic_context dc;
1264+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1265+ ASSERT_STREQ (" after\n"
1266+ " ^\n",
1267+ pp_formatted_text (dc.printer));
1268+ }
1269+ richloc.set_escape_on_output (true);
1270+ {
1271+ test_diagnostic_context dc;
1272+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
1273+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1274+ ASSERT_STREQ (" <U+0000>after\n"
1275+ " ^~~~~~~~\n",
1276+ pp_formatted_text (dc.printer));
1277+ }
1278+ {
1279+ test_diagnostic_context dc;
1280+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
1281+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
1282+ ASSERT_STREQ (" <00>after\n"
1283+ " ^~~~\n",
1284+ pp_formatted_text (dc.printer));
1285+ }
1286+}
1287+
1288 /* Verify that line numbers are correctly printed for the case of
1289 a multiline range in which the width of the line numbers changes
1290 (e.g. from "9" to "10"). */
1291@@ -5243,6 +5655,8 @@ diagnostic_show_locus_c_tests ()
1292 test_layout_range_for_single_line ();
1293 test_layout_range_for_multiple_lines ();
1294
1295+ test_display_widths ();
1296+
1297 for_each_line_table_case (test_layout_x_offset_display_utf8);
1298 for_each_line_table_case (test_layout_x_offset_display_tab);
1299
1300@@ -5263,6 +5677,8 @@ diagnostic_show_locus_c_tests ()
1301 for_each_line_table_case (test_fixit_replace_containing_newline);
1302 for_each_line_table_case (test_fixit_deletion_affecting_newline);
1303 for_each_line_table_case (test_tab_expansion);
1304+ for_each_line_table_case (test_escaping_bytes_1);
1305+ for_each_line_table_case (test_escaping_bytes_2);
1306
1307 test_line_numbers_multiline_range ();
1308 }
1309diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
1310--- a/gcc/doc/invoke.texi 2021-12-25 01:29:12.927317174 -0800
1311+++ b/gcc/doc/invoke.texi 2021-12-25 01:30:50.681688823 -0800
1312@@ -295,7 +295,8 @@ Objective-C and Objective-C++ Dialects}.
1313 -fdiagnostics-show-path-depths @gol
1314 -fno-show-column @gol
1315 -fdiagnostics-column-unit=@r{[}display@r{|}byte@r{]} @gol
1316--fdiagnostics-column-origin=@var{origin}}
1317+-fdiagnostics-column-origin=@var{origin} @gol
1318+-fdiagnostics-escape-format=@r{[}unicode@r{|}bytes@r{]}}
1319
1320 @item Warning Options
1321 @xref{Warning Options,,Options to Request or Suppress Warnings}.
1322@@ -4451,6 +4452,38 @@ first column. The default value of 1 co
1323 behavior and to the GNU style guide. Some utilities may perform better with an
1324 origin of 0; any non-negative value may be specified.
1325
1326+@item -fdiagnostics-escape-format=@var{FORMAT}
1327+@opindex fdiagnostics-escape-format
1328+When GCC prints pertinent source lines for a diagnostic it normally attempts
1329+to print the source bytes directly. However, some diagnostics relate to encoding
1330+issues in the source file, such as malformed UTF-8, or issues with Unicode
1331+normalization. These diagnostics are flagged so that GCC will escape bytes
1332+that are not printable ASCII when printing their pertinent source lines.
1333+
1334+This option controls how such bytes should be escaped.
1335+
1336+The default @var{FORMAT}, @samp{unicode}, displays Unicode characters that
1337+are not printable ASCII in the form @samp{<U+XXXX>}, and bytes that do not
1338+correspond to a Unicode character validly encoded in UTF-8 will be
1339+displayed as hexadecimal in the form @samp{<XX>}.
1340+
1341+For example, a source line containing the string @samp{before} followed by the
1342+Unicode character U+03C0 (``GREEK SMALL LETTER PI'', with UTF-8 encoding
1343+0xCF 0x80) followed by the byte 0xBF (a stray UTF-8 trailing byte), followed by
1344+the string @samp{after} will be printed for such a diagnostic as:
1345+
1346+@smallexample
1347+ before<U+03C0><bf>after
1348+@end smallexample
1349+
1350+Setting @var{FORMAT} to @samp{bytes} will display all non-printable-ASCII bytes
1351+in the form @samp{<XX>}, thus showing the underlying encoding of non-ASCII
1352+Unicode characters. For the example above, the following will be printed:
1353+
1354+@smallexample
1355+ before<cf><80><bf>after
1356+@end smallexample
1357+
1358 @item -fdiagnostics-format=@var{FORMAT}
1359 @opindex fdiagnostics-format
1360 Select a different format for printing diagnostics.
1361@@ -4518,9 +4551,11 @@ might be printed in JSON form (after for
1362 @}
1363 @}
1364 ],
1365+ "escape-source": false,
1366 "message": "...this statement, but the latter is @dots{}"
1367 @}
1368 ]
1369+ "escape-source": false,
1370 "column-origin": 1,
1371 @},
1372 @dots{}
1373@@ -4607,6 +4642,7 @@ of the expression, which have labels. I
1374 "label": "T @{aka struct t@}"
1375 @}
1376 ],
1377+ "escape-source": false,
1378 "message": "invalid operands to binary + @dots{}"
1379 @}
1380 @end smallexample
1381@@ -4660,6 +4696,7 @@ might be printed in JSON form as:
1382 @}
1383 @}
1384 ],
1385+ "escape-source": false,
1386 "message": "\u2018struct s\u2019 has no member named @dots{}"
1387 @}
1388 @end smallexample
1389@@ -4717,6 +4754,10 @@ For example, the intraprocedural example
1390 ]
1391 @end smallexample
1392
1393+Diagnostics have a boolean attribute @code{escape-source}, hinting whether
1394+non-ASCII bytes should be escaped when printing the pertinent lines of
1395+source code (@code{true} for diagnostics involving source encoding issues).
1396+
1397 @end table
1398
1399 @node Warning Options
1400diff --git a/gcc/input.c b/gcc/input.c
1401--- a/gcc/input.c 2021-12-25 01:29:12.927317174 -0800
1402+++ b/gcc/input.c 2021-12-25 01:30:50.681688823 -0800
1403@@ -913,7 +913,8 @@ make_location (location_t caret, source_
1404 source line in order to calculate the display width. If that cannot be done
1405 for any reason, then returns the byte column as a fallback. */
1406 int
1407-location_compute_display_column (expanded_location exploc, int tabstop)
1408+location_compute_display_column (expanded_location exploc,
1409+ const cpp_char_column_policy &policy)
1410 {
1411 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1412 return exploc.column;
1413@@ -921,7 +922,7 @@ location_compute_display_column (expande
1414 /* If line is NULL, this function returns exploc.column which is the
1415 desired fallback. */
1416 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1417- exploc.column, tabstop);
1418+ exploc.column, policy);
1419 }
1420
1421 /* Dump statistics to stderr about the memory usage of the line_table
1422@@ -3609,43 +3610,50 @@ test_line_offset_overflow ()
1423 void test_cpp_utf8 ()
1424 {
1425 const int def_tabstop = 8;
1426+ cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
1427+
1428 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
1429 {
1430- int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
1431+ int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
1432 ASSERT_EQ (8, w_bad);
1433- int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
1434+ int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
1435 ASSERT_EQ (5, w_ctrl);
1436 }
1437
1438 /* Verify that wcwidth of valid UTF-8 is as expected. */
1439 {
1440- const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
1441+ const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
1442 ASSERT_EQ (1, w_pi);
1443- const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
1444+ const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
1445 ASSERT_EQ (2, w_emoji);
1446 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
1447- def_tabstop);
1448+ policy);
1449 ASSERT_EQ (1, w_umlaut_precomposed);
1450 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
1451- def_tabstop);
1452+ policy);
1453 ASSERT_EQ (1, w_umlaut_combining);
1454- const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
1455+ const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
1456 ASSERT_EQ (2, w_han);
1457- const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
1458+ const int w_ascii = cpp_display_width ("GCC", 3, policy);
1459 ASSERT_EQ (3, w_ascii);
1460 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
1461 "\x9f! \xe4\xb8\xba y\xcc\x88",
1462- 24, def_tabstop);
1463+ 24, policy);
1464 ASSERT_EQ (18, w_mixed);
1465 }
1466
1467 /* Verify that display width properly expands tabs. */
1468 {
1469 const char *tstr = "\tabc\td";
1470- ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
1471- ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
1472- ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
1473- ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
1474+ ASSERT_EQ (6, cpp_display_width (tstr, 6,
1475+ cpp_char_column_policy (1, cpp_wcwidth)));
1476+ ASSERT_EQ (10, cpp_display_width (tstr, 6,
1477+ cpp_char_column_policy (3, cpp_wcwidth)));
1478+ ASSERT_EQ (17, cpp_display_width (tstr, 6,
1479+ cpp_char_column_policy (8, cpp_wcwidth)));
1480+ ASSERT_EQ (1,
1481+ cpp_display_column_to_byte_column
1482+ (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
1483 }
1484
1485 /* Verify that cpp_byte_column_to_display_column can go past the end,
1486@@ -3658,13 +3666,13 @@ void test_cpp_utf8 ()
1487 /* 111122223456
1488 Byte columns. */
1489
1490- ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
1491+ ASSERT_EQ (5, cpp_display_width (str, 6, policy));
1492 ASSERT_EQ (105,
1493- cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
1494+ cpp_byte_column_to_display_column (str, 6, 106, policy));
1495 ASSERT_EQ (10000,
1496- cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
1497+ cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
1498 ASSERT_EQ (0,
1499- cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
1500+ cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
1501 }
1502
1503 /* Verify that cpp_display_column_to_byte_column can go past the end,
1504@@ -3678,25 +3686,25 @@ void test_cpp_utf8 ()
1505 /* 000000000000000000000000000000000111111
1506 111122223333444456666777788889999012345
1507 Byte columns. */
1508- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
1509+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
1510 ASSERT_EQ (15,
1511- cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
1512+ cpp_display_column_to_byte_column (str, 15, 11, policy));
1513 ASSERT_EQ (115,
1514- cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
1515+ cpp_display_column_to_byte_column (str, 15, 111, policy));
1516 ASSERT_EQ (10000,
1517- cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
1518+ cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
1519 ASSERT_EQ (0,
1520- cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
1521+ cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
1522
1523 /* Verify that we do not interrupt a UTF-8 sequence. */
1524- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
1525+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
1526
1527 for (int byte_col = 1; byte_col <= 15; ++byte_col)
1528 {
1529 const int disp_col
1530- = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
1531+ = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
1532 const int byte_col2
1533- = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
1534+ = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
1535
1536 /* If we ask for the display column in the middle of a UTF-8
1537 sequence, it will return the length of the partial sequence,
1538diff --git a/gcc/input.h b/gcc/input.h
1539--- a/gcc/input.h 2021-12-25 01:29:12.927317174 -0800
1540+++ b/gcc/input.h 2021-12-25 01:30:50.681688823 -0800
1541@@ -39,8 +39,11 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER
1542 extern bool is_location_from_builtin_token (location_t);
1543 extern expanded_location expand_location (location_t);
1544
1545-extern int location_compute_display_column (expanded_location exploc,
1546- int tabstop);
1547+class cpp_char_column_policy;
1548+
1549+extern int
1550+location_compute_display_column (expanded_location exploc,
1551+ const cpp_char_column_policy &policy);
1552
1553 /* A class capturing the bounds of a buffer, to allow for run-time
1554 bounds-checking in a checked build. */
1555diff --git a/gcc/opts.c b/gcc/opts.c
1556--- a/gcc/opts.c 2021-12-25 01:29:12.927317174 -0800
1557+++ b/gcc/opts.c 2021-12-25 01:30:50.681688823 -0800
1558@@ -2447,6 +2447,10 @@ common_handle_option (struct gcc_options
1559 dc->column_origin = value;
1560 break;
1561
1562+ case OPT_fdiagnostics_escape_format_:
1563+ dc->escape_format = (enum diagnostics_escape_format)value;
1564+ break;
1565+
1566 case OPT_fdiagnostics_show_cwe:
1567 dc->show_cwe = value;
1568 break;
1569diff --git a/gcc/selftest.c b/gcc/selftest.c
1570--- a/gcc/selftest.c 2020-07-22 23:35:17.820389797 -0700
1571+++ b/gcc/selftest.c 2021-12-25 01:30:50.681688823 -0800
1572@@ -193,6 +193,21 @@ temp_source_file::temp_source_file (cons
1573 fclose (out);
1574 }
1575
1576+/* As above, but with a size, to allow for NUL bytes in CONTENT. */
1577+
1578+temp_source_file::temp_source_file (const location &loc,
1579+ const char *suffix,
1580+ const char *content,
1581+ size_t sz)
1582+: named_temp_file (suffix)
1583+{
1584+ FILE *out = fopen (get_filename (), "w");
1585+ if (!out)
1586+ fail_formatted (loc, "unable to open tempfile: %s", get_filename ());
1587+ fwrite (content, sz, 1, out);
1588+ fclose (out);
1589+}
1590+
1591 /* Avoid introducing locale-specific differences in the results
1592 by hardcoding open_quote and close_quote. */
1593
1594diff --git a/gcc/selftest.h b/gcc/selftest.h
1595--- a/gcc/selftest.h 2020-07-22 23:35:17.820389797 -0700
1596+++ b/gcc/selftest.h 2021-12-25 01:30:50.681688823 -0800
1597@@ -112,6 +112,8 @@ class temp_source_file : public named_te
1598 public:
1599 temp_source_file (const location &loc, const char *suffix,
1600 const char *content);
1601+ temp_source_file (const location &loc, const char *suffix,
1602+ const char *content, size_t sz);
1603 };
1604
1605 /* RAII-style class for avoiding introducing locale-specific differences
1606diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c
1607--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-25 01:29:12.927317174 -0800
1608+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-25 01:30:50.681688823 -0800
1609@@ -9,6 +9,7 @@
1610
1611 /* { dg-regexp "\"kind\": \"error\"" } */
1612 /* { dg-regexp "\"column-origin\": 1" } */
1613+/* { dg-regexp "\"escape-source\": false" } */
1614 /* { dg-regexp "\"message\": \"#error message\"" } */
1615
1616 /* { dg-regexp "\"caret\": \{" } */
1617diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c
1618--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-25 01:29:12.927317174 -0800
1619+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-25 01:30:50.681688823 -0800
1620@@ -9,6 +9,7 @@
1621
1622 /* { dg-regexp "\"kind\": \"warning\"" } */
1623 /* { dg-regexp "\"column-origin\": 1" } */
1624+/* { dg-regexp "\"escape-source\": false" } */
1625 /* { dg-regexp "\"message\": \"#warning message\"" } */
1626 /* { dg-regexp "\"option\": \"-Wcpp\"" } */
1627 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
1628diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c
1629--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-25 01:29:12.927317174 -0800
1630+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-25 01:30:50.681688823 -0800
1631@@ -9,6 +9,7 @@
1632
1633 /* { dg-regexp "\"kind\": \"error\"" } */
1634 /* { dg-regexp "\"column-origin\": 1" } */
1635+/* { dg-regexp "\"escape-source\": false" } */
1636 /* { dg-regexp "\"message\": \"#warning message\"" } */
1637 /* { dg-regexp "\"option\": \"-Werror=cpp\"" } */
1638 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
1639diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c
1640--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-25 01:29:12.927317174 -0800
1641+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-25 01:30:50.681688823 -0800
1642@@ -19,6 +19,7 @@ int test (void)
1643
1644 /* { dg-regexp "\"kind\": \"note\"" } */
1645 /* { dg-regexp "\"message\": \"...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'\"" } */
1646+/* { dg-regexp "\"escape-source\": false" } */
1647
1648 /* { dg-regexp "\"caret\": \{" } */
1649 /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
1650@@ -39,6 +40,7 @@ int test (void)
1651 /* { dg-regexp "\"kind\": \"warning\"" } */
1652 /* { dg-regexp "\"column-origin\": 1" } */
1653 /* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */
1654+/* { dg-regexp "\"escape-source\": false" } */
1655 /* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */
1656 /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */
1657
1658diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c
1659--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-25 01:29:12.927317174 -0800
1660+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-25 01:30:50.681688823 -0800
1661@@ -14,6 +14,7 @@ int test (struct s *ptr)
1662
1663 /* { dg-regexp "\"kind\": \"error\"" } */
1664 /* { dg-regexp "\"column-origin\": 1" } */
1665+/* { dg-regexp "\"escape-source\": false" } */
1666 /* { dg-regexp "\"message\": \".*\"" } */
1667
1668 /* Verify fix-it hints. */
1669diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c
1670--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1969-12-31 16:00:00.000000000 -0800
1671+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 2021-12-25 01:30:50.681688823 -0800
1672@@ -0,0 +1,21 @@
1673+// { dg-do preprocess }
1674+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=bytes" }
1675+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */
1676+
1677+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e.
1678+ U+0F42 TIBETAN LETTER GA: ག
1679+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ
1680+
1681+ The UTF-8 encoding of U+0F43 TIBETAN LETTER GHA is: E0 BD 83. */
1682+
1683+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
1684+/* { dg-begin-multiline-output "" }
1685+ foo before_\u0F43_after bar
1686+ ^~~~~~~~~~~~~~~~~~~
1687+ { dg-end-multiline-output "" } */
1688+
1689+foo before_à½_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
1690+/* { dg-begin-multiline-output "" }
1691+ foo before_<e0><bd><83>_after bar
1692+ ^~~~~~~~~~~~~~~~~~~~~~~~~
1693+ { dg-end-multiline-output "" } */
1694diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c
1695--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1969-12-31 16:00:00.000000000 -0800
1696+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 2021-12-25 01:30:50.681688823 -0800
1697@@ -0,0 +1,19 @@
1698+// { dg-do preprocess }
1699+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=unicode" }
1700+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */
1701+
1702+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e.
1703+ U+0F42 TIBETAN LETTER GA: ག
1704+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ */
1705+
1706+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
1707+/* { dg-begin-multiline-output "" }
1708+ foo before_\u0F43_after bar
1709+ ^~~~~~~~~~~~~~~~~~~
1710+ { dg-end-multiline-output "" } */
1711+
1712+foo before_à½_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
1713+/* { dg-begin-multiline-output "" }
1714+ foo before_<U+0F43>_after bar
1715+ ^~~~~~~~~~~~~~~~~~~~~
1716+ { dg-end-multiline-output "" } */
1717diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90
1718--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-25 01:29:12.931317107 -0800
1719+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-25 01:30:50.681688823 -0800
1720@@ -9,6 +9,7 @@
1721
1722 ! { dg-regexp "\"kind\": \"error\"" }
1723 ! { dg-regexp "\"column-origin\": 1" }
1724+! { dg-regexp "\"escape-source\": false" }
1725 ! { dg-regexp "\"message\": \"#error message\"" }
1726
1727 ! { dg-regexp "\"caret\": \{" }
1728diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90
1729--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-25 01:29:12.931317107 -0800
1730+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-25 01:30:50.681688823 -0800
1731@@ -9,6 +9,7 @@
1732
1733 ! { dg-regexp "\"kind\": \"warning\"" }
1734 ! { dg-regexp "\"column-origin\": 1" }
1735+! { dg-regexp "\"escape-source\": false" }
1736 ! { dg-regexp "\"message\": \"#warning message\"" }
1737 ! { dg-regexp "\"option\": \"-Wcpp\"" }
1738 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
1739diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90
1740--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-25 01:29:12.931317107 -0800
1741+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-25 01:30:50.681688823 -0800
1742@@ -9,6 +9,7 @@
1743
1744 ! { dg-regexp "\"kind\": \"error\"" }
1745 ! { dg-regexp "\"column-origin\": 1" }
1746+! { dg-regexp "\"escape-source\": false" }
1747 ! { dg-regexp "\"message\": \"#warning message\"" }
1748 ! { dg-regexp "\"option\": \"-Werror=cpp\"" }
1749 ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
1750diff --git a/libcpp/charset.c b/libcpp/charset.c
1751--- a/libcpp/charset.c 2021-12-25 01:29:12.931317107 -0800
1752+++ b/libcpp/charset.c 2021-12-25 01:30:50.681688823 -0800
1753@@ -1549,12 +1549,14 @@ convert_escape (cpp_reader *pfile, const
1754 "unknown escape sequence: '\\%c'", (int) c);
1755 else
1756 {
1757+ encoding_rich_location rich_loc (pfile);
1758+
1759 /* diagnostic.c does not support "%03o". When it does, this
1760 code can use %03o directly in the diagnostic again. */
1761 char buf[32];
1762 sprintf(buf, "%03o", (int) c);
1763- cpp_error (pfile, CPP_DL_PEDWARN,
1764- "unknown escape sequence: '\\%s'", buf);
1765+ cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc,
1766+ "unknown escape sequence: '\\%s'", buf);
1767 }
1768 }
1769
1770@@ -2277,14 +2279,16 @@ cpp_string_location_reader::get_next ()
1771 }
1772
1773 cpp_display_width_computation::
1774-cpp_display_width_computation (const char *data, int data_length, int tabstop) :
1775+cpp_display_width_computation (const char *data, int data_length,
1776+ const cpp_char_column_policy &policy) :
1777 m_begin (data),
1778 m_next (m_begin),
1779 m_bytes_left (data_length),
1780- m_tabstop (tabstop),
1781+ m_policy (policy),
1782 m_display_cols (0)
1783 {
1784- gcc_assert (m_tabstop > 0);
1785+ gcc_assert (policy.m_tabstop > 0);
1786+ gcc_assert (policy.m_width_cb);
1787 }
1788
1789
1790@@ -2296,19 +2300,28 @@ cpp_display_width_computation (const cha
1791 point to a valid UTF-8-encoded sequence, then it will be treated as a single
1792 byte with display width 1. m_cur_display_col is the current display column,
1793 relative to which tab stops should be expanded. Returns the display width of
1794- the codepoint just processed. */
1795+ the codepoint just processed.
1796+ If OUT is non-NULL, it is populated. */
1797
1798 int
1799-cpp_display_width_computation::process_next_codepoint ()
1800+cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out)
1801 {
1802 cppchar_t c;
1803 int next_width;
1804
1805+ if (out)
1806+ out->m_start_byte = m_next;
1807+
1808 if (*m_next == '\t')
1809 {
1810 ++m_next;
1811 --m_bytes_left;
1812- next_width = m_tabstop - (m_display_cols % m_tabstop);
1813+ next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop);
1814+ if (out)
1815+ {
1816+ out->m_ch = '\t';
1817+ out->m_valid_ch = true;
1818+ }
1819 }
1820 else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c)
1821 != 0)
1822@@ -2318,14 +2331,24 @@ cpp_display_width_computation::process_n
1823 of one. */
1824 ++m_next;
1825 --m_bytes_left;
1826- next_width = 1;
1827+ next_width = m_policy.m_undecoded_byte_width;
1828+ if (out)
1829+ out->m_valid_ch = false;
1830 }
1831 else
1832 {
1833 /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */
1834- next_width = cpp_wcwidth (c);
1835+ next_width = m_policy.m_width_cb (c);
1836+ if (out)
1837+ {
1838+ out->m_ch = c;
1839+ out->m_valid_ch = true;
1840+ }
1841 }
1842
1843+ if (out)
1844+ out->m_next_byte = m_next;
1845+
1846 m_display_cols += next_width;
1847 return next_width;
1848 }
1849@@ -2341,7 +2364,7 @@ cpp_display_width_computation::advance_d
1850 const int start = m_display_cols;
1851 const int target = start + n;
1852 while (m_display_cols < target && !done ())
1853- process_next_codepoint ();
1854+ process_next_codepoint (NULL);
1855 return m_display_cols - start;
1856 }
1857
1858@@ -2349,29 +2372,33 @@ cpp_display_width_computation::advance_d
1859 how many display columns are occupied by the first COLUMN bytes. COLUMN
1860 may exceed DATA_LENGTH, in which case the phantom bytes at the end are
1861 treated as if they have display width 1. Tabs are expanded to the next tab
1862- stop, relative to the start of DATA. */
1863+ stop, relative to the start of DATA, and non-printable-ASCII characters
1864+ will be escaped as per POLICY. */
1865
1866 int
1867 cpp_byte_column_to_display_column (const char *data, int data_length,
1868- int column, int tabstop)
1869+ int column,
1870+ const cpp_char_column_policy &policy)
1871 {
1872 const int offset = MAX (0, column - data_length);
1873- cpp_display_width_computation dw (data, column - offset, tabstop);
1874+ cpp_display_width_computation dw (data, column - offset, policy);
1875 while (!dw.done ())
1876- dw.process_next_codepoint ();
1877+ dw.process_next_codepoint (NULL);
1878 return dw.display_cols_processed () + offset;
1879 }
1880
1881 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute
1882 the least number of bytes that will result in at least DISPLAY_COL display
1883 columns. The return value may exceed DATA_LENGTH if the entire string does
1884- not occupy enough display columns. */
1885+ not occupy enough display columns. Non-printable-ASCII characters
1886+ will be escaped as per POLICY. */
1887
1888 int
1889 cpp_display_column_to_byte_column (const char *data, int data_length,
1890- int display_col, int tabstop)
1891+ int display_col,
1892+ const cpp_char_column_policy &policy)
1893 {
1894- cpp_display_width_computation dw (data, data_length, tabstop);
1895+ cpp_display_width_computation dw (data, data_length, policy);
1896 const int avail_display = dw.advance_display_cols (display_col);
1897 return dw.bytes_processed () + MAX (0, display_col - avail_display);
1898 }
1899diff --git a/libcpp/errors.c b/libcpp/errors.c
1900--- a/libcpp/errors.c 2020-07-22 23:35:18.712399623 -0700
1901+++ b/libcpp/errors.c 2021-12-25 01:30:50.681688823 -0800
1902@@ -27,6 +27,31 @@ along with this program; see the file CO
1903 #include "cpplib.h"
1904 #include "internal.h"
1905
1906+/* Get a location_t for the current location in PFILE,
1907+ generally that of the previously lexed token. */
1908+
1909+location_t
1910+cpp_diagnostic_get_current_location (cpp_reader *pfile)
1911+{
1912+ if (CPP_OPTION (pfile, traditional))
1913+ {
1914+ if (pfile->state.in_directive)
1915+ return pfile->directive_line;
1916+ else
1917+ return pfile->line_table->highest_line;
1918+ }
1919+ /* We don't want to refer to a token before the beginning of the
1920+ current run -- that is invalid. */
1921+ else if (pfile->cur_token == pfile->cur_run->base)
1922+ {
1923+ return 0;
1924+ }
1925+ else
1926+ {
1927+ return pfile->cur_token[-1].src_loc;
1928+ }
1929+}
1930+
1931 /* Print a diagnostic at the given location. */
1932
1933 ATTRIBUTE_FPTR_PRINTF(5,0)
1934@@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum
1935 enum cpp_warning_reason reason,
1936 const char *msgid, va_list *ap)
1937 {
1938- location_t src_loc;
1939-
1940- if (CPP_OPTION (pfile, traditional))
1941- {
1942- if (pfile->state.in_directive)
1943- src_loc = pfile->directive_line;
1944- else
1945- src_loc = pfile->line_table->highest_line;
1946- }
1947- /* We don't want to refer to a token before the beginning of the
1948- current run -- that is invalid. */
1949- else if (pfile->cur_token == pfile->cur_run->base)
1950- {
1951- src_loc = 0;
1952- }
1953- else
1954- {
1955- src_loc = pfile->cur_token[-1].src_loc;
1956- }
1957+ location_t src_loc = cpp_diagnostic_get_current_location (pfile);
1958 rich_location richloc (pfile->line_table, src_loc);
1959 return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap);
1960 }
1961@@ -142,6 +149,43 @@ cpp_warning_syshdr (cpp_reader * pfile,
1962
1963 va_end (ap);
1964 return ret;
1965+}
1966+
1967+/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */
1968+
1969+bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason,
1970+ rich_location *richloc, const char *msgid, ...)
1971+{
1972+ va_list ap;
1973+ bool ret;
1974+
1975+ va_start (ap, msgid);
1976+
1977+ ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc,
1978+ msgid, &ap);
1979+
1980+ va_end (ap);
1981+ return ret;
1982+
1983+}
1984+
1985+/* As cpp_pedwarning above, but use RICHLOC as the location of the
1986+ diagnostic. */
1987+
1988+bool
1989+cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason,
1990+ rich_location *richloc, const char *msgid, ...)
1991+{
1992+ va_list ap;
1993+ bool ret;
1994+
1995+ va_start (ap, msgid);
1996+
1997+ ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc,
1998+ msgid, &ap);
1999+
2000+ va_end (ap);
2001+ return ret;
2002 }
2003
2004 /* Print a diagnostic at a specific location. */
2005diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
2006--- a/libcpp/include/cpplib.h 2021-12-25 01:29:12.931317107 -0800
2007+++ b/libcpp/include/cpplib.h 2021-12-25 01:30:50.685688757 -0800
2008@@ -1176,6 +1176,14 @@ extern bool cpp_warning_syshdr (cpp_read
2009 const char *msgid, ...)
2010 ATTRIBUTE_PRINTF_3;
2011
2012+/* As their counterparts above, but use RICHLOC. */
2013+extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason,
2014+ rich_location *richloc, const char *msgid, ...)
2015+ ATTRIBUTE_PRINTF_4;
2016+extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason,
2017+ rich_location *richloc, const char *msgid, ...)
2018+ ATTRIBUTE_PRINTF_4;
2019+
2020 /* Output a diagnostic with "MSGID: " preceding the
2021 error string of errno. No location is printed. */
2022 extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level,
2023@@ -1320,42 +1328,95 @@ extern const char * cpp_get_userdef_suff
2024
2025 /* In charset.c */
2026
2027+/* The result of attempting to decode a run of UTF-8 bytes. */
2028+
2029+struct cpp_decoded_char
2030+{
2031+ const char *m_start_byte;
2032+ const char *m_next_byte;
2033+
2034+ bool m_valid_ch;
2035+ cppchar_t m_ch;
2036+};
2037+
2038+/* Information for mapping between code points and display columns.
2039+
2040+ This is a tabstop value, along with a callback for getting the
2041+ widths of characters. Normally this callback is cpp_wcwidth, but we
2042+ support other schemes for escaping non-ASCII unicode as a series of
2043+ ASCII chars when printing the user's source code in diagnostic-show-locus.c
2044+
2045+ For example, consider:
2046+ - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80)
2047+ - the Unicode character U+1F642 "SLIGHTLY SMILING FACE"
2048+ (UTF-8: 0xF0 0x9F 0x99 0x82)
2049+ - the byte 0xBF (a stray trailing byte of a UTF-8 character)
2050+ Normally U+03C0 would occupy one display column, U+1F642
2051+ would occupy two display columns, and the stray byte would be
2052+ printed verbatim as one display column.
2053+
2054+ However when escaping them as unicode code points as "<U+03C0>"
2055+ and "<U+1F642>" they occupy 8 and 9 display columns respectively,
2056+ and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>"
2057+ they occupy 8 and 16 display columns respectively. In both cases
2058+ the stray byte is escaped to <BF> as 4 display columns. */
2059+
2060+struct cpp_char_column_policy
2061+{
2062+ cpp_char_column_policy (int tabstop,
2063+ int (*width_cb) (cppchar_t c))
2064+ : m_tabstop (tabstop),
2065+ m_undecoded_byte_width (1),
2066+ m_width_cb (width_cb)
2067+ {}
2068+
2069+ int m_tabstop;
2070+ /* Width in display columns of a stray byte that isn't decodable
2071+ as UTF-8. */
2072+ int m_undecoded_byte_width;
2073+ int (*m_width_cb) (cppchar_t c);
2074+};
2075+
2076 /* A class to manage the state while converting a UTF-8 sequence to cppchar_t
2077 and computing the display width one character at a time. */
2078 class cpp_display_width_computation {
2079 public:
2080 cpp_display_width_computation (const char *data, int data_length,
2081- int tabstop);
2082+ const cpp_char_column_policy &policy);
2083 const char *next_byte () const { return m_next; }
2084 int bytes_processed () const { return m_next - m_begin; }
2085 int bytes_left () const { return m_bytes_left; }
2086 bool done () const { return !bytes_left (); }
2087 int display_cols_processed () const { return m_display_cols; }
2088
2089- int process_next_codepoint ();
2090+ int process_next_codepoint (cpp_decoded_char *out);
2091 int advance_display_cols (int n);
2092
2093 private:
2094 const char *const m_begin;
2095 const char *m_next;
2096 size_t m_bytes_left;
2097- const int m_tabstop;
2098+ const cpp_char_column_policy &m_policy;
2099 int m_display_cols;
2100 };
2101
2102 /* Convenience functions that are simple use cases for class
2103 cpp_display_width_computation. Tab characters will be expanded to spaces
2104- as determined by TABSTOP. */
2105+ as determined by POLICY.m_tabstop, and non-printable-ASCII characters
2106+ will be escaped as per POLICY. */
2107+
2108 int cpp_byte_column_to_display_column (const char *data, int data_length,
2109- int column, int tabstop);
2110+ int column,
2111+ const cpp_char_column_policy &policy);
2112 inline int cpp_display_width (const char *data, int data_length,
2113- int tabstop)
2114+ const cpp_char_column_policy &policy)
2115 {
2116 return cpp_byte_column_to_display_column (data, data_length, data_length,
2117- tabstop);
2118+ policy);
2119 }
2120 int cpp_display_column_to_byte_column (const char *data, int data_length,
2121- int display_col, int tabstop);
2122+ int display_col,
2123+ const cpp_char_column_policy &policy);
2124 int cpp_wcwidth (cppchar_t c);
2125
2126 #endif /* ! LIBCPP_CPPLIB_H */
2127diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h
2128--- a/libcpp/include/line-map.h 2020-07-22 23:35:18.712399623 -0700
2129+++ b/libcpp/include/line-map.h 2021-12-25 01:30:50.685688757 -0800
2130@@ -1732,6 +1732,18 @@ class rich_location
2131 const diagnostic_path *get_path () const { return m_path; }
2132 void set_path (const diagnostic_path *path) { m_path = path; }
2133
2134+ /* A flag for hinting that the diagnostic involves character encoding
2135+ issues, and thus that it will be helpful to the user if we show some
2136+ representation of how the characters in the pertinent source lines
2137+ are encoded.
2138+ The default is false (i.e. do not escape).
2139+ When set to true, non-ASCII bytes in the pertinent source lines will
2140+ be escaped in a manner controlled by the user-supplied option
2141+ -fdiagnostics-escape-format=, so that the user can better understand
2142+ what's going on with the encoding in their source file. */
2143+ bool escape_on_output_p () const { return m_escape_on_output; }
2144+ void set_escape_on_output (bool flag) { m_escape_on_output = flag; }
2145+
2146 private:
2147 bool reject_impossible_fixit (location_t where);
2148 void stop_supporting_fixits ();
2149@@ -1758,6 +1770,7 @@ protected:
2150 bool m_fixits_cannot_be_auto_applied;
2151
2152 const diagnostic_path *m_path;
2153+ bool m_escape_on_output;
2154 };
2155
2156 /* A struct for the result of range_label::get_text: a NUL-terminated buffer
2157diff --git a/libcpp/internal.h b/libcpp/internal.h
2158--- a/libcpp/internal.h 2020-07-22 23:35:18.712399623 -0700
2159+++ b/libcpp/internal.h 2021-12-25 01:30:50.685688757 -0800
2160@@ -758,6 +758,9 @@ struct _cpp_dir_only_callbacks
2161 extern void _cpp_preprocess_dir_only (cpp_reader *,
2162 const struct _cpp_dir_only_callbacks *);
2163
2164+/* In errors.c */
2165+extern location_t cpp_diagnostic_get_current_location (cpp_reader *);
2166+
2167 /* In traditional.c. */
2168 extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool);
2169 extern bool _cpp_read_logical_line_trad (cpp_reader *);
2170@@ -946,6 +949,26 @@ int linemap_get_expansion_line (class li
2171 const char* linemap_get_expansion_filename (class line_maps *,
2172 location_t);
2173
2174+/* A subclass of rich_location for emitting a diagnostic
2175+ at the current location of the reader, but flagging
2176+ it with set_escape_on_output (true). */
2177+class encoding_rich_location : public rich_location
2178+{
2179+ public:
2180+ encoding_rich_location (cpp_reader *pfile)
2181+ : rich_location (pfile->line_table,
2182+ cpp_diagnostic_get_current_location (pfile))
2183+ {
2184+ set_escape_on_output (true);
2185+ }
2186+
2187+ encoding_rich_location (cpp_reader *pfile, location_t loc)
2188+ : rich_location (pfile->line_table, loc)
2189+ {
2190+ set_escape_on_output (true);
2191+ }
2192+};
2193+
2194 #ifdef __cplusplus
2195 }
2196 #endif
2197diff --git a/libcpp/lex.c b/libcpp/lex.c
2198--- a/libcpp/lex.c 2021-12-24 20:23:45.568762024 -0800
2199+++ b/libcpp/lex.c 2021-12-25 01:30:50.685688757 -0800
2200@@ -1268,7 +1268,11 @@ skip_whitespace (cpp_reader *pfile, cppc
2201 while (is_nvspace (c));
2202
2203 if (saw_NUL)
2204- cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
2205+ {
2206+ encoding_rich_location rich_loc (pfile);
2207+ cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
2208+ "null character(s) ignored");
2209+ }
2210
2211 buffer->cur--;
2212 }
2213@@ -1297,6 +1301,28 @@ warn_about_normalization (cpp_reader *pf
2214 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2215 && !pfile->state.skipping)
2216 {
2217+ location_t loc = token->src_loc;
2218+
2219+ /* If possible, create a location range for the token. */
2220+ if (loc >= RESERVED_LOCATION_COUNT
2221+ && token->type != CPP_EOF
2222+ /* There must be no line notes to process. */
2223+ && (!(pfile->buffer->cur
2224+ >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2225+ && !pfile->overlaid_buffer)))
2226+ {
2227+ source_range tok_range;
2228+ tok_range.m_start = loc;
2229+ tok_range.m_finish
2230+ = linemap_position_for_column (pfile->line_table,
2231+ CPP_BUF_COLUMN (pfile->buffer,
2232+ pfile->buffer->cur));
2233+ loc = COMBINE_LOCATION_DATA (pfile->line_table,
2234+ loc, tok_range, NULL);
2235+ }
2236+
2237+ encoding_rich_location rich_loc (pfile, loc);
2238+
2239 /* Make sure that the token is printed using UCNs, even
2240 if we'd otherwise happily print UTF-8. */
2241 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2242@@ -1304,11 +1330,11 @@ warn_about_normalization (cpp_reader *pf
2243
2244 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2245 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2246- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
2247- "`%.*s' is not in NFKC", (int) sz, buf);
2248+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2249+ "`%.*s' is not in NFKC", (int) sz, buf);
2250 else
2251- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
2252- "`%.*s' is not in NFC", (int) sz, buf);
2253+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2254+ "`%.*s' is not in NFC", (int) sz, buf);
2255 free (buf);
2256 }
2257 }
2258diff --git a/libcpp/line-map.c b/libcpp/line-map.c
2259--- a/libcpp/line-map.c 2020-07-22 23:35:18.712399623 -0700
2260+++ b/libcpp/line-map.c 2021-12-25 01:30:50.685688757 -0800
2261@@ -2007,7 +2007,8 @@ rich_location::rich_location (line_maps
2262 m_fixit_hints (),
2263 m_seen_impossible_fixit (false),
2264 m_fixits_cannot_be_auto_applied (false),
2265- m_path (NULL)
2266+ m_path (NULL),
2267+ m_escape_on_output (false)
2268 {
2269 add_range (loc, SHOW_RANGE_WITH_CARET, label);
2270 }
diff --git a/meta/recipes-devtools/gcc/gcc/0003-CVE-2021-42574.patch b/meta/recipes-devtools/gcc/gcc/0003-CVE-2021-42574.patch
new file mode 100644
index 0000000000..6bfaf8402d
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0003-CVE-2021-42574.patch
@@ -0,0 +1,1724 @@
1From 51c500269bf53749b107807d84271385fad35628 Mon Sep 17 00:00:00 2001
2From: Marek Polacek <polacek@redhat.com>
3Date: Wed, 6 Oct 2021 14:33:59 -0400
4Subject: [PATCH] libcpp: Implement -Wbidi-chars for CVE-2021-42574 [PR103026]
5
6From a link below:
7"An issue was discovered in the Bidirectional Algorithm in the Unicode
8Specification through 14.0. It permits the visual reordering of
9characters via control sequences, which can be used to craft source code
10that renders different logic than the logical ordering of tokens
11ingested by compilers and interpreters. Adversaries can leverage this to
12encode source code for compilers accepting Unicode such that targeted
13vulnerabilities are introduced invisibly to human reviewers."
14
15More info:
16https://nvd.nist.gov/vuln/detail/CVE-2021-42574
17https://trojansource.codes/
18
19This is not a compiler bug. However, to mitigate the problem, this patch
20implements -Wbidi-chars=[none|unpaired|any] to warn about possibly
21misleading Unicode bidirectional control characters the preprocessor may
22encounter.
23
24The default is =unpaired, which warns about improperly terminated
25bidirectional control characters; e.g. a LRE without its corresponding PDF.
26The level =any warns about any use of bidirectional control characters.
27
28This patch handles both UCNs and UTF-8 characters. UCNs designating
29bidi characters in identifiers are accepted since r204886. Then r217144
30enabled -fextended-identifiers by default. Extended characters in C/C++
31identifiers have been accepted since r275979. However, this patch still
32warns about mixing UTF-8 and UCN bidi characters; there seems to be no
33good reason to allow mixing them.
34
35We warn in different contexts: comments (both C and C++-style), string
36literals, character constants, and identifiers. Expectedly, UCNs are ignored
37in comments and raw string literals. The bidirectional control characters
38can nest so this patch handles that as well.
39
40I have neither included nor tested this at all with Fortran (which also has
41string literals and line comments).
42
43Dave M. posted patches improving diagnostics involving Unicode characters.
44This patch does not make use of this new infrastructure yet.
45
46 PR preprocessor/103026
47
48gcc/c-family/ChangeLog:
49
50 * c.opt (Wbidi-chars, Wbidi-chars=): New option.
51
52gcc/ChangeLog:
53
54 * doc/invoke.texi: Document -Wbidi-chars.
55
56libcpp/ChangeLog:
57
58 * include/cpplib.h (enum cpp_bidirectional_level): New.
59 (struct cpp_options): Add cpp_warn_bidirectional.
60 (enum cpp_warning_reason): Add CPP_W_BIDIRECTIONAL.
61 * internal.h (struct cpp_reader): Add warn_bidi_p member
62 function.
63 * init.c (cpp_create_reader): Set cpp_warn_bidirectional.
64 * lex.c (bidi): New namespace.
65 (get_bidi_utf8): New function.
66 (get_bidi_ucn): Likewise.
67 (maybe_warn_bidi_on_close): Likewise.
68 (maybe_warn_bidi_on_char): Likewise.
69 (_cpp_skip_block_comment): Implement warning about bidirectional
70 control characters.
71 (skip_line_comment): Likewise.
72 (forms_identifier_p): Likewise.
73 (lex_identifier): Likewise.
74 (lex_string): Likewise.
75 (lex_raw_string): Likewise.
76
77gcc/testsuite/ChangeLog:
78
79 * c-c++-common/Wbidi-chars-1.c: New test.
80 * c-c++-common/Wbidi-chars-2.c: New test.
81 * c-c++-common/Wbidi-chars-3.c: New test.
82 * c-c++-common/Wbidi-chars-4.c: New test.
83 * c-c++-common/Wbidi-chars-5.c: New test.
84 * c-c++-common/Wbidi-chars-6.c: New test.
85 * c-c++-common/Wbidi-chars-7.c: New test.
86 * c-c++-common/Wbidi-chars-8.c: New test.
87 * c-c++-common/Wbidi-chars-9.c: New test.
88 * c-c++-common/Wbidi-chars-10.c: New test.
89 * c-c++-common/Wbidi-chars-11.c: New test.
90 * c-c++-common/Wbidi-chars-12.c: New test.
91 * c-c++-common/Wbidi-chars-13.c: New test.
92 * c-c++-common/Wbidi-chars-14.c: New test.
93 * c-c++-common/Wbidi-chars-15.c: New test.
94 * c-c++-common/Wbidi-chars-16.c: New test.
95 * c-c++-common/Wbidi-chars-17.c: New test.
96
97CVE: CVE-2021-42574
98Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=51c500269bf53749b107807d84271385fad35628]
99Signed-off-by: Pgowda <pgowda.cve@gmail.com>
100
101---
102 gcc/c-family/c.opt | 24 ++
103 gcc/doc/invoke.texi | 21 +-
104 gcc/testsuite/c-c++-common/Wbidi-chars-1.c | 12 +
105 gcc/testsuite/c-c++-common/Wbidi-chars-10.c | 27 ++
106 gcc/testsuite/c-c++-common/Wbidi-chars-11.c | 13 +
107 gcc/testsuite/c-c++-common/Wbidi-chars-12.c | 19 +
108 gcc/testsuite/c-c++-common/Wbidi-chars-13.c | 17 +
109 gcc/testsuite/c-c++-common/Wbidi-chars-14.c | 38 ++
110 gcc/testsuite/c-c++-common/Wbidi-chars-15.c | 59 +++
111 gcc/testsuite/c-c++-common/Wbidi-chars-16.c | 26 ++
112 gcc/testsuite/c-c++-common/Wbidi-chars-17.c | 30 ++
113 gcc/testsuite/c-c++-common/Wbidi-chars-2.c | 9 +
114 gcc/testsuite/c-c++-common/Wbidi-chars-3.c | 11 +
115 gcc/testsuite/c-c++-common/Wbidi-chars-4.c | 188 +++++++++
116 gcc/testsuite/c-c++-common/Wbidi-chars-5.c | 188 +++++++++
117 gcc/testsuite/c-c++-common/Wbidi-chars-6.c | 155 ++++++++
118 gcc/testsuite/c-c++-common/Wbidi-chars-7.c | 9 +
119 gcc/testsuite/c-c++-common/Wbidi-chars-8.c | 13 +
120 gcc/testsuite/c-c++-common/Wbidi-chars-9.c | 29 ++
121 libcpp/include/cpplib.h | 18 +-
122 libcpp/init.c | 1 +
123 libcpp/internal.h | 7 +
124 libcpp/lex.c | 408 +++++++++++++++++++-
125 23 files changed, 1315 insertions(+), 7 deletions(-)
126 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-1.c
127 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-10.c
128 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-11.c
129 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-12.c
130 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-13.c
131 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-14.c
132 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-15.c
133 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-16.c
134 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-17.c
135 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-2.c
136 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-3.c
137 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-4.c
138 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-5.c
139 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-6.c
140 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-7.c
141 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-8.c
142 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-9.c
143
144diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
145--- a/gcc/c-family/c.opt 2021-12-25 01:29:12.915317374 -0800
146+++ b/gcc/c-family/c.opt 2021-12-25 01:36:22.040018701 -0800
147@@ -350,6 +350,30 @@ Wbad-function-cast
148 C ObjC Var(warn_bad_function_cast) Warning
149 Warn about casting functions to incompatible types.
150
151+Wbidi-chars
152+C ObjC C++ ObjC++ Warning Alias(Wbidi-chars=,any,none)
153+;
154+
155+Wbidi-chars=
156+C ObjC C++ ObjC++ RejectNegative Joined Warning CPP(cpp_warn_bidirectional) CppReason(CPP_W_BIDIRECTIONAL) Var(warn_bidirectional) Init(bidirectional_unpaired) Enum(cpp_bidirectional_level)
157+-Wbidi-chars=[none|unpaired|any] Warn about UTF-8 bidirectional control characters.
158+
159+; Required for these enum values.
160+SourceInclude
161+cpplib.h
162+
163+Enum
164+Name(cpp_bidirectional_level) Type(int) UnknownError(argument %qs to %<-Wbidi-chars%> not recognized)
165+
166+EnumValue
167+Enum(cpp_bidirectional_level) String(none) Value(bidirectional_none)
168+
169+EnumValue
170+Enum(cpp_bidirectional_level) String(unpaired) Value(bidirectional_unpaired)
171+
172+EnumValue
173+Enum(cpp_bidirectional_level) String(any) Value(bidirectional_any)
174+
175 Wbool-compare
176 C ObjC C++ ObjC++ Var(warn_bool_compare) Warning LangEnabledBy(C ObjC C++ ObjC++,Wall)
177 Warn about boolean expression compared with an integer value different from true/false.
178diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
179--- a/gcc/doc/invoke.texi 2021-12-25 01:35:33.284883488 -0800
180+++ b/gcc/doc/invoke.texi 2021-12-25 01:36:22.048018559 -0800
181@@ -310,7 +310,9 @@ Objective-C and Objective-C++ Dialects}.
182 -Warith-conversion @gol
183 -Warray-bounds -Warray-bounds=@var{n} @gol
184 -Wno-attributes -Wattribute-alias=@var{n} -Wno-attribute-alias @gol
185--Wno-attribute-warning -Wbool-compare -Wbool-operation @gol
186+-Wno-attribute-warning @gol
187+-Wbidi-chars=@r{[}none@r{|}unpaired@r{|}any@r{]} @gol
188+-Wbool-compare -Wbool-operation @gol
189 -Wno-builtin-declaration-mismatch @gol
190 -Wno-builtin-macro-redefined -Wc90-c99-compat -Wc99-c11-compat @gol
191 -Wc11-c2x-compat @gol
192@@ -6860,6 +6862,23 @@ Attributes considered include @code{allo
193 This is the default. You can disable these warnings with either
194 @option{-Wno-attribute-alias} or @option{-Wattribute-alias=0}.
195
196+@item -Wbidi-chars=@r{[}none@r{|}unpaired@r{|}any@r{]}
197+@opindex Wbidi-chars=
198+@opindex Wbidi-chars
199+@opindex Wno-bidi-chars
200+Warn about possibly misleading UTF-8 bidirectional control characters in
201+comments, string literals, character constants, and identifiers. Such
202+characters can change left-to-right writing direction into right-to-left
203+(and vice versa), which can cause confusion between the logical order and
204+visual order. This may be dangerous; for instance, it may seem that a piece
205+of code is not commented out, whereas it in fact is.
206+
207+There are three levels of warning supported by GCC@. The default is
208+@option{-Wbidi-chars=unpaired}, which warns about improperly terminated
209+bidi contexts. @option{-Wbidi-chars=none} turns the warning off.
210+@option{-Wbidi-chars=any} warns about any use of bidirectional control
211+characters.
212+
213 @item -Wbool-compare
214 @opindex Wno-bool-compare
215 @opindex Wbool-compare
216diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-10.c b/gcc/testsuite/c-c++-common/Wbidi-chars-10.c
217--- a/gcc/testsuite/c-c++-common/Wbidi-chars-10.c 1969-12-31 16:00:00.000000000 -0800
218+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-10.c 2021-12-25 01:36:22.048018559 -0800
219@@ -0,0 +1,27 @@
220+/* PR preprocessor/103026 */
221+/* { dg-do compile } */
222+/* { dg-options "-Wbidi-chars=unpaired" } */
223+/* More nesting testing. */
224+
225+/* RLE‫ LRI⁦ PDF‬ PDI⁩*/
226+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
227+int LRE_\u202a_PDF_\u202c;
228+int LRE_\u202a_PDF_\u202c_LRE_\u202a_PDF_\u202c;
229+int LRE_\u202a_LRI_\u2066_PDF_\u202c_PDI_\u2069;
230+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
231+int RLE_\u202b_RLI_\u2067_PDF_\u202c_PDI_\u2069;
232+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
233+int RLE_\u202b_RLI_\u2067_PDI_\u2069_PDF_\u202c;
234+int FSI_\u2068_LRO_\u202d_PDI_\u2069_PDF_\u202c;
235+int FSI_\u2068;
236+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
237+int FSI_\u2068_PDI_\u2069;
238+int FSI_\u2068_FSI_\u2068_PDI_\u2069;
239+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
240+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
241+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
242+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
243+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDF_\u202c;
244+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
245+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_FSI_\u2068_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
246+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
247diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-11.c b/gcc/testsuite/c-c++-common/Wbidi-chars-11.c
248--- a/gcc/testsuite/c-c++-common/Wbidi-chars-11.c 1969-12-31 16:00:00.000000000 -0800
249+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-11.c 2021-12-25 01:36:22.048018559 -0800
250@@ -0,0 +1,13 @@
251+/* PR preprocessor/103026 */
252+/* { dg-do compile } */
253+/* { dg-options "-Wbidi-chars=unpaired" } */
254+/* Test that we warn when mixing UCN and UTF-8. */
255+
256+int LRE_‪_PDF_\u202c;
257+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
258+int LRE_\u202a_PDF_‬_;
259+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
260+const char *s1 = "LRE_‪_PDF_\u202c";
261+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
262+const char *s2 = "LRE_\u202a_PDF_‬";
263+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
264diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-12.c b/gcc/testsuite/c-c++-common/Wbidi-chars-12.c
265--- a/gcc/testsuite/c-c++-common/Wbidi-chars-12.c 1969-12-31 16:00:00.000000000 -0800
266+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-12.c 2021-12-25 01:36:22.048018559 -0800
267@@ -0,0 +1,19 @@
268+/* PR preprocessor/103026 */
269+/* { dg-do compile { target { c || c++11 } } } */
270+/* { dg-options "-Wbidi-chars=any" } */
271+/* Test raw strings. */
272+
273+const char *s1 = R"(a b c LRE‪ 1 2 3 PDF‬ x y z)";
274+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
275+const char *s2 = R"(a b c RLE‫ 1 2 3 PDF‬ x y z)";
276+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
277+const char *s3 = R"(a b c LRO‭ 1 2 3 PDF‬ x y z)";
278+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
279+const char *s4 = R"(a b c RLO‮ 1 2 3 PDF‬ x y z)";
280+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
281+const char *s7 = R"(a b c FSI⁨ 1 2 3 PDI⁩ x y) z";
282+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
283+const char *s8 = R"(a b c PDI⁩ x y )z";
284+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
285+const char *s9 = R"(a b c PDF‬ x y z)";
286+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
287diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-13.c b/gcc/testsuite/c-c++-common/Wbidi-chars-13.c
288--- a/gcc/testsuite/c-c++-common/Wbidi-chars-13.c 1969-12-31 16:00:00.000000000 -0800
289+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-13.c 2021-12-25 01:36:22.048018559 -0800
290@@ -0,0 +1,17 @@
291+/* PR preprocessor/103026 */
292+/* { dg-do compile { target { c || c++11 } } } */
293+/* { dg-options "-Wbidi-chars=unpaired" } */
294+/* Test raw strings. */
295+
296+const char *s1 = R"(a b c LRE‪ 1 2 3)";
297+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
298+const char *s2 = R"(a b c RLE‫ 1 2 3)";
299+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
300+const char *s3 = R"(a b c LRO‭ 1 2 3)";
301+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
302+const char *s4 = R"(a b c FSI⁨ 1 2 3)";
303+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
304+const char *s5 = R"(a b c LRI⁦ 1 2 3)";
305+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
306+const char *s6 = R"(a b c RLI⁧ 1 2 3)";
307+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
308diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-14.c b/gcc/testsuite/c-c++-common/Wbidi-chars-14.c
309--- a/gcc/testsuite/c-c++-common/Wbidi-chars-14.c 1969-12-31 16:00:00.000000000 -0800
310+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-14.c 2021-12-25 01:36:22.048018559 -0800
311@@ -0,0 +1,38 @@
312+/* PR preprocessor/103026 */
313+/* { dg-do compile } */
314+/* { dg-options "-Wbidi-chars=unpaired" } */
315+/* Test PDI handling, which also pops any subsequent LREs, RLEs, LROs,
316+ or RLOs. */
317+
318+/* LRI_⁦_LRI_⁦_RLE_‫_RLE_‫_RLE_‫_PDI_⁩*/
319+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
320+// LRI_⁦_RLE_‫_RLE_‫_RLE_‫_PDI_⁩
321+// LRI_⁦_RLO_‮_RLE_‫_RLE_‫_PDI_⁩
322+// LRI_⁦_RLO_‮_RLE_‫_PDI_⁩
323+// FSI_⁨_RLO_‮_PDI_⁩
324+// FSI_⁨_FSI_⁨_RLO_‮_PDI_⁩
325+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
326+
327+int LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069;
328+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
329+int LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069_PDI_\u2069;
330+int LRI_\u2066_LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069_PDI_\u2069;
331+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
332+int PDI_\u2069;
333+int LRI_\u2066_PDI_\u2069;
334+int RLI_\u2067_PDI_\u2069;
335+int LRE_\u202a_LRI_\u2066_PDI_\u2069;
336+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
337+int LRI_\u2066_LRE_\u202a_PDF_\u202c_PDI_\u2069;
338+int LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
339+int RLI_\u2067_LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
340+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
341+int FSI_\u2068_LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
342+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
343+int RLO_\u202e_PDI_\u2069;
344+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
345+int RLI_\u2067_PDI_\u2069_RLI_\u2067;
346+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
347+int FSI_\u2068_PDF_\u202c_PDI_\u2069;
348+int FSI_\u2068_FSI_\u2068_PDF_\u202c_PDI_\u2069;
349+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
350diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-15.c b/gcc/testsuite/c-c++-common/Wbidi-chars-15.c
351--- a/gcc/testsuite/c-c++-common/Wbidi-chars-15.c 1969-12-31 16:00:00.000000000 -0800
352+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-15.c 2021-12-25 01:36:22.048018559 -0800
353@@ -0,0 +1,59 @@
354+/* PR preprocessor/103026 */
355+/* { dg-do compile } */
356+/* { dg-options "-Wbidi-chars=unpaired" } */
357+/* Test unpaired bidi control chars in multiline comments. */
358+
359+/*
360+ * LRE‪ end
361+ */
362+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
363+/*
364+ * RLE‫ end
365+ */
366+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
367+/*
368+ * LRO‭ end
369+ */
370+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
371+/*
372+ * RLO‮ end
373+ */
374+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
375+/*
376+ * LRI⁦ end
377+ */
378+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
379+/*
380+ * RLI⁧ end
381+ */
382+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
383+/*
384+ * FSI⁨ end
385+ */
386+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
387+/* LRE‪
388+ PDF‬ */
389+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
390+/* FSI⁨
391+ PDI⁩ */
392+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
393+
394+/* LRE<‪>
395+ *
396+ */
397+/* { dg-warning "unpaired" "" { target *-*-* } .-3 } */
398+
399+/*
400+ * LRE<‪>
401+ */
402+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
403+
404+/*
405+ *
406+ * LRE<‪> */
407+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
408+
409+/* RLI<⁧> */ /* PDI<⁩> */
410+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
411+/* LRE<‪> */ /* PDF<‬> */
412+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
413diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-16.c b/gcc/testsuite/c-c++-common/Wbidi-chars-16.c
414--- a/gcc/testsuite/c-c++-common/Wbidi-chars-16.c 1969-12-31 16:00:00.000000000 -0800
415+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-16.c 2021-12-25 01:36:22.048018559 -0800
416@@ -0,0 +1,26 @@
417+/* PR preprocessor/103026 */
418+/* { dg-do compile } */
419+/* { dg-options "-Wbidi-chars=any" } */
420+/* Test LTR/RTL chars. */
421+
422+/* LTR<‎> */
423+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
424+// LTR<‎>
425+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
426+/* RTL<‏> */
427+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
428+// RTL<‏>
429+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
430+
431+const char *s1 = "LTR<‎>";
432+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
433+const char *s2 = "LTR\u200e";
434+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
435+const char *s3 = "LTR\u200E";
436+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
437+const char *s4 = "RTL<‏>";
438+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
439+const char *s5 = "RTL\u200f";
440+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
441+const char *s6 = "RTL\u200F";
442+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
443diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-17.c b/gcc/testsuite/c-c++-common/Wbidi-chars-17.c
444--- a/gcc/testsuite/c-c++-common/Wbidi-chars-17.c 1969-12-31 16:00:00.000000000 -0800
445+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-17.c 2021-12-25 01:36:22.048018559 -0800
446@@ -0,0 +1,30 @@
447+/* PR preprocessor/103026 */
448+/* { dg-do compile } */
449+/* { dg-options "-Wbidi-chars=unpaired" } */
450+/* Test LTR/RTL chars. */
451+
452+/* LTR<‎> */
453+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
454+// LTR<‎>
455+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
456+/* RTL<‏> */
457+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
458+// RTL<‏>
459+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
460+int ltr_\u200e;
461+/* { dg-error "universal character " "" { target *-*-* } .-1 } */
462+int rtl_\u200f;
463+/* { dg-error "universal character " "" { target *-*-* } .-1 } */
464+
465+const char *s1 = "LTR<‎>";
466+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
467+const char *s2 = "LTR\u200e";
468+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
469+const char *s3 = "LTR\u200E";
470+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
471+const char *s4 = "RTL<‏>";
472+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
473+const char *s5 = "RTL\u200f";
474+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
475+const char *s6 = "RTL\u200F";
476+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
477diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-1.c b/gcc/testsuite/c-c++-common/Wbidi-chars-1.c
478--- a/gcc/testsuite/c-c++-common/Wbidi-chars-1.c 1969-12-31 16:00:00.000000000 -0800
479+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-1.c 2021-12-25 01:36:22.048018559 -0800
480@@ -0,0 +1,12 @@
481+/* PR preprocessor/103026 */
482+/* { dg-do compile } */
483+
484+int main() {
485+ int isAdmin = 0;
486+ /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */
487+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
488+ __builtin_printf("You are an admin.\n");
489+ /* end admins only ‮ { ⁦*/
490+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
491+ return 0;
492+}
493diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-2.c b/gcc/testsuite/c-c++-common/Wbidi-chars-2.c
494--- a/gcc/testsuite/c-c++-common/Wbidi-chars-2.c 1969-12-31 16:00:00.000000000 -0800
495+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-2.c 2021-12-25 01:36:22.048018559 -0800
496@@ -0,0 +1,9 @@
497+/* PR preprocessor/103026 */
498+/* { dg-do compile } */
499+
500+int main() {
501+ /* Say hello; newline⁧/*/ return 0 ;
502+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
503+ __builtin_printf("Hello world.\n");
504+ return 0;
505+}
506diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-3.c b/gcc/testsuite/c-c++-common/Wbidi-chars-3.c
507--- a/gcc/testsuite/c-c++-common/Wbidi-chars-3.c 1969-12-31 16:00:00.000000000 -0800
508+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-3.c 2021-12-25 01:36:22.048018559 -0800
509@@ -0,0 +1,11 @@
510+/* PR preprocessor/103026 */
511+/* { dg-do compile } */
512+
513+int main() {
514+ const char* access_level = "user";
515+ if (__builtin_strcmp(access_level, "user‮ ⁦// Check if admin⁩ ⁦")) {
516+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
517+ __builtin_printf("You are an admin.\n");
518+ }
519+ return 0;
520+}
521diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-4.c b/gcc/testsuite/c-c++-common/Wbidi-chars-4.c
522--- a/gcc/testsuite/c-c++-common/Wbidi-chars-4.c 1969-12-31 16:00:00.000000000 -0800
523+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-4.c 2021-12-25 01:36:22.048018559 -0800
524@@ -0,0 +1,188 @@
525+/* PR preprocessor/103026 */
526+/* { dg-do compile } */
527+/* { dg-options "-Wbidi-chars=any -Wno-multichar -Wno-overflow" } */
528+/* Test all bidi chars in various contexts (identifiers, comments,
529+ string literals, character constants), both UCN and UTF-8. The bidi
530+ chars here are properly terminated, except for the character constants. */
531+
532+/* a b c LRE‪ 1 2 3 PDF‬ x y z */
533+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
534+/* a b c RLE‫ 1 2 3 PDF‬ x y z */
535+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
536+/* a b c LRO‭ 1 2 3 PDF‬ x y z */
537+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
538+/* a b c RLO‮ 1 2 3 PDF‬ x y z */
539+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
540+/* a b c LRI⁦ 1 2 3 PDI⁩ x y z */
541+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
542+/* a b c RLI⁧ 1 2 3 PDI⁩ x y */
543+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
544+/* a b c FSI⁨ 1 2 3 PDI⁩ x y z */
545+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
546+
547+/* Same but C++ comments instead. */
548+// a b c LRE‪ 1 2 3 PDF‬ x y z
549+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
550+// a b c RLE‫ 1 2 3 PDF‬ x y z
551+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
552+// a b c LRO‭ 1 2 3 PDF‬ x y z
553+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
554+// a b c RLO‮ 1 2 3 PDF‬ x y z
555+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
556+// a b c LRI⁦ 1 2 3 PDI⁩ x y z
557+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
558+// a b c RLI⁧ 1 2 3 PDI⁩ x y
559+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
560+// a b c FSI⁨ 1 2 3 PDI⁩ x y z
561+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
562+
563+/* Here we're closing an unopened context, warn when =any. */
564+/* a b c PDI⁩ x y z */
565+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
566+/* a b c PDF‬ x y z */
567+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
568+// a b c PDI⁩ x y z
569+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
570+// a b c PDF‬ x y z
571+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
572+
573+/* Multiline comments. */
574+/* a b c PDI⁩ x y z
575+ */
576+/* { dg-warning "U\\+2069" "" { target *-*-* } .-2 } */
577+/* a b c PDF‬ x y z
578+ */
579+/* { dg-warning "U\\+202C" "" { target *-*-* } .-2 } */
580+/* first
581+ a b c PDI⁩ x y z
582+ */
583+/* { dg-warning "U\\+2069" "" { target *-*-* } .-2 } */
584+/* first
585+ a b c PDF‬ x y z
586+ */
587+/* { dg-warning "U\\+202C" "" { target *-*-* } .-2 } */
588+/* first
589+ a b c PDI⁩ x y z */
590+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
591+/* first
592+ a b c PDF‬ x y z */
593+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
594+
595+void
596+g1 ()
597+{
598+ const char *s1 = "a b c LRE‪ 1 2 3 PDF‬ x y z";
599+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
600+ const char *s2 = "a b c RLE‫ 1 2 3 PDF‬ x y z";
601+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
602+ const char *s3 = "a b c LRO‭ 1 2 3 PDF‬ x y z";
603+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
604+ const char *s4 = "a b c RLO‮ 1 2 3 PDF‬ x y z";
605+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
606+ const char *s5 = "a b c LRI⁦ 1 2 3 PDI⁩ x y z";
607+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
608+ const char *s6 = "a b c RLI⁧ 1 2 3 PDI⁩ x y z";
609+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
610+ const char *s7 = "a b c FSI⁨ 1 2 3 PDI⁩ x y z";
611+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
612+ const char *s8 = "a b c PDI⁩ x y z";
613+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
614+ const char *s9 = "a b c PDF‬ x y z";
615+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
616+
617+ const char *s10 = "a b c LRE\u202a 1 2 3 PDF\u202c x y z";
618+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
619+ const char *s11 = "a b c LRE\u202A 1 2 3 PDF\u202c x y z";
620+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
621+ const char *s12 = "a b c RLE\u202b 1 2 3 PDF\u202c x y z";
622+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
623+ const char *s13 = "a b c RLE\u202B 1 2 3 PDF\u202c x y z";
624+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
625+ const char *s14 = "a b c LRO\u202d 1 2 3 PDF\u202c x y z";
626+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
627+ const char *s15 = "a b c LRO\u202D 1 2 3 PDF\u202c x y z";
628+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
629+ const char *s16 = "a b c RLO\u202e 1 2 3 PDF\u202c x y z";
630+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
631+ const char *s17 = "a b c RLO\u202E 1 2 3 PDF\u202c x y z";
632+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
633+ const char *s18 = "a b c LRI\u2066 1 2 3 PDI\u2069 x y z";
634+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
635+ const char *s19 = "a b c RLI\u2067 1 2 3 PDI\u2069 x y z";
636+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
637+ const char *s20 = "a b c FSI\u2068 1 2 3 PDI\u2069 x y z";
638+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
639+}
640+
641+void
642+g2 ()
643+{
644+ const char c1 = '\u202a';
645+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
646+ const char c2 = '\u202A';
647+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
648+ const char c3 = '\u202b';
649+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
650+ const char c4 = '\u202B';
651+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
652+ const char c5 = '\u202d';
653+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
654+ const char c6 = '\u202D';
655+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
656+ const char c7 = '\u202e';
657+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
658+ const char c8 = '\u202E';
659+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
660+ const char c9 = '\u2066';
661+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
662+ const char c10 = '\u2067';
663+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
664+ const char c11 = '\u2068';
665+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
666+}
667+
668+int a‪b‬c;
669+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
670+int a‫b‬c;
671+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
672+int a‭b‬c;
673+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
674+int a‮b‬c;
675+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
676+int a⁦b⁩c;
677+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
678+int a⁧b⁩c;
679+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
680+int a⁨b⁩c;
681+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
682+int A‬X;
683+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
684+int A\u202cY;
685+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
686+int A\u202CY2;
687+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
688+
689+int d\u202ae\u202cf;
690+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
691+int d\u202Ae\u202cf2;
692+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
693+int d\u202be\u202cf;
694+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
695+int d\u202Be\u202cf2;
696+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
697+int d\u202de\u202cf;
698+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
699+int d\u202De\u202cf2;
700+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
701+int d\u202ee\u202cf;
702+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
703+int d\u202Ee\u202cf2;
704+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
705+int d\u2066e\u2069f;
706+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
707+int d\u2067e\u2069f;
708+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
709+int d\u2068e\u2069f;
710+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
711+int X\u2069;
712+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
713diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-5.c b/gcc/testsuite/c-c++-common/Wbidi-chars-5.c
714--- a/gcc/testsuite/c-c++-common/Wbidi-chars-5.c 1969-12-31 16:00:00.000000000 -0800
715+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-5.c 2021-12-25 01:36:22.048018559 -0800
716@@ -0,0 +1,188 @@
717+/* PR preprocessor/103026 */
718+/* { dg-do compile } */
719+/* { dg-options "-Wbidi-chars=unpaired -Wno-multichar -Wno-overflow" } */
720+/* Test all bidi chars in various contexts (identifiers, comments,
721+ string literals, character constants), both UCN and UTF-8. The bidi
722+ chars here are properly terminated, except for the character constants. */
723+
724+/* a b c LRE‪ 1 2 3 PDF‬ x y z */
725+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
726+/* a b c RLE‫ 1 2 3 PDF‬ x y z */
727+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
728+/* a b c LRO‭ 1 2 3 PDF‬ x y z */
729+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
730+/* a b c RLO‮ 1 2 3 PDF‬ x y z */
731+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
732+/* a b c LRI⁦ 1 2 3 PDI⁩ x y z */
733+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
734+/* a b c RLI⁧ 1 2 3 PDI⁩ x y */
735+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
736+/* a b c FSI⁨ 1 2 3 PDI⁩ x y z */
737+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
738+
739+/* Same but C++ comments instead. */
740+// a b c LRE‪ 1 2 3 PDF‬ x y z
741+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
742+// a b c RLE‫ 1 2 3 PDF‬ x y z
743+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
744+// a b c LRO‭ 1 2 3 PDF‬ x y z
745+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
746+// a b c RLO‮ 1 2 3 PDF‬ x y z
747+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
748+// a b c LRI⁦ 1 2 3 PDI⁩ x y z
749+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
750+// a b c RLI⁧ 1 2 3 PDI⁩ x y
751+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
752+// a b c FSI⁨ 1 2 3 PDI⁩ x y z
753+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
754+
755+/* Here we're closing an unopened context, warn when =any. */
756+/* a b c PDI⁩ x y z */
757+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
758+/* a b c PDF‬ x y z */
759+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
760+// a b c PDI⁩ x y z
761+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
762+// a b c PDF‬ x y z
763+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
764+
765+/* Multiline comments. */
766+/* a b c PDI⁩ x y z
767+ */
768+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
769+/* a b c PDF‬ x y z
770+ */
771+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
772+/* first
773+ a b c PDI⁩ x y z
774+ */
775+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
776+/* first
777+ a b c PDF‬ x y z
778+ */
779+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
780+/* first
781+ a b c PDI⁩ x y z */
782+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
783+/* first
784+ a b c PDF‬ x y z */
785+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
786+
787+void
788+g1 ()
789+{
790+ const char *s1 = "a b c LRE‪ 1 2 3 PDF‬ x y z";
791+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
792+ const char *s2 = "a b c RLE‫ 1 2 3 PDF‬ x y z";
793+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
794+ const char *s3 = "a b c LRO‭ 1 2 3 PDF‬ x y z";
795+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
796+ const char *s4 = "a b c RLO‮ 1 2 3 PDF‬ x y z";
797+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
798+ const char *s5 = "a b c LRI⁦ 1 2 3 PDI⁩ x y z";
799+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
800+ const char *s6 = "a b c RLI⁧ 1 2 3 PDI⁩ x y z";
801+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
802+ const char *s7 = "a b c FSI⁨ 1 2 3 PDI⁩ x y z";
803+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
804+ const char *s8 = "a b c PDI⁩ x y z";
805+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
806+ const char *s9 = "a b c PDF‬ x y z";
807+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
808+
809+ const char *s10 = "a b c LRE\u202a 1 2 3 PDF\u202c x y z";
810+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
811+ const char *s11 = "a b c LRE\u202A 1 2 3 PDF\u202c x y z";
812+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
813+ const char *s12 = "a b c RLE\u202b 1 2 3 PDF\u202c x y z";
814+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
815+ const char *s13 = "a b c RLE\u202B 1 2 3 PDF\u202c x y z";
816+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
817+ const char *s14 = "a b c LRO\u202d 1 2 3 PDF\u202c x y z";
818+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
819+ const char *s15 = "a b c LRO\u202D 1 2 3 PDF\u202c x y z";
820+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
821+ const char *s16 = "a b c RLO\u202e 1 2 3 PDF\u202c x y z";
822+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
823+ const char *s17 = "a b c RLO\u202E 1 2 3 PDF\u202c x y z";
824+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
825+ const char *s18 = "a b c LRI\u2066 1 2 3 PDI\u2069 x y z";
826+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
827+ const char *s19 = "a b c RLI\u2067 1 2 3 PDI\u2069 x y z";
828+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
829+ const char *s20 = "a b c FSI\u2068 1 2 3 PDI\u2069 x y z";
830+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
831+}
832+
833+void
834+g2 ()
835+{
836+ const char c1 = '\u202a';
837+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
838+ const char c2 = '\u202A';
839+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
840+ const char c3 = '\u202b';
841+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
842+ const char c4 = '\u202B';
843+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
844+ const char c5 = '\u202d';
845+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
846+ const char c6 = '\u202D';
847+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
848+ const char c7 = '\u202e';
849+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
850+ const char c8 = '\u202E';
851+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
852+ const char c9 = '\u2066';
853+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
854+ const char c10 = '\u2067';
855+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
856+ const char c11 = '\u2068';
857+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
858+}
859+
860+int a‪b‬c;
861+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
862+int a‫b‬c;
863+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
864+int a‭b‬c;
865+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
866+int a‮b‬c;
867+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
868+int a⁦b⁩c;
869+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
870+int a⁧b⁩c;
871+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
872+int a⁨b⁩c;
873+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
874+int A‬X;
875+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
876+int A\u202cY;
877+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
878+int A\u202CY2;
879+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
880+
881+int d\u202ae\u202cf;
882+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
883+int d\u202Ae\u202cf2;
884+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
885+int d\u202be\u202cf;
886+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
887+int d\u202Be\u202cf2;
888+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
889+int d\u202de\u202cf;
890+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
891+int d\u202De\u202cf2;
892+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
893+int d\u202ee\u202cf;
894+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
895+int d\u202Ee\u202cf2;
896+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
897+int d\u2066e\u2069f;
898+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
899+int d\u2067e\u2069f;
900+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
901+int d\u2068e\u2069f;
902+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
903+int X\u2069;
904+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
905diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-6.c b/gcc/testsuite/c-c++-common/Wbidi-chars-6.c
906--- a/gcc/testsuite/c-c++-common/Wbidi-chars-6.c 1969-12-31 16:00:00.000000000 -0800
907+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-6.c 2021-12-25 01:36:22.052018489 -0800
908@@ -0,0 +1,155 @@
909+/* PR preprocessor/103026 */
910+/* { dg-do compile } */
911+/* { dg-options "-Wbidi-chars=unpaired" } */
912+/* Test nesting of bidi chars in various contexts. */
913+
914+/* Terminated by the wrong char: */
915+/* a b c LRE‪ 1 2 3 PDI⁩ x y z */
916+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
917+/* a b c RLE‫ 1 2 3 PDI⁩ x y z*/
918+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
919+/* a b c LRO‭ 1 2 3 PDI⁩ x y z */
920+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
921+/* a b c RLO‮ 1 2 3 PDI⁩ x y z */
922+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
923+/* a b c LRI⁦ 1 2 3 PDF‬ x y z */
924+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
925+/* a b c RLI⁧ 1 2 3 PDF‬ x y z */
926+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
927+/* a b c FSI⁨ 1 2 3 PDF‬ x y z*/
928+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
929+
930+/* LRE‪ PDF‬ */
931+/* LRE‪ LRE‪ PDF‬ PDF‬ */
932+/* PDF‬ LRE‪ PDF‬ */
933+/* LRE‪ PDF‬ LRE‪ PDF‬ */
934+/* LRE‪ LRE‪ PDF‬ */
935+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
936+/* PDF‬ LRE‪ */
937+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
938+
939+// a b c LRE‪ 1 2 3 PDI⁩ x y z
940+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
941+// a b c RLE‫ 1 2 3 PDI⁩ x y z*/
942+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
943+// a b c LRO‭ 1 2 3 PDI⁩ x y z
944+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
945+// a b c RLO‮ 1 2 3 PDI⁩ x y z
946+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
947+// a b c LRI⁦ 1 2 3 PDF‬ x y z
948+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
949+// a b c RLI⁧ 1 2 3 PDF‬ x y z
950+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
951+// a b c FSI⁨ 1 2 3 PDF‬ x y z
952+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
953+
954+// LRE‪ PDF‬
955+// LRE‪ LRE‪ PDF‬ PDF‬
956+// PDF‬ LRE‪ PDF‬
957+// LRE‪ PDF‬ LRE‪ PDF‬
958+// LRE‪ LRE‪ PDF‬
959+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
960+// PDF‬ LRE‪
961+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
962+
963+void
964+g1 ()
965+{
966+ const char *s1 = "a b c LRE‪ 1 2 3 PDI⁩ x y z";
967+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
968+ const char *s2 = "a b c LRE\u202a 1 2 3 PDI\u2069 x y z";
969+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
970+ const char *s3 = "a b c RLE‫ 1 2 3 PDI⁩ x y ";
971+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
972+ const char *s4 = "a b c RLE\u202b 1 2 3 PDI\u2069 x y z";
973+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
974+ const char *s5 = "a b c LRO‭ 1 2 3 PDI⁩ x y z";
975+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
976+ const char *s6 = "a b c LRO\u202d 1 2 3 PDI\u2069 x y z";
977+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
978+ const char *s7 = "a b c RLO‮ 1 2 3 PDI⁩ x y z";
979+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
980+ const char *s8 = "a b c RLO\u202e 1 2 3 PDI\u2069 x y z";
981+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
982+ const char *s9 = "a b c LRI⁦ 1 2 3 PDF‬ x y z";
983+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
984+ const char *s10 = "a b c LRI\u2066 1 2 3 PDF\u202c x y z";
985+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
986+ const char *s11 = "a b c RLI⁧ 1 2 3 PDF‬ x y z\
987+ ";
988+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
989+ const char *s12 = "a b c RLI\u2067 1 2 3 PDF\u202c x y z";
990+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
991+ const char *s13 = "a b c FSI⁨ 1 2 3 PDF‬ x y z";
992+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
993+ const char *s14 = "a b c FSI\u2068 1 2 3 PDF\u202c x y z";
994+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
995+ const char *s15 = "PDF‬ LRE‪";
996+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
997+ const char *s16 = "PDF\u202c LRE\u202a";
998+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
999+ const char *s17 = "LRE‪ PDF‬";
1000+ const char *s18 = "LRE\u202a PDF\u202c";
1001+ const char *s19 = "LRE‪ LRE‪ PDF‬ PDF‬";
1002+ const char *s20 = "LRE\u202a LRE\u202a PDF\u202c PDF\u202c";
1003+ const char *s21 = "PDF‬ LRE‪ PDF‬";
1004+ const char *s22 = "PDF\u202c LRE\u202a PDF\u202c";
1005+ const char *s23 = "LRE‪ LRE‪ PDF‬";
1006+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1007+ const char *s24 = "LRE\u202a LRE\u202a PDF\u202c";
1008+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1009+ const char *s25 = "PDF‬ LRE‪";
1010+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1011+ const char *s26 = "PDF\u202c LRE\u202a";
1012+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1013+ const char *s27 = "PDF‬ LRE\u202a";
1014+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1015+ const char *s28 = "PDF\u202c LRE‪";
1016+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1017+}
1018+
1019+int aLRE‪bPDI⁩;
1020+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1021+int A\u202aB\u2069C;
1022+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1023+int aRLE‫bPDI⁩;
1024+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1025+int a\u202bB\u2069c;
1026+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1027+int aLRO‭bPDI⁩;
1028+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1029+int a\u202db\u2069c2;
1030+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1031+int aRLO‮bPDI⁩;
1032+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1033+int a\u202eb\u2069;
1034+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1035+int aLRI⁦bPDF‬;
1036+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1037+int a\u2066b\u202c;
1038+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1039+int aRLI⁧bPDF‬c
1040+;
1041+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
1042+int a\u2067b\u202c;
1043+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1044+int aFSI⁨bPDF‬;
1045+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1046+int a\u2068b\u202c;
1047+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1048+int aFSI⁨bPD\u202C;
1049+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1050+int aFSI\u2068bPDF‬_;
1051+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1052+int aLRE‪bPDF‬b;
1053+int A\u202aB\u202c;
1054+int a_LRE‪_LRE‪_b_PDF‬_PDF‬;
1055+int A\u202aA\u202aB\u202cB\u202c;
1056+int aPDF‬bLREadPDF‬;
1057+int a_\u202C_\u202a_\u202c;
1058+int a_LRE‪_b_PDF‬_c_LRE‪_PDF‬;
1059+int a_\u202a_\u202c_\u202a_\u202c_;
1060+int a_LRE‪_b_PDF‬_c_LRE‪;
1061+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1062+int a_\u202a_\u202c_\u202a_;
1063+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1064diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-7.c b/gcc/testsuite/c-c++-common/Wbidi-chars-7.c
1065--- a/gcc/testsuite/c-c++-common/Wbidi-chars-7.c 1969-12-31 16:00:00.000000000 -0800
1066+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-7.c 2021-12-25 01:36:22.052018489 -0800
1067@@ -0,0 +1,9 @@
1068+/* PR preprocessor/103026 */
1069+/* { dg-do compile } */
1070+/* { dg-options "-Wbidi-chars=any" } */
1071+/* Test we ignore UCNs in comments. */
1072+
1073+// a b c \u202a 1 2 3
1074+// a b c \u202A 1 2 3
1075+/* a b c \u202a 1 2 3 */
1076+/* a b c \u202A 1 2 3 */
1077diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-8.c b/gcc/testsuite/c-c++-common/Wbidi-chars-8.c
1078--- a/gcc/testsuite/c-c++-common/Wbidi-chars-8.c 1969-12-31 16:00:00.000000000 -0800
1079+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-8.c 2021-12-25 01:36:22.052018489 -0800
1080@@ -0,0 +1,13 @@
1081+/* PR preprocessor/103026 */
1082+/* { dg-do compile } */
1083+/* { dg-options "-Wbidi-chars=any" } */
1084+/* Test \u vs \U. */
1085+
1086+int a_\u202A;
1087+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1088+int a_\u202a_2;
1089+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1090+int a_\U0000202A_3;
1091+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1092+int a_\U0000202a_4;
1093+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1094diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-9.c b/gcc/testsuite/c-c++-common/Wbidi-chars-9.c
1095--- a/gcc/testsuite/c-c++-common/Wbidi-chars-9.c 1969-12-31 16:00:00.000000000 -0800
1096+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-9.c 2021-12-25 01:36:22.052018489 -0800
1097@@ -0,0 +1,29 @@
1098+/* PR preprocessor/103026 */
1099+/* { dg-do compile } */
1100+/* { dg-options "-Wbidi-chars=unpaired" } */
1101+/* Test that we properly separate bidi contexts (comment/identifier/character
1102+ constant/string literal). */
1103+
1104+/* LRE ->‪<- */ int pdf_\u202c_1;
1105+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1106+/* RLE ->‫<- */ int pdf_\u202c_2;
1107+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1108+/* LRO ->‭<- */ int pdf_\u202c_3;
1109+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1110+/* RLO ->‮<- */ int pdf_\u202c_4;
1111+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1112+/* LRI ->⁦<-*/ int pdi_\u2069_1;
1113+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1114+/* RLI ->⁧<- */ int pdi_\u2069_12;
1115+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1116+/* FSI ->⁨<- */ int pdi_\u2069_3;
1117+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1118+
1119+const char *s1 = "LRE\u202a"; /* PDF ->‬<- */
1120+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1121+/* LRE ->‪<- */ const char *s2 = "PDF\u202c";
1122+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1123+const char *s3 = "LRE\u202a"; int pdf_\u202c_5;
1124+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1125+int lre_\u202a; const char *s4 = "PDF\u202c";
1126+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1127diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
1128--- a/libcpp/include/cpplib.h 2021-12-25 01:35:33.288883417 -0800
1129+++ b/libcpp/include/cpplib.h 2021-12-25 01:36:22.052018489 -0800
1130@@ -308,6 +308,17 @@ enum cpp_normalize_level {
1131 normalized_none
1132 };
1133
1134+/* The possible bidirectional control character checking levels, from least
1135+ restrictive to most. cpp_options::cpp_warn_bidirectional holds one of these. */
1136+enum cpp_bidirectional_level {
1137+ /* No checking. */
1138+ bidirectional_none,
1139+ /* Only detect unpaired uses of bidirectional control characters. */
1140+ bidirectional_unpaired,
1141+ /* Detect any use of bidirectional control characters. */
1142+ bidirectional_any
1143+};
1144+
1145 /* This structure is nested inside struct cpp_reader, and
1146 carries all the options visible to the command line. */
1147 struct cpp_options
1148@@ -515,6 +526,10 @@ struct cpp_options
1149 /* True if warn about differences between C++98 and C++11. */
1150 bool cpp_warn_cxx11_compat;
1151
1152+ /* Nonzero if bidirectional control characters checking is on. See enum
1153+ cpp_bidirectional_level. */
1154+ unsigned char cpp_warn_bidirectional;
1155+
1156 /* Dependency generation. */
1157 struct
1158 {
1159@@ -613,7 +628,8 @@ enum cpp_warning_reason {
1160 CPP_W_C90_C99_COMPAT,
1161 CPP_W_C11_C2X_COMPAT,
1162 CPP_W_CXX11_COMPAT,
1163- CPP_W_EXPANSION_TO_DEFINED
1164+ CPP_W_EXPANSION_TO_DEFINED,
1165+ CPP_W_BIDIRECTIONAL
1166 };
1167
1168 /* Callback for header lookup for HEADER, which is the name of a
1169diff --git a/libcpp/init.c b/libcpp/init.c
1170--- a/libcpp/init.c 2021-12-25 01:29:12.931317107 -0800
1171+++ b/libcpp/init.c 2021-12-25 01:36:22.052018489 -0800
1172@@ -215,6 +215,7 @@ cpp_create_reader (enum c_lang lang, cpp
1173 = ENABLE_CANONICAL_SYSTEM_HEADERS;
1174 CPP_OPTION (pfile, ext_numeric_literals) = 1;
1175 CPP_OPTION (pfile, warn_date_time) = 0;
1176+ CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired;
1177
1178 /* Default CPP arithmetic to something sensible for the host for the
1179 benefit of dumb users like fix-header. */
1180diff --git a/libcpp/internal.h b/libcpp/internal.h
1181--- a/libcpp/internal.h 2021-12-25 01:35:33.288883417 -0800
1182+++ b/libcpp/internal.h 2021-12-25 01:36:22.052018489 -0800
1183@@ -581,6 +581,10 @@ struct cpp_reader
1184 /* If non-zero, the lexer will use this location for the next token
1185 instead of getting a location from the linemap. */
1186 location_t forced_token_location;
1187+ bool warn_bidi_p () const /* True unless bidi checking is bidirectional_none. */
1188+ {
1189+ return CPP_OPTION (this, cpp_warn_bidirectional) != bidirectional_none;
1190+ }
1191 };
1192
1193 /* Character classes. Based on the more primitive macros in safe-ctype.h.
1194diff --git a/libcpp/lex.c b/libcpp/lex.c
1195--- a/libcpp/lex.c 2021-12-25 01:35:33.288883417 -0800
1196+++ b/libcpp/lex.c 2021-12-25 01:36:22.052018489 -0800
1197@@ -1164,6 +1164,324 @@ _cpp_process_line_notes (cpp_reader *pfi
1198 }
1199 }
1200
1201+namespace bidi {
1202+ enum class kind {
1203+ NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1204+ };
1205+
1206+ /* All the UTF-8 encodings of bidi characters start with E2. */
1207+ constexpr uchar utf8_start = 0xe2;
1208+
1209+ /* A stack of the currently open bidi contexts, innermost last. We use a
1210+ char for each context: its LSB is 1 if it is to be closed by a PDF, 0 if
1211+ it is to be closed by a PDI. The next bit is 1 if this context was opened
1212+ by a bidi character written as a UCN, and 0 when it was UTF-8. */
1213+ semi_embedded_vec <unsigned char, 16> vec;
1214+
1215+ /* Close the whole comment/identifier/string literal/character constant
1216+ context: forget every bidi context still open in it. */
1217+ void on_close ()
1218+ {
1219+ vec.truncate (0);
1220+ }
1221+
1222+ /* Pop the last (innermost) element in the vector, which must exist. */
1223+ void pop ()
1224+ {
1225+ unsigned int len = vec.count ();
1226+ gcc_checking_assert (len > 0);
1227+ vec.truncate (len - 1);
1228+ }
1229+
1230+ /* Return the context of the Ith element, as the kind (PDF or PDI) that closes it. */
1231+ kind ctx_at (unsigned int i)
1232+ {
1233+ return (vec[i] & 1) ? kind::PDF : kind::PDI;
1234+ }
1235+
1236+ /* Return which context is currently opened, or kind::NONE if none is. */
1237+ kind current_ctx ()
1238+ {
1239+ unsigned int len = vec.count ();
1240+ if (len == 0)
1241+ return kind::NONE;
1242+ return ctx_at (len - 1);
1243+ }
1244+
1245+ /* Return true if the current (innermost) context comes from a UCN origin,
1246+ that is, the bidi char which started it was written as a UCN. Some context must be open. */
1247+ bool current_ctx_ucn_p ()
1248+ {
1249+ unsigned int len = vec.count ();
1250+ gcc_checking_assert (len > 0);
1251+ return (vec[len - 1] >> 1) & 1;
1252+ }
1253+
1254+ /* We've read a bidi char K (a UCN iff UCN_P); update the vector as necessary. */
1255+ void on_char (kind k, bool ucn_p)
1256+ {
1257+ switch (k)
1258+ {
1259+ case kind::LRE:
1260+ case kind::RLE:
1261+ case kind::LRO:
1262+ case kind::RLO:
1263+ vec.push (ucn_p ? 3u : 1u); /* PDF context; bit 1 marks a UCN. */
1264+ break;
1265+ case kind::LRI:
1266+ case kind::RLI:
1267+ case kind::FSI:
1268+ vec.push (ucn_p ? 2u : 0u); /* PDI context; bit 1 marks a UCN. */
1269+ break;
1270+ /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271+ whose scope has not yet been terminated. */
1272+ case kind::PDF:
1273+ if (current_ctx () == kind::PDF)
1274+ pop ();
1275+ break;
1276+ /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277+ scope has not yet been terminated, as well as the scopes of
1278+ any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279+ yet been terminated. */
1280+ case kind::PDI:
1281+ for (int i = vec.count () - 1; i >= 0; --i)
1282+ if (ctx_at (i) == kind::PDI)
1283+ {
1284+ vec.truncate (i); /* Drops element I and everything after it. */
1285+ break;
1286+ }
1287+ break;
1288+ case kind::LTR:
1289+ case kind::RTL:
1290+ /* These aren't popped by a PDF/PDI. */
1291+ break;
1292+ [[likely]] case kind::NONE:
1293+ break;
1294+ default:
1295+ abort ();
1296+ }
1297+ }
1298+
1299+ /* Return a descriptive string for K; aborts if K is kind::NONE. */
1300+ const char *to_str (kind k)
1301+ {
1302+ switch (k)
1303+ {
1304+ case kind::LRE:
1305+ return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306+ case kind::RLE:
1307+ return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308+ case kind::LRO:
1309+ return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310+ case kind::RLO:
1311+ return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312+ case kind::LRI:
1313+ return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314+ case kind::RLI:
1315+ return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316+ case kind::FSI:
1317+ return "U+2068 (FIRST STRONG ISOLATE)";
1318+ case kind::PDF:
1319+ return "U+202C (POP DIRECTIONAL FORMATTING)";
1320+ case kind::PDI:
1321+ return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322+ case kind::LTR:
1323+ return "U+200E (LEFT-TO-RIGHT MARK)";
1324+ case kind::RTL:
1325+ return "U+200F (RIGHT-TO-LEFT MARK)";
1326+ default:
1327+ abort ();
1328+ }
1329+ }
1330+}
1331+
1332+/* Parse a sequence of 3 bytes starting with P and return its bidi code (NONE if it has none). */
1333+
1334+static bidi::kind
1335+get_bidi_utf8 (const unsigned char *const p)
1336+{
1337+ gcc_checking_assert (p[0] == bidi::utf8_start);
1338+
1339+ if (p[1] == 0x80) /* E2 80 xx encodes U+2000..U+203F. */
1340+ switch (p[2])
1341+ {
1342+ case 0xaa: /* U+202A */
1343+ return bidi::kind::LRE;
1344+ case 0xab: /* U+202B */
1345+ return bidi::kind::RLE;
1346+ case 0xac: /* U+202C */
1347+ return bidi::kind::PDF;
1348+ case 0xad: /* U+202D */
1349+ return bidi::kind::LRO;
1350+ case 0xae: /* U+202E */
1351+ return bidi::kind::RLO;
1352+ case 0x8e: /* U+200E */
1353+ return bidi::kind::LTR;
1354+ case 0x8f: /* U+200F */
1355+ return bidi::kind::RTL;
1356+ default:
1357+ break;
1358+ }
1359+ else if (p[1] == 0x81) /* E2 81 xx encodes U+2040..U+207F. */
1360+ switch (p[2])
1361+ {
1362+ case 0xa6: /* U+2066 */
1363+ return bidi::kind::LRI;
1364+ case 0xa7: /* U+2067 */
1365+ return bidi::kind::RLI;
1366+ case 0xa8: /* U+2068 */
1367+ return bidi::kind::FSI;
1368+ case 0xa9: /* U+2069 */
1369+ return bidi::kind::PDI;
1370+ default:
1371+ break;
1372+ }
1373+
1374+ return bidi::kind::NONE;
1375+}
1376+
1377+/* Parse a UCN where P points just past \u or \U (IS_U is true for \U) and return its bidi code. */
1378+
1379+static bidi::kind
1380+get_bidi_ucn (const unsigned char *p, bool is_U)
1381+{
1382+ /* 6.4.3 Universal Character Names
1383+ \u hex-quad
1384+ \U hex-quad hex-quad
1385+ where \unnnn means \U0000nnnn. */
1386+
1387+ if (is_U)
1388+ {
1389+ if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1390+ return bidi::kind::NONE; /* Leading hex-quad is not 0000. */
1391+ /* Skip 4B so we can treat \u and \U the same below. */
1392+ p += 4;
1393+ }
1394+
1395+ /* All code points we are looking for start with 20xx. */
1396+ if (p[0] != '2' || p[1] != '0')
1397+ return bidi::kind::NONE;
1398+ else if (p[2] == '2') /* U+202x; the last hex digit may be in either case. */
1399+ switch (p[3])
1400+ {
1401+ case 'a':
1402+ case 'A':
1403+ return bidi::kind::LRE;
1404+ case 'b':
1405+ case 'B':
1406+ return bidi::kind::RLE;
1407+ case 'c':
1408+ case 'C':
1409+ return bidi::kind::PDF;
1410+ case 'd':
1411+ case 'D':
1412+ return bidi::kind::LRO;
1413+ case 'e':
1414+ case 'E':
1415+ return bidi::kind::RLO;
1416+ default:
1417+ break;
1418+ }
1419+ else if (p[2] == '6') /* U+206x */
1420+ switch (p[3])
1421+ {
1422+ case '6':
1423+ return bidi::kind::LRI;
1424+ case '7':
1425+ return bidi::kind::RLI;
1426+ case '8':
1427+ return bidi::kind::FSI;
1428+ case '9':
1429+ return bidi::kind::PDI;
1430+ default:
1431+ break;
1432+ }
1433+ else if (p[2] == '0') /* U+200x */
1434+ switch (p[3])
1435+ {
1436+ case 'e':
1437+ case 'E':
1438+ return bidi::kind::LTR;
1439+ case 'f':
1440+ case 'F':
1441+ return bidi::kind::RTL;
1442+ default:
1443+ break;
1444+ }
1445+
1446+ return bidi::kind::NONE;
1447+}
1448+
1449+/* We're closing a bidi context, that is, we've encountered a newline,
1450+ are closing a C-style comment, or are at the end of a string literal,
1451+ character constant, or identifier. Warn if this context was not
1452+ properly terminated by a PDI or PDF. P points to the last character
1453+ in this context. Only the bidirectional_unpaired level warns here. */
1454+
1455+static void
1456+maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1457+{
1458+ if (CPP_OPTION (pfile, cpp_warn_bidirectional) == bidirectional_unpaired
1459+ && bidi::vec.count () > 0)
1460+ {
1461+ const location_t loc
1462+ = linemap_position_for_column (pfile->line_table,
1463+ CPP_BUF_COLUMN (pfile->buffer, p));
1464+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1465+ "unpaired UTF-8 bidirectional control character "
1466+ "detected");
1467+ }
1468+ /* We're done with this context, warned or not; forget what it left open. */
1469+ bidi::on_close ();
1470+}
1471+
1472+/* We're at the beginning or in the middle of an identifier/comment/string
1473+ literal/character constant. Warn if we've encountered a bidi character.
1474+ KIND says which bidi character it was; P points to it in the character
1475+ stream. UCN_P is true iff this bidi character was written as a UCN. */
1476+
1477+static void
1478+maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
1479+ bool ucn_p)
1480+{
1481+ if (__builtin_expect (kind == bidi::kind::NONE, 1))
1482+ return;
1483+
1484+ const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1485+
1486+ if (warn_bidi != bidirectional_none)
1487+ {
1488+ const location_t loc
1489+ = linemap_position_for_column (pfile->line_table,
1490+ CPP_BUF_COLUMN (pfile->buffer, p));
1491+ /* It seems excessive to warn about a PDI/PDF that is closing
1492+ an opened context because we've already warned about the
1493+ opening character. Except warn when we have a UCN x UTF-8
1494+ mismatch. */
1495+ if (kind == bidi::current_ctx ())
1496+ {
1497+ if (warn_bidi == bidirectional_unpaired
1498+ && bidi::current_ctx_ucn_p () != ucn_p)
1499+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1500+ "UTF-8 vs UCN mismatch when closing "
1501+ "a context by \"%s\"", bidi::to_str (kind));
1502+ }
1503+ else if (warn_bidi == bidirectional_any)
1504+ {
1505+ if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1506+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1507+ "\"%s\" is closing an unopened context",
1508+ bidi::to_str (kind));
1509+ else
1510+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1511+ "found problematic Unicode character \"%s\"",
1512+ bidi::to_str (kind));
1513+ }
1514+ }
1515+ /* Record this character in the stack of open bidi contexts. */
1516+ bidi::on_char (kind, ucn_p);
1517+}
1518+
1519 /* Skip a C-style block comment. We find the end of the comment by
1520 seeing if an asterisk is before every '/' we encounter. Returns
1521 nonzero if comment terminated by EOF, zero otherwise.
1522@@ -1175,6 +1493,7 @@ _cpp_skip_block_comment (cpp_reader *pfi
1523 cpp_buffer *buffer = pfile->buffer;
1524 const uchar *cur = buffer->cur;
1525 uchar c;
1526+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1527
1528 cur++;
1529 if (*cur == '/')
1530@@ -1189,7 +1508,11 @@ _cpp_skip_block_comment (cpp_reader *pfi
1531 if (c == '/')
1532 {
1533 if (cur[-2] == '*')
1534- break;
1535+ {
1536+ if (warn_bidi_p)
1537+ maybe_warn_bidi_on_close (pfile, cur);
1538+ break;
1539+ }
1540
1541 /* Warn about potential nested comments, but not if the '/'
1542 comes immediately before the true comment delimiter.
1543@@ -1208,6 +1531,8 @@ _cpp_skip_block_comment (cpp_reader *pfi
1544 {
1545 unsigned int cols;
1546 buffer->cur = cur - 1;
1547+ if (warn_bidi_p)
1548+ maybe_warn_bidi_on_close (pfile, cur);
1549 _cpp_process_line_notes (pfile, true);
1550 if (buffer->next_line >= buffer->rlimit)
1551 return true;
1552@@ -1218,6 +1543,13 @@ _cpp_skip_block_comment (cpp_reader *pfi
1553
1554 cur = buffer->cur;
1555 }
1556+ /* If this is a beginning of a UTF-8 encoding, it might be
1557+ a bidirectional control character. */
1558+ else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1559+ {
1560+ bidi::kind kind = get_bidi_utf8 (cur - 1);
1561+ maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/false);
1562+ }
1563 }
1564
1565 buffer->cur = cur;
1566@@ -1233,9 +1565,31 @@ skip_line_comment (cpp_reader *pfile)
1567 {
1568 cpp_buffer *buffer = pfile->buffer;
1569 location_t orig_line = pfile->line_table->highest_line;
1570+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1571
1572- while (*buffer->cur != '\n')
1573- buffer->cur++;
1574+ if (!warn_bidi_p)
1575+ while (*buffer->cur != '\n')
1576+ buffer->cur++;
1577+ else
1578+ {
1579+ while (*buffer->cur != '\n'
1580+ && *buffer->cur != bidi::utf8_start)
1581+ buffer->cur++;
1582+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1583+ {
1584+ while (*buffer->cur != '\n')
1585+ {
1586+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1587+ {
1588+ bidi::kind kind = get_bidi_utf8 (buffer->cur);
1589+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1590+ /*ucn_p=*/false);
1591+ }
1592+ buffer->cur++;
1593+ }
1594+ maybe_warn_bidi_on_close (pfile, buffer->cur);
1595+ }
1596+ }
1597
1598 _cpp_process_line_notes (pfile, true);
1599 return orig_line != pfile->line_table->highest_line;
1600@@ -1343,11 +1697,13 @@ static const cppchar_t utf8_signifier =
1601
1602 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1603 an identifier. FIRST is TRUE if this starts an identifier. */
1604+
1605 static bool
1606 forms_identifier_p (cpp_reader *pfile, int first,
1607 struct normalize_state *state)
1608 {
1609 cpp_buffer *buffer = pfile->buffer;
1610+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1611
1612 if (*buffer->cur == '$')
1613 {
1614@@ -1370,6 +1726,13 @@ forms_identifier_p (cpp_reader *pfile, i
1615 cppchar_t s;
1616 if (*buffer->cur >= utf8_signifier)
1617 {
1618+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1619+ && warn_bidi_p)
1620+ {
1621+ bidi::kind kind = get_bidi_utf8 (buffer->cur);
1622+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1623+ /*ucn_p=*/false);
1624+ }
1625 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1626 state, &s))
1627 return true;
1628@@ -1378,6 +1741,13 @@ forms_identifier_p (cpp_reader *pfile, i
1629 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1630 {
1631 buffer->cur += 2;
1632+ if (warn_bidi_p)
1633+ {
1634+ bidi::kind kind = get_bidi_ucn (buffer->cur,
1635+ buffer->cur[-1] == 'U');
1636+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1637+ /*ucn_p=*/true);
1638+ }
1639 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1640 state, &s, NULL, NULL))
1641 return true;
1642@@ -1486,6 +1856,7 @@ lex_identifier (cpp_reader *pfile, const
1643 const uchar *cur;
1644 unsigned int len;
1645 unsigned int hash = HT_HASHSTEP (0, *base);
1646+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1647
1648 cur = pfile->buffer->cur;
1649 if (! starts_ucn)
1650@@ -1509,6 +1880,8 @@ lex_identifier (cpp_reader *pfile, const
1651 pfile->buffer->cur++;
1652 }
1653 } while (forms_identifier_p (pfile, false, nst));
1654+ if (warn_bidi_p)
1655+ maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
1656 result = _cpp_interpret_identifier (pfile, base,
1657 pfile->buffer->cur - base);
1658 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1659@@ -1697,6 +2070,7 @@ lex_raw_string (cpp_reader *pfile, cpp_t
1660 {
1661 uchar raw_prefix[17];
1662 uchar temp_buffer[18];
1663+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1664 const uchar *orig_base;
1665 unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1666 enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1667@@ -1946,8 +2320,15 @@ lex_raw_string (cpp_reader *pfile, cpp_t
1668 cur = base = pfile->buffer->cur;
1669 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1670 }
1671+ else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
1672+ && warn_bidi_p)
1673+ maybe_warn_bidi_on_char (pfile, pos - 1, get_bidi_utf8 (pos - 1),
1674+ /*ucn_p=*/false);
1675 }
1676
1677+ if (warn_bidi_p)
1678+ maybe_warn_bidi_on_close (pfile, pos);
1679+
1680 if (CPP_OPTION (pfile, user_literals))
1681 {
1682 /* If a string format macro, say from inttypes.h, is placed touching
1683@@ -2042,15 +2423,27 @@ lex_string (cpp_reader *pfile, cpp_token
1684 else
1685 terminator = '>', type = CPP_HEADER_NAME;
1686
1687+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1688 for (;;)
1689 {
1690 cppchar_t c = *cur++;
1691
1692 /* In #include-style directives, terminators are not escapable. */
1693 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1694- cur++;
1695+ {
1696+ if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
1697+ {
1698+ bidi::kind kind = get_bidi_ucn (cur + 1, cur[0] == 'U');
1699+ maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/true);
1700+ }
1701+ cur++;
1702+ }
1703 else if (c == terminator)
1704- break;
1705+ {
1706+ if (warn_bidi_p)
1707+ maybe_warn_bidi_on_close (pfile, cur - 1);
1708+ break;
1709+ }
1710 else if (c == '\n')
1711 {
1712 cur--;
1713@@ -2067,6 +2460,11 @@ lex_string (cpp_reader *pfile, cpp_token
1714 }
1715 else if (c == '\0')
1716 saw_NUL = true;
1717+ else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1718+ {
1719+ bidi::kind kind = get_bidi_utf8 (cur - 1);
1720+ maybe_warn_bidi_on_char (pfile, cur - 1, kind, /*ucn_p=*/false);
1721+ }
1722 }
1723
1724 if (saw_NUL && !pfile->state.skipping)
diff --git a/meta/recipes-devtools/gcc/gcc/0004-CVE-2021-42574.patch b/meta/recipes-devtools/gcc/gcc/0004-CVE-2021-42574.patch
new file mode 100644
index 0000000000..877b8a6452
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0004-CVE-2021-42574.patch
@@ -0,0 +1,138 @@
1From 1a7f2c0774129750fdf73e9f1b78f0ce983c9ab3 Mon Sep 17 00:00:00 2001
2From: David Malcolm <dmalcolm@redhat.com>
3Date: Tue, 2 Nov 2021 09:54:32 -0400
4Subject: [PATCH] libcpp: escape non-ASCII source bytes in -Wbidi-chars=
5 [PR103026]
6MIME-Version: 1.0
7Content-Type: text/plain; charset=utf8
8Content-Transfer-Encoding: 8bit
9
10This flags rich_locations associated with -Wbidi-chars= so that
11non-ASCII bytes will be escaped when printing the source lines
12(using the diagnostics support I added in
13r12-4825-gbd5e882cf6e0def3dd1bc106075d59a303fe0d1e).
14
15In particular, this ensures that the printed source lines will
16be pure ASCII, and thus the visual ordering of the characters
17will be the same as the logical ordering.
18
19Before:
20
21 Wbidi-chars-1.c: In function ‘main’:
22 Wbidi-chars-1.c:6:43: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
23 6 | /*â® } â¦if (isAdmin)⩠⦠begin admins only */
24 | ^
25 Wbidi-chars-1.c:9:28: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
26 9 | /* end admins only â® { â¦*/
27 | ^
28
29 Wbidi-chars-11.c:6:15: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
30 6 | int LRE_âª_PDF_\u202c;
31 | ^
32 Wbidi-chars-11.c:8:19: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
33 8 | int LRE_\u202a_PDF_â¬_;
34 | ^
35 Wbidi-chars-11.c:10:28: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
36 10 | const char *s1 = "LRE_âª_PDF_\u202c";
37 | ^
38 Wbidi-chars-11.c:12:33: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
39 12 | const char *s2 = "LRE_\u202a_PDF_â¬";
40 | ^
41
42After:
43
44 Wbidi-chars-1.c: In function ‘main’:
45 Wbidi-chars-1.c:6:43: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
46 6 | /*<U+202E> } <U+2066>if (isAdmin)<U+2069> <U+2066> begin admins only */
47 | ^
48 Wbidi-chars-1.c:9:28: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
49 9 | /* end admins only <U+202E> { <U+2066>*/
50 | ^
51
52 Wbidi-chars-11.c:6:15: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
53 6 | int LRE_<U+202A>_PDF_\u202c;
54 | ^
55 Wbidi-chars-11.c:8:19: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
56 8 | int LRE_\u202a_PDF_<U+202C>_;
57 | ^
58 Wbidi-chars-11.c:10:28: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
59 10 | const char *s1 = "LRE_<U+202A>_PDF_\u202c";
60 | ^
61 Wbidi-chars-11.c:12:33: warning: UTF-8 vs UCN mismatch when closing a context by "U+202C (POP DIRECTIONAL FORMATTING)" [-Wbidi-chars=]
62 12 | const char *s2 = "LRE_\u202a_PDF_<U+202C>";
63 | ^
64
65libcpp/ChangeLog:
66 PR preprocessor/103026
67 * lex.c (maybe_warn_bidi_on_close): Use a rich_location
68 and call set_escape_on_output (true) on it.
69 (maybe_warn_bidi_on_char): Likewise.
70
71Signed-off-by: David Malcolm <dmalcolm@redhat.com>
72
73CVE: CVE-2021-42574
74Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=1a7f2c0774129750fdf73e9f1b78f0ce983c9ab3]
75Signed-off-by: Pgowda <pgowda.cve@gmail.com>
76
77---
78 libcpp/lex.c | 29 +++++++++++++++++------------
79 1 file changed, 17 insertions(+), 12 deletions(-)
80
81diff --git a/libcpp/lex.c b/libcpp/lex.c
82--- a/libcpp/lex.c 2021-12-14 20:44:11.647815287 -0800
83+++ b/libcpp/lex.c 2021-12-14 20:43:38.008383220 -0800
84@@ -1427,9 +1427,11 @@ maybe_warn_bidi_on_close (cpp_reader *pf
85 const location_t loc
86 = linemap_position_for_column (pfile->line_table,
87 CPP_BUF_COLUMN (pfile->buffer, p));
88- cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
89- "unpaired UTF-8 bidirectional control character "
90- "detected");
91+ rich_location rich_loc (pfile->line_table, loc);
92+ rich_loc.set_escape_on_output (true);
93+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
94+ "unpaired UTF-8 bidirectional control character "
95+ "detected");
96 }
97 /* We're done with this context. */
98 bidi::on_close ();
99@@ -1454,6 +1456,9 @@ maybe_warn_bidi_on_char (cpp_reader *pfi
100 const location_t loc
101 = linemap_position_for_column (pfile->line_table,
102 CPP_BUF_COLUMN (pfile->buffer, p));
103+ rich_location rich_loc (pfile->line_table, loc);
104+ rich_loc.set_escape_on_output (true);
105+
106 /* It seems excessive to warn about a PDI/PDF that is closing
107 an opened context because we've already warned about the
108 opening character. Except warn when we have a UCN x UTF-8
109@@ -1462,20 +1467,20 @@ maybe_warn_bidi_on_char (cpp_reader *pfi
110 {
111 if (warn_bidi == bidirectional_unpaired
112 && bidi::current_ctx_ucn_p () != ucn_p)
113- cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
114- "UTF-8 vs UCN mismatch when closing "
115- "a context by \"%s\"", bidi::to_str (kind));
116+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
117+ "UTF-8 vs UCN mismatch when closing "
118+ "a context by \"%s\"", bidi::to_str (kind));
119 }
120 else if (warn_bidi == bidirectional_any)
121 {
122 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
123- cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
124- "\"%s\" is closing an unopened context",
125- bidi::to_str (kind));
126+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
127+ "\"%s\" is closing an unopened context",
128+ bidi::to_str (kind));
129 else
130- cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
131- "found problematic Unicode character \"%s\"",
132- bidi::to_str (kind));
133+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
134+ "found problematic Unicode character \"%s\"",
135+ bidi::to_str (kind));
136 }
137 }
138 /* We're done with this context. */
diff --git a/meta/recipes-devtools/gcc/gcc/0005-CVE-2021-42574.patch b/meta/recipes-devtools/gcc/gcc/0005-CVE-2021-42574.patch
new file mode 100644
index 0000000000..6e983a67b6
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0005-CVE-2021-42574.patch
@@ -0,0 +1,575 @@
1From bef32d4a28595e933f24fef378cf052a30b674a7 Mon Sep 17 00:00:00 2001
2From: David Malcolm <dmalcolm@redhat.com>
3Date: Tue, 2 Nov 2021 15:45:22 -0400
4Subject: [PATCH] libcpp: capture and underline ranges in -Wbidi-chars=
5 [PR103026]
6MIME-Version: 1.0
7Content-Type: text/plain; charset=utf8
8Content-Transfer-Encoding: 8bit
9
10This patch converts the bidi::vec to use a struct so that we can
11capture location_t values for the bidirectional control characters.
12
13Before:
14
15 Wbidi-chars-1.c: In function ‘main’:
16 Wbidi-chars-1.c:6:43: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
17 6 | /*<U+202E> } <U+2066>if (isAdmin)<U+2069> <U+2066> begin admins only */
18 | ^
19 Wbidi-chars-1.c:9:28: warning: unpaired UTF-8 bidirectional control character detected [-Wbidi-chars=]
20 9 | /* end admins only <U+202E> { <U+2066>*/
21 | ^
22
23After:
24
25 Wbidi-chars-1.c: In function ‘main’:
26 Wbidi-chars-1.c:6:43: warning: unpaired UTF-8 bidirectional control characters detected [-Wbidi-chars=]
27 6 | /*<U+202E> } <U+2066>if (isAdmin)<U+2069> <U+2066> begin admins only */
28 | ~~~~~~~~ ~~~~~~~~ ^
29 | | | |
30 | | | end of bidirectional context
31 | U+202E (RIGHT-TO-LEFT OVERRIDE) U+2066 (LEFT-TO-RIGHT ISOLATE)
32 Wbidi-chars-1.c:9:28: warning: unpaired UTF-8 bidirectional control characters detected [-Wbidi-chars=]
33 9 | /* end admins only <U+202E> { <U+2066>*/
34 | ~~~~~~~~ ~~~~~~~~ ^
35 | | | |
36 | | | end of bidirectional context
37 | | U+2066 (LEFT-TO-RIGHT ISOLATE)
38 | U+202E (RIGHT-TO-LEFT OVERRIDE)
39
40Signed-off-by: David Malcolm <dmalcolm@redhat.com>
41
42gcc/testsuite/ChangeLog:
43 PR preprocessor/103026
44 * c-c++-common/Wbidi-chars-ranges.c: New test.
45
46libcpp/ChangeLog:
47 PR preprocessor/103026
48 * lex.c (struct bidi::context): New.
49 (bidi::vec): Convert to a vec of context rather than unsigned
50 char.
51 (bidi::ctx_at): Rename to...
52 (bidi::pop_kind_at): ...this and reimplement for above change.
53 (bidi::current_ctx): Update for change to vec.
54 (bidi::current_ctx_ucn_p): Likewise.
55 (bidi::current_ctx_loc): New.
56 (bidi::on_char): Update for usage of context struct. Add "loc"
57 param and pass it when pushing contexts.
58 (get_location_for_byte_range_in_cur_line): New.
59 (get_bidi_utf8): Rename to...
60 (get_bidi_utf8_1): ...this, reintroducing...
61 (get_bidi_utf8): ...as a wrapper, setting *OUT when the result is
62 not NONE.
63 (get_bidi_ucn): Rename to...
64 (get_bidi_ucn_1): ...this, reintroducing...
65 (get_bidi_ucn): ...as a wrapper, setting *OUT when the result is
66 not NONE.
67 (class unpaired_bidi_rich_location): New.
68 (maybe_warn_bidi_on_close): Use unpaired_bidi_rich_location when
69 reporting on unpaired bidi chars. Split into singular vs plural
70 spellings.
71 (maybe_warn_bidi_on_char): Pass in a location_t rather than a
72 const uchar * and use it when emitting warnings, and when calling
73 bidi::on_char.
74 (_cpp_skip_block_comment): Capture location when kind is not NONE
75 and pass it to maybe_warn_bidi_on_char.
76 (skip_line_comment): Likewise.
77 (forms_identifier_p): Likewise.
78 (lex_raw_string): Likewise.
79 (lex_string): Likewise.
80
81Signed-off-by: David Malcolm <dmalcolm@redhat.com>
82
83CVE: CVE-2021-42574
84Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bef32d4a28595e933f24fef378cf052a30b674a7]
85Signed-off-by: Pgowda <pgowda.cve@gmail.com>
86
87---
88 .../c-c++-common/Wbidi-chars-ranges.c | 54 ++++
89 libcpp/lex.c | 251 ++++++++++++++----
90 2 files changed, 257 insertions(+), 48 deletions(-)
91 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-ranges.c
92
93diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-ranges.c b/gcc/testsuite/c-c++-common/Wbidi-chars-ranges.c
94--- a/gcc/testsuite/c-c++-common/Wbidi-chars-ranges.c 1969-12-31 16:00:00.000000000 -0800
95+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-ranges.c 2021-12-25 01:39:55.116281847 -0800
96@@ -0,0 +1,54 @@
97+/* PR preprocessor/103026 */
98+/* { dg-do compile } */
99+/* { dg-options "-Wbidi-chars=unpaired -fdiagnostics-show-caret" } */
100+/* Verify that we escape and underline pertinent bidirectional
101+ control characters when quoting the source. */
102+
103+int test_unpaired_bidi () {
104+ int isAdmin = 0;
105+ /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */
106+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
107+#if 0
108+ { dg-begin-multiline-output "" }
109+ /*<U+202E> } <U+2066>if (isAdmin)<U+2069> <U+2066> begin admins only */
110+ ~~~~~~~~ ~~~~~~~~ ^
111+ | | |
112+ | | end of bidirectional context
113+ U+202E (RIGHT-TO-LEFT OVERRIDE) U+2066 (LEFT-TO-RIGHT ISOLATE)
114+ { dg-end-multiline-output "" }
115+#endif
116+
117+ __builtin_printf("You are an admin.\n");
118+ /* end admins only ‮ { ⁦*/
119+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
120+#if 0
121+ { dg-begin-multiline-output "" }
122+ /* end admins only <U+202E> { <U+2066>*/
123+ ~~~~~~~~ ~~~~~~~~ ^
124+ | | |
125+ | | end of bidirectional context
126+ | U+2066 (LEFT-TO-RIGHT ISOLATE)
127+ U+202E (RIGHT-TO-LEFT OVERRIDE)
128+ { dg-end-multiline-output "" }
129+#endif
130+
131+ return 0;
132+}
133+
134+int LRE_‪_PDF_\u202c;
135+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
136+#if 0
137+ { dg-begin-multiline-output "" }
138+ int LRE_<U+202A>_PDF_\u202c;
139+ ~~~~~~~~ ^~~~~~
140+ { dg-end-multiline-output "" }
141+#endif
142+
143+const char *s1 = "LRE_‪_PDF_\u202c";
144+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
145+#if 0
146+ { dg-begin-multiline-output "" }
147+ const char *s1 = "LRE_<U+202A>_PDF_\u202c";
148+ ~~~~~~~~ ^~~~~~
149+ { dg-end-multiline-output "" }
150+#endif
151diff --git a/libcpp/lex.c b/libcpp/lex.c
152--- a/libcpp/lex.c 2021-12-25 01:41:16.522868808 -0800
153+++ b/libcpp/lex.c 2021-12-25 06:28:58.530680302 -0800
154@@ -1172,11 +1172,34 @@ namespace bidi {
155 /* All the UTF-8 encodings of bidi characters start with E2. */
156 constexpr uchar utf8_start = 0xe2;
157
158+ struct context
159+ {
160+ context () {}
161+ context (location_t loc, kind k, bool pdf, bool ucn)
162+ : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
163+ {
164+ }
165+
166+ kind get_pop_kind () const
167+ {
168+ return m_pdf ? kind::PDF : kind::PDI;
169+ }
170+ bool ucn_p () const
171+ {
172+ return m_ucn;
173+ }
174+
175+ location_t m_loc;
176+ kind m_kind;
177+ unsigned m_pdf : 1;
178+ unsigned m_ucn : 1;
179+ };
180+
181 /* A vector holding currently open bidi contexts. We use a char for
182 each context, its LSB is 1 if it represents a PDF context, 0 if it
183 represents a PDI context. The next bit is 1 if this context was open
184 by a bidi character written as a UCN, and 0 when it was UTF-8. */
185- semi_embedded_vec <unsigned char, 16> vec;
186+ semi_embedded_vec <context, 16> vec;
187
188 /* Close the whole comment/identifier/string literal/character constant
189 context. */
190@@ -1193,19 +1216,19 @@ namespace bidi {
191 vec.truncate (len - 1);
192 }
193
194- /* Return the context of the Ith element. */
195- kind ctx_at (unsigned int i)
196+ /* Return the pop kind of the context of the Ith element. */
197+ kind pop_kind_at (unsigned int i)
198 {
199- return (vec[i] & 1) ? kind::PDF : kind::PDI;
200+ return vec[i].get_pop_kind ();
201 }
202
203- /* Return which context is currently opened. */
204+ /* Return the pop kind of the context that is currently opened. */
205 kind current_ctx ()
206 {
207 unsigned int len = vec.count ();
208 if (len == 0)
209 return kind::NONE;
210- return ctx_at (len - 1);
211+ return vec[len - 1].get_pop_kind ();
212 }
213
214 /* Return true if the current context comes from a UCN origin, that is,
215@@ -1214,11 +1237,19 @@ namespace bidi {
216 {
217 unsigned int len = vec.count ();
218 gcc_checking_assert (len > 0);
219- return (vec[len - 1] >> 1) & 1;
220+ return vec[len - 1].m_ucn;
221 }
222
223- /* We've read a bidi char, update the current vector as necessary. */
224- void on_char (kind k, bool ucn_p)
225+ location_t current_ctx_loc ()
226+ {
227+ unsigned int len = vec.count ();
228+ gcc_checking_assert (len > 0);
229+ return vec[len - 1].m_loc;
230+ }
231+
232+ /* We've read a bidi char, update the current vector as necessary.
233+ LOC is only valid when K is not kind::NONE. */
234+ void on_char (kind k, bool ucn_p, location_t loc)
235 {
236 switch (k)
237 {
238@@ -1226,12 +1257,12 @@ namespace bidi {
239 case kind::RLE:
240 case kind::LRO:
241 case kind::RLO:
242- vec.push (ucn_p ? 3u : 1u);
243+ vec.push (context (loc, k, true, ucn_p));
244 break;
245 case kind::LRI:
246 case kind::RLI:
247 case kind::FSI:
248- vec.push (ucn_p ? 2u : 0u);
249+ vec.push (context (loc, k, false, ucn_p));
250 break;
251 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
252 whose scope has not yet been terminated. */
253@@ -1245,7 +1276,7 @@ namespace bidi {
254 yet been terminated. */
255 case kind::PDI:
256 for (int i = vec.count () - 1; i >= 0; --i)
257- if (ctx_at (i) == kind::PDI)
258+ if (pop_kind_at (i) == kind::PDI)
259 {
260 vec.truncate (i);
261 break;
262@@ -1295,10 +1326,47 @@ namespace bidi {
263 }
264 }
265
266+/* Get location_t for the range of bytes [START, START + NUM_BYTES)
267+ within the current line in FILE, with the caret at START. */
268+
269+static location_t
270+get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
271+ const unsigned char *const start,
272+ size_t num_bytes)
273+{
274+ gcc_checking_assert (num_bytes > 0);
275+
276+ /* CPP_BUF_COLUMN and linemap_position_for_column both refer
277+ to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
278+ whereas linemap_position_for_column is 1-based. */
279+
280+ /* Get 0-based offsets within the line. */
281+ size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
282+ size_t end_offset = start_offset + num_bytes - 1;
283+
284+ /* Now convert to location_t, where "columns" are 1-based byte offsets. */
285+ location_t start_loc = linemap_position_for_column (pfile->line_table,
286+ start_offset + 1);
287+ location_t end_loc = linemap_position_for_column (pfile->line_table,
288+ end_offset + 1);
289+
290+ if (start_loc == end_loc)
291+ return start_loc;
292+
293+ source_range src_range;
294+ src_range.m_start = start_loc;
295+ src_range.m_finish = end_loc;
296+ location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
297+ start_loc,
298+ src_range,
299+ NULL);
300+ return combined_loc;
301+}
302+
303 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
304
305 static bidi::kind
306-get_bidi_utf8 (const unsigned char *const p)
307+get_bidi_utf8_1 (const unsigned char *const p)
308 {
309 gcc_checking_assert (p[0] == bidi::utf8_start);
310
311@@ -1340,10 +1408,25 @@ get_bidi_utf8 (const unsigned char *cons
312 return bidi::kind::NONE;
313 }
314
315+/* Parse a sequence of 3 bytes starting with P and return its bidi code.
316+ If the kind is not NONE, write the location to *OUT.*/
317+
318+static bidi::kind
319+get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
320+{
321+ bidi::kind result = get_bidi_utf8_1 (p);
322+ if (result != bidi::kind::NONE)
323+ {
324+ /* We have a sequence of 3 bytes starting at P. */
325+ *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
326+ }
327+ return result;
328+}
329+
330 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
331
332 static bidi::kind
333-get_bidi_ucn (const unsigned char *p, bool is_U)
334+get_bidi_ucn_1 (const unsigned char *p, bool is_U)
335 {
336 /* 6.4.3 Universal Character Names
337 \u hex-quad
338@@ -1412,6 +1495,62 @@ get_bidi_ucn (const unsigned char *p, bo
339 return bidi::kind::NONE;
340 }
341
342+/* Parse a UCN where P points just past \u or \U and return its bidi code.
343+ If the kind is not NONE, write the location to *OUT.*/
344+
345+static bidi::kind
346+get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
347+ location_t *out)
348+{
349+ bidi::kind result = get_bidi_ucn_1 (p, is_U);
350+ if (result != bidi::kind::NONE)
351+ {
352+ const unsigned char *start = p - 2;
353+ size_t num_bytes = 2 + (is_U ? 8 : 4);
354+ *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
355+ }
356+ return result;
357+}
358+
359+/* Subclass of rich_location for reporting on unpaired UTF-8
360+ bidirectional control character(s).
361+ Escape the source lines on output, and show all unclosed
362+ bidi context, labelling everything. */
363+
364+class unpaired_bidi_rich_location : public rich_location
365+{
366+ public:
367+ class custom_range_label : public range_label
368+ {
369+ public:
370+ label_text get_text (unsigned range_idx) const FINAL OVERRIDE
371+ {
372+ /* range 0 is the primary location; each subsequent range i + 1
373+ is for bidi::vec[i]. */
374+ if (range_idx > 0)
375+ {
376+ const bidi::context &ctxt (bidi::vec[range_idx - 1]);
377+ return label_text::borrow (bidi::to_str (ctxt.m_kind));
378+ }
379+ else
380+ return label_text::borrow (_("end of bidirectional context"));
381+ }
382+ };
383+
384+ unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
385+ : rich_location (pfile->line_table, loc, &m_custom_label)
386+ {
387+ set_escape_on_output (true);
388+ for (unsigned i = 0; i < bidi::vec.count (); i++)
389+ add_range (bidi::vec[i].m_loc,
390+ SHOW_RANGE_WITHOUT_CARET,
391+ &m_custom_label);
392+ }
393+
394+ private:
395+ custom_range_label m_custom_label;
396+};
397+
398 /* We're closing a bidi context, that is, we've encountered a newline,
399 are closing a C-style comment, or are at the end of a string literal,
400 character constant, or identifier. Warn if this context was not
401@@ -1427,11 +1566,17 @@ maybe_warn_bidi_on_close (cpp_reader *pf
402 const location_t loc
403 = linemap_position_for_column (pfile->line_table,
404 CPP_BUF_COLUMN (pfile->buffer, p));
405- rich_location rich_loc (pfile->line_table, loc);
406- rich_loc.set_escape_on_output (true);
407- cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
408- "unpaired UTF-8 bidirectional control character "
409- "detected");
410+ unpaired_bidi_rich_location rich_loc (pfile, loc);
411+ /* cpp_callbacks doesn't yet have a way to handle singular vs plural
412+ forms of a diagnostic, so fake it for now. */
413+ if (bidi::vec.count () > 1)
414+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
415+ "unpaired UTF-8 bidirectional control characters "
416+ "detected");
417+ else
418+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
419+ "unpaired UTF-8 bidirectional control character "
420+ "detected");
421 }
422 /* We're done with this context. */
423 bidi::on_close ();
424@@ -1439,12 +1584,13 @@ maybe_warn_bidi_on_close (cpp_reader *pf
425
426 /* We're at the beginning or in the middle of an identifier/comment/string
427 literal/character constant. Warn if we've encountered a bidi character.
428- KIND says which bidi character it was; P points to it in the character
429- stream. UCN_P is true iff this bidi character was written as a UCN. */
430+ KIND says which bidi control character it was; UCN_P is true iff this bidi
431+ control character was written as a UCN. LOC is the location of the
432+ character, but is only valid if KIND != bidi::kind::NONE. */
433
434 static void
435-maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
436- bool ucn_p)
437+maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
438+ bool ucn_p, location_t loc)
439 {
440 if (__builtin_expect (kind == bidi::kind::NONE, 1))
441 return;
442@@ -1453,9 +1599,6 @@ maybe_warn_bidi_on_char (cpp_reader *pfi
443
444 if (warn_bidi != bidirectional_none)
445 {
446- const location_t loc
447- = linemap_position_for_column (pfile->line_table,
448- CPP_BUF_COLUMN (pfile->buffer, p));
449 rich_location rich_loc (pfile->line_table, loc);
450 rich_loc.set_escape_on_output (true);
451
452@@ -1467,9 +1610,12 @@ maybe_warn_bidi_on_char (cpp_reader *pfi
453 {
454 if (warn_bidi == bidirectional_unpaired
455 && bidi::current_ctx_ucn_p () != ucn_p)
456- cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
457- "UTF-8 vs UCN mismatch when closing "
458- "a context by \"%s\"", bidi::to_str (kind));
459+ {
460+ rich_loc.add_range (bidi::current_ctx_loc ());
461+ cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
462+ "UTF-8 vs UCN mismatch when closing "
463+ "a context by \"%s\"", bidi::to_str (kind));
464+ }
465 }
466 else if (warn_bidi == bidirectional_any)
467 {
468@@ -1484,7 +1630,7 @@ maybe_warn_bidi_on_char (cpp_reader *pfi
469 }
470 }
471 /* We're done with this context. */
472- bidi::on_char (kind, ucn_p);
473+ bidi::on_char (kind, ucn_p, loc);
474 }
475
476 /* Skip a C-style block comment. We find the end of the comment by
477@@ -1552,8 +1698,9 @@ _cpp_skip_block_comment (cpp_reader *pfi
478 a bidirectional control character. */
479 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
480 {
481- bidi::kind kind = get_bidi_utf8 (cur - 1);
482- maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/false);
483+ location_t loc;
484+ bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
485+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
486 }
487 }
488
489@@ -1586,9 +1733,9 @@ skip_line_comment (cpp_reader *pfile)
490 {
491 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
492 {
493- bidi::kind kind = get_bidi_utf8 (buffer->cur);
494- maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
495- /*ucn_p=*/false);
496+ location_t loc;
497+ bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
498+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
499 }
500 buffer->cur++;
501 }
502@@ -1734,9 +1881,9 @@ forms_identifier_p (cpp_reader *pfile, i
503 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
504 && warn_bidi_p)
505 {
506- bidi::kind kind = get_bidi_utf8 (buffer->cur);
507- maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
508- /*ucn_p=*/false);
509+ location_t loc;
510+ bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
511+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
512 }
513 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
514 state, &s))
515@@ -1748,10 +1895,12 @@ forms_identifier_p (cpp_reader *pfile, i
516 buffer->cur += 2;
517 if (warn_bidi_p)
518 {
519- bidi::kind kind = get_bidi_ucn (buffer->cur,
520- buffer->cur[-1] == 'U');
521- maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
522- /*ucn_p=*/true);
523+ location_t loc;
524+ bidi::kind kind = get_bidi_ucn (pfile,
525+ buffer->cur,
526+ buffer->cur[-1] == 'U',
527+ &loc);
528+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
529 }
530 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
531 state, &s, NULL, NULL))
532@@ -2327,12 +2476,15 @@ lex_raw_string (cpp_reader *pfile, cpp_t
533 }
534 else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
535 && warn_bidi_p)
536- maybe_warn_bidi_on_char (pfile, pos - 1, get_bidi_utf8 (pos - 1),
537- /*ucn_p=*/false);
538+ {
539+ location_t loc;
540+ bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
541+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
542+ }
543 }
544
545 if (warn_bidi_p)
546- maybe_warn_bidi_on_close (pfile, pos);
547+ maybe_warn_bidi_on_close (pfile, cur);
548
549 if (CPP_OPTION (pfile, user_literals))
550 {
551@@ -2438,8 +2590,10 @@ lex_string (cpp_reader *pfile, cpp_token
552 {
553 if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
554 {
555- bidi::kind kind = get_bidi_ucn (cur + 1, cur[0] == 'U');
556- maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/true);
557+ location_t loc;
558+ bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
559+ &loc);
560+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
561 }
562 cur++;
563 }
564@@ -2467,8 +2621,9 @@ lex_string (cpp_reader *pfile, cpp_token
565 saw_NUL = true;
566 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
567 {
568- bidi::kind kind = get_bidi_utf8 (cur - 1);
569- maybe_warn_bidi_on_char (pfile, cur - 1, kind, /*ucn_p=*/false);
570+ location_t loc;
571+ bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
572+ maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
573 }
574 }
575