1 files changed, 283 insertions, 0 deletions
diff --git a/meta/recipes-core/expat/expat/CVE-2022-25235.patch b/meta/recipes-core/expat/expat/CVE-2022-25235.patch
new file mode 100644
index 0000000000..be9182a5c1
--- /dev/null
+++ b/meta/recipes-core/expat/expat/CVE-2022-25235.patch
@@ -0,0 +1,283 @@
+From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
+From: Sebastian Pipping <sebastian@pipping.org>
+Date: Tue, 8 Feb 2022 17:37:14 +0100
+Subject: [PATCH] lib: Drop unused macro UTF8_GET_NAMING
+Upstream-Status: Backport
+https://github.com/libexpat/libexpat/pull/562/commits
+CVE: CVE-2022-25235
+Signed-off-by: Steve Sakoman <steve@sakoman.com>
+---
+ expat/lib/xmltok.c | 5 -----
+ 1 file changed, 5 deletions(-)
+diff --git a/lib/xmltok.c b/lib/xmltok.c
+index a72200e8..3bddf125 100644
+--- a/lib/xmltok.c
+++ b/lib/xmltok.c
+@@ -95,11 +95,6 @@
+         + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
+    & (1u << (((byte)[2]) & 0x1F)))
+ 
+-#define UTF8_GET_NAMING(pages, p, n)                                           \
+-  ((n) == 2                                                                    \
+-       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
+-       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
+-
+ /* Detection of invalid UTF-8 sequences is based on Table 3.1B
+    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
+    with the additional restriction of not allowing the Unicode
+From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001
+From: Sebastian Pipping <sebastian@pipping.org>
+Date: Tue, 8 Feb 2022 04:32:20 +0100
+Subject: [PATCH] lib: Add missing validation of encoding (CVE-2022-25235)
+---
+ expat/lib/xmltok_impl.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
+index 0430591b4..64a3b2c15 100644
+--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
+@@ -61,7 +61,7 @@
+   case BT_LEAD##n:                                                             \
+     if (end - ptr < n)                                                         \
+       return XML_TOK_PARTIAL_CHAR;                                             \
+-    if (! IS_NAME_CHAR(enc, ptr, n)) {                                         \
++    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
+       *nextTokPtr = ptr;                                                       \
+       return XML_TOK_INVALID;                                                  \
+     }                                                                          \
+@@ -90,7 +90,7 @@
+   case BT_LEAD##n:                                                             \
+     if (end - ptr < n)                                                         \
+       return XML_TOK_PARTIAL_CHAR;                                             \
+-    if (! IS_NMSTRT_CHAR(enc, ptr, n)) {                                       \
++    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
+       *nextTokPtr = ptr;                                                       \
+       return XML_TOK_INVALID;                                                  \
+     }                                                                          \
+@@ -1134,6 +1134,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
+   case BT_LEAD##n:                                                             \
+     if (end - ptr < n)                                                         \
+       return XML_TOK_PARTIAL_CHAR;                                             \
++    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
++      *nextTokPtr = ptr;                                                       \
++      return XML_TOK_INVALID;                                                  \
++    }                                                                          \
+     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
+       ptr += n;                                                                \
+       tok = XML_TOK_NAME;                                                      \
+From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001
+From: Sebastian Pipping <sebastian@pipping.org>
+Date: Wed, 9 Feb 2022 01:00:38 +0100
+Subject: [PATCH] lib: Add comments to BT_LEAD* cases where encoding has
+ already been validated
+---
+ expat/lib/xmltok_impl.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
+index 64a3b2c1..84ff35f9 100644
+--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
+@@ -1266,7 +1266,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
+     switch (BYTE_TYPE(enc, ptr)) {
+ #  define LEAD_CASE(n)                                                         \
+   case BT_LEAD##n:                                                             \
+-    ptr += n;                                                                  \
++    ptr += n; /* NOTE: The encoding has already been validated. */             \
+     break;
+       LEAD_CASE(2)
+       LEAD_CASE(3)
+@@ -1335,7 +1335,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
+     switch (BYTE_TYPE(enc, ptr)) {
+ #  define LEAD_CASE(n)                                                         \
+   case BT_LEAD##n:                                                             \
+-    ptr += n;                                                                  \
++    ptr += n; /* NOTE: The encoding has already been validated. */             \
+     break;
+       LEAD_CASE(2)
+       LEAD_CASE(3)
+@@ -1514,7 +1514,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
+       state = inName;                                                          \
+     }
+ #  define LEAD_CASE(n)                                                         \
+-  case BT_LEAD##n:                                                             \
++  case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
+     START_NAME ptr += (n - MINBPC(enc));                                       \
+     break;
+       LEAD_CASE(2)
+@@ -1726,7 +1726,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
+     switch (BYTE_TYPE(enc, ptr)) {
+ #  define LEAD_CASE(n)                                                         \
+   case BT_LEAD##n:                                                             \
+-    ptr += n;                                                                  \
++    ptr += n; /* NOTE: The encoding has already been validated. */             \
+     break;
+       LEAD_CASE(2)
+       LEAD_CASE(3)
+@@ -1771,7 +1771,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
+     switch (BYTE_TYPE(enc, ptr)) {
+ #  define LEAD_CASE(n)                                                         \
+   case BT_LEAD##n:                                                             \
+-    ptr += n;                                                                  \
++    ptr += n; /* NOTE: The encoding has already been validated. */             \
+     break;
+       LEAD_CASE(2)
+       LEAD_CASE(3)
+From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
+From: Sebastian Pipping <sebastian@pipping.org>
+Date: Tue, 8 Feb 2022 04:06:21 +0100
+Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235)
+---
+ expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 109 insertions(+)
+diff --git a/tests/runtests.c b/tests/runtests.c
+index bc5344b1..9b155b82 100644
+--- a/tests/runtests.c
+++ b/tests/runtests.c
+@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
+ }
+ END_TEST
+ 
++START_TEST(test_utf8_in_start_tags) {
++  struct test_case {
++    bool goodName;
++    bool goodNameStart;
++    const char *tagName;
++  };
++
++  // The idea with the tests below is this:
++  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
++  // go to isNever and are hence not a concern.
++  //
++  // We start with a character that is a valid name character
++  // (or even name-start character, see XML 1.0r4 spec) and then we flip
++  // single bits at places where (1) the result leaves the UTF-8 encoding space
++  // and (2) we stay in the same n-byte sequence family.
++  //
++  // The flipped bits are highlighted in angle brackets in comments,
++  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
++  // the most significant bit to 1 to leave UTF-8 encoding space.
++  struct test_case cases[] = {
++      // 1-byte UTF-8: [0xxx xxxx]
++      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
++      {false, false, "\xBA"}, // [<1>011 1010]
++      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
++      {false, false, "\xB9"}, // [<1>011 1001]
++
++      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
++      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
++                                  // Arabic small waw U+06E5
++      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
++      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
++      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
++      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
++                                  // combining char U+0301
++      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
++      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
++      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
++
++      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
++      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
++                                      // Devanagari Letter A U+0905
++      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
++      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
++      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
++      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
++      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
++      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
++                                      // combining char U+0901
++      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
++      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
++      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
++      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
++      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
++  };
++  const bool atNameStart[] = {true, false};
++
++  size_t i = 0;
++  char doc[1024];
++  size_t failCount = 0;
++
++  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
++    size_t j = 0;
++    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
++      const bool expectedSuccess
++          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
++      sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
++      XML_Parser parser = XML_ParserCreate(NULL);
++
++      const enum XML_Status status
++          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
++
++      bool success = true;
++      if ((status == XML_STATUS_OK) != expectedSuccess) {
++        success = false;
++      }
++      if ((status == XML_STATUS_ERROR)
++          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
++        success = false;
++      }
++
++      if (! success) {
++        fprintf(
++            stderr,
++            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
++            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
++            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
++        failCount++;
++      }
++
++      XML_ParserFree(parser);
++    }
++  }
++
++  if (failCount > 0) {
++    fail("UTF-8 regression detected");
++  }
++}
++END_TEST
++
+ /* Test trailing spaces in elements are accepted */
+ static void XMLCALL
+ record_element_end_handler(void *userData, const XML_Char *name) {
+@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
+ }
+ END_TEST
+ 
++START_TEST(test_bad_doctype_utf8) {
++  const char *text = "<!DOCTYPE \xDB\x25"
++                     "doc><doc/>"; // [1101 1011] [<0>010 0101]
++  expect_failure(text, XML_ERROR_INVALID_TOKEN,
++                 "Invalid UTF-8 in DOCTYPE not faulted");
++}
++END_TEST
++
+ START_TEST(test_bad_doctype_utf16) {
+   const char text[] =
+       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
+@@ -11870,6 +11977,7 @@ make_suite(void) {
+   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
+   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
+   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
++  tcase_add_test(tc_basic, test_utf8_in_start_tags);
+   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
+   tcase_add_test(tc_basic, test_utf16_attribute);
+   tcase_add_test(tc_basic, test_utf16_second_attr);
+@@ -11878,6 +11986,7 @@ make_suite(void) {
+   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
+   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
+   tcase_add_test(tc_basic, test_bad_doctype);
++  tcase_add_test(tc_basic, test_bad_doctype_utf8);
+   tcase_add_test(tc_basic, test_bad_doctype_utf16);
+   tcase_add_test(tc_basic, test_bad_doctype_plus);
+   tcase_add_test(tc_basic, test_bad_doctype_star);

diff --git a/meta/recipes-core/expat/expat/CVE-2022-25235.patch b/meta/recipes-core/expat/expat/CVE-2022-25235.patch
new file mode 100644
index 0000000000..be9182a5c1
--- /dev/null
+++ b/meta/recipes-core/expat/expat/CVE-2022-25235.patch
@@ -0,0 +1,283 @@
	1	From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
	2	From: Sebastian Pipping <sebastian@pipping.org>
	3	Date: Tue, 8 Feb 2022 17:37:14 +0100
	4	Subject: [PATCH] lib: Drop unused macro UTF8_GET_NAMING
	5
	6	Upstream-Status: Backport
	7	https://github.com/libexpat/libexpat/pull/562/commits
	8
	9	CVE: CVE-2022-25235
	10
	11	Signed-off-by: Steve Sakoman <steve@sakoman.com>
	12
	13	---
	14	expat/lib/xmltok.c | 5 -----
	15	1 file changed, 5 deletions(-)
	16
	17	diff --git a/lib/xmltok.c b/lib/xmltok.c
	18	index a72200e8..3bddf125 100644
	19	--- a/lib/xmltok.c
	20	+++ b/lib/xmltok.c
	21	@@ -95,11 +95,6 @@
	22	+ ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
	23	& (1u << (((byte)[2]) & 0x1F)))
	24
	25	-#define UTF8_GET_NAMING(pages, p, n) \
	26	- ((n) == 2 \
	27	- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
	28	- : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
	29	-
	30	/* Detection of invalid UTF-8 sequences is based on Table 3.1B
	31	of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
	32	with the additional restriction of not allowing the Unicode
	33	From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001
	34	From: Sebastian Pipping <sebastian@pipping.org>
	35	Date: Tue, 8 Feb 2022 04:32:20 +0100
	36	Subject: [PATCH] lib: Add missing validation of encoding (CVE-2022-25235)
	37
	38	---
	39	expat/lib/xmltok_impl.c | 8 ++++++--
	40	1 file changed, 6 insertions(+), 2 deletions(-)
	41
	42	diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
	43	index 0430591b4..64a3b2c15 100644
	44	--- a/lib/xmltok_impl.c
	45	+++ b/lib/xmltok_impl.c
	46	@@ -61,7 +61,7 @@
	47	case BT_LEAD##n: \
	48	if (end - ptr < n) \
	49	return XML_TOK_PARTIAL_CHAR; \
	50	- if (! IS_NAME_CHAR(enc, ptr, n)) { \
	51	+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
	52	*nextTokPtr = ptr; \
	53	return XML_TOK_INVALID; \
	54	} \
	55	@@ -90,7 +90,7 @@
	56	case BT_LEAD##n: \
	57	if (end - ptr < n) \
	58	return XML_TOK_PARTIAL_CHAR; \
	59	- if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \
	60	+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
	61	*nextTokPtr = ptr; \
	62	return XML_TOK_INVALID; \
	63	} \
	64	@@ -1134,6 +1134,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
	65	case BT_LEAD##n: \
	66	if (end - ptr < n) \
	67	return XML_TOK_PARTIAL_CHAR; \
	68	+ if (IS_INVALID_CHAR(enc, ptr, n)) { \
	69	+ *nextTokPtr = ptr; \
	70	+ return XML_TOK_INVALID; \
	71	+ } \
	72	if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
	73	ptr += n; \
	74	tok = XML_TOK_NAME; \
	75	From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001
	76	From: Sebastian Pipping <sebastian@pipping.org>
	77	Date: Wed, 9 Feb 2022 01:00:38 +0100
	78	Subject: [PATCH] lib: Add comments to BT_LEAD* cases where encoding has
	79	already been validated
	80
	81	---
	82	expat/lib/xmltok_impl.c | 10 +++++-----
	83	1 file changed, 5 insertions(+), 5 deletions(-)
	84
	85	diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
	86	index 64a3b2c1..84ff35f9 100644
	87	--- a/lib/xmltok_impl.c
	88	+++ b/lib/xmltok_impl.c
	89	@@ -1266,7 +1266,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
	90	switch (BYTE_TYPE(enc, ptr)) {
	91	# define LEAD_CASE(n) \
	92	case BT_LEAD##n: \
	93	- ptr += n; \
	94	+ ptr += n; /* NOTE: The encoding has already been validated. */ \
	95	break;
	96	LEAD_CASE(2)
	97	LEAD_CASE(3)
	98	@@ -1335,7 +1335,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
	99	switch (BYTE_TYPE(enc, ptr)) {
	100	# define LEAD_CASE(n) \
	101	case BT_LEAD##n: \
	102	- ptr += n; \
	103	+ ptr += n; /* NOTE: The encoding has already been validated. */ \
	104	break;
	105	LEAD_CASE(2)
	106	LEAD_CASE(3)
	107	@@ -1514,7 +1514,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
	108	state = inName; \
	109	}
	110	# define LEAD_CASE(n) \
	111	- case BT_LEAD##n: \
	112	+ case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
	113	START_NAME ptr += (n - MINBPC(enc)); \
	114	break;
	115	LEAD_CASE(2)
	116	@@ -1726,7 +1726,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
	117	switch (BYTE_TYPE(enc, ptr)) {
	118	# define LEAD_CASE(n) \
	119	case BT_LEAD##n: \
	120	- ptr += n; \
	121	+ ptr += n; /* NOTE: The encoding has already been validated. */ \
	122	break;
	123	LEAD_CASE(2)
	124	LEAD_CASE(3)
	125	@@ -1771,7 +1771,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
	126	switch (BYTE_TYPE(enc, ptr)) {
	127	# define LEAD_CASE(n) \
	128	case BT_LEAD##n: \
	129	- ptr += n; \
	130	+ ptr += n; /* NOTE: The encoding has already been validated. */ \
	131	break;
	132	LEAD_CASE(2)
	133	LEAD_CASE(3)
	134	From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
	135	From: Sebastian Pipping <sebastian@pipping.org>
	136	Date: Tue, 8 Feb 2022 04:06:21 +0100
	137	Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235)
	138
	139	---
	140	expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
	141	1 file changed, 109 insertions(+)
	142
	143	diff --git a/tests/runtests.c b/tests/runtests.c
	144	index bc5344b1..9b155b82 100644
	145	--- a/tests/runtests.c
	146	+++ b/tests/runtests.c
	147	@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
	148	}
	149	END_TEST
	150
	151	+START_TEST(test_utf8_in_start_tags) {
	152	+ struct test_case {
	153	+ bool goodName;
	154	+ bool goodNameStart;
	155	+ const char *tagName;
	156	+ };
	157	+
	158	+ // The idea with the tests below is this:
	159	+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
	160	+ // go to isNever and are hence not a concern.
	161	+ //
	162	+ // We start with a character that is a valid name character
	163	+ // (or even name-start character, see XML 1.0r4 spec) and then we flip
	164	+ // single bits at places where (1) the result leaves the UTF-8 encoding space
	165	+ // and (2) we stay in the same n-byte sequence family.
	166	+ //
	167	+ // The flipped bits are highlighted in angle brackets in comments,
	168	+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
	169	+ // the most significant bit to 1 to leave UTF-8 encoding space.
	170	+ struct test_case cases[] = {
	171	+ // 1-byte UTF-8: [0xxx xxxx]
	172	+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
	173	+ {false, false, "\xBA"}, // [<1>011 1010]
	174	+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
	175	+ {false, false, "\xB9"}, // [<1>011 1001]
	176	+
	177	+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
	178	+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
	179	+ // Arabic small waw U+06E5
	180	+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
	181	+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
	182	+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
	183	+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
	184	+ // combining char U+0301
	185	+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
	186	+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
	187	+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
	188	+
	189	+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
	190	+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
	191	+ // Devanagari Letter A U+0905
	192	+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
	193	+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
	194	+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
	195	+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
	196	+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
	197	+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
	198	+ // combining char U+0901
	199	+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
	200	+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
	201	+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
	202	+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
	203	+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
	204	+ };
	205	+ const bool atNameStart[] = {true, false};
	206	+
	207	+ size_t i = 0;
	208	+ char doc[1024];
	209	+ size_t failCount = 0;
	210	+
	211	+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
	212	+ size_t j = 0;
	213	+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
	214	+ const bool expectedSuccess
	215	+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
	216	+ sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
	217	+ XML_Parser parser = XML_ParserCreate(NULL);
	218	+
	219	+ const enum XML_Status status
	220	+ = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
	221	+
	222	+ bool success = true;
	223	+ if ((status == XML_STATUS_OK) != expectedSuccess) {
	224	+ success = false;
	225	+ }
	226	+ if ((status == XML_STATUS_ERROR)
	227	+ && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
	228	+ success = false;
	229	+ }
	230	+
	231	+ if (! success) {
	232	+ fprintf(
	233	+ stderr,
	234	+ "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
	235	+ (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
	236	+ (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
	237	+ failCount++;
	238	+ }
	239	+
	240	+ XML_ParserFree(parser);
	241	+ }
	242	+ }
	243	+
	244	+ if (failCount > 0) {
	245	+ fail("UTF-8 regression detected");
	246	+ }
	247	+}
	248	+END_TEST
	249	+
	250	/* Test trailing spaces in elements are accepted */
	251	static void XMLCALL
	252	record_element_end_handler(void *userData, const XML_Char *name) {
	253	@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
	254	}
	255	END_TEST
	256
	257	+START_TEST(test_bad_doctype_utf8) {
	258	+ const char *text = "<!DOCTYPE \xDB\x25"
	259	+ "doc><doc/>"; // [1101 1011] [<0>010 0101]
	260	+ expect_failure(text, XML_ERROR_INVALID_TOKEN,
	261	+ "Invalid UTF-8 in DOCTYPE not faulted");
	262	+}
	263	+END_TEST
	264	+
	265	START_TEST(test_bad_doctype_utf16) {
	266	const char text[] =
	267	/* <!DOCTYPE doc [ \x06f2 ]><doc/>
	268	@@ -11870,6 +11977,7 @@ make_suite(void) {
	269	tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
	270	tcase_add_test(tc_basic, test_utf8_in_cdata_section);
	271	tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
	272	+ tcase_add_test(tc_basic, test_utf8_in_start_tags);
	273	tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
	274	tcase_add_test(tc_basic, test_utf16_attribute);
	275	tcase_add_test(tc_basic, test_utf16_second_attr);
	276	@@ -11878,6 +11986,7 @@ make_suite(void) {
	277	tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
	278	tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
	279	tcase_add_test(tc_basic, test_bad_doctype);
	280	+ tcase_add_test(tc_basic, test_bad_doctype_utf8);
	281	tcase_add_test(tc_basic, test_bad_doctype_utf16);
	282	tcase_add_test(tc_basic, test_bad_doctype_plus);
	283	tcase_add_test(tc_basic, test_bad_doctype_star);