1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
|
From 3a22dc1079be5a75750d24dc6992956e7b84b5a0 Mon Sep 17 00:00:00 2001
From: Seth Michael Larson <seth@python.org>
Date: Tue, 3 Sep 2024 10:07:53 -0500
Subject: [PATCH 2/2] [3.10] gh-121285: Remove backtracking when parsing
tarfile headers (GH-121286) (#123640)
* Remove backtracking when parsing tarfile headers
* Rewrite PAX header parsing to be stricter
* Optimize parsing of GNU extended sparse headers v0.0
(cherry picked from commit 34ddb64d088dd7ccc321f6103d23153256caa5d4)
Upstream-Status: Backport from https://github.com/python/cpython/commit/743acbe872485dc18df4d8ab2dc7895187f062c4
CVE: CVE-2024-6232
Co-authored-by: Kirill Podoprigora <kirill.bast9@mail.ru>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
Signed-off-by: Hugo SIMELIERE <hsimeliere.opensource@witekio.com>
---
Lib/tarfile.py | 105 +++++++++++-------
Lib/test/test_tarfile.py | 42 +++++++
...-07-02-13-39-20.gh-issue-121285.hrl-yI.rst | 2 +
3 files changed, 111 insertions(+), 38 deletions(-)
create mode 100644 Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 495349f08f9..3ab6811d633 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -841,6 +841,9 @@ def data_filter(member, dest_path):
# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()
+# Header length is digits followed by a space.
+_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")
+
class TarInfo(object):
"""Informational class which holds the details about an
archive member given by a tar header block.
@@ -1410,41 +1413,59 @@ def _proc_pax(self, tarfile):
else:
pax_headers = tarfile.pax_headers.copy()
- # Check if the pax header contains a hdrcharset field. This tells us
- # the encoding of the path, linkpath, uname and gname fields. Normally,
- # these fields are UTF-8 encoded but since POSIX.1-2008 tar
- # implementations are allowed to store them as raw binary strings if
- # the translation to UTF-8 fails.
- match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
- if match is not None:
- pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
-
- # For the time being, we don't care about anything other than "BINARY".
- # The only other value that is currently allowed by the standard is
- # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
- hdrcharset = pax_headers.get("hdrcharset")
- if hdrcharset == "BINARY":
- encoding = tarfile.encoding
- else:
- encoding = "utf-8"
-
# Parse pax header information. A record looks like that:
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
- # the newline. keyword and value are both UTF-8 encoded strings.
- regex = re.compile(br"(\d+) ([^=]+)=")
+ # the newline.
pos = 0
- while True:
- match = regex.match(buf, pos)
- if not match:
- break
+ encoding = None
+ raw_headers = []
+ while len(buf) > pos and buf[pos] != 0x00:
+ if not (match := _header_length_prefix_re.match(buf, pos)):
+ raise InvalidHeaderError("invalid header")
+ try:
+ length = int(match.group(1))
+ except ValueError:
+ raise InvalidHeaderError("invalid header")
+ # Headers must be at least 5 bytes, shortest being '5 x=\n'.
+ # Value is allowed to be empty.
+ if length < 5:
+ raise InvalidHeaderError("invalid header")
+ if pos + length > len(buf):
+ raise InvalidHeaderError("invalid header")
- length, keyword = match.groups()
- length = int(length)
- if length == 0:
+ header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
+ keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
+ raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")
+
+ # Check the framing of the header. The last character must be '\n' (0x0A)
+ if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
raise InvalidHeaderError("invalid header")
- value = buf[match.end(2) + 1:match.start(1) + length - 1]
+ raw_headers.append((length, raw_keyword, raw_value))
+
+ # Check if the pax header contains a hdrcharset field. This tells us
+ # the encoding of the path, linkpath, uname and gname fields. Normally,
+ # these fields are UTF-8 encoded but since POSIX.1-2008 tar
+ # implementations are allowed to store them as raw binary strings if
+ # the translation to UTF-8 fails. For the time being, we don't care about
+ # anything other than "BINARY". The only other value that is currently
+ # allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
+ # Note that we only follow the initial 'hdrcharset' setting to preserve
+ # the initial behavior of the 'tarfile' module.
+ if raw_keyword == b"hdrcharset" and encoding is None:
+ if raw_value == b"BINARY":
+ encoding = tarfile.encoding
+ else: # This branch ensures only the first 'hdrcharset' header is used.
+ encoding = "utf-8"
+
+ pos += length
+ # If no explicit hdrcharset is set, we use UTF-8 as a default.
+ if encoding is None:
+ encoding = "utf-8"
+
+ # After parsing the raw headers we can decode them to text.
+ for length, raw_keyword, raw_value in raw_headers:
# Normally, we could just use "utf-8" as the encoding and "strict"
# as the error handler, but we better not take the risk. For
# example, GNU tar <= 1.23 is known to store filenames it cannot
@@ -1452,17 +1473,16 @@ def _proc_pax(self, tarfile):
# hdrcharset=BINARY header).
# We first try the strict standard encoding, and if that fails we
# fall back on the user's encoding and error handler.
- keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
+ keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
tarfile.errors)
if keyword in PAX_NAME_FIELDS:
- value = self._decode_pax_field(value, encoding, tarfile.encoding,
+ value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
tarfile.errors)
else:
- value = self._decode_pax_field(value, "utf-8", "utf-8",
+ value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
tarfile.errors)
pax_headers[keyword] = value
- pos += length
# Fetch the next header.
try:
@@ -1477,7 +1497,7 @@ def _proc_pax(self, tarfile):
elif "GNU.sparse.size" in pax_headers:
# GNU extended sparse format version 0.0.
- self._proc_gnusparse_00(next, pax_headers, buf)
+ self._proc_gnusparse_00(next, raw_headers)
elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
# GNU extended sparse format version 1.0.
@@ -1499,15 +1519,24 @@ def _proc_pax(self, tarfile):
return next
- def _proc_gnusparse_00(self, next, pax_headers, buf):
+ def _proc_gnusparse_00(self, next, raw_headers):
"""Process a GNU tar extended sparse header, version 0.0.
"""
offsets = []
- for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
- offsets.append(int(match.group(1)))
numbytes = []
- for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
- numbytes.append(int(match.group(1)))
+ for _, keyword, value in raw_headers:
+ if keyword == b"GNU.sparse.offset":
+ try:
+ offsets.append(int(value.decode()))
+ except ValueError:
+ raise InvalidHeaderError("invalid header")
+
+ elif keyword == b"GNU.sparse.numbytes":
+ try:
+ numbytes.append(int(value.decode()))
+ except ValueError:
+ raise InvalidHeaderError("invalid header")
+
next.sparse = list(zip(offsets, numbytes))
def _proc_gnusparse_01(self, next, pax_headers):
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
index cfc13bccb20..007c3e94acb 100644
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -1139,6 +1139,48 @@ def test_pax_number_fields(self):
finally:
tar.close()
+ def test_pax_header_bad_formats(self):
+ # The fields from the pax header have priority over the
+ # TarInfo.
+ pax_header_replacements = (
+ b" foo=bar\n",
+ b"0 \n",
+ b"1 \n",
+ b"2 \n",
+ b"3 =\n",
+ b"4 =a\n",
+ b"1000000 foo=bar\n",
+ b"0 foo=bar\n",
+ b"-12 foo=bar\n",
+ b"000000000000000000000000036 foo=bar\n",
+ )
+ pax_headers = {"foo": "bar"}
+
+ for replacement in pax_header_replacements:
+ with self.subTest(header=replacement):
+ tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT,
+ encoding="iso8859-1")
+ try:
+ t = tarfile.TarInfo()
+ t.name = "pax" # non-ASCII
+ t.uid = 1
+ t.pax_headers = pax_headers
+ tar.addfile(t)
+ finally:
+ tar.close()
+
+ with open(tmpname, "rb") as f:
+ data = f.read()
+ self.assertIn(b"11 foo=bar\n", data)
+ data = data.replace(b"11 foo=bar\n", replacement)
+
+ with open(tmpname, "wb") as f:
+ f.truncate()
+ f.write(data)
+
+ with self.assertRaisesRegex(tarfile.ReadError, r"method tar: ReadError\('invalid header'\)"):
+ tarfile.open(tmpname, encoding="iso8859-1")
+
class WriteTestBase(TarTest):
# Put all write tests in here that are supposed to be tested
diff --git a/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst b/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst
new file mode 100644
index 00000000000..81f918bfe2b
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst
@@ -0,0 +1,2 @@
+Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
+GNU sparse headers.
--
2.46.0
|