path: root/meta/recipes-devtools/python/python3-urllib3/CVE-2025-66471.patch
From f25c0d11e1b640e3c7e0addb66a1ff50730be508 Mon Sep 17 00:00:00 2001
From: Illia Volochii <illia.volochii@gmail.com>
Date: Fri, 5 Dec 2025 16:40:41 +0200
Subject: [PATCH] Merge commit from fork

* Prevent decompression bomb for zstd in Python 3.14

* Add experimental `decompress_iter` for Brotli

* Update changes for Brotli

* Add `GzipDecoder.decompress_iter`

* Test https://github.com/python-hyper/brotlicffi/pull/207

* Pin Brotli

* Add `decompress_iter` to all decoders and make tests pass

* Pin brotlicffi to an official release

* Revert changes to response.py

* Add `max_length` parameter to all `decompress` methods

* Fix the `test_brotlipy` session

* Unset `_data` on gzip error

* Add a test for memory usage

* Test more methods

* Fix the test for `stream`

* Cover more lines with tests

* Add more coverage

* Make `read1` a bit more efficient

* Fix PyPy tests for Brotli

* Revert an unnecessarily moved check

* Add some comments

* Leave just one `self._obj.decompress` call in `GzipDecoder`

* Refactor test params

* Test reads with all data already in the decompressor

* Prevent needless copying of data decoded with `max_length`

* Rename the changed test

* Note that responses of unknown length should be streamed too

* Add a changelog entry

* Avoid returning a memory view from `BytesQueueBuffer`

* Add one more note to the changelog entry
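
For context, a minimal streaming sketch (illustrative only, not part of the
upstream change; the URL and chunk size are placeholders). It shows the usage
pattern these changes are meant to protect: with `preload_content=False`, the
new `max_length` plumbing lets each `stream()` chunk be decoded incrementally
instead of inflating an entire compressed block into memory at once.

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request(
        "GET",
        "https://example.com/large-compressed-body",  # placeholder URL
        preload_content=False,  # stream instead of reading everything at once
    )
    total = 0
    for chunk in resp.stream(64 * 1024, decode_content=True):
        # With the patched decoders, each iteration decodes roughly up to the
        # requested amount; leftover compressed input is kept internally and
        # reported via the new `has_unconsumed_tail` property.
        total += len(chunk)
    resp.release_conn()
    print("decoded bytes:", total)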

CVE: CVE-2025-66471

Upstream-Status: Backport [https://github.com/urllib3/urllib3/commit/c19571de34c47de3a766541b041637ba5f716ed7]

Signed-off-by: Jiaying Song <jiaying.song.cn@windriver.com>
---
 docs/advanced-usage.rst |   3 +-
 docs/user-guide.rst     |   4 +-
 pyproject.toml          |   5 +-
 src/urllib3/response.py | 278 ++++++++++++++++++++++++++++++++++------
 4 files changed, 246 insertions(+), 44 deletions(-)

diff --git a/docs/advanced-usage.rst b/docs/advanced-usage.rst
index 36a51e67..a12c7143 100644
--- a/docs/advanced-usage.rst
+++ b/docs/advanced-usage.rst
@@ -66,7 +66,8 @@ When using ``preload_content=True`` (the default setting) the
 response body will be read immediately into memory and the HTTP connection
 will be released back into the pool without manual intervention.
 
-However, when dealing with large responses it's often better to stream the response
+However, when dealing with responses of large or unknown length,
+it's often better to stream the response
 content using ``preload_content=False``. Setting ``preload_content`` to ``False`` means
 that urllib3 will only read from the socket when data is requested.
 
diff --git a/docs/user-guide.rst b/docs/user-guide.rst
index 5c78c8af..1d9d0bbd 100644
--- a/docs/user-guide.rst
+++ b/docs/user-guide.rst
@@ -145,8 +145,8 @@ to a byte string representing the response content:
     print(resp.data)
     # b"\xaa\xa5H?\x95\xe9\x9b\x11"
 
-.. note:: For larger responses, it's sometimes better to :ref:`stream <stream>`
-    the response.
+.. note:: For responses of large or unknown length, it's sometimes better to
+    :ref:`stream <stream>` the response.
 
 Using io Wrappers with Response Content
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pyproject.toml b/pyproject.toml
index 1fe82937..58a2c2db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,8 +40,8 @@ dynamic = ["version"]
 
 [project.optional-dependencies]
 brotli = [
-  "brotli>=1.0.9; platform_python_implementation == 'CPython'",
-  "brotlicffi>=0.8.0; platform_python_implementation != 'CPython'"
+  "brotli>=1.2.0; platform_python_implementation == 'CPython'",
+  "brotlicffi>=1.2.0.0; platform_python_implementation != 'CPython'"
 ]
 zstd = [
   "zstandard>=0.18.0",
@@ -95,6 +95,7 @@ filterwarnings = [
     '''default:ssl\.PROTOCOL_TLSv1_1 is deprecated:DeprecationWarning''',
     '''default:ssl\.PROTOCOL_TLSv1_2 is deprecated:DeprecationWarning''',
     '''default:ssl NPN is deprecated, use ALPN instead:DeprecationWarning''',
+    '''default:Brotli >= 1.2.0 is required to prevent decompression bombs\.:urllib3.exceptions.DependencyWarning''',
     '''default:Async generator 'quart\.wrappers\.response\.DataBody\.__aiter__\.<locals>\._aiter' was garbage collected.*:ResourceWarning''',  # https://github.com/pallets/quart/issues/301
     '''default:unclosed file <_io\.BufferedWriter name='/dev/null'>:ResourceWarning''',  # https://github.com/SeleniumHQ/selenium/issues/13328
 ]
diff --git a/src/urllib3/response.py b/src/urllib3/response.py
index b8e8565c..4304133e 100644
--- a/src/urllib3/response.py
+++ b/src/urllib3/response.py
@@ -49,6 +49,7 @@ from .connection import BaseSSLError, HTTPConnection, HTTPException
 from .exceptions import (
     BodyNotHttplibCompatible,
     DecodeError,
+    DependencyWarning,
     HTTPError,
     IncompleteRead,
     InvalidChunkLength,
@@ -68,7 +69,11 @@ log = logging.getLogger(__name__)
 
 
 class ContentDecoder:
-    def decompress(self, data: bytes) -> bytes:
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        raise NotImplementedError()
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
         raise NotImplementedError()
 
     def flush(self) -> bytes:
@@ -78,30 +83,57 @@ class ContentDecoder:
 class DeflateDecoder(ContentDecoder):
     def __init__(self) -> None:
         self._first_try = True
-        self._data = b""
+        self._first_try_data = b""
+        self._unfed_data = b""
         self._obj = zlib.decompressobj()
 
-    def decompress(self, data: bytes) -> bytes:
-        if not data:
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        data = self._unfed_data + data
+        self._unfed_data = b""
+        if not data and not self._obj.unconsumed_tail:
             return data
+        original_max_length = max_length
+        if original_max_length < 0:
+            max_length = 0
+        elif original_max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unfed_data = data
+            return b""
 
+        # Subsequent calls always reuse `self._obj`. zlib requires
+        # passing the unconsumed tail if decompression is to continue.
         if not self._first_try:
-            return self._obj.decompress(data)
+            return self._obj.decompress(
+                self._obj.unconsumed_tail + data, max_length=max_length
+            )
 
-        self._data += data
+        # First call tries with RFC 1950 ZLIB format.
+        self._first_try_data += data
         try:
-            decompressed = self._obj.decompress(data)
+            decompressed = self._obj.decompress(data, max_length=max_length)
             if decompressed:
                 self._first_try = False
-                self._data = None  # type: ignore[assignment]
+                self._first_try_data = b""
             return decompressed
+        # On failure, it falls back to RFC 1951 DEFLATE format.
         except zlib.error:
             self._first_try = False
             self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
             try:
-                return self.decompress(self._data)
+                return self.decompress(
+                    self._first_try_data, max_length=original_max_length
+                )
             finally:
-                self._data = None  # type: ignore[assignment]
+                self._first_try_data = b""
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unfed_data) or (
+            bool(self._obj.unconsumed_tail) and not self._first_try
+        )
 
     def flush(self) -> bytes:
         return self._obj.flush()
@@ -117,27 +149,61 @@ class GzipDecoder(ContentDecoder):
     def __init__(self) -> None:
         self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
         self._state = GzipDecoderState.FIRST_MEMBER
+        self._unconsumed_tail = b""
 
-    def decompress(self, data: bytes) -> bytes:
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
         ret = bytearray()
-        if self._state == GzipDecoderState.SWALLOW_DATA or not data:
+        if self._state == GzipDecoderState.SWALLOW_DATA:
             return bytes(ret)
+
+        if max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unconsumed_tail += data
+            return b""
+
+        # zlib requires passing the unconsumed tail to the subsequent
+        # call if decompression is to continue.
+        data = self._unconsumed_tail + data
+        if not data and self._obj.eof:
+            return bytes(ret)
+
         while True:
             try:
-                ret += self._obj.decompress(data)
+                ret += self._obj.decompress(
+                    data, max_length=max(max_length - len(ret), 0)
+                )
             except zlib.error:
                 previous_state = self._state
                 # Ignore data after the first error
                 self._state = GzipDecoderState.SWALLOW_DATA
+                self._unconsumed_tail = b""
                 if previous_state == GzipDecoderState.OTHER_MEMBERS:
                     # Allow trailing garbage acceptable in other gzip clients
                     return bytes(ret)
                 raise
-            data = self._obj.unused_data
+
+            self._unconsumed_tail = data = (
+                self._obj.unconsumed_tail or self._obj.unused_data
+            )
+            if max_length > 0 and len(ret) >= max_length:
+                break
+
             if not data:
                 return bytes(ret)
-            self._state = GzipDecoderState.OTHER_MEMBERS
-            self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+            # When the end of a gzip member is reached, a new decompressor
+            # must be created for unused (possibly future) data.
+            if self._obj.eof:
+                self._state = GzipDecoderState.OTHER_MEMBERS
+                self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+
+        return bytes(ret)
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unconsumed_tail)
 
     def flush(self) -> bytes:
         return self._obj.flush()
@@ -152,9 +218,35 @@ if brotli is not None:
         def __init__(self) -> None:
             self._obj = brotli.Decompressor()
             if hasattr(self._obj, "decompress"):
-                setattr(self, "decompress", self._obj.decompress)
+                setattr(self, "_decompress", self._obj.decompress)
             else:
-                setattr(self, "decompress", self._obj.process)
+                setattr(self, "_decompress", self._obj.process)
+
+        # Requires Brotli >= 1.2.0 for `output_buffer_limit`.
+        def _decompress(self, data: bytes, output_buffer_limit: int = -1) -> bytes:
+            raise NotImplementedError()
+
+        def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+            try:
+                if max_length > 0:
+                    return self._decompress(data, output_buffer_limit=max_length)
+                else:
+                    return self._decompress(data)
+            except TypeError:
+                # Fallback for Brotli/brotlicffi/brotlipy versions without
+                # the `output_buffer_limit` parameter.
+                warnings.warn(
+                    "Brotli >= 1.2.0 is required to prevent decompression bombs.",
+                    DependencyWarning,
+                )
+                return self._decompress(data)
+
+        @property
+        def has_unconsumed_tail(self) -> bool:
+            try:
+                return not self._obj.can_accept_more_data()
+            except AttributeError:
+                return False
 
         def flush(self) -> bytes:
             if hasattr(self._obj, "flush"):
@@ -168,16 +260,46 @@ if HAS_ZSTD:
         def __init__(self) -> None:
             self._obj = zstd.ZstdDecompressor().decompressobj()
 
-        def decompress(self, data: bytes) -> bytes:
-            if not data:
+        def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+            if not data and not self.has_unconsumed_tail:
                 return b""
-            data_parts = [self._obj.decompress(data)]
-            while self._obj.eof and self._obj.unused_data:
+            if self._obj.eof:
+                data = self._obj.unused_data + data
+                self._obj = zstd.ZstdDecompressor()
+            part = self._obj.decompress(data, max_length=max_length)
+            length = len(part)
+            data_parts = [part]
+            # Every loop iteration is supposed to read data from a separate frame.
+            # The loop breaks when:
+            #   - enough data is read;
+            #   - no more unused data is available;
+            #   - end of the last read frame has not been reached (i.e.,
+            #     more data has to be fed).
+            while (
+                self._obj.eof
+                and self._obj.unused_data
+                and (max_length < 0 or length < max_length)
+            ):
                 unused_data = self._obj.unused_data
-                self._obj = zstd.ZstdDecompressor().decompressobj()
-                data_parts.append(self._obj.decompress(unused_data))
+                if not self._obj.needs_input:
+                    self._obj = zstd.ZstdDecompressor()
+                part = self._obj.decompress(
+                    unused_data,
+                    max_length=(max_length - length) if max_length > 0 else -1,
+                )
+                if part_length := len(part):
+                    data_parts.append(part)
+                    length += part_length
+                elif self._obj.needs_input:
+                    break
             return b"".join(data_parts)
 
+        @property
+        def has_unconsumed_tail(self) -> bool:
+            return not (self._obj.needs_input or self._obj.eof) or bool(
+                self._obj.unused_data
+            )
+
         def flush(self) -> bytes:
             ret = self._obj.flush()  # note: this is a no-op
             if not self._obj.eof:
@@ -210,10 +332,35 @@ class MultiDecoder(ContentDecoder):
     def flush(self) -> bytes:
         return self._decoders[0].flush()
 
-    def decompress(self, data: bytes) -> bytes:
-        for d in reversed(self._decoders):
-            data = d.decompress(data)
-        return data
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        if max_length <= 0:
+            for d in reversed(self._decoders):
+                data = d.decompress(data)
+            return data
+
+        ret = bytearray()
+        # Every while loop iteration goes through all decoders once.
+        # It exits when enough data is read or no more data can be read.
+        # It is possible that the while loop iteration does not produce
+        # any data because we retrieve up to `max_length` from every
+        # decoder, and the amount of bytes may be insufficient for the
+        # next decoder to produce enough/any output.
+        while True:
+            any_data = False
+            for d in reversed(self._decoders):
+                data = d.decompress(data, max_length=max_length - len(ret))
+                if data:
+                    any_data = True
+                # We should not break when no data is returned because
+                # next decoders may produce data even with empty input.
+            ret += data
+            if not any_data or len(ret) >= max_length:
+                return bytes(ret)
+            data = b""
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return any(d.has_unconsumed_tail for d in self._decoders)
 
 
 def _get_decoder(mode: str) -> ContentDecoder:
@@ -246,9 +393,6 @@ class BytesQueueBuffer:
 
      * self.buffer, which contains the full data
      * the largest chunk that we will copy in get()
-
-    The worst case scenario is a single chunk, in which case we'll make a full copy of
-    the data inside get().
     """
 
     def __init__(self) -> None:
@@ -270,6 +414,10 @@ class BytesQueueBuffer:
         elif n < 0:
             raise ValueError("n should be > 0")
 
+        if len(self.buffer[0]) == n and isinstance(self.buffer[0], bytes):
+            self._size -= n
+            return self.buffer.popleft()
+
         fetched = 0
         ret = io.BytesIO()
         while fetched < n:
@@ -473,7 +621,11 @@ class BaseHTTPResponse(io.IOBase):
                     self._decoder = _get_decoder(content_encoding)
 
     def _decode(
-        self, data: bytes, decode_content: bool | None, flush_decoder: bool
+        self,
+        data: bytes,
+        decode_content: bool | None,
+        flush_decoder: bool,
+        max_length: int | None = None,
     ) -> bytes:
         """
         Decode the data passed in and potentially flush the decoder.
@@ -486,9 +638,12 @@ class BaseHTTPResponse(io.IOBase):
                 )
             return data
 
+        if max_length is None or flush_decoder:
+            max_length = -1
+
         try:
             if self._decoder:
-                data = self._decoder.decompress(data)
+                data = self._decoder.decompress(data, max_length=max_length)
                 self._has_decoded_content = True
         except self.DECODER_ERROR_CLASSES as e:
             content_encoding = self.headers.get("content-encoding", "").lower()
@@ -953,6 +1108,14 @@ class HTTPResponse(BaseHTTPResponse):
         elif amt is not None:
             cache_content = False
 
+            if self._decoder and self._decoder.has_unconsumed_tail:
+                decoded_data = self._decode(
+                    b"",
+                    decode_content,
+                    flush_decoder=False,
+                    max_length=amt - len(self._decoded_buffer),
+                )
+                self._decoded_buffer.put(decoded_data)
             if len(self._decoded_buffer) >= amt:
                 return self._decoded_buffer.get(amt)
 
@@ -960,7 +1123,11 @@ class HTTPResponse(BaseHTTPResponse):
 
         flush_decoder = amt is None or (amt != 0 and not data)
 
-        if not data and len(self._decoded_buffer) == 0:
+        if (
+            not data
+            and len(self._decoded_buffer) == 0
+            and not (self._decoder and self._decoder.has_unconsumed_tail)
+        ):
             return data
 
         if amt is None:
@@ -977,7 +1144,12 @@ class HTTPResponse(BaseHTTPResponse):
                     )
                 return data
 
-            decoded_data = self._decode(data, decode_content, flush_decoder)
+            decoded_data = self._decode(
+                data,
+                decode_content,
+                flush_decoder,
+                max_length=amt - len(self._decoded_buffer),
+            )
             self._decoded_buffer.put(decoded_data)
 
             while len(self._decoded_buffer) < amt and data:
@@ -985,7 +1157,12 @@ class HTTPResponse(BaseHTTPResponse):
                 # For example, the GZ file header takes 10 bytes, we don't want to read
                 # it one byte at a time
                 data = self._raw_read(amt)
-                decoded_data = self._decode(data, decode_content, flush_decoder)
+                decoded_data = self._decode(
+                    data,
+                    decode_content,
+                    flush_decoder,
+                    max_length=amt - len(self._decoded_buffer),
+                )
                 self._decoded_buffer.put(decoded_data)
             data = self._decoded_buffer.get(amt)
 
@@ -1020,6 +1197,20 @@ class HTTPResponse(BaseHTTPResponse):
                     "Calling read1(decode_content=False) is not supported after "
                     "read1(decode_content=True) was called."
                 )
+            if (
+                self._decoder
+                and self._decoder.has_unconsumed_tail
+                and (amt is None or len(self._decoded_buffer) < amt)
+            ):
+                decoded_data = self._decode(
+                    b"",
+                    decode_content,
+                    flush_decoder=False,
+                    max_length=(
+                        amt - len(self._decoded_buffer) if amt is not None else None
+                    ),
+                )
+                self._decoded_buffer.put(decoded_data)
             if len(self._decoded_buffer) > 0:
                 if amt is None:
                     return self._decoded_buffer.get_all()
@@ -1035,7 +1226,9 @@ class HTTPResponse(BaseHTTPResponse):
         self._init_decoder()
         while True:
             flush_decoder = not data
-            decoded_data = self._decode(data, decode_content, flush_decoder)
+            decoded_data = self._decode(
+                data, decode_content, flush_decoder, max_length=amt
+            )
             self._decoded_buffer.put(decoded_data)
             if decoded_data or flush_decoder:
                 break
@@ -1066,7 +1259,11 @@ class HTTPResponse(BaseHTTPResponse):
         if self.chunked and self.supports_chunked_reads():
             yield from self.read_chunked(amt, decode_content=decode_content)
         else:
-            while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0:
+            while (
+                not is_fp_closed(self._fp)
+                or len(self._decoded_buffer) > 0
+                or (self._decoder and self._decoder.has_unconsumed_tail)
+            ):
                 data = self.read(amt=amt, decode_content=decode_content)
 
                 if data:
@@ -1218,7 +1415,10 @@ class HTTPResponse(BaseHTTPResponse):
                     break
                 chunk = self._handle_chunk(amt)
                 decoded = self._decode(
-                    chunk, decode_content=decode_content, flush_decoder=False
+                    chunk,
+                    decode_content=decode_content,
+                    flush_decoder=False,
+                    max_length=amt,
                 )
                 if decoded:
                     yield decoded
-- 
2.34.1