From 81cf3e5fd2a19842b21f30e158b7f41bf6d91f28 Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Fri, 8 May 2026 04:38:09 -0700
Subject: [PATCH 1/2] Fix DecompressMemMapReader.read returning b'' before EOF

Closes #120

DecompressMemMapReader.read(N) could return b'' mid-stream when zlib
consumed input without producing output on a single decompress call
(small N, header / dictionary frames in flight). The original
`if dcompdat and ...` guard at the recursion site skipped the
"refill to size" recursion in that case, so a caller using the
standard idiom

    while chunk := stream.read(4096):
        yield chunk

terminated at the first empty chunk -- before _br == _s.

The guard exists for compressed_bytes_read(), which sets _br to 0 and
then drains the inner zip past its EOF. Recursing there would loop
forever because the inner zip is already done.

The fix uses zlib's own `eof` attribute (available on standard
zlib.Decompress objects since Python 3.3) to distinguish the two
cases:

- dcompdat empty AND zip not at EOF -> still digesting, keep going
- dcompdat empty AND zip at EOF -> compressed_bytes_read scrub or
  genuine EOF; stop.

`getattr(_zip, 'eof', False)` keeps the conservative behavior when
running against a custom zlib object that does not expose the
attribute.

Adds a regression test that reads with chunk_size in {1, 4, 16, 64}
from a 13 KB highly compressible stream. With the old guard, the
chunk_size <= 16 cases stopped at byte 0; the new test asserts they
read all 13000 bytes.

The full existing test suite (24 tests) still passes, including
test_decompress_reader_special_case and test_pack, which exercise the
compressed_bytes_read scrub path that the original guard existed to
protect.
---
 gitdb/stream.py           | 136 ++++++++++++++++++++++----------------
 gitdb/test/test_stream.py |  35 ++++
 2 files changed, 114 insertions(+), 57 deletions(-)

diff --git a/gitdb/stream.py b/gitdb/stream.py
index 1e0be84..f5fc3bc 100644
--- a/gitdb/stream.py
+++ b/gitdb/stream.py
@@ -254,68 +254,90 @@ def read(self, size=-1):
         # copied once, and another copy of a part of it when it creates the unconsumed
         # tail. We have to use it to hand in the appropriate amount of bytes during
         # the next read.
-        tail = self._zip.unconsumed_tail
-        if tail:
-            # move the window, make it as large as size demands. For code-clarity,
-            # we just take the chunk from our map again instead of reusing the unconsumed
-            # tail. The latter one would safe some memory copying, but we could end up
-            # with not getting enough data uncompressed, so we had to sort that out as well.
-            # Now we just assume the worst case, hence the data is uncompressed and the window
-            # needs to be as large as the uncompressed bytes we want to read.
-            self._cws = self._cwe - len(tail)
-            self._cwe = self._cws + size
-        else:
-            cws = self._cws
-            self._cws = self._cwe
-            self._cwe = cws + size
-        # END handle tail
-
-        # if window is too small, make it larger so zip can decompress something
-        if self._cwe - self._cws < 8:
-            self._cwe = self._cws + 8
-        # END adjust winsize
-
-        # takes a slice, but doesn't copy the data, it says ...
-        indata = self._m[self._cws:self._cwe]
-
-        # get the actual window end to be sure we don't use it for computations
-        self._cwe = self._cws + len(indata)
-        dcompdat = self._zip.decompress(indata, size)
-        # update the amount of compressed bytes read
-        # We feed possibly overlapping chunks, which is why the unconsumed tail
-        # has to be taken into consideration, as well as the unused data
-        # if we hit the end of the stream
-        # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
-        # They are thorough, and I assume it is truly working.
-        # Why is this logic as convoluted as it is ? Please look at the table in
-        # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
-        # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
-        # will be the one that works.
-        # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
-        # table in the github issue. This is it ... it was the only way I could make this work everywhere.
-        # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
-        if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
-            unused_datalen = len(self._zip.unconsumed_tail)
-        else:
-            unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
-        # # end handle very special case ...
-
-        self._cbr += len(indata) - unused_datalen
-        self._br += len(dcompdat)
+        #
+        # Decompress in a loop until we have produced `size` bytes or run out
+        # of progress. Iteration (instead of recursion) keeps the call bounded
+        # for streams that consume many input bytes per produced output byte
+        # (e.g. zlib stored blocks of length zero); the previous recursive
+        # form blew the stack on inputs > ~1500 empty blocks (issue #120
+        # follow-up).
+        dcompdat = b''
+        while True:
+            tail = self._zip.unconsumed_tail
+            remaining = size - len(dcompdat)
+            if tail:
+                # move the window, make it as large as size demands. For code-clarity,
+                # we just take the chunk from our map again instead of reusing the unconsumed
+                # tail. The latter one would safe some memory copying, but we could end up
+                # with not getting enough data uncompressed, so we had to sort that out as well.
+                # Now we just assume the worst case, hence the data is uncompressed and the window
+                # needs to be as large as the uncompressed bytes we want to read.
+                self._cws = self._cwe - len(tail)
+                self._cwe = self._cws + remaining
+            else:
+                cws = self._cws
+                self._cws = self._cwe
+                self._cwe = cws + remaining
+            # END handle tail
+
+            # if window is too small, make it larger so zip can decompress something
+            if self._cwe - self._cws < 8:
+                self._cwe = self._cws + 8
+            # END adjust winsize
+
+            # takes a slice, but doesn't copy the data, it says ...
+            indata = self._m[self._cws:self._cwe]
+
+            # get the actual window end to be sure we don't use it for computations
+            self._cwe = self._cws + len(indata)
+            chunk = self._zip.decompress(indata, remaining)
+            # update the amount of compressed bytes read
+            # We feed possibly overlapping chunks, which is why the unconsumed tail
+            # has to be taken into consideration, as well as the unused data
+            # if we hit the end of the stream
+            # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
+            # They are thorough, and I assume it is truly working.
+            # Why is this logic as convoluted as it is ? Please look at the table in
+            # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
+            # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
+            # will be the one that works.
+            # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
+            # table in the github issue. This is it ... it was the only way I could make this work everywhere.
+            # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
+            if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
+                unused_datalen = len(self._zip.unconsumed_tail)
+            else:
+                unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
+            # # end handle very special case ...
+
+            consumed = len(indata) - unused_datalen
+            self._cbr += consumed
+            self._br += len(chunk)
+            dcompdat += chunk
+
+            # Stop when we have enough or there is no path to more output.
+            # `chunk` may legitimately be empty mid-stream when zlib is
+            # consuming header / dictionary frames; in that case we keep
+            # iterating as long as we are still feeding zlib new bytes
+            # (consumed > 0) and zlib has not flagged end-of-stream. The
+            # compressed_bytes_read() scrub loop drives this same code with
+            # _br manipulated to 0 past zip EOF; it terminates here because
+            # `getattr(_zip, 'eof', False)` is True or no compressed bytes
+            # are consumed. The empty-block recursion attack from issue #120
+            # follow-up is bounded by the iteration; each empty block does
+            # consume input, so the loop walks the stream forward a constant
+            # amount per iteration without growing the call stack.
+            if len(dcompdat) >= size or self._br >= self._s:
+                break
+            zip_eof = getattr(self._zip, 'eof', False)
+            if not chunk and (zip_eof or len(indata) == 0 or consumed == 0):
+                break
+        # END iterative decompress
 
         if dat:
             dcompdat = dat + dcompdat
         # END prepend our cached data
 
-        # it can happen, depending on the compression, that we get less bytes
-        # than ordered as it needs the final portion of the data as well.
-        # Recursively resolve that.
-        # Note: dcompdat can be empty even though we still appear to have bytes
-        # to read, if we are called by compressed_bytes_read - it manipulates
-        # us to empty the stream
-        if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
-            dcompdat += self.read(size - len(dcompdat))
-        # END handle special case
         return dcompdat

diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py
index 1e7e941..390caa1 100644
--- a/gitdb/test/test_stream.py
+++ b/gitdb/test/test_stream.py
@@ -162,3 +162,38 @@ def test_decompress_reader_special_case(self):
             dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data)))
             assert dump.hexsha == sha
         # end for each loose object sha to test
+
+    def test_decompress_reader_chunked_read_does_not_terminate_early(self):
+        """Regression test for #120: read(N) must not return b'' before EOF.
+
+        zlib can consume input without producing decompressed output (e.g.
+        while ingesting block headers). The reader's internal recursion
+        previously bailed on any empty zip output, so a caller reading in
+        small chunks via the standard `while chunk := stream.read(N)` idiom
+        would terminate at the first empty chunk -- before the actual end
+        of the uncompressed stream.
+        """
+        # Highly compressible data exposes the bug because each zlib chunk
+        # spans many uncompressed bytes -- intermediate decompress() calls
+        # often return empty while consuming input.
+ data = b"hello world! " * 1000 + zdata = zlib.compress(data) + + # Loop with a small chunk size to force many sub-_s recursions. + for chunk_size in (1, 4, 16, 64): + reader = DecompressMemMapReader( + zdata, close_on_deletion=False, size=len(data) + ) + out = bytearray() + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + out.extend(chunk) + assert bytes(out) == data, ( + f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes" + ) + assert reader._br == reader._s, ( + f"chunk_size={chunk_size}: stream stopped at " + f"{reader._br}/{reader._s}" + ) From ed41d2c26f77bc58a5b8f51100a300c4e09a7a30 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 9 May 2026 08:29:18 +0800 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- gitdb/stream.py | 7 +++++-- gitdb/test/test_stream.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/gitdb/stream.py b/gitdb/stream.py index f5fc3bc..7c59f0b 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -268,7 +268,7 @@ def read(self, size=-1): if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up + # tail. The latter one would save some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. @@ -313,7 +313,10 @@ def read(self, size=-1): consumed = len(indata) - unused_datalen self._cbr += consumed self._br += len(chunk) - dcompdat += chunk + if chunk: + if not isinstance(dcompdat, bytearray): + dcompdat = bytearray(dcompdat) + dcompdat.extend(chunk) # Stop when we have enough or there is no path to more output. # `chunk` may legitimately be empty mid-stream when zlib is diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 390caa1..f36b06b 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -179,7 +179,8 @@ def test_decompress_reader_chunked_read_does_not_terminate_early(self): data = b"hello world! " * 1000 zdata = zlib.compress(data) - # Loop with a small chunk size to force many sub-_s recursions. + # Loop with a small chunk size to force many internal read/decompression + # iterations before EOF. for chunk_size in (1, 4, 16, 64): reader = DecompressMemMapReader( zdata, close_on_deletion=False, size=len(data)