From 81cf3e5fd2a19842b21f30e158b7f41bf6d91f28 Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Fri, 8 May 2026 04:38:09 -0700
Subject: [PATCH 1/2] Fix DecompressMemMapReader.read returning b'' before EOF

Closes #120

DecompressMemMapReader.read(N) could return b'' mid-stream when zlib
consumed input without producing output on a single decompress call
(small N, header / dictionary frames in flight). The original
`if dcompdat and ...` guard at the recursion site skipped the
"refill to size" recursion in that case, so a caller using the
standard idiom

    while chunk := stream.read(4096):
        yield chunk

terminated at the first empty chunk -- before _br == _s.

The guard exists for compressed_bytes_read(), which sets _br to 0 and
then drains the inner zip past its EOF. Recursing there would loop
forever because the inner zip is already done.

The fix uses zlib's own `eof` attribute (available on standard
zlib.Decompress objects since Python 3.3) to distinguish the two
cases:

- dcompdat empty AND zip not at EOF -> still digesting, keep going
- dcompdat empty AND zip at EOF -> compressed_bytes_read scrub or
  genuine EOF; stop.

`getattr(_zip, 'eof', False)` keeps the conservative behavior when
running against a custom zlib object that does not expose the
attribute.

Adds a regression test that reads with chunk_size in {1, 4, 16, 64}
from a 13 KB highly compressible stream. With the old guard, the
chunk_size <= 16 cases stopped at byte 0; the new test asserts they
read all 13000 bytes.

The full existing test suite (24 tests) still passes, including
test_decompress_reader_special_case and test_pack, which exercise the
compressed_bytes_read scrub path that the original guard existed to
protect.
---
 gitdb/stream.py           | 136 ++++++++++++++++++++++----------------
 gitdb/test/test_stream.py |  35 ++++
 2 files changed, 114 insertions(+), 57 deletions(-)

diff --git a/gitdb/stream.py b/gitdb/stream.py
index 1e0be84..f5fc3bc 100644
--- a/gitdb/stream.py
+++ b/gitdb/stream.py
@@ -254,68 +254,90 @@ def read(self, size=-1):
         # copied once, and another copy of a part of it when it creates the unconsumed
         # tail. We have to use it to hand in the appropriate amount of bytes during
         # the next read.
-        tail = self._zip.unconsumed_tail
-        if tail:
-            # move the window, make it as large as size demands. For code-clarity,
-            # we just take the chunk from our map again instead of reusing the unconsumed
-            # tail. The latter one would safe some memory copying, but we could end up
-            # with not getting enough data uncompressed, so we had to sort that out as well.
-            # Now we just assume the worst case, hence the data is uncompressed and the window
-            # needs to be as large as the uncompressed bytes we want to read.
-            self._cws = self._cwe - len(tail)
-            self._cwe = self._cws + size
-        else:
-            cws = self._cws
-            self._cws = self._cwe
-            self._cwe = cws + size
-        # END handle tail
-
-        # if window is too small, make it larger so zip can decompress something
-        if self._cwe - self._cws < 8:
-            self._cwe = self._cws + 8
-        # END adjust winsize
-
-        # takes a slice, but doesn't copy the data, it says ...
-        indata = self._m[self._cws:self._cwe]
-
-        # get the actual window end to be sure we don't use it for computations
-        self._cwe = self._cws + len(indata)
-        dcompdat = self._zip.decompress(indata, size)
-        # update the amount of compressed bytes read
-        # We feed possibly overlapping chunks, which is why the unconsumed tail
-        # has to be taken into consideration, as well as the unused data
-        # if we hit the end of the stream
-        # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
-        # They are thorough, and I assume it is truly working.
-        # Why is this logic as convoluted as it is ? Please look at the table in
-        # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
-        # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
-        # will be the one that works.
-        # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
-        # table in the github issue. This is it ... it was the only way I could make this work everywhere.
-        # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
-        if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
-            unused_datalen = len(self._zip.unconsumed_tail)
-        else:
-            unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
-        # # end handle very special case ...
-
-        self._cbr += len(indata) - unused_datalen
-        self._br += len(dcompdat)
+        #
+        # Decompress in a loop until we have produced `size` bytes or run out
+        # of progress. Iteration (instead of recursion) keeps the call bounded
+        # for streams that consume many input bytes per produced output byte
+        # (e.g. zlib stored blocks of length zero); the previous recursive
+        # form blew the stack on inputs > ~1500 empty blocks (issue #120
+        # follow-up).
+        dcompdat = b''
+        while True:
+            tail = self._zip.unconsumed_tail
+            remaining = size - len(dcompdat)
+            if tail:
+                # move the window, make it as large as size demands. For code-clarity,
+                # we just take the chunk from our map again instead of reusing the unconsumed
+                # tail. The latter one would safe some memory copying, but we could end up
+                # with not getting enough data uncompressed, so we had to sort that out as well.
+                # Now we just assume the worst case, hence the data is uncompressed and the window
+                # needs to be as large as the uncompressed bytes we want to read.
+                self._cws = self._cwe - len(tail)
+                self._cwe = self._cws + remaining
+            else:
+                cws = self._cws
+                self._cws = self._cwe
+                self._cwe = cws + remaining
+            # END handle tail
+
+            # if window is too small, make it larger so zip can decompress something
+            if self._cwe - self._cws < 8:
+                self._cwe = self._cws + 8
+            # END adjust winsize
+
+            # takes a slice, but doesn't copy the data, it says ...
+            indata = self._m[self._cws:self._cwe]
+
+            # get the actual window end to be sure we don't use it for computations
+            self._cwe = self._cws + len(indata)
+            chunk = self._zip.decompress(indata, remaining)
+            # update the amount of compressed bytes read
+            # We feed possibly overlapping chunks, which is why the unconsumed tail
+            # has to be taken into consideration, as well as the unused data
+            # if we hit the end of the stream
+            # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
+            # They are thorough, and I assume it is truly working.
+            # Why is this logic as convoluted as it is ? Please look at the table in
+            # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
+            # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
+            # will be the one that works.
+            # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
+            # table in the github issue. This is it ... it was the only way I could make this work everywhere.
+            # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
+            if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
+                unused_datalen = len(self._zip.unconsumed_tail)
+            else:
+                unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
+            # # end handle very special case ...
+
+            consumed = len(indata) - unused_datalen
+            self._cbr += consumed
+            self._br += len(chunk)
+            dcompdat += chunk
+
+            # Stop when we have enough or there is no path to more output.
+            # `chunk` may legitimately be empty mid-stream when zlib is
+            # consuming header / dictionary frames; in that case we keep
+            # iterating as long as we are still feeding zlib new bytes
+            # (consumed > 0) and zlib has not flagged end-of-stream. The
+            # compressed_bytes_read() scrub loop drives this same code with
+            # _br manipulated to 0 past zip EOF; it terminates here because
+            # `getattr(_zip, 'eof', False)` is True or no compressed bytes
+            # are consumed. The empty-block recursion attack from issue #120
+            # follow-up is bounded by the iteration; each empty block does
+            # consume input, so the loop walks the stream forward a constant
+            # amount per iteration without growing the call stack.
+            if len(dcompdat) >= size or self._br >= self._s:
+                break
+            zip_eof = getattr(self._zip, 'eof', False)
+            if not chunk and (zip_eof or len(indata) == 0 or consumed == 0):
+                break
+        # END iterative decompress
 
         if dat:
             dcompdat = dat + dcompdat
         # END prepend our cached data
 
-        # it can happen, depending on the compression, that we get less bytes
-        # than ordered as it needs the final portion of the data as well.
-        # Recursively resolve that.
-        # Note: dcompdat can be empty even though we still appear to have bytes
-        # to read, if we are called by compressed_bytes_read - it manipulates
-        # us to empty the stream
-        if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
-            dcompdat += self.read(size - len(dcompdat))
-        # END handle special case
         return dcompdat

diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py
index 1e7e941..390caa1 100644
--- a/gitdb/test/test_stream.py
+++ b/gitdb/test/test_stream.py
@@ -162,3 +162,38 @@ def test_decompress_reader_special_case(self):
             dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data)))
             assert dump.hexsha == sha
         # end for each loose object sha to test
+
+    def test_decompress_reader_chunked_read_does_not_terminate_early(self):
+        """Regression test for #120: read(N) must not return b'' before EOF.
+
+        zlib can consume input without producing decompressed output (e.g.
+        while ingesting block headers). The reader's internal recursion
+        previously bailed on any empty zip output, so a caller reading in
+        small chunks via the standard `while chunk := stream.read(N)` idiom
+        would terminate at the first empty chunk -- before the actual end
+        of the uncompressed stream.
+        """
+        # Highly compressible data exposes the bug because each zlib chunk
+        # spans many uncompressed bytes -- intermediate decompress() calls
+        # often return empty while consuming input.
+ data = b"hello world! " * 1000 + zdata = zlib.compress(data) + + # Loop with a small chunk size to force many sub-_s recursions. + for chunk_size in (1, 4, 16, 64): + reader = DecompressMemMapReader( + zdata, close_on_deletion=False, size=len(data) + ) + out = bytearray() + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + out.extend(chunk) + assert bytes(out) == data, ( + f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes" + ) + assert reader._br == reader._s, ( + f"chunk_size={chunk_size}: stream stopped at " + f"{reader._br}/{reader._s}" + ) From ed41d2c26f77bc58a5b8f51100a300c4e09a7a30 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 9 May 2026 08:29:18 +0800 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- gitdb/stream.py | 7 +++++-- gitdb/test/test_stream.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/gitdb/stream.py b/gitdb/stream.py index f5fc3bc..7c59f0b 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -268,7 +268,7 @@ def read(self, size=-1): if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up + # tail. The latter one would save some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. @@ -313,7 +313,10 @@ def read(self, size=-1): consumed = len(indata) - unused_datalen self._cbr += consumed self._br += len(chunk) - dcompdat += chunk + if chunk: + if not isinstance(dcompdat, bytearray): + dcompdat = bytearray(dcompdat) + dcompdat.extend(chunk) # Stop when we have enough or there is no path to more output. # `chunk` may legitimately be empty mid-stream when zlib is diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 390caa1..f36b06b 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -179,7 +179,8 @@ def test_decompress_reader_chunked_read_does_not_terminate_early(self): data = b"hello world! " * 1000 zdata = zlib.compress(data) - # Loop with a small chunk size to force many sub-_s recursions. + # Loop with a small chunk size to force many internal read/decompression + # iterations before EOF. for chunk_size in (1, 4, 16, 64): reader = DecompressMemMapReader( zdata, close_on_deletion=False, size=len(data)