diff --git a/gitdb/stream.py b/gitdb/stream.py index 1e0be84..7c59f0b 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -254,68 +254,93 @@ def read(self, size=-1): # copied once, and another copy of a part of it when it creates the unconsumed # tail. We have to use it to hand in the appropriate amount of bytes during # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - # END handle tail - - # if window is too small, make it larger so zip can decompress something - if self._cwe - self._cws < 8: - self._cwe = self._cws + 8 - # END adjust winsize - - # takes a slice, but doesn't copy the data, it says ... - indata = self._m[self._cws:self._cwe] - - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - dcompdat = self._zip.decompress(indata, size) - # update the amount of compressed bytes read - # We feed possibly overlapping chunks, which is why the unconsumed tail - # has to be taken into consideration, as well as the unused data - # if we hit the end of the stream - # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. - # They are thorough, and I assume it is truly working. - # Why is this logic as convoluted as it is ? Please look at the table in - # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. 
- # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch - # will be the one that works. - # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the - # table in the github issue. This is it ... it was the only way I could make this work everywhere. - # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . - if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': - unused_datalen = len(self._zip.unconsumed_tail) - else: - unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) - # # end handle very special case ... - - self._cbr += len(indata) - unused_datalen - self._br += len(dcompdat) + # + # Decompress in a loop until we have produced `size` bytes or run out + # of progress. Iteration (instead of recursion) keeps the call bounded + # for streams that consume many input bytes per produced output byte + # (e.g. zlib stored blocks of length zero); the previous recursive + # form blew the stack on inputs > ~1500 empty blocks (issue #120 + # follow-up). + dcompdat = b'' + while True: + tail = self._zip.unconsumed_tail + remaining = size - len(dcompdat) + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would save some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. 
+ self._cws = self._cwe - len(tail) + self._cwe = self._cws + remaining + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + remaining + # END handle tail + + # if window is too small, make it larger so zip can decompress something + if self._cwe - self._cws < 8: + self._cwe = self._cws + 8 + # END adjust winsize + + # takes a slice, but doesn't copy the data, it says ... + indata = self._m[self._cws:self._cwe] + + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + chunk = self._zip.decompress(indata, remaining) + # update the amount of compressed bytes read + # We feed possibly overlapping chunks, which is why the unconsumed tail + # has to be taken into consideration, as well as the unused data + # if we hit the end of the stream + # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. + # They are thorough, and I assume it is truly working. + # Why is this logic as convoluted as it is ? Please look at the table in + # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. + # Basically, on py2.6, you want to use branch 1, whereas on all other python versions, the second branch + # will be the one that works. + # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the + # table in the github issue. This is it ... it was the only way I could make this work everywhere. + # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . + if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': + unused_datalen = len(self._zip.unconsumed_tail) + else: + unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) + # # end handle very special case ... 
+ + consumed = len(indata) - unused_datalen + self._cbr += consumed + self._br += len(chunk) + if chunk: + if not isinstance(dcompdat, bytearray): + dcompdat = bytearray(dcompdat) + dcompdat.extend(chunk) + + # Stop when we have enough or there is no path to more output. + # `chunk` may legitimately be empty mid-stream when zlib is + # consuming header / dictionary frames; in that case we keep + # iterating as long as we are still feeding zlib new bytes + # (consumed > 0) and zlib has not flagged end-of-stream. The + # compressed_bytes_read() scrub loop drives this same code with + # _br manipulated to 0 past zip EOF; it terminates here because + # `getattr(_zip, 'eof', False)` is True or no compressed bytes + # are consumed. The empty-block recursion attack from issue #120 + # follow-up is bounded by the iteration; each empty block does + # consume input, so the loop walks the stream forward a constant + # amount per iteration without growing the call stack. + if len(dcompdat) >= size or self._br >= self._s: + break + zip_eof = getattr(self._zip, 'eof', False) + if not chunk and (zip_eof or len(indata) == 0 or consumed == 0): + break + # END iterative decompress if dat: dcompdat = dat + dcompdat # END prepend our cached data - # it can happen, depending on the compression, that we get less bytes - # than ordered as it needs the final portion of the data as well. - # Recursively resolve that. 
- # Note: dcompdat can be empty even though we still appear to have bytes - # to read, if we are called by compressed_bytes_read - it manipulates - # us to empty the stream - if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: - dcompdat += self.read(size - len(dcompdat)) - # END handle special case return dcompdat diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 1e7e941..f36b06b 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -162,3 +162,39 @@ def test_decompress_reader_special_case(self): dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) assert dump.hexsha == sha # end for each loose object sha to test + + def test_decompress_reader_chunked_read_does_not_terminate_early(self): + """Regression test for #120: read(N) must not return b'' before EOF. + + zlib can consume input without producing decompressed output (e.g. + while ingesting block headers). The reader's internal recursion + previously bailed on any empty zip output, so a caller reading in + small chunks via the standard `while chunk := stream.read(N)` idiom + would terminate at the first empty chunk -- before the actual end + of the uncompressed stream. + """ + # Highly compressible data exposes the bug because each zlib chunk + # spans many uncompressed bytes -- intermediate decompress() calls + # often return empty while consuming input. + data = b"hello world! " * 1000 + zdata = zlib.compress(data) + + # Loop with a small chunk size to force many internal read/decompression + # iterations before EOF. 
+ for chunk_size in (1, 4, 16, 64): + reader = DecompressMemMapReader( + zdata, close_on_deletion=False, size=len(data) + ) + out = bytearray() + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + out.extend(chunk) + assert bytes(out) == data, ( + f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes" + ) + assert reader._br == reader._s, ( + f"chunk_size={chunk_size}: stream stopped at " + f"{reader._br}/{reader._s}" + )