# HG changeset patch
# User Matt Mackall <mpm@selenic.com>
# Date 1192081614 18000
# Node ID 0a43875677b1eec91a08f94fb8a06e4c855b7736
# Parent  c728424d44c61416645e9d8d6e62523ea654fc74
revlog: break up compression of large deltas

Python's zlib apparently makes an internal copy of strings passed to
compress(). To avoid this, compress strings 1M at a time, then join
them at the end if the result would be smaller than the original.

For initial commits of large but compressible files, this cuts peak
memory usage nearly in half.

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -61,12 +61,27 @@ def compress(text):
     """ generate a possibly-compressed representation of text """
     if not text:
         return ("", text)
-    if len(text) < 44:
+    l = len(text)
+    if l < 44:
         if text[0] == '\0':
             return ("", text)
         return ('u', text)
-    bin = _compress(text)
-    if len(bin) > len(text):
+    elif l > 1000000:
+        # zlib makes an internal copy, thus doubling memory usage for
+        # large files, so lets do this in pieces
+        z = zlib.compressobj()
+        p = []
+        pos = 0
+        while pos < l:
+            pos2 = pos + 2**20
+            p.append(z.compress(text[pos:pos2]))
+            pos = pos2
+        p.append(z.flush())
+        if sum(map(len, p)) < l:
+            bin = "".join(p)
+    else:
+        bin = _compress(text)
+    if len(bin) > l:
         if text[0] == '\0':
             return ("", text)
         return ('u', text)