revlog: break up compression of large deltas
author Matt Mackall <mpm@selenic.com>
Thu, 11 Oct 2007 00:46:54 -0500
changeset 5451 0a43875677b1
parent 5450 c728424d44c6
child 5452 82b4ff3abbcd
Python's zlib apparently makes an internal copy of strings passed to compress(). To avoid this, compress strings 1M at a time, then join them at the end if the result would be smaller than the original.

For initial commits of large but compressible files, this cuts peak memory usage nearly in half.
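The technique described above can be shown outside of revlog. The following is a minimal standalone sketch, not the revlog code itself; compress_in_pieces and CHUNK are illustrative names, not Mercurial APIs. It feeds a zlib.compressobj() one 1 MiB slice at a time, so zlib never has to hold a second full-size copy of the input, and joins the compressed pieces at the end.

import zlib

CHUNK = 2 ** 20  # 1 MiB, the same piece size the patch uses

def compress_in_pieces(data):
    """Compress a large bytestring one 1 MiB piece at a time."""
    z = zlib.compressobj()
    pieces = []
    for pos in range(0, len(data), CHUNK):
        # z.compress() only ever sees a 1 MiB slice, so zlib's internal
        # copy of its input stays small regardless of len(data).
        pieces.append(z.compress(data[pos:pos + CHUNK]))
    pieces.append(z.flush())
    return b"".join(pieces)

blob = b"revlog " * (2 * CHUNK)            # ~14 MiB of compressible bytes
packed = compress_in_pieces(blob)
assert zlib.decompress(packed) == blob     # the pieces join into one valid zlib stream
print("%d -> %d bytes" % (len(blob), len(packed)))

The incremental output decompresses exactly like output from a single zlib.compress() call, which is why the patch can keep the rest of compress() unchanged.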
mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -61,12 +61,27 @@ def compress(text):
     """ generate a possibly-compressed representation of text """
     if not text:
         return ("", text)
-    if len(text) < 44:
+    l = len(text)
+    if l < 44:
         if text[0] == '\0':
             return ("", text)
         return ('u', text)
-    bin = _compress(text)
-    if len(bin) > len(text):
+    elif l > 1000000:
+        # zlib makes an internal copy, thus doubling memory usage for
+        # large files, so lets do this in pieces
+        z = zlib.compressobj()
+        p = []
+        pos = 0
+        while pos < l:
+            pos2 = pos + 2**20
+            p.append(z.compress(text[pos:pos2]))
+            pos = pos2
+        p.append(z.flush())
+        if sum(map(len, p)) < l:
+            bin = "".join(p)
+    else:
+        bin = _compress(text)
+    if len(bin) > l:
         if text[0] == '\0':
             return ("", text)
         return ('u', text)
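For context, the flag returned by compress() has to be understood by a matching read path. The sketch below is a hedged illustration of that convention, not a verbatim copy of revlog's own decompress(); it assumes the flag and data are concatenated when stored, and that a zlib stream begins with the byte 'x' (true for zlib's default header).

import zlib

def decompress_entry(data):
    """Illustrative reader for the (flag + data) convention sketched above."""
    if not data:
        return data
    first = data[:1]
    if first == b"\0":          # stored literally, no flag needed
        return data
    if first == b"x":           # zlib header byte under default settings
        return zlib.decompress(data)
    if first == b"u":           # explicit "uncompressed" marker
        return data[1:]
    raise ValueError("unknown compression flag %r" % first)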