# HG changeset patch # User mason@suse.com # Date 1131762022 28800 # Node ID 80a3d6a0af716f9a05d6a4eabe31dbb136eb5b2f # Parent 3d11f81c9145267a80644b077c11127a0afd7c68 Optimize manifest.add Testing shows that manifest.add is spending a significant percentage of its time running calcoffsets and doing text = "".join(addlist). This patch removes the need for both of these by storying the manifest in a character array, and using a modified bisect search to find lines without the help of a separate index of line offsets. manifest.add was also reworked to push delta construction/combination into the main loop. Time to apply 2751 patches (without psyco, ext3 noatime,data=writeback): Stock hg: 4m45s real 3m32s user 55s sys patched: 2m48s real 1m53s user 43s sys quilt: 2m30s real 45s user 50s sys (quilt does much more io...) diff --git a/mercurial/manifest.py b/mercurial/manifest.py --- a/mercurial/manifest.py +++ b/mercurial/manifest.py @@ -9,13 +9,12 @@ import sys, struct from revlog import * from i18n import gettext as _ from demandload import * -demandload(globals(), "bisect") +demandload(globals(), "bisect array") class manifest(revlog): def __init__(self, opener): self.mapcache = None self.listcache = None - self.addlist = None revlog.__init__(self, opener, "00manifest.i", "00manifest.d") def read(self, node): @@ -25,8 +24,9 @@ class manifest(revlog): text = self.revision(node) map = {} flag = {} - self.listcache = (text, text.splitlines(1)) - for l in self.listcache[1]: + self.listcache = array.array('c', text) + lines = text.splitlines(1) + for l in lines: (f, n) = l.split('\0') map[f] = bin(n[:40]) flag[f] = (n[40:-1] == "x") @@ -39,57 +39,67 @@ class manifest(revlog): self.read(node) return self.mapcache[2] + def diff(self, a, b): + return mdiff.textdiff(str(a), str(b)) + def add(self, map, flags, transaction, link, p1=None, p2=None, changed=None): - # directly generate the mdiff delta from the data collected during - # the bisect loop below - def gendelta(delta): - i = 0 - result = [] - while i < len(delta): - start = delta[i][2] - end = delta[i][3] - l = delta[i][4] - if l == None: - l = "" - while i < len(delta) - 1 and start <= delta[i+1][2] \ - and end >= delta[i+1][2]: - if delta[i+1][3] > end: - end = delta[i+1][3] - if delta[i+1][4]: - l += delta[i+1][4] + + # returns a tuple (start, end). If the string is found + # m[start:end] are the line containing that string. If start == end + # the string was not found and they indicate the proper sorted + # insertion point. This was taken from bisect_left, and modified + # to find line start/end as it goes along. + # + # m should be a buffer or a string + # s is a string + # + def manifestsearch(m, s, lo=0, hi=None): + def advance(i, c): + while i < lenm and m[i] != c: i += 1 - result.append(struct.pack(">lll", start, end, len(l)) + l) - i += 1 - return result + return i + lenm = len(m) + if not hi: + hi = lenm + while lo < hi: + mid = (lo + hi) // 2 + start = mid + while start > 0 and m[start-1] != '\n': + start -= 1 + end = advance(start, '\0') + if m[start:end] < s: + # we know that after the null there are 40 bytes of sha1 + # this translates to the bisect lo = mid + 1 + lo = advance(end + 40, '\n') + 1 + else: + # this translates to the bisect hi = mid + hi = start + end = advance(lo, '\0') + found = m[lo:end] + if cmp(s, found) == 0: + # we know that after the null there are 40 bytes of sha1 + end = advance(end + 40, '\n') + return (lo, end+1) + else: + return (lo, lo) # apply the changes collected during the bisect loop to our addlist - def addlistdelta(addlist, delta): - # apply the deltas to the addlist. start from the bottom up + # return a delta suitable for addrevision + def addlistdelta(addlist, x): + # start from the bottom up # so changes to the offsets don't mess things up. - i = len(delta) + i = len(x) while i > 0: i -= 1 - start = delta[i][0] - end = delta[i][1] - if delta[i][4]: - addlist[start:end] = [delta[i][4]] + start = x[i][0] + end = x[i][1] + if x[i][2]: + addlist[start:end] = array.array('c', x[i][2]) else: del addlist[start:end] - return addlist - - # calculate the byte offset of the start of each line in the - # manifest - def calcoffsets(addlist): - offsets = [0] * (len(addlist) + 1) - offset = 0 - i = 0 - while i < len(addlist): - offsets[i] = offset - offset += len(addlist[i]) - i += 1 - offsets[i] = offset - return offsets + return "".join([struct.pack(">lll", d[0], d[1], len(d[2])) + d[2] \ + for d in x ]) # if we're using the listcache, make sure it is valid and # parented by the same node we're diffing against @@ -98,15 +108,13 @@ class manifest(revlog): files = map.keys() files.sort() - self.addlist = ["%s\000%s%s\n" % + text = ["%s\000%s%s\n" % (f, hex(map[f]), flags[f] and "x" or '') for f in files] + self.listcache = array.array('c', "".join(text)) cachedelta = None else: - addlist = self.listcache[1] - - # find the starting offset for each line in the add list - offsets = calcoffsets(addlist) + addlist = self.listcache # combine the changed lists into one list for sorting work = [[x, 0] for x in changed[0]] @@ -114,45 +122,52 @@ class manifest(revlog): work.sort() delta = [] - bs = 0 + dstart = None + dend = None + dline = [""] + start = 0 + # zero copy representation of addlist as a buffer + addbuf = buffer(addlist) + # start with a readonly loop that finds the offset of + # each line and creates the deltas for w in work: f = w[0] # bs will either be the index of the item or the insert point - bs = bisect.bisect(addlist, f, bs) - if bs < len(addlist): - fn = addlist[bs][:addlist[bs].index('\0')] - else: - fn = None + start, end = manifestsearch(addbuf, f, start) if w[1] == 0: l = "%s\000%s%s\n" % (f, hex(map[f]), flags[f] and "x" or '') else: - l = None - start = bs - if fn != f: - # item not found, insert a new one - end = bs - if w[1] == 1: - raise AssertionError( + l = "" + if start == end and w[1] == 1: + # item we want to delete was not found, error out + raise AssertionError( _("failed to remove %s from manifest\n") % f) + if dstart != None and dstart <= start and dend >= start: + if dend < end: + dend = end + if l: + dline.append(l) else: - # item is found, replace/delete the existing line - end = bs + 1 - delta.append([start, end, offsets[start], offsets[end], l]) + if dstart != None: + delta.append([dstart, dend, "".join(dline)]) + dstart = start + dend = end + dline = [l] - self.addlist = addlistdelta(addlist, delta) - if self.mapcache[0] == self.tip(): - cachedelta = "".join(gendelta(delta)) - else: - cachedelta = None + if dstart != None: + delta.append([dstart, dend, "".join(dline)]) + # apply the delta to the addlist, and get a delta for addrevision + cachedelta = addlistdelta(addlist, delta) - text = "".join(self.addlist) - if cachedelta and mdiff.patch(self.listcache[0], cachedelta) != text: - raise AssertionError(_("manifest delta failure\n")) - n = self.addrevision(text, transaction, link, p1, p2, cachedelta) + # the delta is only valid if we've been processing the tip revision + if self.mapcache[0] != self.tip(): + cachedelta = None + self.listcache = addlist + + n = self.addrevision(buffer(self.listcache), transaction, link, p1, \ + p2, cachedelta) self.mapcache = (n, map, flags) - self.listcache = (text, self.addlist) - self.addlist = None return n