# HG changeset patch
# User mason@suse.com
# Date 1144183632 14400
# Node ID ee96ca273f32f71343a6dba099d547526543b559
# Parent 441ea218414e83dd0d91e2bf8438d796828f89c1
New lazy index code for revlogs.

This tunes revlog reading for large repositories. It does not read the
whole index file in one big chunk, but buffers reads in more reasonable
chunks instead.

Search speeds are improved in two ways. When trying to find a specific
sha hash, it searches from the end of the file backward, since more
recent entries are more likely to be relevant, especially the tip. It
can also load only the mapping of nodes to revlog index numbers;
loading the map uses less cpu (no struct.unpack) and much less memory
than loading both the map and the index.

This cuts down the time for hg tip on the 80,000 changeset kernel repo
from 1.8s to 0.69s. Most commands that pull a single rev out of a big
index get roughly the same benefit. Commands that read the whole index
are not slower.

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -64,6 +64,7 @@ def decompress(bin):
     raise RevlogError(_("unknown compression type %r") % t)
 
 indexformatv0 = ">4l20s20s20s"
+v0shaoffset = 56
 # index ng:
 # 6 bytes offset
 # 2 bytes flags
@@ -75,48 +76,135 @@ indexformatv0 = ">4l20s20s20s"
 # 4 bytes parent 2 rev
 # 32 bytes: nodeid
 indexformatng = ">Qiiiiii20s12x"
+ngshaoffset = 32
 versionformat = ">i"
 
 class lazyparser(object):
     """
     this class avoids the need to parse the entirety of large indices
-
-    By default we parse and load 1000 entries at a time.
-
-    If no position is specified, we load the whole index, and replace
-    the lazy objects in revlog with the underlying objects for
-    efficiency in cases where we look at most of the nodes.
     """
-    def __init__(self, data, revlog, indexformat):
-        self.data = data
+    def __init__(self, dataf, size, indexformat, shaoffset):
+        self.dataf = dataf
+        self.format = indexformat
         self.s = struct.calcsize(indexformat)
         self.indexformat = indexformat
-        self.l = len(data)/self.s
+        self.datasize = size
+        self.l = size/self.s
         self.index = [None] * self.l
         self.map = {nullid: -1}
+        self.allmap = 0
         self.all = 0
-        self.revlog = revlog
+        self.mapfind_count = 0
+        self.shaoffset = shaoffset
 
-    def load(self, pos=None):
+    def loadmap(self):
+        """
+        During a commit, we need to make sure the rev being added is
+        not a duplicate. This requires loading the entire index,
+        which is fairly slow. loadmap can load up just the node map,
+        which takes much less time.
+ """ + if self.allmap: return + start = 0 + end = self.datasize + self.allmap = 1 + cur = 0 + count = 0 + blocksize = self.s * 256 + self.dataf.seek(0) + while cur < end: + data = self.dataf.read(blocksize) + off = 0 + for x in xrange(256): + n = data[off + self.shaoffset:off + self.shaoffset + 20] + self.map[n] = count + count += 1 + if count >= self.l: + break + off += self.s + cur += blocksize + + def loadblock(self, blockstart, blocksize, data=None): if self.all: return - if pos is not None: - block = pos / 1000 - i = block * 1000 - end = min(self.l, i + 1000) + if data is None: + self.dataf.seek(blockstart) + data = self.dataf.read(blocksize) + lend = len(data) / self.s + i = blockstart / self.s + off = 0 + for x in xrange(lend): + if self.index[i + x] == None: + b = data[off : off + self.s] + e = struct.unpack(self.format, b) + self.index[i + x] = e + self.map[e[-1]] = i + x + off += self.s + + def findnode(self, node): + """search backwards through the index file for a specific node""" + if self.allmap: return None + + # hg log will cause many many searches for the manifest + # nodes. After we get called a few times, just load the whole + # thing. + if self.mapfind_count > 8: + self.loadmap() + if node in self.map: + return node + return None + self.mapfind_count += 1 + last = self.l - 1 + while self.index[last] != None: + if last == 0: + self.all = 1 + self.allmap = 1 + return None + last -= 1 + end = (last + 1) * self.s + blocksize = self.s * 256 + while end >= 0: + start = max(end - blocksize, 0) + self.dataf.seek(start) + data = self.dataf.read(end - start) + findend = end - start + while True: + # we're searching backwards, so weh have to make sure + # we don't find a changeset where this node is a parent + off = data.rfind(node, 0, findend) + findend = off + if off >= 0: + i = off / self.s + off = i * self.s + n = data[off + self.shaoffset:off + self.shaoffset + 20] + if n == node: + self.map[n] = i + start / self.s + return node + else: + break + end -= blocksize + return None + + def loadindex(self, i=None, end=None): + if self.all: return + all = False + if i == None: + blockstart = 0 + blocksize = (512 / self.s) * self.s + end = self.datasize + all = True else: - self.all = 1 - i = 0 - end = self.l - self.revlog.index = self.index - self.revlog.nodemap = self.map - - while i < end: - if not self.index[i]: - d = self.data[i * self.s: (i + 1) * self.s] - e = struct.unpack(self.indexformat, d) - self.index[i] = e - self.map[e[-1]] = i - i += 1 + if end: + blockstart = i * self.s + end = end * self.s + blocksize = end - blockstart + else: + blockstart = (i & ~(32)) * self.s + blocksize = self.s * 64 + end = blockstart + blocksize + while blockstart < end: + self.loadblock(blockstart, blocksize) + blockstart += blocksize + if all: self.all = True class lazyindex(object): """a lazy version of the index array""" @@ -127,7 +215,7 @@ class lazyindex(object): def load(self, pos): if pos < 0: pos += len(self.p.index) - self.p.load(pos) + self.p.loadindex(pos) return self.p.index[pos] def __getitem__(self, pos): return self.p.index[pos] or self.load(pos) @@ -143,14 +231,13 @@ class lazymap(object): def __init__(self, parser): self.p = parser def load(self, key): - if self.p.all: return - n = self.p.data.find(key) - if n < 0: + n = self.p.findnode(key) + if n == None: raise KeyError(key) - pos = n / self.p.s - self.p.load(pos) def __contains__(self, key): - self.p.load() + if key in self.p.map: + return True + self.p.loadmap() return key in self.p.map def __iter__(self): yield nullid 
@@ -158,7 +245,7 @@ class lazymap(object):
             try:
                 yield self.p.index[i][-1]
             except:
-                self.p.load(i)
+                self.p.loadindex(i)
                 yield self.p.index[i][-1]
     def __getitem__(self, key):
         try:
@@ -222,7 +309,8 @@ class revlog(object):
         v = self.defversion
         try:
             f = self.opener(self.indexfile)
-            i = f.read()
+            i = f.read(4)
+            f.seek(0)
         except IOError, inst:
             if inst.errno != errno.ENOENT:
                 raise
@@ -241,7 +329,7 @@ class revlog(object):
                 return
             self.indexstat = st
         if len(i) > 0:
-            v = struct.unpack(versionformat, i[:4])[0]
+            v = struct.unpack(versionformat, i)[0]
             flags = v & ~0xFFFF
             fmt = v & 0xFFFF
             if fmt == 0:
@@ -258,16 +346,19 @@ class revlog(object):
         self.version = v
         if v == 0:
             self.indexformat = indexformatv0
+            shaoffset = v0shaoffset
         else:
             self.indexformat = indexformatng
+            shaoffset = ngshaoffset
 
         if i:
             if not self.inlinedata() and st and st.st_size > 10000:
                 # big index, let's parse it on demand
-                parser = lazyparser(i, self, self.indexformat)
+                parser = lazyparser(f, st.st_size, self.indexformat, shaoffset)
                 self.index = lazyindex(parser)
                 self.nodemap = lazymap(parser)
             else:
+                i = f.read()
                 self.parseindex(i)
                 if self.inlinedata():
                     # we've already got the entire data file read in, save it
@@ -312,11 +403,23 @@ class revlog(object):
     def offset_type(self, offset, type):
         return long(long(offset) << 16 | type)
 
+    def loadindex(self, start, end):
+        """load a block of indexes all at once from the lazy parser"""
+        if isinstance(self.index, lazyindex):
+            self.index.p.loadindex(start, end)
+
     def loadindexmap(self):
        """loads both the map and the index from the lazy parser"""
        if isinstance(self.index, lazyindex):
            p = self.index.p
-           p.load()
+           p.loadindex()
+           self.nodemap = p.map
+
+    def loadmap(self):
+        """loads the map from the lazy parser"""
+        if isinstance(self.nodemap, lazymap):
+            self.nodemap.p.loadmap()
+            self.nodemap = self.nodemap.p.map
 
     def inlinedata(self): return self.version & REVLOGNGINLINEDATA
     def tip(self): return self.node(len(self.index) - 1)
@@ -690,7 +793,9 @@ class revlog(object):
         if self.cache and self.cache[1] >= base and self.cache[1] < rev:
             base = self.cache[1]
             text = self.cache[2]
+            self.loadindex(base, rev + 1)
         else:
+            self.loadindex(base, rev + 1)
             text = self.chunk(base, df=df)
 
         bins = []
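
As an aside for readers of this patch: the backward search in
lazyparser.findnode above is easy to study in isolation. The sketch
below is a minimal, hypothetical rework of that loop for a standalone
RevlogNG-style index file. find_node, RECSIZE, SHAOFFSET and BLOCK are
illustrative names, not part of the patch; the 64-byte record size is
struct.calcsize(indexformatng) and the sha offset of 32 is ngshaoffset.

    import struct

    # Illustrative sketch only -- not the patch's code. Assumes an index
    # file of fixed 64-byte RevlogNG records with the 20-byte nodeid at
    # byte 32 of each record, and a filesize that is a multiple of RECSIZE.
    RECSIZE = struct.calcsize(">Qiiiiii20s12x")   # 64 bytes per record
    SHAOFFSET = 32                                # nodeid offset (ngshaoffset)
    BLOCK = RECSIZE * 256                         # one seek per 256 records

    def find_node(f, filesize, node):
        """Return the rev number of the record holding `node`, or None."""
        end = filesize
        # scan blocks from the end of the file: recent revisions,
        # especially the tip, are the likely targets
        while end > 0:
            start = max(end - BLOCK, 0)
            f.seek(start)
            data = f.read(end - start)
            findend = len(data)
            while True:
                off = data.rfind(node, 0, findend)
                if off < 0:
                    break                # nothing here; step one block back
                findend = off            # keep scanning left of this hit
                rec = (off // RECSIZE) * RECSIZE
                # a raw byte hit may be unaligned or, in the v0 format, sit
                # in a record's parent fields; only accept a match that
                # falls exactly at the nodeid offset of a record
                if data[rec + SHAOFFSET:rec + SHAOFFSET + 20] == node:
                    return (start + rec) // RECSIZE
            end = start
        return None

Because every block boundary stays a multiple of RECSIZE, a record never
straddles two reads, and nothing is struct.unpack'ed at all -- the same
property that makes loadmap cheap compared to parsing the full index.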