comparison mercurial/revlog.py @ 2079:ee96ca273f32

New lazy index code for revlogs. This tunes for large repositories. It does not read the whole index file in one big chunk, but tries to buffer reads in more reasonable chunks instead. Search speeds are improved in two ways. When trying to find a specific sha hash, it searches from the end of the file backward. More recent entries are more likely to be relevant, especially the tip. Also, this can load only the mapping of nodes to revlog index number. Loading the map uses less cpu (no struct.unpack) and much less memory than loading both the map and the index. This cuts down the time for hg tip on the 80,000 changeset kernel repo from 3.69s to 1.8s. Most commands that pull a single rev out of a big index get roughly the same benefit. Commands that read the whole index are not slower.
author mason@suse.com
date Tue, 04 Apr 2006 16:47:12 -0400
parents 441ea218414e
children 1cbb14c048cb
comparison
equal deleted inserted replaced
2078:441ea218414e 2079:ee96ca273f32
62 if t == 'x': return zlib.decompress(bin) 62 if t == 'x': return zlib.decompress(bin)
63 if t == 'u': return bin[1:] 63 if t == 'u': return bin[1:]
64 raise RevlogError(_("unknown compression type %r") % t) 64 raise RevlogError(_("unknown compression type %r") % t)
65 65
66 indexformatv0 = ">4l20s20s20s" 66 indexformatv0 = ">4l20s20s20s"
67 v0shaoffset = 56
67 # index ng: 68 # index ng:
68 # 6 bytes offset 69 # 6 bytes offset
69 # 2 bytes flags 70 # 2 bytes flags
70 # 4 bytes compressed length 71 # 4 bytes compressed length
71 # 4 bytes uncompressed length 72 # 4 bytes uncompressed length
73 # 4 bytes link rev 74 # 4 bytes link rev
74 # 4 bytes parent 1 rev 75 # 4 bytes parent 1 rev
75 # 4 bytes parent 2 rev 76 # 4 bytes parent 2 rev
76 # 32 bytes: nodeid 77 # 32 bytes: nodeid
77 indexformatng = ">Qiiiiii20s12x" 78 indexformatng = ">Qiiiiii20s12x"
79 ngshaoffset = 32
78 versionformat = ">i" 80 versionformat = ">i"
79 81
class lazyparser(object):
    """
    this class avoids the need to parse the entirety of large indices

    Entries are read from the index file (dataf) on demand, in blocks,
    and cached in self.index (unpacked entry tuples) and self.map
    (node sha -> rev number).  Written for Python 2 semantics
    (integer '/', xrange, str-typed file data).
    """
    def __init__(self, dataf, size, indexformat, shaoffset):
        # dataf: open file object for the index file
        # size: total size of the index data in bytes
        # indexformat: struct format string for one index entry
        # shaoffset: byte offset of the 20-byte node sha within an entry
        self.dataf = dataf
        self.format = indexformat
        self.s = struct.calcsize(indexformat)  # bytes per index entry
        self.indexformat = indexformat
        self.datasize = size
        self.l = size/self.s  # total number of entries
        self.index = [None] * self.l  # lazily-filled entry tuples
        self.map = {nullid: -1}  # node sha -> rev number
        self.allmap = 0  # set once the whole node map has been loaded
        self.all = 0  # set once the whole index has been loaded
        self.mapfind_count = 0  # number of findnode() calls so far
        self.shaoffset = shaoffset

    def loadmap(self):
        """
        during a commit, we need to make sure the rev being added is
        not a duplicate. This requires loading the entire index,
        which is fairly slow. loadmap can load up just the node map,
        which takes much less time.
        """
        if self.allmap: return
        start = 0  # (unused)
        end = self.datasize
        self.allmap = 1
        cur = 0
        count = 0
        # read 256 entries at a time; only the sha is sliced out of
        # each entry, so no struct.unpack cost is paid here
        blocksize = self.s * 256
        self.dataf.seek(0)
        while cur < end:
            data = self.dataf.read(blocksize)
            off = 0
            for x in xrange(256):
                n = data[off + self.shaoffset:off + self.shaoffset + 20]
                self.map[n] = count
                count += 1
                if count >= self.l:
                    # last entry recorded; stop before walking past the
                    # end of a short final block
                    break
                off += self.s
            cur += blocksize

    def loadblock(self, blockstart, blocksize, data=None):
        # parse one block of the index file into self.index/self.map;
        # if data is not supplied, it is read from self.dataf
        if self.all: return
        if data is None:
            self.dataf.seek(blockstart)
            data = self.dataf.read(blocksize)
        lend = len(data) / self.s  # entries actually present in data
        i = blockstart / self.s    # rev number of the first entry
        off = 0
        for x in xrange(lend):
            if self.index[i + x] == None:
                b = data[off : off + self.s]
                e = struct.unpack(self.format, b)
                self.index[i + x] = e
                self.map[e[-1]] = i + x  # last unpacked field is the node sha
            off += self.s

    def findnode(self, node):
        """search backwards through the index file for a specific node"""
        if self.allmap: return None

        # hg log will cause many many searches for the manifest
        # nodes.  After we get called a few times, just load the whole
        # thing.
        if self.mapfind_count > 8:
            self.loadmap()
            if node in self.map:
                return node
            return None
        self.mapfind_count += 1
        last = self.l - 1
        # skip over entries already parsed at the end of the index; if
        # every entry is parsed, the node simply is not present
        while self.index[last] != None:
            if last == 0:
                self.all = 1
                self.allmap = 1
                return None
            last -= 1
        end = (last + 1) * self.s
        blocksize = self.s * 256
        # scan blocks from the end of the file toward the start: recent
        # entries (e.g. the tip) are the most likely search targets
        while end >= 0:
            start = max(end - blocksize, 0)
            self.dataf.seek(start)
            data = self.dataf.read(end - start)
            findend = end - start
            while True:
                # we're searching backwards, so we have to make sure
                # we don't find a changeset where this node is a parent
                off = data.rfind(node, 0, findend)
                findend = off
                if off >= 0:
                    # round down to the entry boundary and verify the
                    # match sits at the sha field, not at a parent field
                    i = off / self.s
                    off = i * self.s
                    n = data[off + self.shaoffset:off + self.shaoffset + 20]
                    if n == node:
                        self.map[n] = i + start / self.s
                        return node
                else:
                    break
            end -= blocksize
        return None

    def loadindex(self, i=None, end=None):
        # load index entries: all of them (i is None), the range
        # [i, end) when end is given, or a 64-entry block around i
        if self.all: return
        all = False
        if i == None:
            blockstart = 0
            # round the read size down to a whole multiple of the entry size
            blocksize = (512 / self.s) * self.s
            end = self.datasize
            all = True
        else:
            if end:
                blockstart = i * self.s
                end = end * self.s
                blocksize = end - blockstart
            else:
                # NOTE(review): ~(32) clears only bit 5 of i — it is not a
                # 64-entry alignment mask (perhaps ~63 was intended); the
                # 64-entry block below still always covers entry i, since
                # at most 32 entries are subtracted from the start
                blockstart = (i & ~(32)) * self.s
                blocksize = self.s * 64
                end = blockstart + blocksize
        while blockstart < end:
            self.loadblock(blockstart, blocksize)
            blockstart += blocksize
        if all: self.all = True
120 208
121 class lazyindex(object): 209 class lazyindex(object):
122 """a lazy version of the index array""" 210 """a lazy version of the index array"""
123 def __init__(self, parser): 211 def __init__(self, parser):
124 self.p = parser 212 self.p = parser
125 def __len__(self): 213 def __len__(self):
126 return len(self.p.index) 214 return len(self.p.index)
127 def load(self, pos): 215 def load(self, pos):
128 if pos < 0: 216 if pos < 0:
129 pos += len(self.p.index) 217 pos += len(self.p.index)
130 self.p.load(pos) 218 self.p.loadindex(pos)
131 return self.p.index[pos] 219 return self.p.index[pos]
132 def __getitem__(self, pos): 220 def __getitem__(self, pos):
133 return self.p.index[pos] or self.load(pos) 221 return self.p.index[pos] or self.load(pos)
134 def __setitem__(self, pos, item): 222 def __setitem__(self, pos, item):
135 self.p.index[pos] = item 223 self.p.index[pos] = item
141 class lazymap(object): 229 class lazymap(object):
142 """a lazy version of the node map""" 230 """a lazy version of the node map"""
143 def __init__(self, parser): 231 def __init__(self, parser):
144 self.p = parser 232 self.p = parser
145 def load(self, key): 233 def load(self, key):
146 if self.p.all: return 234 n = self.p.findnode(key)
147 n = self.p.data.find(key) 235 if n == None:
148 if n < 0:
149 raise KeyError(key) 236 raise KeyError(key)
150 pos = n / self.p.s
151 self.p.load(pos)
152 def __contains__(self, key): 237 def __contains__(self, key):
153 self.p.load() 238 if key in self.p.map:
239 return True
240 self.p.loadmap()
154 return key in self.p.map 241 return key in self.p.map
155 def __iter__(self): 242 def __iter__(self):
156 yield nullid 243 yield nullid
157 for i in xrange(self.p.l): 244 for i in xrange(self.p.l):
158 try: 245 try:
159 yield self.p.index[i][-1] 246 yield self.p.index[i][-1]
160 except: 247 except:
161 self.p.load(i) 248 self.p.loadindex(i)
162 yield self.p.index[i][-1] 249 yield self.p.index[i][-1]
163 def __getitem__(self, key): 250 def __getitem__(self, key):
164 try: 251 try:
165 return self.p.map[key] 252 return self.p.map[key]
166 except KeyError: 253 except KeyError:
220 307
221 def load(self): 308 def load(self):
222 v = self.defversion 309 v = self.defversion
223 try: 310 try:
224 f = self.opener(self.indexfile) 311 f = self.opener(self.indexfile)
225 i = f.read() 312 i = f.read(4)
313 f.seek(0)
226 except IOError, inst: 314 except IOError, inst:
227 if inst.errno != errno.ENOENT: 315 if inst.errno != errno.ENOENT:
228 raise 316 raise
229 i = "" 317 i = ""
230 else: 318 else:
239 and st.st_mtime == oldst.st_mtime 327 and st.st_mtime == oldst.st_mtime
240 and st.st_ctime == oldst.st_ctime): 328 and st.st_ctime == oldst.st_ctime):
241 return 329 return
242 self.indexstat = st 330 self.indexstat = st
243 if len(i) > 0: 331 if len(i) > 0:
244 v = struct.unpack(versionformat, i[:4])[0] 332 v = struct.unpack(versionformat, i)[0]
245 flags = v & ~0xFFFF 333 flags = v & ~0xFFFF
246 fmt = v & 0xFFFF 334 fmt = v & 0xFFFF
247 if fmt == 0: 335 if fmt == 0:
248 if flags: 336 if flags:
249 raise RevlogError(_("index %s invalid flags %x for format v0" % 337 raise RevlogError(_("index %s invalid flags %x for format v0" %
256 raise RevlogError(_("index %s invalid format %d" % 344 raise RevlogError(_("index %s invalid format %d" %
257 (self.indexfile, fmt))) 345 (self.indexfile, fmt)))
258 self.version = v 346 self.version = v
259 if v == 0: 347 if v == 0:
260 self.indexformat = indexformatv0 348 self.indexformat = indexformatv0
349 shaoffset = v0shaoffset
261 else: 350 else:
262 self.indexformat = indexformatng 351 self.indexformat = indexformatng
352 shaoffset = ngshaoffset
263 353
264 if i: 354 if i:
265 if not self.inlinedata() and st and st.st_size > 10000: 355 if not self.inlinedata() and st and st.st_size > 10000:
266 # big index, let's parse it on demand 356 # big index, let's parse it on demand
267 parser = lazyparser(i, self, self.indexformat) 357 parser = lazyparser(f, st.st_size, self.indexformat, shaoffset)
268 self.index = lazyindex(parser) 358 self.index = lazyindex(parser)
269 self.nodemap = lazymap(parser) 359 self.nodemap = lazymap(parser)
270 else: 360 else:
361 i = f.read()
271 self.parseindex(i) 362 self.parseindex(i)
272 if self.inlinedata(): 363 if self.inlinedata():
273 # we've already got the entire data file read in, save it 364 # we've already got the entire data file read in, save it
274 # in the chunk data 365 # in the chunk data
275 self.chunkcache = (0, i) 366 self.chunkcache = (0, i)
310 return int(q & 0xFFFF) 401 return int(q & 0xFFFF)
311 402
def offset_type(self, offset, type):
    # pack a data-file offset and a 16-bit type/flags value into a
    # single integer: offset in the high bits, type in the low 16 bits
    # (the inverse of the `& 0xFFFF` extraction used elsewhere here)
    return long(long(offset) << 16 | type)
314 405
def loadindex(self, start, end):
    """load a block of indexes all at once from the lazy parser"""
    index = self.index
    if isinstance(index, lazyindex):
        index.p.loadindex(start, end)
def loadindexmap(self):
    """loads both the map and the index from the lazy parser"""
    if not isinstance(self.index, lazyindex):
        return
    parser = self.index.p
    parser.loadindex()
    self.nodemap = parser.map
417
def loadmap(self):
    """loads the map from the lazy parser"""
    if not isinstance(self.nodemap, lazymap):
        return
    parser = self.nodemap.p
    parser.loadmap()
    self.nodemap = parser.map
320 423
def inlinedata(self): return self.version & REVLOGNGINLINEDATA  # flag bit of the version field: data is stored inline in the index file
def tip(self):
    # the most recent revision is the last entry in the index
    last_rev = len(self.index) - 1
    return self.node(last_rev)
def count(self):
    # number of revisions currently in the index
    return len(self.index)
324 def node(self, rev): 427 def node(self, rev):
688 791
689 # do we have useful data cached? 792 # do we have useful data cached?
690 if self.cache and self.cache[1] >= base and self.cache[1] < rev: 793 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
691 base = self.cache[1] 794 base = self.cache[1]
692 text = self.cache[2] 795 text = self.cache[2]
693 else: 796 self.loadindex(base, rev + 1)
797 else:
798 self.loadindex(base, rev + 1)
694 text = self.chunk(base, df=df) 799 text = self.chunk(base, df=df)
695 800
696 bins = [] 801 bins = []
697 for r in xrange(base + 1, rev + 1): 802 for r in xrange(base + 1, rev + 1):
698 bins.append(self.chunk(r, df=df)) 803 bins.append(self.chunk(r, df=df))