comparison mercurial/revlog.py @ 2072:74d3f5336b66

Implement revlogng. revlogng results in smaller indexes, can address larger data files, and supports flags and version numbers. By default the original revlog format is used. To use the new format, use the following .hgrc field: [revlog] # format choices are 0 (classic revlog format) and 1 (revlogng); format=1
author mason@suse.com
date Tue, 04 Apr 2006 16:38:43 -0400
parents 4aab906517c6
children 1e6745f78989
comparison
equal deleted inserted replaced
2042:a514c7509fa9 2072:74d3f5336b66
13 from node import * 13 from node import *
14 from i18n import gettext as _ 14 from i18n import gettext as _
15 from demandload import demandload 15 from demandload import demandload
16 demandload(globals(), "binascii changegroup errno heapq mdiff os") 16 demandload(globals(), "binascii changegroup errno heapq mdiff os")
17 demandload(globals(), "sha struct zlib") 17 demandload(globals(), "sha struct zlib")
18
19 # revlog version strings
20 REVLOGV0 = 0
21 REVLOGNG = 1
18 22
19 def hash(text, p1, p2): 23 def hash(text, p1, p2):
20 """generate a hash from the given text and its parent hashes 24 """generate a hash from the given text and its parent hashes
21 25
22 This hash combines both the current file contents and its history 26 This hash combines both the current file contents and its history
49 if t == '\0': return bin 53 if t == '\0': return bin
50 if t == 'x': return zlib.decompress(bin) 54 if t == 'x': return zlib.decompress(bin)
51 if t == 'u': return bin[1:] 55 if t == 'u': return bin[1:]
52 raise RevlogError(_("unknown compression type %r") % t) 56 raise RevlogError(_("unknown compression type %r") % t)
53 57
54 indexformat = ">4l20s20s20s" 58 indexformatv0 = ">4l20s20s20s"
59 # index ng:
60 # 6 bytes offset
61 # 2 bytes flags
62 # 4 bytes compressed length
63 # 4 bytes uncompressed length
64 # 4 bytes: base rev
65 # 4 bytes link rev
66 # 4 bytes parent 1 rev
67 # 4 bytes parent 2 rev
68 # 32 bytes: nodeid
69 indexformatng = ">Qiiiiii20s12x"
70 versionformat = ">i"
55 71
56 class lazyparser(object): 72 class lazyparser(object):
57 """ 73 """
58 this class avoids the need to parse the entirety of large indices 74 this class avoids the need to parse the entirety of large indices
59 75
61 77
62 If no position is specified, we load the whole index, and replace 78 If no position is specified, we load the whole index, and replace
63 the lazy objects in revlog with the underlying objects for 79 the lazy objects in revlog with the underlying objects for
64 efficiency in cases where we look at most of the nodes. 80 efficiency in cases where we look at most of the nodes.
65 """ 81 """
66 def __init__(self, data, revlog): 82 def __init__(self, data, revlog, indexformat):
67 self.data = data 83 self.data = data
68 self.s = struct.calcsize(indexformat) 84 self.s = struct.calcsize(indexformat)
85 self.indexformat = indexformat
69 self.l = len(data)/self.s 86 self.l = len(data)/self.s
70 self.index = [None] * self.l 87 self.index = [None] * self.l
71 self.map = {nullid: -1} 88 self.map = {nullid: -1}
72 self.all = 0 89 self.all = 0
73 self.revlog = revlog 90 self.revlog = revlog
74
75 def trunc(self, pos):
76 self.l = pos/self.s
77 91
78 def load(self, pos=None): 92 def load(self, pos=None):
79 if self.all: return 93 if self.all: return
80 if pos is not None: 94 if pos is not None:
81 block = pos / 1000 95 block = pos / 1000
87 end = self.l 101 end = self.l
88 self.revlog.index = self.index 102 self.revlog.index = self.index
89 self.revlog.nodemap = self.map 103 self.revlog.nodemap = self.map
90 104
91 while i < end: 105 while i < end:
92 d = self.data[i * self.s: (i + 1) * self.s] 106 if not self.index[i]:
93 e = struct.unpack(indexformat, d) 107 d = self.data[i * self.s: (i + 1) * self.s]
94 self.index[i] = e 108 e = struct.unpack(self.indexformat, d)
95 self.map[e[6]] = i 109 self.index[i] = e
110 self.map[e[-1]] = i
96 i += 1 111 i += 1
97 112
98 class lazyindex(object): 113 class lazyindex(object):
99 """a lazy version of the index array""" 114 """a lazy version of the index array"""
100 def __init__(self, parser): 115 def __init__(self, parser):
106 pos += len(self.p.index) 121 pos += len(self.p.index)
107 self.p.load(pos) 122 self.p.load(pos)
108 return self.p.index[pos] 123 return self.p.index[pos]
109 def __getitem__(self, pos): 124 def __getitem__(self, pos):
110 return self.p.index[pos] or self.load(pos) 125 return self.p.index[pos] or self.load(pos)
126 def __setitem__(self, pos, item):
127 self.p.index[pos] = item
111 def __delitem__(self, pos): 128 def __delitem__(self, pos):
112 del self.p.index[pos] 129 del self.p.index[pos]
113 def append(self, e): 130 def append(self, e):
114 self.p.index.append(e) 131 self.p.index.append(e)
115 def trunc(self, pos):
116 self.p.trunc(pos)
117 132
118 class lazymap(object): 133 class lazymap(object):
119 """a lazy version of the node map""" 134 """a lazy version of the node map"""
120 def __init__(self, parser): 135 def __init__(self, parser):
121 self.p = parser 136 self.p = parser
131 return key in self.p.map 146 return key in self.p.map
132 def __iter__(self): 147 def __iter__(self):
133 yield nullid 148 yield nullid
134 for i in xrange(self.p.l): 149 for i in xrange(self.p.l):
135 try: 150 try:
136 yield self.p.index[i][6] 151 yield self.p.index[i][-1]
137 except: 152 except:
138 self.p.load(i) 153 self.p.load(i)
139 yield self.p.index[i][6] 154 yield self.p.index[i][-1]
140 def __getitem__(self, key): 155 def __getitem__(self, key):
141 try: 156 try:
142 return self.p.map[key] 157 return self.p.map[key]
143 except KeyError: 158 except KeyError:
144 try: 159 try:
176 Both pieces of the revlog are written to in an append-only 191 Both pieces of the revlog are written to in an append-only
177 fashion, which means we never need to rewrite a file to insert or 192 fashion, which means we never need to rewrite a file to insert or
178 remove data, and can use some simple techniques to avoid the need 193 remove data, and can use some simple techniques to avoid the need
179 for locking while reading. 194 for locking while reading.
180 """ 195 """
181 def __init__(self, opener, indexfile, datafile): 196 def __init__(self, opener, indexfile, datafile, defversion=0):
182 """ 197 """
183 create a revlog object 198 create a revlog object
184 199
185 opener is a function that abstracts the file opening operation 200 opener is a function that abstracts the file opening operation
186 and can be used to implement COW semantics or the like. 201 and can be used to implement COW semantics or the like.
190 self.opener = opener 205 self.opener = opener
191 206
192 self.indexstat = None 207 self.indexstat = None
193 self.cache = None 208 self.cache = None
194 self.chunkcache = None 209 self.chunkcache = None
210 self.defversion = defversion
195 self.load() 211 self.load()
196 212
197 def load(self): 213 def load(self):
214 v = self.defversion
198 try: 215 try:
199 f = self.opener(self.indexfile) 216 f = self.opener(self.indexfile)
217 i = f.read()
200 except IOError, inst: 218 except IOError, inst:
201 if inst.errno != errno.ENOENT: 219 if inst.errno != errno.ENOENT:
202 raise 220 raise
203 i = "" 221 i = ""
204 else: 222 else:
211 if (oldst and st.st_dev == oldst.st_dev 229 if (oldst and st.st_dev == oldst.st_dev
212 and st.st_ino == oldst.st_ino 230 and st.st_ino == oldst.st_ino
213 and st.st_mtime == oldst.st_mtime 231 and st.st_mtime == oldst.st_mtime
214 and st.st_ctime == oldst.st_ctime): 232 and st.st_ctime == oldst.st_ctime):
215 return 233 return
216 self.indexstat = st 234 self.indexstat = st
217 i = f.read() 235 if len(i) > 0:
218 236 v = struct.unpack(versionformat, i[:4])[0]
219 if i and i[:4] != "\0\0\0\0": 237 if v != 0:
220 raise RevlogError(_("incompatible revlog signature on %s") % 238 flags = v & ~0xFFFF
221 self.indexfile) 239 fmt = v & 0xFFFF
222 240 if fmt != REVLOGNG or (flags & ~(REVLOGNGINLINEDATA)):
223 if len(i) > 10000: 241 raise RevlogError(
224 # big index, let's parse it on demand 242 _("unknown version format %d or flags %x on %s") %
225 parser = lazyparser(i, self) 243 (v, flags, self.indexfile))
226 self.index = lazyindex(parser) 244 self.version = v
227 self.nodemap = lazymap(parser) 245 if v == 0:
228 else: 246 self.indexformat = indexformatv0
229 s = struct.calcsize(indexformat) 247 else:
230 l = len(i) / s 248 self.indexformat = indexformatng
231 self.index = [None] * l 249
232 m = [None] * l 250 if i:
233 251 if st and st.st_size > 10000:
234 n = 0 252 # big index, let's parse it on demand
235 for f in xrange(0, l * s, s): 253 parser = lazyparser(i, self, self.indexformat)
236 # offset, size, base, linkrev, p1, p2, nodeid 254 self.index = lazyindex(parser)
237 e = struct.unpack(indexformat, i[f:f + s]) 255 self.nodemap = lazymap(parser)
238 m[n] = (e[6], n) 256 else:
239 self.index[n] = e 257 self.parseindex(i)
240 n += 1 258 if self.version != 0:
241 259 e = list(self.index[0])
242 self.nodemap = dict(m) 260 type = self.ngtype(e[0])
243 self.nodemap[nullid] = -1 261 e[0] = self.offset_type(0, type)
262 self.index[0] = e
263 else:
264 self.nodemap = { nullid: -1}
265 self.index = []
266
267
268 def parseindex(self, data):
269 s = struct.calcsize(self.indexformat)
270 l = len(data)
271 self.index = []
272 self.nodemap = {nullid: -1}
273 off = 0
274 n = 0
275 while off < l:
276 e = struct.unpack(self.indexformat, data[off:off + s])
277 self.index.append(e)
278 self.nodemap[e[-1]] = n
279 n += 1
280 off += s
281
282 def ngoffset(self, q):
283 if q & 0xFFFF:
284 raise RevlogError(_('%s: incompatible revision flag %x') %
285 (self.indexfile, type))
286 return long(q >> 16)
287
288 def ngtype(self, q):
289 return int(q & 0xFFFF)
290
291 def offset_type(self, offset, type):
292 return long(long(offset) << 16 | type)
293
294 def loadindexmap(self):
295 """loads both the map and the index from the lazy parser"""
296 if isinstance(self.index, lazyindex):
297 p = self.index.p
298 p.load()
244 299
245 def tip(self): return self.node(len(self.index) - 1) 300 def tip(self): return self.node(len(self.index) - 1)
246 def count(self): return len(self.index) 301 def count(self): return len(self.index)
247 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6] 302 def node(self, rev):
303 return (rev < 0) and nullid or self.index[rev][-1]
248 def rev(self, node): 304 def rev(self, node):
249 try: 305 try:
250 return self.nodemap[node] 306 return self.nodemap[node]
251 except KeyError: 307 except KeyError:
252 raise RevlogError(_('%s: no node %s') % (self.indexfile, hex(node))) 308 raise RevlogError(_('%s: no node %s') % (self.indexfile, hex(node)))
253 def linkrev(self, node): return self.index[self.rev(node)][3] 309 def linkrev(self, node): return self.index[self.rev(node)][-4]
254 def parents(self, node): 310 def parents(self, node):
255 if node == nullid: return (nullid, nullid) 311 if node == nullid: return (nullid, nullid)
256 return self.index[self.rev(node)][4:6] 312 r = self.rev(node)
257 313 d = self.index[r][-3:-1]
258 def start(self, rev): return (rev < 0) and -1 or self.index[rev][0] 314 if self.version == 0:
315 return d
316 return [ self.node(x) for x in d ]
317 def start(self, rev):
318 if rev < 0:
319 return -1
320 if self.version != 0:
321 return self.ngoffset(self.index[rev][0])
322 return self.index[rev][0]
323 def end(self, rev): return self.start(rev) + self.length(rev)
324
259 def length(self, rev): 325 def length(self, rev):
260 if rev < 0: 326 if rev < 0:
261 return 0 327 return 0
262 else: 328 else:
263 return self.index[rev][1] 329 return self.index[rev][1]
264 def end(self, rev): return self.start(rev) + self.length(rev) 330 def base(self, rev): return (rev < 0) and rev or self.index[rev][-5]
265 def base(self, rev): return (rev < 0) and rev or self.index[rev][2]
266 331
267 def reachable(self, rev, stop=None): 332 def reachable(self, rev, stop=None):
268 reachable = {} 333 reachable = {}
269 visit = [rev] 334 visit = [rev]
270 reachable[rev] = 1 335 reachable[rev] = 1
499 564
500 def patches(self, t, pl): 565 def patches(self, t, pl):
501 """apply a list of patches to a string""" 566 """apply a list of patches to a string"""
502 return mdiff.patches(t, pl) 567 return mdiff.patches(t, pl)
503 568
504 def chunk(self, rev): 569 def chunk(self, rev, df=None, cachelen=4096):
505 start, length = self.start(rev), self.length(rev) 570 start, length = self.start(rev), self.length(rev)
506 end = start + length 571 end = start + length
507 572 def loadcache(df):
508 def loadcache(): 573 cache_length = max(cachelen, length) # 4k
509 cache_length = max(4096 * 1024, length) # 4Mo 574 if not df:
510 df = self.opener(self.datafile) 575 df = self.opener(self.datafile)
511 df.seek(start) 576 df.seek(start)
512 self.chunkcache = (start, df.read(cache_length)) 577 self.chunkcache = (start, df.read(cache_length))
513 578
514 if not self.chunkcache: 579 if not self.chunkcache:
515 loadcache() 580 loadcache(df)
516 581
517 cache_start = self.chunkcache[0] 582 cache_start = self.chunkcache[0]
518 cache_end = cache_start + len(self.chunkcache[1]) 583 cache_end = cache_start + len(self.chunkcache[1])
519 if start >= cache_start and end <= cache_end: 584 if start >= cache_start and end <= cache_end:
520 # it is cached 585 # it is cached
521 offset = start - cache_start 586 offset = start - cache_start
522 else: 587 else:
523 loadcache() 588 loadcache(df)
524 offset = 0 589 offset = 0
525 590
526 #def checkchunk(): 591 #def checkchunk():
527 # df = self.opener(self.datafile) 592 # df = self.opener(self.datafile)
528 # df.seek(start) 593 # df.seek(start)
553 # look up what we need to read 618 # look up what we need to read
554 text = None 619 text = None
555 rev = self.rev(node) 620 rev = self.rev(node)
556 base = self.base(rev) 621 base = self.base(rev)
557 622
623 df = self.opener(self.datafile)
624
558 # do we have useful data cached? 625 # do we have useful data cached?
559 if self.cache and self.cache[1] >= base and self.cache[1] < rev: 626 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
560 base = self.cache[1] 627 base = self.cache[1]
561 text = self.cache[2] 628 text = self.cache[2]
562 else: 629 else:
563 text = self.chunk(base) 630 text = self.chunk(base, df=df)
564 631
565 bins = [] 632 bins = []
566 for r in xrange(base + 1, rev + 1): 633 for r in xrange(base + 1, rev + 1):
567 bins.append(self.chunk(r)) 634 bins.append(self.chunk(r, df=df))
568 635
569 text = self.patches(text, bins) 636 text = self.patches(text, bins)
570 637
571 p1, p2 = self.parents(node) 638 p1, p2 = self.parents(node)
572 if node != hash(text, p1, p2): 639 if node != hash(text, p1, p2):
619 686
620 offset = 0 687 offset = 0
621 if t >= 0: 688 if t >= 0:
622 offset = self.end(t) 689 offset = self.end(t)
623 690
624 e = (offset, l, base, link, p1, p2, node) 691 if self.version == 0:
692 e = (offset, l, base, link, p1, p2, node)
693 else:
694 e = (self.offset_type(offset, 0), l, len(text),
695 base, link, self.rev(p1), self.rev(p2), node)
625 696
626 self.index.append(e) 697 self.index.append(e)
627 self.nodemap[node] = n 698 self.nodemap[node] = n
628 entry = struct.pack(indexformat, *e) 699 entry = struct.pack(self.indexformat, *e)
629 700
630 transaction.add(self.datafile, e[0]) 701 transaction.add(self.datafile, offset)
702 transaction.add(self.indexfile, n * len(entry))
631 f = self.opener(self.datafile, "a") 703 f = self.opener(self.datafile, "a")
632 if data[0]: 704 if data[0]:
633 f.write(data[0]) 705 f.write(data[0])
634 f.write(data[1]) 706 f.write(data[1])
635 transaction.add(self.indexfile, n * len(entry)) 707 f = self.opener(self.indexfile, "a")
636 self.opener(self.indexfile, "a").write(entry) 708
709 if len(self.index) == 1 and self.version != 0:
710 l = struct.pack(versionformat, self.version)
711 f.write(l)
712 entry = entry[4:]
713
714 f.write(entry)
637 715
638 self.cache = (node, n, text) 716 self.cache = (node, n, text)
639 return node 717 return node
640 718
641 def ancestor(self, a, b): 719 def ancestor(self, a, b):
746 node = None 824 node = None
747 825
748 base = prev = -1 826 base = prev = -1
749 start = end = measure = 0 827 start = end = measure = 0
750 if r: 828 if r:
751 base = self.base(t)
752 start = self.start(base)
753 end = self.end(t) 829 end = self.end(t)
754 measure = self.length(base) 830
755 prev = self.tip() 831 ifh = self.opener(self.indexfile, "a+")
756 832 transaction.add(self.indexfile, ifh.tell())
757 transaction.add(self.datafile, end) 833 transaction.add(self.datafile, end)
758 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
759 dfh = self.opener(self.datafile, "a") 834 dfh = self.opener(self.datafile, "a")
760 ifh = self.opener(self.indexfile, "a")
761 835
762 # loop through our set of deltas 836 # loop through our set of deltas
763 chain = None 837 chain = None
764 for chunk in revs: 838 for chunk in revs:
765 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80]) 839 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
792 tempd = compress(delta) 866 tempd = compress(delta)
793 cdelta = tempd[0] + tempd[1] 867 cdelta = tempd[0] + tempd[1]
794 868
795 if chain != prev or (end - start + len(cdelta)) > measure * 2: 869 if chain != prev or (end - start + len(cdelta)) > measure * 2:
796 # flush our writes here so we can read it in revision 870 # flush our writes here so we can read it in revision
797 dfh.flush() 871 if dfh:
872 dfh.flush()
798 ifh.flush() 873 ifh.flush()
799 text = self.revision(chain) 874 text = self.revision(chain)
800 text = self.patches(text, [delta]) 875 text = self.patches(text, [delta])
801 chk = self.addrevision(text, transaction, link, p1, p2) 876 chk = self.addrevision(text, transaction, link, p1, p2)
802 if chk != node: 877 if chk != node:
803 raise RevlogError(_("consistency error adding group")) 878 raise RevlogError(_("consistency error adding group"))
804 measure = len(text) 879 measure = len(text)
805 else: 880 else:
806 e = (end, len(cdelta), base, link, p1, p2, node) 881 if self.version == 0:
882 e = (end, len(cdelta), base, link, p1, p2, node)
883 else:
884 e = (self.offset_type(end, 0), len(cdelta), -1, base,
885 link, self.rev(p1), self.rev(p2), node)
807 self.index.append(e) 886 self.index.append(e)
808 self.nodemap[node] = r 887 self.nodemap[node] = r
809 dfh.write(cdelta) 888 dfh.write(cdelta)
810 ifh.write(struct.pack(indexformat, *e)) 889 ifh.write(struct.pack(self.indexformat, *e))
811 890
812 t, r, chain, prev = r, r + 1, node, node 891 t, r, chain, prev = r, r + 1, node, node
813 base = self.base(t) 892 base = self.base(t)
814 start = self.start(base) 893 start = self.start(base)
815 end = self.end(t) 894 end = self.end(t)
816 895
817 dfh.close()
818 ifh.close()
819 if node is None: 896 if node is None:
820 raise RevlogError(_("group to be added is empty")) 897 raise RevlogError(_("group to be added is empty"))
821 return node 898 return node
822 899
823 def strip(self, rev, minlink): 900 def strip(self, rev, minlink):
824 if self.count() == 0 or rev >= self.count(): 901 if self.count() == 0 or rev >= self.count():
825 return 902 return
903
904 if isinstance(self.index, lazyindex):
905 self.loadindexmap()
826 906
827 # When stripping away a revision, we need to make sure it 907 # When stripping away a revision, we need to make sure it
828 # does not actually belong to an older changeset. 908 # does not actually belong to an older changeset.
829 # The minlink parameter defines the oldest revision 909 # The minlink parameter defines the oldest revision
830 # we're allowed to strip away. 910 # we're allowed to strip away.
831 while minlink > self.index[rev][3]: 911 while minlink > self.index[rev][-4]:
832 rev += 1 912 rev += 1
833 if rev >= self.count(): 913 if rev >= self.count():
834 return 914 return
835 915
836 # first truncate the files on disk 916 # first truncate the files on disk
837 end = self.start(rev) 917 end = self.start(rev)
838 self.opener(self.datafile, "a").truncate(end) 918 df = self.opener(self.datafile, "a")
839 end = rev * struct.calcsize(indexformat) 919 df.truncate(end)
840 self.opener(self.indexfile, "a").truncate(end) 920 end = rev * struct.calcsize(self.indexformat)
921
922 indexf = self.opener(self.indexfile, "a")
923 indexf.truncate(end)
841 924
842 # then reset internal state in memory to forget those revisions 925 # then reset internal state in memory to forget those revisions
843 self.cache = None 926 self.cache = None
844 self.chunkcache = None 927 self.chunkcache = None
845 for p in self.index[rev:]: 928 for x in xrange(rev, self.count()):
846 del self.nodemap[p[6]] 929 del self.nodemap[self.node(x)]
930
847 del self.index[rev:] 931 del self.index[rev:]
848
849 # truncating the lazyindex also truncates the lazymap.
850 if isinstance(self.index, lazyindex):
851 self.index.trunc(end)
852
853 932
854 def checksize(self): 933 def checksize(self):
855 expected = 0 934 expected = 0
856 if self.count(): 935 if self.count():
857 expected = self.end(self.count() - 1) 936 expected = self.end(self.count() - 1)
868 947
869 try: 948 try:
870 f = self.opener(self.indexfile) 949 f = self.opener(self.indexfile)
871 f.seek(0, 2) 950 f.seek(0, 2)
872 actual = f.tell() 951 actual = f.tell()
873 s = struct.calcsize(indexformat) 952 s = struct.calcsize(self.indexformat)
874 i = actual / s 953 i = actual / s
875 di = actual - (i * s) 954 di = actual - (i * s)
876 except IOError, inst: 955 except IOError, inst:
877 if inst.errno != errno.ENOENT: 956 if inst.errno != errno.ENOENT:
878 raise 957 raise