comparison mercurial/dirstate.py @ 1183:d9e85a75dbda

Optimize dirstate walking This generally cuts the time for hg status/diff in half, from 2s down to 1s. The main parts I'm trying to optimize are: 1) os.walk stats every file. dirstate.changes then stats every file again. 2) os.walk yields every file and subdir to dirstate.traverse who yields every file and everything in the dirstate map. dirstate.walk then filters this mass and yields every file to the caller. There should be fewer steps in here, and fewer duplicate strings yielded. 3) dirstate.walk runs util.unique on the results from dirstate.traverse, even though it is also passing things through dirstate.seen to look for duplicates. I've turned os.walk into something hg specific that takes all the dirstate ignore and matching rules into account. The new function also takes an function arg (statmatch()) the caller supplies to help filter out files it doesn't care about. dirstate.changes uses this to update state for each file, avoiding the second stat call. dirstate.walk is changed to turn the match function it is passed into a statmatch function. The only real difference is that a statmatch function takes the stat data as a second parameter. It now calls dirstate.walkhelper, who requires a statmatch function to be passed. This fails test-walk, but right now I think this is from a sorting error fixed by this patch. Index: crew/mercurial/dirstate.py ===================================================================
author mason@suse.com
date Thu, 01 Sep 2005 07:34:53 -0700
parents 30ab5b8ee8ec
children cc61d366bc3b
comparison
equal deleted inserted replaced
1182:24d553b598e8 1183:d9e85a75dbda
20 self.ui = ui 20 self.ui = ui
21 self.map = None 21 self.map = None
22 self.pl = None 22 self.pl = None
23 self.copies = {} 23 self.copies = {}
24 self.ignorefunc = None 24 self.ignorefunc = None
25 self.blockignore = False
25 26
26 def wjoin(self, f): 27 def wjoin(self, f):
27 return os.path.join(self.root, f) 28 return os.path.join(self.root, f)
28 29
29 def getcwd(self): 30 def getcwd(self):
30 cwd = os.getcwd() 31 cwd = os.getcwd()
31 if cwd == self.root: return '' 32 if cwd == self.root: return ''
32 return cwd[len(self.root) + 1:] 33 return cwd[len(self.root) + 1:]
33 34
34 def ignore(self, f): 35 def ignore(self, f):
36 if self.blockignore:
37 return False
35 if not self.ignorefunc: 38 if not self.ignorefunc:
36 bigpat = [] 39 bigpat = []
37 try: 40 try:
38 l = file(self.wjoin(".hgignore")) 41 l = file(self.wjoin(".hgignore"))
39 for pat in l: 42 for pat in l:
212 if not dc: 215 if not dc:
213 dc = self.map.copy() 216 dc = self.map.copy()
214 elif not dc: 217 elif not dc:
215 dc = self.filterfiles(files) 218 dc = self.filterfiles(files)
216 219
220 def statmatch(file, stat):
221 if file not in dc and self.ignore(file):
222 return False
223 return match(file)
224 return self.walkhelper(files=files, statmatch=statmatch, dc=dc)
225
226 # walk recursively through the directory tree, finding all files
227 # matched by the statmatch function
228 #
229 # results are yielded in a tuple (src, filename), where src is one of:
230 # 'f' the file was found in the directory tree
231 # 'm' the file was only in the dirstate and not in the tree
232 #
233 # dc is an optional arg for the current dirstate. dc is not modified
234 # directly by this function, but might be modified by your statmatch call.
235 #
236 def walkhelper(self, files, statmatch, dc):
237 # recursion free walker, faster than os.walk.
238 def findfiles(s):
239 retfiles = []
240 work = [s]
241 while work:
242 top = work.pop()
243 names = os.listdir(top)
244 names.sort()
245 # nd is the top of the repository dir tree
246 nd = util.normpath(top[len(self.root) + 1:])
247 if nd == '.': nd = ''
248 for f in names:
249 np = os.path.join(nd, f)
250 if seen(np):
251 continue
252 p = os.path.join(top, f)
253 st = os.stat(p)
254 if stat.S_ISDIR(st.st_mode):
255 ds = os.path.join(nd, f +'/')
256 if statmatch(ds, st):
257 work.append(p)
258 else:
259 if statmatch(np, st):
260 yield np
261
217 known = {'.hg': 1} 262 known = {'.hg': 1}
218 def seen(fn): 263 def seen(fn):
219 if fn in known: return True 264 if fn in known: return True
220 known[fn] = 1 265 known[fn] = 1
221 def traverse(): 266
222 for ff in util.unique(files): 267 # step one, find all files that match our criteria
223 f = os.path.join(self.root, ff) 268 files.sort()
224 try: 269 for ff in util.unique(files):
225 st = os.stat(f) 270 f = os.path.join(self.root, ff)
226 except OSError, inst: 271 try:
227 if ff not in dc: self.ui.warn('%s: %s\n' % ( 272 st = os.stat(f)
228 util.pathto(self.getcwd(), ff), 273 except OSError, inst:
229 inst.strerror)) 274 if ff not in dc: self.ui.warn('%s: %s\n' % (
275 util.pathto(self.getcwd(), ff),
276 inst.strerror))
277 continue
278 if stat.S_ISDIR(st.st_mode):
279 sorted = [ x for x in findfiles(f) ]
280 sorted.sort()
281 for fl in sorted:
282 yield 'f', fl
283 elif stat.S_ISREG(st.st_mode):
284 ff = util.normpath(ff)
285 if seen(ff):
230 continue 286 continue
231 if stat.S_ISDIR(st.st_mode): 287 found = False
232 for dir, subdirs, fl in os.walk(f): 288 self.blockignore = True
233 d = dir[len(self.root) + 1:] 289 if statmatch(ff, st):
234 nd = util.normpath(d) 290 found = True
235 if nd == '.': nd = '' 291 self.blockignore = False
236 if seen(nd): 292 if found:
237 subdirs[:] = []
238 continue
239 for sd in subdirs:
240 ds = os.path.join(nd, sd +'/')
241 if self.ignore(ds) or not match(ds):
242 subdirs.remove(sd)
243 subdirs.sort()
244 fl.sort()
245 for fn in fl:
246 fn = util.pconvert(os.path.join(d, fn))
247 yield 'f', fn
248 elif stat.S_ISREG(st.st_mode):
249 yield 'f', ff 293 yield 'f', ff
250 else: 294 else:
251 kind = 'unknown' 295 kind = 'unknown'
252 if stat.S_ISCHR(st.st_mode): kind = 'character device' 296 if stat.S_ISCHR(st.st_mode): kind = 'character device'
253 elif stat.S_ISBLK(st.st_mode): kind = 'block device' 297 elif stat.S_ISBLK(st.st_mode): kind = 'block device'
254 elif stat.S_ISFIFO(st.st_mode): kind = 'fifo' 298 elif stat.S_ISFIFO(st.st_mode): kind = 'fifo'
255 elif stat.S_ISLNK(st.st_mode): kind = 'symbolic link' 299 elif stat.S_ISLNK(st.st_mode): kind = 'symbolic link'
256 elif stat.S_ISSOCK(st.st_mode): kind = 'socket' 300 elif stat.S_ISSOCK(st.st_mode): kind = 'socket'
257 self.ui.warn('%s: unsupported file type (type is %s)\n' % ( 301 self.ui.warn('%s: unsupported file type (type is %s)\n' % (
258 util.pathto(self.getcwd(), ff), 302 util.pathto(self.getcwd(), ff),
259 kind)) 303 kind))
260 304
261 ks = dc.keys() 305 # step two run through anything left in the dc hash and yield
262 ks.sort() 306 # if we haven't already seen it
263 for k in ks: 307 ks = dc.keys()
308 ks.sort()
309 for k in ks:
310 if not seen(k) and (statmatch(k, None)):
264 yield 'm', k 311 yield 'm', k
265 312
266 # yield only files that match: all in dirstate, others only if
267 # not in .hgignore
268
269 for src, fn in util.unique(traverse()):
270 fn = util.normpath(fn)
271 if seen(fn): continue
272 if fn not in dc and self.ignore(fn):
273 continue
274 if match(fn):
275 yield src, fn
276
277 def changes(self, files=None, match=util.always): 313 def changes(self, files=None, match=util.always):
278 self.read() 314 self.read()
279 if not files: 315 if not files:
316 files = [self.root]
280 dc = self.map.copy() 317 dc = self.map.copy()
281 else: 318 else:
282 dc = self.filterfiles(files) 319 dc = self.filterfiles(files)
283 lookup, modified, added, unknown = [], [], [], [] 320 lookup, modified, added, unknown = [], [], [], []
284 removed, deleted = [], [] 321 removed, deleted = [], []
285 322
286 for src, fn in self.walk(files, match, dc=dc): 323 # statmatch function to eliminate entries from the dirstate copy
287 try: 324 # and put files into the appropriate array. This gets passed
288 s = os.stat(os.path.join(self.root, fn)) 325 # to the walking code
289 except OSError: 326 def statmatch(fn, s):
290 continue 327 def checkappend(l, fn):
328 if match is util.always or match(fn):
329 l.append(fn)
330
331 if not s or stat.S_ISDIR(s.st_mode):
332 return self.ignore(fn) and False or match(fn)
333
291 if not stat.S_ISREG(s.st_mode): 334 if not stat.S_ISREG(s.st_mode):
292 continue 335 return False
293 c = dc.get(fn) 336 c = dc.pop(fn, None)
294 if c: 337 if c:
295 del dc[fn] 338 type, mode, size, time = c
296 if c[0] == 'm': 339 # check the common case first
297 modified.append(fn) 340 if type == 'n':
298 elif c[0] == 'a': 341 if size != s.st_size or (mode ^ s.st_mode) & 0100:
299 added.append(fn) 342 checkappend(modified, fn)
300 elif c[0] == 'r': 343 elif time != s.st_mtime:
344 checkappend(lookup, fn)
345 elif type == 'm':
346 checkappend(modified, fn)
347 elif type == 'a':
348 checkappend(added, fn)
349 elif type == 'r':
350 checkappend(unknown, fn)
351 else:
352 if not self.ignore(fn) and match(fn):
301 unknown.append(fn) 353 unknown.append(fn)
302 elif c[2] != s.st_size or (c[1] ^ s.st_mode) & 0100: 354 # return false because we've already handled all cases above.
303 modified.append(fn) 355 # there's no need for the walking code to process the file
304 elif c[3] != s.st_mtime: 356 # any further.
305 lookup.append(fn) 357 return False
306 else: 358
307 unknown.append(fn) 359 # because our statmatch always returns false, self.walk will only
308 360 # return files in the dirstate map that are not present in the FS.
361 # But, we still need to iterate through the results to force the
362 # walk to complete
363 for src, fn in self.walkhelper(files, statmatch, dc):
364 pass
365
366 # anything left in dc didn't exist in the filesystem
309 for fn, c in [(fn, c) for fn, c in dc.items() if match(fn)]: 367 for fn, c in [(fn, c) for fn, c in dc.items() if match(fn)]:
310 if c[0] == 'r': 368 if c[0] == 'r':
311 removed.append(fn) 369 removed.append(fn)
312 else: 370 else:
313 deleted.append(fn) 371 deleted.append(fn)