comparison mercurial/cmdutil.py @ 4135:6cb6cfe43c5d

Avoid some false positives for addremove -s

The original code uses the similarity score

    1 - len(diff(after, before)) / len(after)

The diff can at most be the size of the 'before' file, so any small 'before' file would be considered very similar. Removing an empty file would cause all files added in the same revision to be considered copies of the removed file.

This changes the metric to

    bytes_overlap(before, after) / len(before + after)

i.e. the actual percentage of bytes shared between the two files.
author Erling Ellingsen <erlingalf@gmail.com>
date Sun, 18 Feb 2007 20:39:25 +0100
parents 431f3c1d3a37
children eb5d4fec1487
comparison
equal deleted inserted replaced
4134:9dc64c8414ca 4135:6cb6cfe43c5d
5 # This software may be used and distributed according to the terms 5 # This software may be used and distributed according to the terms
6 # of the GNU General Public License, incorporated herein by reference. 6 # of the GNU General Public License, incorporated herein by reference.
7 7
8 from node import * 8 from node import *
9 from i18n import _ 9 from i18n import _
10 import os, sys, mdiff, util, templater, patch 10 import os, sys, mdiff, bdiff, util, templater, patch
11 11
12 revrangesep = ':' 12 revrangesep = ':'
13 13
14 def revpair(repo, revs): 14 def revpair(repo, revs):
15 '''return pair of nodes, given list of revisions. second item can 15 '''return pair of nodes, given list of revisions. second item can
144 for src, fn in repo.walk(node=node, files=files, match=matchfn, 144 for src, fn in repo.walk(node=node, files=files, match=matchfn,
145 badmatch=badmatch): 145 badmatch=badmatch):
146 yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact 146 yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact
147 147
148 def findrenames(repo, added=None, removed=None, threshold=0.5): 148 def findrenames(repo, added=None, removed=None, threshold=0.5):
149 '''find renamed files -- yields (before, after, score) tuples'''
149 if added is None or removed is None: 150 if added is None or removed is None:
150 added, removed = repo.status()[1:3] 151 added, removed = repo.status()[1:3]
151 ctx = repo.changectx() 152 ctx = repo.changectx()
152 for a in added: 153 for a in added:
153 aa = repo.wread(a) 154 aa = repo.wread(a)
154 bestscore, bestname = None, None 155 bestname, bestscore = None, threshold
155 for r in removed: 156 for r in removed:
156 rr = ctx.filectx(r).data() 157 rr = ctx.filectx(r).data()
157 delta = mdiff.textdiff(aa, rr) 158
158 if len(delta) < len(aa): 159 # bdiff.blocks() returns blocks of matching lines
159 myscore = 1.0 - (float(len(delta)) / len(aa)) 160 # count the number of bytes in each
160 if bestscore is None or myscore > bestscore: 161 equal = 0
161 bestscore, bestname = myscore, r 162 alines = mdiff.splitnewlines(aa)
162 if bestname and bestscore >= threshold: 163 matches = bdiff.blocks(aa, rr)
164 for x1,x2,y1,y2 in matches:
165 for line in alines[x1:x2]:
166 equal += len(line)
167
168 myscore = equal*2.0 / (len(aa)+len(rr))
169 if myscore >= bestscore:
170 bestname, bestscore = r, myscore
171 if bestname:
163 yield bestname, a, bestscore 172 yield bestname, a, bestscore
164 173
165 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None, 174 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None,
166 similarity=None): 175 similarity=None):
167 if dry_run is None: 176 if dry_run is None: