Mercurial > hg > mercurial-crew-with-dirclash
comparison mercurial/cmdutil.py @ 4135:6cb6cfe43c5d
Avoid some false positives for addremove -s
The original code uses the similarity score
1 - len(diff(after, before)) / len(after)
The diff can at most be the size of the 'before' file, so any small
'before' file would be considered very similar. Removing an empty file
would cause all files added in the same revision to be considered
copies of the removed file.
This changes the metric to
bytes_overlap(before, after) / len(before + after)
i.e. the actual percentage of bytes shared between the two files.
author | Erling Ellingsen <erlingalf@gmail.com> |
---|---|
date | Sun, 18 Feb 2007 20:39:25 +0100 |
parents | 431f3c1d3a37 |
children | eb5d4fec1487 |
comparison
equal
deleted
inserted
replaced
4134:9dc64c8414ca | 4135:6cb6cfe43c5d |
---|---|
5 # This software may be used and distributed according to the terms | 5 # This software may be used and distributed according to the terms |
6 # of the GNU General Public License, incorporated herein by reference. | 6 # of the GNU General Public License, incorporated herein by reference. |
7 | 7 |
8 from node import * | 8 from node import * |
9 from i18n import _ | 9 from i18n import _ |
10 import os, sys, mdiff, util, templater, patch | 10 import os, sys, mdiff, bdiff, util, templater, patch |
11 | 11 |
12 revrangesep = ':' | 12 revrangesep = ':' |
13 | 13 |
14 def revpair(repo, revs): | 14 def revpair(repo, revs): |
15 '''return pair of nodes, given list of revisions. second item can | 15 '''return pair of nodes, given list of revisions. second item can |
144 for src, fn in repo.walk(node=node, files=files, match=matchfn, | 144 for src, fn in repo.walk(node=node, files=files, match=matchfn, |
145 badmatch=badmatch): | 145 badmatch=badmatch): |
146 yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact | 146 yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact |
147 | 147 |
148 def findrenames(repo, added=None, removed=None, threshold=0.5): | 148 def findrenames(repo, added=None, removed=None, threshold=0.5): |
149 '''find renamed files -- yields (before, after, score) tuples''' | |
149 if added is None or removed is None: | 150 if added is None or removed is None: |
150 added, removed = repo.status()[1:3] | 151 added, removed = repo.status()[1:3] |
151 ctx = repo.changectx() | 152 ctx = repo.changectx() |
152 for a in added: | 153 for a in added: |
153 aa = repo.wread(a) | 154 aa = repo.wread(a) |
154 bestscore, bestname = None, None | 155 bestname, bestscore = None, threshold |
155 for r in removed: | 156 for r in removed: |
156 rr = ctx.filectx(r).data() | 157 rr = ctx.filectx(r).data() |
157 delta = mdiff.textdiff(aa, rr) | 158 |
158 if len(delta) < len(aa): | 159 # bdiff.blocks() returns blocks of matching lines |
159 myscore = 1.0 - (float(len(delta)) / len(aa)) | 160 # count the number of bytes in each |
160 if bestscore is None or myscore > bestscore: | 161 equal = 0 |
161 bestscore, bestname = myscore, r | 162 alines = mdiff.splitnewlines(aa) |
162 if bestname and bestscore >= threshold: | 163 matches = bdiff.blocks(aa, rr) |
164 for x1,x2,y1,y2 in matches: | |
165 for line in alines[x1:x2]: | |
166 equal += len(line) | |
167 | |
168 myscore = equal*2.0 / (len(aa)+len(rr)) | |
169 if myscore >= bestscore: | |
170 bestname, bestscore = r, myscore | |
171 if bestname: | |
163 yield bestname, a, bestscore | 172 yield bestname, a, bestscore |
164 | 173 |
165 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None, | 174 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None, |
166 similarity=None): | 175 similarity=None): |
167 if dry_run is None: | 176 if dry_run is None: |