diff mercurial/cmdutil.py @ 4135:6cb6cfe43c5d

Avoid some false positives for addremove -s The original code uses the similary score 1 - len(diff(after, before)) / len(after) The diff can at most be the size of the 'before' file, so any small 'before' file would be considered very similar. Removing an empty file would cause all files added in the same revision to be considered copies of the removed file. This changes the metric to bytes_overlap(before, after) / len(before + after) i.e. the actual percentage of bytes shared between the two files.
author Erling Ellingsen <erlingalf@gmail.com>
date Sun, 18 Feb 2007 20:39:25 +0100
parents 431f3c1d3a37
children eb5d4fec1487
line wrap: on
line diff
--- a/mercurial/cmdutil.py
+++ b/mercurial/cmdutil.py
@@ -7,7 +7,7 @@
 
 from node import *
 from i18n import _
-import os, sys, mdiff, util, templater, patch
+import os, sys, mdiff, bdiff, util, templater, patch
 
 revrangesep = ':'
 
@@ -146,20 +146,29 @@ def walk(repo, pats=[], opts={}, node=No
         yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact
 
 def findrenames(repo, added=None, removed=None, threshold=0.5):
+    '''find renamed files -- yields (before, after, score) tuples'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
     ctx = repo.changectx()
     for a in added:
         aa = repo.wread(a)
-        bestscore, bestname = None, None
+        bestname, bestscore = None, threshold
         for r in removed:
             rr = ctx.filectx(r).data()
-            delta = mdiff.textdiff(aa, rr)
-            if len(delta) < len(aa):
-                myscore = 1.0 - (float(len(delta)) / len(aa))
-                if bestscore is None or myscore > bestscore:
-                    bestscore, bestname = myscore, r
-        if bestname and bestscore >= threshold:
+
+            # bdiff.blocks() returns blocks of matching lines
+            # count the number of bytes in each
+            equal = 0
+            alines = mdiff.splitnewlines(aa)
+            matches = bdiff.blocks(aa, rr)
+            for x1,x2,y1,y2 in matches:
+                for line in alines[x1:x2]:
+                    equal += len(line)
+
+            myscore = equal*2.0 / (len(aa)+len(rr))
+            if myscore >= bestscore:
+                bestname, bestscore = r, myscore
+        if bestname:
             yield bestname, a, bestscore
 
 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None,