mercurial/mdiff.py
author "Wallace, Eric S" <eric.s.wallace@intel.com>
Thu, 04 Aug 2005 13:25:59 -0800
changeset 827 a61728b58dc0
parent 582 df8a5a0098d4
child 1015 22571b8d35d3
permissions -rw-r--r--
Fix array overflow bug in bdiff I ran into a bug while importing a large repository into mercurial. The diff algorithm does not allocate a big enough array of hunks for some test cases. This results in memory corruption, and possibly, as in my case, a seg fault. You should be able to reproduce this problem with any case of more than a few lines that follows this pattern: a b = = 1 1 2 2 3 4 3 5 . 4 . . 5 . . . I.e., "a" has blank lines on every other line that have been removed in "b". In this case, the number of matching hunks is equal to the number of lines in "b". This is more than ((an + bn)/4 + 2). I'm not sure what motivates this formula, but when I changed it to the smaller of an or bn (+ 1), it works. [comment added by mpm]

# mdiff.py - diff and patch routines for mercurial
#
# Copyright 2005 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

import difflib, struct, bdiff
from mpatch import *

def unidiff(a, ad, b, bd, fn, r=None):

    if not a and not b: return ""

    if a == None:
        b = b.splitlines(1)
        l1 = "--- %s\t%s\n" % ("/dev/null", ad)
        l2 = "+++ %s\t%s\n" % ("b/" + fn, bd)
        l3 = "@@ -0,0 +1,%d @@\n" % len(b)
        l = [l1, l2, l3] + ["+" + e for e in b]
    elif b == None:
        a = a.splitlines(1)
        l1 = "--- %s\t%s\n" % ("a/" + fn, ad)
        l2 = "+++ %s\t%s\n" % ("/dev/null", bd)
        l3 = "@@ -1,%d +0,0 @@\n" % len(a)
        l = [l1, l2, l3] + ["-" + e for e in a]
    else:
        a = a.splitlines(1)
        b = b.splitlines(1)
        l = list(difflib.unified_diff(a, b, "a/" + fn, "b/" + fn))
        if not l: return ""
        # difflib uses a space, rather than a tab
        l[0] = l[0][:-2] + "\t" + ad + "\n"
        l[1] = l[1][:-2] + "\t" + bd + "\n"

    for ln in xrange(len(l)):
        if l[ln][-1] != '\n':
            l[ln] += "\n\ No newline at end of file\n"

    if r:
        l.insert(0, "diff %s %s\n" %
                    (' '.join(["-r %s" % rev for rev in r]), fn))

    return "".join(l)

def sortdiff(a, b):
    la = lb = 0
    lena = len(a)
    lenb = len(b)

    while 1:
        am, bm, = la, lb

        # walk over matching lines
        while lb < lenb and la < lena and a[la] == b[lb] :
            la += 1
            lb += 1

        if la > am:
            yield (am, bm, la - am) # return a match

        # skip mismatched lines from b
        while la < lena and lb < lenb and b[lb] < a[la]:
            lb += 1

        if lb >= lenb:
            break

        # skip mismatched lines from a
        while la < lena and lb < lenb and b[lb] > a[la]:
            la += 1

        if la >= lena:
            break

    yield (lena, lenb, 0)

def diff(a, b, sorted=0):
    if not a:
        s = "".join(b)
        return s and (struct.pack(">lll", 0, 0, len(s)) + s)

    bin = []
    p = [0]
    for i in a: p.append(p[-1] + len(i))

    if sorted:
        try:
            d = sortdiff(a, b)
        except:
            raise
    else:
        d = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
    la = 0
    lb = 0
    for am, bm, size in d:
        s = "".join(b[lb:bm])
        if am > la or s:
            bin.append(struct.pack(">lll", p[la], p[am], len(s)) + s)
        la = am + size
        lb = bm + size

    return "".join(bin)

def patchtext(bin):
    pos = 0
    t = []
    while pos < len(bin):
        p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
        pos += 12
        t.append(bin[pos:pos + l])
        pos += l
    return "".join(t)

def patch(a, bin):
    return patches(a, [bin])

textdiff = bdiff.bdiff