mercurial/bdiff.c
changeset 400 8b067bde6679
child 411 9e9f7ab43ce2
equal deleted inserted replaced
399:f060d728fe54 400:8b067bde6679
       
     1 /*
       
     2  bdiff.c - efficient binary diff extension for Mercurial
       
     3 
       
     4  Copyright 2005 Matt Mackall <mpm@selenic.com>
       
     5 
       
     6  This software may be used and distributed according to the terms of
       
     7  the GNU General Public License, incorporated herein by reference.
       
     8 
       
     9  Based roughly on Python difflib
       
    10 */
       
    11 
       
    12 #include <Python.h>
       
    13 #include <stdlib.h>
       
    14 #include <string.h>
       
    15 #include <netinet/in.h>
       
    16 #include <sys/types.h>
       
    17 
       
    18 struct line {
       
    19 	int h, len, n;
       
    20 	const char *l;
       
    21 };
       
    22 
       
    23 struct hunk {
       
    24 	int a1, a2, b1, b2;
       
    25 };
       
    26 
       
    27 struct hunklist {
       
    28 	struct hunk *base, *head;
       
    29 };
       
    30 
       
    31 static inline uint32_t rol32(uint32_t word, unsigned int shift)
       
    32 {
       
    33         return (word << shift) | (word >> (32 - shift));
       
    34 }
       
    35 
       
    36 int splitlines(const char *a, int len, struct line **lr)
       
    37 {
       
    38 	int h, i;
       
    39 	const char *p, *b = a;
       
    40 	struct line *l;
       
    41 
       
    42 	/* count the lines */
       
    43 	i = 1; /* extra line for sentinel */
       
    44 	for (p = a; p < a + len; p++)
       
    45 		if (*p == '\n' || p == a + len - 1)
       
    46 			i++;
       
    47 
       
    48 	*lr = l = malloc(sizeof(struct line) * i);
       
    49 	if (!l)
       
    50 		return -1;
       
    51 
       
    52 	/* build the line array and calculate hashes */
       
    53 	h = 0;
       
    54 	for (p = a; p < a + len; p++) {
       
    55 		h = *p + rol32(h, 7); /* a simple hash from GNU diff */
       
    56 		if (*p == '\n' || p == a + len - 1) {
       
    57 			l->len = p - b + 1;
       
    58 			l->h = h;
       
    59 			l->l = b;
       
    60 			l->n = -1;
       
    61 			l++;
       
    62 			b = p + 1;
       
    63 			h = 0;
       
    64 		}
       
    65 	}
       
    66 
       
    67 	/* set up a sentinel */
       
    68 	l->h = l->len = 0;
       
    69 	l->l = a + len;
       
    70 	return i - 1;
       
    71 }
       
    72 
       
    73 int inline cmp(struct line *a, struct line *b)
       
    74 {
       
    75 	return a->len != b->len || memcmp(a->l, b->l, a->len);
       
    76 }
       
    77 
       
    78 static int equatelines(struct line *a, int an, struct line *b, int bn)
       
    79 {
       
    80 	int i, j, buckets = 1, t, *h, *l;
       
    81 
       
    82 	/* build a hash table of the next highest power of 2 */
       
    83 	while (buckets < bn + 1)
       
    84 		buckets *= 2;
       
    85 
       
    86 	h = malloc(buckets * sizeof(int));
       
    87 	l = calloc(buckets, sizeof(int));
       
    88 	buckets = buckets - 1;
       
    89 	if (!h || !l) {
       
    90 		free(h);
       
    91 		return 0;
       
    92 	}
       
    93 
       
    94 	/* clear the hash table */
       
    95 	for (i = 0; i <= buckets; i++)
       
    96 		h[i] = -1;
       
    97 
       
    98 	/* add lines to the hash table chains */
       
    99 	for (i = bn - 1; i >= 0; i--) {
       
   100 		/* find the equivalence class */
       
   101 		for (j = b[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
       
   102 			if (!cmp(b + i, b + h[j]))
       
   103 				break;
       
   104 
       
   105 		/* add to the head of the equivalence class */
       
   106 		b[i].n = h[j];
       
   107 		b[i].h = j;
       
   108 		h[j] = i;
       
   109 		l[j]++; /* keep track of popularity */
       
   110 	}
       
   111 
       
   112 	/* compute popularity threshold */
       
   113 	t = (bn >= 200) ? bn / 100 : bn + 1;
       
   114 
       
   115 	/* match items in a to their equivalence class in b */
       
   116 	for (i = 0; i < an; i++) {
       
   117 		/* find the equivalence class */
       
   118 		for (j = a[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
       
   119 			if (!cmp(a + i, b + h[j]))
       
   120 				break;
       
   121 
       
   122 		a[i].h = j; /* use equivalence class for quick compare */
       
   123 		if(l[j] <= t)
       
   124 			a[i].n = h[j]; /* point to head of match list */
       
   125 		else
       
   126 			a[i].n = -1; /* too popular */
       
   127 	}
       
   128 
       
   129 	/* discard hash tables */
       
   130 	free(h);
       
   131 	free(l);
       
   132 	return 1;
       
   133 }
       
   134 
       
   135 static int longest_match(struct line *a, struct line *b, int *jpos, int *jlen,
       
   136 			 int a1, int a2, int b1, int b2, int *omi, int *omj)
       
   137 {
       
   138 	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
       
   139 
       
   140 	for (i = a1; i < a2; i++) {
       
   141 		/* skip things before the current block */
       
   142 		for (j = a[i].n; j != -1 && j < b1; j = b[j].n)
       
   143 			;
       
   144 
       
   145 		/* loop through all lines match a[i] in b */
       
   146 		for (; j != -1 && j < b2; j = b[j].n) {
       
   147 			/* does this extend an earlier match? */
       
   148 			if (i > a1 && j > b1 && jpos[j - 1] == i)
       
   149 				k = jlen[j - 1] + 1;
       
   150 			else
       
   151 				k = 1;
       
   152 			jpos[j] = i + 1;
       
   153 			jlen[j] = k;
       
   154 
       
   155 			/* best match so far? */
       
   156 			if (k > mk) {
       
   157 				mi = i;
       
   158 				mj = j;
       
   159 				mk = k;
       
   160 			}
       
   161 		}
       
   162 	}
       
   163 
       
   164 	if (mk) {
       
   165 		mi = mi - mk + 1;
       
   166 		mj = mj - mk + 1;
       
   167 	}
       
   168 
       
   169 	/* expand match to include neighboring popular lines */
       
   170 	while (mi - mb > a1 && mj - mb > b1 &&
       
   171 	       a[mi - mb - 1].h == b[mj - mb - 1].h)
       
   172 		mb++;
       
   173 	while (mi + mk < a2 && mj + mk < b2 &&
       
   174 	       a[mi + mk].h == b[mj + mk].h)
       
   175 		mk++;
       
   176 
       
   177 	*omi = mi - mb;
       
   178 	*omj = mj - mb;
       
   179 	return mk + mb;
       
   180 }
       
   181 
       
   182 static void recurse(struct line *a, struct line *b, int *jpos, int *jlen,
       
   183 		    int a1, int a2, int b1, int b2, struct hunklist *l)
       
   184 {
       
   185 	int i, j, k;
       
   186 
       
   187 	/* find the longest match in this chunk */
       
   188 	k = longest_match(a, b, jpos, jlen, a1, a2, b1, b2, &i, &j);
       
   189 	if (!k)
       
   190 		return;
       
   191 
       
   192 	/* and recurse on the remaining chunks on either side */
       
   193 	recurse(a, b, jpos, jlen, a1, i, b1, j, l);
       
   194 	l->head->a1 = i;
       
   195 	l->head->a2 = i + k;
       
   196 	l->head->b1 = j;
       
   197 	l->head->b2 = j + k;
       
   198 	l->head++;
       
   199 	recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l);
       
   200 }
       
   201 
       
   202 static PyObject *bdiff(PyObject *self, PyObject *args)
       
   203 {
       
   204 	PyObject *sa, *sb, *result = NULL;
       
   205 	struct hunklist l;
       
   206 	struct hunk *h;
       
   207 	struct line *al, *bl;
       
   208 	char encode[12], *rb;
       
   209 	int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen;
       
   210 
       
   211 	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
       
   212 		return NULL;
       
   213 
       
   214 	/* allocate and fill arrays */
       
   215 	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
       
   216 	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);
       
   217 	t = equatelines(al, an, bl, bn);
       
   218 	jpos = calloc(bn, sizeof(int));
       
   219 	jlen = calloc(bn, sizeof(int));
       
   220 	l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
       
   221 	if (!al || !bl || !jpos || !jlen || !l.base || !t)
       
   222 		goto nomem;
       
   223 
       
   224 	/* generate the matching block list */
       
   225 	recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l);
       
   226 	l.head->a1 = an;
       
   227 	l.head->b1 = bn;
       
   228 	l.head++;
       
   229 
       
   230 	/* calculate length of output */
       
   231 	for(h = l.base; h != l.head; h++) {
       
   232 		if (h->a1 != la || h->b1 != lb)
       
   233 			len += 12 + bl[h->b1].l - bl[lb].l;
       
   234 		la = h->a2;
       
   235 		lb = h->b2;
       
   236 	}
       
   237 
       
   238 	result = PyString_FromStringAndSize(NULL, len);
       
   239 	if (!result)
       
   240 		goto nomem;
       
   241 
       
   242 	/* build binary patch */
       
   243 	rb = PyString_AsString(result);
       
   244 	la = lb = 0;
       
   245 
       
   246 	for(h = l.base; h != l.head; h++) {
       
   247 		if (h->a1 != la || h->b1 != lb) {
       
   248 			len = bl[h->b1].l - bl[lb].l;
       
   249 			*(uint32_t *)(encode)     = htonl(al[la].l - al->l);
       
   250 			*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);
       
   251 			*(uint32_t *)(encode + 8) = htonl(len);
       
   252 			memcpy(rb, encode, 12);
       
   253 			memcpy(rb + 12, bl[lb].l, len);
       
   254 			rb += 12 + len;
       
   255 		}
       
   256 		la = h->a2;
       
   257 		lb = h->b2;
       
   258 	}
       
   259 
       
   260 nomem:
       
   261 	free(al);
       
   262 	free(bl);
       
   263 	free(jpos);
       
   264 	free(jlen);
       
   265 	free(l.base);
       
   266 	return result ? result : PyErr_NoMemory();
       
   267 }
       
   268 
       
   269 static char mdiff_doc[] = "Efficient binary diff.";
       
   270 
       
   271 static PyMethodDef methods[] = {
       
   272 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
       
   273 	{NULL, NULL}
       
   274 };
       
   275 
       
   276 PyMODINIT_FUNC initbdiff(void)
       
   277 {
       
   278 	Py_InitModule3("bdiff", methods, mdiff_doc);
       
   279 }