Mercurial > hg > mercurial-crew-with-dirclash
comparison mercurial/bdiff.c @ 400:8b067bde6679
Add a fast binary diff extension (not yet used)
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Add a fast binary diff extension (not yet used)
manifest hash: fbfdcab2b1f4856af45176fcf2423d602dfe88c4
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.0 (GNU/Linux)
iD8DBQFCuL6eywK+sNU5EO8RAoT+AJ95z9gx2IrucBS30CnCQFkZT7lSbgCgnXh5
ScZcgwzJ/mEo9i5vuKNlCIs=
=Gr58
-----END PGP SIGNATURE-----
author | mpm@selenic.com |
---|---|
date | Tue, 21 Jun 2005 17:27:58 -0800 |
parents | |
children | 9e9f7ab43ce2 |
comparison
equal
deleted
inserted
replaced
399:f060d728fe54 | 400:8b067bde6679 |
---|---|
1 /* | |
2 bdiff.c - efficient binary diff extension for Mercurial | |
3 | |
4 Copyright 2005 Matt Mackall <mpm@selenic.com> | |
5 | |
6 This software may be used and distributed according to the terms of | |
7 the GNU General Public License, incorporated herein by reference. | |
8 | |
9 Based roughly on Python difflib | |
10 */ | |
11 | |
12 #include <Python.h> | |
13 #include <stdlib.h> | |
14 #include <string.h> | |
15 #include <netinet/in.h> | |
16 #include <sys/types.h> | |
17 | |
18 struct line { | |
19 int h, len, n; | |
20 const char *l; | |
21 }; | |
22 | |
23 struct hunk { | |
24 int a1, a2, b1, b2; | |
25 }; | |
26 | |
27 struct hunklist { | |
28 struct hunk *base, *head; | |
29 }; | |
30 | |
31 static inline uint32_t rol32(uint32_t word, unsigned int shift) | |
32 { | |
33 return (word << shift) | (word >> (32 - shift)); | |
34 } | |
35 | |
36 int splitlines(const char *a, int len, struct line **lr) | |
37 { | |
38 int h, i; | |
39 const char *p, *b = a; | |
40 struct line *l; | |
41 | |
42 /* count the lines */ | |
43 i = 1; /* extra line for sentinel */ | |
44 for (p = a; p < a + len; p++) | |
45 if (*p == '\n' || p == a + len - 1) | |
46 i++; | |
47 | |
48 *lr = l = malloc(sizeof(struct line) * i); | |
49 if (!l) | |
50 return -1; | |
51 | |
52 /* build the line array and calculate hashes */ | |
53 h = 0; | |
54 for (p = a; p < a + len; p++) { | |
55 h = *p + rol32(h, 7); /* a simple hash from GNU diff */ | |
56 if (*p == '\n' || p == a + len - 1) { | |
57 l->len = p - b + 1; | |
58 l->h = h; | |
59 l->l = b; | |
60 l->n = -1; | |
61 l++; | |
62 b = p + 1; | |
63 h = 0; | |
64 } | |
65 } | |
66 | |
67 /* set up a sentinel */ | |
68 l->h = l->len = 0; | |
69 l->l = a + len; | |
70 return i - 1; | |
71 } | |
72 | |
73 int inline cmp(struct line *a, struct line *b) | |
74 { | |
75 return a->len != b->len || memcmp(a->l, b->l, a->len); | |
76 } | |
77 | |
78 static int equatelines(struct line *a, int an, struct line *b, int bn) | |
79 { | |
80 int i, j, buckets = 1, t, *h, *l; | |
81 | |
82 /* build a hash table of the next highest power of 2 */ | |
83 while (buckets < bn + 1) | |
84 buckets *= 2; | |
85 | |
86 h = malloc(buckets * sizeof(int)); | |
87 l = calloc(buckets, sizeof(int)); | |
88 buckets = buckets - 1; | |
89 if (!h || !l) { | |
90 free(h); | |
91 return 0; | |
92 } | |
93 | |
94 /* clear the hash table */ | |
95 for (i = 0; i <= buckets; i++) | |
96 h[i] = -1; | |
97 | |
98 /* add lines to the hash table chains */ | |
99 for (i = bn - 1; i >= 0; i--) { | |
100 /* find the equivalence class */ | |
101 for (j = b[i].h & buckets; h[j] != -1; j = (j + 1) & buckets) | |
102 if (!cmp(b + i, b + h[j])) | |
103 break; | |
104 | |
105 /* add to the head of the equivalence class */ | |
106 b[i].n = h[j]; | |
107 b[i].h = j; | |
108 h[j] = i; | |
109 l[j]++; /* keep track of popularity */ | |
110 } | |
111 | |
112 /* compute popularity threshold */ | |
113 t = (bn >= 200) ? bn / 100 : bn + 1; | |
114 | |
115 /* match items in a to their equivalence class in b */ | |
116 for (i = 0; i < an; i++) { | |
117 /* find the equivalence class */ | |
118 for (j = a[i].h & buckets; h[j] != -1; j = (j + 1) & buckets) | |
119 if (!cmp(a + i, b + h[j])) | |
120 break; | |
121 | |
122 a[i].h = j; /* use equivalence class for quick compare */ | |
123 if(l[j] <= t) | |
124 a[i].n = h[j]; /* point to head of match list */ | |
125 else | |
126 a[i].n = -1; /* too popular */ | |
127 } | |
128 | |
129 /* discard hash tables */ | |
130 free(h); | |
131 free(l); | |
132 return 1; | |
133 } | |
134 | |
135 static int longest_match(struct line *a, struct line *b, int *jpos, int *jlen, | |
136 int a1, int a2, int b1, int b2, int *omi, int *omj) | |
137 { | |
138 int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k; | |
139 | |
140 for (i = a1; i < a2; i++) { | |
141 /* skip things before the current block */ | |
142 for (j = a[i].n; j != -1 && j < b1; j = b[j].n) | |
143 ; | |
144 | |
145 /* loop through all lines match a[i] in b */ | |
146 for (; j != -1 && j < b2; j = b[j].n) { | |
147 /* does this extend an earlier match? */ | |
148 if (i > a1 && j > b1 && jpos[j - 1] == i) | |
149 k = jlen[j - 1] + 1; | |
150 else | |
151 k = 1; | |
152 jpos[j] = i + 1; | |
153 jlen[j] = k; | |
154 | |
155 /* best match so far? */ | |
156 if (k > mk) { | |
157 mi = i; | |
158 mj = j; | |
159 mk = k; | |
160 } | |
161 } | |
162 } | |
163 | |
164 if (mk) { | |
165 mi = mi - mk + 1; | |
166 mj = mj - mk + 1; | |
167 } | |
168 | |
169 /* expand match to include neighboring popular lines */ | |
170 while (mi - mb > a1 && mj - mb > b1 && | |
171 a[mi - mb - 1].h == b[mj - mb - 1].h) | |
172 mb++; | |
173 while (mi + mk < a2 && mj + mk < b2 && | |
174 a[mi + mk].h == b[mj + mk].h) | |
175 mk++; | |
176 | |
177 *omi = mi - mb; | |
178 *omj = mj - mb; | |
179 return mk + mb; | |
180 } | |
181 | |
182 static void recurse(struct line *a, struct line *b, int *jpos, int *jlen, | |
183 int a1, int a2, int b1, int b2, struct hunklist *l) | |
184 { | |
185 int i, j, k; | |
186 | |
187 /* find the longest match in this chunk */ | |
188 k = longest_match(a, b, jpos, jlen, a1, a2, b1, b2, &i, &j); | |
189 if (!k) | |
190 return; | |
191 | |
192 /* and recurse on the remaining chunks on either side */ | |
193 recurse(a, b, jpos, jlen, a1, i, b1, j, l); | |
194 l->head->a1 = i; | |
195 l->head->a2 = i + k; | |
196 l->head->b1 = j; | |
197 l->head->b2 = j + k; | |
198 l->head++; | |
199 recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l); | |
200 } | |
201 | |
202 static PyObject *bdiff(PyObject *self, PyObject *args) | |
203 { | |
204 PyObject *sa, *sb, *result = NULL; | |
205 struct hunklist l; | |
206 struct hunk *h; | |
207 struct line *al, *bl; | |
208 char encode[12], *rb; | |
209 int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen; | |
210 | |
211 if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb)) | |
212 return NULL; | |
213 | |
214 /* allocate and fill arrays */ | |
215 an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al); | |
216 bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl); | |
217 t = equatelines(al, an, bl, bn); | |
218 jpos = calloc(bn, sizeof(int)); | |
219 jlen = calloc(bn, sizeof(int)); | |
220 l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2)); | |
221 if (!al || !bl || !jpos || !jlen || !l.base || !t) | |
222 goto nomem; | |
223 | |
224 /* generate the matching block list */ | |
225 recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l); | |
226 l.head->a1 = an; | |
227 l.head->b1 = bn; | |
228 l.head++; | |
229 | |
230 /* calculate length of output */ | |
231 for(h = l.base; h != l.head; h++) { | |
232 if (h->a1 != la || h->b1 != lb) | |
233 len += 12 + bl[h->b1].l - bl[lb].l; | |
234 la = h->a2; | |
235 lb = h->b2; | |
236 } | |
237 | |
238 result = PyString_FromStringAndSize(NULL, len); | |
239 if (!result) | |
240 goto nomem; | |
241 | |
242 /* build binary patch */ | |
243 rb = PyString_AsString(result); | |
244 la = lb = 0; | |
245 | |
246 for(h = l.base; h != l.head; h++) { | |
247 if (h->a1 != la || h->b1 != lb) { | |
248 len = bl[h->b1].l - bl[lb].l; | |
249 *(uint32_t *)(encode) = htonl(al[la].l - al->l); | |
250 *(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l); | |
251 *(uint32_t *)(encode + 8) = htonl(len); | |
252 memcpy(rb, encode, 12); | |
253 memcpy(rb + 12, bl[lb].l, len); | |
254 rb += 12 + len; | |
255 } | |
256 la = h->a2; | |
257 lb = h->b2; | |
258 } | |
259 | |
260 nomem: | |
261 free(al); | |
262 free(bl); | |
263 free(jpos); | |
264 free(jlen); | |
265 free(l.base); | |
266 return result ? result : PyErr_NoMemory(); | |
267 } | |
268 | |
269 static char mdiff_doc[] = "Efficient binary diff."; | |
270 | |
271 static PyMethodDef methods[] = { | |
272 {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, | |
273 {NULL, NULL} | |
274 }; | |
275 | |
276 PyMODINIT_FUNC initbdiff(void) | |
277 { | |
278 Py_InitModule3("bdiff", methods, mdiff_doc); | |
279 } |