changeset 2577:fa76c5d609c9

bdiff: improve worst-case behavior by 100x. On a 5.8MB (244,000-line) text file with many similar lines, the hash used before this change made a diff against an empty file take 75 seconds; this change brings that down to 0.6 seconds. As a result, a clone of a smallish repo (137MB) containing some files like this takes 1 minute instead of 10 minutes. The common case of diff is now about 10% slower, probably because of worse cache locality, but diff does not affect overall performance in the common case (less than 1% of runtime is spent in diff when it is working well), so this tradeoff looks good.
author Vadim Gelfer <vadim.gelfer@gmail.com>
date Fri, 07 Jul 2006 15:02:55 -0700
parents 6a961a54f953
children cf4f0322851d
files mercurial/bdiff.c
diffstat 1 files changed, 11 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/bdiff.c
+++ b/mercurial/bdiff.c
@@ -65,7 +65,7 @@ static inline uint32_t rol32(uint32_t wo
 
 int splitlines(const char *a, int len, struct line **lr)
 {
-	int h, i;
+	int g, h, i;
 	const char *p, *b = a;
 	struct line *l;
 
@@ -82,7 +82,16 @@ int splitlines(const char *a, int len, s
 	/* build the line array and calculate hashes */
 	h = 0;
 	for (p = a; p < a + len; p++) {
-		h = *p + rol32(h, 7); /* a simple hash from GNU diff */
+		/*
+		 * a simple hash from GNU diff, with better collision
+		 * resistance from hashpjw. this slows down common
+		 * case by 10%, but speeds up worst case by 100x.
+		 */
+		h = *p + rol32(h, 7);
+		if ((g = h & 0xf0000000)) {
+			h ^= g >> 24;
+			h ^= g;
+		}
 		if (*p == '\n' || p == a + len - 1) {
 			l->len = p - b + 1;
 			l->h = h * l->len;