changeset 5366:d0c48891dd4a

bdiff: switch to lyhash lyhash is a very simple and fast hash function that had the fewest hash collisions on a 3.9M line text corpus and 190k line binary corpus and should have significantly fewer collisions than the current hash function.
author Matt Mackall <mpm@selenic.com>
date Thu, 27 Sep 2007 23:59:18 -0500
parents 458acf92b49e
children adce4d30a6ea
files mercurial/bdiff.c
diffstat 1 files changed, 6 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/bdiff.c
+++ b/mercurial/bdiff.c
@@ -59,14 +59,9 @@ struct hunklist {
 	struct hunk *base, *head;
 };
 
-static inline uint32_t rol32(uint32_t word, unsigned int shift)
-{
-        return (word << shift) | (word >> (32 - shift));
-}
-
 int splitlines(const char *a, int len, struct line **lr)
 {
-	int g, h, i;
+	int h, i;
 	const char *p, *b = a;
 	const char * const plast = a + len - 1;
 	struct line *l;
@@ -84,24 +79,17 @@ int splitlines(const char *a, int len, s
 	/* build the line array and calculate hashes */
 	h = 0;
 	for (p = a; p < a + len; p++) {
-		/*
-		 * a simple hash from GNU diff, with better collision
-		 * resistance from hashpjw. this slows down common
-		 * case by 10%, but speeds up worst case by 100x.
-		 */
-		h = *p + rol32(h, 7);
-		if ((g = h & 0xf0000000)) {
-			h ^= g >> 24;
-			h ^= g;
-		}
+		/* Leonid Yuriev's hash */
+                h = (h * 1664525) + *p + 1013904223;
+
 		if (*p == '\n' || p == plast) {
+			l->h = h;
+			h = 0;
 			l->len = p - b + 1;
-			l->h = h * l->len;
 			l->l = b;
 			l->n = INT_MAX;
 			l++;
 			b = p + 1;
-			h = 0;
 		}
 	}