kernel-2.6.18-238.el5.src.rpm

From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Mar 2008 12:27:47 -0500
Subject: [GFS2] optimise loop in gfs2_bitfit
Message-id: 1205256468.2873.74.camel@technetium.msp.redhat.com
O-Subject: [RHEL5.2 PATCH][GFS2] bz 435456: GFS2: Optimise loop in gfs2_bitfit
Bugzilla: 435456

Hi,

This patch is for GFS2 bug #435456.  It optimizes our gfs2_bitfit
function, making it 8 to 10 times faster than the current
implementation.  Initial tests indicate an overall GFS2 performance
increase of around 19 percent; more performance tests are pending.

Regards,

Bob Peterson
Red Hat GFS

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--

Acked-by: Steven Whitehouse <swhiteho@redhat.com>
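
For orientation before reading the diff, here is a simplified,
self-contained sketch of the word-at-a-time technique the new
gfs2_bitfit uses.  Each block state occupies two bits, so a mask of
0x55...55 selects the low bit of every entry in an unsigned long, and
a whole word can be skipped when that test proves it cannot contain
the state being searched for.  The names below (scan_bitmap, NBBY,
BIT_SIZE, BIT_MASK, LMASK) are illustrative stand-ins rather than the
kernel's own, and the sketch assumes an aligned buffer whose length is
a multiple of sizeof(unsigned long).

#include <limits.h>
#include <stdint.h>

/* Two bits per block, four blocks per byte (cf. GFS2_BIT_SIZE/GFS2_NBBY). */
#define NBBY		4
#define BIT_SIZE	2
#define BIT_MASK	0x3

#if ULONG_MAX == 0xffffffffUL
#define LMASK 0x55555555UL		/* low bit of every 2-bit entry */
#else
#define LMASK 0x5555555555555555UL
#endif

/*
 * Illustrative only: return the index of the first 2-bit entry equal to
 * old_state, or (uint32_t)~0 if none.  Whole unsigned longs are skipped
 * when the LMASK test shows they cannot hold a match; the scan drops to
 * byte/bit granularity only once a word breaks that test.
 */
static uint32_t scan_bitmap(const uint8_t *buffer, unsigned int buflen,
			    uint8_t old_state)
{
	const unsigned long *plong = (const unsigned long *)buffer;
	const unsigned long *lend = plong + buflen / sizeof(unsigned long);
	/*
	 * Looking for a used block (low bit set)?  A word whose low bits
	 * are all clear holds none.  Looking for a free block (low bit
	 * clear)?  A word whose low bits are all set holds none.
	 */
	unsigned long skipval = (old_state & 1) ? 0UL : LMASK;
	const uint8_t *byte, *end = buffer + buflen;
	unsigned int bit;

	while (plong < lend && (*plong & LMASK) == skipval)
		plong++;

	for (byte = (const uint8_t *)plong; byte < end; byte++)
		for (bit = 0; bit < NBBY * BIT_SIZE; bit += BIT_SIZE)
			if (((*byte >> bit) & BIT_MASK) == old_state)
				return (byte - buffer) * NBBY + bit / BIT_SIZE;

	return (uint32_t)~0;		/* nothing found, like BFITNOENT */
}

The real patch below additionally handles a misaligned starting byte,
prefetches the next word while testing the current one, and returns to
word-at-a-time scanning after examining each interesting word.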

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f390803..0100533 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/prefetch.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -33,6 +34,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
 /*
  * These routines are used by the resource group routines (rgrp.c)
  * to keep track of block allocation.  Each block is represented by two
@@ -126,47 +137,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
  * Return: the block number (bitmap buffer scope) that was found
  */
 
-static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
-		       unsigned char old_state)
+static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
+		       u8 old_state)
 {
-	unsigned char *byte;
-	u32 blk = goal;
-	unsigned int bit, bitlong;
-	unsigned long *plong, plong55;
-
-	byte = buffer + (goal / GFS2_NBBY);
-	plong = (unsigned long *)buffer + (goal / GFS2_NBBY);
-	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	bitlong = bit;
-#if BITS_PER_LONG == 32
-	plong55 = 0x55555555;
-#else
-	plong55 = 0x5555555555555555;
-#endif
-	while (byte < buffer + buflen) {
-
-		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-			plong++;
-			byte += sizeof(unsigned long);
-			blk += sizeof(unsigned long) * GFS2_NBBY;
-			continue;
+	const u8 *byte, *start, *end;
+	int bit, startbit;
+	u32 g1, g2, misaligned;
+	unsigned long *plong;
+	unsigned long lskipval;
+
+	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
+	g1 = (goal / GFS2_NBBY);
+	start = buffer + g1;
+	byte = start;
+	end = buffer + buflen;
+	g2 = ALIGN(g1, sizeof(unsigned long));
+	plong = (unsigned long *)(buffer + g2);
+	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	misaligned = g2 - g1;
+	if (!misaligned)
+		goto ulong_aligned;
+/* parse the bitmap a byte at a time */
+misaligned:
+	while (byte < end) {
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			return goal +
+				(((byte - start) * GFS2_NBBY) +
+				 ((bit - startbit) >> 1));
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-			return blk;
 		bit += GFS2_BIT_SIZE;
-		if (bit >= 8) {
+		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
 			bit = 0;
 			byte++;
+			misaligned--;
+			if (!misaligned) {
+				plong = (unsigned long *)byte;
+				goto ulong_aligned;
+			}
 		}
-		bitlong += GFS2_BIT_SIZE;
-		if (bitlong >= sizeof(unsigned long) * 8) {
-			bitlong = 0;
-			plong++;
-		}
-
-		blk++;
 	}
+	return BFITNOENT;
 
+/* parse the bitmap an unsigned long at a time */
+ulong_aligned:
+	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
+	   We could "if" it but we'd lose some of the performance gained.
+	   This way will only slow down searching the very last 4/8 bytes
+	   depending on architecture.  I've experimented with several ways
+	   of writing this section such as using an else before the goto
+	   but this one seems to be the fastest. */
+	while ((unsigned char *)plong < end - 1) {
+		prefetch(plong + 1);
+		if (((*plong) & LBITMASK) != lskipval)
+			break;
+		plong++;
+	}
+	if ((unsigned char *)plong < end) {
+		byte = (const u8 *)plong;
+		misaligned += sizeof(unsigned long) - 1;
+		goto misaligned;
+	}
 	return BFITNOENT;
 }
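
As a quick sanity check on the LBITMASK/LBITSKIP logic added above,
the small standalone test below (not part of the patch; BLKST_FREE and
BLKST_USED stand in for the on-disk GFS2_BLKST_FREE/GFS2_BLKST_USED
values 0 and 1) confirms that a word of 0x55...55 is skipped when
hunting for a free block but must still be scanned when hunting for a
used one.

#include <assert.h>
#include <limits.h>

#if ULONG_MAX == 0xffffffffUL
#define LBITMASK   0x55555555UL
#define LBITSKIP55 0x55555555UL
#define LBITSKIP00 0x00000000UL
#else
#define LBITMASK   0x5555555555555555UL
#define LBITSKIP55 0x5555555555555555UL
#define LBITSKIP00 0x0000000000000000UL
#endif

#define BLKST_FREE 0	/* stand-in for GFS2_BLKST_FREE */
#define BLKST_USED 1	/* stand-in for GFS2_BLKST_USED */

/* A word may be skipped when (word & LBITMASK) == lskipval. */
static int can_skip(unsigned long word, unsigned char old_state)
{
	unsigned long lskipval =
		(old_state & BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
	return (word & LBITMASK) == lskipval;
}

int main(void)
{
	/* Every 2-bit entry is 01 (used): nothing free here, skip it. */
	assert(can_skip(LBITSKIP55, BLKST_FREE));
	/* The same word does contain used blocks, so it must be scanned. */
	assert(!can_skip(LBITSKIP55, BLKST_USED));
	/* An all-zero word (all free) must be scanned when seeking free. */
	assert(!can_skip(0UL, BLKST_FREE));
	return 0;
}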