Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 2754

kernel-2.6.18-238.el5.src.rpm

From: Thomas Graf <tgraf@redhat.com>
Date: Fri, 3 Sep 2010 10:44:32 -0400
Subject: [net] ipv4: fix leak, rcu and length in route cache gc
Message-id: <20100903104432.GA18556@lsx.localdomain>
Patchwork-id: 28061
O-Subject: [RHEL5.6 PATCH] net: fix leak, rcu assignment and length computation
	in route cache gc
Bugzilla: 541224
RH-Acked-by: Neil Horman <nhorman@redhat.com>
RH-Acked-by: David S. Miller <davem@redhat.com>

The addition of emergency route cache flushing introduced a bug
which results in the leakage of dst_entries. This patch addresses
this issue together with two other bugs in the same code path.

The following upstream commits have been back ported:

commit 1ddbcb005c395518c2cd0df504cff3d4b5c85853
Author: Eric Dumazet <dada1@cosmosbay.com>
Date:   Tue May 19 20:14:28 2009 +0000

    net: fix rtable leak in net/ipv4/route.c

     [...]
     2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately
     patch has at least one critical flaw, and another problem.

     rt_intern_hash calculates rthi pointer, which is later used for new entry
     insertion. The same loop calculates cand pointer which is used to clean the
     list. If the pointers are the same, rtable leak occurs, as first the cand is
     removed then the new entry is appended to it.
     [...]

commit 00269b54edbf25f3bb0dccb558ae23a6fc77ed86
Author: Eric Dumazet <dada1@cosmosbay.com>
Date:   Thu Oct 16 14:18:29 2008 -0700

    ipv4: Add a missing rcu_assign_pointer() in routing cache.

    rt_intern_hash() is doing an update of a RCU guarded hash chain
    without using rcu_assign_pointer() or equivalent barrier.

commit cf8da764fc6959b7efb482f375dfef9830e98205
Author: Eric Dumazet <dada1@cosmosbay.com>
Date:   Tue May 19 18:54:22 2009 +0000

    net: fix length computation in rt_check_expire()

    rt_check_expire() computes average and standard deviation of chain lengths,
    but does not correctly reset length to 0 at beginning of each chain.
    This probably gives overflows for sum2 (and sum) on loaded machines instead
    of meaningful results.

Tested by reporter and myself.

Brew build:
https://brewweb.devel.redhat.com/taskinfo?taskID=2719827

Signed-off-by: Jarod Wilson <jarod@redhat.com>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 261c70f..9b2e68d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -646,8 +646,8 @@ static void rt_check_expire(unsigned long dummy)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
-	unsigned long length = 0, samples = 0;
+	struct rtable *rth, *aux, **rthp;
+	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	unsigned long now = jiffies;
 	u64 mult;
@@ -659,6 +659,7 @@ static void rt_check_expire(unsigned long dummy)
 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
@@ -667,39 +668,37 @@ static void rt_check_expire(unsigned long dummy)
 
 		if (*rthp == 0)
 			continue;
+		length = 0;
 		spin_lock(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			prefetch(rth->u.rt_next);
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(now, rth->u.dst.expires)) {
+nofree:
 					tmo >>= 1;
 					rthp = &rth->u.rt_next;
 					/*
-					 * Only bump our length if the hash
-					 * inputs on entries n and n+1 are not
-					 * the same, we only count entries on
+					 * We only count entries on
 					 * a chain with equal hash inputs once
 					 * so that entries for different QOS
 					 * levels, and other non-hash input
 					 * attributes don't unfairly skew
 					 * the length computation
 					 */
-					if ((*rthp == NULL) ||
-					    !compare_hash_inputs(&(*rthp)->fl,
-								 &rth->fl))
-						length += ONE;
-
+					for (aux = rt_hash_table[i].chain;;) {
+						if (aux == rth) {
+							length += ONE;
+							break;
+						}
+						if (compare_hash_inputs(&aux->fl, &rth->fl))
+							break;
+						aux = aux->u.rt_next;
+					}
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
-				if ((*rthp == NULL) || 
-				   !compare_hash_inputs(&(*rthp)->fl,
-							&rth->fl))
-					length += ONE;
-				continue;
-			}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
 
 			/* Cleanup aged off entries. */
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
@@ -995,7 +994,6 @@ out:	return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
-	struct rtable   *rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -1043,7 +1041,6 @@ restart:
 	}
 
 	rthp = &rt_hash_table[hash].chain;
-	rthi = NULL;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1091,17 +1088,6 @@ restart:
 		chain_length++;
 
 		rthp = &rth->u.rt_next;
-
-		/*
-		 * check to see if the next entry in the chain
-		 * contains the same hash input values as rt.  If it does
-		 * This is where we will insert into the list, instead of
-		 * at the head.  This groups entries that differ by aspects not
-		 * relvant to the hash function together, which we use to adjust
-		 * our chain length
-		 */
-		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
-			rthi = rth;
 	}
 
 	if (cand) {
@@ -1168,10 +1154,7 @@ restart:
 		}
 	}
 
-	if (rthi)
-		rt->u.rt_next = rthi->u.rt_next;
-	else
-		rt->u.rt_next = rt_hash_table[hash].chain;
+	rt->u.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.rt_next) {
@@ -1183,10 +1166,7 @@ restart:
 		printk("\n");
 	}
 #endif
-	if (rthi)
-		rthi->u.rt_next = rt;
-	else
-		rt_hash_table[hash].chain = rt;
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 skip_hashing: