Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 1643

kernel-2.6.18-238.el5.src.rpm

From: David S. Miller <davem@redhat.com>
Subject: Re: [RHEL5 bz#222122]: Fix ipv6 routing regression.
Date: Wed, 18 Apr 2007 19:47:51 -0400 (EDT)
Bugzilla: 222122
Message-Id: <20070418.194751.48867114.davem@redhat.com>
Changelog: [ipv6]: Fix routing regression.



> On Wed, 2007-04-18 at 16:27 -0400, David S. Miller wrote:
> > Thomas do you have any idea what might be going on here?
> 
> We never disabled the round robin code upstream and RHEL5
> is missing the Changeset
> 
> [IPV6] ROUTE: Do not enable router reachability probing in router mode
> 
> which adds the RT6_LOOKUP_F_REACHABLE conditions.
> 
> Don, I think your patch is correct.

Here is what happened, I posted the upstream patch instead
of the RHEL5 one even after going over it a million
times with a fine-toothed comb. :-/

Here is the patch I should have posted:

commit 461d2bb49596fae02db2e1c4500ee48b1a5bafe5
Author: David S. Miller <davem@sunset.davemloft.net>

    [IPV6]: Fix routing round-robin locking.
    
    As per RFC2461, section 6.3.6, item #2, when no routers on the
    matching list are known to be reachable or probably reachable we
    do round robin on those available routes so that we make sure
    to probe as many of them as possible to detect when one becomes
    reachable faster.
    
    Each routing table has a rwlock protecting the tree and the linked
    list of routes at each leaf.  The round robin code executes during
    lookup and thus with the rwlock taken as a reader.  A small local
    spinlock tries to provide protection but this does not work at all
    for two reasons:
    
    1) The round-robin list manipulation, as coded, goes like this (with
       read lock held):
    
    	walk routes finding head and tail
    
    	spin_lock();
    	rotate list using head and tail
    	spin_unlock();
    
       While one thread is rotating the list, another thread can
       end up with stale values of head and tail and then proceed
       to corrupt the list when it gets the lock.  This ends up causing
       the OOPS in fib6_add() later onthat many people have been hitting.
    
    2) All the other code paths that run with the rwlock held as
       a reader do not expect the list to change on them, they
       expect it to remain completely fixed while they hold the
       lock in that way.
    
    So, simply stated, it is impossible to implement this correctly using
    a manipulation of the list without violating the rwlock locking
    semantics.
    
    Reimplement using a per-fib6_node round-robin pointer.  This way we
    don't need to manipulate the list at all, and since the round-robin
    pointer can only ever point to real existing entries we don't need
    to perform any locking on the changing of the round-robin pointer
    itself.  We only need to reset the round-robin pointer to NULL when
    the entry it is pointing to is removed.
    
    The idea is from Thomas Graf and it is very similar to how this
    was implemented before the advanced router selection code when in.
    
    Signed-off-by: David S. Miller <davem@davemloft.net>

--- a/include/net/ip6_fib.h.~1~	2007-03-26 18:57:45.000000000 -0700
+++ b/include/net/ip6_fib.h	2007-03-26 19:04:39.000000000 -0700
@@ -37,6 +37,9 @@ struct fib6_node
 	__u16			fn_bit;		/* bit key */
 	__u16			fn_flags;
 	__u32			fn_sernum;
+#ifndef __GENKSYMS__
+	struct rt6_info		*rr_ptr;
+#endif
 };
 
 #ifndef CONFIG_IPV6_SUBTREES
--- a/net/ipv6/route.c.~1~	2007-03-26 18:58:00.000000000 -0700
+++ b/net/ipv6/route.c	2007-03-26 19:04:59.000000000 -0700
@@ -352,64 +352,75 @@ static int rt6_score_route(struct rt6_in
 	return m;
 }
 
-static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
-				   int strict)
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+				   int *mpri, struct rt6_info *match)
 {
-	struct rt6_info *match = NULL, *last = NULL;
-	struct rt6_info *rt, *rt0 = *head;
-	u32 metric;
+	int m;
+
+	if (rt6_check_expired(rt))
+		goto out;
+
+	m = rt6_score_route(rt, oif, strict);
+	if (m < 0)
+		goto out;
+
+	if (m > *mpri) {
+		rt6_probe(match);
+		*mpri = m;
+		match = rt;
+	} else {
+		rt6_probe(rt);
+	}
+
+out:
+	return match;
+}
+
+static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+				     struct rt6_info *rr_head,
+				     u32 metric, int oif, int strict)
+{
+	struct rt6_info *rt, *match;
 	int mpri = -1;
 
-	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
-		  __FUNCTION__, head, head ? *head : NULL, oif);
+	match = NULL;
+	for (rt = rr_head; rt && rt->rt6i_metric == metric;
+	     rt = rt->u.next)
+		match = find_match(rt, oif, strict, &mpri, match);
+	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
+	     rt = rt->u.next)
+		match = find_match(rt, oif, strict, &mpri, match);
 
-	for (rt = rt0, metric = rt0->rt6i_metric;
-	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
-	     rt = rt->u.next) {
-		int m;
+	return match;
+}
 
-		if (rt6_check_expired(rt))
-			continue;
+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+{
+	struct rt6_info *match, *rt0;
 
-		last = rt;
+	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
+		  __FUNCTION__, fn->leaf, oif);
 
-		m = rt6_score_route(rt, oif, strict);
-		if (m < 0)
-			continue;
+	rt0 = fn->rr_ptr;
+	if (!rt0)
+		fn->rr_ptr = rt0 = fn->leaf;
 
-		if (m > mpri) {
-			rt6_probe(match);
-			match = rt;
-			mpri = m;
-		} else {
-			rt6_probe(rt);
-		}
-	}
+	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 
-#if 0
-	/*
-	 * The round-robin code below assumes and produces cyclic
-	 * leaf lists. Some code in ip6_fib.c expects and produces
-	 * NULL terminated lists which leads to the current leaf
-	 * head to turn NULL when the end of the list is being
-	 * reached. It's best to disable this code alltogether until
-	 * all of the list management has been fixed.
-	 */
 	if (!match &&
-	    (strict & RT6_LOOKUP_F_REACHABLE) &&
-	    last && last != rt0) {
+	    (strict & RT6_LOOKUP_F_REACHABLE)) {
+		struct rt6_info *next = rt0->u.next;
+
 		/* no entries matched; do round-robin */
-		static DEFINE_SPINLOCK(lock);
-		spin_lock(&lock);
-		*head = rt0->u.next;
-		rt0->u.next = last->u.next;
-		last->u.next = rt0;
-		spin_unlock(&lock);
+		if (!next || next->rt6i_metric != rt0->rt6i_metric)
+			next = fn->leaf;
+
+		if (next != rt0)
+			fn->rr_ptr = next;
 	}
-#endif
 
-	RT6_TRACE("%s() => %p, score=%d\n",
-		  __FUNCTION__, match, mpri);
+	RT6_TRACE("%s() => %p\n",
+		  __FUNCTION__, match);
 
 	return (match ? match : &ip6_null_entry);
 }
@@ -653,7 +664,7 @@ restart_2:
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
+	rt = rt6_select(fn, fl->iif, strict | reachable);
 	BACKTRACK(&fl->fl6_src);
 	if (rt == &ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
@@ -750,7 +761,7 @@ restart_2:
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
+	rt = rt6_select(fn, fl->oif, strict | reachable);
 	BACKTRACK(&fl->fl6_src);
 	if (rt == &ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
--- a/net/ipv6/ip6_fib.c.~1~	2007-03-26 18:57:53.000000000 -0700
+++ b/net/ipv6/ip6_fib.c	2007-03-26 19:04:39.000000000 -0700
@@ -639,6 +639,10 @@ static int fib6_add_rt2node(struct fib6_
 		ins = &iter->u.next;
 	}
 
+	/* Reset round-robin state, if necessary */
+	if (ins == &fn->leaf)
+		fn->rr_ptr = NULL;
+
 	/*
 	 *	insert node
 	 */
@@ -1091,6 +1095,10 @@ static void fib6_del_route(struct fib6_n
 	rt6_stats.fib_rt_entries--;
 	rt6_stats.fib_discarded_routes++;
 
+	/* Reset round-robin state, if necessary */
+	if (fn->rr_ptr == rt)
+		fn->rr_ptr = NULL;
+
 	/* Adjust walkers */
 	read_lock(&fib6_walker_lock);
 	FOR_WALKERS(w) {