Sophie: kernel24-0:2.4.34-1 src

kernel24-2.4.34-1.src.rpm

diff -urN linux-2.4.29.orig/net/ipv4/route.c linux-2.4.29/net/ipv4/route.c
--- linux-2.4.29.orig/net/ipv4/route.c	2005-03-27 16:46:24.102855312 +0200
+++ linux-2.4.29/net/ipv4/route.c	2005-03-27 16:44:58.950800392 +0200
@@ -53,6 +53,10 @@
  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
+ *	Krzysztof Taraszka	:	hashed rwlocks and rt_check_expire() fixes.
+ *				:	bugfix in rt_cpu_seq_show()
+ *				:	(merged from Eric Dumazet's Linux 2.6 patch)
+ *				:	TODO: proc stat for gc_interval_ms
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -61,6 +65,7 @@
  */
 
 #include <linux/config.h>
+#include <asm/div64.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/bitops.h>
@@ -102,12 +107,13 @@
 #define IP_MAX_MTU	0xFFF0
 
 #define RT_GC_TIMEOUT (300*HZ)
+#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */
 
 int ip_rt_min_delay		= 2 * HZ;
 int ip_rt_max_delay		= 10 * HZ;
 int ip_rt_max_size;
 int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
-int ip_rt_gc_interval		= 60 * HZ;
+int ip_rt_gc_interval		= RT_GC_INTERVAL;
 int ip_rt_gc_min_interval	= HZ / 2;
 int ip_rt_redirect_number	= 9;
 int ip_rt_redirect_load		= HZ / 50;
@@ -119,6 +125,7 @@
 int ip_rt_min_pmtu		= 512 + 20 + 20;
 int ip_rt_min_advmss		= 256;
 int ip_rt_secret_interval	= 10 * 60 * HZ;
+int ip_rt_debug ;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
@@ -190,8 +197,24 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	rwlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Buckets share a fixed-size rwlock table instead of one lock each; slot i uses lock i & (RT_HASH_LOCK_SZ - 1).
+ */
+# define RT_HASH_LOCK_SZ 256	/* used as a mask below: must stay a power of two */
+static rwlock_t	rt_hash_lock[RT_HASH_LOCK_SZ];
+# define rt_hash_lock_addr(slot) (&rt_hash_lock[(slot) & (RT_HASH_LOCK_SZ - 1)])
+# define rt_hash_lock_init()	do { \
+		int i; \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			rwlock_init(&rt_hash_lock[i]); \
+	} while (0)
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()	do { } while (0)
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -227,7 +250,7 @@
   	}
 	
 	for (i = rt_hash_mask; i >= 0; i--) {
-		read_lock_bh(&rt_hash_table[i].lock);
+		read_lock_bh(rt_hash_lock_addr(i));
 		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
 			/*
 			 *	Spin through entries until we are ready
@@ -263,11 +286,11 @@
 			sprintf(buffer + len, "%-127s\n", temp);
 			len += 128;
 			if (pos >= offset+length) {
-				read_unlock_bh(&rt_hash_table[i].lock);
+				read_unlock_bh(rt_hash_lock_addr(i));
 				goto done;
 			}
 		}
-		read_unlock_bh(&rt_hash_table[i].lock);
+		read_unlock_bh(rt_hash_lock_addr(i));
         }
 
 done:
@@ -393,45 +416,93 @@
 /* This runs via a timer and thus is always in BH context. */
 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	static unsigned int effective_interval = RT_GC_INTERVAL;
+	static unsigned int cached_gc_interval = RT_GC_INTERVAL;
+	unsigned int i, goal;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
+	unsigned int freed = 0 , t0;
+	u64 mult;
 
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
-		unsigned long tmo = ip_rt_gc_timeout;
-
+	if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */
+		cached_gc_interval = ip_rt_gc_interval;
+		effective_interval = cached_gc_interval;
+	}
+	/* Compute the number of slots we should examine in this run:
+	 * We want to perform a full scan every ip_rt_gc_timeout, and
+	 * the timer is started every 'effective_interval' ticks,
+	 * so goal = (number_of_slots) * (effective_interval / ip_rt_gc_timeout)
+	 */
+	mult = ((u64)effective_interval) << rt_hash_log;
+	do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+
+	i = atomic_read(&ipv4_dst_ops.entries) << 3;
+	if (i > ip_rt_max_size) {
+		goal <<= 1; /* be more aggressive */
+		i >>= 1;
+		if (i > ip_rt_max_size) {
+			goal <<= 1; /* be more aggressive */
+			i >>= 1;
+			if (i > ip_rt_max_size) {
+				goal <<= 1; /* be more aggressive */
+				now++; /* give us one more tick (time) to do our job */
+			}
+		}
+	}
+	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	t0 = goal;
+	i = rover ;
+	for ( ; goal > 0; goal--) {
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		write_lock(&rt_hash_table[i].lock);
-		while ((rth = *rthp) != NULL) {
-			if (rth->u.dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+		if (*rthp) {
+			unsigned long tmo = ip_rt_gc_timeout;
+			write_lock(rt_hash_lock_addr(i));
+			while ((rth = *rthp) != NULL) {
+				if (rth->u.dst.expires) {
+					/* Entry is expired even if it is in use */
+					if (time_before_eq(now, rth->u.dst.expires)) {
+						tmo >>= 1;
+						rthp = &rth->u.rt_next;
+						continue;
+					}
+				} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 					tmo >>= 1;
 					rthp = &rth->u.rt_next;
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
-				continue;
+				/* Cleanup aged off entries. */
+				*rthp = rth->u.rt_next;
+				freed++;
+				rt_free(rth);
 			}
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
+			write_unlock(rt_hash_lock_addr(i));
 		}
-		write_unlock(&rt_hash_table[i].lock);
-
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	if (goal != 0) {
+		/* Not enough time to finish our job; try to adjust the timer.
+		 * Firing the timer sooner means less planned work per run.
+		 * We allow the interval to shrink towards 1/8 of the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval/8)/2;
+	}
+	else {
+		/* We finished our job before the time limit; try to lengthen the
+		 * timer interval. The limit is the sysctl value; we weight the old
+		 * interval 3:1 against it so that it increases slowly.
+		 */
+		effective_interval = (3*effective_interval + cached_gc_interval + 3)/4;
+	}
+	if (ip_rt_debug & 1)
+		printk(KERN_WARNING "rt_check_expire() : %u freed, goal=%u/%u, interval=%u ticks\n", freed, goal, t0, effective_interval);
+	mod_timer(&rt_periodic_timer, jiffies + effective_interval);
 }
 
 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
@@ -449,11 +520,11 @@
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		write_lock_bh(&rt_hash_table[i].lock);
+		write_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		write_unlock_bh(&rt_hash_table[i].lock);
+		write_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -585,7 +656,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			write_lock_bh(&rt_hash_table[k].lock);
+			write_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -596,7 +667,7 @@
 				rt_free(rth);
 				goal--;
 			}
-			write_unlock_bh(&rt_hash_table[k].lock);
+			write_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -666,7 +737,7 @@
 
 	rthp = &rt_hash_table[hash].chain;
 
-	write_lock_bh(&rt_hash_table[hash].lock);
+	write_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
 			/* Put it first */
@@ -677,7 +748,7 @@
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			write_unlock_bh(&rt_hash_table[hash].lock);
+			write_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -718,7 +789,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			write_unlock_bh(&rt_hash_table[hash].lock);
+			write_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -759,7 +830,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	write_unlock_bh(&rt_hash_table[hash].lock);
+	write_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -826,7 +897,7 @@
 {
 	struct rtable **rthp;
 
-	write_lock_bh(&rt_hash_table[hash].lock);
+	write_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -835,7 +906,7 @@
 			rt_free(rt);
 			break;
 		}
-	write_unlock_bh(&rt_hash_table[hash].lock);
+	write_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -874,7 +945,7 @@
 
 			rthp=&rt_hash_table[hash].chain;
 
-			read_lock(&rt_hash_table[hash].lock);
+			read_lock(rt_hash_lock_addr(hash));
 			while ((rth = *rthp) != NULL) {
 				struct rtable *rt;
 
@@ -895,7 +966,7 @@
 					break;
 
 				dst_hold(&rth->u.dst);
-				read_unlock(&rt_hash_table[hash].lock);
+				read_unlock(rt_hash_lock_addr(hash));
 
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
@@ -942,7 +1013,7 @@
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			read_unlock(&rt_hash_table[hash].lock);
+			read_unlock(rt_hash_lock_addr(hash));
 		do_next:
 			;
 		}
@@ -1123,7 +1194,7 @@
 	for (i = 0; i < 2; i++) {
 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
 
-		read_lock(&rt_hash_table[hash].lock);
+		read_lock(rt_hash_lock_addr(hash));
 		for (rth = rt_hash_table[hash].chain; rth;
 		     rth = rth->u.rt_next) {
 			if (rth->key.dst == daddr &&
@@ -1161,7 +1232,7 @@
 				}
 			}
 		}
-		read_unlock(&rt_hash_table[hash].lock);
+		read_unlock(rt_hash_lock_addr(hash));
 	}
 	return est_mtu ? : new_mtu;
 }
@@ -1738,7 +1809,7 @@
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 
-	read_lock(&rt_hash_table[hash].lock);
+	read_lock(rt_hash_lock_addr(hash));
 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 		if (rth->key.dst == daddr &&
 		    rth->key.src == saddr &&
@@ -1753,13 +1824,13 @@
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			rt_cache_stat[smp_processor_id()].in_hit++;
-			read_unlock(&rt_hash_table[hash].lock);
+			read_unlock(rt_hash_lock_addr(hash));
 			skb->dst = (struct dst_entry*)rth;
 			return 0;
 		}
 		rt_cache_stat[smp_processor_id()].in_hlist_search++;
 	}
-	read_unlock(&rt_hash_table[hash].lock);
+	read_unlock(rt_hash_lock_addr(hash));
 
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
@@ -2115,7 +2186,7 @@
 
 	hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
 
-	read_lock_bh(&rt_hash_table[hash].lock);
+	read_lock_bh(rt_hash_lock_addr(hash));
 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 		if (rth->key.dst == key->dst &&
 		    rth->key.src == key->src &&
@@ -2131,13 +2202,13 @@
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			rt_cache_stat[smp_processor_id()].out_hit++;
-			read_unlock_bh(&rt_hash_table[hash].lock);
+			read_unlock_bh(rt_hash_lock_addr(hash));
 			*rp = rth;
 			return 0;
 		}
 		rt_cache_stat[smp_processor_id()].out_hlist_search++;
 	}
-	read_unlock_bh(&rt_hash_table[hash].lock);
+	read_unlock_bh(rt_hash_lock_addr(hash));
 
 	return ip_route_output_slow(rp, key);
 }	
@@ -2325,7 +2396,7 @@
 		if (h < s_h) continue;
 		if (h > s_h)
 			s_idx = 0;
-		read_lock_bh(&rt_hash_table[h].lock);
+		read_lock_bh(rt_hash_lock_addr(h));
 		for (rt = rt_hash_table[h].chain, idx = 0; rt;
 		     rt = rt->u.rt_next, idx++) {
 			if (idx < s_idx)
@@ -2335,12 +2406,12 @@
 					 cb->nlh->nlmsg_seq,
 					 RTM_NEWROUTE, 1) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
-				read_unlock_bh(&rt_hash_table[h].lock);
+				read_unlock_bh(rt_hash_lock_addr(h));
 				goto done;
 			}
 			dst_release(xchg(&skb->dst, NULL));
 		}
-		read_unlock_bh(&rt_hash_table[h].lock);
+		read_unlock_bh(rt_hash_lock_addr(h));
 	}
 
 done:
@@ -2456,6 +2527,14 @@
 		strategy:	&sysctl_jiffies,
 	},
 	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_DEBUG,
+		.procname	= "gc_debug",
+		.data		= &ip_rt_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		ctl_name:	NET_IPV4_ROUTE_REDIRECT_LOAD,
 		procname:	"redirect_load",
 		data:		&ip_rt_redirect_load,
@@ -2593,7 +2672,7 @@
 
 void __init ip_rt_init(void)
 {
-	int i, order, goal;
+	int order, goal;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
@@ -2640,14 +2719,12 @@
 	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
 		/* NOTHING */;
 
-	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
-		rt_hash_table[i].chain = NULL;
-	}
+	memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = rt_hash_mask;
+	ip_rt_max_size = rt_hash_mask * 16;
+	rt_hash_mask--;
 
 	devinet_init();
 	ip_fib_init();
--- linux-2.4.29.orig/include/linux/sysctl.h	2005-01-19 15:10:13.000000000 +0100
+++ linux-2.4.29/include/linux/sysctl.h	2005-03-27 15:06:12.000000000 +0200
@@ -347,6 +347,7 @@
 	NET_IPV4_ROUTE_MIN_PMTU=16,
 	NET_IPV4_ROUTE_MIN_ADVMSS=17,
 	NET_IPV4_ROUTE_SECRET_INTERVAL=18,
+	NET_IPV4_ROUTE_GC_DEBUG=21,
 };
 
 enum