diff -urN linux-2.4.29.orig/net/ipv4/route.c linux-2.4.29/net/ipv4/route.c --- linux-2.4.29.orig/net/ipv4/route.c 2005-03-27 16:46:24.102855312 +0200 +++ linux-2.4.29/net/ipv4/route.c 2005-03-27 16:44:58.950800392 +0200 @@ -53,6 +53,10 @@ * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics + * Krzysztof Taraszka : hashed spinlocks and rt_check_expire() fixes. + * : bugfix in rt_cpu_seq_show() + * : (merged from Eric Dumazet linux 2.6 patch) + * : TODO: proc stat for gc_interval_ms * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -61,6 +65,7 @@ */ #include <linux/config.h> +#include <asm/div64.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -102,12 +107,13 @@ #define IP_MAX_MTU 0xFFF0 #define RT_GC_TIMEOUT (300*HZ) +#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */ int ip_rt_min_delay = 2 * HZ; int ip_rt_max_delay = 10 * HZ; int ip_rt_max_size; int ip_rt_gc_timeout = RT_GC_TIMEOUT; -int ip_rt_gc_interval = 60 * HZ; +int ip_rt_gc_interval = RT_GC_INTERVAL; int ip_rt_gc_min_interval = HZ / 2; int ip_rt_redirect_number = 9; int ip_rt_redirect_load = HZ / 50; @@ -119,6 +125,7 @@ int ip_rt_min_pmtu = 512 + 20 + 20; int ip_rt_min_advmss = 256; int ip_rt_secret_interval = 10 * 60 * HZ; +int ip_rt_debug ; static unsigned long rt_deadline; #define RTprint(a...) 
printk(KERN_DEBUG a) @@ -190,8 +197,24 @@ struct rt_hash_bucket { struct rtable *chain; - rwlock_t lock; -} __attribute__((__aligned__(8))); +}; + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of fixed size spinlocks + */ +# define RT_HASH_LOCK_SZ 256 + static rwlock_t rt_hash_lock[RT_HASH_LOCK_SZ]; +# define rt_hash_lock_addr(slot) &rt_hash_lock[slot & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + rwlock_init(&rt_hash_lock[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -227,7 +250,7 @@ } for (i = rt_hash_mask; i >= 0; i--) { - read_lock_bh(&rt_hash_table[i].lock); + read_lock_bh(rt_hash_lock_addr(i)); for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { /* * Spin through entries until we are ready @@ -263,11 +286,11 @@ sprintf(buffer + len, "%-127s\n", temp); len += 128; if (pos >= offset+length) { - read_unlock_bh(&rt_hash_table[i].lock); + read_unlock_bh(rt_hash_lock_addr(i)); goto done; } } - read_unlock_bh(&rt_hash_table[i].lock); + read_unlock_bh(rt_hash_lock_addr(i)); } done: @@ -393,45 +416,93 @@ /* This runs via a timer and thus is always in BH context. 
*/ static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + static unsigned int effective_interval = RT_GC_INTERVAL; + static unsigned int cached_gc_interval = RT_GC_INTERVAL; + unsigned int i, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; + unsigned int freed = 0 , t0; + u64 mult; - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { - unsigned long tmo = ip_rt_gc_timeout; - + if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */ + cached_gc_interval = ip_rt_gc_interval; + effective_interval = cached_gc_interval; + } + /* Computes the number of slots we should examine in this run : + * We want to perform a full scan every ip_rt_gc_timeout, and + * the timer is started every 'effective_interval' ticks. + * so goal = (number_of_slots) * (effective_interval / ip_rt_gc_timeout) + */ + mult = ((u64)effective_interval) << rt_hash_log; + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + + i = atomic_read(&ipv4_dst_ops.entries) << 3; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + i >>= 1; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + i >>= 1; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + now++; /* give us one more tick (time) to do our job */ + } + } + } + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + t0 = goal; + i = rover ; + for ( ; goal > 0; goal--) { i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - write_lock(&rt_hash_table[i].lock); - while ((rth = *rthp) != NULL) { - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (*rthp) { + unsigned long tmo = ip_rt_gc_timeout; + write_lock(rt_hash_lock_addr(i)); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if 
(time_before_eq(now, rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.rt_next; continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; + /* Cleanup aged off entries. */ + *rthp = rth->u.rt_next; + freed++; + rt_free(rth); } - - /* Cleanup aged off entries. */ - *rthp = rth->u.rt_next; - rt_free(rth); + write_unlock(rt_hash_lock_addr(i)); } - write_unlock(&rt_hash_table[i].lock); - /* Fallback loop breaker. */ if (time_after(jiffies, now)) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + if (goal != 0) { + /* Not enough time to perform our job, try to adjust the timer. + * Firing the timer sooner means less planned work. + * We allow the timer to be 1/8 of the sysctl value. + */ + effective_interval = (effective_interval + cached_gc_interval/8)/2; + } + else { + /* We finished our job before time limit, try to increase the timer + * The limit is the sysctl value, we use a weight of 3/1 to + * increase slowly. 
+ */ + effective_interval = (3*effective_interval + cached_gc_interval + 3)/4; + } + if (ip_rt_debug & 1) + printk(KERN_WARNING "rt_check_expire() : %u freed, goal=%u/%u, interval=%u ticks\n", freed, goal, t0, effective_interval); + mod_timer(&rt_periodic_timer, jiffies + effective_interval); } SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task); @@ -449,11 +520,11 @@ get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - write_lock_bh(&rt_hash_table[i].lock); + write_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - write_unlock_bh(&rt_hash_table[i].lock); + write_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -585,7 +656,7 @@ k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - write_lock_bh(&rt_hash_table[k].lock); + write_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -596,7 +667,7 @@ rt_free(rth); goal--; } - write_unlock_bh(&rt_hash_table[k].lock); + write_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -666,7 +737,7 @@ rthp = &rt_hash_table[hash].chain; - write_lock_bh(&rt_hash_table[hash].lock); + write_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { /* Put it first */ @@ -677,7 +748,7 @@ rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - write_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -718,7 +789,7 @@ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - write_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -759,7 +830,7 @@ } #endif rt_hash_table[hash].chain = rt; - write_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ 
-826,7 +897,7 @@ { struct rtable **rthp; - write_lock_bh(&rt_hash_table[hash].lock); + write_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -835,7 +906,7 @@ rt_free(rt); break; } - write_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -874,7 +945,7 @@ rthp=&rt_hash_table[hash].chain; - read_lock(&rt_hash_table[hash].lock); + read_lock(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { struct rtable *rt; @@ -895,7 +966,7 @@ break; dst_hold(&rth->u.dst); - read_unlock(&rt_hash_table[hash].lock); + read_unlock(rt_hash_lock_addr(hash)); rt = dst_alloc(&ipv4_dst_ops); if (rt == NULL) { @@ -942,7 +1013,7 @@ ip_rt_put(rt); goto do_next; } - read_unlock(&rt_hash_table[hash].lock); + read_unlock(rt_hash_lock_addr(hash)); do_next: ; } @@ -1123,7 +1194,7 @@ for (i = 0; i < 2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); - read_lock(&rt_hash_table[hash].lock); + read_lock(rt_hash_lock_addr(hash)); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && @@ -1161,7 +1232,7 @@ } } } - read_unlock(&rt_hash_table[hash].lock); + read_unlock(rt_hash_lock_addr(hash)); } return est_mtu ? 
: new_mtu; } @@ -1738,7 +1809,7 @@ tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); - read_lock(&rt_hash_table[hash].lock); + read_lock(rt_hash_lock_addr(hash)); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && @@ -1753,13 +1824,13 @@ dst_hold(&rth->u.dst); rth->u.dst.__use++; rt_cache_stat[smp_processor_id()].in_hit++; - read_unlock(&rt_hash_table[hash].lock); + read_unlock(rt_hash_lock_addr(hash)); skb->dst = (struct dst_entry*)rth; return 0; } rt_cache_stat[smp_processor_id()].in_hlist_search++; } - read_unlock(&rt_hash_table[hash].lock); + read_unlock(rt_hash_lock_addr(hash)); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -2115,7 +2186,7 @@ hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos); - read_lock_bh(&rt_hash_table[hash].lock); + read_lock_bh(rt_hash_lock_addr(hash)); for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { if (rth->key.dst == key->dst && rth->key.src == key->src && @@ -2131,13 +2202,13 @@ dst_hold(&rth->u.dst); rth->u.dst.__use++; rt_cache_stat[smp_processor_id()].out_hit++; - read_unlock_bh(&rt_hash_table[hash].lock); + read_unlock_bh(rt_hash_lock_addr(hash)); *rp = rth; return 0; } rt_cache_stat[smp_processor_id()].out_hlist_search++; } - read_unlock_bh(&rt_hash_table[hash].lock); + read_unlock_bh(rt_hash_lock_addr(hash)); return ip_route_output_slow(rp, key); } @@ -2325,7 +2396,7 @@ if (h < s_h) continue; if (h > s_h) s_idx = 0; - read_lock_bh(&rt_hash_table[h].lock); + read_lock_bh(rt_hash_lock_addr(h)); for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) { if (idx < s_idx) @@ -2335,12 +2406,12 @@ cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); - read_unlock_bh(&rt_hash_table[h].lock); + read_unlock_bh(rt_hash_lock_addr(h)); goto done; } dst_release(xchg(&skb->dst, 
NULL)); } - read_unlock_bh(&rt_hash_table[h].lock); + read_unlock_bh(rt_hash_lock_addr(h)); } done: @@ -2456,6 +2527,14 @@ strategy: &sysctl_jiffies, }, { + .ctl_name = NET_IPV4_ROUTE_GC_DEBUG, + .procname = "gc_debug", + .data = &ip_rt_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { ctl_name: NET_IPV4_ROUTE_REDIRECT_LOAD, procname: "redirect_load", data: &ip_rt_redirect_load, @@ -2593,7 +2672,7 @@ void __init ip_rt_init(void) { - int i, order, goal; + int order, goal, rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); @@ -2640,14 +2719,12 @@ for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) /* NOTHING */; - rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - rt_hash_table[i].lock = RW_LOCK_UNLOCKED; - rt_hash_table[i].chain = NULL; - } + memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + ipv4_dst_ops.gc_thresh = rt_hash_mask; + ip_rt_max_size = rt_hash_mask * 16; + rt_hash_mask--; devinet_init(); ip_fib_init(); --- linux-2.4.29.orig/include/linux/sysctl.h 2005-01-19 15:10:13.000000000 +0100 +++ linux-2.4.29/include/linux/sysctl.h 2005-03-27 15:06:12.000000000 +0200 @@ -347,6 +347,7 @@ NET_IPV4_ROUTE_MIN_PMTU=16, NET_IPV4_ROUTE_MIN_ADVMSS=17, NET_IPV4_ROUTE_SECRET_INTERVAL=18, + NET_IPV4_ROUTE_GC_DEBUG=21, }; enum