Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 720

kernel-2.6.18-238.el5.src.rpm

From: Andy Gospodarek <gospo@redhat.com>
Subject: [RHEL5.1 PATCH] e1000: fix watchdog timeout panics
Date: Wed, 2 May 2007 14:01:19 -0400
Bugzilla: 217483
Message-Id: <20070502180119.GB9293@gospo.rdu.redhat.com>
Changelog: [e1000] fix watchdog timeout panics




This resolves a problem that we've seen for quite a while on rhel and
upstream with the e1000 driver.  The e1000 watchdog task times out (and
panics the box) if a perfectly timed ethtool request is made.

This resolves BZ 217483.  

It has been customer tested and verified that is resolves the reported
issue.

This patch is currently in -mm and I expect it will make it to netdev
(if not Linus' tree shortly).  The following 3 commits were pulled and
backported from Auke Kok's e1000 git tree:

git://lost.foo-projects.org/~ahkok/git/netdev-2.6

commit 1314bbf3a3d911218fc153e14873e2e384d08084
Author: Auke Kok <auke\-jan h kok intel com>

    e1000: driver state fixes (race fix)

commit 313340de0cb03b0cef8cd5b91cfb384792d15a3b
Author: Auke Kok <auke-jan h kok intel com>

    e1000: introduce watchdog task
    
commit 38fe2f8adb14761600d571f3ec269c55060e0da3
Author: Auke Kok <auke-jan h kok intel com>

    e1000: skip unneeded PHY reads in watchdog when link is OK

---

 e1000.h         |    4 ++-
 e1000_ethtool.c |    6 ++---
 e1000_main.c    |   67 ++++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 57 insertions(+), 20 deletions(-)

--- linux-2.6.18.x86_64/drivers/net/e1000/e1000.h.gospo	2007-05-07 08:09:29.701964000 -0400
+++ linux-2.6.18.x86_64/drivers/net/e1000/e1000.h	2007-05-07 08:20:31.732878000 -0400
@@ -256,6 +256,7 @@ struct e1000_adapter {
 #endif
 	atomic_t irq_sem;
 	struct work_struct reset_task;
+	struct work_struct watchdog_task;
 	uint8_t fc_autoneg;
 
 	struct timer_list blink_timer;
@@ -346,8 +347,9 @@ struct e1000_adapter {
 };
 
 enum e1000_state_t {
-	__E1000_DRIVER_TESTING,
+	__E1000_TESTING,
 	__E1000_RESETTING,
+	__E1000_DOWN
 };
 
 /*  e1000_main.c  */
--- linux-2.6.18.x86_64/drivers/net/e1000/e1000_ethtool.c.gospo	2007-05-07 08:09:06.318372000 -0400
+++ linux-2.6.18.x86_64/drivers/net/e1000/e1000_ethtool.c	2007-05-07 08:20:49.086500000 -0400
@@ -1606,7 +1606,7 @@ e1000_diag_test(struct net_device *netde
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	boolean_t if_running = netif_running(netdev);
 
-	set_bit(__E1000_DRIVER_TESTING, &adapter->flags);
+	set_bit(__E1000_TESTING, &adapter->flags);
 	if (eth_test->flags == ETH_TEST_FL_OFFLINE) {
 		/* Offline tests */
 
@@ -1651,7 +1651,7 @@ e1000_diag_test(struct net_device *netde
 		adapter->hw.autoneg = autoneg;
 
 		e1000_reset(adapter);
-		clear_bit(__E1000_DRIVER_TESTING, &adapter->flags);
+		clear_bit(__E1000_TESTING, &adapter->flags);
 		if (if_running)
 			dev_open(netdev);
 	} else {
@@ -1666,7 +1666,7 @@ e1000_diag_test(struct net_device *netde
 		data[2] = 0;
 		data[3] = 0;
 
-		clear_bit(__E1000_DRIVER_TESTING, &adapter->flags);
+		clear_bit(__E1000_TESTING, &adapter->flags);
 	}
 	msleep_interruptible(4 * 1000);
 }
--- linux-2.6.18.x86_64/drivers/net/e1000/e1000_main.c.gospo	2007-05-07 08:09:17.271409000 -0400
+++ linux-2.6.18.x86_64/drivers/net/e1000/e1000_main.c	2007-05-07 08:20:31.753861000 -0400
@@ -144,6 +144,7 @@ static void e1000_clean_rx_ring(struct e
 static void e1000_set_multi(struct net_device *netdev);
 static void e1000_update_phy_info(unsigned long data);
 static void e1000_watchdog(unsigned long data);
+static void e1000_watchdog_task(struct net_device *netdev);
 static void e1000_82547_tx_fifo_stall(unsigned long data);
 static int e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
 static struct net_device_stats * e1000_get_stats(struct net_device *netdev);
@@ -469,13 +470,14 @@ e1000_up(struct e1000_adapter *adapter)
 
 	adapter->tx_queue_len = netdev->tx_queue_len;
 
-	mod_timer(&adapter->watchdog_timer, jiffies);
-
 #ifdef CONFIG_E1000_NAPI
 	netif_poll_enable(netdev);
 #endif
 	e1000_irq_enable(adapter);
 
+	clear_bit(__E1000_DOWN, &adapter->flags);
+
+	mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ);
 	return 0;
 }
 
@@ -531,6 +533,10 @@ e1000_down(struct e1000_adapter *adapter
 {
 	struct net_device *netdev = adapter->netdev;
 
+	/* signal that we're down so the interrupt handler does not
+	 * reschedule our watchdog timer */
+	set_bit(__E1000_DOWN, &adapter->flags);
+
 	e1000_irq_disable(adapter);
 
 	del_timer_sync(&adapter->tx_fifo_stall_timer);
@@ -865,11 +871,8 @@ e1000_probe(struct pci_dev *pdev,
 
 	INIT_WORK(&adapter->reset_task,
 		(void (*)(void *))e1000_reset_task, netdev);
-
-	/* we're going to reset, so assume we have no link for now */
-
-	netif_carrier_off(netdev);
-	netif_stop_queue(netdev);
+	INIT_WORK(&adapter->watchdog_task,
+		(void (*)(void *))e1000_watchdog_task, netdev);
 
 	e1000_check_options(adapter);
 
@@ -978,6 +981,10 @@ e1000_probe(struct pci_dev *pdev,
 	if ((err = register_netdev(netdev)))
 		goto err_register;
 
+	/* tell the stack to leave us alone until e1000_open() is called */
+	netif_carrier_off(netdev);
+	netif_stop_queue(netdev);
+
 	DPRINTK(PROBE, INFO, "Intel(R) PRO/1000 Network Connection\n");
 
 	cards_found++;
@@ -1034,6 +1041,13 @@ e1000_remove(struct pci_dev *pdev)
 	int i;
 #endif
 
+	/* flush_scheduled work may reschedule our watchdog task, so
+	 * explicitly disable watchdog tasks from being rescheduled  */
+	set_bit(__E1000_DOWN, &adapter->flags);
+	del_timer_sync(&adapter->tx_fifo_stall_timer);
+	del_timer_sync(&adapter->watchdog_timer);
+	del_timer_sync(&adapter->phy_info_timer);
+
 	flush_scheduled_work();
 
 	if (adapter->hw.mac_type >= e1000_82540 &&
@@ -1165,6 +1179,8 @@ e1000_sw_init(struct e1000_adapter *adap
 	atomic_set(&adapter->irq_sem, 1);
 	spin_lock_init(&adapter->stats_lock);
 
+	set_bit(__E1000_DOWN, &adapter->flags);
+
 	return 0;
 }
 
@@ -1230,7 +1246,7 @@ e1000_open(struct net_device *netdev)
 	int err;
 
 	/* disallow open during test */
-	if (test_bit(__E1000_DRIVER_TESTING, &adapter->flags))
+	if (test_bit(__E1000_TESTING, &adapter->flags))
 		return -EBUSY;
 
 	/* allocate transmit descriptors */
@@ -2353,9 +2369,8 @@ e1000_82547_tx_fifo_stall(unsigned long 
 			adapter->tx_fifo_head = 0;
 			atomic_set(&adapter->tx_fifo_stall, 0);
 			netif_wake_queue(netdev);
-		} else {
+		} else if (!test_bit(__E1000_DOWN, &adapter->flags))
 			mod_timer(&adapter->tx_fifo_stall_timer, jiffies + 1);
-		}
 	}
 }
 
@@ -2367,11 +2382,23 @@ static void
 e1000_watchdog(unsigned long data)
 {
 	struct e1000_adapter *adapter = (struct e1000_adapter *) data;
-	struct net_device *netdev = adapter->netdev;
+
+	/* Do the rest outside of interrupt context */
+	schedule_work(&adapter->watchdog_task);
+}
+
+static void
+e1000_watchdog_task(struct net_device *netdev)
+{
+	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_tx_ring *txdr = adapter->tx_ring;
 	uint32_t link, tctl;
 	int32_t ret_val;
 
+	if ((netif_carrier_ok(netdev)) &&
+	    (E1000_READ_REG(&adapter->hw, STATUS) & E1000_STATUS_LU))
+		goto link_up;
+
 	ret_val = e1000_check_for_link(&adapter->hw);
 	if ((ret_val == E1000_ERR_PHY) &&
 	    (adapter->hw.phy_type == e1000_phy_igp_3) &&
@@ -2461,7 +2488,8 @@ e1000_watchdog(unsigned long data)
 
 			netif_carrier_on(netdev);
 			netif_wake_queue(netdev);
-			mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ);
+			if (!test_bit(__E1000_DOWN, &adapter->flags))
+				mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ);
 			adapter->smartspeed = 0;
 		}
 	} else {
@@ -2471,7 +2499,8 @@ e1000_watchdog(unsigned long data)
 			DPRINTK(LINK, INFO, "NIC Link is Down\n");
 			netif_carrier_off(netdev);
 			netif_stop_queue(netdev);
-			mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ);
+			if (!test_bit(__E1000_DOWN, &adapter->flags))
+				mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ);
 
 			/* 80003ES2LAN workaround--
 			 * For packet buffer work-around on link down event;
@@ -2486,6 +2515,7 @@ e1000_watchdog(unsigned long data)
 		e1000_smartspeed(adapter);
 	}
 
+link_up:
 	e1000_update_stats(adapter);
 
 	adapter->hw.tx_packet_delta = adapter->stats.tpt - adapter->tpt_old;
@@ -2536,7 +2566,8 @@ e1000_watchdog(unsigned long data)
 		e1000_rar_set(&adapter->hw, adapter->hw.mac_addr, 0);
 
 	/* Reset the timer */
-	mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ);
+	if (!test_bit(__E1000_DOWN, &adapter->flags))
+		mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ);
 }
 
 #define E1000_TX_FLAGS_CSUM		0x00000001
@@ -3028,7 +3059,9 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	if (unlikely(adapter->hw.mac_type == e1000_82547)) {
 		if (unlikely(e1000_82547_fifo_workaround(adapter, skb))) {
 			netif_stop_queue(netdev);
-			mod_timer(&adapter->tx_fifo_stall_timer, jiffies);
+			if (!test_bit(__E1000_DOWN, &adapter->flags))
+				mod_timer(&adapter->tx_fifo_stall_timer,
+					jiffies + 1);
 			spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
 			return NETDEV_TX_BUSY;
 		}
@@ -3422,7 +3455,9 @@ e1000_intr(int irq, void *data, struct p
 			rctl = E1000_READ_REG(hw, RCTL);
 			E1000_WRITE_REG(hw, RCTL, rctl & ~E1000_RCTL_EN);
 		}
-		mod_timer(&adapter->watchdog_timer, jiffies);
+		/* guard against interrupt when we're going down */
+		if (!test_bit(__E1000_DOWN, &adapter->flags))
+			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
 #ifdef CONFIG_E1000_NAPI