From f8edbc1befc39d56d5f78257b3911a2c9303f633 Mon Sep 17 00:00:00 2001
From: Anton Arapov <aarapov@redhat.com>
Date: Thu, 3 Jul 2008 16:45:07 -0400
Subject: [PATCH] [misc] kernel crashes on futex

Message-id: <20080317111427.GA16762@bandura.englab.brq.redhat.com>
O-Subject: [RHEL5.3 PATCH] BZ#435178: Kernel crash on futex
Bugzilla: 435178

BZ#435178:
  https://bugzilla.redhat.com/show_bug.cgi?id=435178

Details:
  One of our customers is hitting kernel crashes and soft lockups while
using futexes.

  The problem is a race: while one thread is fetching the pointer to an
entry from the list, another thread removes that entry, so the first
thread ends up with a stale pointer (sketched below). In the case
described in the bz, it gets a pointer to the head of the list.
  In fact, several different races are possible, and I hit them while
porting the fixes from upstream.
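
  The sketch (illustrative only; hypothetical helper names, not kernel
code): handling an entry can let user space unlink or reuse it, so the
successor must be fetched before the entry is handled, which is what
the patched exit_robust_list() and compat_exit_robust_list() do.

struct node { struct node *next; };

/* Stand-in for handle_futex_death(): handling an entry may wake a
 * waiter that immediately unlinks or reuses that entry. */
static void handle_entry(struct node *n) { (void)n; }

/* Racy pattern (pre-patch): by the time entry->next is read, the entry
 * may have been rewritten, e.g. redirecting us back to the list head. */
static void walk_racy(struct node *head)
{
	struct node *entry = head->next;

	while (entry != head) {
		handle_entry(entry);
		entry = entry->next;	/* may follow a stale pointer */
	}
}

/* Fixed pattern (this patch): snapshot the successor first. */
static void walk_fixed(struct node *head)
{
	struct node *entry = head->next, *next;

	while (entry != head) {
		next = entry->next;	/* fetch before handling */
		handle_entry(entry);
		entry = next;
	}
}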

  The proposed fix is based on the following backports:
    - futex_compat: fix list traversal bugs:
        179c85ea53bef807621f335767e41e23f86f01df
    - fix address computation in compat code:
        3c5fd9c77d609b51c0bab682c9d40cbb496ec6f1
    - futex: PI state locking fix:
        21778867b1c8e0feb567addb6dc0a7e2ca6ecdec
    - pi-futexes: fix exit races and locking problems:
        778e9a9c3e7193ea9f434f382947155ffb59c755
    - robust futex thread exit race:
        9f96cb1e8bca179a92afa40dfc3c49990f1cfc71
    The other commit IDs listed below are fixes to the commits above.

  I did not manage to reproduce the original issue. Only the combination
of all these patches fixed the kernel panic and all of the soft lockups
the customer experienced.
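
  For reference, a minimal exerciser of the robust PI owner-death path
these patches touch (a hypothetical sketch, not the customer's
workload): a process-shared robust PI mutex whose owner dies while
holding it, after which the survivor locks it and gets EOWNERDEAD.

#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t *m;

	/* Put the mutex in shared memory so both processes use it. */
	m = mmap(NULL, sizeof(*m), PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (m == MAP_FAILED)
		return 1;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
	/* older glibc spells these pthread_mutexattr_setrobust_np()
	 * and pthread_mutex_consistent_np() */
	pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(m, &attr);

	if (fork() == 0) {
		/* Child: take the robust PI lock and die holding it. */
		pthread_mutex_lock(m);
		_exit(0);
	}
	wait(NULL);

	/* Parent: the kernel marked the futex OWNER_DIED on child exit. */
	if (pthread_mutex_lock(m) == EOWNERDEAD) {
		printf("owner died, recovering the lock\n");
		pthread_mutex_consistent(m);
	}
	pthread_mutex_unlock(m);
	return 0;
}

  (Build with "gcc -pthread"; running many such processes in parallel
stresses the exit/lock paths changed here.)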

Upstream status:
  commits:
    3c5fd9c77d609b51c0bab682c9d40cbb496ec6f1
    179c85ea53bef807621f335767e41e23f86f01df
    cdf71a10c7b6432d9b48e292cca2c62a0b9fa6cf
    cde898fa80a45bb23eab2a060fc79d0913081409
    9f96cb1e8bca179a92afa40dfc3c49990f1cfc71
    187226f57f1381cfc63216979b4375f30e593795
    a06381fec77bf88ec6c5eb6324457cb04e9ffd69
    778e9a9c3e7193ea9f434f382947155ffb59c755
    21778867b1c8e0feb567addb6dc0a7e2ca6ecdec

Test status:
  The patch has been tested for compilation and boot.
  The KernelTier1 test shows no regressions.
  http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1213849

Notice:
  The patch looks big, but its real impact is not: all the changes are
within two functions, and one new helper function is introduced.
> > Accelerated Fix/HOTFIX Requested http://seg.rdu.redhat.com/scripts/hotfix/edit.pl?id=2646

Acked-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Peter Zijlstra <pzijlstr@redhat.com>

==
---
 include/linux/sched.h |    1 +
 kernel/exit.c         |   24 +++-
 kernel/futex.c        |  361 +++++++++++++++++++++++++++++++++----------------
 kernel/futex_compat.c |   50 +++++--
 4 files changed, 300 insertions(+), 136 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4bf2f47..c7502a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1077,6 +1077,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_DEAD		0x00000008	/* Dead */
+#define PF_EXITPIDONE	0x00000010	/* pi exit done on shut down */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff --git a/kernel/exit.c b/kernel/exit.c
index 45a1562..24d2b68 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,13 +827,29 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		printk(KERN_ALERT
 			"Fixing recursive fault but reboot is needed!\n");
+		/*
+		 * We can do this unlocked here. The futex code uses
+		 * this flag just to verify whether the pi state
+		 * cleanup has been done or not. In the worst case it
+		 * loops once more. We pretend that the cleanup was
+		 * done as there is no way to return. Either the
+		 * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait for ever nirwana as well.
+		 */
+		tsk->flags |= PF_EXITPIDONE;
 		if (tsk->io_context)
 			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
+	/*
+	 * tsk->flags are checked in the futex code to protect against
+	 * an exiting task cleaning up the robust pi futexes.
+	 */
+	spin_lock_irq(&tsk->pi_lock);
 	tsk->flags |= PF_EXITING;
+	spin_unlock_irq(&tsk->pi_lock);
 
 	ptrace_exit(tsk);
 
@@ -851,7 +867,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
- 		hrtimer_cancel(&tsk->signal->real_timer);
+		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 	}
 
@@ -913,6 +929,12 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held(tsk);
+	/*
+	 * We can do this unlocked here. The futex code uses this flag
+	 * just to verify whether the pi state cleanup has been done
+	 * or not. In the worst case it loops once more.
+	 */
+	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e8..b7b1197 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -391,18 +391,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
 	read_lock(&tasklist_lock);
 	p = find_task_by_pid(pid);
-	if (!p)
-		goto out_unlock;
-	if ((current->euid != p->euid) && (current->euid != p->uid)) {
-		p = NULL;
-		goto out_unlock;
-	}
-	if (p->exit_state != 0) {
-		p = NULL;
-		goto out_unlock;
-	}
-	get_task_struct(p);
-out_unlock:
+
+	if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+		p = ERR_PTR(-ESRCH);
+	else
+		get_task_struct(p);
+
 	read_unlock(&tasklist_lock);
 
 	return p;
@@ -468,7 +462,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	struct futex_q *this, *next;
 	struct list_head *head;
 	struct task_struct *p;
-	pid_t pid;
+	pid_t pid = uval & FUTEX_TID_MASK;
 
 	head = &hb->chain;
 
@@ -486,6 +480,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 				return -EINVAL;
 
 			WARN_ON(!atomic_read(&pi_state->refcount));
+			WARN_ON(pid && pi_state->owner &&
+				pi_state->owner->pid != pid);
 
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
@@ -496,15 +492,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when the owner died bit is set
-	 * and TID = 0:
+	 * the new pi_state to it, but bail out when TID = 0
 	 */
-	pid = uval & FUTEX_TID_MASK;
-	if (!pid && (uval & FUTEX_OWNER_DIED))
+	if (!pid)
 		return -ESRCH;
 	p = futex_find_get_task(pid);
-	if (!p)
-		return -ESRCH;
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/*
+	 * We need to look at the task state flags to figure out,
+	 * whether the task is exiting. To protect against the do_exit
+	 * change of the task flags, we do this protected by
+	 * p->pi_lock:
+	 */
+	spin_lock_irq(&p->pi_lock);
+	if (unlikely(p->flags & PF_EXITING)) {
+		/*
+		 * The task is on the way out. When PF_EXITPIDONE is
+		 * set, we know that the task has finished the
+		 * cleanup:
+		 */
+		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+		spin_unlock_irq(&p->pi_lock);
+		put_task_struct(p);
+		return ret;
+	}
 
 	pi_state = alloc_pi_state();
 
@@ -517,7 +531,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	/* Store the key for possible exit cleanups: */
 	pi_state->key = me->key;
 
-	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
@@ -566,6 +579,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!pi_state)
 		return -EINVAL;
 
+	spin_lock(&pi_state->pi_mutex.wait_lock);
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
 	/*
@@ -583,15 +597,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * preserve the owner died bit.)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
+		int ret = 0;
 		newval = FUTEX_WAITERS | new_owner->pid;
 
 		inc_preempt_count();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 		dec_preempt_count();
 		if (curval == -EFAULT)
-			return -EFAULT;
-		if (curval != uval)
-			return -EINVAL;
+			ret = -EFAULT;
+		else if (curval != uval)
+			ret = -EINVAL;
+		if (ret) {
+			spin_unlock(&pi_state->pi_mutex.wait_lock);
+			return ret;
+		}
 	}
 
 	spin_lock_irq(&pi_state->owner->pi_lock);
@@ -605,6 +624,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	pi_state->owner = new_owner;
 	spin_unlock_irq(&new_owner->pi_lock);
 
+	spin_unlock(&pi_state->pi_mutex.wait_lock);
 	rt_mutex_unlock(&pi_state->pi_mutex);
 
 	return 0;
@@ -1001,6 +1021,60 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 	drop_key_refs(&q->key);
 }
 
+/*
+ * Fixup the pi_state owner with the new owner.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 *uaddr, struct futex_q *q,
+				struct task_struct *newowner)
+{
+	u32 newtid = newowner->pid | FUTEX_WAITERS;
+	struct futex_pi_state *pi_state = q->pi_state;
+	u32 uval, curval, newval;
+	int ret;
+
+	/* Owner died? */
+	if (pi_state->owner != NULL) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+	} else
+		newtid |= FUTEX_OWNER_DIED;
+
+	pi_state->owner = newowner;
+
+	spin_lock_irq(&newowner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &newowner->pi_state_list);
+	spin_unlock_irq(&newowner->pi_lock);
+
+	/*
+	 * We own it, so we have to replace the pending owner
+	 * TID. This must be atomic as we have preserve the
+	 * owner died bit here.
+	 */
+	ret = get_futex_value_locked(&uval, uaddr);
+
+	while (!ret) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+		inc_preempt_count();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							uval, newval);
+		dec_preempt_count();
+
+		if (curval == -EFAULT)
+			ret = -EFAULT;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+	return ret;
+}
+
 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
 	struct task_struct *curr = current;
@@ -1128,7 +1202,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, attempt = 0;
+	int ret, lock_taken, ownerdied = 0, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1148,9 +1222,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
+ retry_unlocked:
 	hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
+	ret = lock_taken = 0;
+
 	/*
 	 * To avoid races, we attempt to take the lock here again
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1165,24 +1242,44 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
 
-	/* We own the lock already */
+	/*
+	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
+	 * situation and we return success to user space.
+	 */
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
-		if (!detect && 0)
-			force_sig(SIGKILL, current);
 		ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
 	/*
-	 * Surprise - we got the lock. Just return
-	 * to userspace:
+	 * Surprise - we got the lock. Just return to userspace:
 	 */
 	if (unlikely(!curval))
 		goto out_unlock_release_sem;
 
 	uval = curval;
+
+	/*
+	 * Set the WAITERS flag, so the owner will know it has someone
+	 * to wake at next unlock
+	 */
 	newval = uval | FUTEX_WAITERS;
 
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED or REQUEUE. We take over the
+	 * futex in this case. We also do an unconditional take over,
+	 * when the owner of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED bit */
+		newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+		ownerdied = 0;
+		lock_taken = 1;
+	}
+
 	inc_preempt_count();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 	dec_preempt_count();
@@ -1193,40 +1290,51 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 		goto retry_locked;
 
 	/*
+	 * We took the lock due to requeue or owner died take over.
+	 */
+	if (unlikely(lock_taken))
+		goto out_unlock_release_sem;
+
+	/*
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
 	ret = lookup_pi_state(uval, hb, &q);
 
 	if (unlikely(ret)) {
-		/*
-		 * There were no waiters and the owner task lookup
-		 * failed. When the OWNER_DIED bit is set, then we
-		 * know that this is a robust futex and we actually
-		 * take the lock. This is safe as we are protected by
-		 * the hash bucket lock. We also set the waiters bit
-		 * unconditionally here, to simplify glibc handling of
-		 * multiple tasks racing to acquire the lock and
-		 * cleanup the problems which were left by the dead
-		 * owner.
-		 */
-		if (curval & FUTEX_OWNER_DIED) {
-			uval = newval;
-			newval = current->pid |
-				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+		switch (ret) {
 
-			inc_preempt_count();
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			dec_preempt_count();
+		case -EAGAIN:
+			/*
+			 * Task is exiting and we just wait for the
+			 * exit to complete.
+			 */
+			queue_unlock(&q, hb);
+			up_read(&curr->mm->mmap_sem);
+			cond_resched();
+			goto retry;
 
-			if (unlikely(curval == -EFAULT))
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
 				goto uaddr_faulted;
-			if (unlikely(curval != uval))
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
 				goto retry_locked;
-			ret = 0;
+			}
+		default:
+			goto out_unlock_release_sem;
 		}
-		goto out_unlock_release_sem;
 	}
 
 	/*
@@ -1255,65 +1363,63 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	down_read(&curr->mm->mmap_sem);
 	spin_lock(q.lock_ptr);
 
-	/*
-	 * Got the lock. We might not be the anticipated owner if we
-	 * did a lock-steal - fix up the PI-state in that case.
-	 */
-	if (!ret && q.pi_state->owner != curr) {
-		u32 newtid = current->pid | FUTEX_WAITERS;
-
-		/* Owner died? */
-		if (q.pi_state->owner != NULL) {
-			spin_lock_irq(&q.pi_state->owner->pi_lock);
-			WARN_ON(list_empty(&q.pi_state->list));
-			list_del_init(&q.pi_state->list);
-			spin_unlock_irq(&q.pi_state->owner->pi_lock);
-		} else
-			newtid |= FUTEX_OWNER_DIED;
-
-		q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
+	if (!ret) {
 		/*
-		 * We own it, so we have to replace the pending owner
-		 * TID. This must be atomic as we have preserve the
-		 * owner died bit here.
+		 * Got the lock. We might not be the anticipated owner 
+		 * if we did a lock-steal - fix up the PI-state in
+		 * that case:
 		 */
-		ret = get_user(uval, uaddr);
-		while (!ret) {
-			newval = (uval & FUTEX_OWNER_DIED) | newtid;
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			if (curval == -EFAULT)
-				ret = -EFAULT;
-			if (curval == uval)
-				break;
-			uval = curval;
-		}
+		if (q.pi_state->owner != curr)
+			ret = fixup_pi_state_owner(uaddr, &q, curr);
 	} else {
 		/*
 		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked
-		 * the hash bucket.
+		 * when we were on the way back before we locked the
+		 * hash bucket.
 		 */
-		if (ret && q.pi_state->owner == curr) {
+		if (q.pi_state->owner == curr) {
 			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
 				ret = 0;
+			else {
+				/*
+				 * pi_state is incorrect, some other
+				 * task did a lock steal and we
+				 * returned due to timeout or signal
+				 * without taking the rt_mutex. Too
+				 * late. We can access the
+				 * rt_mutex_owner without locking, as
+				 * the other task is now blocked on
+				 * the hash bucket lock. Fix the state
+				 * up.
+				 */
+				struct task_struct *owner;
+				int res;
+
+				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+				res = fixup_pi_state_owner(uaddr, &q, owner);
+
+				/* propagate -EFAULT, if the fixup failed */
+				if (res)
+					ret = res;
+			}
+		} else {
+			/*
+			 * Paranoia check. If we did not take the lock
+			 * in the trylock above, then we should not be
+			 * the owner of the rtmutex, neither the real
+			 * nor the pending one:
+			 */
+			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+				printk(KERN_ERR "futex_lock_pi: ret = %d "
+				       "pi-mutex: %p pi-state %p\n", ret,
+				       q.pi_state->pi_mutex.owner,
+				       q.pi_state->owner);
 		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
 	}
-
-	if (!detect && ret == -EDEADLK && 0)
-		force_sig(SIGKILL, current);
+	
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q, hb);
+	up_read(&curr->mm->mmap_sem);
 
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
@@ -1330,16 +1436,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 * non-atomically.  Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock.  :-)  --ANK
 	 */
+	queue_unlock(&q, hb);
+
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock_release_sem;
-		}
-		goto retry_locked;
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
+		if (ret)
+			goto out_release_sem;
+		goto retry_unlocked;
 	}
 
-	queue_unlock(&q, hb);
 	up_read(&curr->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
@@ -1381,9 +1489,9 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
+retry_unlocked:
 	spin_lock(&hb->lock);
 
-retry_locked:
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
 	 * again. If it succeeds then we can return without waking
@@ -1445,16 +1553,19 @@ pi_faulted:
 	 * non-atomically.  Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. --ANK
 	 */
+	spin_unlock(&hb->lock);
+
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock;
-		}
-		goto retry_locked;
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
+		if (ret)
+			goto out;
+		uval = 0;
+		goto retry_unlocked;
 	}
 
-	spin_unlock(&hb->lock);
 	up_read(&current->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
@@ -1716,9 +1827,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
 	unsigned long futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -1741,8 +1853,14 @@ void exit_robust_list(struct task_struct *curr)
 	if (pending)
 		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
+	next_entry = NULL;      /* avoid warning with gcc */
 	while (entry != &head->list) {
 		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
@@ -1750,11 +1868,10 @@ void exit_robust_list(struct task_struct *curr)
 			if (handle_futex_death((void *)entry + futex_offset,
 						curr, pi))
 				return;
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&entry, &entry->next, &pi))
+		if (rc)
 			return;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -1763,6 +1880,10 @@ void exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c5cca3f..a31f13d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -29,6 +29,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 	return 0;
 }
 
+static void __user *futex_uaddr(struct robust_list *entry,
+				compat_long_t futex_offset)
+{
+	compat_uptr_t base = ptr_to_compat(entry);
+	void __user *uaddr = compat_ptr(base + futex_offset);
+
+	return uaddr;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -38,10 +47,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	compat_uptr_t uentry, upending;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+	compat_uptr_t uentry, next_uentry, upending;
 	compat_long_t futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -61,25 +71,30 @@ void compat_exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
-	if (upending)
-		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
-	while (compat_ptr(uentry) != &head->list) {
+	next_entry = NULL;      /* avoid warning with gcc */
+	while (entry != (struct robust_list __user *) &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_uentry, &next_entry,
+			(compat_uptr_t __user *)&entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
 		 */
-		if (entry != pending)
-			if (handle_futex_death((void *)entry + futex_offset,
-						curr, pi))
-				return;
+		if (entry != pending) {
+			void *uaddr = futex_uaddr(entry, futex_offset);
 
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&uentry, &entry,
-				       (compat_uptr_t *)&entry->next, &pi))
+			if (handle_futex_death(uaddr, curr, pi))
+				return;
+		}
+		if (rc)
 			return;
+		uentry = next_uentry;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -88,6 +103,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+	if (pending) {
+		void *uaddr = futex_uaddr(pending, futex_offset);
+
+		handle_futex_death(uaddr, curr, pip);
+	}
 }
 
 asmlinkage long
-- 
1.5.5.1