From: Neil Horman <nhorman@redhat.com>
Date: Thu, 4 Dec 2008 10:18:50 -0500
Subject: [net] fix unix sockets kernel panic
Message-id: 20081204151850.GE6725@hmsendeavour.rdu.redhat.com
O-Subject: Re: [RHEL 5.3 PATCH] net: fix CVE-2008-5029 kernel: Unix sockets kernel panic (bz470436)
Bugzilla: 470436
RH-Acked-by: David Miller <davem@redhat.com>
RH-Acked-by: Thomas Graf <tgraf@redhat.com>
CVE: CVE-2008-5029

On Wed, Nov 26, 2008 at 04:01:54PM -0500, Neil Horman wrote:
> On Fri, Nov 14, 2008 at 11:08:25AM -0500, Neil Horman wrote:
> > Hey all-
> > 	Backport of the following upstream commits:
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=f8d570a
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3b53fbf
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=1fd05ba5a2f2aa8e7b9b52ef55df850e2e7d54c9
> >
> > It fixes a condition in which repeatedly passing unix sockets between child and
> > parent processes can trigger a kernel panic.  I've tested it myself with the
> > provided reproducer, and confirmed that prior to the fix we oops the kernel
> > within 10-20 iterations of the test case, and that the kernel runs unaffected
> > for several hours with the patch (a sketch of this pattern follows the quoted
> > patch below).
> >
> > I'm sure someone will comment on this, so I'll throw it out there now:  I'm not
> > a huge fan of the addition of a member to task_struct.  But I've looked and
> > looked, and I can't seem to find any case in which doing so is a problem.  No
> > one needs to allocate task structs outside of the sys_clone path, and certainly
> > no one is statically declaring a task struct anywhere except for INIT_TASK.  No
> > one inlines a task_struct in another structure, so I think we're safe guarding
> > the ABI with __GENKSYMS__ there.  So barring some strange use of task struct
> > that I'm unaware of, it should be ok.
> >
> > Resolves bz 470436.
> >
> > Regards
> > Neil
> >
> >
> > --- linux-2.6.18.noarch/include/linux/sched.h.orig	2008-11-11 16:00:07.000000000 -0500
> > +++ linux-2.6.18.noarch/include/linux/sched.h	2008-11-14 09:46:23.000000000 -0500
> > @@ -1058,6 +1058,9 @@
> >  #ifdef	CONFIG_TASK_DELAY_ACCT
> >  	struct task_delay_info *delays;
> >  #endif
> > +#ifndef __GENKSYMS__
> > +	struct list_head        *scm_work_list;
> > +#endif
> >  };
> >
> I'm sorry, but I have to self-NAK this.  As it turns out, despite my efforts
> we've discovered an ABI break with this.  It appears restricted to ia64 (gosh how
> I love ia64), but I suppose an arch is an arch.  I'll repost as soon as I have a
> version of this patch that moves the scm_work_list out of the task_struct.
> Neil
>
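
For reference, the failure mode exercised above comes down to SCM_RIGHTS
descriptor passing over AF_UNIX sockets, which is what puts sockets "in flight"
and drives unix_gc().  Below is a minimal userspace sketch of that pattern; it
is my own illustration rather than the actual bz470436 test case, and the
send_fd/recv_fd helpers exist only for this example.

/*
 * Sketch: a child hands an AF_UNIX socket back to its parent via
 * SCM_RIGHTS, so the socket spends time "in flight" before all
 * references are dropped and the unix garbage collector runs.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

static void send_fd(int carrier, int fd)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	char byte = 'x';
	struct iovec iov = { &byte, 1 };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	if (sendmsg(carrier, &msg, 0) < 0)
		perror("sendmsg");
}

static int recv_fd(int carrier)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	char byte;
	struct iovec iov = { &byte, 1 };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int fd = -1;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(carrier, &msg, 0) <= 0)
		return -1;
	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg && cmsg->cmsg_type == SCM_RIGHTS)
		memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
	return fd;
}

int main(void)
{
	int i;

	for (i = 0; i < 10000; i++) {
		int sv[2], fd;

		if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
			exit(1);

		if (fork() == 0) {
			/* child: put sv[1] in flight over sv[0], then exit */
			send_fd(sv[0], sv[1]);
			_exit(0);
		}

		/* parent: pull the descriptor back out, then drop every
		 * reference, forcing the unix garbage collector to run */
		fd = recv_fd(sv[1]);
		if (fd >= 0)
			close(fd);
		close(sv[0]);
		close(sv[1]);
		wait(NULL);
	}
	return 0;
}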

OK, here's a new version of the patch.  To avoid ABI breakage, I forward-ported
the RHEL4 infrastructure patch that hijacks the vfork_done member of task_struct
to point to a task_struct_aux structure, which we can extend as needed.  I've
verified that it fixes the unix socket DoS attack, incorporated the fix for
bz 473270 (the associated OOM kill that Eugene mentioned), and had Fujitsu
verify that it is ABI-safe from their point of view.
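
To make the layout argument concrete: the indirection only works because the
hijacked member and its replacement are both single pointers, so nothing after
that slot in task_struct moves.  The standalone toy program below illustrates
this; the task_like_old/task_like_new structures are made up for the sketch
and are not the real task_struct.

/*
 * Illustration only: swapping one pointer member for another pointer
 * member leaves the structure size and every other member's offset
 * untouched, so the remaining KABI risk is out-of-tree code that
 * dereferences the old vfork_done pointer directly.
 */
#include <stdio.h>
#include <stddef.h>

struct completion;			/* opaque here, as in the real header */

struct task_like_old {			/* before: vfork_done lives inline */
	long state;
	struct completion *vfork_done;
	int *set_child_tid;
};

struct aux {				/* side structure the pointer now targets */
	struct completion *vfork_done;
	void *scm_work_list;
};

struct task_like_new {			/* after: same slot, different target type */
	long state;
	struct aux *auxilliary;
	int *set_child_tid;
};

int main(void)
{
	printf("old: size=%zu off(vfork_done)=%zu off(set_child_tid)=%zu\n",
	       sizeof(struct task_like_old),
	       offsetof(struct task_like_old, vfork_done),
	       offsetof(struct task_like_old, set_child_tid));
	printf("new: size=%zu off(auxilliary)=%zu off(set_child_tid)=%zu\n",
	       sizeof(struct task_like_new),
	       offsetof(struct task_like_new, auxilliary),
	       offsetof(struct task_like_new, set_child_tid));
	return 0;
}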

So we're in kind of a bad spot with this one.  It's a security bug, and a
serious one, so it really needs to make 5.3 if at all possible, but it's really
late in the cycle and it affects task_struct.  Again, it's technically ABI-safe,
but we are hijacking the vfork completion pointer to provide that safety.  If
anyone outside of the kernel uses that pointer, we have a problem.  But we did
the same thing post-GA in RHEL4 and it worked out OK.  Nevertheless, I'd
appreciate it if everyone gave this an extra close look.  I'd really like to
avoid screwing this up if we can.  Thanks!

Neil

diff --git a/fs/exec.c b/fs/exec.c
index f4e4906..3b6f6bc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1655,9 +1655,9 @@ static int coredump_wait(int exit_code)
 	 * Make sure nobody is waiting for us to release the VM,
 	 * otherwise we can deadlock when we wait on each other
 	 */
-	vfork_done = tsk->vfork_done;
+	vfork_done = task_aux(tsk)->vfork_done;
 	if (vfork_done) {
-		tsk->vfork_done = NULL;
+		task_aux(tsk)->vfork_done = NULL;
 		complete(vfork_done);
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 411d561..ce42dbb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -823,6 +823,14 @@ enum sleep_type {
 
 struct prio_array;
 
+/* auxilliary task structure to avoid KABI breakage */
+struct task_struct_aux {
+	struct completion *vfork_done;  /* for vfork() [displaced from task_struct] */
+	struct list_head  *scm_work_list; /*displaced from task_struct for abi compat*/
+};
+
+#define task_aux(tsk) ((tsk)->auxilliary)
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -887,8 +895,11 @@ struct task_struct {
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
-
+#ifndef __GENKSYMS__
+	struct task_struct_aux *auxilliary;	/* KABI-resistant auxilliary task data */
+#else
 	struct completion *vfork_done;		/* for vfork() */
+#endif
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index c0398f5..4e462f8 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -9,6 +9,7 @@
 extern void unix_inflight(struct file *fp);
 extern void unix_notinflight(struct file *fp);
 extern void unix_gc(void);
+extern void wait_for_unix_gc(void);
 
 #define UNIX_HASH_SIZE	256
 
@@ -85,6 +86,11 @@ struct unix_sock {
         atomic_t                inflight;
         spinlock_t		lock;
         wait_queue_head_t       peer_wait;
+#ifndef __GENKSYMS__
+	unsigned int		gc_candidate : 1;
+	unsigned int		gc_maybe_cycle : 1;
+	struct list_head	link;
+#endif
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 
diff --git a/include/net/scm.h b/include/net/scm.h
index 5637d5e..a4f8704 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -14,6 +14,9 @@ struct scm_fp_list
 {
 	int		count;
 	struct file	*fp[SCM_MAX_FD];
+#ifndef __GENKSYMS__
+	struct list_head	list;
+#endif
 };
 
 struct scm_cookie
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a33b29..f038aff 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -120,6 +120,7 @@ static kmem_cache_t *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
+	kfree(task_aux(tsk));
 	free_thread_info(tsk->thread_info);
 	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
@@ -142,8 +143,11 @@ void __put_task_struct(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
+static struct task_struct_aux init_task_aux;
+
 void __init fork_init(unsigned long mempages)
 {
+	task_aux(current) = &init_task_aux;
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -175,6 +179,7 @@ void __init fork_init(unsigned long mempages)
 
 static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
+	struct task_struct_aux *aux;
 	struct task_struct *tsk;
 	struct thread_info *ti;
 
@@ -190,9 +195,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		return NULL;
 	}
 
+	aux = kmalloc(sizeof(*aux), GFP_KERNEL);
+	if (!aux) {
+		free_thread_info(ti);
+		free_task_struct(tsk);
+		return NULL;
+	}
+
 	*tsk = *orig;
+	*aux = *task_aux(orig);
 	tsk->thread_info = ti;
 	setup_thread_stack(tsk, orig);
+	task_aux(tsk) = aux;
 
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
@@ -547,14 +561,14 @@ EXPORT_SYMBOL_GPL(get_task_mm);
  */
 void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 {
-	struct completion *vfork_done = tsk->vfork_done;
+	struct completion *vfork_done = task_aux(tsk)->vfork_done;
 
 	/* Get rid of any cached register state */
 	deactivate_mm(tsk, mm);
 
 	/* notify parent sleeping on vfork() */
 	if (vfork_done) {
-		tsk->vfork_done = NULL;
+		task_aux(tsk)->vfork_done = NULL;
 		complete(vfork_done);
 	}
 	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
@@ -1140,7 +1154,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	p->vfork_done = NULL;
+	task_aux(p)->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 	ptrace_init_task(p);
 
@@ -1468,7 +1482,7 @@ long do_fork(unsigned long clone_flags,
 		trace_sched_process_fork(current, p);
 
 		if (clone_flags & CLONE_VFORK) {
-			p->vfork_done = &vfork;
+			task_aux(p)->vfork_done = &vfork;
 			init_completion(&vfork);
 		}
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c53b531..8ac7e80 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -106,7 +106,7 @@ int freeze_processes(void)
 				 * Freeze it unless there's a vfork completion
 				 * pending
 				 */
-				if (!p->vfork_done)
+				if (!task_aux(p)->vfork_done)
 					freeze_process(p);
 				nr_user++;
 			} else {
diff --git a/net/core/scm.c b/net/core/scm.c
index 649d01e..f24b2eb 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -73,6 +73,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
+		INIT_LIST_HEAD(&fpl->list);
 		fpl->count = 0;
 	}
 	fpp = &fpl->fp[fpl->count];
@@ -104,9 +105,25 @@ void __scm_destroy(struct scm_cookie *scm)
 
 	if (fpl) {
 		scm->fp = NULL;
-		for (i=fpl->count-1; i>=0; i--)
-			fput(fpl->fp[i]);
-		kfree(fpl);
+		if (task_aux(current)->scm_work_list) {
+			list_add_tail(&fpl->list, task_aux(current)->scm_work_list);
+		} else {
+			LIST_HEAD(work_list);
+
+			task_aux(current)->scm_work_list = &work_list;
+
+			list_add(&fpl->list, &work_list);
+			while (!list_empty(&work_list)) {
+				fpl = list_first_entry(&work_list, struct scm_fp_list, list);
+
+				list_del(&fpl->list);
+				for (i=fpl->count-1; i>=0; i--)
+					fput(fpl->fp[i]);
+				kfree(fpl);
+			}
+
+			task_aux(current)->scm_work_list = NULL;
+		}
 	}
 }
 
@@ -277,6 +294,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 
 	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
 	if (new_fpl) {
+		INIT_LIST_HEAD(&new_fpl->list);
 		for (i=fpl->count-1; i>=0; i--)
 			get_file(fpl->fp[i]);
 		memcpy(new_fpl, fpl, sizeof(*fpl));
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acde97e..be0a838 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -592,7 +592,8 @@ static struct sock * unix_create1(struct socket *sock)
 	u->dentry = NULL;
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
-	atomic_set(&u->inflight, sock ? 0 : -1);
+	atomic_set(&u->inflight, 0);
+	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
@@ -1101,9 +1102,6 @@ restart:
 	/* take ten and and send info to listening sock */
 	spin_lock(&other->sk_receive_queue.lock);
 	__skb_queue_tail(&other->sk_receive_queue, skb);
-	/* Undo artificially decreased inflight after embrion
-	 * is installed to listening socket. */
-	atomic_inc(&newu->inflight);
 	spin_unlock(&other->sk_receive_queue.lock);
 	unix_state_runlock(other);
 	other->sk_data_ready(other, 0);
@@ -1249,14 +1247,23 @@ static void unix_destruct_fds(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
-static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
+
+	/*
+	 * Need to duplicate file references for the sake of garbage
+	 * collection.  Otherwise a socket in the fps might become a
+	 * candidate for GC while the skb is not yet queued.
+	 */
+	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+	if (!UNIXCB(skb).fp)
+		return -ENOMEM;
+
 	for (i=scm->fp->count-1; i>=0; i--)
 		unix_inflight(scm->fp->fp[i]);
-	UNIXCB(skb).fp = scm->fp;
 	skb->destructor = unix_destruct_fds;
-	scm->fp = NULL;
+	return 0;
 }
 
 /*
@@ -1280,6 +1287,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
+	wait_for_unix_gc();
 	err = scm_send(sock, msg, siocb->scm);
 	if (err < 0)
 		return err;
@@ -1314,8 +1322,11 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 
 	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-	if (siocb->scm->fp)
-		unix_attach_fds(siocb->scm, skb);
+	if (siocb->scm->fp) {
+		err = unix_attach_fds(siocb->scm, skb);
+		if (err)
+			goto out_free;
+	}
 	unix_get_secdata(siocb->scm, skb);
 
 	skb->h.raw = skb->data;
@@ -1429,6 +1440,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
+	wait_for_unix_gc();
 	err = scm_send(sock, msg, siocb->scm);
 	if (err < 0)
 		return err;
@@ -1486,8 +1498,13 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		size = min_t(int, size, skb_tailroom(skb));
 
 		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-		if (siocb->scm->fp)
-			unix_attach_fds(siocb->scm, skb);
+		if (siocb->scm->fp) {
+			err = unix_attach_fds(siocb->scm, skb);
+			if (err) {
+				kfree_skb(skb);
+				goto out_err;
+			}
+		}
 
 		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
 			kfree_skb(skb);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 746c2f4..bf52905 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -62,10 +62,13 @@
  *	AV		1 Mar 1999
  *		Damn. Added missing check for ->dead in listen queues scanning.
  *
+ *	Miklos Szeredi 25 Jun 2007
+ *		Reimplement with a cycle collecting algorithm. This should
+ *		solve several problems with the previous code, like being racy
+ *		wrt receive and holding up unrelated socket operations.
  */
- 
+
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/un.h>
@@ -77,6 +80,7 @@
 #include <linux/file.h>
 #include <linux/proc_fs.h>
 #include <linux/mutex.h>
+#include <linux/wait.h>
 
 #include <net/sock.h>
 #include <net/af_unix.h>
@@ -85,12 +89,12 @@
 
 /* Internal data structures and random procedures: */
 
-#define GC_HEAD		((struct sock *)(-1))
-#define GC_ORPHAN	((struct sock *)(-3))
-
-static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */
+static LIST_HEAD(gc_inflight_list);
+static LIST_HEAD(gc_candidates);
+static DEFINE_SPINLOCK(unix_gc_lock);
+static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 
-atomic_t unix_tot_inflight = ATOMIC_INIT(0);
+atomic_t unix_tot_inflight;
 
 
 static struct sock *unix_get_socket(struct file *filp)
@@ -118,13 +122,21 @@ static struct sock *unix_get_socket(struct file *filp)
  *	Keep the number of times in flight count for the file
  *	descriptor if it is for an AF_UNIX socket.
  */
- 
+
 void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 	if(s) {
-		atomic_inc(&unix_sk(s)->inflight);
+		struct unix_sock *u = unix_sk(s);
+		spin_lock(&unix_gc_lock);
+		if (atomic_inc_return(&u->inflight) == 1) {
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &gc_inflight_list);
+		} else {
+			BUG_ON(list_empty(&u->link));
+		}
 		atomic_inc(&unix_tot_inflight);
+		spin_unlock(&unix_gc_lock);
 	}
 }
 
@@ -132,182 +144,251 @@ void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 	if(s) {
-		atomic_dec(&unix_sk(s)->inflight);
+		struct unix_sock *u = unix_sk(s);
+		spin_lock(&unix_gc_lock);
+		BUG_ON(list_empty(&u->link));
+		if (atomic_dec_and_test(&u->inflight))
+			list_del_init(&u->link);
 		atomic_dec(&unix_tot_inflight);
+		spin_unlock(&unix_gc_lock);
 	}
 }
 
+static inline struct sk_buff *sock_queue_head(struct sock *sk)
+{
+	return (struct sk_buff *) &sk->sk_receive_queue;
+}
 
-/*
- *	Garbage Collector Support Functions
- */
+#define receive_queue_for_each_skb(sk, next, skb) \
+	for (skb = sock_queue_head(sk)->next, next = skb->next; \
+	     skb != sock_queue_head(sk); skb = next, next = skb->next)
 
-static inline struct sock *pop_stack(void)
+static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+			  struct sk_buff_head *hitlist)
 {
-	struct sock *p = gc_current;
-	gc_current = unix_sk(p)->gc_tree;
-	return p;
+	struct sk_buff *skb;
+	struct sk_buff *next;
+
+	spin_lock(&x->sk_receive_queue.lock);
+	receive_queue_for_each_skb(x, next, skb) {
+		/*
+		 *	Do we have file descriptors ?
+		 */
+		if (UNIXCB(skb).fp) {
+			bool hit = false;
+			/*
+			 *	Process the descriptors of this socket
+			 */
+			int nfd = UNIXCB(skb).fp->count;
+			struct file **fp = UNIXCB(skb).fp->fp;
+			while (nfd--) {
+				/*
+				 *	Get the socket the fd matches
+				 *	if it indeed does so
+				 */
+				struct sock *sk = unix_get_socket(*fp++);
+				if (sk) {
+					struct unix_sock *u = unix_sk(sk);
+
+					/*
+					 * Ignore non-candidates, they could
+					 * have been added to the queues after
+					 * starting the garbage collection
+					 */
+					if (u->gc_candidate) {
+						hit = true;
+						func(u);
+					}
+				}
+			}
+			if (hit && hitlist != NULL) {
+				__skb_unlink(skb, &x->sk_receive_queue);
+				__skb_queue_tail(hitlist, skb);
+			}
+		}
+	}
+	spin_unlock(&x->sk_receive_queue.lock);
 }
 
-static inline int empty_stack(void)
+static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
+			  struct sk_buff_head *hitlist)
 {
-	return gc_current == GC_HEAD;
+	if (x->sk_state != TCP_LISTEN)
+		scan_inflight(x, func, hitlist);
+	else {
+		struct sk_buff *skb;
+		struct sk_buff *next;
+		struct unix_sock *u;
+		LIST_HEAD(embryos);
+
+		/*
+		 * For a listening socket collect the queued embryos
+		 * and perform a scan on them as well.
+		 */
+		spin_lock(&x->sk_receive_queue.lock);
+		receive_queue_for_each_skb(x, next, skb) {
+			u = unix_sk(skb->sk);
+
+			/*
+			 * An embryo cannot be in-flight, so it's safe
+			 * to use the list link.
+			 */
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &embryos);
+		}
+		spin_unlock(&x->sk_receive_queue.lock);
+
+		while (!list_empty(&embryos)) {
+			u = list_entry(embryos.next, struct unix_sock, link);
+			scan_inflight(&u->sk, func, hitlist);
+			list_del_init(&u->link);
+		}
+	}
 }
 
-static void maybe_unmark_and_push(struct sock *x)
+static void dec_inflight(struct unix_sock *usk)
 {
-	struct unix_sock *u = unix_sk(x);
+	atomic_dec(&usk->inflight);
+}
 
-	if (u->gc_tree != GC_ORPHAN)
-		return;
-	sock_hold(x);
-	u->gc_tree = gc_current;
-	gc_current = x;
+static void inc_inflight(struct unix_sock *usk)
+{
+	atomic_inc(&usk->inflight);
 }
 
+static void inc_inflight_move_tail(struct unix_sock *u)
+{
+	atomic_inc(&u->inflight);
+	/*
+	 * If this still might be part of a cycle, move it to the end
+	 * of the list, so that it's checked even if it was already
+	 * passed over
+	 */
+	if (u->gc_maybe_cycle)
+		list_move_tail(&u->link, &gc_candidates);
+}
 
-/* The external entry point: unix_gc() */
+static bool gc_in_progress = false;
 
+void wait_for_unix_gc(void)
+{
+	int error;
+
+	do {
+		error = wait_event_interruptible(unix_gc_wait,
+						 gc_in_progress == false);
+	} while(error);
+}
+
+/* The external entry point: unix_gc() */
 void unix_gc(void)
 {
-	static DEFINE_MUTEX(unix_gc_sem);
-	int i;
-	struct sock *s;
+	struct unix_sock *u;
+	struct unix_sock *next;
 	struct sk_buff_head hitlist;
-	struct sk_buff *skb;
+	struct list_head cursor;
+	LIST_HEAD(not_cycle_list);
 
-	/*
-	 *	Avoid a recursive GC.
-	 */
-
-	if (!mutex_trylock(&unix_gc_sem))
-		return;
+	spin_lock(&unix_gc_lock);
 
-	spin_lock(&unix_table_lock);
+	/* Avoid a recursive GC. */
+	if (gc_in_progress)
+		goto out;
 
-	forall_unix_sockets(i, s)
-	{
-		unix_sk(s)->gc_tree = GC_ORPHAN;
-	}
+	gc_in_progress = true;
 	/*
-	 *	Everything is now marked 
+	 * First, select candidates for garbage collection.  Only
+	 * in-flight sockets are considered, and from those only ones
+	 * which don't have any external reference.
+	 *
+	 * Holding unix_gc_lock will protect these candidates from
+	 * being detached, and hence from gaining an external
+	 * reference.  Since there are no possible receivers, all
+	 * buffers currently on the candidates' queues stay there
+	 * during the garbage collection.
+	 *
+	 * We also know that no new candidate can be added onto the
+	 * receive queues.  Other, non candidate sockets _can_ be
+	 * added to queue, so we must make sure only to touch
+	 * candidates.
 	 */
+	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+		long total_refs;
+		long inflight_refs;
+
+		total_refs = file_count(u->sk.sk_socket->file);
+		inflight_refs = atomic_read(&u->inflight);
+
+		BUG_ON(inflight_refs < 1);
+		BUG_ON(total_refs < inflight_refs);
+		if (total_refs == inflight_refs) {
+			list_move_tail(&u->link, &gc_candidates);
+			u->gc_candidate = 1;
+			u->gc_maybe_cycle = 1;
+		}
+	}
 
-	/* Invariant to be maintained:
-		- everything unmarked is either:
-		-- (a) on the stack, or
-		-- (b) has all of its children unmarked
-		- everything on the stack is always unmarked
-		- nothing is ever pushed onto the stack twice, because:
-		-- nothing previously unmarked is ever pushed on the stack
+	/*
+	 * Now remove all internal in-flight reference to children of
+	 * the candidates.
 	 */
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, dec_inflight, NULL);
 
 	/*
-	 *	Push root set
+	 * Restore the references for children of all candidates,
+	 * which have remaining references.  Do this recursively, so
+	 * only those remain, which form cyclic references.
+	 *
+	 * Use a "cursor" link, to make the list traversal safe, even
+	 * though elements might be moved about.
 	 */
+	list_add(&cursor, &gc_candidates);
+	while (cursor.next != &gc_candidates) {
+		u = list_entry(cursor.next, struct unix_sock, link);
 
-	forall_unix_sockets(i, s)
-	{
-		int open_count = 0;
+		/* Move cursor to after the current position. */
+		list_move(&cursor, &u->link);
 
-		/*
-		 *	If all instances of the descriptor are not
-		 *	in flight we are in use.
-		 *
-		 *	Special case: when socket s is embrion, it may be
-		 *	hashed but still not in queue of listening socket.
-		 *	In this case (see unix_create1()) we set artificial
-		 *	negative inflight counter to close race window.
-		 *	It is trick of course and dirty one.
-		 */
-		if (s->sk_socket && s->sk_socket->file)
-			open_count = file_count(s->sk_socket->file);
-		if (open_count > atomic_read(&unix_sk(s)->inflight))
-			maybe_unmark_and_push(s);
+		if (atomic_read(&u->inflight) > 0) {
+			list_move_tail(&u->link, &not_cycle_list);
+			u->gc_maybe_cycle = 0;
+			scan_children(&u->sk, inc_inflight_move_tail, NULL);
+		}
 	}
+	list_del(&cursor);
 
 	/*
-	 *	Mark phase 
+	 * not_cycle_list contains those sockets which do not make up a
+	 * cycle.  Restore these to the inflight list.
 	 */
-
-	while (!empty_stack())
-	{
-		struct sock *x = pop_stack();
-		struct sock *sk;
-
-		spin_lock(&x->sk_receive_queue.lock);
-		skb = skb_peek(&x->sk_receive_queue);
-		
-		/*
-		 *	Loop through all but first born 
-		 */
-		
-		while (skb && skb != (struct sk_buff *)&x->sk_receive_queue) {
-			/*
-			 *	Do we have file descriptors ?
-			 */
-			if(UNIXCB(skb).fp)
-			{
-				/*
-				 *	Process the descriptors of this socket
-				 */
-				int nfd=UNIXCB(skb).fp->count;
-				struct file **fp = UNIXCB(skb).fp->fp;
-				while(nfd--)
-				{
-					/*
-					 *	Get the socket the fd matches if
-					 *	it indeed does so
-					 */
-					if((sk=unix_get_socket(*fp++))!=NULL)
-					{
-						maybe_unmark_and_push(sk);
-					}
-				}
-			}
-			/* We have to scan not-yet-accepted ones too */
-			if (x->sk_state == TCP_LISTEN)
-				maybe_unmark_and_push(skb->sk);
-			skb=skb->next;
-		}
-		spin_unlock(&x->sk_receive_queue.lock);
-		sock_put(x);
+	while (!list_empty(&not_cycle_list)) {
+		u = list_entry(not_cycle_list.next, struct unix_sock, link);
+		u->gc_candidate = 0;
+		list_move_tail(&u->link, &gc_inflight_list);
 	}
 
+	/*
+	 * Now gc_candidates contains only garbage.  Restore original
+	 * inflight counters for these as well, and remove the skbuffs
+	 * which are creating the cycle(s).
+	 */
 	skb_queue_head_init(&hitlist);
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, inc_inflight, &hitlist);
 
-	forall_unix_sockets(i, s)
-	{
-		struct unix_sock *u = unix_sk(s);
+	spin_unlock(&unix_gc_lock);
 
-		if (u->gc_tree == GC_ORPHAN) {
-			struct sk_buff *nextsk;
+	/* Here we are. Hitlist is filled. Die. */
+	__skb_queue_purge(&hitlist);
 
-			spin_lock(&s->sk_receive_queue.lock);
-			skb = skb_peek(&s->sk_receive_queue);
-			while (skb &&
-			       skb != (struct sk_buff *)&s->sk_receive_queue) {
-				nextsk = skb->next;
-				/*
-				 *	Do we have file descriptors ?
-				 */
-				if (UNIXCB(skb).fp) {
-					__skb_unlink(skb,
-						     &s->sk_receive_queue);
-					__skb_queue_tail(&hitlist, skb);
-				}
-				skb = nextsk;
-			}
-			spin_unlock(&s->sk_receive_queue.lock);
-		}
-		u->gc_tree = GC_ORPHAN;
-	}
-	spin_unlock(&unix_table_lock);
+	spin_lock(&unix_gc_lock);
 
-	/*
-	 *	Here we are. Hitlist is filled. Die.
-	 */
+	/* All candidates should have been detached by now. */
+	BUG_ON(!list_empty(&gc_candidates));
+	gc_in_progress = false;
+	wake_up(&unix_gc_wait);
 
-	__skb_queue_purge(&hitlist);
-	mutex_unlock(&unix_gc_sem);
+ out:
+	spin_unlock(&unix_gc_lock);
 }