From: Benjamin Marzinski <bmarzins@redhat.com> Subject: [RHEL 5.1 PATCH] GFS2 - bz #248480: GFS2: distributed mmap test cases deadlock Date: Fri, 24 Aug 2007 13:24:38 -0500 Bugzilla: 248480 Message-Id: <20070824182437.GP24772@ether.msp.redhat.com> Changelog: [GFS2] distributed mmap test cases deadlock BZ#248480 https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=248480 Description: When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in a cluster, it will deadlock. The reason is that do_no_page() will repeatedly call gfs2_sharewrite_nopage(), because each node keeps giving up the glock too early, and is forced to call unmap_mapping_range(). This bumps the mapping->truncate_count sequence count, forcing do_no_page() to retry. This patch institutes a minimum glock hold time of a tenth of a second, by putting the demote on a work queue with the appropriate delay. This ensures that even in heavy contention cases, the node has enough time to get some useful work done before it gives up the glock. A second issue is that when gfs2_glock_dq() is called from within a page fault to demote a lock, and the associated page needs to be written out, it will try to acquire a lock on it, but it has already been locked at a higher level. This patch makes gfs2_glock_dq() use the work queue as well, to avoid this issue. This patch is upstream: http://git.kernel.org/?p=linux/kernel/git/steve/gfs2-2.6-nmw.git;a=commit;h=478781d55a347853c66c4f32df7bd3642280a458 Signed-off-by: Benjamin E. 
Marzinski <bmarzins@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> diff -uprN --exclude-from=linux-2.6.18-43.gfs2abhi.001_clean/Documentation/dontdiff linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/glock.c linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/glock.c --- linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/glock.c 2007-08-23 04:53:34.000000000 -0500 +++ linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/glock.c 2007-08-23 05:01:59.000000000 -0500 @@ -28,6 +28,8 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/kthread.h> +#include <linux/workqueue.h> +#include <linux/jiffies.h> #include "gfs2.h" #include "incore.h" @@ -59,6 +61,7 @@ static int gfs2_dump_lockstate(struct gf static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); static void gfs2_glock_drop_th(struct gfs2_glock *gl); +static void run_queue(struct gfs2_glock *gl); static DECLARE_RWSEM(gfs2_umount_flush_sem); #define GFS2_GL_HASH_SHIFT 15 @@ -69,6 +72,7 @@ static struct gfs2_gl_hash_bucket gl_has static struct dentry *gfs2_root; static struct task_struct *scand_process; static unsigned int scand_secs = 5; +static struct workqueue_struct *glock_workqueue; /* * Despite what you might think, the numbers below are not arbitrary :-) @@ -277,6 +281,18 @@ static struct gfs2_glock *gfs2_glock_fin return gl; } +static void glock_work_func(void *data) +{ + struct gfs2_glock *gl = (struct gfs2_glock *)data; + + spin_lock(&gl->gl_spin); + if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) + set_bit(GLF_DEMOTE, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -316,6 +332,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, gl->gl_name = name; atomic_set(&gl->gl_ref, 1); gl->gl_state = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = 
hash; gl->gl_owner_pid = 0; gl->gl_ip = 0; @@ -324,10 +341,12 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, gl->gl_req_bh = NULL; gl->gl_vn = 0; gl->gl_stamp = jiffies; + gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_sbd = sdp; gl->gl_aspace = NULL; lops_init_le(&gl->gl_le, &gfs2_glock_lops); + INIT_WORK(&gl->gl_work, glock_work_func, (void *)gl); /* If this glock protects actual on-disk data or metadata blocks, create a VFS inode to manage the pages/buffers holding them. */ @@ -441,6 +460,8 @@ static void wait_on_holder(struct gfs2_h static void gfs2_demote_wake(struct gfs2_glock *gl) { + BUG_ON(!spin_is_locked(&gl->gl_spin)); + gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb__after_clear_bit(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); @@ -682,10 +703,14 @@ static void gfs2_glmutex_unlock(struct g * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ -static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote) +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + int remote, unsigned long delay) { + int bit = delay ? 
GLF_PENDING_DEMOTE : GLF_DEMOTE; + spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) { + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && @@ -727,6 +752,7 @@ static void state_change(struct gfs2_glo } gl->gl_state = new_state; + gl->gl_tchange = jiffies; } /** @@ -813,7 +839,6 @@ out: gl->gl_req_gh = NULL; gl->gl_req_bh = NULL; clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); spin_unlock(&gl->gl_spin); } @@ -885,7 +910,6 @@ static void drop_bh(struct gfs2_glock *g gfs2_assert_warn(sdp, !ret); state_change(gl, LM_ST_UNLOCKED); - gfs2_demote_wake(gl); if (glops->go_inval) glops->go_inval(gl, DIO_METADATA); @@ -898,10 +922,10 @@ static void drop_bh(struct gfs2_glock *g } spin_lock(&gl->gl_spin); + gfs2_demote_wake(gl); gl->gl_req_gh = NULL; gl->gl_req_bh = NULL; clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); spin_unlock(&gl->gl_spin); gfs2_glock_put(gl); @@ -1209,9 +1233,10 @@ void gfs2_glock_dq(struct gfs2_holder *g { struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned delay = 0; if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); gfs2_glmutex_lock(gl); @@ -1229,8 +1254,14 @@ void gfs2_glock_dq(struct gfs2_holder *g } clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); spin_unlock(&gl->gl_spin); + + gfs2_glock_hold(gl); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) @@ -1457,18 +1488,21 @@ static void blocking_cb(struct gfs2_sbd unsigned int state) { struct gfs2_glock *gl; + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = 
jiffies; gl = gfs2_glock_find(sdp, name); if (!gl) return; - handle_callback(gl, state, 1); - - spin_lock(&gl->gl_spin); - run_queue(gl); - spin_unlock(&gl->gl_spin); + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; - gfs2_glock_put(gl); + handle_callback(gl, state, 1, delay); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); } /** @@ -1509,7 +1543,8 @@ void gfs2_glock_cb(void *cb_data, unsign return; if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) gl->gl_req_bh(gl, async->lc_ret); - gfs2_glock_put(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); up_read(&gfs2_umount_flush_sem); return; } @@ -1602,7 +1637,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd if (gfs2_glmutex_trylock(gl)) { if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); gfs2_glmutex_unlock(gl); } @@ -1702,7 +1737,7 @@ static void clear_glock(struct gfs2_gloc if (gfs2_glmutex_trylock(gl)) { if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); gfs2_glmutex_unlock(gl); } } @@ -2009,11 +2044,18 @@ int __init gfs2_glock_init(void) if (IS_ERR(scand_process)) return PTR_ERR(scand_process); + glock_workqueue = create_workqueue("glock_workqueue"); + if (IS_ERR(glock_workqueue)) { + kthread_stop(scand_process); + return PTR_ERR(glock_workqueue); + } + return 0; } void gfs2_glock_exit(void) { + destroy_workqueue(glock_workqueue); kthread_stop(scand_process); } diff -uprN --exclude-from=linux-2.6.18-43.gfs2abhi.001_clean/Documentation/dontdiff linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/glops.c linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/glops.c --- linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/glops.c 2007-08-23 04:53:34.000000000 -0500 +++ 
linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/glops.c 2007-08-23 04:58:29.000000000 -0500 @@ -457,6 +457,7 @@ const struct gfs2_glock_operations gfs2_ .go_lock = inode_go_lock, .go_unlock = inode_go_unlock, .go_type = LM_TYPE_INODE, + .go_min_hold_time = HZ / 10, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -467,6 +468,7 @@ const struct gfs2_glock_operations gfs2_ .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, .go_type = LM_TYPE_RGRP, + .go_min_hold_time = HZ / 10, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff -uprN --exclude-from=linux-2.6.18-43.gfs2abhi.001_clean/Documentation/dontdiff linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/incore.h linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/incore.h --- linux-2.6.18-43.gfs2abhi.001_clean/fs/gfs2/incore.h 2007-08-23 04:53:34.000000000 -0500 +++ linux-2.6.18-43.gfs2abhi.001_test/fs/gfs2/incore.h 2007-08-23 04:58:29.000000000 -0500 @@ -11,6 +11,7 @@ #define __INCORE_DOT_H__ #include <linux/fs.h> +#include <linux/workqueue.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -130,6 +131,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); const int go_type; + const unsigned long go_min_hold_time; }; enum { @@ -161,6 +163,7 @@ enum { GLF_LOCK = 1, GLF_STICKY = 2, GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, GLF_DIRTY = 5, }; @@ -193,6 +196,7 @@ struct gfs2_glock { u64 gl_vn; unsigned long gl_stamp; + unsigned long gl_tchange; void *gl_object; struct list_head gl_reclaim; @@ -203,6 +207,7 @@ struct gfs2_glock { struct gfs2_log_element gl_le; struct list_head gl_ail_list; atomic_t gl_ail_count; + struct work_struct gl_work; }; struct gfs2_alloc {