From: Jeff Layton <jlayton@redhat.com> Date: Thu, 8 Jul 2010 14:30:13 -0400 Subject: [fs] nfs: fix resolution in nfs_inode_attrs_need_update Message-id: <1278599414-23803-1-git-send-email-jlayton@redhat.com> Patchwork-id: 26766 O-Subject: [RHEL5.6 PATCH 7/4] BZ#601800: nfs: fix resolution problem with nfs_inode_attrs_need_update Bugzilla: 601800 RH-Acked-by: Steve Dickson <SteveD@redhat.com> From: Trond Myklebust <Trond.Myklebust@netapp.com> Jarod Wilson and Jeff Burke noticed that we were still getting intermittent connectathon test failures even after the last patches for this bug were added on. The test failures were always in the same spot and it suggested that the client was occasionally ignoring metadata updates that it shouldn't have. This patch is a backported fusion of the following two upstream patches: 4704f0e274829e3af00737d2d9adace2d71a9605 ae05f269400533cbb32bfba131ab528d78dffd16 The main idea is to turn the last_updated timestamp in the nfs inode to a counter. The upstream code used atomic_long_t for the counter, but RHEL5 doesn't have that so I used atomic_t instead. The code is guarded against overflow of the counter and atomic_long_t == atomic_t on 32 bit arches anyway. Based on that I believe this should be sufficient. This was tested by Jarod and Jeff with RHTS and it seems to have fixed all of the connectathon test failures that they were seeing. I also ran several thousand Connectathon test passes on this kernel. ----------------------[snip]------------------- NFS: Fix the resolution problem with nfs_inode_attrs_need_update() It appears that 'jiffies' timestamps do not have high enough resolution for nfs_inode_attrs_need_update(). One problem is that a GETATTR can be launched within < 1 jiffy of the last operation that updated the attribute. Another problem is that RPC calls can take < 1 jiffy to execute. We can fix this by switching the variables to use a simple global counter that gets incremented every time we start another GETATTR call. ----------------------[snip]------------------- NFS: Convert nfs_attr_generation_counter into an atomic_long The most important property we need from nfs_attr_generation_counter is monotonicity, which is not guaranteed by the current system of smp memory barriers. We should convert it to an atomic_long_t, and drop the memory barriers. diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e07dd1c..3d90422 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -152,8 +152,9 @@ typedef struct { struct nfs_entry *entry; decode_dirent_t decode; int plus; - unsigned long timestamp; - int timestamp_valid; + unsigned long timestamp; + int gencount; + int timestamp_valid; } nfs_readdir_descriptor_t; /* Now we cache directories properly, by stuffing the dirent @@ -175,6 +176,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct inode *inode = file->f_dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); unsigned long timestamp; + int gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -183,6 +185,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -196,6 +199,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. @@ -221,9 +225,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -474,6 +479,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct page *page = NULL; int status; unsigned long timestamp; + int gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -484,6 +490,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -492,6 +499,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e8ef5cf..68d89c6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -303,7 +303,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = jiffies; + nfsi->attr_gencount = fattr->gencount; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -862,6 +862,25 @@ static int nfs_size_need_update(struct inode *inode, const struct nfs_fattr *fat return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } +static atomic_t nfs_attr_generation_counter; + +static int nfs_read_attr_generation_counter(void) +{ + return atomic_read(&nfs_attr_generation_counter); +} + +int nfs_inc_attr_generation_counter(void) +{ + return atomic_inc_return(&nfs_attr_generation_counter); +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + /** * nfs_inode_attrs_need_update - check if the inode attributes need updating * @inode - pointer to inode @@ -875,8 +894,7 @@ static int nfs_size_need_update(struct inode *inode, const struct nfs_fattr *fat * catch the case where ctime either didn't change, or went backwards * (if someone reset the clock on the server) by looking at whether * or not this RPC call was started after the inode was last updated. - * Note also the check for jiffy wraparound if the last_updated timestamp - * is later than 'jiffies'. + * Note also the check for wraparound of 'attr_gencount' * * The function returns 'true' if it thinks the attributes in 'fattr' are * more recent than the ones cached in the inode. @@ -886,10 +904,10 @@ static int nfs_inode_attrs_need_update(struct inode *inode, struct nfs_fattr *fa { const struct nfs_inode *nfsi = NFS_I(inode); - return time_after(fattr->time_start, nfsi->last_updated) || + return (fattr->gencount - nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || nfs_size_need_update(inode, fattr) || - time_after(nfsi->last_updated, jiffies); + (nfsi->attr_gencount - nfs_read_attr_generation_counter() > 0); } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) @@ -1033,7 +1051,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1085,7 +1103,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - nfsi->last_updated = jiffies; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else if (!time_in_range_open(jiffies, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4fc3a1d..60da650 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -139,7 +139,7 @@ struct nfs_inode { unsigned long attrtimeo_timestamp; __u64 change_attr; /* v4 only */ - unsigned long last_updated; + int attr_gencount; /* "Generation counter" for the attribute cache. This is * bumped whenever we update the metadata on the * server. @@ -332,15 +332,11 @@ extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, struct nfs_fh *fh, struct nfs_fattr *fattr); extern u64 nfs_compat_user_ino64(u64 fileid); +extern void nfs_fattr_init(struct nfs_fattr *fattr); /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern u32 root_nfs_parse_addr(char *name); /*__init*/ - -static inline void nfs_fattr_init(struct nfs_fattr *fattr) -{ - fattr->valid = 0; - fattr->time_start = jiffies; -} +extern int nfs_inc_attr_generation_counter(void); /* * linux/fs/nfs/file.c diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 317713a..f340b94 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -57,6 +57,7 @@ struct nfs_fattr { __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ unsigned long time_start; + int gencount; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */