Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 859

kernel-2.6.18-238.el5.src.rpm

From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 28 Aug 2008 19:07:35 -0300
Subject: [fs] anon_inodes implementation
Message-id: 20080828220735.GA17182@blackpad
O-Subject: [RHEL5.3 PATCH] anon_inodes implementation
Bugzilla: 459835
RH-Acked-by: Chris Wright <chrisw@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Glauber Costa <glommer@redhat.com>

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=459835

anon_inodes is necessary for KVM. KVM includes a compat module for
anon_inodes when compiling as an external module, but by including an
anon_inodes implementation closer to upstream instead of a compat module,
we will be able to remove the compat hacks from the KVM module and use a
native fs/anon_inodes.c implementation.

Using the compat anon_inodes.c as starting point, I did the following
to get an implementation as close as possible to upstream anon_inodes.c.

- Remove differences in the content of comments (mostly typos that are fixed
  upstream)
- Remove the Linux version #ifdefs
- Use a proper fs_initcall() init function instead of a hack to initialize
  it at KVM module load.
- Remove the igrab() call, like the code upstream, as anon_inode_inode
  will always have a non-zero reference count
- Fill the file->f_flags O_NONBLOCK flag according to the 'flags' arg, like upstream

The result is a fs/anon_inodes.c that is as close as possible to upstream,
except for a few differences, due to differences between RHEL5 kernel
and upstream:

- Use get_empty_filp() because alloc_file() isn't available
- Use get_unused_fd() because get_unused_fd_flags() isn't available
- Use current->fs[ug]id because current_fs[ug]id() isn't available

For reference, below is the diff between upstream fs/anon_inodes.c and
the resulting file added by this patch.

> --- /home/ehabkost/code/kernel/linux-2.6/fs/anon_inodes.c	2008-08-19 15:36:40.000000000 -0300
> +++ fs/anon_inodes.c	2008-08-28 18:17:54.000000000 -0300
> @@ -78,10 +78,13 @@
>
>  	if (IS_ERR(anon_inode_inode))
>  		return -ENODEV;
> +	file = get_empty_filp();
> +	if (!file)
> +		return -ENFILE;
>
> -	error = get_unused_fd_flags(flags);
> +	error = get_unused_fd();
>  	if (error < 0)
> -		return error;
> +		goto err_put_filp;
>  	fd = error;
>
>  	/*
> @@ -108,15 +111,14 @@
>  	dentry->d_flags &= ~DCACHE_UNHASHED;
>  	d_instantiate(dentry, anon_inode_inode);
>
> -	error = -ENFILE;
> -	file = alloc_file(anon_inode_mnt, dentry,
> -			  FMODE_READ | FMODE_WRITE, fops);
> -	if (!file)
> -		goto err_dput;
> +	file->f_vfsmnt = mntget(anon_inode_mnt);
> +	file->f_dentry = dentry;
>  	file->f_mapping = anon_inode_inode->i_mapping;
>
>  	file->f_pos = 0;
>  	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
> +	file->f_op = (struct file_operations *)fops;
> +	file->f_mode = FMODE_READ | FMODE_WRITE;
>  	file->f_version = 0;
>  	file->private_data = priv;
>
> @@ -124,10 +126,10 @@
>
>  	return fd;
>
> -err_dput:
> -	dput(dentry);
>  err_put_unused_fd:
>  	put_unused_fd(fd);
> +err_put_filp:
> +	fput(file);
>  	return error;
>  }
>  EXPORT_SYMBOL_GPL(anon_inode_getfd);
> @@ -154,8 +156,8 @@
>  	 */
>  	inode->i_state = I_DIRTY;
>  	inode->i_mode = S_IRUSR | S_IWUSR;
> -	inode->i_uid = current_fsuid();
> -	inode->i_gid = current_fsgid();
> +	inode->i_uid = current->fsuid;
> +	inode->i_gid = current->fsgid;
>  	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
>  	return inode;
>  }

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>

diff --git a/fs/Makefile b/fs/Makefile
index e6b3fc3..6bc3688 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,6 +16,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
new file mode 100644
index 0000000..65ccc4c
--- /dev/null
+++ b/fs/anon_inodes.c
@@ -0,0 +1,194 @@
+/*
+ *  fs/anon_inodes.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *  Thanks to Arnd Bergmann for code review and suggestions.
+ *  More changes for Thomas Gleixner suggestions.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+
+static struct vfsmount *anon_inode_mnt __read_mostly;
+static struct inode *anon_inode_inode;
+static const struct file_operations anon_inode_fops;
+
+/* .get_sb callback of anon_inode_fs_type: delegates to get_sb_pseudo(). */
+static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+			     mnt);
+}
+
+static int anon_inodefs_delete_dentry(struct dentry *dentry)
+{
+	/*
+	 * Undo the "hashed" state faked in anon_inode_getfd() so dput() works.
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 1;
+}
+
+static struct file_system_type anon_inode_fs_type = {
+	.name		= "anon_inodefs",
+	.get_sb		= anon_inodefs_get_sb,	/* wraps get_sb_pseudo() */
+	.kill_sb	= kill_anon_super,
+};
+static struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_delete	= anon_inodefs_delete_dentry, /* sets DCACHE_UNHASHED */
+};
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describes the "class"
+ *                    of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    only O_NONBLOCK is honoured; it is copied into f_flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns the new descriptor, or a negative errno on failure.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	struct qstr this;
+	struct dentry *dentry;
+	struct file *file;
+	int error, fd;
+
+	if (IS_ERR(anon_inode_inode))
+		return -ENODEV;
+	file = get_empty_filp();
+	if (!file)
+		return -ENFILE;
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_put_filp;
+	fd = error;
+
+	/* Name the dentry after the file "class"; names need not be unique. */
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_put_unused_fd;
+
+	/*
+	 * The shared inode's refcount is always greater than zero, so an
+	 * open-coded atomic_inc() is enough; no igrab() is needed.
+	 */
+	atomic_inc(&anon_inode_inode->i_count);
+
+	dentry->d_op = &anon_inodefs_dentry_operations;
+	/* Do not publish this dentry inside the global dentry hash table */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, anon_inode_inode);
+
+	file->f_vfsmnt = mntget(anon_inode_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = anon_inode_inode->i_mapping;
+
+	file->f_pos = 0;
+	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+	file->f_op = (struct file_operations *)fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = priv;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+err_put_filp:
+	/*
+	 * The file was never given a dentry, so release it with put_filp();
+	 * fput() would run the full release path, which expects f_dentry.
+	 */
+	put_filp(file);
+	return error;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfd);
+
+/*
+ * Allocate the one inode shared by all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so a single inode
+ * suffices. Returns the inode, or ERR_PTR(-ENOMEM) if new_inode() fails.
+ */
+static struct inode *anon_inode_mkinode(void)
+{
+	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_fop = &anon_inode_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+/* Boot-time setup of anon_inodefs and its shared inode; panics on failure. */
+static int __init anon_inode_init(void)
+{
+	int error = register_filesystem(&anon_inode_fs_type);
+
+	if (error)
+		goto err_exit;
+	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
+	if (IS_ERR(anon_inode_mnt)) {
+		error = PTR_ERR(anon_inode_mnt);
+		goto err_unregister_filesystem;
+	}
+	anon_inode_inode = anon_inode_mkinode();
+	if (IS_ERR(anon_inode_inode)) {
+		error = PTR_ERR(anon_inode_inode);
+		goto err_mntput;
+	}
+
+	return 0;
+
+err_mntput:
+	mntput(anon_inode_mnt);
+err_unregister_filesystem:
+	unregister_filesystem(&anon_inode_fs_type);
+err_exit:
+	panic("anon_inode_init() failed (%d)\n", error);
+}
+
+fs_initcall(anon_inode_init);
+
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
new file mode 100644
index 0000000..1595b9d
--- /dev/null
+++ b/include/linux/anon_inodes.h
@@ -0,0 +1,16 @@
+/*
+ *  include/linux/anon_inodes.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_ANON_INODES_H
+#define _LINUX_ANON_INODES_H
+
+struct file_operations;
+
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags);
+
+#endif /* _LINUX_ANON_INODES_H */
diff --git a/include/linux/magic.h b/include/linux/magic.h
new file mode 100644
index 0000000..74e68e2
--- /dev/null
+++ b/include/linux/magic.h
@@ -0,0 +1,46 @@
+#ifndef __LINUX_MAGIC_H__
+#define __LINUX_MAGIC_H__
+
+#define ADFS_SUPER_MAGIC	0xadf5
+#define AFFS_SUPER_MAGIC	0xadff
+#define AFS_SUPER_MAGIC                0x5346414F
+#define AUTOFS_SUPER_MAGIC	0x0187
+#define CODA_SUPER_MAGIC	0x73757245
+#define EFS_SUPER_MAGIC		0x414A53
+#define EXT2_SUPER_MAGIC	0xEF53
+#define EXT3_SUPER_MAGIC	0xEF53
+#define EXT4_SUPER_MAGIC	0xEF53
+#define HPFS_SUPER_MAGIC	0xf995e849
+#define ISOFS_SUPER_MAGIC	0x9660
+#define JFFS2_SUPER_MAGIC	0x72b6
+#define ANON_INODE_FS_MAGIC	0x09041934
+
+#define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
+#define MINIX_SUPER_MAGIC2	0x138F		/* minix fs, 30 char names */
+#define MINIX2_SUPER_MAGIC	0x2468		/* minix V2 fs */
+#define MINIX2_SUPER_MAGIC2	0x2478		/* minix V2 fs, 30 char names */
+#define MINIX3_SUPER_MAGIC	0x4d5a		/* minix V3 fs */
+
+#define MSDOS_SUPER_MAGIC	0x4d44		/* MD */
+#define NCP_SUPER_MAGIC		0x564c		/* Guess, what 0x564c is :-) */
+#define NFS_SUPER_MAGIC		0x6969
+#define OPENPROM_SUPER_MAGIC	0x9fa1
+#define PROC_SUPER_MAGIC	0x9fa0
+#define QNX4_SUPER_MAGIC	0x002f		/* qnx4 fs detection */
+
+#define REISERFS_SUPER_MAGIC	0x52654973	/* used by gcc */
+					/* used by file system utilities that
+	                                   look at the superblock, etc.  */
+#define REISERFS_SUPER_MAGIC_STRING	"ReIsErFs"
+#define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
+#define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
+
+#define SMB_SUPER_MAGIC		0x517B
+#define USBDEVICE_SUPER_MAGIC	0x9fa2
+#define CGROUP_SUPER_MAGIC	0x27e0eb
+
+#define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
+#define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
+
+#define STACK_END_MAGIC		0x57AC6E9D
+#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index af5842b..cac65e2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -373,6 +373,10 @@ config FUTEX
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
+config ANON_INODES
+	bool
+	default y
+
 config EPOLL
 	bool "Enable eventpoll support" if EMBEDDED
 	default y