Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 1176

kernel-2.6.18-238.el5.src.rpm

From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 13 Aug 2008 16:48:30 -0400
Subject: [fs] relayfs: support larger on-memory buffer
Message-id: 48A3489E.2070406@redhat.com
O-Subject: [RHEL5.3 PATCH] BZ439269: relayfs support larger on-memory buffer
Bugzilla: 439269
RH-Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
RH-Acked-by: Anton Arapov <aarapov@redhat.com>

relayfs: support larger relay buffer

Bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=439269

Here is a backported kernel patch for 2.6.18-101.el5.

Description:
Use vmalloc() and memset() instead of kcalloc() to allocate a page* array when
the array size is bigger than one page.  This enables relayfs to support
bigger relay buffers than 64MB on 4k-page system, 512MB on 16k-page system.

This patch also includes is_vmalloc_addr() macro patches and resolving dependency
of pgtable.h on highmem.h patch, because new relayfs code uses is_vmalloc_addr()
and VMALLOC_END macro(which is used in is_vmalloc_addr) depends on PKMAP_BASE.

I also ported pgtable.h and highmem.h dependency solution patch to
redhat xen kernel, because it had same dependency problem.

Upstream status:
9e2779fa281cfda13ac060753d674bbcaa23367e //is_vmalloc_addr macro patch
8ca3ed87db062201e1fa15b64a9214e193fc3a8a //is_vmalloc_addr fix1
0738c4bb8f2a8bf15178f852494643b0981f578b //is_vmalloc_addr fix2
0b7a96114bd5991d355a1f1c1d3d9c0c9d9c1cfc //resolve dependency of pgtable.h on highmem.h
68ab3d883a2df13f4b93a923bae3a287cbee29d3 //relayfs larger buffer support

Brew build:
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1420045

Testing:
Tested with the latest systemtap.
I ran stap with 1024MB relay buffer on x86-64 and checked VmallocUsed
in /proc/meminfo.

 Masami

diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c
index a1f6584..fcc53b6 100644
--- a/drivers/net/cxgb3/cxgb3_offload.c
+++ b/drivers/net/cxgb3/cxgb3_offload.c
@@ -1067,9 +1067,7 @@ void *cxgb_alloc_mem(unsigned long size)
  */
 void cxgb_free_mem(void *addr)
 {
-	unsigned long p = (unsigned long)addr;
-
-	if (p >= VMALLOC_START && p < VMALLOC_END)
+	if (is_vmalloc_addr(addr))
 		vfree(addr);
 	else
 		kfree(addr);
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index e38e402..cd0be3f 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,8 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-	if (likely(((unsigned long)addr < VMALLOC_START) ||
-			((unsigned long)addr >= VMALLOC_END ))) {
+	if (!is_vmalloc_addr(addr)) {
 		kfree(addr);
 		/* free_page((unsigned long)addr); */
 		return;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0ca9a2f..0331300 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -328,7 +328,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		if (m == NULL) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
-		} else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) {
+		} else if (is_vmalloc_addr((void *)start)) {
 			char * elf_buf;
 			struct vm_struct *m;
 			unsigned long curstart = start;
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index aba7fcf..6ba9684 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -63,8 +63,7 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
 void
 kmem_free(void *ptr, size_t size)
 {
-	if (((unsigned long)ptr < VMALLOC_START) ||
-	    ((unsigned long)ptr >= VMALLOC_END)) {
+	if (!is_vmalloc_addr(ptr)) {
 		kfree(ptr);
 	} else {
 		vfree(ptr);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2af528d..38d0dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -693,8 +693,7 @@ static inline struct page *
 mem_to_page(
 	void			*addr)
 {
-	if (((unsigned long)addr < VMALLOC_START) ||
-	    ((unsigned long)addr >= VMALLOC_END)) {
+	if ((!is_vmalloc_addr(addr))) {
 		return virt_to_page(addr);
 	} else {
 		return vmalloc_to_page(addr);
diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h
index e9a34eb..d027b71 100644
--- a/include/asm-i386/highmem.h
+++ b/include/asm-i386/highmem.h
@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
  * easily, subsequent pte tables have to be allocated in one physical
  * chunk of RAM.
  */
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
 /*
  * Ordering is:
  *
@@ -57,7 +52,6 @@ extern pte_t *pkmap_page_table;
  * VMALLOC_START
  * high_memory
  */
-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
diff --git a/include/asm-i386/mach-xen/asm/highmem.h b/include/asm-i386/mach-xen/asm/highmem.h
index d379186..00ea2a4 100644
--- a/include/asm-i386/mach-xen/asm/highmem.h
+++ b/include/asm-i386/mach-xen/asm/highmem.h
@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
  * easily, subsequent pte tables have to be allocated in one physical
  * chunk of RAM.
  */
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
 /*
  * Ordering is:
  *
@@ -57,7 +52,6 @@ extern pte_t *pkmap_page_table;
  * VMALLOC_START
  * high_memory
  */
-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
diff --git a/include/asm-i386/mach-xen/asm/pgtable.h b/include/asm-i386/mach-xen/asm/pgtable.h
index 05b6e71..bb44c27 100644
--- a/include/asm-i386/mach-xen/asm/pgtable.h
+++ b/include/asm-i386/mach-xen/asm/pgtable.h
@@ -82,6 +82,14 @@ void paging_init(void);
 #define VMALLOC_OFFSET	(8*1024*1024)
 #define VMALLOC_START	(((unsigned long) high_memory + vmalloc_earlyreserve + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
 #else
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 09697fe..b3704b4 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -81,6 +81,14 @@ void paging_init(void);
 #define VMALLOC_OFFSET	(8*1024*1024)
 #define VMALLOC_START	(((unsigned long) high_memory + vmalloc_earlyreserve + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8fe1de1..0afb317 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -322,6 +322,23 @@ static inline int get_page_unless_zero(struct page *page)
 
 extern void FASTCALL(__page_cache_release(struct page *));
 
+/*
+ * Determine if an address is within the vmalloc range
+ *
+ * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
+ * is no special casing required.
+ */
+static inline int is_vmalloc_addr(const void *x)
+{
+#ifdef CONFIG_MMU
+	unsigned long addr = (unsigned long)x;
+
+	return addr >= VMALLOC_START && addr < VMALLOC_END;
+#else
+	return 0;
+#endif
+}
+
 static inline struct page *compound_head(struct page *page)
 {
 	if (unlikely(PageTail(page)))
diff --git a/kernel/relay.c b/kernel/relay.c
index d82d6da..c780a8d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,6 +70,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
 	.close = relay_file_mmap_close,
 };
 
+/*
+ * allocate an array of pointers of struct page
+ */
+static struct page **relay_alloc_page_array(unsigned int n_pages)
+{
+	struct page **array;
+	size_t pa_size = n_pages * sizeof(struct page *);
+
+	if (pa_size > PAGE_SIZE) {
+		array = vmalloc(pa_size);
+		if (array)
+			memset(array, 0, pa_size);
+	} else {
+		array = kzalloc(pa_size, GFP_KERNEL);
+	}
+	return array;
+}
+
+/*
+ * free an array of pointers of struct page
+ */
+static void relay_free_page_array(struct page **array)
+{
+	if (is_vmalloc_addr(array))
+		vfree(array);
+	else
+		kfree(array);
+}
+
 /**
  *	relay_mmap_buf: - mmap channel buffer to process address space
  *	@buf: relay channel buffer
@@ -114,7 +143,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 	*size = PAGE_ALIGN(*size);
 	n_pages = *size >> PAGE_SHIFT;
 
-	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	buf->page_array = relay_alloc_page_array(n_pages);
 	if (!buf->page_array)
 		return NULL;
 
@@ -134,7 +163,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 depopulate:
 	for (j = 0; j < i; j++)
 		__free_page(buf->page_array[j]);
-	kfree(buf->page_array);
+	relay_free_page_array(buf->page_array);
 	return NULL;
 }
 
@@ -193,7 +222,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
 		vunmap(buf->start);
 		for (i = 0; i < buf->page_count; i++)
 			__free_page(buf->page_array[i]);
-		kfree(buf->page_array);
+		relay_free_page_array(buf->page_array);
 	}
 	chan->buf[buf->cpu] = NULL;
 	kfree(buf->padding);
diff --git a/mm/sparse.c b/mm/sparse.c
index a80415c..9c4bb5f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -239,17 +239,9 @@ got_map_ptr:
 	return ret;
 }
 
-static int vaddr_in_vmalloc_area(void *addr)
-{
-	if (addr >= (void *)VMALLOC_START &&
-	    addr < (void *)VMALLOC_END)
-		return 1;
-	return 0;
-}
-
 static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
-	if (vaddr_in_vmalloc_area(memmap))
+	if (is_vmalloc_addr(memmap))
 		vfree(memmap);
 	else
 		free_pages((unsigned long)memmap,