From 4df1d488d823dd1231459faa9aebfa7764706e9e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli <aarcange@redhat.com> Date: Tue, 20 Jan 2009 23:17:30 -0200 Subject: [PATCH 44/54] [PATCH 40/54] ksm KSM tree implementation with kprobes wp_notifier. Signed-off-by: Izik Eidus <ieidus@redhat.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> RH-Upstream-status: pending --- kernel/ksm/Kbuild | 3 + kernel/ksm/external-module-compat.h | 377 ++++++++++ kernel/ksm/ksm.c | 1367 +++++++++++++++++++++++++++++++++++ kernel/ksm/ksm.h | 84 +++ kernel/ksm/wp_notifier.c | 60 ++ kernel/ksm/wp_notifier.h | 12 + 6 files changed, 1903 insertions(+), 0 deletions(-) create mode 100644 kernel/ksm/Kbuild create mode 100644 kernel/ksm/Makefile create mode 100644 kernel/ksm/external-module-compat.h create mode 100644 kernel/ksm/ksm.c create mode 100644 kernel/ksm/ksm.h create mode 100644 kernel/ksm/wp_notifier.c create mode 100644 kernel/ksm/wp_notifier.h diff --git a/kernel/ksm/Kbuild b/kernel/ksm/Kbuild new file mode 100644 index 0000000..06b3a63 --- /dev/null +++ b/kernel/ksm/Kbuild @@ -0,0 +1,3 @@ +obj-m := ksm-mem.o +ksm-mem-objs := ksm.o +ksm-mem-objs := ksm.o wp_notifier.o diff --git a/kernel/ksm/Makefile b/kernel/ksm/Makefile new file mode 100644 index 0000000..e69de29 diff --git a/kernel/ksm/external-module-compat.h b/kernel/ksm/external-module-compat.h new file mode 100644 index 0000000..032dc8e --- /dev/null +++ b/kernel/ksm/external-module-compat.h @@ -0,0 +1,377 @@ + +/* + * Compatibility header for building as an external module. + */ + +/* + * Avoid picking up the kernel's kvm.h in case we have a newer one. 
+ */ + +#include <linux/compiler.h> +#include <linux/version.h> +#include <linux/string.h> +#include <linux/cpu.h> +#include <linux/list.h> +#include <asm/processor.h> +#include <linux/hrtimer.h> +#include <asm/bitops.h> +#include <linux/mm.h> +#include <linux/rmap.h> +#include <asm/tlbflush.h> +#include <linux/module.h> +#include <asm/cacheflush.h> +#include <asm-generic/pgtable.h> + +/* + * 2.6.16 does not have GFP_NOWAIT + */ + +#include <linux/gfp.h> + +void kvm_ksm_set_pte(struct mm_struct *mm, unsigned long address, pte_t pte); +int kvm_ksm_spte_count(struct mm_struct *mm, + unsigned long address); + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) page->mapping; + if (!(anon_mapping & PAGE_MAPPING_ANON)) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + return anon_vma; +out: + rcu_read_unlock(); + return NULL; +} + +static void page_unlock_anon_vma(struct anon_vma *anon_vma) +{ + spin_unlock(&anon_vma->lock); + rcu_read_unlock(); +} + +/* + * At what user virtual address is page expected in @vma? + * Returns virtual address or -EFAULT if page's index/offset is not + * within the range mapped the @vma. + */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index; + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + /* page should be within @vma mapping range */ + return -EFAULT; + } + return address; +} + +/* + * At what user virtual address is page expected in vma? 
checking that the + * page matches the vma: currently only used on anon pages, by unuse_vma; + */ +unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +{ + if (PageAnon(page)) { + if ((void *)vma->anon_vma != + (void *)page->mapping - PAGE_MAPPING_ANON) + return -EFAULT; + } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { + if (!vma->vm_file || + vma->vm_file->f_mapping != page->mapping) + return -EFAULT; + } else + return -EFAULT; + return vma_address(page, vma); +} + +/* + * Check that @page is mapped at @address into @mm. + * + * On success returns with pte mapped and locked. + */ +pte_t *page_check_address(struct page *page, struct mm_struct *mm, + unsigned long address, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + + pte = pte_offset_map(pmd, address); + /* Make a quick check before getting the lock */ + if (!pte_present(*pte)) { + pte_unmap(pte); + return NULL; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { + *ptlp = ptl; + return pte; + } + pte_unmap_unlock(pte, ptl); + return NULL; +} + +void page_remove_rmap_old(struct page *page, struct vm_area_struct *vma) +{ + if (atomic_add_negative(-1, &page->_mapcount)) { + if (unlikely(page_mapcount(page) < 0)) { + printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); + printk (KERN_EMERG " page->flags = %lx\n", page->flags); + printk (KERN_EMERG " page->count = %x\n", page_count(page)); + printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + BUG(); + } + + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ + __dec_zone_page_state(page, + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + } +} + +void page_add_file_rmap_old(struct page *page) +{ + if (atomic_inc_and_test(&page->_mapcount)) + __inc_zone_page_state(page, NR_FILE_MAPPED); +} + +static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma, + int *odirect_sync, int count_offset) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + int ret = 0; + + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + goto out; + + if (pte_write(*pte)) { + pte_t entry; + + /* + * this is needed here to balance the mapcount of the page + */ + count_offset += kvm_ksm_spte_count(mm, address); + + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if ((page_mapcount(page) + count_offset) != page_count(page)) { + *odirect_sync = 0; + goto out_unlock; + } + + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + set_pte_at(mm, address, pte, entry); + BUG_ON(pte_write(entry)); + kvm_ksm_set_pte(mm, address, entry); + } + ret = 1; + +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return ret; +} + +static 
int page_wrprotect_anon(struct page *page, int *odirect_sync, + int count_offset) +{ + struct vm_area_struct *vma; + struct anon_vma *anon_vma; + int ret = 0; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return ret; + + /* + * If the page is inside the swap cache, its _count number was + * increased by one, therefore we have to increase count_offset by one. + */ + if (PageSwapCache(page)) + count_offset++; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + ret += page_wrprotect_one(page, vma, odirect_sync, + count_offset); + + page_unlock_anon_vma(anon_vma); + + return ret; +} + +/** + * page_wrprotect - set all ptes pointing to a page as readonly + * @page: the page to set as readonly + * @odirect_sync: boolean value that is set to 0 when some of the ptes were not + * marked as readonly beacuse page_wrprotect_one() was not able + * to mark this ptes as readonly without opening window to a race + * with odirect + * @count_offset: number of times page_wrprotect() caller had called get_page() + * on the page + * + * returns the number of ptes which were marked as readonly. + * (ptes that were readonly before this function was called are counted as well) + */ +int page_wrprotect(struct page *page, int *odirect_sync, int count_offset) +{ + int ret = 0; + + /* + * Page lock is needed for anon pages for the PageSwapCache check, + * and for page_mapping for filebacked pages + */ + BUG_ON(!PageLocked(page)); + + *odirect_sync = 1; + if (PageAnon(page)) + ret = page_wrprotect_anon(page, odirect_sync, count_offset); + + return ret; +} + +/** + * replace_page - replace page in vma with new page + * @vma: vma that hold the pte oldpage is pointed by. + * @oldpage: the page we are replacing with newpage + * @newpage: the page we replace oldpage with + * @orig_pte: the original value of the pte + * @prot: page protection bits + * + * Returns 0 on success, -EFAULT on failure. 
+ * + * Note: @newpage must not be an anonymous page because replace_page() does + * not change the mapping of @newpage to have the same values as @oldpage. + * @newpage can be mapped in several vmas at different offsets (page->index). + */ +int replace_page(struct vm_area_struct *vma, struct page *oldpage, + struct page *newpage, pte_t orig_pte, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + pte_t new_pte; + spinlock_t *ptl; + unsigned long addr; + int ret; + + BUG_ON(PageAnon(newpage)); + + ret = -EFAULT; + addr = page_address_in_vma(oldpage, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!ptep) + goto out; + + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + ret = 0; + get_page(newpage); + page_add_file_rmap_old(newpage); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + new_pte = mk_pte(newpage, prot); + set_pte_at(mm, addr, ptep, new_pte); + update_mmu_cache(vma, addr, new_pte); + BUG_ON(pte_write(new_pte)); + kvm_ksm_set_pte(mm, addr, new_pte); + + page_remove_rmap_old(oldpage, vma); + if (PageAnon(oldpage)) { + dec_mm_counter(mm, anon_rss); + inc_mm_counter(mm, file_rss); + } + put_page(oldpage); + + pte_unmap_unlock(ptep, ptl); +out: + return ret; +} + + +#include <linux/smp.h> + +/* HRTIMER_MODE_ABS started life with a different name */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) +#define HRTIMER_MODE_ABS HRTIMER_ABS +#endif + +/* __mmdrop() is not exported before 2.6.25 */ +#include <linux/sched.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + +#define mmdrop(x) do { (void)(x); } while (0) + +#endif diff --git a/kernel/ksm/ksm.c 
b/kernel/ksm/ksm.c new file mode 100644 index 0000000..8d1fb20 --- /dev/null +++ b/kernel/ksm/ksm.c @@ -0,0 +1,1367 @@ +/* + * Memory merging driver for Linux + * + * This module enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/vmalloc.h> +#include <linux/file.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/scatterlist.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/rbtree.h> +#include <linux/anon_inodes.h> + +#include <asm/tlbflush.h> + +#include "ksm.h" +#include "wp_notifier.h" +#include "external-module-compat.h" + +#define KSM_MINOR 234 + +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +static int rmap_hash_size; +module_param(rmap_hash_size, int, 0); +MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping"); + +/* + * ksm_mem_slot - hold information for an userspace scanning range + * (the scanning for this region will be from addr untill addr + + * npages * PAGE_SIZE inside mm) + */ +struct ksm_mem_slot { + struct list_head link; + struct list_head sma_link; + struct mm_struct *mm; + unsigned long addr; /* the begining of the virtual address */ + int npages; /* number of pages to share */ +}; + +/* + * ksm_sma - shared memory area, each process have its own sma that contain the + * information about the slots that it own + */ 
+struct ksm_sma { + struct list_head sma_slots; +}; + +/** + * struct ksm_scan - cursor for scanning + * @slot_index: the current slot we are scanning + * @page_index: the page inside the sma that is currently being scanned + * + * ksm uses it to know what are the next pages it need to scan + */ +struct ksm_scan { + struct ksm_mem_slot *slot_index; + unsigned long page_index; +}; + +/* + * Few notes about ksm scanning progress (make it easier to understand the + * structures below): + * + * In order to reduce excessive scanning, pages are sorted into the hash + * table, page_hash. After a page is inserted into the hash table, its + * contents may have changed. In this case, ksm must remove the page from + * the hash table and potentially rehash it. Ksm uses a reverse mapping, + * rmap_hash, to efficiently manage this. + */ + +struct rmap_item; + +/* + * tree_item - object of the write protected pages tree + */ +struct tree_item { + struct rb_node node; + struct rmap_item *rmap_item; +}; + +/* + * rmap_item - object of the rmap_hash hash table + * (it is holding the previous hash value (oldindex), + * pointer into the page_hash_item, and pointer into the tree_item) + */ +struct rmap_item { + struct hlist_node link; + struct mm_struct *mm; + unsigned long address; + unsigned int oldchecksum; /* old checksum value */ + unsigned char stable_tree; // 1 stable_tree 0 unstable tree + struct tree_item *tree_item; + struct rmap_item *next; + struct rmap_item *prev; +}; + +/* + * slots is linked list that hold all the memory regions that were registred + * to be scanned. 
+ */ +static LIST_HEAD(slots); +static DECLARE_RWSEM(slots_lock); + +struct rb_root root_stable_tree = RB_ROOT; +struct rb_root root_unstable_tree = RB_ROOT; + +static int nrmaps_hash; +/* rmap_hash hash table */ +static struct hlist_head *rmap_hash; + +static struct kmem_cache *tree_item_cache; +static struct kmem_cache *rmap_item_cache; + +static int kthread_sleep; /* sleep time of the kernel thread */ +static int kthread_pages_to_scan; /* npages to scan for the kernel thread */ +static struct ksm_scan kthread_ksm_scan; +static int ksmd_flags; +static struct task_struct *kthread; +static DECLARE_WAIT_QUEUE_HEAD(kthread_wait); +static DECLARE_RWSEM(kthread_lock); + +static int ksm_slab_init(void) +{ + int ret = -ENOMEM; + + tree_item_cache = KMEM_CACHE(tree_item, 0); + if (!tree_item_cache) + goto out; + + rmap_item_cache = KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out_free; + + return 0; + +out_free: + kmem_cache_destroy(tree_item_cache); +out: + return ret; +} + +static void ksm_slab_free(void) +{ + kmem_cache_destroy(rmap_item_cache); + kmem_cache_destroy(tree_item_cache); +} + +static inline struct tree_item *alloc_tree_item(void) +{ + return kmem_cache_zalloc(tree_item_cache, GFP_KERNEL); +} + +static void free_tree_item(struct tree_item *tree_item) +{ + kmem_cache_free(tree_item_cache, tree_item); +} + +static inline struct rmap_item *alloc_rmap_item(void) +{ + return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +/* + * PageKsm - this type of pages are the write protected pages that ksm map + * into multiple vmas (this is the "shared page") + * this page was allocated using alloc_page(), every pte that pointing to it + * is always write protected (therefore its data content cant ever be changed) + * and this page cant be swapped. 
+ */ +static inline int PageKsm(struct page *page) +{ + return !PageAnon(page); +} + +static int rmap_hash_init(void) +{ + if (!rmap_hash_size) { + struct sysinfo sinfo; + + si_meminfo(&sinfo); + rmap_hash_size = sinfo.totalram / 10; + } + nrmaps_hash = rmap_hash_size; + rmap_hash = vmalloc(nrmaps_hash * sizeof(struct hlist_head)); + if (!rmap_hash) + return -ENOMEM; + memset(rmap_hash, 0, nrmaps_hash * sizeof(struct hlist_head)); + return 0; +} + +static void rmap_hash_free(void) +{ + int i; + struct hlist_head *bucket; + struct hlist_node *node, *n; + struct rmap_item *rmap_item; + + for (i = 0; i < nrmaps_hash; ++i) { + bucket = &rmap_hash[i]; + hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) { + hlist_del(&rmap_item->link); + free_rmap_item(rmap_item); + } + } + vfree(rmap_hash); +} + +static inline u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page, KM_USER0); + checksum = jhash(addr, PAGE_SIZE, 17); + kunmap_atomic(addr, KM_USER0); + return checksum; +} + +static struct rmap_item *get_rmap_item(struct mm_struct *mm, unsigned long addr) +{ + struct rmap_item *rmap_item; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &rmap_hash[addr % nrmaps_hash]; + hlist_for_each_entry(rmap_item, node, bucket, link) { + if (mm == rmap_item->mm && rmap_item->address == addr) { + return rmap_item; + } + } + return NULL; +} + +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + struct tree_item *tree_item; + + tree_item = rmap_item->tree_item; + rmap_item->tree_item = NULL; + + if (rmap_item->stable_tree) { + if (rmap_item->prev) { + BUG_ON(rmap_item->prev->next != rmap_item); + rmap_item->prev->next = rmap_item->next; + } + if (rmap_item->next) { + BUG_ON(rmap_item->next->prev != rmap_item); + rmap_item->next->prev = rmap_item->prev; + } + } + + if (tree_item) { + if (rmap_item->stable_tree) { + if (!rmap_item->next && !rmap_item->prev) { + rb_erase(&tree_item->node, &root_stable_tree); 
+ free_tree_item(tree_item); + } else if (!rmap_item->prev) + tree_item->rmap_item = rmap_item->next; + else + tree_item->rmap_item = rmap_item->prev; + } else if (!rmap_item->stable_tree) + free_tree_item(tree_item); + } + + hlist_del(&rmap_item->link); + free_rmap_item(rmap_item); +} + +static void remove_page_from_tree(struct mm_struct *mm, + unsigned long addr) +{ + struct rmap_item *rmap_item; + + rmap_item = get_rmap_item(mm, addr); + if (!rmap_item) + return; + remove_rmap_item_from_tree(rmap_item); + return; +} + +static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma, + struct ksm_memory_region *mem) +{ + struct ksm_mem_slot *slot; + int ret = -EPERM; + + slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL); + if (!slot) { + ret = -ENOMEM; + goto out; + } + + slot->mm = get_task_mm(current); + if (!slot->mm) + goto out_free; + slot->addr = mem->addr; + slot->npages = mem->npages; + + down_write(&slots_lock); + + list_add_tail(&slot->link, &slots); + list_add_tail(&slot->sma_link, &ksm_sma->sma_slots); + + up_write(&slots_lock); + return 0; + +out_free: + kfree(slot); +out: + return ret; +} + +static void remove_mm_from_hash_and_tree(struct mm_struct *mm) +{ + struct ksm_mem_slot *slot; + int pages_count; + + list_for_each_entry(slot, &slots, link) + if (slot->mm == mm) + break; + BUG_ON(!slot); + + root_unstable_tree = RB_ROOT; + for (pages_count = 0; pages_count < slot->npages; ++pages_count) + remove_page_from_tree(mm, slot->addr + + pages_count * PAGE_SIZE); + list_del(&slot->link); +} + +static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma) +{ + struct ksm_mem_slot *slot, *node; + + down_write(&slots_lock); + list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) { + remove_mm_from_hash_and_tree(slot->mm); + mmput(slot->mm); + list_del(&slot->sma_link); + kfree(slot); + } + up_write(&slots_lock); + return 0; +} + +static int ksm_sma_release(struct inode *inode, struct file *filp) +{ + struct ksm_sma 
*ksm_sma = filp->private_data;
+	int r;
+
+	r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
+	kfree(ksm_sma);
+	return r;
+}
+
+/*
+ * ksm_sma_ioctl - ioctl entry point of a shared memory area file
+ * @filp:  file whose private_data is the struct ksm_sma
+ * @ioctl: command number
+ * @arg:   userspace pointer to a struct ksm_memory_region (register only)
+ *
+ * Returns the result of the register/remove helper, -EFAULT if the argument
+ * cannot be copied from userspace, or -EINVAL for an unknown command.
+ */
+static long ksm_sma_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	struct ksm_sma *sma = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	/* errno values are returned negated; a bare EINVAL would be +22 */
+	int r = -EINVAL;
+
+	switch (ioctl) {
+	case KSM_REGISTER_MEMORY_REGION: {
+		struct ksm_memory_region ksm_memory_region;
+
+		r = -EFAULT;
+		if (copy_from_user(&ksm_memory_region, argp,
+				   sizeof(ksm_memory_region)))
+			goto out;
+		r = ksm_sma_ioctl_register_memory_region(sma,
+							 &ksm_memory_region);
+		break;
+	}
+	case KSM_REMOVE_MEMORY_REGION:
+		r = ksm_sma_ioctl_remove_memory_region(sma);
+		break;
+	default:
+		/* unknown command: r is still -EINVAL */
+		break;
+	}
+
+out:
+	return r;
+}
+
+/*
+ * addr_in_vma - user virtual address at which @page is expected in @vma,
+ * computed from page->index and vma->vm_pgoff.
+ * Returns -EFAULT (as unsigned long) if the result is outside
+ * [vm_start, vm_end).
+ */
+static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long addr;
+
+	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
+		return -EFAULT;
+	return addr;
+}
+
+/*
+ * get_pte - walk pgd/pud/pmd of @mm for @addr.
+ * Returns the pte mapped with pte_offset_map() (the caller must pte_unmap()
+ * it), or NULL if any upper level is not present. No page table lock is
+ * taken here.
+ */
+static pte_t *get_pte(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map(pmd, addr);
+out:
+	return ptep;
+}
+
+/*
+ * is_present_pte - non-zero if @addr in @mm has a pte and it is present,
+ * 0 otherwise.
+ */
+static int is_present_pte(struct mm_struct *mm, unsigned long addr)
+{
+	pte_t *ptep;
+	int r;
+
+	ptep = get_pte(mm, addr);
+	if (!ptep)
+		return 0;
+
+	r = pte_present(*ptep);
+	pte_unmap(ptep);
+
+	return r;
+}
+
+/*
+ * memcmp_pages - memcmp() the PAGE_SIZE bytes of two pages through
+ * temporary atomic kernel mappings; returns the memcmp() result.
+ */
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+	char *addr1, *addr2;
+	int r;
+
+	addr1 = kmap_atomic(page1, KM_USER0);
+	addr2 = kmap_atomic(page2, KM_USER1);
+	r = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr1, KM_USER0);
+	kunmap_atomic(addr2, KM_USER1);
+	return r;
+} + +/* pages_identical + * return 1 if identical, 0 otherwise. + */ +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * note: + * oldpage should be anon page while newpage should be file mapped page + * + * this function return 0 if the pages were merged, 1 otherwise. + */ +static int try_to_merge_one_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct page *oldpage, + struct page *newpage, + pgprot_t newprot) +{ + int ret = 1; + int odirect_sync; + unsigned long page_addr_in_vma; + pte_t orig_pte, *orig_ptep; + + get_page(newpage); + get_page(oldpage); + + down_read(&mm->mmap_sem); + + page_addr_in_vma = addr_in_vma(vma, oldpage); + if (page_addr_in_vma == -EFAULT) + goto out_unlock; + + orig_ptep = get_pte(mm, page_addr_in_vma); + if (!orig_ptep) + goto out_unlock; + orig_pte = *orig_ptep; + pte_unmap(orig_ptep); + if (!pte_present(orig_pte)) + goto out_unlock; + if (page_to_pfn(oldpage) != pte_pfn(orig_pte)) + goto out_unlock; + /* + * we need the page lock to read a stable PageSwapCache in + * page_wrprotect() + */ + if (TestSetPageLocked(oldpage)) + goto out_unlock; + /* + * page_wrprotect check if the page is swapped or in swap cache, + * in the future we might want to run here if_present_pte and then + * swap_free + */ + if (!page_wrprotect(oldpage, &odirect_sync, 2)) { + unlock_page(oldpage); + goto out_unlock; + } + unlock_page(oldpage); + if (!odirect_sync) + goto out_unlock; + + orig_pte = pte_wrprotect(orig_pte); + + if (pages_identical(oldpage, newpage)) + ret = replace_page(vma, oldpage, newpage, orig_pte, newprot); + +out_unlock: + up_read(&mm->mmap_sem); + put_page(oldpage); + put_page(newpage); + return ret; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them to be + * merged into one page. 
+ * + * this function return 0 if we successfully mapped two identical pages into one + * page, 1 otherwise. + * (note in case we created KsmPage and mapped one page into it but the second + * page was not mapped we consider it as a failure and return 1) + */ +static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1, + struct mm_struct *mm2, struct page *page2, + unsigned long addr1, unsigned long addr2) +{ + struct vm_area_struct *vma; + pgprot_t prot; + int ret = 1; + + /* + * If page2 isn't shared (it isn't PageKsm) we have to allocate a new + * file mapped page and make the two ptes of mm1(page1) and mm2(page2) + * point to it. If page2 is shared, we can just make the pte of + * mm1(page1) point to page2 + */ + if (PageKsm(page2)) { + down_read(&mm1->mmap_sem); + vma = find_vma(mm1, addr1); + up_read(&mm1->mmap_sem); + if (!vma) + return ret; + prot = vma->vm_page_prot; + pgprot_val(prot) &= ~_PAGE_RW; + ret = try_to_merge_one_page(mm1, vma, page1, page2, prot); + } else { + struct page *kpage; + + kpage = alloc_page(GFP_HIGHUSER); + if (!kpage) + return ret; + down_read(&mm1->mmap_sem); + vma = find_vma(mm1, addr1); + up_read(&mm1->mmap_sem); + if (!vma) { + put_page(kpage); + return ret; + } + prot = vma->vm_page_prot; + pgprot_val(prot) &= ~_PAGE_RW; + + copy_user_highpage(kpage, page1, addr1); + ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot); + + if (!ret) { + down_read(&mm2->mmap_sem); + vma = find_vma(mm2, addr2); + up_read(&mm2->mmap_sem); + if (!vma) { + put_page(kpage); + ret = 1; + return ret; + } + + prot = vma->vm_page_prot; + pgprot_val(prot) &= ~_PAGE_RW; + + ret = try_to_merge_one_page(mm2, vma, page2, kpage, + prot); + /* + * If the secoend try_to_merge_one_page call was failed, + * we are in situation where we have Ksm page that have + * just one pte pointing to it, in this case we break + * it. 
+			 */
+			if (ret) {
+				struct page *tmppage[1];
+
+				down_read(&mm1->mmap_sem);
+				/*
+				 * get_user_pages() returns the number of
+				 * pages pinned or a negative errno; only
+				 * drop a reference we really obtained.
+				 */
+				if (get_user_pages(current, mm1, addr1, 1, 1,
+						   0, tmppage, NULL) == 1) {
+					put_page(tmppage[0]);
+				}
+				up_read(&mm1->mmap_sem);
+			}
+		}
+		put_page(kpage);
+	}
+	return ret;
+}
+
+/*
+ * is_zapped_item - check whether @rmap_item still maps a ksm page
+ * @rmap_item: item whose (mm, address) is checked
+ * @page:      on a 0 return, page[0] holds the pinned page (caller must
+ *             put_page() it); untouched/released on a 1 return
+ *
+ * Returns 1 if the pte is not present, the page could not be pinned, or the
+ * page is not a ksm page any more; 0 otherwise.
+ */
+static int is_zapped_item(struct rmap_item *rmap_item,
+			  struct page **page)
+{
+	int ret = 0;
+
+	cond_resched();
+	if (is_present_pte(rmap_item->mm, rmap_item->address)) {
+		down_read(&rmap_item->mm->mmap_sem);
+		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
+				     1, 0, 0, page, NULL);
+		up_read(&rmap_item->mm->mmap_sem);
+	}
+
+	/*
+	 * Anything but exactly one pinned page (0 or a negative errno) means
+	 * page[0] was not filled in, so it must not be looked at.
+	 */
+	if (ret != 1)
+		return 1;
+
+	if (unlikely(!PageKsm(page[0]))) {
+		put_page(page[0]);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * stable_tree_search - look up a page with the same content as @page in the
+ * stable tree
+ * @page:      page being scanned
+ * @page2:     on success page2[0] is the matching ksm page, pinned; the
+ *             caller must put_page() it
+ * @rmap_item: rmap_item of @page (may be NULL); chain entries with the same
+ *             mm/address are skipped
+ *
+ * Zapped chain entries met on the way are removed from the tree.
+ * Returns the rmap_item of the matching page, or NULL if none was found.
+ */
+static struct rmap_item *stable_tree_search(struct page *page,
+					    struct page **page2,
+					    struct rmap_item *rmap_item)
+{
+	struct rb_node *node = root_stable_tree.rb_node;
+	struct tree_item *tree_item;
+	struct rmap_item *found_rmap_item;
+	struct rmap_item *next_rmap_item;
+
+	while (node) {
+		int ret;
+
+		tree_item = rb_entry(node, struct tree_item, node);
+		found_rmap_item = tree_item->rmap_item;
+		while (found_rmap_item) {
+			BUG_ON(!found_rmap_item->stable_tree);
+			BUG_ON(!found_rmap_item->tree_item);
+			if (!rmap_item ||
+			    !(found_rmap_item->mm == rmap_item->mm &&
+			      found_rmap_item->address == rmap_item->address)) {
+				if (!is_zapped_item(found_rmap_item, page2))
+					break;
+				/*
+				 * remove_rmap_item_from_tree() frees the item,
+				 * so ->next has to be read before the call.
+				 */
+				next_rmap_item = found_rmap_item->next;
+				remove_rmap_item_from_tree(found_rmap_item);
+				found_rmap_item = next_rmap_item;
+				continue;
+			}
+			found_rmap_item = found_rmap_item->next;
+		}
+		if (!found_rmap_item)
+			goto out_didnt_find;
+
+		/*
+		 * We can trust the value of the memcmp as we know the pages
+		 * are write protected.
+		 */
+		ret = memcmp_pages(page, page2[0]);
+
+		if (ret < 0) {
+			put_page(page2[0]);
+			node = node->rb_left;
+		}
+		else if (ret > 0) {
+			put_page(page2[0]);
+			node = node->rb_right;
+		}
+		else
+			goto out_found;
+	}
+out_didnt_find:
+	found_rmap_item = NULL;
+out_found:
+	return found_rmap_item;
+}
+
+/*
+ * stable_tree_insert - link @new_tree_item into the stable tree
+ * @page:          page whose content is the sort key
+ * @new_tree_item: tree node to insert
+ * @rmap_item:     rmap_item that will own @new_tree_item
+ *
+ * Returns 0 if the node was inserted, 1 if it was not (a node without any
+ * live entry was met, or an identical page is already in the tree).
+ * No page reference is held on return.
+ */
+static int stable_tree_insert(struct page *page,
+			      struct tree_item *new_tree_item,
+			      struct rmap_item *rmap_item)
+{
+	struct rb_node **new = &(root_stable_tree.rb_node);
+	struct rb_node *parent = NULL;
+	struct tree_item *tree_item;
+	struct page *page2[1];
+
+	while (*new) {
+		int ret;
+		struct rmap_item *insert_rmap_item;
+		struct rmap_item *next_rmap_item;
+
+		tree_item = rb_entry(*new, struct tree_item, node);
+		BUG_ON(!tree_item);
+		BUG_ON(!tree_item->rmap_item);
+
+		insert_rmap_item = tree_item->rmap_item;
+		while (insert_rmap_item) {
+			BUG_ON(!insert_rmap_item->stable_tree);
+			BUG_ON(!insert_rmap_item->tree_item);
+			if (!rmap_item ||
+			    !(insert_rmap_item->mm == rmap_item->mm &&
+			      insert_rmap_item->address == rmap_item->address)) {
+				if (!is_zapped_item(insert_rmap_item, page2))
+					break;
+				/*
+				 * remove_rmap_item_from_tree() frees the item,
+				 * so ->next has to be read before the call.
+				 */
+				next_rmap_item = insert_rmap_item->next;
+				remove_rmap_item_from_tree(insert_rmap_item);
+				insert_rmap_item = next_rmap_item;
+				continue;
+			}
+			insert_rmap_item = insert_rmap_item->next;
+		}
+		if (!insert_rmap_item)
+			return 1;
+
+		ret = memcmp_pages(page, page2[0]);
+
+		parent = *new;
+		if (ret < 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_left);
+		}
+		else if (ret > 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_right);
+		}
+		else {
+			/*
+			 * Not a bug: an identical page may have been merged
+			 * into the stable tree in the meantime. Drop the
+			 * reference is_zapped_item() took on page2[0], as
+			 * the other two branches do, before giving up.
+			 */
+			put_page(page2[0]);
+			return 1;
+		}
+	}
+
+	rb_link_node(&new_tree_item->node, parent, new);
+	rb_insert_color(&new_tree_item->node, &root_stable_tree);
+	rmap_item->stable_tree = 1;
+	rmap_item->tree_item = new_tree_item;
+
+	return 0;
+}
+
+/*
+ * unstable_tree_search_insert - search the unstable tree for a page with the
+ * same content as @page and, if there is none, insert @page
+ * @page:           page being scanned
+ * @page2:          on a non-NULL return page2[0] is the matching page,
+ *                  pinned; the caller must put_page() it
+ * @page_rmap_item: rmap_item of @page, or NULL to only search
+ *
+ * Returns the tree_item of an identical page, or NULL when nothing matched.
+ * In the NULL case @page is inserted only if its checksum equals
+ * page_rmap_item->oldchecksum; otherwise just the checksum is recorded.
+ */
+static struct tree_item *unstable_tree_search_insert(struct page *page,
+					struct page **page2,
+					struct rmap_item *page_rmap_item)
+{
+	struct rb_node **new = &(root_unstable_tree.rb_node);
+	struct rb_node *parent = NULL;
+	struct tree_item *tree_item;
+	struct tree_item *new_tree_item;
+	struct rmap_item *rmap_item;
+	unsigned int checksum;
+
+	while (*new) {
+		int ret;
+
+		tree_item = rb_entry(*new, struct tree_item, node);
+		BUG_ON(!tree_item);
+		rmap_item = tree_item->rmap_item;
+		BUG_ON(!rmap_item);
+
+		/*
+		 * We dont want to swap in pages
+		 */
+		if (!is_present_pte(rmap_item->mm, rmap_item->address))
+			return NULL;
+
+		down_read(&rmap_item->mm->mmap_sem);
+		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
+				     1, 0, 0, page2, NULL);
+		up_read(&rmap_item->mm->mmap_sem);
+		/* 0 or a negative errno: page2[0] was not filled in */
+		if (ret != 1)
+			return NULL;
+
+		ret = memcmp_pages(page, page2[0]);
+
+		parent = *new;
+		if (ret < 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_left);
+		}
+		else if (ret > 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_right);
+		} else
+			return tree_item;
+	}
+
+	if (!page_rmap_item)
+		return NULL;
+
+	checksum = calc_checksum(page);
+	if (page_rmap_item->oldchecksum != checksum) {
+		page_rmap_item->oldchecksum = checksum;
+		return NULL;
+	}
+
+	new_tree_item = alloc_tree_item();
+	if (!new_tree_item)
+		return NULL;
+
+	page_rmap_item->tree_item = new_tree_item;
+	page_rmap_item->stable_tree = 0;
+	new_tree_item->rmap_item = page_rmap_item;
+	rb_link_node(&new_tree_item->node, parent, new);
+	rb_insert_color(&new_tree_item->node, &root_unstable_tree);
+
+	return NULL;
+}
+
+/*
+ * update_stable_tree - check if the page inside the tree got zapped,
+ * and if it got zapped, kick it from the tree.
+ */ +int update_tree(struct rmap_item *rmap_item, int *wait) +{ + struct page *page[1]; + + if (!rmap_item->stable_tree) { + if (rmap_item->tree_item) { + remove_rmap_item_from_tree(rmap_item); + return 1; + } + return 0; + } + if (is_zapped_item(rmap_item, page)) { + remove_rmap_item_from_tree(rmap_item); + *wait = 1; + return 1; + } + put_page(page[0]); + return 0; +} + +static struct rmap_item *create_new_rmap_item(struct mm_struct *mm, + unsigned long addr, + unsigned int checksum) +{ + struct rmap_item *rmap_item; + struct hlist_head *bucket; + + rmap_item = alloc_rmap_item(); + if (!rmap_item) + return NULL; + + rmap_item->mm = mm; + rmap_item->address = addr; + rmap_item->oldchecksum = checksum; + rmap_item->stable_tree = 0; + rmap_item->tree_item = NULL; + + bucket = &rmap_hash[addr % nrmaps_hash]; + hlist_add_head(&rmap_item->link, bucket); + + return rmap_item; +} + +/* + * cmp_and_merge_page - take a page computes its hash value and check if there + * is similar hash value to different page, + * in case we find that there is similar hash to different page we call to + * try_to_merge_two_pages(). 
+ */
+static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
+{
+	struct page *page2[1];
+	struct ksm_mem_slot *slot;
+	struct tree_item *tree_item;
+	struct rmap_item *rmap_item;
+	struct rmap_item *tree_rmap_item;
+	unsigned long addr;
+	int wait = 0;
+	int ret;
+
+	slot = ksm_scan->slot_index;
+	addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
+	rmap_item = get_rmap_item(slot->mm, addr);
+	/* a non-zero update_tree() means the item was kicked out of its tree */
+	if (rmap_item && update_tree(rmap_item, &wait))
+		rmap_item = NULL;
+
+	/* First try to find an identical page in the stable tree. */
+	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
+	if (tree_rmap_item) {
+		BUG_ON(!tree_rmap_item->tree_item);
+		ret = try_to_merge_two_pages(slot->mm, page, tree_rmap_item->mm,
+					     page2[0], addr,
+					     tree_rmap_item->address);
+		put_page(page2[0]);
+		if (ret)
+			return 0;
+
+		if (!rmap_item)
+			rmap_item = create_new_rmap_item(slot->mm, addr, 0);
+		if (!rmap_item)
+			return 1;
+
+		/* link the item right behind the one found in the tree */
+		rmap_item->next = tree_rmap_item->next;
+		rmap_item->prev = tree_rmap_item;
+
+		if (tree_rmap_item->next)
+			tree_rmap_item->next->prev = rmap_item;
+
+		tree_rmap_item->next = rmap_item;
+
+		rmap_item->stable_tree = 1;
+		rmap_item->tree_item = tree_rmap_item->tree_item;
+		return 1;
+	}
+
+	/* Then look for (or become) a candidate in the unstable tree. */
+	tree_item = unstable_tree_search_insert(page, page2, rmap_item);
+	if (tree_item) {
+		rmap_item = tree_item->rmap_item;
+		BUG_ON(!rmap_item);
+		ret = try_to_merge_two_pages(slot->mm, page, rmap_item->mm,
+					     page2[0], addr,
+					     rmap_item->address);
+		if (!ret) {
+			/* merged: move the node from the unstable to the stable tree */
+			rb_erase(&tree_item->node, &root_unstable_tree);
+			stable_tree_insert(page2[0], tree_item, rmap_item);
+		}
+		put_page(page2[0]);
+		return !ret;
+	}
+
+	/*
+	 * Nothing to merge with.  Remember the checksum of a page seen for the
+	 * first time, unless update_tree() asked us to wait.
+	 */
+	if (!wait && !rmap_item)
+		create_new_rmap_item(slot->mm, addr, calc_checksum(page));
+
+	return 0;
+}
+
+/*
+ * scan_get_next_index - advance the scan cursor by @nscan pages.
+ *
+ * Stays inside the current slot while it has pages left, otherwise moves to
+ * the next slot of the list, and when the list is exhausted resets the
+ * unstable tree and restarts from the first slot.
+ *
+ * return -EAGAIN - no slots registered, nothing to be done
+ */
+static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
+{
+	struct ksm_mem_slot *slot;
+
+	if (list_empty(&slots))
+		return -EAGAIN;
+
+	slot = ksm_scan->slot_index;
+
+	/*
+	 * Are there pages left in this slot to scan?
+	 * This is tested with an addition rather than as
+	 * "npages - page_index - nscan > 0": if one of the operands is
+	 * unsigned the subtraction wraps around instead of becoming negative
+	 * and the test would succeed past the end of the slot.
+	 */
+	if (ksm_scan->page_index + nscan < slot->npages) {
+		ksm_scan->page_index += nscan;
+		return 0;
+	}
+
+	list_for_each_entry_from(slot, &slots, link) {
+		if (slot == ksm_scan->slot_index)
+			continue;
+		ksm_scan->page_index = 0;
+		ksm_scan->slot_index = slot;
+		return 0;
+	}
+
+	/* look like we finished scanning the whole memory, starting again */
+	root_unstable_tree = RB_ROOT;
+	ksm_scan->page_index = 0;
+	ksm_scan->slot_index = list_first_entry(&slots,
+						struct ksm_mem_slot, link);
+	return 0;
+}
+
+/*
+ * update slot_index - make sure ksm_scan will point to valid data,
+ * it is possible that by the time we are here the data that ksm_scan was
+ * pointed to was released so we have to call this function every time after
+ * taking the slots_lock
+ */
+static void scan_update_old_index(struct ksm_scan *ksm_scan)
+{
+	struct ksm_mem_slot *slot;
+
+	if (list_empty(&slots))
+		return;
+
+	/* the remembered slot is still registered: keep the cursor as is */
+	list_for_each_entry(slot, &slots, link) {
+		if (ksm_scan->slot_index == slot)
+			return;
+	}
+
+	ksm_scan->slot_index = list_first_entry(&slots,
+						struct ksm_mem_slot, link);
+	ksm_scan->page_index = 0;
+}
+
+/**
+ * ksm_scan_start - the ksm scanner main worker function.
+ * @ksm_scan -    the scanner.
+ * @scan_npages - number of pages we want to scan before we return from this
+ *		   function.
+ *
+ * (this function can be called from the kernel thread scanner, or from
+ *  userspace ioctl context scanner)
+ *
+ * The function returns -EAGAIN in case there are no slots to scan.
+ */
+static int ksm_scan_start(struct ksm_scan *ksm_scan, int scan_npages)
+{
+	struct ksm_mem_slot *slot;
+	struct page *page[1];
+	int val;
+	int ret = 0;
+
+	down_read(&slots_lock);
+
+	/* the slot we stopped at may have been removed in the meantime */
+	scan_update_old_index(ksm_scan);
+
+	while (scan_npages > 0) {
+		ret = scan_get_next_index(ksm_scan, 1);
+		if (ret)
+			goto out;
+
+		slot = ksm_scan->slot_index;
+
+		cond_resched();
+
+		/*
+		 * If the page is swapped out or in swap cache, we don't want to
+		 * scan it (it is just for performance).
+		 */
+		if (is_present_pte(slot->mm, slot->addr +
+				   ksm_scan->page_index * PAGE_SIZE)) {
+			down_read(&slot->mm->mmap_sem);
+			val = get_user_pages(current, slot->mm, slot->addr +
+					     ksm_scan->page_index * PAGE_SIZE,
+					     1, 0, 0, page, NULL);
+			up_read(&slot->mm->mmap_sem);
+			if (val == 1) {
+				/* pages that are already shared are skipped */
+				if (!PageKsm(page[0]))
+					cmp_and_merge_page(ksm_scan, page[0]);
+				put_page(page[0]);
+			}
+		}
+		scan_npages--;
+	}
+	/*
+	 * NOTE(review): the cursor is advanced once more here and again at the
+	 * top of the loop on the next call, which looks like it skips one page
+	 * per call - confirm this is intended.
+	 */
+	scan_get_next_index(ksm_scan, 1);
+out:
+	up_read(&slots_lock);
+	return ret;
+}
+
+/*
+ * no multithreaded ksm for ovirt
+ */
+/*static int ksm_scan_ioctl_start(struct ksm_scan *ksm_scan,
+				struct ksm_user_scan *scan)
+{
+	if (!(scan->flags & ksm_control_flags_run))
+		return 0;
+
+	return ksm_scan_start(ksm_scan, scan->pages_to_scan);
+}*/
+
+/* ksm_scan_release - free the scanner attached to a scan fd on close. */
+static int ksm_scan_release(struct inode *inode, struct file *filp)
+{
+	struct ksm_scan *ksm_scan = filp->private_data;
+
+	kfree(ksm_scan);
+	return 0;
+}
+
+/*
+ * ksm_scan_ioctl - ioctl handler of the scan fd.
+ *
+ * No command is implemented at the moment (KSM_SCAN is commented out), so
+ * every request fails with -EINVAL.
+ */
+static long ksm_scan_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+//	struct ksm_scan *ksm_scan = filp->private_data;
+	//void __user *argp = (void __user *)arg;
+	long r = -EINVAL;	/* errors are returned as negative errno */
+
+	switch (ioctl) {
+	/*
+	 * i didnt implemented the locking yet, and in ovirt we dont run
+	 * multi-threaded ksm.
+	 */
+	/*case KSM_SCAN: {
+		struct ksm_user_scan scan;
+
+		r = -EFAULT;
+		if (copy_from_user(&scan, argp,
+				   sizeof(struct ksm_user_scan)))
+			break;
+
+		r = ksm_scan_ioctl_start(ksm_scan, &scan);
+	}*/
+	default:
+		break;
+	}
+	return r;
+}
+
+/* file operations of the fd returned by KSM_CREATE_SHARED_MEMORY_AREA */
+static struct file_operations ksm_sma_fops = {
+	.release        = ksm_sma_release,
+	.unlocked_ioctl = ksm_sma_ioctl,
+	.compat_ioctl   = ksm_sma_ioctl,
+};
+
+/*
+ * ksm_dev_ioctl_create_shared_memory_area - allocate a ksm_sma and wrap it
+ * in an anonymous inode.
+ *
+ * Returns the new fd, -ENOMEM if the allocation fails, or the error reported
+ * by anon_inode_getfd().
+ */
+static int ksm_dev_ioctl_create_shared_memory_area(void)
+{
+	int fd = -ENOMEM;
+	struct ksm_sma *ksm_sma;
+
+	ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
+	if (!ksm_sma)
+		goto out;
+
+	INIT_LIST_HEAD(&ksm_sma->sma_slots);
+
+	fd = anon_inode_getfd("ksm-sma", &ksm_sma_fops, ksm_sma, 0);
+	if (fd < 0)
+		goto out_free;
+
+	return fd;
+out_free:
+	kfree(ksm_sma);
+out:
+	return fd;
+}
+
+/* file operations of the fd returned by KSM_CREATE_SCAN */
+static struct file_operations ksm_scan_fops = {
+	.release        = ksm_scan_release,
+	.unlocked_ioctl = ksm_scan_ioctl,
+	.compat_ioctl   = ksm_scan_ioctl,
+};
+
+/* ksm_scan_create - allocate a zeroed scanner, NULL on failure. */
+static struct ksm_scan *ksm_scan_create(void)
+{
+	return kzalloc(sizeof(struct ksm_scan), GFP_KERNEL);
+}
+
+/*
+ * ksm_dev_ioctl_create_scan - allocate a scanner and wrap it in an anonymous
+ * inode.
+ *
+ * Returns the new fd, -ENOMEM if the allocation fails, or the error reported
+ * by anon_inode_getfd().
+ */
+static int ksm_dev_ioctl_create_scan(void)
+{
+	int fd = -ENOMEM;
+	struct ksm_scan *ksm_scan;
+
+	ksm_scan = ksm_scan_create();
+	if (!ksm_scan)
+		goto out;
+
+	fd = anon_inode_getfd("ksm-scan", &ksm_scan_fops, ksm_scan, 0);
+	if (fd < 0)
+		goto out_free;
+	return fd;
+
+out_free:
+	kfree(ksm_scan);
+out:
+	return fd;
+}
+
+/*
+ * ksm_dev_ioctl_start_stop_kthread - control the kernel thread scanning running
+ * speed.
+ * This function allow us to control on the time the kernel thread will sleep
+ * how many pages it will scan between sleep and sleep, and how many pages it
+ * will maximum merge between sleep and sleep.
+ */
+static int ksm_dev_ioctl_start_stop_kthread(struct ksm_kthread_info *info)
+{
+	int run = info->flags & ksm_control_flags_run;
+
+	/*
+	 * Starting the scanner with zero pages to scan is refused.  @info is
+	 * the caller's private copy, so this needs no lock.  The error is a
+	 * negative errno, a positive value would reach userspace as success.
+	 */
+	if (run && !info->pages_to_scan)
+		return -EPERM;
+
+	down_write(&kthread_lock);
+	kthread_sleep = info->sleep;
+	kthread_pages_to_scan = info->pages_to_scan;
+	ksmd_flags = info->flags;
+	up_write(&kthread_lock);
+
+	/* decide from the local copy, ksmd_flags is no longer protected here */
+	if (run)
+		wake_up_interruptible(&kthread_wait);
+
+	return 0;
+}
+
+/*
+ * ksm_dev_ioctl_get_info_kthread - write into info the scanning information
+ * of the ksm kernel thread
+ */
+static void ksm_dev_ioctl_get_info_kthread(struct ksm_kthread_info *info)
+{
+	down_read(&kthread_lock);
+
+	info->sleep = kthread_sleep;
+	info->pages_to_scan = kthread_pages_to_scan;
+	info->flags = ksmd_flags;
+
+	up_read(&kthread_lock);
+}
+
+/*
+ * ksm_dev_ioctl - ioctl handler of /dev/ksm.
+ *
+ * Returns the API version, a new fd, or 0 on success depending on the
+ * command; -EFAULT when the user buffer cannot be copied and -EINVAL for an
+ * unknown command.
+ */
+static long ksm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long r = -EINVAL;
+
+	switch (ioctl) {
+	case KSM_GET_API_VERSION:
+		r = KSM_API_VERSION;
+		break;
+	case KSM_CREATE_SHARED_MEMORY_AREA:
+		r = ksm_dev_ioctl_create_shared_memory_area();
+		break;
+	case KSM_CREATE_SCAN:
+		r = ksm_dev_ioctl_create_scan();
+		break;
+	case KSM_START_STOP_KTHREAD: {
+		struct ksm_kthread_info info;
+
+		r = -EFAULT;
+		if (copy_from_user(&info, argp,
+				   sizeof(struct ksm_kthread_info)))
+			break;
+
+		r = ksm_dev_ioctl_start_stop_kthread(&info);
+		break;
+	}
+	case KSM_GET_INFO_KTHREAD: {
+		struct ksm_kthread_info info;
+
+		/*
+		 * NOTE(review): ksm.h declares this command with _IOW although
+		 * the data flows to userspace; changing it would change the
+		 * ioctl number, so it is left alone.
+		 */
+		ksm_dev_ioctl_get_info_kthread(&info);
+		r = -EFAULT;
+		if (copy_to_user(argp, &info,
+				 sizeof(struct ksm_kthread_info)))
+			break;
+		r = 0;
+		break;
+	}
+	default:
+		break;
+	}
+	return r;
+}
+
+static struct file_operations ksm_chardev_ops = {
+	.unlocked_ioctl = ksm_dev_ioctl,
+	.compat_ioctl   = ksm_dev_ioctl,
+	.owner          = THIS_MODULE,
+};
+
+/* the /dev/ksm misc device */
+static struct miscdevice ksm_dev = {
+	.minor = KSM_MINOR,
+	.name  = "ksm",
+	.fops  = &ksm_chardev_ops,
+};
+
+/*
+ * kthread_ksm_scan_thread - body of the scanner kernel thread.
+ *
+ * While the run flag is set it scans kthread_pages_to_scan pages and then
+ * sleeps kthread_sleep microseconds; otherwise it waits on kthread_wait
+ * until the run flag gets set or the thread is asked to stop.
+ */
+int kthread_ksm_scan_thread(void *nothing)
+{
+	while (!kthread_should_stop()) {
+		if (ksmd_flags & ksm_control_flags_run) {
+			down_read(&kthread_lock);
+			ksm_scan_start(&kthread_ksm_scan,
+				       kthread_pages_to_scan);
+			up_read(&kthread_lock);
+			schedule_timeout_interruptible(
+					usecs_to_jiffies(kthread_sleep));
+		} else
+			wait_event_interruptible(kthread_wait,
+					ksmd_flags & ksm_control_flags_run ||
+					kthread_should_stop());
+	}
+	return 0;
+}
+
+/*
+ * ksm_init - module init: slab caches, rmap hash, scanner thread, write
+ * protect notifier and finally the misc device.  On failure everything set
+ * up so far is torn down in reverse order and the error is returned.
+ */
+static int __init ksm_init(void)
+{
+	int r;
+
+	r = ksm_slab_init();
+	if (r)
+		goto out;
+
+	r = rmap_hash_init();
+	if (r)
+		goto out_free1;
+
+	kthread = kthread_run(kthread_ksm_scan_thread, NULL, "kksmd");
+	if (IS_ERR(kthread)) {
+		printk(KERN_ERR "ksm: creating kthread failed\n");
+		r = PTR_ERR(kthread);
+		goto out_free2;
+	}
+
+	r = init_wp_notifier();
+	if (r)
+		goto out_free3;
+
+	r = misc_register(&ksm_dev);
+	if (r) {
+		printk(KERN_ERR "ksm: misc device register failed\n");
+		goto out_free4;
+	}
+
+	printk(KERN_WARNING "ksm loaded\n");
+	return 0;
+
+out_free4:
+	exit_wp_notifier();
+out_free3:
+	kthread_stop(kthread);
+out_free2:
+	rmap_hash_free();
+out_free1:
+	ksm_slab_free();
+out:
+	return r;
+}
+
+/* ksm_exit - module exit, undoes ksm_init() in reverse order. */
+static void __exit ksm_exit(void)
+{
+	misc_deregister(&ksm_dev);
+	exit_wp_notifier();
+	/* presumably set so the thread leaves its wait before stopping - confirm */
+	ksmd_flags = ksm_control_flags_run;
+	kthread_stop(kthread);
+	rmap_hash_free();
+	ksm_slab_free();
+}
+
+module_init(ksm_init)
+module_exit(ksm_exit)
diff --git a/kernel/ksm/ksm.h b/kernel/ksm/ksm.h
new file mode 100644
index 0000000..91ca286
--- /dev/null
+++ b/kernel/ksm/ksm.h
@@ -0,0 +1,84 @@
+#ifndef __LINUX_KSM_H
+#define __LINUX_KSM_H
+
+/*
+ * Userspace interface for /dev/ksm - kvm shared memory
+ */
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#else
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#endif
+
+#include <asm/types.h>
+
+#define KSM_API_VERSION 1
+
+#define ksm_control_flags_run 1
+
+/* for KSM_REGISTER_MEMORY_REGION */
+struct ksm_memory_region {
+	__u32 npages; /* number of pages to share */
+	__u32 pad;
+	__u64 addr; /* the beginning of the virtual address */
+};
+
+struct ksm_user_scan {
+	__u32 pages_to_scan;
+	__u32 flags; /* control flags */
+};
+
+struct ksm_kthread_info {
+	__u32 sleep; /* number of microseconds to sleep */
+	__u32 pages_to_scan; /* number of pages to scan */
+	__u32 flags; /* control flags */
+};
+
+#define KSMIO 0xAB
+
+/* ioctls for /dev/ksm */
+
+#define KSM_GET_API_VERSION              _IO(KSMIO,   0x00)
+/*
+ * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory region fd
+ */
+#define KSM_CREATE_SHARED_MEMORY_AREA    _IO(KSMIO,   0x01) /* return SMA fd */
+/*
+ * KSM_CREATE_SCAN - create the scanner fd
+ */
+#define KSM_CREATE_SCAN                  _IO(KSMIO,   0x02) /* return SCAN fd */
+/*
+ * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
+ * (can stop the kernel thread from working by setting running = 0)
+ */
+#define KSM_START_STOP_KTHREAD		 _IOW(KSMIO,  0x03,\
+					      struct ksm_kthread_info)
+/*
+ * KSM_GET_INFO_KTHREAD - return information about the kernel thread
+ * scanning speed.
+ */
+#define KSM_GET_INFO_KTHREAD		 _IOW(KSMIO,  0x04,\
+					      struct ksm_kthread_info)
+
+
+/* ioctls for SMA fds */
+
+/*
+ * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
+ * scanned by kvm.
+ */
+#define KSM_REGISTER_MEMORY_REGION       _IOW(KSMIO,  0x20,\
+					      struct ksm_memory_region)
+/*
+ * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
+ */
+#define KSM_REMOVE_MEMORY_REGION         _IO(KSMIO,   0x21)
+
+/* ioctls for SCAN fds */
+#define KSM_SCAN                         _IOW(KSMIO,  0x40,\
+					      struct ksm_user_scan)
+
+#endif
diff --git a/kernel/ksm/wp_notifier.c b/kernel/ksm/wp_notifier.c
new file mode 100644
index 0000000..0da4499
--- /dev/null
+++ b/kernel/ksm/wp_notifier.c
@@ -0,0 +1,80 @@
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/kprobes.h>
+#include <linux/kallsyms.h>
+#include "wp_notifier.h"
+
+/*
+ * pre_do_wp_page - kprobe pre-handler installed on do_wp_page().
+ *
+ * Picks the mm and the address out of the saved argument registers and
+ * hands them to kvm_wp_notifier().  Always returns 0.
+ * NOTE(review): reading (r)di/(r)dx presumably relies on the x86-64 calling
+ * convention with mm as 1st and address as 3rd argument - confirm.
+ */
+static int pre_do_wp_page(struct kprobe *p,
+			  struct pt_regs *regs)
+{
+	struct mm_struct *mm;
+	unsigned long address;
+
+	/*
+	 * kprobes runs with irq disabled and preempt disabled but we
+	 * need irq enabled to flush the smp tlb with IPIs while
+	 * tearing down sptes.
+	 */
+	local_irq_enable();
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	mm = (struct mm_struct *) regs->rdi;
+	address = (unsigned long) regs->rdx;
+#else
+	mm = (struct mm_struct *) regs->di;
+	address = (unsigned long) regs->dx;
+#endif
+	kvm_wp_notifier(mm, address);
+
+	local_irq_disable();
+
+	return 0;
+}
+
+/* The probe placed on do_wp_page(); filled in by init_wp_notifier(). */
+static struct kprobe not_kprobe;
+
+/*
+ * init_wp_notifier - register the kprobe on do_wp_page().
+ *
+ * Returns 0 on success or a negative errno on failure, so that the value
+ * can be handed back unchanged from a module init function.
+ */
+int init_wp_notifier(void)
+{
+	int ret;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) && defined(CONFIG_KALLSYMS) && !defined(RHEL_RELEASE_CODE)
+	not_kprobe.addr = (kprobe_opcode_t *)kallsyms_lookup_name("do_wp_page");
+	if (!not_kprobe.addr) {
+		printk(KERN_WARNING "do_wp_page not found\n");
+		return -ENOENT;
+	}
+#else
+	not_kprobe.symbol_name = "do_wp_page";
+#endif
+	not_kprobe.pre_handler = pre_do_wp_page;
+
+	ret = register_kprobe(&not_kprobe);
+	if (ret) {
+		printk(KERN_WARNING "cant register kprobe for do_wp_page\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+/* exit_wp_notifier - remove the kprobe installed by init_wp_notifier(). */
+void exit_wp_notifier(void)
+{
+	unregister_kprobe(&not_kprobe);
+}
diff --git a/kernel/ksm/wp_notifier.h b/kernel/ksm/wp_notifier.h
new file mode 100644
index 0000000..3788e8c
--- /dev/null
+++ b/kernel/ksm/wp_notifier.h
@@ -0,0 +1,14 @@
+#ifndef WP_NOTIFIER_H
+#define WP_NOTIFIER_H
+
+#include <linux/mm.h>
+
+/* Install/remove the kprobe on do_wp_page(); init returns 0 on success. */
+int init_wp_notifier(void);
+void exit_wp_notifier(void);
+
+/* Called from the do_wp_page() probe with the mm and address involved. */
+void kvm_wp_notifier(struct mm_struct *mm,
+		     unsigned long address);
+
+#endif
-- 
1.6.1