--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -37,6 +37,7 @@ invoke one of the following flush methods _after_ the page table
 	This is usually invoked when the kernel page tables are
 	changed, since such translations are "global" in nature.
 
+
 2) void flush_tlb_mm(struct mm_struct *mm)
 
 	This interface flushes an entire user address space from
@@ -317,10 +318,10 @@ maps this page at its virtual address.
 	about doing this.
 
 	The idea is, first at flush_dcache_page() time, if
-	page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear
-	an empty list, just mark the architecture private page flag bit.
-	Later, in update_mmu_cache(), a check is made of this flag bit,
-	and if set the flush is done and the flag bit is cleared.
+	page->mapping->i_mmap is an empty tree, just mark the architecture
+	private page flag bit.  Later, in update_mmu_cache(), a check is
+	made of this flag bit, and if set the flush is done and the flag
+	bit is cleared.
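+
+	A minimal sketch of that idea (names such as __flush_dcache_page()
+	are illustrative only, not any particular architecture's
+	implementation):
+
+	void flush_dcache_page(struct page *page)
+	{
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping && !mapping_mapped(mapping)) {
+			/* No user mappings yet: defer the flush. */
+			set_bit(PG_arch_1, &page->flags);
+			return;
+		}
+		__flush_dcache_page(page);
+	}
+
+	void update_mmu_cache(struct vm_area_struct *vma,
+			      unsigned long address, pte_t *ptep)
+	{
+		struct page *page = pte_page(*ptep);
+
+		/* Perform the deferred flush, if one is pending. */
+		if (test_and_clear_bit(PG_arch_1, &page->flags))
+			__flush_dcache_page(page);
+	}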
 
 	IMPORTANT NOTE: It is often important, if you defer the flush,
 			that the actual flush occurs on the same CPU
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -24,64 +24,27 @@ Please note that implementation details can be changed.
 
    a page/swp_entry may be charged (usage += PAGE_SIZE) at
 
-	mem_cgroup_newpage_charge()
-	  Called at new page fault and Copy-On-Write.
-
-	mem_cgroup_try_charge_swapin()
-	  Called at do_swap_page() (page fault on swap entry) and swapoff.
-	  Followed by charge-commit-cancel protocol. (With swap accounting)
-	  At commit, a charge recorded in swap_cgroup is removed.
-
-	mem_cgroup_cache_charge()
-	  Called at add_to_page_cache()
-
-	mem_cgroup_cache_charge_swapin()
-	  Called at shmem's swapin.
-
-	mem_cgroup_prepare_migration()
-	  Called before migration. "extra" charge is done and followed by
-	  charge-commit-cancel protocol.
-	  At commit, charge against oldpage or newpage will be committed.
+	mem_cgroup_try_charge()
 
 2. Uncharge
   a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
 
-	mem_cgroup_uncharge_page()
-	  Called when an anonymous page is fully unmapped. I.e., mapcount goes
-	  to 0. If the page is SwapCache, uncharge is delayed until
-	  mem_cgroup_uncharge_swapcache().
-
-	mem_cgroup_uncharge_cache_page()
-	  Called when a page-cache is deleted from radix-tree. If the page is
-	  SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().
-
-	mem_cgroup_uncharge_swapcache()
-	  Called when SwapCache is removed from radix-tree. The charge itself
-	  is moved to swap_cgroup. (If mem+swap controller is disabled, no
-	  charge to swap occurs.)
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
 
 	mem_cgroup_uncharge_swap()
 	  Called when swp_entry's refcnt goes down to 0. A charge against swap
 	  disappears.
 
-	mem_cgroup_end_migration(old, new)
-	At success of migration old is uncharged (if necessary), a charge
-	to new page is committed. At failure, charge to old page is committed.
-
 3. charge-commit-cancel
-	In some case, we can't know this "charge" is valid or not at charging
-	(because of races).
-	To handle such case, there are charge-commit-cancel functions.
-		mem_cgroup_try_charge_XXX
-		mem_cgroup_commit_charge_XXX
-		mem_cgroup_cancel_charge_XXX
-	these are used in swap-in and migration.
+	Memcg pages are charged in two steps:
+		mem_cgroup_try_charge()
+		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
 
 	At try_charge(), there are no flags to say "this page is charged".
 	at this point, usage += PAGE_SIZE.
 
-	At commit(), the function checks the page should be charged or not
-	and set flags or avoid charging.(usage -= PAGE_SIZE)
+	At commit(), the page is associated with the memcg.
 
 	At cancel(), simply usage -= PAGE_SIZE.
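+
+	A minimal sketch of a charging path using these three calls (the
+	map_page_somehow() step is purely illustrative; signatures are
+	assumed from the post-rework memcg API):
+
+	int charge_new_page(struct page *page, struct mm_struct *mm)
+	{
+		struct mem_cgroup *memcg;
+
+		/* usage += PAGE_SIZE; page not yet associated with memcg */
+		if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+			return -ENOMEM;
+
+		if (map_page_somehow(page)) {
+			/* usage -= PAGE_SIZE */
+			mem_cgroup_cancel_charge(page, memcg);
+			return -EFAULT;
+		}
+
+		/* associate the page with memcg */
+		mem_cgroup_commit_charge(page, memcg, false);
+		return 0;
+	}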
 
@@ -91,18 +54,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	Anonymous page is newly allocated at
 		  - page fault into MAP_ANONYMOUS mapping.
 		  - Copy-On-Write.
- 	It is charged right after it's allocated before doing any page table
-	related operations. Of course, it's uncharged when another page is used
-	for the fault address.
-
-	At freeing anonymous page (by exit() or munmap()), zap_pte() is called
-	and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
-	are done at page_remove_rmap() when page_mapcount() goes down to 0.
-
-	Another page freeing is by page-reclaim (vmscan.c) and anonymous
-	pages are swapped out. In this case, the page is marked as
-	PageSwapCache(). uncharge() routine doesn't uncharge the page marked
-	as SwapCache(). It's delayed until __delete_from_swap_cache().
 
 	4.1 Swap-in.
 	At swap-in, the page is taken from swap-cache. There are 2 cases.
@@ -111,41 +62,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	(b) If the SwapCache has been mapped by processes, it has been
 	    charged already.
 
-	This swap-in is one of the most complicated work. In do_swap_page(),
-	following events occur when pte is unchanged.
-
-	(1) the page (SwapCache) is looked up.
-	(2) lock_page()
-	(3) try_charge_swapin()
-	(4) reuse_swap_page() (may call delete_swap_cache())
-	(5) commit_charge_swapin()
-	(6) swap_free().
-
-	Considering following situation for example.
-
-	(A) The page has not been charged before (2) and reuse_swap_page()
-	    doesn't call delete_from_swap_cache().
-	(B) The page has not been charged before (2) and reuse_swap_page()
-	    calls delete_from_swap_cache().
-	(C) The page has been charged before (2) and reuse_swap_page() doesn't
-	    call delete_from_swap_cache().
-	(D) The page has been charged before (2) and reuse_swap_page() calls
-	    delete_from_swap_cache().
-
-	    memory.usage/memsw.usage changes to this page/swp_entry will be
-	 Case          (A)      (B)       (C)     (D)
-         Event
-       Before (2)     0/ 1     0/ 1      1/ 1    1/ 1
-          ===========================================
-          (3)        +1/+1    +1/+1     +1/+1   +1/+1
-          (4)          -       0/ 0       -     -1/ 0
-          (5)         0/-1     0/ 0     -1/-1    0/ 0
-          (6)          -       0/-1       -      0/-1
-          ===========================================
-       Result         1/ 1     1/ 1      1/ 1    1/ 1
-
-       In any cases, charges to this page should be 1/ 1.
-
 	4.2 Swap-out.
 	At swap-out, typical state transition is below.
 
@@ -158,28 +74,20 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	    swp_entry's refcnt -= 1.
 
 
-	At (b), the page is marked as SwapCache and not uncharged.
-	At (d), the page is removed from SwapCache and a charge in page_cgroup
-	is moved to swap_cgroup.
-
 	Finally, at task exit,
 	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-	Here, a charge in swap_cgroup disappears.
 
 5. Page Cache
    	Page Cache is charged at
 	- add_to_page_cache_locked().
 
-	uncharged at
-	- __remove_from_page_cache().
-
 	The logic is very clear. (About migration, see below)
 	Note: __remove_from_page_cache() is called by remove_from_page_cache()
 	and __remove_mapping().
 
 6. Shmem(tmpfs) Page Cache
-	Memcg's charge/uncharge have special handlers of shmem. The best way
-	to understand shmem's page state transition is to read mm/shmem.c.
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
 	But brief explanation of the behavior of memcg around shmem will be
 	helpful to understand the logic.
 
@@ -192,56 +100,10 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	It's charged when...
 	- A new page is added to shmem's radix-tree.
 	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-	It's uncharged when
-	- A page is removed from radix-tree and not SwapCache.
-	- When SwapCache is removed, a charge is moved to swap_cgroup.
-	- When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
-	  disappears.
 
 7. Page Migration
-   	One of the most complicated functions is page-migration-handler.
-	Memcg has 2 routines. Assume that we are migrating a page's contents
-	from OLDPAGE to NEWPAGE.
-
-	Usual migration logic is..
-	(a) remove the page from LRU.
-	(b) allocate NEWPAGE (migration target)
-	(c) lock by lock_page().
-	(d) unmap all mappings.
-	(e-1) If necessary, replace entry in radix-tree.
-	(e-2) move contents of a page.
-	(f) map all mappings again.
-	(g) pushback the page to LRU.
-	(-) OLDPAGE will be freed.
-
-	Before (g), memcg should complete all necessary charge/uncharge to
-	NEWPAGE/OLDPAGE.
-
-	The point is....
-	- If OLDPAGE is anonymous, all charges will be dropped at (d) because
-          try_to_unmap() drops all mapcount and the page will not be
-	  SwapCache.
-
-	- If OLDPAGE is SwapCache, charges will be kept at (g) because
-	  __delete_from_swap_cache() isn't called at (e-1)
-
-	- If OLDPAGE is page-cache, charges will be kept at (g) because
-	  remove_from_swap_cache() isn't called at (e-1)
-
-	memcg provides following hooks.
-
-	- mem_cgroup_prepare_migration(OLDPAGE)
-	  Called after (b) to account a charge (usage += PAGE_SIZE) against
-	  memcg which OLDPAGE belongs to.
-
-        - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
-	  Called after (f) before (g).
-	  If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
-	  charged, a charge by prepare_migration() is automatically canceled.
-	  If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
-
-	  But zap_pte() (by exit or munmap) can be called while migration,
-	  we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
+
+	mem_cgroup_migrate()
 
 8. LRU
         Each memcg has its own private LRU. Now, its handling is under global
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1743,12 +1743,16 @@ pair provide additional information particular to the objects they represent.
 	pos:	0
 	flags:	02
 	mnt_id:	9
-	tfd:        5 events:       1d data: ffffffffffffffff
+	tfd:        5 events:       1d data: ffffffffffffffff pos:0 ino:61af sdev:7
 
 	where 'tfd' is a target file descriptor number in decimal form,
 	'events' is events mask being watched and the 'data' is data
 	associated with a target [see epoll(7) for more details].
 
+	The 'pos' is the current offset of the target file in decimal form
+	[see lseek(2)], while 'ino' and 'sdev' are the inode and device
+	numbers where the target file resides, both in hex format.
+
 	Fsnotify files
 	~~~~~~~~~~~~~~
 	For inotify files the format is the following
@@ -1823,6 +1827,7 @@ Configuring procfs
 The following mount options are supported:
 
 	hidepid=	Set /proc/<pid>/ access mode.
+	hidepidns=	Hide tasks from nested pid-namespaces.
 	gid=		Set the group authorized to learn processes information.
 
 hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories
@@ -1845,6 +1850,9 @@ information about running processes, whether some daemon runs with elevated
 privileges, whether other user runs some sensitive program, whether other users
 run any program at all, etc.
 
+hidepidns=1 makes all tasks from nested pid-namespaces invisible. They are still
+accessible via /proc/<pid>/, but readdir will not show them.
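+
+For example, an already mounted /proc can be switched to this mode with:
+
+	# mount -o remount,hidepidns=1 /proc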
+
 gid= defines a group authorized to learn processes information otherwise
 prohibited by hidepid=.  If you use some daemon like identd which needs to learn
 information about processes information, just add identd to this group.
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -323,6 +323,7 @@ Code  Seq#(hex)	Include File		Comments
 0xDB	00-0F	drivers/char/mwave/mwavepub.h
 0xDD	00-3F	ZFCP device driver	see drivers/s390/scsi/
 					<mailto:aherrman@de.ibm.com>
+0xE5	00-3F	linux/fuse.h
 0xF3	00-3F	drivers/usb/misc/sisusbvga/sisusb.h	sisfb (in development)
 					<mailto:thomas@winischhofer.net>
 0xF4	00-1F	video/mbxfb.h		mbxfb
--- /dev/null
+++ b/Documentation/kasan.txt
@@ -0,0 +1,171 @@
+KernelAddressSanitizer (KASAN)
+==============================
+
+0. Overview
+===========
+
+KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
+a fast and comprehensive solution for finding use-after-free and out-of-bounds
+bugs.
+
+KASAN uses compile-time instrumentation for checking every memory access,
+therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
+required for detection of out-of-bounds accesses to stack or global variables.
+
+Currently KASAN is supported only for x86_64 architecture.
+
+1. Usage
+========
+
+To enable KASAN configure kernel with:
+
+	  CONFIG_KASAN = y
+
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
+inline are compiler instrumentation types. The former produces a smaller
+binary while the latter is 1.1 - 2 times faster. Inline instrumentation
+requires GCC version 5.0 or later.
+
+KASAN works with both SLUB and SLAB memory allocators.
+For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
+
+To disable instrumentation for specific files or directories, add a line
+similar to the following to the respective kernel Makefile:
+
+        For a single file (e.g. main.o):
+                KASAN_SANITIZE_main.o := n
+
+        For all files in one directory:
+                KASAN_SANITIZE := n
+
+1.1 Error reports
+=================
+
+A typical out of bounds access report looks like this:
+
+==================================================================
+BUG: AddressSanitizer: out of bounds access in kmalloc_oob_right+0x65/0x75 [test_kasan] at addr ffff8800693bc5d3
+Write of size 1 by task modprobe/1689
+=============================================================================
+BUG kmalloc-128 (Not tainted): kasan error
+-----------------------------------------------------------------------------
+
+Disabling lock debugging due to kernel taint
+INFO: Allocated in kmalloc_oob_right+0x3d/0x75 [test_kasan] age=0 cpu=0 pid=1689
+ __slab_alloc+0x4b4/0x4f0
+ kmem_cache_alloc_trace+0x10b/0x190
+ kmalloc_oob_right+0x3d/0x75 [test_kasan]
+ init_module+0x9/0x47 [test_kasan]
+ do_one_initcall+0x99/0x200
+ load_module+0x2cb3/0x3b20
+ SyS_finit_module+0x76/0x80
+ system_call_fastpath+0x12/0x17
+INFO: Slab 0xffffea0001a4ef00 objects=17 used=7 fp=0xffff8800693bd728 flags=0x100000000004080
+INFO: Object 0xffff8800693bc558 @offset=1368 fp=0xffff8800693bc720
+
+Bytes b4 ffff8800693bc548: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a  ........ZZZZZZZZ
+Object ffff8800693bc558: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc568: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc578: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc588: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc598: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
+Redzone ffff8800693bc5d8: cc cc cc cc cc cc cc cc                          ........
+Padding ffff8800693bc718: 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
+CPU: 0 PID: 1689 Comm: modprobe Tainted: G    B          3.18.0-rc1-mm1+ #98
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
+ ffff8800693bc000 0000000000000000 ffff8800693bc558 ffff88006923bb78
+ ffffffff81cc68ae 00000000000000f3 ffff88006d407600 ffff88006923bba8
+ ffffffff811fd848 ffff88006d407600 ffffea0001a4ef00 ffff8800693bc558
+Call Trace:
+ [<ffffffff81cc68ae>] dump_stack+0x46/0x58
+ [<ffffffff811fd848>] print_trailer+0xf8/0x160
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff811ff0f5>] object_err+0x35/0x40
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffff8120b9fa>] kasan_report_error+0x38a/0x3f0
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffff8120b344>] ? kasan_unpoison_shadow+0x14/0x40
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff8120a995>] __asan_store1+0x75/0xb0
+ [<ffffffffa0002601>] ? kmem_cache_oob+0x1d/0xc3 [test_kasan]
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa0002065>] kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa00026b0>] init_module+0x9/0x47 [test_kasan]
+ [<ffffffff810002d9>] do_one_initcall+0x99/0x200
+ [<ffffffff811e4e5c>] ? __vunmap+0xec/0x160
+ [<ffffffff81114f63>] load_module+0x2cb3/0x3b20
+ [<ffffffff8110fd70>] ? m_show+0x240/0x240
+ [<ffffffff81115f06>] SyS_finit_module+0x76/0x80
+ [<ffffffff81cd3129>] system_call_fastpath+0x12/0x17
+Memory state around the buggy address:
+ ffff8800693bc300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc380: fc fc 00 00 00 00 00 00 00 00 00 00 00 00 00 fc
+ ffff8800693bc400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc500: fc fc fc fc fc fc fc fc fc fc fc 00 00 00 00 00
+>ffff8800693bc580: 00 00 00 00 00 00 00 00 00 00 03 fc fc fc fc fc
+                                                 ^
+ ffff8800693bc600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc700: fc fc fc fc fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+==================================================================
+
+The header of the report describes what kind of bug happened and what kind of
+access caused it. It's followed by the description of the accessed slub object
+(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+the description of the accessed memory page.
+
+In the last section the report shows memory state around the accessed address.
+Reading this part requires some understanding of how KASAN works.
+
+The state of each aligned 8-byte region of memory is encoded in one shadow byte.
+Those 8 bytes can be accessible, partially accessible, freed or a redzone.
+We use the following encoding for each shadow byte: 0 means that all 8 bytes
+of the corresponding memory region are accessible; number N (1 <= N <= 7) means
+that the first N bytes are accessible, and other (8 - N) bytes are not;
+any negative value indicates that the entire 8-byte word is inaccessible.
+We use different negative values to distinguish between different kinds of
+inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h).
+
+In the report above the arrows point to the shadow byte 03, which means that
+the accessed address is partially accessible.
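+
+As a worked example from the report above: the accessed address
+0xffff8800693bc5d3 is at offset 0x7b = 123 from the object start
+0xffff8800693bc558.  Since 123 = 15*8 + 3, the object covers 15 fully
+accessible 8-byte regions plus the first 3 bytes of the next one, so that
+region's shadow byte is 03; the 1-byte write at offset 123 falls into the
+remaining, inaccessible part of that region, hence the report.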
+
+
+2. Implementation details
+=========================
+
+From a high level, our approach to memory error detection is similar to that
+of kmemcheck: use shadow memory to record whether each byte of memory is safe
+to access, and use compile-time instrumentation to check shadow memory on each
+memory access.
+
+AddressSanitizer dedicates 1/8 of kernel memory to its shadow memory
+(e.g. 16TB to cover 128TB on x86_64) and uses direct mapping with a scale and
+offset to translate a memory address to its corresponding shadow address.
+
+Here is the function which translates an address to its corresponding shadow
+address:
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
+
+where KASAN_SHADOW_SCALE_SHIFT = 3.
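+
+As an illustration (assuming KASAN_SHADOW_OFFSET is 0xdffffc0000000000, the
+x86_64 value), the shadow of the start of the kernel address space is
+
+	(0xffff800000000000 >> 3) + 0xdffffc0000000000 == 0xffffec0000000000
+
+which matches the start of the kasan shadow region listed in
+Documentation/x86/x86_64/mm.txt.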
+
+Compile-time instrumentation is used for checking memory accesses. The compiler
+inserts function calls (__asan_load*(addr), __asan_store*(addr)) before each
+memory access of size 1, 2, 4, 8 or 16. These functions check whether the
+memory access is valid or not by checking the corresponding shadow memory.
+
+GCC 5.0 can also perform inline instrumentation. Instead of making function
+calls, GCC directly inserts the code that checks the shadow memory. This
+option significantly enlarges the kernel, but it gives a 1.1x-2x performance
+boost over an outline-instrumented kernel.
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is a function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts,
+and instrumentation of some inherently non-deterministic parts of the kernel
+is disabled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+        CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+        mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+#define COVER_SIZE			(64<<10)
+
+int main(int argc, char **argv)
+{
+	int fd;
+	unsigned long *cover, n, i;
+
+	/* A single fd descriptor allows coverage collection on a single
+	 * thread.
+	 */
+	fd = open("/sys/kernel/debug/kcov", O_RDWR);
+	if (fd == -1)
+		perror("open"), exit(1);
+	/* Setup trace mode and trace size. */
+	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+		perror("ioctl"), exit(1);
+	/* Mmap buffer shared between kernel- and user-space. */
+	cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+				     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if ((void*)cover == MAP_FAILED)
+		perror("mmap"), exit(1);
+	/* Enable coverage collection on the current thread. */
+	if (ioctl(fd, KCOV_ENABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Reset coverage from the tail of the ioctl() call. */
+	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+	/* That's the target syscall. */
+	read(-1, NULL, 0);
+	/* Read number of PCs collected. */
+	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+	for (i = 0; i < n; i++)
+		printf("0x%lx\n", cover[i + 1]);
+	/* Disable coverage collection for the current thread. After this call
+	 * coverage can be enabled for a different thread.
+	 */
+	if (ioctl(fd, KCOV_DISABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Free resources. */
+	if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+		perror("munmap"), exit(1);
+	if (close(fd))
+		perror("close"), exit(1);
+	return 0;
+}
+
+After piping through addr2line, the output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps coverage buffer and then forks child processes in a loop. Child processes
+only need to enable coverage (disable happens automatically on thread end).
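+
+A sketch of that pattern (error handling omitted; run_one_test() is a
+placeholder for the actual test body):
+
+	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
+	unsigned long *cover;
+
+	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
+	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	for (;;) {
+		if (fork() == 0) {
+			/* Child: enable collection on the inherited fd,
+			 * run one test and exit; KCOV_DISABLE is implied
+			 * when the thread ends. */
+			ioctl(fd, KCOV_ENABLE, 0);
+			__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+			run_one_test();
+			_exit(0);
+		}
+		/* Parent: wait for the child, then read cover[0] entries. */
+		wait(NULL);
+	}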
--- /dev/null
+++ b/Documentation/target/mhvtl.txt
@@ -0,0 +1,19 @@
+mhvtl: A Virtual Tape & Library system.
+
+This package is composed of a kernel module (mhvtl) which is a pseudo HBA.
+
+The vtl is basically a stripped down scsi_debug kernel module + a char dev
+'back end' to pass the SCSI commands through to user space daemons. It is the
+user space daemons' responsibility to respond to and process the SCSI commands.
+
+See the INSTALL file for compile and install instructions.
+
+This has been my first attempt at kernel level programming. Suggestions and
+comments always welcome. It has also been many years since I have done any c
+programming. Please be gentle..
+
+Mark Harvey
+markh794@gmail.com
+mark.harvey@veritas.com
+
+https://github.com/markh794/mhvtl
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1429,6 +1429,7 @@ struct kvm_irq_routing_entry {
 	union {
 		struct kvm_irq_routing_irqchip irqchip;
 		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_hv_sint hv_sint;
 		__u32 pad[8];
 	} u;
 };
@@ -1436,6 +1437,7 @@ struct kvm_irq_routing_entry {
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_HV_SINT 4
 
 No flags are specified so far, the corresponding field must be set to zero.
 
@@ -1451,12 +1453,16 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
 On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
 feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
 address_hi bits 31-8 provide bits 31-8 of the destination id.  Bits 7-0 of
 address_hi must be zero.
 
-
 4.53 KVM_ASSIGN_SET_MSIX_NR
 
 Capability: none
@@ -2971,6 +2977,34 @@ HVC instruction based PSCI call from the vcpu. The 'type' field describes
 the system-level event type. The 'flags' field describes architecture
 specific flags for the system-level event.
 
+		struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+			__u32 type;
+			union {
+				struct {
+					__u32 msr;
+					__u64 control;
+					__u64 evt_page;
+					__u64 msg_page;
+				} synic;
+				struct {
+					__u64 input;
+					__u64 result;
+					__u64 params[2];
+				} hcall;
+			} u;
+		};
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
+Indicates that the VCPU exits into userspace to process some tasks
+related to Hyper-V emulation.
+Valid values for 'type' are:
+	KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about
+Hyper-V SynIC state change. Notification is used to remap SynIC
+event/message pages and to enable/disable SynIC messages/events processing
+in userspace.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -3244,3 +3278,34 @@ available, means that that the kernel has an implementation of the
 H_RANDOM hypercall backed by a hardware random-number generator.
 If present, the kernel H_RANDOM handler can be enabled for guest use
 with the KVM_CAP_PPC_ENABLE_HCALL capability.
+
+8.2 KVM_CAP_HYPERV_SYNIC
+
+Architectures: x86
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel has an implementation of the
+Hyper-V Synthetic interrupt controller (SynIC). Hyper-V SynIC is
+used to support Windows Hyper-V based guest paravirt drivers (VMBus).
+
+In order to use SynIC, it has to be activated by setting this
+capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
+will disable the use of APIC hardware virtualization even if supported
+by the CPU, as it's incompatible with SynIC auto-EOI behavior.
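+
+For example, userspace could activate it with something like the following
+(a sketch, assuming vcpu_fd is the vcpu file descriptor):
+
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_HYPERV_SYNIC,
+	};
+
+	ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);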
+
+8.11 KVM_CAP_HYPERV_SYNIC2
+
+Architectures: x86
+
+This capability enables a newer version of Hyper-V Synthetic interrupt
+controller (SynIC).  The only difference with KVM_CAP_HYPERV_SYNIC is that KVM
+doesn't clear SynIC message and event flags pages when they are enabled by
+writing to the respective MSRs.
+
+8.12 KVM_CAP_HYPERV_VP_INDEX
+
+Architectures: x86
+
+This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr.  Its
+value is used to denote the target vcpu for a SynIC interrupt.  For
+compatibility, KVM initializes this msr to KVM's internal vcpu index.  When this
+capability is absent, userspace can still query this msr's value.
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -166,10 +166,11 @@ MSR_KVM_SYSTEM_TIME: 0x12
 MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 	data: Bits 63-6 hold 64-byte aligned physical address of a
 	64 byte memory area which must be in guest RAM and must be
-	zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
+	zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1
 	when asynchronous page faults are enabled on the vcpu 0 when
 	disabled. Bit 1 is 1 if asynchronous page faults can be injected
-	when vcpu is in cpl == 0.
+	when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults
+	are delivered to L1 as #PF vmexits.
 
 	First 4 byte of 64 byte memory location will be written to by
 	the hypervisor at the time of asynchronous page fault (APF)
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -12,6 +12,8 @@ hugetlbpage.txt
 	- a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
 	- explains what hwpoison is
+idle_page_tracking.txt
+	- description of the idle page tracking feature.
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
 locking
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.txt
@@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW
 A cleancache "backend" that provides transcendent memory registers itself
 to the kernel's cleancache "frontend" by calling cleancache_register_ops,
 passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
+The functions provided must conform to certain semantics as follows:
 
 Most important, cleancache is "ephemeral".  Pages which are copied into
 cleancache have an indefinite lifetime which is completely unknowable
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.txt
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature allows tracking which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of a single read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
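+
+As a minimal sketch (the PFN value is purely illustrative), marking one page
+idle and checking it later could look like:
+
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		unsigned long pfn = 0x1234;		/* illustrative PFN */
+		off_t off = pfn / 64 * 8;		/* containing 64-bit word */
+		uint64_t word = 1ULL << (pfn % 64);
+		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
+
+		if (fd == -1)
+			perror("open"), exit(1);
+		/* Writes are OR-ed into the bitmap: this marks the page idle. */
+		if (pwrite(fd, &word, sizeof(word), off) != sizeof(word))
+			perror("pwrite"), exit(1);
+		/* ... let the workload run for a while ... */
+		if (pread(fd, &word, sizeof(word), off) != sizeof(word))
+			perror("pread"), exit(1);
+		printf("pfn %#lx is %s\n", pfn,
+		       word & (1ULL << (pfn % 64)) ? "idle" : "referenced");
+		close(fd);
+		return 0;
+	}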
+
+That said, in order to estimate the amount of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -63,6 +63,10 @@ reading files in /proc.
     21. KSM
     22. THP
 
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
 Short descriptions to the page flags:
 
  0. LOCKED
--- /dev/null
+++ b/Documentation/vm/remap_file_pages.txt
@@ -0,0 +1,27 @@
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
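+
+As an illustration of what a nonlinear mapping looked like from userspace
+(a sketch; data.bin and the 4096-byte page size are assumptions):
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		size_t pg = 4096;
+		int fd = open("data.bin", O_RDWR);	/* at least 2 pages long */
+		char *p;
+
+		if (fd == -1)
+			perror("open"), exit(1);
+		p = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+		if (p == MAP_FAILED)
+			perror("mmap"), exit(1);
+		/* Swap the order of the first two file pages within the mapping. */
+		if (remap_file_pages(p, pg, 0, 1, 0) ||
+		    remap_file_pages(p + pg, pg, 0, 0, 0))
+			perror("remap_file_pages"), exit(1);
+		return 0;
+	}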
+
+Supporting nonlinear mappings requires a significant amount of non-trivial
+code in the kernel virtual memory subsystem, including hot paths. Also, to
+make nonlinear mappings work, the kernel needs a way to distinguish normal
+page table entries from entries with a file offset (pte_file). The kernel
+reserves a flag in the PTE for this purpose. PTE flags are a scarce resource,
+especially on some CPU architectures. It would be nice to free up the flag
+for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is now deprecated and replaced with an emulation. The emulation
+creates new VMAs instead of nonlinear mappings. It will work slower for the
+rare users of remap_file_pages(), but the ABI is preserved.
+
+One side effect of the emulation (apart from performance) is that the user can
+hit the vm.max_map_count limit more easily due to additional VMAs. See the
+comment for DEFAULT_MAX_MAP_COUNT for more details on the limit.
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -11,6 +11,9 @@ ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
 ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
 ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
+... unused hole ...
+ffffec0000000000 - fffffc0000000000 (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,22 @@ RHEL_RELEASE = 693.11.6
 RHEL_DRM_VERSION = 4
 RHEL_DRM_PATCHLEVEL = 10
 RHEL_DRM_SUBLEVEL = 13
+# VZVERSION = ovz.40.4
+VZVERSION = ovz.custom
+
+ifeq ($(VZVERSION), ovz.custom)
+  GIT_DIR := .git
+  ifneq ("$(wildcard $(GIT_DIR) )", "")
+    VZVERSION := $(shell git describe --abbrev=0 2>/dev/null | \
+		   sed -r 's/^.*\.vz7\.//')
+  else
+    VZVERSION := custom
+  endif
+
+  ifeq ($(EXTRAVERSION),)
+    EXTRAVERSION := -$(RHEL_RELEASE).ovz.$(VZVERSION)
+  endif
+endif
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
@@ -367,6 +383,7 @@ LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
+CFLAGS_KCOV	= -fsanitize-coverage=trace-pc
 
 
 # Use USERINCLUDE when you must reference the UAPI directories only.
@@ -419,14 +436,14 @@ KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
 KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
 KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION)
 
-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
+export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION
 export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
 export CPP AR NM STRIP OBJCOPY OBJDUMP
 export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_KCOV
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
 export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -647,6 +664,14 @@ else
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
+ifdef CONFIG_KCOV
+  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+    $(warning Cannot use CONFIG_KCOV: \
+             -fsanitize-coverage=trace-pc is not supported by compiler)
+    CFLAGS_KCOV =
+  endif
+endif
+
 # This warning generated too much noise in a regular build.
 # Use make W=1 to enable this warning (see scripts/Makefile.build)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
@@ -717,6 +742,8 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
 	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
+include $(srctree)/scripts/Makefile.kasan
+
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
 KBUILD_CPPFLAGS += $(KCPPFLAGS)
 KBUILD_AFLAGS += $(KAFLAGS)
@@ -924,7 +951,8 @@ define filechk_utsrelease.h
 	  echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2;    \
 	  exit 1;                                                         \
 	fi;                                                               \
-	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
+	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; 		  \
+		echo \#define VZVERSION \"$(VZVERSION)\";)
 endef
 
 define filechk_version.h
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -549,6 +549,14 @@ config ARCH_MMAP_RND_COMPAT_BITS
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_rnd_compat_bits tunable
 
+config HAVE_ARCH_COMPAT_MMAP_BASES
+	bool
+	help
+	  This allows 64bit applications to invoke the 32-bit mmap() syscall
+	  and, vice versa, 32-bit applications to call the 64-bit mmap().
+	  Required for applications doing different bitness syscalls.
+
+
 #
 # ABI hall of shame
 #
--- a/arch/arc/mm/mmap.c
+++ b/arch/arc/mm/mmap.c
@@ -64,7 +64,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/arm/include/asm/exception.h
+++ b/arch/arm/include/asm/exception.h
@@ -7,7 +7,7 @@
 #ifndef __ASM_ARM_EXCEPTION_H
 #define __ASM_ARM_EXCEPTION_H
 
-#include <linux/ftrace.h>
+#include <linux/interrupt.h>
 
 #define __exception	__attribute__((section(".exception.text")))
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -40,7 +40,7 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -100,6 +100,7 @@ SECTIONS
 			*(.exception.text)
 			__exception_text_end = .;
 			IRQENTRY_TEXT
+			SOFTIRQENTRY_TEXT
 			TEXT_TEXT
 			SCHED_TEXT
 			LOCK_TEXT
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -89,7 +89,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -140,7 +140,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-				(!vma || addr + len <= vma->vm_start))
+				(!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -29,8 +29,8 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+				    GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 enum aarch64_reloc_op {
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ SECTIONS
 			*(.exception.text)
 			__exception_text_end = .;
 			IRQENTRY_TEXT
+			SOFTIRQENTRY_TEXT
 			TEXT_TEXT
 			SCHED_TEXT
 			LOCK_TEXT
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -35,6 +35,7 @@ SECTIONS
 #endif
 		LOCK_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		KPROBES_TEXT
 #ifdef CONFIG_ROMKERNEL
 		__sinittext = .;
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -78,6 +78,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		KPROBES_TEXT
 		*(.fixup)
 		*(.gnu.warning)
--- a/arch/frv/mm/elf-fdpic.c
+++ b/arch/frv/mm/elf-fdpic.c
@@ -74,7 +74,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(current->mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			goto success;
 	}
 
--- a/arch/metag/kernel/vmlinux.lds.S
+++ b/arch/metag/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.text.*)
 	*(.gnu.warning)
 	}
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS {
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		. = ALIGN (4) ;
 		_etext = . ;
 	}
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -23,6 +23,7 @@
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
+#include <linux/numa.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
@@ -46,7 +47,7 @@ static DEFINE_SPINLOCK(dbe_lock);
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
-				GFP_KERNEL, PAGE_KERNEL, -1,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -56,6 +56,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.text.*)
 		*(.fixup)
 		*(.gnu.warning)
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -92,7 +92,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
 	  LOCK_TEXT
 	  KPROBES_TEXT
 	  IRQENTRY_TEXT
+	  SOFTIRQENTRY_TEXT
 	  *(.fixup)
 	  *(.text.__*)
 	  _etext = .;
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -219,7 +219,7 @@ void *module_alloc(unsigned long size)
 	 * init_data correctly */
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
 				    GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_RWX, NUMA_NO_NODE,
+				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -59,6 +59,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.text.do_softirq)
 		*(.text.sys_exit)
 		*(.text.do_sigaltstack)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1108,6 +1108,8 @@ endif
 config	ARCH_RANDOM
 	def_bool n
 
+source "kernel/Kconfig.openvz"
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
@@ -1118,6 +1120,8 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig"
 
 source "lib/Kconfig"
 
+source "kernel/bc/Kconfig"
+
 source "arch/powerpc/Kconfig.debug"
 
 source "security/Kconfig"
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -368,7 +368,7 @@ SYSCALL_SPU(memfd_create) /* sys_memfd_create */
 SYSCALL(ni_syscall) /* sys_bpf */
 SYSCALL(ni_syscall) /* sys_execveat */
 PPC64ONLY(switch_endian)
-SYSCALL_SPU(userfaultfd)
+SYSCALL_SPU(userfaultfd) /* 364 */
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
@@ -383,4 +383,16 @@ SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
-SYSCALL(copy_file_range)
+SYSCALL(copy_file_range) /* 379 */
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(getluid)
+SYSCALL(setluid)
+SYSCALL(setublimit)
+SYSCALL(ubstat) /* 391 */
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -11,8 +11,7 @@
 
 #include <uapi/asm/unistd.h>
 
-
-#define __NR_syscalls		380
+#define __NR_syscalls		392
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
--- a/arch/powerpc/include/uapi/asm/ioctls.h
+++ b/arch/powerpc/include/uapi/asm/ioctls.h
@@ -116,4 +116,6 @@
 #define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
 #define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
 
+#define TIOSAK		_IO('T', 0x66)	/* "Secure Attention Key" */
+
 #endif	/* _ASM_POWERPC_IOCTLS_H */
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -389,4 +389,9 @@
 #define __NR_userfaultfd	364
 #define __NR_copy_file_range	379
 
+#define __NR_getluid		388
+#define __NR_setluid		389
+#define __NR_setublimit		390
+#define __NR_ubstat		391
+
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/personality.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/ve.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -56,6 +56,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 
 #ifdef CONFIG_PPC32
 		*(.got1)
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -763,7 +763,7 @@ static int __init spufs_init(void)
 	ret = -ENOMEM;
 	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
 			sizeof(struct spufs_inode_info), 0,
-			SLAB_HWCACHE_ALIGN, spufs_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once);
 
 	if (!spufs_inode_cache)
 		goto out;
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -50,7 +50,7 @@ void *module_alloc(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE,
+				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 #endif
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -42,6 +42,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 	} :text = 0x0700
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 		_etext = .;		/* End of text section */
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -63,7 +63,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -113,7 +113,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -29,7 +29,7 @@ static void *module_map(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #else
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -119,7 +119,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -182,7 +182,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -47,6 +47,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.gnu.warning)
 	} = 0
 	_etext = .;
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -118,7 +118,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, HPAGE_SIZE);
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 	if (mm->get_unmapped_area == arch_get_unmapped_area)
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -40,6 +40,7 @@ SECTIONS
     HEAD_TEXT
     SCHED_TEXT
     LOCK_TEXT
+    SOFTIRQENTRY_TEXT
     __fix_text_end = .;   /* tile-cpack won't rearrange before this */
     TEXT_TEXT
     *(.text.*)
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -269,7 +269,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 	if (current->mm->get_unmapped_area == arch_get_unmapped_area)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,8 +84,11 @@ config X86
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+	select ARCH_HAS_KCOV			if X86_64
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_HAS_ELF_RANDOMIZE
+	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
@@ -1280,16 +1283,25 @@ config DIRECT_GBPAGES
 	  support it. This can improve the kernel's performance a tiny bit by
 	  reducing TLB pressure. If in doubt, say "Y".
 
-config TRACK_DIRTY_PAGES
-	bool "Enable dirty page tracking"
-	default n
-	depends on !KMEMCHECK
-	---help---
-	  Turning this on enables tracking of re-dirtied and
-	  changed pages.  This is needed by the Live Kernel
-	  Self Migration project (lksm.sourceforge.net) to perform
-	  live copying of memory and system state to another system.
-	  Most users will say n here.
+#
+# This tracker is breaking MEM_SOFT_DIRTY option because
+# it conflicts with the bits used there.
+#
+# So turn it off permanently because vanilla kernel already
+# has a tracker, no need to invent new one!
+#
+# 	-- cyrillos
+#
+#config TRACK_DIRTY_PAGES
+#	bool "Enable dirty page tracking"
+#	default n
+#	depends on !KMEMCHECK
+#	---help---
+#	  Turning this on enables tracking of re-dirtied and
+#	  changed pages.  This is needed by the Live Kernel
+#	  Self Migration project (lksm.sourceforge.net) to perform
+#	  live copying of memory and system state to another system.
+#	  Most users will say n here.
 
 # Common NUMA Features
 config NUMA
@@ -2584,6 +2596,8 @@ config VMD
 	  single domain. If you know your system provides one of these and
 	  has devices attached to it, say Y; if you are not sure, say N.
 
+source "kernel/Kconfig.openvz"
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
@@ -2601,3 +2615,5 @@ source "crypto/Kconfig"
 source "arch/x86/kvm/Kconfig"
 
 source "lib/Kconfig"
+
+source "kernel/bc/Kconfig"
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -11,11 +11,20 @@
 
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was the insertion of the __sanitizer_cov_trace_pc()
+# callback into the middle of the per-cpu data enabling code. Thus the callback
+# observed inconsistent state and crashed. We are interested mostly in syscall
+# coverage, so boot code is not interesting anyway.
+KCOV_INSTRUMENT		:= n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
+KASAN_SANITIZE := n
+
 SVGA_MODE	:= -DSVGA_MODE=NORMAL_VGA
 
 targets		:= vmlinux.bin setup.bin setup.elf bzImage
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -32,6 +32,10 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
 
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
+KASAN_SANITIZE := n
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 # Compressed kernel should be built as PIE since it may be loaded at any
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -7,6 +7,7 @@
  *
  * ----------------------------------------------------------------------- */
 
+#include <linux/types.h>
 #include <linux/efi.h>
 #include <linux/pci.h>
 #include <asm/efi.h>
@@ -14,8 +15,7 @@
 #include <asm/desc.h>
 #include <asm/bootparam_utils.h>
 
-#undef memcpy			/* Use memcpy from misc.c */
-
+#include "../string.h"
 #include "eboot.h"
 
 static efi_system_table_t *sys_table;
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -8,6 +8,7 @@
  */
 #undef CONFIG_PARAVIRT
 #undef CONFIG_KAISER
+#undef CONFIG_KASAN
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2029,8 +2029,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	intel_pmu_lbr_read();
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
-		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
-		perf_event_print_debug();
+		static bool warned = false;
+		if (!warned) {
+			pr_warn("perfevents: irq loop stuck!\n");
+			dump_stack();
+			perf_event_print_debug();
+			warned = true;
+		}
 		intel_pmu_reset();
 		goto done;
 	}
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -19,6 +19,7 @@
 #include <linux/personality.h>
 #include <linux/compat.h>
 #include <linux/binfmts.h>
+#include <linux/ptrace.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
@@ -34,10 +35,28 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+	/* Don't leak in-kernel non-uapi flags to user-space */
+	if (oact)
+		oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+	if (!act)
+		return;
+
+	/* Don't let these flags be set from userspace */
+	act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+	if (is_ia32_task())
+		act->sa.sa_flags |= SA_IA32_ABI;
+	if (is_x32_task())
+		act->sa.sa_flags |= SA_X32_ABI;
+}
+
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
+		bool x32_ABI)
 {
 	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
 
 	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
 		return -EFAULT;
@@ -71,7 +90,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 				put_user_ex(from->si_arch, &to->si_arch);
 				break;
 			case __SI_CHLD >> 16:
-				if (ia32) {
+				if (!x32_ABI) {
 					put_user_ex(from->si_utime, &to->si_utime);
 					put_user_ex(from->si_stime, &to->si_stime);
 				} else {
@@ -105,6 +124,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 	return err;
 }
 
+/* from syscall's path, where we know the ABI */
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+{
+	return __copy_siginfo_to_user32(to, from, is_x32_task());
+}
+
 int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 {
 	int err = 0;
@@ -473,7 +498,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
 		put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
 	} put_user_catch(err);
 
-	err |= copy_siginfo_to_user32(&frame->info, &ksig->info);
+	err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false);
 	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				     regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
 				   size_t count)
 {
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,4 +5,3 @@ genhdr-y += unistd_64.h
 genhdr-y += unistd_x32.h
 
 generic-y += clkdev.h
-generic-y += mm-arch-hooks.h
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -261,19 +261,17 @@ struct compat_shmid64_ds {
 /*
  * The type of struct elf_prstatus.pr_reg in compatible core dumps.
  */
-#ifdef CONFIG_X86_X32_ABI
 typedef struct user_regs_struct compat_elf_gregset_t;
 
-#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216)
-#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296)
-#define SET_PR_FPVALID(S,V) \
-  do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \
+/* Full regset -- prstatus on x32, otherwise on ia32 */
+#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
+#define SET_PR_FPVALID(S, V, R) \
+  do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
   while (0)
 
+#ifdef CONFIG_X86_X32_ABI
 #define COMPAT_USE_64BIT_TIME \
 	(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
-#else
-typedef struct user_regs_struct32 compat_elf_gregset_t;
 #endif
 
 /*
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -107,6 +107,7 @@
 #define X86_FEATURE_APERFMPERF	(3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU	(3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_CPUID_FAULTING (3*32+31) /* cpuid faulting */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
@@ -442,6 +443,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_eager_fpu	boot_cpu_has(X86_FEATURE_EAGER_FPU)
 #define cpu_has_topoext		boot_cpu_has(X86_FEATURE_TOPOEXT)
 #define cpu_has_bpext		boot_cpu_has(X86_FEATURE_BPEXT)
+#define cpu_has_cpuid_faulting	boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
 
 #if __GNUC__ >= 4
 /*
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -68,6 +68,16 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 				 u32 type, u64 attribute);
 
+/*
+ * CONFIG_KASAN may redefine memset to __memset.  The __memset() function
+ * exists only in the kernel binary; the EFI stub is linked into a separate
+ * binary and doesn't have it, so use the standard memset from
+ * arch/x86/boot/compressed/string.c.  The same applies to memcpy and memmove.
+ */
+#undef memcpy
+#undef memset
+#undef memmove
+
 #endif /* CONFIG_X86_32 */
 
 extern int add_efi_memmap;
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -286,8 +286,23 @@ do {									\
 	}								\
 } while (0)
 
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+	return IS_ENABLED(CONFIG_X86_32) ||
+	       (IS_ENABLED(CONFIG_COMPAT) &&
+		test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long tasksize_32bit(void);
+extern unsigned long tasksize_64bit(void);
+extern unsigned long get_mmap_base(int is_legacy);
+
 #ifdef CONFIG_X86_32
 
+#define __STACK_RND_MASK(is32bit) (0x7ff)
 #define STACK_RND_MASK (0x7ff)
 
 #define VDSO_HIGH_BASE		(__fix_to_virt(FIX_VDSO))
@@ -301,7 +316,8 @@ do {									\
 #define VDSO_HIGH_BASE		0xffffe000U /* CONFIG_COMPAT_VDSO address */
 
 /* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
 
 #define ARCH_DLINFO							\
 do {									\
@@ -345,20 +361,14 @@ extern int x32_setup_additional_pages(struct linux_binprm *bprm,
 extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static inline int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_ADDR32))
-		return 1;
+#ifdef CONFIG_X86_64
+extern bool vdso_or_vvar_present(struct mm_struct *mm);
+extern int do_map_vdso_64(unsigned long addr);
+# ifdef CONFIG_COMPAT
+extern int do_map_vdso_32(unsigned long addr);
+# endif
 #endif
-	return 0;
-}
+
 
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -22,6 +22,7 @@
 #include <asm/uaccess.h>
 #include <asm/xsave.h>
 #include <asm/smap.h>
+#include <asm/signal.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/sigcontext32.h>
@@ -38,6 +39,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
 # define ia32_setup_rt_frame	__setup_rt_frame
 #endif
 
+#ifdef CONFIG_COMPAT
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to,
+		const siginfo_t *from, bool x32_ABI);
+#endif
+
+
 extern unsigned int mxcsr_feature_mask;
 extern void fpu_init(void);
 extern void eager_fpu_init(void);
@@ -69,20 +76,21 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
-static inline int is_ia32_compat_frame(void)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
 {
 	return config_enabled(CONFIG_IA32_EMULATION) &&
-	       test_thread_flag(TIF_IA32);
+		ksig->ka.sa.sa_flags & SA_IA32_ABI;
 }
 
-static inline int is_ia32_frame(void)
+static inline int is_ia32_frame(struct ksignal *ksig)
 {
-	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
 }
 
-static inline int is_x32_frame(void)
+static inline int is_x32_frame(struct ksignal *ksig)
 {
-	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+	return config_enabled(CONFIG_X86_X32_ABI) &&
+		ksig->ka.sa.sa_flags & SA_X32_ABI;
 }
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -110,7 +110,7 @@ static inline bool setup_remapped_irq(int irq,
 	return false;
 }
 
-int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+static inline int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
 {
 	return -ENOSYS;
 }
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+/*
+ * Compiler uses shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so shadow
+ * for kernel really starts from compiler's shadow offset +
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
+ */
+#define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
+					(0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KASAN
+void __init kasan_early_init(void);
+void __init kasan_init(void);
+#else
+static inline void kasan_early_init(void) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
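
A worked example may make the shadow layout in kasan.h above easier to follow. The sketch below is illustrative only and not part of the patch; it assumes the x86_64 scale shift of 3 and the shadow offset typically configured at build time (0xdffffc0000000000), and the helper name kasan_mem_to_shadow() is chosen here just for illustration.

/* Illustrative sketch, not part of the patch. */
#define KASAN_SHADOW_SCALE_SHIFT	3
#define KASAN_SHADOW_OFFSET		0xdffffc0000000000UL	/* assumed build-time value */

static inline unsigned long kasan_mem_to_shadow(unsigned long addr)
{
	/* one shadow byte describes 2^3 = 8 bytes of kernel memory */
	return KASAN_SHADOW_OFFSET + (addr >> KASAN_SHADOW_SCALE_SHIFT);
}

/*
 * With that offset, the start of the kernel address space maps to
 * 0xdffffc0000000000 + (0xffff800000000000 >> 3) = 0xffffec0000000000,
 * which is what KASAN_SHADOW_START above evaluates to.
 */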
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -23,6 +23,7 @@ struct x86_exception {
 	u16 error_code;
 	bool nested_page_fault;
 	u64 address; /* cr2 or nested page fault gpa */
+	u8 async_page_fault;
 };
 
 /*
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/pvclock_gtod.h>
 #include <linux/clocksource.h>
 #include <linux/irqbypass.h>
+#include <linux/hyperv.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -373,9 +374,40 @@ enum {
 	KVM_DEBUGREG_RELOAD = 4,
 };
 
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+	struct hrtimer timer;
+	int index;
+	u64 config;
+	u64 count;
+	u64 exp_time;
+	struct hv_message msg;
+	bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC) */
+struct kvm_vcpu_hv_synic {
+	u64 version;
+	u64 control;
+	u64 msg_page;
+	u64 evt_page;
+	atomic64_t sint[HV_SYNIC_SINT_COUNT];
+	atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+	DECLARE_BITMAP(auto_eoi_bitmap, 256);
+	DECLARE_BITMAP(vec_bitmap, 256);
+	bool active;
+	bool dont_zero_synic_pages;
+};
+
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
+	u32 vp_index;
 	u64 hv_vapic;
+	s64 runtime_offset;
+	struct kvm_vcpu_hv_synic synic;
+	struct kvm_hyperv_exit exit;
+	struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+	DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
 };
 
 struct kvm_mtrr_range {
@@ -466,6 +498,7 @@ struct kvm_vcpu_arch {
 		bool reinject;
 		u8 nr;
 		u32 error_code;
+		u8 nested_apf;
 	} exception;
 
 	struct kvm_queued_interrupt {
@@ -564,6 +597,9 @@ struct kvm_vcpu_arch {
 		u64 msr_val;
 		u32 id;
 		bool send_user_only;
+		u32 host_apf_reason;
+		unsigned long nested_apf_token;
+		bool delivery_as_pf_vmexit;
 	} apf;
 
 	/* OSVW MSRs (AMD only) */
@@ -637,6 +673,8 @@ struct kvm_hv {
 	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
 	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
 	u64 hv_crash_ctl;
+
+	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
@@ -1257,7 +1295,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v, bool make_req);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
--- /dev/null
+++ b/arch/x86/include/asm/mm-arch-hooks.h
@@ -0,0 +1,20 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_MM_ARCH_HOOKS_H
+#define _ASM_X86_MM_ARCH_HOOKS_H
+
+extern void arch_remap(struct mm_struct *mm,
+		unsigned long old_start, unsigned long old_end,
+		unsigned long new_start, unsigned long new_end);
+#define arch_remap arch_remap
+
+#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -52,6 +52,7 @@
 #define MSR_MTRRcap			0x000000fe
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -5,17 +5,23 @@
 #include <asm/kaslr.h>
 #endif
 
-#define THREAD_SIZE_ORDER	2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define DOUBLEFAULT_STACK 1
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -128,7 +128,8 @@ do {							\
 do {									\
 	typedef typeof(var) pao_T__;					\
 	const int pao_ID__ = (__builtin_constant_p(val) &&		\
-			      ((val) == 1 || (val) == -1)) ? (val) : 0;	\
+			      ((val) == 1 || (val) == -1)) ?		\
+				(int)(val) : 0;				\
 	if (0) {							\
 		pao_T__ pao_tmp__;					\
 		pao_tmp__ = (val);					\
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -4,6 +4,7 @@
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
 #include <linux/pagemap.h>
+#include <linux/sched.h>	/* for init_mm */
 
 static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
 
@@ -81,11 +82,15 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	struct page *page;
-	page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_REPEAT | __GFP_ZERO;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	page = alloc_pages(gfp, 0);
 	if (!page)
 		return NULL;
 	if (!pgtable_pmd_page_ctor(page)) {
-		__free_pages(page, 0);
+		__free_page(page);
 		return NULL;
 	}
 	return (pmd_t *)page_address(page);
@@ -125,7 +130,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_REPEAT;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	return (pud_t *)get_zeroed_page(gfp);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -60,93 +60,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
- * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
- * into this range.
- */
-#define PTE_FILE_MAX_BITS	28
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT4		(_PAGE_BIT_SOFT_DIRTY + 1)
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-#define PTE_FILE_BITS3		(PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
-
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> (PTE_FILE_SHIFT1))				\
-	  & ((1U << PTE_FILE_BITS1) - 1)))				\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT2))			\
-	    & ((1U << PTE_FILE_BITS2) - 1))				\
-	   << (PTE_FILE_BITS1))						\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT3))			\
-	    & ((1U << PTE_FILE_BITS3) - 1))				\
-	   << (PTE_FILE_BITS1 + PTE_FILE_BITS2))			\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT4)))			\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1)					\
-	     & ((1U << PTE_FILE_BITS2) - 1))				\
-	    << PTE_FILE_SHIFT2)						\
-	 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	     & ((1U << PTE_FILE_BITS3) - 1))				\
-	    << PTE_FILE_SHIFT3)						\
-	 + ((((off) >>							\
-	      (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)))	\
-	    << PTE_FILE_SHIFT4)						\
-	 + _PAGE_FILE })
-
-#else /* CONFIG_MEM_SOFT_DIRTY */
-
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range.
- */
-#define PTE_FILE_MAX_BITS	29
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#else
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_FILE + 1)
-#endif
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> PTE_FILE_SHIFT1)				\
-	  & ((1U << PTE_FILE_BITS1) - 1))				\
-	 + ((((pte).pte_low >> PTE_FILE_SHIFT2)				\
-	     & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1)		\
-	 + (((pte).pte_low >> PTE_FILE_SHIFT3)				\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1))	\
-	    << PTE_FILE_SHIFT2)						\
-	 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	    << PTE_FILE_SHIFT3)						\
-	 + _PAGE_FILE })
-
-#endif /* CONFIG_MEM_SOFT_DIRTY */
-
 /* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
-#endif
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -189,18 +189,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
- *
- * For soft-dirty tracking 11 bit is taken from
- * the low part of pte as well.
- */
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off)						\
-	((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS       32
-
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -108,11 +108,6 @@ static inline int pte_write(pte_t pte)
 	return pte_flags(pte) & _PAGE_RW;
 }
 
-static inline int pte_file(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_FILE;
-}
-
 static inline int pte_huge(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_PSE;
@@ -340,21 +335,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
-}
-
 /*
  * Mask out unsupported bits in a present pgprot.  Non-present pgprots
  * can use those bits for other purposes, so leave them be.
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -290,10 +290,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* PUD - Level3 access */
 
 /* PMD  - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |	\
-					    _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
 
 /* PTE - Level 1 access. */
 
@@ -301,11 +297,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
-/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE > _PAGE_BIT_PROTNONE
-#error unsupported PTE bit arrangement
-#endif
-
 /*
  * Encode and de-code a swap entry
  *
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -30,13 +30,6 @@
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-/* Pick a bit unaffected by the "KNL4 erratum": */
-#define _PAGE_BIT_FILE		_PAGE_BIT_PCD
-#else
-#define _PAGE_BIT_FILE		_PAGE_BIT_DIRTY
-#endif
 
 #define _PAGE_PRESENT	(_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
 #define _PAGE_RW	(_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -104,29 +97,6 @@
 #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-/*
- * Do compile-time checks for all the bits that may be set on
- * non-present PTEs
- */
-#if _PAGE_BIT_FILE == _PAGE_BIT_SWP_SOFT_DIRTY
-#error conflicting _PAGE_BIT_FILE
-#endif
-#if _PAGE_BIT_FILE == _PAGE_BIT_PROTNONE
-#error conflicting _PAGE_BIT_FILE
-#endif
-/*
- * Do compile-time checks for all the bits affected by the "KNL4"
- * erratum:
- */
-#if _PAGE_BIT_FILE == _PAGE_BIT_DIRTY
-#error conflicting _PAGE_BIT_FILE
-#endif
-#if _PAGE_BIT_FILE == _PAGE_BIT_ACCESSED
-#error conflicting _PAGE_BIT_FILE
-#endif
-#endif
-
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
@@ -136,7 +106,6 @@
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 /*
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -876,6 +876,7 @@ static inline void spin_lock_prefetch(const void *x)
 /*
  * User space process size: 3GB (default).
  */
+#define IA32_PAGE_OFFSET	PAGE_OFFSET
 #define TASK_SIZE		PAGE_OFFSET
 #define TASK_SIZE_MAX		TASK_SIZE
 #define STACK_TOP		TASK_SIZE
@@ -926,7 +927,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define task_pt_regs(task)                                             \
 ({                                                                     \
        struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \
+				     TOP_OF_KERNEL_STACK_PADDING);     \
        __regs__ - 1;                                                   \
 })
 
@@ -984,7 +986,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
  * This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+#define __TASK_UNMAPPED_BASE(task_size)	(PAGE_ALIGN(task_size / 3))
+#define TASK_UNMAPPED_BASE		__TASK_UNMAPPED_BASE(TASK_SIZE)
 
 #define KSTK_EIP(task)		(task_pt_regs(task)->ip)
 
@@ -1042,4 +1045,7 @@ bool xen_set_default_idle(void);
 
 void stop_this_cpu(void *dummy);
 
+extern void (*set_cpuid_faulting_cb)(bool enable);
+extern void set_cpuid_faulting(bool enable);
+
 #endif /* _ASM_X86_PROCESSOR_H */
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -23,6 +23,10 @@ typedef struct {
 	unsigned long sig[_NSIG_WORDS];
 } sigset_t;
 
+/* non-uapi in-kernel SA_FLAGS that indicate the ABI of a signal frame */
+#define SA_IA32_ABI	0x02000000u
+#define SA_X32_ABI	0x01000000u
+
 #ifndef CONFIG_COMPAT
 typedef sigset_t compat_sigset_t;
 #endif
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -50,7 +50,7 @@ static __always_inline bool static_key_false(struct static_key *key);
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-	set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+	set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else  /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -64,6 +64,31 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int  __tickets_equal(__ticket_t one, __ticket_t two)
+{
+	return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+							__ticket_t head)
+{
+	if (head & TICKET_SLOWPATH_FLAG) {
+		arch_spinlock_t old, new;
+
+		old.tickets.head = head;
+		new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+		old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+
+		/* try to clear slowpath flag when there are no contenders */
+		cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+	}
+}
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return __tickets_equal(lock.tickets.head, lock.tickets.tail);
+}
 
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
@@ -91,91 +116,69 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 	if (likely(inc.head == inc.tail))
 		goto out;
 
-	inc.tail &= ~TICKET_SLOWPATH_FLAG;
 	for (;;) {
 		unsigned count = SPIN_THRESHOLD;
 
 		do {
-			if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
-				goto out;
+			inc.head = READ_ONCE(lock->tickets.head);
+			if (__tickets_equal(inc.head, inc.tail))
+				goto clear_slowpath;
 			cpu_relax();
 		} while (--count);
 		__ticket_lock_spinning(lock, inc.tail);
 	}
-out:	barrier();	/* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+	__ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	arch_spinlock_t old, new;
 
-	old.tickets = ACCESS_ONCE(lock->tickets);
-	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+	old.tickets = READ_ONCE(lock->tickets);
+	if (!__tickets_equal(old.tickets.head, old.tickets.tail))
 		return 0;
 
 	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+	new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
 	/* cmpxchg is a full barrier, so nothing can move before it */
 	return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-					    arch_spinlock_t old)
-{
-	arch_spinlock_t new;
-
-	BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-	/* Perform the unlock on the "before" copy */
-	old.tickets.head += TICKET_LOCK_INC;
-
-	/* Clear the slowpath flag */
-	new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-	/*
-	 * If the lock is uncontended, clear the flag - use cmpxchg in
-	 * case it changes behind our back though.
-	 */
-	if (new.tickets.head != new.tickets.tail ||
-	    cmpxchg(&lock->head_tail, old.head_tail,
-					new.head_tail) != old.head_tail) {
-		/*
-		 * Lock still has someone queued for it, so wake up an
-		 * appropriate waiter.
-		 */
-		__ticket_unlock_kick(lock, old.tickets.head);
-	}
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	if (TICKET_SLOWPATH_FLAG &&
-	    static_key_false(&paravirt_ticketlocks_enabled)) {
-		arch_spinlock_t prev;
+		static_key_false(&paravirt_ticketlocks_enabled)) {
+		__ticket_t head;
 
-		prev = *lock;
-		add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+		BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-		/* add_smp() is a full mb() */
+		head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
 
-		if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-			__ticket_unlock_slowpath(lock, prev);
+		if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+			head &= ~TICKET_SLOWPATH_FLAG;
+			__ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+		}
 	} else
 		__add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return tmp.tail != tmp.head;
+	return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
-	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+	tmp.head &= ~TICKET_SLOWPATH_FLAG;
+	return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended	arch_spin_is_contended
 
@@ -187,8 +190,20 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (arch_spin_is_locked(lock))
+	__ticket_t head = READ_ONCE(lock->tickets.head);
+
+	for (;;) {
+		struct __raw_tickets tmp = READ_ONCE(lock->tickets);
+		/*
+		 * We need to check "unlocked" in a loop; tmp.head == head
+		 * can be a false positive because of overflow.
+		 */
+		if (__tickets_equal(tmp.head, tmp.tail) ||
+				!__tickets_equal(tmp.head, head))
+			break;
+
 		cpu_relax();
+	}
 }
 #endif /* CONFIG_QUEUED_SPINLOCKS */
 
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
 extern void *memcpy(void *to, const void *from, size_t len);
 #else
-extern void *__memcpy(void *to, const void *from, size_t len);
 #define memcpy(dst, src, len)					\
 ({								\
 	size_t __len = (len);					\
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
 void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
 size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
 char *strcat(char *dest, const char *src);
 int strcmp(const char *cs, const char *ct);
 
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the non-instrumented versions of the mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -37,7 +37,6 @@ asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
 asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
 
 long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
-long sys32_vm86_warning(void);
 
 asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
 asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -11,6 +11,33 @@
 #include <asm/page.h>
 #include <asm/types.h>
 
+/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack.  We do it because of a nasty
+ * 32-bit corner case.  On x86_32, the hardware stack frame is
+ * variable-length.  Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0.  On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault.  (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
 /*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -116,6 +116,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
+void do_cpuid_fault(struct pt_regs *);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -5,6 +5,7 @@
  */
 #include <linux/errno.h>
 #include <linux/compiler.h>
+#include <linux/kasan-checks.h>
 #include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -7,6 +7,7 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/lockdep.h>
+#include <linux/kasan-checks.h>
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/page.h>
@@ -59,6 +60,7 @@ static inline unsigned long __must_check copy_from_user(void *to,
 	int sz = __compiletime_object_size(to);
 
 	might_fault();
+	kasan_check_write(to, n);
 	if (likely(sz == -1 || sz >= n))
 		n = _copy_from_user(to, from, n);
 #ifdef CONFIG_DEBUG_VM
@@ -72,7 +74,7 @@ static __always_inline __must_check
 int copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	might_fault();
-
+	kasan_check_read(src, size);
 	return _copy_to_user(dst, src, size);
 }
 
@@ -81,6 +83,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
+	kasan_check_write(dst, size);
 	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
@@ -125,6 +128,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
 
+	kasan_check_read(src, size);
 	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
@@ -220,12 +224,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
+	kasan_check_write(dst, size);
 	return copy_user_generic(dst, (__force const void *)src, size);
 }
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 {
+	kasan_check_read(src, size);
 	return copy_user_generic((__force void *)dst, src, size);
 }
 
@@ -236,6 +242,7 @@ static inline int
 __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
 	might_fault();
+	kasan_check_write(dst, size);
 	return __copy_user_nocache(dst, src, size, 1);
 }
 
@@ -243,6 +250,7 @@ static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
 {
+	kasan_check_write(dst, size);
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,6 +1,20 @@
 #ifndef _ASM_X86_VDSO_H
 #define _ASM_X86_VDSO_H
 
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO64_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO64_SYMBOL(base, name)					\
+({									\
+	extern const char VDSO64_##name[];				\
+	(void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
 extern const char VDSO32_PRELINK[];
 
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -45,6 +45,7 @@
 
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
+DECLARE_VVAR(64, volatile unsigned long, fence_wdog_jiffies64)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
 
 #undef DECLARE_VVAR
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -156,6 +156,12 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET			0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME			0x40000010
+
 /* MSR used to read the per-partition time reference counter */
 #define HV_X64_MSR_TIME_REF_COUNT		0x40000020
 
@@ -217,13 +223,27 @@
 #define HV_X64_MSR_CRASH_PARAMS		\
 		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
 
+/*
+ * Synthetic Timer MSRs. Four timers per vcpu.
+ */
+#define HV_X64_MSR_STIMER0_CONFIG              0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT               0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG              0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT               0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG              0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT               0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG              0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT               0x400000B7
+
 #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
 		(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
 
 /* Declare the various hypercall operations. */
-#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT		0x0008
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
+#define HVCALL_POST_MESSAGE			0x005c
+#define HVCALL_SIGNAL_EVENT			0x005d
 
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE		0x00000001
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT	12
@@ -266,4 +286,96 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
 #define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
 #define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)
 
+#define HV_SYNIC_STIMER_COUNT		(4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE			(256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+	HVMSG_NONE			= 0x00000000,
+
+	/* Memory access messages. */
+	HVMSG_UNMAPPED_GPA		= 0x80000000,
+	HVMSG_GPA_INTERCEPT		= 0x80000001,
+
+	/* Timer notification messages. */
+	HVMSG_TIMER_EXPIRED			= 0x80000010,
+
+	/* Error messages. */
+	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020,
+	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021,
+	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022,
+
+	/* Trace buffer complete messages. */
+	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040,
+
+	/* Platform-specific processor intercept messages. */
+	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000,
+	HVMSG_X64_MSR_INTERCEPT		= 0x80010001,
+	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002,
+	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003,
+	HVMSG_X64_APIC_EOI			= 0x80010004,
+	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+	__u8 asu8;
+	struct {
+		__u8 msg_pending:1;
+		__u8 reserved:7;
+	};
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+	__u32 asu32;
+	struct {
+		__u32 id:24;
+		__u32 reserved:8;
+	} u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+	__u32 message_type;
+	__u8 payload_size;
+	union hv_message_flags message_flags;
+	__u8 reserved[2];
+	union {
+		__u64 sender;
+		union hv_port_id port;
+	};
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+	struct hv_message_header header;
+	union {
+		__u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+	} u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+	struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+	__u32 timer_index;
+	__u32 reserved;
+	__u64 expiration_time;	/* When the timer expired */
+	__u64 delivery_time;	/* When the message was delivered */
+};
+
+#define HV_STIMER_ENABLE		(1ULL << 0)
+#define HV_STIMER_PERIODIC		(1ULL << 1)
+#define HV_STIMER_LAZY			(1ULL << 2)
+#define HV_STIMER_AUTOENABLE		(1ULL << 3)
+#define HV_STIMER_SINT(config)		(__u8)(((config) >> 16) & 0x0F)
+
 #endif
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -65,6 +65,7 @@ struct kvm_clock_pairing {
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
 #define KVM_ASYNC_PF_SEND_ALWAYS		(1 << 1)
+#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT	(1 << 2)
 
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,4 +6,10 @@
 #define ARCH_GET_FS 0x1003
 #define ARCH_GET_GS 0x1004
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# define ARCH_MAP_VDSO_X32     0x2001
+# define ARCH_MAP_VDSO_32      0x2002
+# define ARCH_MAP_VDSO_64      0x2003
+#endif
+
 #endif /* _ASM_X86_PRCTL_H */
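
These ARCH_MAP_VDSO_* codes are wired up to the do_map_vdso_64()/do_map_vdso_32() helpers declared in asm/elf.h above and are only exposed under CONFIG_CHECKPOINT_RESTORE. As a rough, hedged sketch of how a restore tool might use them from user space: the helper name map_vdso_at() is invented for this example, and the exact success/failure convention of the call is an assumption, not something this patch states.

/* Illustrative sketch, not part of the patch. */
#include <sys/syscall.h>
#include <unistd.h>

#ifndef ARCH_MAP_VDSO_64
#define ARCH_MAP_VDSO_64	0x2003	/* from asm/prctl.h above */
#endif

/* Ask the kernel to map the 64-bit vDSO at a caller-chosen address. */
static int map_vdso_at(unsigned long addr)
{
	/* glibc's syscall() returns 0 on success, -1 with errno set on failure */
	return syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
}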
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,6 +22,17 @@ OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o		:= y
 
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
+
+# If instrumentation of this dir is enabled, boot hangs during the first second.
+# We could probably be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc. are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT		:= n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
@@ -43,6 +54,7 @@ obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= cpuid_fault.o
 
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particular, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT		:= n
+
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o ipi.o
 obj-y				+= hw_nmi.o
 
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
 # Make sure load_percpu_segment has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -445,6 +445,31 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void intel_set_cpuid_faulting(bool enable)
+{
+	unsigned int l1, l2;
+
+	rdmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
+	l1 &= ~1;
+	if (enable)
+		l1 |= 1;
+	wrmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
+}
+
+static void intel_cpuid_faulting_init(struct cpuinfo_x86 *c)
+{
+	unsigned int l1, l2;
+
+	if (rdmsr_safe(MSR_PLATFORM_INFO, &l1, &l2) != 0 ||
+	    !(l1 & (1 << 31)))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_CPUID_FAULTING);
+	set_cpuid_faulting_cb = intel_set_cpuid_faulting;
+
+	intel_set_cpuid_faulting(false);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -562,6 +587,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 	}
 
 	probe_xeon_phi_r3mwait(c);
+
+	intel_cpuid_faulting_init(c);
 }
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -3,6 +3,7 @@
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/cpufreq.h>
+#include <linux/sched.h>
 
 /*
  *	Get CPU information for use by the procfs.
@@ -51,10 +52,58 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 }
 #endif
 
+extern void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx);
+
+struct cpu_flags {
+	u32 val[NCAPINTS];
+};
+
+static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
+
+static void init_cpu_flags(void *dummy)
+{
+	int cpu = smp_processor_id();
+	struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned int eax, ebx, ecx, edx;
+
+	memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
+
+	/*
+	 * Clear feature bits masked using cpuid masking/faulting.
+	 */
+
+	if (c->cpuid_level >= 0x00000001) {
+		__do_cpuid_fault(0x00000001, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[4] &= ecx;
+		flags->val[0] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x00000007) {
+		__do_cpuid_fault(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[9] &= ebx;
+	}
+
+	if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
+	    c->extended_cpuid_level >= 0x80000001) {
+		__do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[6] &= ecx;
+		flags->val[1] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x0000000d) {
+		__do_cpuid_fault(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+		flags->val[10] &= eax;
+	}
+}
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
 	unsigned int cpu;
+	int is_super = ve_is_super(get_exec_env());
 	int i;
 
 	cpu = c->cpu_index;
@@ -81,6 +130,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 		if (!freq)
 			freq = cpu_khz;
+		freq = sched_cpulimit_scale_cpufreq(freq);
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
 			   freq / 1000, (freq % 1000));
 	}
@@ -94,7 +144,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 	seq_printf(m, "flags\t\t:");
 	for (i = 0; i < 32*NCAPINTS; i++)
-		if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+		if (x86_cap_flags[i] != NULL &&
+				((is_super && cpu_has(c, i)) ||
+				 (!is_super && test_bit(i, (unsigned long *)
+							&per_cpu(cpu_flags, cpu)))))
 			seq_printf(m, " %s", x86_cap_flags[i]);
 
 	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
@@ -128,18 +181,24 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	return 0;
 }
 
-static void *c_start(struct seq_file *m, loff_t *pos)
+static void *__c_start(struct seq_file *m, loff_t *pos)
 {
 	*pos = cpumask_next(*pos - 1, cpu_online_mask);
-	if ((*pos) < nr_cpu_ids)
+	if (__cpus_weight(cpu_online_mask, *pos) < num_online_vcpus())
 		return &cpu_data(*pos);
 	return NULL;
 }
 
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	on_each_cpu(init_cpu_flags, NULL, 1);
+	return __c_start(m, pos);
+}
+
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
-	return c_start(m, pos);
+	return __c_start(m, pos);
 }
 
 static void c_stop(struct seq_file *m, void *v)
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/ve.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include "cpu.h"
--- /dev/null
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -0,0 +1,315 @@
+/*
+ *  arch/x86/kernel/cpuid_fault.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+struct cpuid_override_entry {
+	unsigned int op;
+	unsigned int count;
+	bool has_count;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES	16
+
+struct cpuid_override_table {
+	struct rcu_head rcu_head;
+	int size;
+	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+#define cpuid_override_active		(!!rcu_access_pointer(cpuid_override))
+
+void (*set_cpuid_faulting_cb)(bool enable);
+static DEFINE_PER_CPU(bool, cpuid_faulting_enabled);
+
+void set_cpuid_faulting(bool enable)
+{
+	bool *enabled;
+
+	if (!cpu_has_cpuid_faulting)
+		return;
+	if (!cpuid_override_active)
+		enable = false;
+
+	enabled = &get_cpu_var(cpuid_faulting_enabled);
+	if (*enabled != enable) {
+		set_cpuid_faulting_cb(enable);
+		*enabled = enable;
+	}
+	put_cpu_var(cpuid_faulting_enabled);
+}
+EXPORT_SYMBOL(set_cpuid_faulting);
+
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *old_table;
+
+	spin_lock(&cpuid_override_lock);
+	old_table = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+
+	if (old_table)
+		kfree_rcu(old_table, rcu_head);
+}
+
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	bool ret = false;
+	struct cpuid_override_table *t;
+	struct cpuid_override_entry *e;
+	int i;
+
+	rcu_read_lock();
+	t = rcu_dereference(cpuid_override);
+	if (!t)
+		goto out;
+
+	for (i = 0; i < t->size; i++) {
+		e = &t->entries[i];
+		if (e->op != op)
+			continue;
+		if (e->has_count && e->count != count)
+			continue;
+		*eax = e->eax;
+		*ebx = e->ebx;
+		*ecx = e->ecx;
+		*edx = e->edx;
+		ret = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+void __do_cpuid_fault(unsigned int op, unsigned int count,
+		      unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	/* check if op is overridden */
+	if (cpuid_override_match(op, count, eax, ebx, ecx, edx))
+		return;
+
+	/* fallback to real cpuid */
+	cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+
+	regs->ax = eax;
+	regs->bx = ebx;
+	regs->cx = ecx;
+	regs->dx = edx;
+}
+
+/*
+ * CPUID override entry format:
+ *
+ * op[ count]: eax ebx ecx edx
+ *
+ * All values are in HEX.
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int taken;
+	char *end;
+
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &taken) == 6)
+		e->has_count = true;
+	else if (sscanf(s, "%x: %x %x %x %x%n",
+			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			&taken) == 5)
+		e->has_count = false;
+	else
+		return -EINVAL;
+
+	end = (char *)s + taken;
+	if (*end) {
+		if (*end != '\n')
+			return -EINVAL;
+		++end;
+	}
+	*endp = end;
+	return 0;
+}
+
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	err = -E2BIG;
+	if (count >= PAGE_SIZE)
+		goto out;
+
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out;
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
+			goto out;
+	}
+	if (!t->size) {
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+
+	return err ?: count;
+}
+
+static void *__cpuid_override_seq_start(loff_t pos)
+{
+	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
+	return t && pos < t->size ? &t->entries[pos] : NULL;
+}
+
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	rcu_read_lock();
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	++*ppos;
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *e = v;
+
+	seq_printf(s, "0x%08x", e->op);
+	if (e->has_count)
+		seq_printf(s, " 0x%08x", e->count);
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   e->eax, e->ebx, e->ecx, e->edx);
+	return 0;
+}
+
+static const struct seq_operations cpuid_override_seq_ops = {
+	.start = cpuid_override_seq_start,
+	.next  = cpuid_override_seq_next,
+	.stop  = cpuid_override_seq_stop,
+	.show  = cpuid_override_seq_show,
+};
+
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuid_override_seq_ops);
+}
+
+static const struct file_operations proc_cpuid_override_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cpuid_override_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cpuid_override_write,
+};
+
+static void disable_cpuid_faulting_fn(void *unused)
+{
+	set_cpuid_faulting(false);
+}
+
+static int cpuid_faulting_reboot_notify(struct notifier_block *nb,
+					unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART) {
+		/*
+		 * Disable cpuid faulting before loading a new kernel by kexec
+		 * in case the new kernel does not support this feature.
+		 */
+		cpuid_override_update(NULL);
+		on_each_cpu(disable_cpuid_faulting_fn, NULL, 1);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cpuid_faulting_reboot_nb = {
+	.notifier_call = cpuid_faulting_reboot_notify,
+};
+
+static int __init cpuid_fault_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	if (!cpu_has_cpuid_faulting)
+		return 0;
+
+	register_reboot_notifier(&cpuid_faulting_reboot_nb);
+
+	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
+			   &proc_cpuid_override_ops);
+	if (!proc)
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -257,7 +257,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	printk("SMP ");
 #endif
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
+	printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+	printk("KASAN");
 #endif
 	printk("\n");
 	if (notify_die(DIE_OOPS, str, regs, err,
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,7 @@ ENTRY(ia32_sysenter_target)
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1011,9 +1011,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
 .endm
 #endif
 
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY  .popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
 .macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
 apicinterrupt3 \num \sym \do_sym
 trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
 .endm
 
 #ifdef CONFIG_SMP
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 #include <asm/microcode.h>
+#include <asm/kasan.h>
 
 /*
  * Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
 
 	next_early_pgt = 0;
 
-	__load_cr3(__pa(early_level4_pgt));
+	__load_cr3(__pa_nodebug(early_level4_pgt));
 }
 
 /* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
@@ -158,9 +159,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
-	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
+	clear_page(init_level4_pgt);
+
+	kasan_early_init();
+
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
 		set_intr_gate(i, early_idt_handler_array[i]);
 	load_idt((const struct desc_ptr *)&idt_descr);
@@ -175,7 +179,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	clear_page(init_level4_pgt);
 	/* set init_level4_pgt kernel high mapping*/
 	init_level4_pgt[511] = early_level4_pgt[511];
 
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -552,3 +552,4 @@ ENTRY(phys_base)
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
+
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -327,21 +327,24 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	int length;
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
+	length = insn.length;
+
 	/* Another subsystem puts a breakpoint, failed to recover */
 	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
-	memcpy(dest, insn.kaddr, insn.length);
+	memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
 	if (insn_rip_relative(&insn)) {
 		s64 newdisp;
 		u8 *disp;
-		kernel_insn_init(&insn, dest, insn.length);
+		kernel_insn_init(&insn, dest, length);
 		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
@@ -365,7 +368,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
-	return insn.length;
+	return length;
 }
 
 static int __kprobes arch_copy_kprobe(struct kprobe *p)
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -333,7 +333,12 @@ static void kvm_guest_cpu_init(void)
 #ifdef CONFIG_PREEMPT
 		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
 #endif
-		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+		pa |= KVM_ASYNC_PF_ENABLED;
+
+		/* Async page fault support for L1 hypervisor is optional */
+		if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN,
+			(pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0)
+			wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
 		__this_cpu_write(apf_reason.enabled, 1);
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
@@ -633,7 +638,7 @@ static inline void check_zero(void)
 	u8 ret;
 	u8 old;
 
-	old = ACCESS_ONCE(zero_stats);
+	old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -751,6 +756,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	int cpu;
 	u64 start;
 	unsigned long flags;
+	__ticket_t head;
 
 	if (in_nmi())
 		return;
@@ -792,11 +798,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic, does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking.
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -827,8 +837,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 	add_stats(RELEASED_SLOW, 1);
 	for_each_cpu(cpu, &waiting_cpus) {
 		const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == ticket) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == ticket) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			kvm_kick_cpu(cpu);
 			break;
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,9 @@
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
 
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -49,9 +52,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
 			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
 	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+		newldt = vmalloc_account(mincount * LDT_ENTRY_SIZE);
 	else
-		newldt = (void *)__get_free_page(GFP_KERNEL);
+		newldt = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 
 	if (!newldt)
 		return -ENOMEM;
@@ -153,7 +156,7 @@ void destroy_context(struct mm_struct *mm)
 		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
 			vfree(mm->context.ldt);
 		else
-			put_page(virt_to_page(mm->context.ldt));
+			__free_page(virt_to_page(mm->context.ldt));
 		mm->context.size = 0;
 	}
 }
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kasan.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -76,13 +77,20 @@ static unsigned long int get_module_load_offset(void)
 
 void *module_alloc(unsigned long size)
 {
+	void *p;
+
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
-	return __vmalloc_node_range(size, 1,
-				    MODULES_VADDR + get_module_load_offset(),
-				    MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_EXEC, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
+				 MODULES_VADDR + get_module_load_offset(),
+				 MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
+				 PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				 __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+	return p;
 }
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,7 +6,6 @@
 #include <linux/smp.h>
 #include <linux/prctl.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
@@ -14,6 +13,8 @@
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
 #include <linux/utsname.h>
+#include <linux/ve.h>
+#include <generated/utsrelease.h>
 #include <linux/stackprotector.h>
 #include <linux/tick.h>
 #include <linux/cpuidle.h>
@@ -471,3 +472,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	return randomize_page(mm->brk, 0x02000000);
 }
 
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long start, bottom, top, sp, fp, ip;
+	int count = 0;
+
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
+	start = (unsigned long)task_stack_page(p);
+	if (!start)
+		return 0;
+
+	/*
+	 * Layout of the stack page:
+	 *
+	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+	 * PADDING
+	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+	 * stack
+	 * ----------- bottom = start + sizeof(thread_info)
+	 * thread_info
+	 * ----------- start
+	 *
+	 * The task's stack pointer points at the location where the
+	 * frame pointer is stored. The data on the stack is:
+	 * ... IP FP ... IP FP
+	 *
+	 * We need to read FP and IP, so we need to adjust the upper
+	 * bound by another unsigned long.
+	 */
+	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+	top -= 2 * sizeof(unsigned long);
+	bottom = start + sizeof(struct thread_info);
+
+	sp = READ_ONCE(p->thread.sp);
+	if (sp < bottom || sp > top)
+		return 0;
+
+	fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+	do {
+		if (fp < bottom || fp > top)
+			return 0;
+		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
+		if (!in_sched_functions(ip))
+			return ip;
+		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
+	} while (count++ < 16 && p->state != TASK_RUNNING);
+	return 0;
+}
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -317,31 +317,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	return prev_p;
 }
-
-#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
-
-unsigned long get_wchan(struct task_struct *p)
-{
-	unsigned long bp, sp, ip;
-	unsigned long stack_page;
-	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-	stack_page = (unsigned long)task_stack_page(p);
-	sp = p->thread.sp;
-	if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
-		return 0;
-	/* include/asm-i386/system.h:switch_to() pushes bp last. */
-	bp = *(unsigned long *) sp;
-	do {
-		if (bp < stack_page || bp > top_ebp+stack_page)
-			return 0;
-		ip = *(unsigned long *) (bp+4);
-		if (!in_sched_functions(ip))
-			return ip;
-		bp = *(unsigned long *) bp;
-	} while (count++ < 16);
-	return 0;
-}
-
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -50,6 +50,11 @@
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/intel_rdt.h>
+#include <asm/unistd.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -427,6 +432,8 @@ void set_personality_64bit(void)
 	clear_thread_flag(TIF_IA32);
 	clear_thread_flag(TIF_ADDR32);
 	clear_thread_flag(TIF_X32);
+	/* Pretend that this comes from a 64bit execve */
+	task_pt_regs(current)->orig_ax = __NR_execve;
 
 	/* Ensure the corresponding mm is not marked. */
 	if (current->mm)
@@ -439,58 +446,52 @@ void set_personality_64bit(void)
 	current->personality &= ~READ_IMPLIES_EXEC;
 }
 
-void set_personality_ia32(bool x32)
+static void __set_personality_x32(void)
 {
-	/* inherit personality from parent */
-
-	/* Make sure to be in 32bit mode */
-	set_thread_flag(TIF_ADDR32);
+#ifdef CONFIG_X86_X32
+	clear_thread_flag(TIF_IA32);
+	set_thread_flag(TIF_X32);
+	if (current->mm)
+		current->mm->context.ia32_compat = TIF_X32;
+	current->personality &= ~READ_IMPLIES_EXEC;
+	/*
+	 * in_compat_syscall() uses the presence of the x32 syscall bit
+	 * flag to determine compat status.  The x86 mmap() code relies on
+	 * the syscall bitness, so set the x32 syscall bit right here to make
+	 * in_compat_syscall() work during exec().
+	 *
+	 * Pretend to come from an x32 execve.
+	 */
+	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+	current_thread_info()->status &= ~TS_COMPAT;
+#endif
+}
 
-	/* Mark the associated mm as containing 32-bit tasks. */
-	if (x32) {
-		clear_thread_flag(TIF_IA32);
-		set_thread_flag(TIF_X32);
-		if (current->mm)
-			current->mm->context.ia32_compat = TIF_X32;
-		current->personality &= ~READ_IMPLIES_EXEC;
-		/* is_compat_task() uses the presence of the x32
-		   syscall bit flag to determine compat status */
-		current_thread_info()->status &= ~TS_COMPAT;
-	} else {
-		set_thread_flag(TIF_IA32);
-		clear_thread_flag(TIF_X32);
-		if (current->mm)
-			current->mm->context.ia32_compat = TIF_IA32;
-		current->personality |= force_personality32;
-		/* Prepare the first "return" to user space */
-		current_thread_info()->status |= TS_COMPAT;
-	}
+static void __set_personality_ia32(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+	set_thread_flag(TIF_IA32);
+	clear_thread_flag(TIF_X32);
+	if (current->mm)
+		current->mm->context.ia32_compat = TIF_IA32;
+	current->personality |= force_personality32;
+	/* Prepare the first "return" to user space */
+	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
+	current_thread_info()->status |= TS_COMPAT;
+#endif
 }
-EXPORT_SYMBOL_GPL(set_personality_ia32);
 
-unsigned long get_wchan(struct task_struct *p)
+void set_personality_ia32(bool x32)
 {
-	unsigned long stack;
-	u64 fp, ip;
-	int count = 0;
+	/* Make sure to be in 32bit mode */
+	set_thread_flag(TIF_ADDR32);
 
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-	stack = (unsigned long)task_stack_page(p);
-	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
-		return 0;
-	fp = *(u64 *)(p->thread.sp);
-	do {
-		if (fp < (unsigned long)stack ||
-		    fp >= (unsigned long)stack+THREAD_SIZE)
-			return 0;
-		ip = *(u64 *)(fp+8);
-		if (!in_sched_functions(ip))
-			return ip;
-		fp = *(u64 *)fp;
-	} while (count++ < 16);
-	return 0;
+	if (x32)
+		__set_personality_x32();
+	else
+		__set_personality_ia32();
 }
+EXPORT_SYMBOL_GPL(set_personality_ia32);
 
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 {
@@ -579,6 +580,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		break;
 	}
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_COMPAT
+	case ARCH_MAP_VDSO_32:
+		return do_map_vdso_32(addr);
+# endif
+
+	case ARCH_MAP_VDSO_64:
+		return do_map_vdso_64(addr);
+
+	/* x32 vDSO remap API is omitted for simplicity. */
+	case ARCH_MAP_VDSO_X32:
+#endif
+
 	default:
 		ret = -EINVAL;
 		break;
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1434,7 +1434,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
 const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 {
 #ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
+	if (!user_64bit_mode(task_pt_regs(task)))
 #endif
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 		return &user_x86_32_view;
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -91,6 +91,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/bugs.h>
+#include <asm/kasan.h>
 
 #include <asm/vsyscall.h>
 #include <asm/cpu.h>
@@ -1306,6 +1307,8 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.paging.pagetable_init();
 
+	kasan_init();
+
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
 		mmu_cr4_features = read_cr4();
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -479,7 +479,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-		if (copy_siginfo_to_user32(&frame->info, &ksig->info))
+		if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
 			return -EFAULT;
 	}
 
@@ -612,12 +612,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 	compat_sigset_t *cset = (compat_sigset_t *) set;
 
 	/* Set up the stack frame */
-	if (is_ia32_frame()) {
+	if (is_ia32_frame(ksig)) {
 		if (ksig->ka.sa.sa_flags & SA_SIGINFO)
 			return ia32_setup_rt_frame(usig, ksig, cset, regs);
 		else
 			return ia32_setup_frame(usig, ksig, cset, regs);
-	} else if (is_x32_frame()) {
+	} else if (is_x32_frame(ksig)) {
 		return x32_setup_rt_frame(ksig, cset, regs);
 	} else {
 		return __setup_rt_frame(ksig->sig, ksig, set, regs);
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -16,6 +16,8 @@
 #include <linux/uaccess.h>
 #include <linux/elf.h>
 
+#include <asm/elf.h>
+#include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
 
@@ -100,7 +102,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 static void find_start_end(unsigned long flags, unsigned long *begin,
 			   unsigned long *end)
 {
-	if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
+	if (!in_compat_syscall() && (flags & MAP_32BIT)) {
 		/* This is usually used needed to map code in small
 		   model, so it needs to be in the first 31bit. Limit
 		   it to that.  This means we need to move the
@@ -113,10 +115,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 		if (current->flags & PF_RANDOMIZE) {
 			*begin = randomize_page(*begin, 0x02000000);
 		}
-	} else {
-		*begin = current->mm->mmap_legacy_base;
-		*end = TASK_SIZE;
+		return;
 	}
+
+	*begin	= get_mmap_base(1);
+	*end	= in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
 }
 
 unsigned long
@@ -175,7 +178,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		return addr;
 
 	/* for MAP_32BIT mappings we force the legacy mmap base */
-	if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
+	if (!in_compat_syscall() && (flags & MAP_32BIT))
 		goto bottomup;
 
 	/* requesting a specific address */
@@ -190,7 +193,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = mm->mmap_base;
+	info.high_limit = get_mmap_base(0);
 	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
 	if (filp) {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -332,6 +332,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	exception_exit(prev_state);
 }
 
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned long addr;
+	unsigned short opcode;
+
+	if (error_code != 0)
+		return 0;
+
+	addr = convert_ip_to_linear(current, regs);
+	if (get_user(opcode, (unsigned short __user *)addr))
+		return 0;
+
+	if (opcode != 0xa20f)
+		return 0;
+
+	do_cpuid_fault(regs);
+
+	regs->ip += 2;
+	return 1;
+}
+
 dotraplinkage void __kprobes
 do_general_protection(struct pt_regs *regs, long error_code)
 {
@@ -362,6 +383,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		goto exit;
 	}
 
+	if (check_cpuid_fault(regs, error_code))
+		return;
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_GP;
 
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -111,6 +111,7 @@ SECTIONS
 		ENTRY_TEXT
 		IRQENTRY_TEXT
 		ALIGN_KAISER()
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 		/* End of text section */
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
 #undef memset
 #undef memmove
 
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
 extern void *memset(void *, int, __kernel_size_t);
 extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
 
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(memmove);
 
 #ifndef CONFIG_DEBUG_VIRTUAL
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -96,6 +96,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 			best->ecx |= F(OSXSAVE);
 	}
 
+	best->edx &= ~F(APIC);
+	if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
+		best->edx |= F(APIC);
+
 	if (apic) {
 		if (best->ecx & F(TSC_DEADLINE_TIMER))
 			apic->lapic_timer.timer_mode_mask = 3 << 17;
@@ -861,12 +865,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	if (!best)
 		best = check_cpuid_limit(vcpu, function, index);
 
-	/*
-	 * Perfmon not yet supported for L2 guest.
-	 */
-	if (is_guest_mode(vcpu) && function == 0xa)
-		best = NULL;
-
 	if (best) {
 		*eax = best->eax;
 		*ebx = best->ebx;
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,13 +23,705 @@
 
 #include "x86.h"
 #include "lapic.h"
+#include "ioapic.h"
 #include "hyperv.h"
 
 #include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/apicdef.h>
 #include <trace/events/kvm.h>
 
 #include "trace.h"
 
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+	return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+	if (sint_value & HV_SYNIC_SINT_MASKED)
+		return -1;
+	return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+				      int vector)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+			return true;
+	}
+	return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+				     int vector)
+{
+	int i;
+	u64 sint_value;
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		sint_value = synic_read_sint(synic, i);
+		if (synic_get_sint_vector(sint_value) == vector &&
+		    sint_value & HV_SYNIC_SINT_AUTO_EOI)
+			return true;
+	}
+	return false;
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+			  u64 data, bool host)
+{
+	int vector;
+
+	vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+	if (vector < 16 && !host)
+		return 1;
+	/*
+	 * Guest may configure multiple SINTs to use the same vector, so
+	 * we maintain a bitmap of vectors handled by synic, and a
+	 * bitmap of vectors with auto-eoi behavior.  The bitmaps are
+	 * updated here, and atomically queried on fast paths.
+	 */
+
+	atomic64_set(&synic->sint[sint], data);
+
+	if (synic_has_vector_connected(synic, vector))
+		__set_bit(vector, synic->vec_bitmap);
+	else
+		__clear_bit(vector, synic->vec_bitmap);
+
+	if (synic_has_vector_auto_eoi(synic, vector))
+		__set_bit(vector, synic->auto_eoi_bitmap);
+	else
+		__clear_bit(vector, synic->auto_eoi_bitmap);
+
+	/* Load SynIC vectors into EOI exit bitmap */
+	kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+	return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	if (vpidx < KVM_MAX_VCPUS)
+		vcpu = kvm_get_vcpu(kvm, vpidx);
+	if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+		return vcpu;
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+			return vcpu;
+	return NULL;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu_hv_synic *synic;
+
+	vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+	if (!vcpu)
+		return NULL;
+	synic = vcpu_to_synic(vcpu);
+	return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+					u32 sint)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct page *page;
+	gpa_t gpa;
+	struct hv_message *msg;
+	struct hv_message_page *msg_page;
+
+	gpa = synic->msg_page & PAGE_MASK;
+	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+			 gpa);
+		return;
+	}
+	msg_page = kmap_atomic(page);
+
+	msg = &msg_page->sint_message[sint];
+	msg->header.message_flags.msg_pending = 0;
+
+	kunmap_atomic(msg_page);
+	kvm_release_page_dirty(page);
+	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	struct kvm_vcpu_hv_stimer *stimer;
+	int gsi, idx, stimers_pending;
+
+	trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+	if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+		synic_clear_sint_msg_pending(synic, sint);
+
+	/* Try to deliver pending Hyper-V SynIC timer messages */
+	stimers_pending = 0;
+	for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+		stimer = &hv_vcpu->stimer[idx];
+		if (stimer->msg_pending &&
+		    (stimer->config & HV_STIMER_ENABLE) &&
+		    HV_STIMER_SINT(stimer->config) == sint) {
+			set_bit(stimer->index,
+				hv_vcpu->stimer_pending_bitmap);
+			stimers_pending++;
+		}
+	}
+	if (stimers_pending)
+		kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = atomic_read(&synic->sint_to_gsi[sint]);
+	if (gsi != -1)
+		kvm_notify_acked_gsi(kvm, gsi);
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+	hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+	hv_vcpu->exit.u.synic.msr = msr;
+	hv_vcpu->exit.u.synic.control = synic->control;
+	hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+	hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+	kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+			 u32 msr, u64 data, bool host)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	int ret;
+
+	if (!synic->active)
+		return 1;
+
+	trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+	ret = 0;
+	switch (msr) {
+	case HV_X64_MSR_SCONTROL:
+		synic->control = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_SVERSION:
+		if (!host) {
+			ret = 1;
+			break;
+		}
+		synic->version = data;
+		break;
+	case HV_X64_MSR_SIEFP:
+		if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+		    !synic->dont_zero_synic_pages)
+			if (kvm_clear_guest(vcpu->kvm,
+					    data & PAGE_MASK, PAGE_SIZE)) {
+				ret = 1;
+				break;
+			}
+		synic->evt_page = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_SIMP:
+		if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+		    !synic->dont_zero_synic_pages)
+			if (kvm_clear_guest(vcpu->kvm,
+					    data & PAGE_MASK, PAGE_SIZE)) {
+				ret = 1;
+				break;
+			}
+		synic->msg_page = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_EOM: {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+			kvm_hv_notify_acked_sint(vcpu, i);
+		break;
+	}
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+		break;
+	default:
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+{
+	int ret;
+
+	if (!synic->active)
+		return 1;
+
+	ret = 0;
+	switch (msr) {
+	case HV_X64_MSR_SCONTROL:
+		*pdata = synic->control;
+		break;
+	case HV_X64_MSR_SVERSION:
+		*pdata = synic->version;
+		break;
+	case HV_X64_MSR_SIEFP:
+		*pdata = synic->evt_page;
+		break;
+	case HV_X64_MSR_SIMP:
+		*pdata = synic->msg_page;
+		break;
+	case HV_X64_MSR_EOM:
+		*pdata = 0;
+		break;
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		*pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+		break;
+	default:
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct kvm_lapic_irq irq;
+	int ret, vector;
+
+	if (sint >= ARRAY_SIZE(synic->sint))
+		return -EINVAL;
+
+	vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+	if (vector < 0)
+		return -ENOENT;
+
+	memset(&irq, 0, sizeof(irq));
+	irq.shorthand = APIC_DEST_SELF;
+	irq.dest_mode = APIC_DEST_PHYSICAL;
+	irq.delivery_mode = APIC_DM_FIXED;
+	irq.vector = vector;
+	irq.level = 1;
+
+	ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+	trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+	return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+{
+	struct kvm_vcpu_hv_synic *synic;
+
+	synic = synic_get(kvm, vpidx);
+	if (!synic)
+		return -EINVAL;
+
+	return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+	int i;
+
+	trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+		if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+			kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+	struct kvm_vcpu_hv_synic *synic;
+
+	synic = synic_get(kvm, vpidx);
+	if (!synic)
+		return -EINVAL;
+
+	if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+		return -EINVAL;
+
+	atomic_set(&synic->sint_to_gsi[sint], gsi);
+	return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+	struct kvm_irq_routing_table *irq_rt;
+	struct kvm_kernel_irq_routing_entry *e;
+	u32 gsi;
+
+	irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+					lockdep_is_held(&kvm->irq_lock));
+
+	for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+		hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+			if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+				kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+						    e->hv_sint.sint, gsi);
+		}
+	}
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+	int i;
+
+	memset(synic, 0, sizeof(*synic));
+	synic->version = HV_SYNIC_VERSION_1;
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+		atomic_set(&synic->sint_to_gsi[i], -1);
+	}
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	u64 tsc;
+
+	/*
+	 * If the guest has not set up the TSC page or the clock isn't
+	 * stable, fall back to get_kvmclock_ns().
+	 */
+	if (!hv->tsc_ref.tsc_sequence)
+		return div_u64(get_kvmclock_ns(kvm), 100);
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	tsc = kvm_read_l1_tsc(vcpu, native_read_tsc());
+	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+		+ hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+				bool vcpu_kick)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+	set_bit(stimer->index,
+		vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+	kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+	if (vcpu_kick)
+		kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+	trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+				    stimer->index);
+
+	hrtimer_cancel(&stimer->timer);
+	clear_bit(stimer->index,
+		  vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+	stimer->msg_pending = false;
+	stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+	struct kvm_vcpu_hv_stimer *stimer;
+
+	stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+	trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+				     stimer->index);
+	stimer_mark_pending(stimer, true);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
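+ *
+ * Note that stimer->count, stimer->exp_time and the time reference
+ * counter are all in 100ns units (see get_time_ref_counter()), hence
+ * the "100 *" conversions when arming the hrtimer below.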
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+	u64 time_now;
+	ktime_t ktime_now;
+
+	time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+	ktime_now = ktime_get();
+
+	if (stimer->config & HV_STIMER_PERIODIC) {
+		if (stimer->exp_time) {
+			if (time_now >= stimer->exp_time) {
+				u64 remainder;
+
+				div64_u64_rem(time_now - stimer->exp_time,
+					      stimer->count, &remainder);
+				stimer->exp_time =
+					time_now + (stimer->count - remainder);
+			}
+		} else
+			stimer->exp_time = time_now + stimer->count;
+
+		trace_kvm_hv_stimer_start_periodic(
+					stimer_to_vcpu(stimer)->vcpu_id,
+					stimer->index,
+					time_now, stimer->exp_time);
+
+		hrtimer_start(&stimer->timer,
+			      ktime_add_ns(ktime_now,
+					   100 * (stimer->exp_time - time_now)),
+			      HRTIMER_MODE_ABS);
+		return 0;
+	}
+	stimer->exp_time = stimer->count;
+	if (time_now >= stimer->count) {
+		/*
+		 * Expire timer according to Hypervisor Top-Level Functional
+		 * specification v4 (15.3.1):
+		 * "If a one shot is enabled and the specified count is in
+		 * the past, it will expire immediately."
+		 */
+		stimer_mark_pending(stimer, false);
+		return 0;
+	}
+
+	trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+					   stimer->index,
+					   time_now, stimer->count);
+
+	hrtimer_start(&stimer->timer,
+		      ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+		      HRTIMER_MODE_ABS);
+	return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+			     bool host)
+{
+	trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+				       stimer->index, config, host);
+
+	stimer_cleanup(stimer);
+	if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+		config &= ~HV_STIMER_ENABLE;
+	stimer->config = config;
+	stimer_mark_pending(stimer, false);
+	return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+			    bool host)
+{
+	trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+				      stimer->index, count, host);
+
+	stimer_cleanup(stimer);
+	stimer->count = count;
+	if (stimer->count == 0)
+		stimer->config &= ~HV_STIMER_ENABLE;
+	else if (stimer->config & HV_STIMER_AUTOENABLE)
+		stimer->config |= HV_STIMER_ENABLE;
+	stimer_mark_pending(stimer, false);
+	return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+	*pconfig = stimer->config;
+	return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+	*pcount = stimer->count;
+	return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+			     struct hv_message *src_msg)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct page *page;
+	gpa_t gpa;
+	struct hv_message *dst_msg;
+	int r;
+	struct hv_message_page *msg_page;
+
+	if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+		return -ENOENT;
+
+	gpa = synic->msg_page & PAGE_MASK;
+	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return -EFAULT;
+
+	msg_page = kmap_atomic(page);
+	dst_msg = &msg_page->sint_message[sint];
+	if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+			 src_msg->header.message_type) != HVMSG_NONE) {
+		dst_msg->header.message_flags.msg_pending = 1;
+		r = -EAGAIN;
+	} else {
+		memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+		       src_msg->header.payload_size);
+		dst_msg->header.message_type = src_msg->header.message_type;
+		dst_msg->header.payload_size = src_msg->header.payload_size;
+		r = synic_set_irq(synic, sint);
+		if (r >= 1)
+			r = 0;
+		else if (r == 0)
+			r = -EFAULT;
+	}
+	kunmap_atomic(msg_page);
+	kvm_release_page_dirty(page);
+	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+	return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+	struct hv_message *msg = &stimer->msg;
+	struct hv_timer_message_payload *payload =
+			(struct hv_timer_message_payload *)&msg->u.payload;
+
+	payload->expiration_time = stimer->exp_time;
+	payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+	return synic_deliver_msg(vcpu_to_synic(vcpu),
+				 HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+	int r;
+
+	stimer->msg_pending = true;
+	r = stimer_send_msg(stimer);
+	trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+				       stimer->index, r);
+	if (!r) {
+		stimer->msg_pending = false;
+		if (!(stimer->config & HV_STIMER_PERIODIC))
+			stimer->config &= ~HV_STIMER_ENABLE;
+	}
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	struct kvm_vcpu_hv_stimer *stimer;
+	u64 time_now, exp_time;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+			stimer = &hv_vcpu->stimer[i];
+			if (stimer->config & HV_STIMER_ENABLE) {
+				exp_time = stimer->exp_time;
+
+				if (exp_time) {
+					time_now =
+						get_time_ref_counter(vcpu->kvm);
+					if (time_now >= exp_time)
+						stimer_expiration(stimer);
+				}
+
+				if ((stimer->config & HV_STIMER_ENABLE) &&
+				    stimer->count) {
+					if (!stimer->msg_pending)
+						stimer_start(stimer);
+				} else
+					stimer_cleanup(stimer);
+			}
+		}
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct hv_message *msg = &stimer->msg;
+	struct hv_timer_message_payload *payload =
+			(struct hv_timer_message_payload *)&msg->u.payload;
+
+	memset(&msg->header, 0, sizeof(msg->header));
+	msg->header.message_type = HVMSG_TIMER_EXPIRED;
+	msg->header.payload_size = sizeof(*payload);
+
+	payload->timer_index = stimer->index;
+	payload->expiration_time = 0;
+	payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+	memset(stimer, 0, sizeof(*stimer));
+	stimer->index = timer_index;
+	hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	stimer->timer.function = stimer_timer_callback;
+	stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	int i;
+
+	synic_init(&hv_vcpu->synic);
+
+	bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+
+	hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+	struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+	/*
+	 * Hyper-V SynIC auto-EOI SINTs are not compatible with APICv,
+	 * so deactivate APICv.
+	 */
+	kvm_vcpu_deactivate_apicv(vcpu);
+	synic->active = true;
+	synic->dont_zero_synic_pages = dont_zero_synic_pages;
+	return 0;
+}
+
 static bool kvm_hv_msr_partition_wide(u32 msr)
 {
 	bool r = false;
@@ -41,6 +733,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	case HV_X64_MSR_TIME_REF_COUNT:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_RESET:
 		r = true;
 		break;
 	}
@@ -103,6 +796,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
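+ *
+ * As a quick sanity check (illustrative numbers only): for a 1 GHz guest
+ * TSC the kvmclock parameters satisfy
+ * tsc_to_system_mul * 2^(tsc_shift-32) = 1 ns/tick, e.g. tsc_shift = 1 and
+ * tsc_to_system_mul = 2^31.  Then scale = 2^31 * 2^33 / 100 = 2^64 / 100,
+ * so ticks * scale / 2^64 = ticks / 100, i.e. one 100ns unit per 100 ticks
+ * of the 1 GHz TSC, as expected.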
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * Check if scale would overflow; if so, we use the time ref counter:
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	tsc_seq++;
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	hv->tsc_ref.tsc_sequence = tsc_seq;
+	kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 			     bool host)
 {
@@ -140,29 +956,23 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
 						 data);
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+	case HV_X64_MSR_RESET:
+		if (data == 1) {
+			vcpu_debug(vcpu, "hyper-v reset requested\n");
+			kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+		}
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -171,11 +981,25 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	return 0;
 }
 
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+	cputime_t utime, stime;
+
+	task_cputime_adjusted(current, &utime, &stime);
+	return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
 
 	switch (msr) {
+	case HV_X64_MSR_VP_INDEX:
+		if (!host)
+			return 1;
+		hv->vp_index = (u32)data;
+		break;
 	case HV_X64_MSR_APIC_ASSIST_PAGE: {
 		u64 gfn;
 		unsigned long addr;
@@ -205,6 +1029,36 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+	case HV_X64_MSR_VP_RUNTIME:
+		if (!host)
+			return 1;
+		hv->runtime_offset = data - current_task_runtime_100ns();
+		break;
+	case HV_X64_MSR_SCONTROL:
+	case HV_X64_MSR_SVERSION:
+	case HV_X64_MSR_SIEFP:
+	case HV_X64_MSR_SIMP:
+	case HV_X64_MSR_EOM:
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+	case HV_X64_MSR_STIMER0_CONFIG:
+	case HV_X64_MSR_STIMER1_CONFIG:
+	case HV_X64_MSR_STIMER2_CONFIG:
+	case HV_X64_MSR_STIMER3_CONFIG: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+		return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+					 data, host);
+	}
+	case HV_X64_MSR_STIMER0_COUNT:
+	case HV_X64_MSR_STIMER1_COUNT:
+	case HV_X64_MSR_STIMER2_COUNT:
+	case HV_X64_MSR_STIMER3_COUNT: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+		return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+					data, host);
+	}
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -241,6 +1095,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 						 pdata);
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+	case HV_X64_MSR_RESET:
+		data = 0;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -256,18 +1113,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
 
 	switch (msr) {
-	case HV_X64_MSR_VP_INDEX: {
-		int r;
-		struct kvm_vcpu *v;
-
-		kvm_for_each_vcpu(r, v, vcpu->kvm) {
-			if (v == vcpu) {
-				data = r;
-				break;
-			}
-		}
+	case HV_X64_MSR_VP_INDEX:
+		data = hv->vp_index;
 		break;
-	}
 	case HV_X64_MSR_EOI:
 		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
 	case HV_X64_MSR_ICR:
@@ -277,6 +1125,34 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_APIC_ASSIST_PAGE:
 		data = hv->hv_vapic;
 		break;
+	case HV_X64_MSR_VP_RUNTIME:
+		data = current_task_runtime_100ns() + hv->runtime_offset;
+		break;
+	case HV_X64_MSR_SCONTROL:
+	case HV_X64_MSR_SVERSION:
+	case HV_X64_MSR_SIEFP:
+	case HV_X64_MSR_SIMP:
+	case HV_X64_MSR_EOM:
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+	case HV_X64_MSR_STIMER0_CONFIG:
+	case HV_X64_MSR_STIMER1_CONFIG:
+	case HV_X64_MSR_STIMER2_CONFIG:
+	case HV_X64_MSR_STIMER3_CONFIG: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+		return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+					 pdata);
+	}
+	case HV_X64_MSR_STIMER0_COUNT:
+	case HV_X64_MSR_STIMER1_COUNT:
+	case HV_X64_MSR_STIMER2_COUNT:
+	case HV_X64_MSR_STIMER3_COUNT: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+		return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+					pdata);
+	}
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -295,7 +1171,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		mutex_unlock(&vcpu->kvm->lock);
 		return r;
 	} else
-		return kvm_hv_set_msr(vcpu, msr, data);
+		return kvm_hv_set_msr(vcpu, msr, data, host);
 }
 
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
@@ -316,6 +1192,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 	return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
 }
 
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+	bool longmode;
+
+	longmode = is_64_bit_mode(vcpu);
+	if (longmode)
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+	else {
+		kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+	}
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+	return 1;
+}
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -328,7 +1225,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	 */
 	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 0;
+		return 1;
 	}
 
 	longmode = is_64_bit_mode(vcpu);
@@ -356,22 +1253,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
 
+	/* Hypercall continuation is not supported yet */
+	if (rep_cnt || rep_idx) {
+		res = HV_STATUS_INVALID_HYPERCALL_CODE;
+		goto set_result;
+	}
+
 	switch (code) {
-	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+	case HVCALL_NOTIFY_LONG_SPIN_WAIT:
 		kvm_vcpu_on_spin(vcpu);
 		break;
+	case HVCALL_POST_MESSAGE:
+	case HVCALL_SIGNAL_EVENT:
+		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+		vcpu->run->hyperv.u.hcall.input = param;
+		vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+		vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+		vcpu->arch.complete_userspace_io =
+				kvm_hv_hypercall_complete_userspace;
+		return 0;
 	default:
 		res = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
 	}
 
+set_result:
 	ret = res | (((u64)rep_done & 0xfff) << 32);
-	if (longmode) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
-	} else {
-		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
-	}
-
+	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,9 +24,68 @@
 #ifndef __ARCH_X86_KVM_HYPERV_H__
 #define __ARCH_X86_KVM_HYPERV_H__
 
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+	struct kvm_vcpu_arch *arch;
+
+	arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+	return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+	return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+
 bool kvm_hv_hypercall_enabled(struct kvm *kvm);
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
 
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+							int timer_index)
+{
+	return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu_hv *hv_vcpu;
+
+	hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+			       stimer[0]);
+	return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+	return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+			     HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
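
The stimer_to_vcpu() helper above relies on the invariant that stimer->index equals the timer's position in the stimer[] array, so "stimer - stimer->index" points at stimer[0] and container_of() can then walk back to the enclosing kvm_vcpu_hv and kvm_vcpu. A stand-alone sketch of the same trick with toy structures (the toy_* names are made up for illustration):

	#include <stddef.h>
	#include <stdio.h>

	struct toy_stimer { int index; };

	struct toy_hv_vcpu {
		long other_state;
		struct toy_stimer stimer[4];
	};

	/* stimer - stimer->index == &stimer[0]; subtracting its offset yields
	 * the containing structure, just as container_of() does in the kernel.
	 */
	static struct toy_hv_vcpu *toy_stimer_to_hv(struct toy_stimer *stimer)
	{
		return (struct toy_hv_vcpu *)((char *)(stimer - stimer->index) -
				offsetof(struct toy_hv_vcpu, stimer[0]));
	}

	int main(void)
	{
		struct toy_hv_vcpu hv;
		int i;

		for (i = 0; i < 4; i++)
			hv.stimer[i].index = i;
		for (i = 0; i < 4; i++)
			printf("timer %d -> %s\n", i,
			       toy_stimer_to_hv(&hv.stimer[i]) == &hv ? "ok" : "BUG");
		return 0;
	}
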
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -236,22 +236,14 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
-	int value;
-
-	spin_lock(&ps->inject_lock);
-	value = atomic_dec_return(&ps->pending);
-	if (value < 0)
-		/* spurious acks can be generated if, for example, the
-		 * PIC is being reset.  Handle it gracefully here
-		 */
-		atomic_inc(&ps->pending);
-	else if (value > 0 && ps->reinject)
-		/* in this case, we had multiple outstanding pit interrupts
-		 * that we needed to inject.  Reinject
-		 */
+
+	atomic_set(&ps->irq_ack, 1);
+	/* irq_ack should be set before pending is read.  Order accesses with
+	 * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+	 */
+	smp_mb();
+	if (atomic_dec_if_positive(&ps->pending) > 0 && ps->reinject)
 		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
-	ps->irq_ack = 1;
-	spin_unlock(&ps->inject_lock);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,36 +274,25 @@ static void pit_do_work(struct kthread_work *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 	struct kvm_kpit_state *ps = &pit->pit_state;
-	int inject = 0;
 
-	/* Try to inject pending interrupts when
-	 * last one has been acked.
+	if (ps->reinject && !atomic_xchg(&ps->irq_ack, 0))
+		return;
+
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
+
+	/*
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation does not follow
+	 * the MP specification.  We propagate a PIT interrupt to all
+	 * VCPUs and only when LVT0 is in NMI mode.  The interrupt can
+	 * also be simultaneously delivered through PIC and IOAPIC.
 	 */
-	spin_lock(&ps->inject_lock);
-	if (!ps->reinject)
-		inject = 1;
-	else if (ps->irq_ack) {
-		ps->irq_ack = 0;
-		inject = 1;
-	}
-	spin_unlock(&ps->inject_lock);
-	if (inject) {
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
-		/*
-		 * Provides NMI watchdog support via Virtual Wire mode.
-		 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-		 *
-		 * Note: Our Virtual Wire implementation is simplified, only
-		 * propagating PIT interrupts to all VCPUs when they have set
-		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-		 * VCPU0, and only if its LVT0 is in EXTINT mode.
-		 */
-		if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
-			kvm_for_each_vcpu(i, vcpu, kvm)
-				kvm_apic_nmi_wd_deliver(vcpu);
-	}
+	if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
@@ -331,6 +312,12 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 		return HRTIMER_NORESTART;
 }
 
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+	atomic_set(&pit->pit_state.pending, 0);
+	atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
 static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 {
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
@@ -353,8 +340,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	ps->timer.function = pit_timer_fn;
 	ps->kvm = ps->pit->kvm;
 
-	atomic_set(&ps->pending, 0);
-	ps->irq_ack = 1;
+	kvm_pit_reset_reinject(ps->pit);
 
 	/*
 	 * Do not allow the guest to program periodic timers with small
@@ -650,18 +636,15 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	}
 	mutex_unlock(&pit->pit_state.lock);
 
-	atomic_set(&pit->pit_state.pending, 0);
-	pit->pit_state.irq_ack = 1;
+	kvm_pit_reset_reinject(pit);
 }
 
 static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
 	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
-	if (!mask) {
-		atomic_set(&pit->pit_state.pending, 0);
-		pit->pit_state.irq_ack = 1;
-	}
+	if (!mask)
+		kvm_pit_reset_reinject(pit);
 }
 
 static const struct kvm_io_device_ops pit_dev_ops = {
@@ -695,7 +678,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
-	spin_lock_init(&pit->pit_state.inject_lock);
 
 	pid = get_pid(task_tgid(current));
 	pid_nr = pid_vnr(pid);
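
The i8254 changes above replace inject_lock with a small lock-free handshake: the ack path publishes irq_ack = 1, issues a full barrier, then decrements pending and requeues the injection work only if interrupts are still outstanding, while pit_do_work() injects only when it can atomically consume irq_ack. A sequential toy model of that bookkeeping, using C11 atomics in place of the kernel's atomic_t and smp_mb() (it illustrates the state machine only, not the actual race):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int pending;
	static atomic_int irq_ack = 1;
	static const bool reinject = true;

	/* C11 rendition of the kernel's atomic_dec_if_positive(). */
	static int dec_if_positive(atomic_int *v)
	{
		int c = atomic_load(v), dec;

		do {
			dec = c - 1;
			if (dec < 0)
				break;
		} while (!atomic_compare_exchange_weak(v, &c, dec));
		return dec;
	}

	static void timer_fires(void)		/* pit_timer_fn() */
	{
		atomic_fetch_add(&pending, 1);	/* then queue pit_do_work() */
	}

	static bool do_work(void)		/* pit_do_work() */
	{
		if (reinject && !atomic_exchange(&irq_ack, 0))
			return false;		/* previous irq not acked yet */
		return true;			/* kvm_set_irq(1); kvm_set_irq(0) */
	}

	static bool ack_irq(void)		/* kvm_pit_ack_irq() */
	{
		atomic_store(&irq_ack, 1);
		/* the kernel uses smp_mb() here; seq_cst atomics order this too */
		return dec_if_positive(&pending) > 0 && reinject;
	}

	int main(void)
	{
		timer_fires();
		timer_fires();
		printf("inject=%d\n", do_work());	/* 1 */
		printf("requeue=%d\n", ack_irq());	/* 1: one irq still owed */
		printf("inject=%d\n", do_work());	/* 1 */
		printf("requeue=%d\n", ack_irq());	/* 0: nothing left pending */
		return 0;
	}
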
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,8 +33,7 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	spinlock_t inject_lock;
-	unsigned long irq_ack;
+	atomic_t irq_ack;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -82,7 +82,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 	if (kvm_vcpu_apicv_active(v))
 		return 0;
 
-	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+	return kvm_apic_has_interrupt(v, false) != -1; /* LAPIC */
 }
 
 /*
@@ -97,7 +97,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	if (kvm_cpu_has_extint(v))
 		return 1;
 
-	return kvm_apic_has_interrupt(v) != -1;	/* LAPIC */
+	return kvm_apic_has_interrupt(v, true) != -1;	/* LAPIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
@@ -122,7 +122,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 /*
  * Read pending interrupt vector and intack.
  */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v, bool make_req)
 {
 	int vector;
 
@@ -134,7 +134,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	if (vector != -1)
 		return vector;			/* PIC */
 
-	return kvm_get_apic_interrupt(v);	/* APIC */
+	return kvm_get_apic_interrupt(v, make_req);	/* APIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -35,6 +35,8 @@
 
 #include "x86.h"
 
+#include "hyperv.h"
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
 			   bool line_status)
@@ -266,6 +268,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+		    struct kvm *kvm, int irq_source_id, int level,
+		    bool line_status)
+{
+	if (!level)
+		return -1;
+
+	return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
@@ -308,6 +320,11 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		if (kvm_msi_route_invalid(kvm, e))
 			goto out;
 		break;
+	case KVM_IRQ_ROUTING_HV_SINT:
+		e->set = kvm_hv_set_sint;
+		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+		e->hv_sint.sint = ue->u.hv_sint.sint;
+		break;
 	default:
 		goto out;
 	}
@@ -383,9 +400,21 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
 }
 
-void kvm_arch_irq_routing_update(struct kvm *kvm)
+int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm,
+		     int irq_source_id, int level, bool line_status)
 {
-	if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+	switch (irq->type) {
+	case KVM_IRQ_ROUTING_HV_SINT:
+		return kvm_hv_set_sint(irq, kvm, irq_source_id, level,
+				       line_status);
+	default:
+		return -EWOULDBLOCK;
+	}
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+	if (!irqchip_split(kvm))
 		return;
 	kvm_make_scan_ioapic_request(kvm);
 }
@@ -419,3 +448,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	}
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+	kvm_hv_irq_routing_update(kvm);
+}
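
With KVM_IRQ_ROUTING_HV_SINT wired up above, a VMM can point a GSI at a vCPU's SynIC interrupt source through the ordinary KVM_SET_GSI_ROUTING ioctl. A hedged user-space sketch (error handling trimmed; the function name and numbers are placeholders, and the SynIC capability must already be enabled on the vCPU):

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int route_gsi_to_sint(int vm_fd, unsigned int gsi,
				     unsigned int vcpu, unsigned int sint)
	{
		struct kvm_irq_routing *r;
		int ret;

		r = calloc(1, sizeof(*r) + sizeof(r->entries[0]));
		if (!r)
			return -1;
		r->nr = 1;
		r->entries[0].gsi = gsi;
		r->entries[0].type = KVM_IRQ_ROUTING_HV_SINT;
		r->entries[0].u.hv_sint.vcpu = vcpu;
		r->entries[0].u.hv_sint.sint = sint;
		ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
		free(r);
		return ret;
	}
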
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
 #include "trace.h"
 #include "x86.h"
 #include "cpuid.h"
+#include "hyperv.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -567,7 +568,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static void apic_update_ppr(struct kvm_lapic *apic)
+static void apic_update_ppr(struct kvm_lapic *apic, bool make_req)
 {
 	u32 tpr, isrv, ppr, old_ppr;
 	int isr;
@@ -587,7 +588,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 
 	if (old_ppr != ppr) {
 		kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
-		if (ppr < old_ppr)
+		if (make_req && ppr < old_ppr)
 			kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 	}
 }
@@ -595,7 +596,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 {
 	kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 }
 
 static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
@@ -1060,7 +1061,10 @@ static int apic_set_eoi(struct kvm_lapic *apic)
 		return vector;
 
 	apic_clear_isr(vector, apic);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
+
+	if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+		kvm_hv_synic_send_eoi(apic->vcpu, vector);
 
 	kvm_ioapic_send_eoi(apic, vector);
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
@@ -1172,7 +1176,7 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		val = apic_get_tmcct(apic);
 		break;
 	case APIC_PROCPRI:
-		apic_update_ppr(apic);
+		apic_update_ppr(apic, true);
 		val = kvm_lapic_get_reg(apic, offset);
 		break;
 	case APIC_TASKPRI:
@@ -1692,14 +1696,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	u64 old_value = vcpu->arch.apic_base;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!apic) {
+	if (!apic)
 		value |= MSR_IA32_APICBASE_BSP;
-		vcpu->arch.apic_base = value;
-		return;
-	}
 
 	vcpu->arch.apic_base = value;
 
+	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+		kvm_update_cpuid(vcpu);
+
+	if (!apic)
+		return;
+
 	/* update jump label if enable bit changes */
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
 		if (value & MSR_IA32_APICBASE_ENABLE) {
@@ -1781,7 +1788,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_lapic_set_base(vcpu,
 				vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
 	vcpu->arch.pv_eoi.msr_val = 0;
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 
 	vcpu->arch.apic_arb_prio = 0;
 	vcpu->arch.apic_attention = 0;
@@ -1902,7 +1909,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	return -ENOMEM;
 }
 
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu, bool make_req)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
@@ -1910,7 +1917,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 	if (!apic_enabled(apic))
 		return -1;
 
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, make_req);
 	highest_irr = apic_find_highest_irr(apic);
 	if ((highest_irr == -1) ||
 	    ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI)))
@@ -1943,9 +1950,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	}
 }
 
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu, bool make_req)
 {
-	int vector = kvm_apic_has_interrupt(vcpu);
+	int vector = kvm_apic_has_interrupt(vcpu, make_req);
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (vector == -1)
@@ -1959,8 +1966,14 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 	 */
 
 	apic_set_isr(vector, apic);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 	apic_clear_irr(vector, apic);
+
+	if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+		apic_clear_isr(vector, apic);
+		apic_update_ppr(apic, true);
+	}
+
 	return vector;
 }
 
@@ -2008,7 +2021,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	recalculate_apic_map(vcpu->kvm);
 	kvm_apic_set_version(vcpu);
 
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 	hrtimer_cancel(&apic->lapic_timer.timer);
 	apic_update_lvtt(apic);
 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -52,9 +52,9 @@ struct dest_map;
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu, bool make_req);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu, bool make_req);
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -42,6 +42,7 @@
 #include <asm/io.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
+#include "trace.h"
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -3482,12 +3483,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
 		     kvm_event_needs_reinjection(vcpu)))
 		return false;
 
+	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+		return false;
+
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
@@ -3503,7 +3507,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	if (!async)
 		return false; /* *pfn has correct page already */
 
-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && kvm_can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3517,6 +3521,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	return false;
 }
 
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+				u64 fault_address, char *insn, int insn_len,
+				bool need_unprotect)
+{
+	int r = 1;
+
+	switch (vcpu->arch.apf.host_apf_reason) {
+	default:
+		trace_kvm_page_fault(fault_address, error_code);
+
+		if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
+		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+				insn_len);
+		break;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		vcpu->arch.apf.host_apf_reason = 0;
+		local_irq_disable();
+		kvm_async_pf_task_wait(fault_address);
+		local_irq_enable();
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		vcpu->arch.apf.host_apf_reason = 0;
+		local_irq_disable();
+		kvm_async_pf_task_wake(fault_address);
+		local_irq_enable();
+		break;
+	}
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+
 static bool
 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
 {
@@ -3639,13 +3675,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 static inline bool is_last_gpte(struct kvm_mmu *mmu,
 				unsigned level, unsigned gpte)
 {
-	/*
-	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
-	 */
-	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
 	/*
 	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
 	 * If it is clear, there are no large pages at this level, so clear
@@ -3653,6 +3682,13 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 	 */
 	gpte &= level - mmu->last_nonleaf_level;
 
+	/*
+	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 */
+	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
 	return gpte & PT_PAGE_SIZE_MASK;
 }
 
@@ -4082,6 +4118,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 	context->direct_map = false;
 
 	update_permission_bitmask(vcpu, context, true);
+	update_last_nonleaf_level(vcpu, context);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
@@ -4899,13 +4936,12 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 	}
 }
 
-static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
 	int nr_to_scan = sc->nr_to_scan;
-
-	if (nr_to_scan == 0)
-		goto out;
+	unsigned long freed = 0;
 
 	spin_lock(&kvm_lock);
 
@@ -4940,25 +4976,37 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 			goto unlock;
 		}
 
-		prepare_zap_oldest_mmu_page(kvm, &invalid_list);
+		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+			freed++;
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
 unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
+		/*
+		 * unfair on small ones
+		 * per-vm shrinkers cry out
+		 * sadness comes quickly
+		 */
 		list_move_tail(&kvm->vm_list, &vm_list);
 		break;
 	}
 
 	spin_unlock(&kvm_lock);
+	return freed;
 
-out:
+}
+
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
-	.shrink = mmu_shrink,
+	.count_objects = mmu_shrink_count,
+	.scan_objects = mmu_shrink_scan,
 	.seeks = DEFAULT_SEEKS * 10,
 };
 
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -74,6 +74,10 @@ enum {
 int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+				u64 fault_address, char *insn, int insn_len,
+				bool need_unprotect);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -307,10 +307,11 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		--walker->level;
 
 		index = PT_INDEX(addr, walker->level);
-
 		table_gfn = gpte_to_gfn(pte);
 		offset    = index * sizeof(pt_element_t);
 		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+
+		BUG_ON(walker->level < 1);
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -175,7 +175,6 @@ struct vcpu_svm {
 
 	unsigned int3_injected;
 	unsigned long int3_rip;
-	u32 apf_reason;
 
 	/* cached guest cpuid flags for faster access */
 	bool nrips_enabled	: 1;
@@ -1920,34 +1919,11 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 static int pf_interception(struct vcpu_svm *svm)
 {
 	u64 fault_address = svm->vmcb->control.exit_info_2;
-	u32 error_code;
-	int r = 1;
+	u64 error_code = svm->vmcb->control.exit_info_1;
 
-	switch (svm->apf_reason) {
-	default:
-		error_code = svm->vmcb->control.exit_info_1;
-
-		trace_kvm_page_fault(fault_address, error_code);
-		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
-			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
-		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
 			svm->vmcb->control.insn_bytes,
-			svm->vmcb->control.insn_len);
-		break;
-	case KVM_PV_REASON_PAGE_NOT_PRESENT:
-		svm->apf_reason = 0;
-		local_irq_disable();
-		kvm_async_pf_task_wait(fault_address);
-		local_irq_enable();
-		break;
-	case KVM_PV_REASON_PAGE_READY:
-		svm->apf_reason = 0;
-		local_irq_disable();
-		kvm_async_pf_task_wake(fault_address);
-		local_irq_enable();
-		break;
-	}
-	return r;
+			svm->vmcb->control.insn_len, !npt_enabled);
 }
 
 static int db_interception(struct vcpu_svm *svm)
@@ -2152,8 +2128,7 @@ static int halt_interception(struct vcpu_svm *svm)
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	kvm_emulate_hypercall(&svm->vcpu);
-	return 1;
+	return kvm_emulate_hypercall(&svm->vcpu);
 }
 
 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
@@ -2258,15 +2233,19 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	if (!is_guest_mode(&svm->vcpu))
 		return 0;
 
+	vmexit = nested_svm_intercept(svm);
+	if (vmexit != NESTED_EXIT_DONE)
+		return 0;
+
 	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
 	svm->vmcb->control.exit_code_hi = 0;
 	svm->vmcb->control.exit_info_1 = error_code;
-	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
-	vmexit = nested_svm_intercept(svm);
-	if (vmexit == NESTED_EXIT_DONE)
-		svm->nested.exit_required = true;
+	if (svm->vcpu.arch.exception.nested_apf)
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+	else
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
+	svm->nested.exit_required = true;
 	return vmexit;
 }
 
@@ -2416,7 +2395,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 		break;
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
 		/* When we're shadowing, trap PFs, but not async PF */
-		if (!npt_enabled && svm->apf_reason == 0)
+		if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
 			return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -2462,7 +2441,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 			vmexit = NESTED_EXIT_DONE;
 		/* async page fault always cause vmexit */
 		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
-			 svm->apf_reason != 0)
+			 svm->vcpu.arch.exception.nested_apf != 0)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
@@ -4566,7 +4545,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	/* if exit due to PF check for async PF */
 	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-		svm->apf_reason = kvm_read_and_reset_pf_reason();
+		svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 
 	if (npt_enabled) {
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1086,6 +1086,269 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
 		  __entry->vec)
 );
 
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+	TP_PROTO(int vcpu_id, u32 sint),
+	TP_ARGS(vcpu_id, sint),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, sint)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->sint = sint;
+	),
+
+	TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+	TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+	TP_ARGS(vcpu_id, sint, vector, ret),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, sint)
+		__field(int, vector)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->sint = sint;
+		__entry->vector = vector;
+		__entry->ret = ret;
+	),
+
+	TP_printk("vcpu_id %d sint %u vector %d ret %d",
+		  __entry->vcpu_id, __entry->sint, __entry->vector,
+		  __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+	TP_PROTO(int vcpu_id, int vector),
+	TP_ARGS(vcpu_id, vector),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, vector)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->vector	= vector;
+	),
+
+	TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+	TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+	TP_ARGS(vcpu_id, msr, data, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, msr)
+		__field(u64, data)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->msr = msr;
+		__entry->data = data;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+		  __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+	TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+	TP_ARGS(vcpu_id, timer_index, config, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, config)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->config = config;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+		  __entry->vcpu_id, __entry->timer_index, __entry->config,
+		  __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+	TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+	TP_ARGS(vcpu_id, timer_index, count, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, count)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->count = count;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d timer %d count %llu host %d",
+		  __entry->vcpu_id, __entry->timer_index, __entry->count,
+		  __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+	TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+	TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, time_now)
+		__field(u64, exp_time)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->time_now = time_now;
+		__entry->exp_time = exp_time;
+	),
+
+	TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+		  __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+		  __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+	TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+	TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, time_now)
+		__field(u64, count)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->time_now = time_now;
+		__entry->count = count;
+	),
+
+	TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+		  __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+		  __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+	TP_PROTO(int vcpu_id, int timer_index),
+	TP_ARGS(vcpu_id, timer_index),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+	),
+
+	TP_printk("vcpu_id %d timer %d",
+		  __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+	TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+	TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(int, msg_send_result)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->msg_send_result = msg_send_result;
+	),
+
+	TP_printk("vcpu_id %d timer %d msg send result %d",
+		  __entry->vcpu_id, __entry->timer_index,
+		  __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+	TP_PROTO(int vcpu_id, int timer_index),
+	TP_ARGS(vcpu_id, timer_index),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+	),
+
+	TP_printk("vcpu_id %d timer %d",
+		  __entry->vcpu_id, __entry->timer_index)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2284,14 +2284,26 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
  */
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	unsigned int nr = vcpu->arch.exception.nr;
 
-	if (!(vmcs12->exception_bitmap & (1u << nr)))
+	if (!((vmcs12->exception_bitmap & (1u << nr)) ||
+		(nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
 		return 0;
 
-	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+	if (vcpu->arch.exception.nested_apf) {
+		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+			PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+			INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+			vcpu->arch.apf.nested_apf_token);
+		return 1;
+	}
+
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 			  vmcs_read32(VM_EXIT_INTR_INFO),
 			  vmcs_readl(EXIT_QUALIFICATION));
 	return 1;
@@ -2305,7 +2317,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
 	if (!reinject && is_guest_mode(vcpu) &&
-	    nested_vmx_check_exception(vcpu, nr))
+	    nested_vmx_check_exception(vcpu))
 		return;
 
 	if (has_error_code) {
@@ -2360,7 +2372,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
 	if (is_guest_mode(vcpu))
 		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
-	else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
+	else if (cpu_has_secondary_exec_ctrls() &&
+		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 		else
@@ -2497,7 +2511,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	 * reason is that if one of these bits is necessary, it will appear
 	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
 	 * fields of vmcs01 and vmcs02, will turn these bits off - and
-	 * nested_vmx_exit_handled() will not pass related exits to L1.
+	 * nested_vmx_exit_reflected() will not pass related exits to L1.
 	 * These rules have exceptions below.
 	 */
 
@@ -4695,6 +4709,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	if (cpu_has_secondary_exec_ctrls()) {
+		if (kvm_vcpu_apicv_active(vcpu))
+			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+		else
+			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+					SECONDARY_EXEC_APIC_REGISTER_VIRT |
+					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_set_msr_bitmap(vcpu);
 }
 
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5277,14 +5304,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	}
 
 	if (is_page_fault(intr_info)) {
-		/* EPT won't cause page fault directly */
-		BUG_ON(enable_ept);
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
-		trace_kvm_page_fault(cr2, error_code);
-
-		if (kvm_event_needs_reinjection(vcpu))
-			kvm_mmu_unprotect_page_virt(vcpu, cr2);
-		return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+		/* EPT won't cause page fault directly */
+		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
+		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
+				true);
 	}
 
 	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -5704,8 +5728,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-	kvm_emulate_hypercall(vcpu);
-	return 1;
+	return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6135,12 +6158,7 @@ static __init int hardware_setup(void)
 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
-	/*
-	 * Allow direct access to the PC debug port (it is often used for I/O
-	 * delays, but the vmexits simply slow things down).
-	 */
 	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-	clear_bit(0x80, vmx_io_bitmap_a);
 
 	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
@@ -6221,23 +6239,20 @@ static __init int hardware_setup(void)
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	if (enable_apicv) {
-		for (msr = 0x800; msr <= 0x8ff; msr++)
-			vmx_disable_intercept_msr_read_x2apic(msr);
-
-		/* According SDM, in x2apic mode, the whole id reg is used.
-		 * But in KVM, it only use the highest eight bits. Need to
-		 * intercept it */
-		vmx_enable_intercept_msr_read_x2apic(0x802);
-		/* TMCCT */
-		vmx_enable_intercept_msr_read_x2apic(0x839);
-		/* TPR */
-		vmx_disable_intercept_msr_write_x2apic(0x808);
-		/* EOI */
-		vmx_disable_intercept_msr_write_x2apic(0x80b);
-		/* SELF-IPI */
-		vmx_disable_intercept_msr_write_x2apic(0x83f);
-	}
+	for (msr = 0x800; msr <= 0x8ff; msr++)
+		vmx_disable_intercept_msr_read_x2apic(msr);
+
+	/* According to the SDM, in x2apic mode the whole id reg is used.
+	 * But KVM only uses the highest eight bits. Need to intercept it */
+	vmx_enable_intercept_msr_read_x2apic(0x802);
+	/* TMCCT */
+	vmx_enable_intercept_msr_read_x2apic(0x839);
+	/* TPR */
+	vmx_disable_intercept_msr_write_x2apic(0x808);
+	/* EOI */
+	vmx_disable_intercept_msr_write_x2apic(0x80b);
+	/* SELF-IPI */
+	vmx_disable_intercept_msr_write_x2apic(0x83f);
 
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull,
@@ -7707,12 +7722,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  * should handle it ourselves in L0 (and then continue L2). Only call this
  * when in is_guest_mode (L2).
  */
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 exit_reason = vmx->exit_reason;
 
 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
 				vmcs_readl(EXIT_QUALIFICATION),
@@ -7735,7 +7749,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		if (is_nmi(intr_info))
 			return false;
 		else if (is_page_fault(intr_info))
-			return enable_ept;
+			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
 		else if (is_no_device(intr_info) &&
 			 !(vmcs12->guest_cr0 & X86_CR0_TS))
 			return false;
@@ -7759,8 +7773,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_TASK_SWITCH:
 		return true;
 	case EXIT_REASON_CPUID:
-		if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
-			return false;
 		return true;
 	case EXIT_REASON_HLT:
 		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -7845,6 +7857,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/*
+	 * At this point, the exit interruption info in exit_intr_info
+	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
+	 * we need to query the in-kernel LAPIC.
+	 */
+	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+	if ((exit_intr_info &
+	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->vm_exit_intr_error_code =
+			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	}
+
+	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+			  vmcs_readl(EXIT_QUALIFICATION));
+	return 1;
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
 	*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8089,12 +8124,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required)
 		return handle_invalid_guest_state(vcpu);
 
-	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu, exit_reason,
-				  vmcs_read32(VM_EXIT_INTR_INFO),
-				  vmcs_readl(EXIT_QUALIFICATION));
-		return 1;
-	}
+	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		dump_vmcs();
@@ -8320,6 +8351,10 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	exit_intr_info = vmx->exit_intr_info;
 
+	/* if exit due to PF check for async PF */
+	if (is_page_fault(exit_intr_info))
+		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+
 	/* Handle machine checks before interrupts are enabled */
 	if (is_machine_check(exit_intr_info))
 		kvm_machine_check();
@@ -9086,12 +9121,14 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	WARN_ON(!is_guest_mode(vcpu));
 
-	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
+	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+		vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
 				  vmcs_read32(VM_EXIT_INTR_INFO),
 				  vmcs_readl(EXIT_QUALIFICATION));
-	else
+	} else {
 		kvm_inject_page_fault(vcpu, fault);
+	}
 }
 
 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
@@ -9722,6 +9759,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 				page_to_phys(vmx->nested.virtual_apic_page));
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	} else {
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+				CPU_BASED_CR8_STORE_EXITING;
+#endif
 	}
 
 	if (cpu_has_vmx_msr_bitmap() &&
@@ -10324,13 +10366,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
 	vmcs12->vm_exit_reason = exit_reason;
 	vmcs12->exit_qualification = exit_qualification;
-
 	vmcs12->vm_exit_intr_info = exit_intr_info;
-	if ((vmcs12->vm_exit_intr_info &
-	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
-		vmcs12->vm_exit_intr_error_code =
-			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
 	vmcs12->idt_vectoring_info_field = 0;
 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -10529,7 +10566,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
 	    && nested_exit_intr_ack_set(vcpu)) {
-		int irq = kvm_cpu_get_interrupt(vcpu);
+		int irq = kvm_cpu_get_interrupt(vcpu, true);
 		WARN_ON(irq < 0);
 		vmcs12->vm_exit_intr_info = irq |
 			INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
@@ -10810,7 +10847,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
-	int idx, ret = -EINVAL;
+	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) ||
 		!irq_remapping_cap(IRQ_POSTING_CAP))
@@ -10818,7 +10855,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-	BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+	if (guest_irq >= irq_rt->nr_rt_entries ||
+	    hlist_empty(&irq_rt->map[guest_irq])) {
+		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+			     guest_irq, irq_rt->nr_rt_entries);
+		goto out;
+	}
 
 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 		if (e->type != KVM_IRQ_ROUTING_MSI)
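
Both the VMX path above and the SVM path earlier now stash the host-side async-PF reason (via kvm_read_and_reset_pf_reason()) and funnel guest page faults through kvm_handle_page_fault(). On the guest side the feature, including the nested "deliver as #PF vmexit" behaviour gated by bit 2, is enabled by writing a 64-byte-aligned GPA plus flag bits to MSR_KVM_ASYNC_PF_EN. A hedged guest-kernel sketch (not code from this patch; it assumes the uapi constants from asm/kvm_para.h):

	#include <asm/kvm_para.h>
	#include <asm/msr.h>

	static void enable_async_pf(u64 reason_area_gpa)	/* 64-byte aligned */
	{
		u64 val = reason_area_gpa
			| KVM_ASYNC_PF_ENABLED			/* bit 0 */
			| KVM_ASYNC_PF_SEND_ALWAYS		/* bit 1 */
			| KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;	/* bit 2 */

		wrmsrl(MSR_KVM_ASYNC_PF_EN, val);
	}
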
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -444,7 +444,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = fault->address;
+	vcpu->arch.exception.nested_apf =
+		is_guest_mode(vcpu) && fault->async_page_fault;
+	if (vcpu->arch.exception.nested_apf)
+		vcpu->arch.apf.nested_apf_token = fault->address;
+	else
+		vcpu->arch.cr2 = fault->address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
@@ -982,6 +987,11 @@ static u32 emulated_msrs[] = {
 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+	HV_X64_MSR_RESET,
+	HV_X64_MSR_VP_INDEX,
+	HV_X64_MSR_VP_RUNTIME,
+	HV_X64_MSR_SCONTROL,
+	HV_X64_MSR_STIMER0_CONFIG,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_KVM_PV_EOI_EN,
 
@@ -1415,10 +1425,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 usdiff;
 	bool matched;
 	bool already_matched;
 	u64 data = msr->data;
+	bool synchronizing = false;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
@@ -1426,51 +1436,32 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
-		int faulted = 0;
-
-		/* n.b - signed multiplication and division required */
-		usdiff = data - kvm->arch.last_tsc_write;
-#ifdef CONFIG_X86_64
-		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
-#else
-		/* do_div() only does unsigned */
-		asm("1: idivl %[divisor]\n"
-		    "2: xor %%edx, %%edx\n"
-		    "   movl $0, %[faulted]\n"
-		    "3:\n"
-		    ".section .fixup,\"ax\"\n"
-		    "4: movl $1, %[faulted]\n"
-		    "   jmp  3b\n"
-		    ".previous\n"
-
-		_ASM_EXTABLE(1b, 4b)
-
-		: "=A"(usdiff), [faulted] "=r" (faulted)
-		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
-
-#endif
-		do_div(elapsed, 1000);
-		usdiff -= elapsed;
-		if (usdiff < 0)
-			usdiff = -usdiff;
-
-		/* idivl overflow => difference is larger than USEC_PER_SEC */
-		if (faulted)
-			usdiff = USEC_PER_SEC;
-	} else
-		usdiff = USEC_PER_SEC; /* disable TSC match window below */
+		if ((data == 0) && msr->host_initiated) {
+			/*
+			 * detection of vcpu initialization -- need to sync with other vCPUs
+			 * particularly helps to keep kvm_clock stable after CPU hotplug
+			 */
+			synchronizing = true;
+		} else {
+			u64 tsc_exp = kvm->arch.last_tsc_write +
+						nsec_to_cycles(vcpu, elapsed);
+			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+			/*
+			 * Special case: TSC write with a small delta (1 second) of virtual
+			 * cycle time against real time is interpreted as an attempt to
+			 * synchronize the CPU.
+			 */
+			synchronizing = data < tsc_exp + tsc_hz && data > tsc_exp - tsc_hz;
+		}
+	}
 
 	/*
-	 * Special case: TSC write with a small delta (1 second) of virtual
-	 * cycle time against real time is interpreted as an attempt to
-	 * synchronize the CPU.
-         *
 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
 	 * TSC, we add elapsed time in this computation.  We could let the
 	 * compensation code attempt to catch up if we fall behind, but
 	 * it's better to try to match offsets from the beginning.
          */
-	if (usdiff < USEC_PER_SEC &&
+	if (synchronizing &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.cur_tsc_offset;
@@ -1769,6 +1760,60 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	return ret;
 }
 
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, this_tsc_khz, tgt_tsc_khz;
@@ -1776,7 +1821,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;
 
@@ -1830,8 +1874,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
+	/* With all the info we got, fill in the values */
 
 	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
 		tgt_tsc_khz = kvm_has_tsc_control ?
@@ -1842,64 +1885,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
 	/* If the host uses TSC clocksource, then it is stable */
+	pvclock_flags = 0;
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
-
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
@@ -2031,8 +2031,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
 	gpa_t gpa = data & ~0x3f;
 
-	/* Bits 2:5 are reserved, Should be zero */
-	if (data & 0x3c)
+	/* Bits 3:5 are reserved, Should be zero */
+	if (data & 0x38)
 		return 1;
 
 	vcpu->arch.apf.msr_val = data;
@@ -2048,6 +2048,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 		return 1;
 
 	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
 	kvm_async_pf_wakeup_all(vcpu);
 	return 0;
 }
@@ -2265,6 +2266,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 	case HV_X64_MSR_CRASH_CTL:
+	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
 		return kvm_hv_set_msr_common(vcpu, msr, data,
 					     msr_info->host_initiated);
 	case MSR_IA32_BBL_CR_CTL3:
@@ -2471,6 +2473,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 	case HV_X64_MSR_CRASH_CTL:
+	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
 		return kvm_hv_get_msr_common(vcpu,
 					     msr_info->index, &msr_info->data);
 		break;
@@ -2609,6 +2612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV:
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
+	case KVM_CAP_HYPERV_SYNIC:
+	case KVM_CAP_HYPERV_SYNIC2:
+	case KVM_CAP_HYPERV_VP_INDEX:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -3058,13 +3064,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
 	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-		u32 hflags = vcpu->arch.hflags;
 		if (events->smi.smm)
-			hflags |= HF_SMM_MASK;
+			vcpu->arch.hflags |= HF_SMM_MASK;
 		else
-			hflags &= ~HF_SMM_MASK;
-		kvm_set_hflags(vcpu, hflags);
-
+			vcpu->arch.hflags &= ~HF_SMM_MASK;
 		vcpu->arch.smi_pending = events->smi.pending;
 		if (events->smi.smm_inside_nmi)
 			vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
@@ -3285,6 +3288,26 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+				     struct kvm_enable_cap *cap)
+{
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_HYPERV_SYNIC2:
+		if (cap->args[0])
+			return -EINVAL;
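+		/* fall through */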
+	case KVM_CAP_HYPERV_SYNIC:
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
+		return kvm_hv_activate_synic(vcpu, cap->cap ==
+					     KVM_CAP_HYPERV_SYNIC2);
+	default:
+		return -EINVAL;
+	}
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -3552,6 +3575,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_set_guest_paused(vcpu);
 		goto out;
 	}
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -6191,7 +6223,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 				return r;
 		}
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu, false),
 					    false);
 			kvm_x86_ops->set_irq(vcpu);
 		}
@@ -6465,6 +6497,9 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.apicv_active)
 			kvm_x86_ops->sync_pir_to_irr(vcpu);
 		kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+		bitmap_or((ulong*)vcpu->arch.eoi_exit_bitmap,
+			  (ulong*)vcpu->arch.eoi_exit_bitmap,
+			  vcpu_to_synic(vcpu)->vec_bitmap, 256);
 	}
 	kvm_x86_ops->load_eoi_exitmap(vcpu);
 }
@@ -6592,6 +6627,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 0;
 			goto out;
 		}
+		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+			r = 0;
+			goto out;
+		}
+		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+			vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+			r = 0;
+			goto out;
+		}
+
+		/*
+		 * KVM_REQ_HV_STIMER has to be processed after
+		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+		 * depend on the guest clock being up-to-date
+		 */
+		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+			kvm_hv_process_stimers(vcpu);
 	}
 
 	/*
@@ -6849,7 +6904,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 		}
 		if (need_resched()) {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-			cond_resched();
+			cond_resched_may_throttle();
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 		}
 	}
@@ -7465,6 +7520,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	struct msr_data msr;
 	struct kvm *kvm = vcpu->kvm;
 
+	kvm_hv_vcpu_postcreate(vcpu);
+
 	if (vcpu_load(vcpu))
 		return;
 	msr.data = 0x0;
@@ -7768,6 +7825,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pending_external_vector = -1;
 
+	kvm_hv_vcpu_init(vcpu);
+
 	return 0;
 fail_free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -7787,6 +7846,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
+	kvm_hv_vcpu_uninit(vcpu);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
@@ -8198,6 +8258,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	    kvm_cpu_has_interrupt(vcpu))
 		return true;
 
+	if (kvm_hv_has_stimer_pending(vcpu))
+		return true;
+
 	return false;
 }
 
@@ -8365,6 +8428,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 		fault.error_code = 0;
 		fault.nested_page_fault = false;
 		fault.address = work->arch.token;
+		fault.async_page_fault = true;
 		kvm_inject_page_fault(vcpu, &fault);
 	}
 }
@@ -8387,6 +8451,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 		fault.error_code = 0;
 		fault.nested_page_fault = false;
 		fault.address = work->arch.token;
+		fault.async_page_fault = true;
 		kvm_inject_page_fault(vcpu, &fault);
 	}
 	vcpu->arch.apf.halted = false;
@@ -8398,8 +8463,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
 		return true;
 	else
-		return !kvm_event_needs_reinjection(vcpu) &&
-			kvm_x86_ops->interrupt_allowed(vcpu);
+		return kvm_can_do_async_pf(vcpu);
 }
 
 void kvm_arch_start_assignment(struct kvm *kvm)
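
The kvm_setup_pvclock_page() helper factored out above keeps the pvclock version field odd while the payload is being rewritten and bumps it back to an even value afterwards, so a guest reading another vCPU's kvmclock can detect a torn update and retry. The matching reader loop is roughly the following (a hedged sketch of guest-kernel code, equivalent in spirit to the pvclock_read_begin()/pvclock_read_retry() helpers; not part of this patch):

	/* Snapshot one shared pvclock page consistently. */
	static struct pvclock_vcpu_time_info
	read_pvclock(const struct pvclock_vcpu_time_info *src)
	{
		struct pvclock_vcpu_time_info snap;
		u32 version;

		do {
			version = src->version;
			rmb();		/* read the version before the payload */
			snap = *src;
			rmb();		/* read the payload before the re-check */
		} while ((version & 1) || version != src->version);

		return snap;
	}
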
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,9 @@ OBJECT_FILES_NON_STANDARD_memmove_64.o		+= y
 OBJECT_FILES_NON_STANDARD_memset_64.o		+= y
 OBJECT_FILES_NON_STANDARD_rwlock.o		+= y
 
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o	:= n
+
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
 quiet_cmd_inat_tables = GEN     $@
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -53,6 +53,8 @@
 .Lmemcpy_e_e:
 	.previous
 
+.weak memcpy
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
@@ -199,8 +201,8 @@ ENDPROC(__memcpy)
 	 * only outcome...
 	 */
 	.section .altinstructions, "a"
-	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
 			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
 			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,7 +24,10 @@
  * Output:
  * rax: dest
  */
+.weak memmove
+
 ENTRY(memmove)
+ENTRY(__memmove)
 	CFI_STARTPROC
 
 	/* Handle more 32 bytes in loop */
@@ -220,4 +223,5 @@ ENTRY(memmove)
 		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
 		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
 	.previous
+ENDPROC(__memmove)
 ENDPROC(memmove)
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -56,6 +56,8 @@
 .Lmemset_e_e:
 	.previous
 
+.weak memset
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -147,8 +149,8 @@ ENDPROC(__memset)
          * feature to implement the right patch order.
 	 */
 	.section .altinstructions,"a"
-	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
-	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
+	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o	:= n
+
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o gup.o setup_nx.o
 
@@ -20,6 +23,9 @@ obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
 obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
 
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
+
 obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
 mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -716,14 +716,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+	ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
 
-	print_vma_addr(KERN_CONT " in ", regs->ip);
+	ve_print_vma_addr(VE_LOG, KERN_CONT " in ", regs->ip);
 
-	printk(KERN_CONT "\n");
+	ve_printk(VE_LOG, KERN_CONT "\n");
 }
 
 static void
@@ -953,7 +953,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	return ret;
 }
 
-int show_unhandled_signals = 1;
+int show_unhandled_signals = 0;
 
 static inline int
 access_error(unsigned long error_code, struct vm_area_struct *vma)
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -7,14 +7,17 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/ratelimit.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/err.h>
 #include <linux/sysctl.h>
+#include <linux/compat.h>
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
+#include <asm/elf.h>
 
 #if 0	/* This is just for testing */
 struct page *
@@ -83,8 +86,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
-	info.high_limit = TASK_SIZE;
+	info.low_limit = get_mmap_base(1);
+	info.high_limit = in_compat_syscall() ?
+		tasksize_32bit() : tasksize_64bit();
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
@@ -101,7 +105,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = get_mmap_base(0);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/ratelimit.h>
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,244 @@
+#define pr_fmt(fmt) "kasan: " fmt
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/mmu_context.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+/*
+ * This page is used as the early shadow. We don't use empty_zero_page
+ * at early stages because stack instrumentation could write some
+ * garbage to this page.
+ * Later we reuse it as the zero shadow for large ranges of memory
+ * that are allowed to be accessed but are not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+
+static int __init map_range(struct range *range)
+{
+	unsigned long start;
+	unsigned long end;
+
+	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+	/*
+	 * end + 1 here is intentional. We check several shadow bytes in advance
+	 * to slightly speed up the fast path. In some rare cases we could cross
+	 * the boundary of the mapped shadow, so we just map some more here.
+	 */
+	return vmemmap_populate(start, end + 1, pfn_to_nid(range->start));
+}
+
+static void __init clear_pgds(unsigned long start,
+			unsigned long end)
+{
+	for (; start < end; start += PGDIR_SIZE)
+		pgd_clear(pgd_offset_k(start));
+}
+
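+/*
+ * Point every PGD entry covering the KASAN shadow region at the shared
+ * kasan_zero_pud, so that early shadow accesses resolve to
+ * kasan_zero_page.
+ */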
+static void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+	int i;
+	unsigned long start = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+
+	for (i = pgd_index(start); start < end; i++) {
+		pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+				| _KERNPG_TABLE);
+		start += PGDIR_SIZE;
+	}
+}
+
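+/*
+ * The zero_*_populate() helpers below map the zero shadow over a range:
+ * naturally aligned PGD/PUD/PMD-sized chunks reuse the shared
+ * kasan_zero_* tables, while the unaligned remainder gets freshly
+ * allocated page tables whose PTEs map kasan_zero_page read-only.
+ */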
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+				unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	while (addr + PAGE_SIZE <= end) {
+		WARN_ON(!pte_none(*pte));
+		set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+					| __PAGE_KERNEL_RO));
+		addr += PAGE_SIZE;
+		pte = pte_offset_kernel(pmd, addr);
+	}
+	return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+		WARN_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+					| _KERNPG_TABLE));
+		addr += PMD_SIZE;
+		pmd = pmd_offset(pud, addr);
+	}
+	if (addr < end) {
+		if (pmd_none(*pmd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pte_populate(pmd, addr, end);
+	}
+	return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pud_t *pud = pud_offset(pgd, addr);
+
+	while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+		WARN_ON(!pud_none(*pud));
+		set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+					| _KERNPG_TABLE));
+		addr += PUD_SIZE;
+		pud = pud_offset(pgd, addr);
+	}
+
+	if (addr < end) {
+		if (pud_none(*pud)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pmd_populate(pud, addr, end);
+	}
+	return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+	int ret = 0;
+	pgd_t *pgd = pgd_offset_k(addr);
+
+	while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+		WARN_ON(!pgd_none(*pgd));
+		set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+					| _KERNPG_TABLE));
+		addr += PGDIR_SIZE;
+		pgd = pgd_offset_k(addr);
+	}
+
+	if (addr < end) {
+		if (pgd_none(*pgd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pud_populate(pgd, addr, end);
+	}
+	return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+	if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+		panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+			     unsigned long val,
+			     void *data)
+{
+	if (val == DIE_GPF) {
+		pr_emerg("CONFIG_KASAN_INLINE enabled\n");
+		pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+	.notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_early_init(void)
+{
+	int i;
+	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+	pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+	pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		kasan_zero_pte[i] = __pte(pte_val);
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		kasan_zero_pmd[i] = __pmd(pmd_val);
+
+	for (i = 0; i < PTRS_PER_PUD; i++)
+		kasan_zero_pud[i] = __pud(pud_val);
+
+	kasan_map_early_shadow(early_level4_pgt);
+	kasan_map_early_shadow(init_level4_pgt);
+}
+
+void __init kasan_init(void)
+{
+	int i;
+
+#ifdef CONFIG_KASAN_INLINE
+	register_die_notifier(&kasan_die_notifier);
+#endif
+
+	memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+	load_cr3(early_level4_pgt);
+	__flush_tlb_all();
+
+	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	populate_zero_shadow((void *)KASAN_SHADOW_START,
+			kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+	for (i = 0; i < E820_X_MAX; i++) {
+		if (pfn_mapped[i].end == 0)
+			break;
+
+		if (map_range(&pfn_mapped[i]))
+			panic("kasan: unable to allocate shadow!");
+	}
+	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+			(unsigned long)kasan_mem_to_shadow(_end),
+			0);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+			(void *)KASAN_SHADOW_END);
+
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+
+	load_cr3(init_level4_pgt);
+	__flush_tlb_all();
+	init_task.kasan_depth = 0;
+
+	pr_info("Kernel address sanitizer initialized\n");
+}
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,30 +29,44 @@
 #include <linux/random.h>
 #include <linux/limits.h>
 #include <linux/sched.h>
+#include <linux/compat.h>
 #include <asm/elf.h>
 
 struct __read_mostly va_alignment va_align = {
 	.flags = -1,
 };
 
-static unsigned long stack_maxrandom_size(void)
+unsigned long tasksize_32bit(void)
+{
+	return IA32_PAGE_OFFSET;
+}
+
+unsigned long tasksize_64bit(void)
+{
+	return TASK_SIZE_MAX;
+}
+
+static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
 	unsigned long max = 0;
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+		max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+		max <<= PAGE_SHIFT;
 	}
 
 	return max;
 }
 
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole with possible stack randomization.
- */
-#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
-#define MAX_GAP (TASK_SIZE/6*5)
+#ifdef CONFIG_COMPAT
+# define mmap32_rnd_bits  8
+# define mmap64_rnd_bits  28
+#else
+# define mmap32_rnd_bits  28
+# define mmap64_rnd_bits  28
+#endif
+
+#define SIZE_128M    (128 * 1024 * 1024UL)
 
 static int mmap_is_legacy(void)
 {
@@ -65,66 +79,93 @@ static int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-unsigned long arch_mmap_rnd(void)
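+/*
+ * Pick a page-aligned random mmap offset with up to @rndbits bits of
+ * entropy; returns 0 when randomization is disabled for the task.
+ */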
+static unsigned long arch_rnd(unsigned int rndbits)
 {
-	unsigned long rnd;
-
-	if (mmap_is_ia32())
-#ifdef CONFIG_COMPAT
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-#else
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-#endif
-	else
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+	if (!(current->flags & PF_RANDOMIZE))
+		return 0;
+	return ((unsigned long)get_random_int() &
+			((1UL << rndbits) - 1)) << PAGE_SHIFT;
+}
 
-	return rnd << PAGE_SHIFT;
+unsigned long arch_mmap_rnd(void)
+{
+	return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
 }
 
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
 {
 	unsigned long gap = rlimit(RLIMIT_STACK);
+	unsigned long gap_min, gap_max;
+
+	/*
+	 * Top of mmap area (just below the process stack).
+	 * Leave an at least ~128 MB hole with possible stack randomization.
+	 */
+	gap_min = SIZE_128M + stack_maxrandom_size(task_size);
+	gap_max = (task_size / 6) * 5;
 
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
+	if (gap < gap_min)
+		gap = gap_min;
+	else if (gap > gap_max)
+		gap = gap_max;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+	return PAGE_ALIGN(task_size - gap - rnd);
 }
 
-/*
- * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
- * does, but not when emulating X86_32
- */
-static unsigned long mmap_legacy_base(unsigned long rnd)
+static unsigned long mmap_legacy_base(unsigned long rnd,
+				      unsigned long task_size)
 {
-	if (mmap_is_ia32())
-		return TASK_UNMAPPED_BASE;
-	else
-		return TASK_UNMAPPED_BASE + rnd;
+	return __TASK_UNMAPPED_BASE(task_size) + rnd;
 }
 
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
  */
-void arch_pick_mmap_layout(struct mm_struct *mm)
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+		unsigned long random_factor, unsigned long task_size)
 {
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
+	*legacy_base = mmap_legacy_base(random_factor, task_size);
+	if (mmap_is_legacy())
+		*base = *legacy_base;
+	else
+		*base = mmap_base(random_factor, task_size);
+}
 
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
 	if (mmap_is_legacy()) {
-		mm->mmap_base = mm->mmap_legacy_base;
 		mm->get_unmapped_area = arch_get_unmapped_area;
 		mm->unmap_area = arch_unmap_area;
 	} else {
-		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
+
+	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+			arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	/*
+	 * The mmap syscall mapping base decision depends solely on the
+	 * syscall type (64-bit or compat). This applies for 64bit
+	 * applications and 32bit applications. The 64bit syscall uses
+	 * mmap_base, the compat syscall uses mmap_compat_base.
+	 */
+	arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+			arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+#endif
+}
+
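+/*
+ * Return the mmap base for the current syscall flavour: with
+ * CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES, compat syscalls get the compat
+ * base, native syscalls the native one.  @is_legacy selects the
+ * bottom-up (legacy) base instead of the top-down one.
+ */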
+unsigned long get_mmap_base(int is_legacy)
+{
+	struct mm_struct *mm = current->mm;
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	if (in_compat_syscall()) {
+		return is_legacy ? mm->mmap_compat_legacy_base
+				 : mm->mmap_compat_base;
+	}
+#endif
+	return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)__get_free_page(PGALLOC_GFP);
+	return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -78,8 +78,10 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #if PAGETABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
+	struct page *page = virt_to_page(pud);
+
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pud));
+	tlb_remove_page(tlb, page);
 }
 #endif	/* PAGETABLE_LEVELS > 3 */
 #endif	/* PAGETABLE_LEVELS > 2 */
@@ -202,13 +204,17 @@ static void free_pmds(pmd_t *pmds[])
 		}
 }
 
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
+	gfp_t gfp = PGALLOC_GFP;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
-		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
 		if (!pmd)
 			failed = true;
 		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
@@ -297,7 +303,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	mm->pgd = pgd;
 
-	if (preallocate_pmds(pmds) != 0)
+	if (preallocate_pmds(mm, pmds) != 0)
 		goto out_free_pgd;
 
 	if (paravirt_pgd_alloc(mm) != 0)
--- a/arch/x86/mm/track.c
+++ b/arch/x86/mm/track.c
@@ -73,9 +73,6 @@ static inline void harvest_clear_soft_dirty(struct vm_area_struct *vma,
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
-	} else if (pte_file(ptent)) {
-		ptent = pte_file_clear_soft_dirty(ptent);
-		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
 }
 
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -7,7 +7,7 @@
 #
 #
 OBJECT_FILES_NON_STANDARD	:= y
-
+KASAN_SANITIZE := n
 subdir- := rm
 
 obj-y += init.o
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,6 +7,10 @@
 #
 #
 OBJECT_FILES_NON_STANDARD	:= y
+KASAN_SANITIZE := n
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
 
 always := realmode.bin realmode.relocs
 
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
 110	i386	iopl			sys_iopl
 111	i386	vhangup			sys_vhangup
 112	i386	idle
-113	i386	vm86old			sys_vm86old			sys32_vm86_warning
+113	i386	vm86old			sys_vm86old			sys_ni_syscall
 114	i386	wait4			sys_wait4			compat_sys_wait4
 115	i386	swapoff			sys_swapoff
 116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
@@ -172,7 +172,7 @@
 163	i386	mremap			sys_mremap
 164	i386	setresuid		sys_setresuid16
 165	i386	getresuid		sys_getresuid16
-166	i386	vm86			sys_vm86			sys32_vm86_warning
+166	i386	vm86			sys_vm86			sys_ni_syscall
 167	i386	query_module
 168	i386	poll			sys_poll
 169	i386	nfsservctl
@@ -363,3 +363,8 @@
 356	i386	memfd_create		sys_memfd_create
 374	i386	userfaultfd		sys_userfaultfd
 377	i386	copy_file_range		sys_copy_file_range
+
+510	i386	getluid			sys_getluid
+511	i386	setluid			sys_setluid
+512	i386	setublimit		sys_setublimit			compat_sys_setublimit
+513	i386	ubstat			sys_ubstat			compat_sys_ubstat
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -329,6 +329,11 @@
 323	common	userfaultfd		sys_userfaultfd
 326	common	copy_file_range		sys_copy_file_range
 
+500	64	getluid			sys_getluid
+501	64	setluid			sys_setluid
+502	64	setublimit		sys_setublimit
+503	64	ubstat			sys_ubstat
+
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -184,7 +184,7 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
 # The DSO images are built using a special linker script.
 #
 quiet_cmd_vdso = VDSO    $@
-      cmd_vdso = $(CC) -nostdlib -o $@ \
+      cmd_vdso = $(CC) $(call cc-option, -fno-use-linker-plugin) -nostdlib -o $@ \
 		       $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) \
 		$(if $(AFTER_LINK),; $(AFTER_LINK)) && \
@@ -192,6 +192,8 @@ quiet_cmd_vdso = VDSO    $@
 
 VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 GCOV_PROFILE := n
+KASAN_SANITIZE := n
+KCOV_INSTRUMENT := n
 
 #
 # Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -27,6 +27,8 @@
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
+struct timespec VDSO64_ve_start_timespec;
+
 notrace static cycle_t vread_tsc(void)
 {
 	cycle_t ret = (cycle_t)rdtsc_ordered();
@@ -68,52 +70,47 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
 	return &pvti_base[offset];
 }
 
-static notrace cycle_t vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(int *mode)
 {
-	const struct pvclock_vsyscall_time_info *pvti;
-	cycle_t ret;
+	const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;
+	u64 ret;
 	u64 last;
 	u32 version;
-	u8 flags;
-	unsigned cpu, cpu1;
-
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
+	 * Note: The kernel and hypervisor must guarantee that cpu ID
+	 * number maps 1:1 to per-CPU pvclock time info.
+	 *
+	 * Because the hypervisor is entirely unaware of guest userspace
+	 * preemption, it cannot guarantee that per-CPU pvclock time
+	 * info is updated if the underlying CPU changes or that the
+	 * version is increased whenever the underlying CPU changes.
+	 *
+	 * On KVM, we are guaranteed that pvti updates for any vCPU are
+	 * atomic as seen by *all* vCPUs.  This is an even stronger
+	 * guarantee than we get with a normal seqlock.
 	 *
+	 * On Xen, we don't appear to have that guarantee, but Xen still
+	 * supplies a valid seqlock using the version field.
+	 *
+	 * We only do pvclock vdso timing at all if
+	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+	 * mean that all vCPUs have matching pvti and that the TSC is
+	 * synced, so we can just look at vCPU 0's pvti.
 	 */
-	do {
-		cpu = __getcpu() & VGETCPU_CPU_MASK;
-		/* TODO: We can put vcpu id into higher bits of pvti.version.
-		 * This will save a couple of cycles by getting rid of
-		 * __getcpu() calls (Gleb).
-		 */
 
-		pvti = get_pvti(cpu);
-		version = pvclock_read_begin(&pvti->pvti);
-		flags = pvti->pvti.flags;
-
-		ret = __pvclock_read_cycles(&pvti->pvti, rdtsc_ordered());
+	do {
+		version = pvclock_read_begin(pvti);
 
-		/*
-		 * Test we're still on the cpu as well as the version.
-		 * We could have been migrated just after the first
-		 * vgetcpu but before fetching the version, so we
-		 * wouldn't notice a version change.
-		 */
-		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-	} while (unlikely(cpu != cpu1 ||
-			 pvclock_read_retry(&pvti->pvti, version)));
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
 
-	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
-		*mode = VCLOCK_NONE;
+		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+	} while (pvclock_read_retry(pvti, version));
 
-	/* refer to tsc.c read_tsc() comment for rationale */
+	/* refer to vread_tsc() comment for rationale */
 	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
 
 	if (likely(ret >= last))
@@ -180,6 +177,43 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 	return mode;
 }
 
+notrace static struct timespec *get_ve_timespec(void)
+{
+	struct timespec *ret;
+	asm volatile ("lea VDSO64_ve_start_timespec(%%rip),%0\n": "=r"(ret));
+	return ret;
+}
+
+notrace static void vdso_set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		/*
+		 * The following asm() prevents the compiler from
+		 * optimising this loop into a modulo operation. See
+		 * also __iter_div_u64_rem() in include/linux/time.h
+		 */
+		asm("" : "+rm"(nsec));
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		asm("" : "+rm"(nsec));
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
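+/*
+ * Rebase a host monotonic timestamp onto the VE (container) clock by
+ * subtracting the VE start time stored in the vDSO.
+ */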
+static void monotonic_time_to_ve(struct timespec *ts)
+{
+	struct timespec *ve_timespec = get_ve_timespec();
+
+	vdso_set_normalized_timespec(ts,
+		ts->tv_sec - ve_timespec->tv_sec,
+		ts->tv_nsec - ve_timespec->tv_nsec);
+}
+
 notrace static int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq;
@@ -195,8 +229,9 @@ notrace static int do_monotonic(struct timespec *ts)
 		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	timespec_add_ns(ts, ns);
 
+	timespec_add_ns(ts, ns);
+	monotonic_time_to_ve(ts);
 	return mode;
 }
 
@@ -220,6 +255,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
 		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
+	monotonic_time_to_ve(ts);
 	return 0;
 }
 
--- a/arch/x86/vdso/vdso-note.S
+++ b/arch/x86/vdso/vdso-note.S
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+	.globl VDSO64_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+VDSO64_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -26,6 +26,10 @@
 #include <asm/vdso.h>
 #include <asm/proto.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 enum {
 	VDSO_DISABLED = 0,
 	VDSO_ENABLED = 1,
@@ -303,6 +307,164 @@ int __init sysenter_setup(void)
 	return 0;
 }
 
+static DEFINE_MUTEX(vdso32_mutex);
+
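+/*
+ * Return the vDSO32 pages to map for the current UTS namespace.  When
+ * the namespace claims a different kernel version, a private copy of
+ * the vDSO page is made with its linux_version_code patched; otherwise
+ * the shared vdso32_pages are reused.  Called with mmap_sem held for
+ * writing; the lock is dropped around the allocation.
+ */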
+static struct page **uts_prep_vdso_pages_locked(int map)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct mm_struct *mm = current->mm;
+	struct ve_struct *ve = get_exec_env();
+	struct page **pages = vdso32_pages;
+	int n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+	void *addr;
+
+	/*
+	 * Simply reuse vDSO pages if we can.
+	 */
+	if (uts_ns == &init_uts_ns)
+		return vdso32_pages;
+
+	/*
+	 * Dirty lockless hack. Strictly speaking
+	 * we need to return @p here if it's non-NULL,
+	 * but since there is only one possible transition
+	 * { =0 ; !=0 } we simply return @uts_ns->vdso32.pages
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso32.pages);
+	smp_read_barrier_depends();
+	if (p)
+		return uts_ns->vdso32.pages;
+
+	up_write(&mm->mmap_sem);
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If the version has not changed, simply reuse the
+		 * preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto out;
+#ifdef CONFIG_X86_32
+		else {
+			/*
+			 * Native x86-32 mode requires vDSO runtime
+			 * relocations to be applied, which is not supported
+			 * by the old vanilla kernels; moreover, even if it
+			 * were ported we would break compatibility with the
+			 * rhel5 vdso, which has its addresses hardcoded.
+			 * Thus simply warn about this problem and
+			 * continue execution without virtualization.
+			 * After all, i686 is pretty outdated nowadays.
+			 */
+			pr_warn_once("x86-32 vDSO virtualization is not supported.");
+			goto out;
+		}
+#endif
+	} else {
+		/*
+		 * If the admin passed a malformed string here,
+		 * warn once but keep working without using
+		 * vDSO virtualization at all. That is better
+		 * than bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto out;
+	}
+
+	mutex_lock(&vdso32_mutex);
+	if (uts_ns->vdso32.pages) {
+		pages = uts_ns->vdso32.pages;
+		goto out_unlock;
+	}
+
+	uts_ns->vdso32.nr_pages		= 1;
+	uts_ns->vdso32.size		= PAGE_SIZE;
+	uts_ns->vdso32.version_off	= (unsigned long)VDSO32_SYMBOL(0, linux_version_code);
+	new_pages			= kmalloc(sizeof(struct page *), GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	new_pages[0] = alloc_page(GFP_KERNEL);
+	if (!new_pages[0]) {
+		pr_err("Can't allocate page for VE %d\n", ve->veid);
+		kfree(new_pages);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	copy_page(page_address(new_pages[0]), page_address(vdso32_pages[0]));
+
+	addr = page_address(new_pages[0]);
+	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
+	smp_wmb();
+
+	pages = uts_ns->vdso32.pages = new_pages;
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+out_unlock:
+	mutex_unlock(&vdso32_mutex);
+out:
+	down_write(&mm->mmap_sem);
+	return pages;
+}
+
+void arch_remap(struct mm_struct *mm,
+		unsigned long old_start, unsigned long old_end,
+		unsigned long new_start, unsigned long new_end)
+{
+	/*
+	 * mremap() doesn't allow moving multiple vmas so we can limit the
+	 * check to old_start == vdso_base.
+	 */
+	if (old_start == (unsigned long)mm->context.vdso) {
+		if (WARN_ON_ONCE(current->mm != mm))
+			return;
+
+		mm->context.vdso = (void *)new_start;
+		current_thread_info()->sysenter_return =
+			VDSO32_SYMBOL(new_start, SYSENTER_RETURN);
+	}
+}
+
+/* Call under mm->mmap_sem */
+static int __arch_setup_additional_pages(unsigned long addr, bool compat)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	current->mm->context.vdso = (void *)addr;
+
+	if (compat_uses_vma || !compat) {
+		struct page **pages = uts_prep_vdso_pages_locked(compat);
+		if (IS_ERR(pages))
+			return PTR_ERR(pages);
+
+		/*
+		 * MAYWRITE to allow gdb to COW and set breakpoints
+		 */
+		ret = install_special_mapping(mm, addr, PAGE_SIZE,
+					      VM_READ|VM_EXEC|
+					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+					      pages);
+
+		if (ret)
+			return ret;
+	}
+
+	current_thread_info()->sysenter_return =
+		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+
+	return 0;
+}
+
 /* Setup a VMA at program startup for the vsyscall page */
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
@@ -337,35 +499,66 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
-	current->mm->context.vdso = (void *)addr;
+	ret = __arch_setup_additional_pages(addr, compat);
+	if (ret)
+		current->mm->context.vdso = NULL;
 
-	if (compat_uses_vma || !compat) {
-		/*
-		 * MAYWRITE to allow gdb to COW and set breakpoints
-		 */
-		ret = install_special_mapping(mm, addr, PAGE_SIZE,
-					      VM_READ|VM_EXEC|
-					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-					      vdso32_pages);
+up_fail:
 
-		if (ret)
-			goto up_fail;
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+#ifdef CONFIG_X86_64
+
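+/*
+ * Map the 32-bit vDSO at @req_addr.  Fails with -EEXIST if a vDSO or
+ * vvar mapping already exists and with -EFAULT if the requested
+ * address cannot be used; returns the mapped size on success.
+ */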
+int do_map_vdso_32(unsigned long req_addr)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long vdso_addr;
+	int ret;
+	bool compat;
+
+	if (vdso_enabled == VDSO_DISABLED)
+		return -ENOENT;
+
+	down_write(&mm->mmap_sem);
+
+	compat = (vdso_enabled == VDSO_COMPAT);
+	/* Maybe we could omit this check, but keep it for safety */
+	if (compat && req_addr != VDSO_HIGH_BASE) {
+		ret = -EFAULT;
+		goto up_fail;
 	}
 
-	current_thread_info()->sysenter_return =
-		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+	if (vdso_or_vvar_present(mm)) {
+		ret = -EEXIST;
+		goto up_fail;
+	}
 
-  up_fail:
+	vdso_addr = get_unmapped_area(NULL, req_addr, PAGE_SIZE, 0, 0);
+	if (IS_ERR_VALUE(vdso_addr)) {
+		ret = vdso_addr;
+		goto up_fail;
+	}
+
+	if (req_addr != vdso_addr) {
+		ret = -EFAULT;
+		goto up_fail;
+	}
+
+	ret = __arch_setup_additional_pages(req_addr, compat);
 	if (ret)
 		current->mm->context.vdso = NULL;
+	else
+		ret = ARRAY_SIZE(vdso32_pages) * PAGE_SIZE;
 
+up_fail:
 	up_write(&mm->mmap_sem);
 
 	return ret;
 }
 
-#ifdef CONFIG_X86_64
-
 subsys_initcall(sysenter_setup);
 
 #ifdef CONFIG_SYSCTL
--- a/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/vdso/vdso32/note.S
@@ -9,7 +9,9 @@
 /* Ideally this would use UTS_NAME, but using a quoted string here
    doesn't work. Remember to change this when changing the
    kernel's name. */
+	.globl linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
 
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -35,3 +35,4 @@ VDSO32_PRELINK		= VDSO_PRELINK;
 VDSO32_vsyscall		= __kernel_vsyscall;
 VDSO32_sigreturn	= __kernel_sigreturn;
 VDSO32_rt_sigreturn	= __kernel_rt_sigreturn;
+VDSO32_linux_version_code = linux_version_code;
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,6 +16,10 @@
 #include <asm/vdso.h>
 #include <asm/page.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
@@ -111,6 +115,12 @@ static int __init init_vdso(void)
 		vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
 #endif
 
+	init_uts_ns.vdso.addr		= vdso_start;
+	init_uts_ns.vdso.pages		= vdso_pages;
+	init_uts_ns.vdso.nr_pages	= npages;
+	init_uts_ns.vdso.size		= vdso_size;
+	init_uts_ns.vdso.version_off	= (unsigned long)VDSO64_SYMBOL(0, linux_version_code);
+
 	return 0;
 }
 subsys_initcall(init_vdso);
@@ -161,28 +171,52 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 	return addr;
 }
 
+bool vdso_or_vvar_present(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		if (vma_is_vdso_or_vvar(vma, mm))
+			return true;
+	return false;
+}
+
 /* Setup a VMA at program startup for the vsyscall page.
    Not called for compat tasks */
 static int setup_additional_pages(struct linux_binprm *bprm,
 				  int uses_interp,
 				  struct page **pages,
-				  unsigned size)
+				  unsigned size,
+				  unsigned long req_addr)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long addr;
+	unsigned long addr = req_addr;
 	int ret;
 
 	if (!vdso_enabled)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, size);
+
+	if (vdso_or_vvar_present(mm)) {
+		ret = -EEXIST;
+		goto up_fail;
+	}
+
+	if (!req_addr)
+		addr = vdso_addr(mm->start_stack, size);
+
 	addr = get_unmapped_area(NULL, addr, size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
 	}
 
+	if (req_addr && req_addr != addr) {
+		ret = -EFAULT;
+		goto up_fail;
+	}
+
 	current->mm->context.vdso = (void *)addr;
 
 	ret = install_special_mapping(mm, addr, size,
@@ -199,17 +233,126 @@ static int setup_additional_pages(struct linux_binprm *bprm,
 	return ret;
 }
 
+static DEFINE_MUTEX(vdso_mutex);
+
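+/*
+ * Map the 64-bit vDSO for the current UTS namespace.  For a non-init
+ * UTS namespace a private copy of the vDSO pages is made, with
+ * linux_version_code and the VE start timespec patched in, and the
+ * copy is cached in the namespace for reuse.
+ */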
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm,
+		int uses_interp, unsigned long addr)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct ve_struct *ve = get_exec_env();
+	int i, n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+
+	/*
+	 * For the host node, or when the UTS namespace has not been
+	 * changed, simply map the preallocated original vDSO.
+	 *
+	 * In turn, if we have already allocated one for this UTS
+	 * namespace, simply reuse it. This improves speed significantly.
+	 */
+	if (uts_ns == &init_uts_ns)
+		goto map_init_uts;
+	/*
+	 * Dirty lockless hack. Strictly speaking
+	 * we need to return @p here if it's non-NULL,
+	 * but since there is only one possible transition
+	 * { =0 ; !=0 } we simply use @uts_ns->vdso.pages
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso.pages);
+	smp_read_barrier_depends();
+	if (p)
+		goto map_uts;
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If the version has not changed, simply reuse the
+		 * preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+	} else {
+		/*
+		 * If the admin passed a malformed string here,
+		 * warn once but keep working without using
+		 * vDSO virtualization at all. That is better
+		 * than bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Using host's uts name.\n");
+		new_version = LINUX_VERSION_CODE;
+	}
+
+	mutex_lock(&vdso_mutex);
+	if (uts_ns->vdso.pages) {
+		mutex_unlock(&vdso_mutex);
+		goto map_uts;
+	}
+
+	uts_ns->vdso.nr_pages	= init_uts_ns.vdso.nr_pages;
+	uts_ns->vdso.size	= init_uts_ns.vdso.size;
+	uts_ns->vdso.version_off = init_uts_ns.vdso.version_off;
+	new_pages		= kmalloc(sizeof(struct page *) * init_uts_ns.vdso.nr_pages, GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		goto out_unlock;
+	}
+
+	for (i = 0; i < uts_ns->vdso.nr_pages; i++) {
+		struct page *p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			pr_err("Can't allocate page for VE %d\n", ve->veid);
+			for (; i > 0; i--)
+				put_page(new_pages[i - 1]);
+			kfree(new_pages);
+			goto out_unlock;
+		}
+		new_pages[i] = p;
+		copy_page(page_address(p), page_address(init_uts_ns.vdso.pages[i]));
+	}
+
+	uts_ns->vdso.addr = vmap(new_pages, uts_ns->vdso.nr_pages, 0, PAGE_KERNEL);
+	if (!uts_ns->vdso.addr) {
+		pr_err("Can't map vDSO pages for VE %d\n", ve->veid);
+		for (i = 0; i < uts_ns->vdso.nr_pages; i++)
+			put_page(new_pages[i]);
+		kfree(new_pages);
+		goto out_unlock;
+	}
+
+	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	*((struct timespec*)(VDSO64_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
+	smp_wmb();
+	uts_ns->vdso.pages = new_pages;
+	mutex_unlock(&vdso_mutex);
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+map_uts:
+	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages,
+		uts_ns->vdso.size, addr);
+map_init_uts:
+	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages,
+		init_uts_ns.vdso.size, addr);
+out_unlock:
+	mutex_unlock(&vdso_mutex);
+	return -ENOMEM;
+}
+
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages(bprm, uses_interp, vdso_pages,
-				      vdso_size);
+	return uts_arch_setup_additional_pages(bprm, uses_interp, 0);
+}
+
+int do_map_vdso_64(unsigned long req_addr)
+{
+	return uts_arch_setup_additional_pages(0, 0, req_addr);
 }
 
 #ifdef CONFIG_X86_X32_ABI
 int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
 	return setup_additional_pages(bprm, uses_interp, vdsox32_pages,
-				      vdsox32_size);
+				      vdsox32_size, 0);
 }
 #endif
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -48,6 +48,7 @@
 #include <linux/memblock.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
+#include <linux/ratelimit.h>
 
 #include <trace/events/xen.h>
 
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -91,7 +91,7 @@ static u8 zero_stats;
 static inline void check_zero(void)
 {
 	u8 ret;
-	u8 old = ACCESS_ONCE(zero_stats);
+	u8 old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -159,6 +159,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
 	int cpu = smp_processor_id();
 	u64 start;
+	__ticket_t head;
 	unsigned long flags;
 
 	/* If kicker interrupts not initialized yet, just spin */
@@ -206,11 +207,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic, does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -251,8 +256,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 		const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
 		/* Make sure we read lock before want */
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == next) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == next) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
 			break;
--- a/arch/xtensa/kernel/syscall.c
+++ b/arch/xtensa/kernel/syscall.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		/* At this point:  (!vmm || addr < vmm->vm_end). */
 		if (TASK_SIZE - len < addr)
 			return -ENOMEM;
-		if (!vmm || addr + len <= vmm->vm_start)
+		if (!vmm || addr + len <= vm_start_gap(vmm))
 			return addr;
 		addr = vmm->vm_end;
 		if (flags & MAP_SHARED)
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -118,6 +118,14 @@ source "block/partitions/Kconfig"
 
 endmenu
 
+config BLK_DEV_CBT
+	bool "Block layer changed block tracking support"
+	---help---
+	Block layer changed block tracking (CBT) support. It can be used to
+	optimize device backup and copy.
+
+	If unsure, say N.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
+obj-$(CONFIG_BLK_DEV_CBT)	+= blk-cbt.o
--- /dev/null
+++ b/block/blk-cbt.c
@@ -0,0 +1,855 @@
+/*
+ *  block/blk-cbt.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define CBT_MAX_EXTENTS	512
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+
+#define CBT_PAGE_MISSED (struct page *)(0x1)
+#define CBT_PAGE(cbt, idx) (cbt->map[idx] == CBT_PAGE_MISSED ? \
+			    NULL : cbt->map[idx])
+
+static __cacheline_aligned_in_smp DEFINE_MUTEX(cbt_mutex);
+
+struct cbt_extent {
+	blkcnt_t start;
+	blkcnt_t len;
+};
+
+struct cbt_info {
+	__u8 	 uuid[16];
+	struct request_queue *queue;
+	blkcnt_t block_max;
+	blkcnt_t block_bits;
+	unsigned long flags;
+
+	struct rcu_head rcu;
+	unsigned int count;
+	struct cbt_extent __percpu *cache;
+	struct page **map;
+	spinlock_t lock;
+};
+
+
+enum CBT_FLAGS
+{
+	CBT_ERROR = 0,
+	CBT_DEAD  = 1,
+	CBT_NOCACHE  = 2,
+};
+static void cbt_release_callback(struct rcu_head *head);
+static void cbt_flush_cache(struct cbt_info *cbt);
+
+static inline void spin_lock_page(struct page *page)
+{
+	while(!trylock_page(page))
+		cpu_relax();
+}
+
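+/*
+ * Set or clear @len bits in bitmap @bm starting at bit @cur, writing
+ * whole 32-bit words on the aligned middle part and falling back to
+ * set_bit()/clear_bit() at the edges.
+ */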
+static void set_bits(void *bm, int cur, int len, bool is_set)
+{
+	__u32 *addr;
+	__u32 pattern = is_set ? 0xffffffff : 0;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: set whole word at once */
+			addr = bm + (cur >> 3);
+
+			*addr = pattern;
+			cur += 32;
+			continue;
+		}
+		if (is_set)
+			set_bit(cur, bm);
+		else
+			clear_bit(cur, bm);
+		cur++;
+	}
+}
+
+/*
+ * Return values:
+ * 0 if OK,
+ * -EAGAIN if cbt was updated,
+ * -EBADF if cbt is dead,
+ * -ENOMEM if alloc_page failed.
+ */
+static int cbt_page_alloc(struct cbt_info  **cbt_pp, unsigned long idx,
+			  int in_rcu)
+{
+	struct cbt_info	 *cbt = *cbt_pp;
+	struct page *page;
+
+	/* Page not allocated yet. Synchronization required */
+	spin_lock_irq(&cbt->lock);
+	if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
+		cbt->count++;
+	} else {
+		struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+
+		spin_unlock_irq(&cbt->lock);
+		/* was cbt updated ? */
+		if (new != cbt) {
+			*cbt_pp = new;
+			return -EAGAIN;
+		} else {
+			return -EBADF;
+		}
+	}
+	spin_unlock_irq(&cbt->lock);
+	if (in_rcu)
+		rcu_read_unlock();
+	page = alloc_page(GFP_NOIO|__GFP_ZERO);
+	if (in_rcu)
+		rcu_read_lock();
+	spin_lock_irq(&cbt->lock);
+	if (unlikely(!cbt->count-- && test_bit(CBT_DEAD, &cbt->flags))) {
+		spin_unlock_irq(&cbt->lock);
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+		if (page)
+			__free_page(page);
+		return -EBADF;
+	}
+	if (unlikely(!page)) {
+		set_bit(CBT_ERROR, &cbt->flags);
+		spin_unlock_irq(&cbt->lock);
+		return -ENOMEM;
+	}
+
+	if (likely(CBT_PAGE(cbt, idx) == NULL))
+		cbt->map[idx] = page;
+	else
+		__free_page(page);
+
+	page = NULL;
+	spin_unlock_irq(&cbt->lock);
+
+	return 0;
+}
+
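+/*
+ * Set or clear @count blocks starting at @block in the CBT bitmap.
+ * Bitmap pages that are not allocated yet are either recorded via
+ * @pages_missed for the caller to allocate later, or allocated right
+ * here through cbt_page_alloc() when @pages_missed is NULL.
+ */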
+static int __blk_cbt_set(struct cbt_info  *cbt, blkcnt_t block,
+			 blkcnt_t count, bool in_rcu, bool set,
+			 unsigned long *pages_missed,
+			 unsigned long *idx_first)
+{
+	struct page *page;
+
+	if (unlikely(block + count > cbt->block_max)) {
+		printk("WARN: %s eof access block:%lld, len: %lld, max:%lld\n",
+		       __func__, (unsigned long long)block,
+		       (unsigned long long)count,
+		       (unsigned long long)cbt->block_max);
+		set_bit(CBT_ERROR, &cbt->flags);
+		return -EINVAL;
+	}
+
+	while(count) {
+		unsigned long idx = block >> (PAGE_SHIFT + 3);
+		unsigned long off = block & (BITS_PER_PAGE -1);
+		unsigned long len = min_t(unsigned long, BITS_PER_PAGE - off,
+					  count);
+		int ret;
+
+		page = CBT_PAGE(cbt, idx);
+		if (page) {
+			spin_lock_page(page);
+			set_bits(page_address(page), off, len, set);
+			unlock_page(page);
+			count -= len;
+			block += len;
+			continue;
+		} else if (pages_missed) {
+			(*pages_missed)++;
+			if (!*idx_first)
+				*idx_first = idx;
+			cbt->map[idx] = CBT_PAGE_MISSED;
+			count -= len;
+			block += len;
+			continue;
+		}  else {
+			if (!set) {
+				/* Nothing to do */
+				count -= len;
+				block += len;
+				continue;
+			}
+		}
+
+		ret = cbt_page_alloc(&cbt, idx, in_rcu);
+		if (ret == -EAGAIN) /* new cbt */
+			continue;
+		else if (ret == -EBADF) /* dead cbt */
+			break;
+		else if (ret)
+			return ret;
+	}
+	return (pages_missed && *pages_missed) ? -EAGAIN : 0;
+}
+
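+/*
+ * Account a written byte range in the CBT.  The range is rounded to
+ * block granularity and merged into the per-cpu extent cache when it
+ * is contiguous with the cached extent; otherwise the previously
+ * cached extent is flushed into the bitmap.
+ */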
+static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	struct cbt_extent old;
+	blkcnt_t end;
+	/* Check per-cpu cache */
+
+	rcu_read_lock();
+	cbt = rcu_dereference(q->cbt);
+	if (unlikely(!cbt))
+		goto out_rcu;
+
+	if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
+		goto out_rcu;
+	end = (start + len + (1 << cbt->block_bits) -1) >> cbt->block_bits;
+	start >>= cbt->block_bits;
+	len = end - start;
+	if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
+		__blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
+		goto out_rcu;
+	}
+	ex = this_cpu_ptr(cbt->cache);
+	if (ex->start + ex->len == start) {
+		ex->len += len;
+		goto out_rcu;
+	}
+	old = *ex;
+	ex->start = start;
+	ex->len = len;
+
+	if (likely(old.len))
+		__blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
+out_rcu:
+	rcu_read_unlock();
+}
+
+inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_size)
+		return;
+
+	blk_cbt_add(q, bio->bi_sector << 9, bio->bi_size);
+}
+
+static struct cbt_info* do_cbt_alloc(struct request_queue *q, __u8 *uuid,
+				     loff_t size, loff_t blocksize)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	int i;
+
+
+	cbt = kzalloc(sizeof(*cbt), GFP_KERNEL);
+	if (!cbt)
+		return ERR_PTR(-ENOMEM);
+
+	cbt->block_bits = ilog2(blocksize);
+	cbt->block_max  = (size + blocksize - 1) >> cbt->block_bits;
+	spin_lock_init(&cbt->lock);
+	memcpy(cbt->uuid, uuid, sizeof(cbt->uuid));
+	cbt->cache = alloc_percpu(struct cbt_extent);
+	if (!cbt->cache)
+		goto err_cbt;
+
+	for_each_possible_cpu(i) {
+		ex = per_cpu_ptr(cbt->cache, i);
+		memset(ex, 0, sizeof (*ex));
+	}
+
+	cbt->map = vmalloc(NR_PAGES(cbt->block_max) * sizeof(void*));
+	if (!cbt->map)
+		goto err_pcpu;
+
+	memset(cbt->map, 0, NR_PAGES(cbt->block_max) * sizeof(void*));
+	cbt->queue = q;
+	return cbt;
+err_pcpu:
+	free_percpu(cbt->cache);
+err_cbt:
+	kfree(cbt);
+	return ERR_PTR(-ENOMEM);
+}
+
+int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
+			  struct page ***map_ptr, blkcnt_t *block_max,
+			  blkcnt_t *block_bits)
+{
+	struct cbt_info *cbt;
+	struct page **map;
+	unsigned long npages;
+	unsigned long i;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -ENOENT;
+	}
+
+	BUG_ON(!cbt->map);
+	BUG_ON(!cbt->block_max);
+
+	if (!uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid))) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+
+	cbt_flush_cache(cbt);
+
+	npages = NR_PAGES(cbt->block_max);
+	map = vmalloc(npages * sizeof(void*));
+	if (!map)
+		goto fail;
+
+	memset(map, 0, npages * sizeof(void*));
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = cbt->map[i];
+
+		BUG_ON(page == CBT_PAGE_MISSED);
+
+		if (page) {
+			map[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+			if (!map[i])
+				goto fail_pages;
+
+			spin_lock_page(page);
+			memcpy(page_address(map[i]), page_address(page),
+			       PAGE_SIZE);
+			memset(page_address(page), 0, PAGE_SIZE);
+			unlock_page(page);
+		}
+	}
+	mutex_unlock(&cbt_mutex);
+
+	*map_ptr = map;
+	*block_max = cbt->block_max;
+	*block_bits = cbt->block_bits;
+	return 0;
+
+fail_pages:
+	while (i-- > 0) {
+		if (map[i])
+			__free_page(map[i]);
+	}
+fail:
+	vfree(map);
+	mutex_unlock(&cbt_mutex);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(blk_cbt_map_copy_once);
+
+static void blk_cbt_page_merge(struct page *pg_from, struct page *pg_to)
+{
+	u32 *from = page_address(pg_from);
+	u32 *to = page_address(pg_to);
+	u32 *fin = to + PAGE_SIZE/sizeof(*to);
+
+	while (to < fin) {
+		*to |= *from;
+		to++;
+		from++;
+	}
+}
+
+int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
+		      struct page **map, blkcnt_t block_max,
+		      blkcnt_t block_bits)
+{
+	struct cbt_info *cbt;
+	unsigned long i;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -ENOENT;
+	}
+
+	BUG_ON(!cbt->map);
+	BUG_ON(!cbt->block_max);
+
+	if (!map || !uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid)) ||
+	    block_max != cbt->block_max || block_bits != cbt->block_bits) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < NR_PAGES(cbt->block_max); i++) {
+		struct page *page_main = cbt->map[i];
+		struct page *page_addon = map[i];
+
+		BUG_ON(page_main == CBT_PAGE_MISSED);
+		BUG_ON(page_addon == CBT_PAGE_MISSED);
+
+		if (!page_addon)
+			continue;
+
+		if (!page_main) {
+			int ret = cbt_page_alloc(&cbt, i, 0);
+			if (ret) {
+				mutex_unlock(&cbt_mutex);
+				return ret;
+			}
+			page_main = cbt->map[i];
+			BUG_ON(page_main == NULL);
+			BUG_ON(page_main == CBT_PAGE_MISSED);
+		}
+
+		spin_lock_page(page_main);
+		blk_cbt_page_merge(page_addon, page_main);
+		unlock_page(page_main);
+	}
+	mutex_unlock(&cbt_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(blk_cbt_map_merge);
+
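+/*
+ * Grow the CBT when the block device size increases: allocate a new
+ * map sized for the new capacity, take over the already populated
+ * pages of the old map and publish the new cbt via RCU.
+ */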
+void blk_cbt_update_size(struct block_device *bdev)
+{
+	struct request_queue *q;
+	struct cbt_info *new, *cbt;
+	unsigned long to_cpy, idx;
+	unsigned bsz;
+	loff_t new_sz = i_size_read(bdev->bd_inode);
+	int in_use = 0;
+
+	if (!bdev->bd_disk || !bdev_get_queue(bdev))
+		return;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return;
+	}
+	bsz = 1 << cbt->block_bits;
+	if ((new_sz + bsz - 1) >> cbt->block_bits <= cbt->block_max)
+		goto err_mtx;
+
+	new = do_cbt_alloc(q, cbt->uuid, new_sz, bsz);
+	if (IS_ERR(new)) {
+		set_bit(CBT_ERROR, &cbt->flags);
+		goto err_mtx;
+	}
+	to_cpy = NR_PAGES(cbt->block_max);
+	set_bit(CBT_NOCACHE, &cbt->flags);
+	cbt_flush_cache(cbt);
+	spin_lock_irq(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	for (idx = 0; idx < to_cpy; idx++){
+		new->map[idx] = cbt->map[idx];
+		if (CBT_PAGE(new, idx))
+			get_page(CBT_PAGE(new, idx));
+	}
+	rcu_assign_pointer(q->cbt, new);
+	in_use = cbt->count;
+	spin_unlock_irq(&cbt->lock);
+	if (!in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+
+
+}
+
+static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct cbt_info *cbt;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+
+	if (((ci.ci_blksize -1) & ci.ci_blksize))
+		return -EINVAL;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	if (q->cbt) {
+		ret = -EBUSY;
+		goto err_mtx;
+	}
+	cbt = do_cbt_alloc(q, ci.ci_uuid, i_size_read(bdev->bd_inode), ci.ci_blksize);
+	if (IS_ERR(cbt))
+		ret = PTR_ERR(cbt);
+	else
+		rcu_assign_pointer(q->cbt, cbt);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+	return ret;
+}
+
+static void cbt_release_callback(struct rcu_head *head)
+{
+	struct cbt_info *cbt;
+	int nr_pages, i;
+
+	cbt = container_of(head, struct cbt_info, rcu);
+	nr_pages = NR_PAGES(cbt->block_max);
+	for (i = 0; i < nr_pages; i++)
+		if (CBT_PAGE(cbt, i))
+			__free_page(CBT_PAGE(cbt, i));
+
+	vfree(cbt->map);
+	free_percpu(cbt->cache);
+	kfree(cbt);
+}
+
+void blk_cbt_release(struct request_queue *q)
+{
+	struct cbt_info *cbt;
+	int in_use = 0;
+
+	cbt = q->cbt;
+	if (!cbt)
+		return;
+	spin_lock(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	rcu_assign_pointer(q->cbt, NULL);
+	in_use = cbt->count;
+	spin_unlock(&cbt->lock);
+	if (in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev)
+{
+	struct request_queue *q;
+
+	mutex_lock(&cbt_mutex);
+	q = bdev_get_queue(bdev);
+	if(!q->cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	blk_cbt_release(q);
+	mutex_unlock(&cbt_mutex);
+	return 0;
+}
+
+struct flush_ctx {
+	struct cbt_info *cbt;
+	unsigned long pages_missed;
+	unsigned long idx_first;
+};
+
+static inline void __cbt_flush_cpu_cache(void *ptr)
+{
+	struct flush_ctx *ctx = (struct flush_ctx *)ptr;
+	struct cbt_info *cbt = ctx->cbt;
+	struct cbt_extent *ex = this_cpu_ptr(cbt->cache);
+
+	if (ex->len) {
+		int ret = __blk_cbt_set(cbt, ex->start, ex->len, 0, 1,
+					&ctx->pages_missed,
+					&ctx->idx_first);
+		if (!ret) {
+			ex->start += ex->len;
+			ex->len = 0;
+		}
+	}
+}
+
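+/*
+ * Flush every CPU's cached extent into the bitmap.  Pages found
+ * missing during the per-cpu flush are allocated afterwards and the
+ * flush is retried until nothing is left pending.
+ */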
+static void cbt_flush_cache(struct cbt_info *cbt)
+{
+	for (;;) {
+		struct flush_ctx ctx;
+		unsigned long i;
+try_again:
+		ctx.cbt = cbt;
+		ctx.pages_missed = 0;
+		ctx.idx_first = 0;
+
+		on_each_cpu(__cbt_flush_cpu_cache, &ctx, 1);
+
+		if (likely(!ctx.pages_missed))
+			return;
+
+		for (i = ctx.idx_first; i < NR_PAGES(cbt->block_max); i++) {
+			int ret;
+
+			if (cbt->map[i] != CBT_PAGE_MISSED)
+				continue;
+
+			ret = cbt_page_alloc(&cbt, i, 0);
+			if (ret == -EAGAIN) /* new cbt */
+				goto try_again;
+			else if (ret) /* dead cbt or alloc_page failed */
+				return;
+
+			/* cbt_page_alloc succeeded ... */
+			if (!--ctx.pages_missed)
+				break;
+		}
+	}
+}
+
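+/*
+ * Find the next run of set bits at or after @block and return it in
+ * @ex.  If no bit is set up to cbt->block_max, @ex->start is left at
+ * cbt->block_max and @ex->len at 0.
+ */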
+static void cbt_find_next_extent(struct cbt_info *cbt, blkcnt_t block, struct cbt_extent *ex)
+{
+	unsigned long off, off2, idx;
+	struct page *page;
+	bool found = 0;
+
+	ex->start = cbt->block_max;
+	ex->len = 0;
+
+	idx = block >> (PAGE_SHIFT + 3);
+	while (block < cbt->block_max) {
+		off = block & (BITS_PER_PAGE -1);
+		page = CBT_PAGE(cbt, idx);
+		if (!page) {
+			if (found)
+				break;
+			goto next;
+		}
+		spin_lock_page(page);
+		/* Find extent start */
+		if (!found) {
+			ex->start = find_next_bit(page_address(page), BITS_PER_PAGE, off);
+			if (ex->start != BITS_PER_PAGE) {
+				off = ex->start;
+				ex->start += idx << (PAGE_SHIFT + 3);
+				found = 1;
+			} else {
+				unlock_page(page);
+				goto next;
+			}
+		}
+		if (found) {
+			off2 = find_next_zero_bit(page_address(page), BITS_PER_PAGE, off);
+			ex->len += off2 - off;
+			if (off2 != BITS_PER_PAGE) {
+				unlock_page(page);
+				break;
+			}
+		}
+		unlock_page(page);
+	next:
+		idx++;
+		block = idx << (PAGE_SHIFT + 3);
+	}
+}
+
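+/*
+ * BLKCBTGET: report up to ci_extent_count dirty extents, starting at
+ * ci_start, into the extent array that follows struct blk_user_cbt_info
+ * in the user buffer.  With CI_FLAG_ONCE every reported extent is also
+ * cleared from the bitmap.
+ */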
+static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user *cur_u_ex;
+	struct blk_user_cbt_extent        *cur_ex, *cur_ex_base;
+	struct cbt_info *cbt;
+	struct cbt_extent ex;
+	blkcnt_t block, end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_flags & ~CI_FLAG_ONCE)
+		return -EINVAL;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+
+	if (ci.ci_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, cur_u_ex,
+		       ci.ci_extent_count * sizeof(struct blk_user_cbt_extent))){
+		return -EFAULT;
+	}
+
+	cur_ex_base = cur_ex = kzalloc(ci.ci_extent_count * sizeof(*cur_ex),
+				       GFP_KERNEL);
+	if (!cur_ex_base)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt ||
+	    (ci.ci_start >> cbt->block_bits) > cbt->block_max)
+		goto ioc_get_failed;
+
+	ret = -EIO;
+	if (test_bit(CBT_ERROR, &cbt->flags))
+		goto ioc_get_failed;
+	cbt_flush_cache(cbt);
+
+	memcpy(&ci.ci_uuid, cbt->uuid, sizeof(cbt->uuid));
+	ci.ci_blksize = 1UL << cbt->block_bits;
+	block = ci.ci_start >> cbt->block_bits;
+	end = (ci.ci_start + ci.ci_length) >> cbt->block_bits;
+	if (end > cbt->block_max)
+		end = cbt->block_max;
+
+	ci.ci_mapped_extents = 0;
+	while (ci.ci_mapped_extents < ci.ci_extent_count) {
+		cbt_find_next_extent(cbt, block, &ex);
+		if (!ex.len || ex.start > end)
+			break;
+		cur_ex->ce_physical = ex.start << cbt->block_bits;
+		cur_ex->ce_length = ex.len << cbt->block_bits;
+
+		if (ci.ci_flags & CI_FLAG_ONCE)
+			__blk_cbt_set(cbt, ex.start, ex.len, 0, 0, NULL, NULL);
+		cur_ex++;
+		ci.ci_mapped_extents++;
+		block = ex.start + ex.len;
+	}
+	mutex_unlock(&cbt_mutex);
+
+	ret = 0;
+	if (ci.ci_mapped_extents &&
+	    copy_to_user(cur_u_ex, cur_ex_base,
+			 sizeof(*cur_ex_base) * ci.ci_mapped_extents))
+		ret = -EFAULT;
+	if (!ret && copy_to_user(ucbt_ioc, &ci, sizeof(ci)))
+		ret = -EFAULT;
+
+	kfree(cur_ex_base);
+	return ret;
+
+ioc_get_failed:
+	mutex_unlock(&cbt_mutex);
+	kfree(cur_ex_base);
+	return ret;
+}
+
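+/*
+ * BLKCBTSET / BLKCBTCLR: mark or clear the user-supplied extents in the
+ * bitmap.  The caller must pass the current CBT uuid, unless
+ * CI_FLAG_NEW_UUID is set, in which case the uuid is replaced.
+ */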
+static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc, bool set)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct cbt_info *cbt;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user *cur_u_ex;
+	struct blk_user_cbt_extent *cur_ex, *cur_ex_base, *end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+	if (ci.ci_extent_count < ci.ci_mapped_extents)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+	if (!access_ok(VERIFY_READ, cur_u_ex,
+		       ci.ci_mapped_extents * sizeof(struct blk_user_cbt_extent)))
+		return -EFAULT;
+
+	cur_ex_base = cur_ex = kzalloc(ci.ci_mapped_extents * sizeof(*cur_ex),
+				       GFP_KERNEL);
+	if (!cur_ex_base)
+		return -ENOMEM;
+	end = cur_ex_base + ci.ci_mapped_extents;
+
+	if (copy_from_user(cur_ex_base, cur_u_ex,
+			   sizeof(*cur_ex_base) * ci.ci_mapped_extents)) {
+		kfree(cur_ex_base);
+		return -EFAULT;
+	}
+
+	ret = -EINVAL;
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt)
+		goto ioc_set_failed;
+
+	if (ci.ci_flags & CI_FLAG_NEW_UUID)
+		memcpy(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid));
+	else if (memcmp(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid)))
+		goto ioc_set_failed;
+
+	ret = -EIO;
+	if (test_bit(CBT_ERROR, &cbt->flags))
+		goto ioc_set_failed;
+
+	/* Do not care about pcpu caches on set, only in case of clear */
+	if (!set)
+		cbt_flush_cache(cbt);
+
+	ret = 0;
+	while (cur_ex < end) {
+		struct cbt_extent ex;
+
+		ex.start = cur_ex->ce_physical >> cbt->block_bits;
+		ex.len = (cur_ex->ce_length + (1 << cbt->block_bits) - 1) >> cbt->block_bits;
+		if (ex.start > cbt->block_max ||
+		    ex.start + ex.len > cbt->block_max ||
+		    ex.len == 0) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = __blk_cbt_set(cbt, ex.start, ex.len, 0, set, NULL, NULL);
+		if (ret)
+			break;
+		cur_ex++;
+	}
+	mutex_unlock(&cbt_mutex);
+	kfree(cur_ex_base);
+	return ret;
+
+ioc_set_failed:
+	mutex_unlock(&cbt_mutex);
+	kfree(cur_ex_base);
+	return ret;
+}
+
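+/*
+ * ioctl entry point, called from blkdev_ioctl() for the BLKCBT* commands.
+ * Everything except BLKCBTGET requires CAP_SYS_ADMIN.
+ *
+ * Illustrative, untested userspace sketch of reading dirty extents with
+ * BLKCBTGET (dev_size and process() are placeholders, and the batch of
+ * 64 extents is assumed to be within CBT_MAX_EXTENTS).  The extent array
+ * lives directly after the info structure, which is exactly how
+ * cbt_ioc_get() addresses it:
+ *
+ *	size_t sz = sizeof(struct blk_user_cbt_info) +
+ *		    64 * sizeof(struct blk_user_cbt_extent);
+ *	struct blk_user_cbt_info *ci = calloc(1, sz);
+ *
+ *	ci->ci_start = 0;
+ *	ci->ci_length = dev_size;
+ *	ci->ci_extent_count = 64;
+ *	ci->ci_flags = CI_FLAG_ONCE;
+ *	if (ioctl(fd, BLKCBTGET, ci) == 0)
+ *		process((struct blk_user_cbt_extent *)(ci + 1),
+ *			ci->ci_mapped_extents);
+ */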
+int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	struct blk_user_cbt_info __user *ucbt_ioc = (struct blk_user_cbt_info __user *) arg;
+
+	switch (cmd) {
+	case BLKCBTSTART:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return cbt_ioc_init(bdev, ucbt_ioc);
+	case BLKCBTSTOP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_stop(bdev);
+	case BLKCBTGET:
+		return cbt_ioc_get(bdev, ucbt_ioc);
+	case BLKCBTSET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 1);
+	case BLKCBTCLR:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 0);
+	default:
+		BUG();
+	}
+	return -ENOTTY;
+}
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1958,6 +1958,7 @@ generic_make_request_checks(struct bio *bio)
 		return false;	/* throttled, will be resubmitted later */
 
 	trace_block_bio_queue(q, bio);
+	blk_cbt_bio_queue(q, bio);
 	return true;
 
 end_io:
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -8,6 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
 #include <linux/slab.h>
+#include <bc/beancounter.h>
 
 #include "blk.h"
 
@@ -117,6 +118,9 @@ static void ioc_release_fn(struct work_struct *work)
 
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
+#ifdef CONFIG_BEANCOUNTERS
+	put_beancounter(ioc->ioc_ub);
+#endif
 	kmem_cache_free(iocontext_cachep, ioc);
 }
 
@@ -150,8 +154,12 @@ void put_io_context(struct io_context *ioc)
 		spin_unlock_irqrestore(&ioc->lock, flags);
 	}
 
-	if (free_ioc)
+	if (free_ioc) {
+#ifdef CONFIG_BEANCOUNTERS
+		put_beancounter(ioc->ioc_ub);
+#endif
 		kmem_cache_free(iocontext_cachep, ioc);
+	}
 }
 EXPORT_SYMBOL(put_io_context);
 
@@ -195,6 +203,7 @@ void put_io_context_active(struct io_context *ioc)
 
 	put_io_context(ioc);
 }
+EXPORT_SYMBOL(put_io_context_active);
 
 /* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
@@ -209,6 +218,7 @@ void exit_io_context(struct task_struct *task)
 	atomic_dec(&ioc->nr_tasks);
 	put_io_context_active(ioc);
 }
+EXPORT_SYMBOL(exit_io_context);
 
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
@@ -249,6 +259,9 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->icq_list);
 	INIT_WORK(&ioc->release_work, ioc_release_fn);
+#ifdef CONFIG_BEANCOUNTERS
+	ioc->ioc_ub = get_beancounter(get_exec_ub());
+#endif
 
 	/*
 	 * Try to install.  ioc shouldn't be installed if someone else
@@ -261,8 +274,12 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 	if (!task->io_context &&
 	    (task == current || !(task->flags & PF_EXITING)))
 		task->io_context = ioc;
-	else
+	else {
+#ifdef CONFIG_BEANCOUNTERS
+		put_beancounter(ioc->ioc_ub);
+#endif
 		kmem_cache_free(iocontext_cachep, ioc);
+	}
 
 	ret = task->io_context ? 0 : -EBUSY;
 
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -560,6 +560,7 @@ static void blk_release_queue(struct kobject *kobj)
 		blk_mq_release(q);
 
 	blk_trace_shutdown(q);
+	blk_cbt_release(q);
 
 	bdi_destroy(&q->backing_dev_info);
 
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,9 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <linux/nmi.h>
+#include <bc/io_acct.h>
+
 #include "blk.h"
 #include "blk-cgroup.h"
 
@@ -858,8 +861,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-				       struct cfq_io_cq *cic, struct bio *bio,
-				       gfp_t gfp_mask);
+				       struct cfq_io_cq *cic, struct bio *bio);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -1750,6 +1752,18 @@ static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	return __cfq_set_weight(cgrp, cft, val, true);
 }
 
+#ifdef CONFIG_BC_IO_PRIORITY
+unsigned int blkcg_get_weight(struct cgroup *cgrp)
+{
+	return cgroup_to_blkcg(cgrp)->cfq_weight;
+}
+
+int blkcg_set_weight(struct cgroup *cgrp, unsigned int weight)
+{
+	return cfq_set_weight(cgrp, NULL, weight);
+}
+#endif
+
 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			   struct seq_file *sf)
 {
@@ -1806,6 +1820,56 @@ static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+#ifdef CONFIG_BC_IO_PRIORITY
+static u64 cfqg_prfill_ub_iostat(struct seq_file *sf,
+				 struct blkg_policy_data *pd, int unused)
+{
+	struct user_beancounter *ub = sf->private;
+	struct blkg_rwstat queued, serviced, wait_time;
+	u64 sectors, time;
+	const char *dev_name;
+
+	if (pd->blkg->q->kobj.parent)
+		dev_name = kobject_name(pd->blkg->q->kobj.parent);
+	else
+		dev_name = "none";
+
+	queued = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.queued));
+	serviced = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.serviced));
+	wait_time = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.wait_time));
+	sectors = cfqg_stat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.sectors));
+	time = cfqg_stat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.time));
+
+	seq_printf(sf, "%s %s . %llu 0 0 %llu %llu %llu %llu %llu %llu\n",
+		   dev_name, ub->ub_name,
+		   (unsigned long long)(queued.cnt[BLKG_RWSTAT_READ] +
+					queued.cnt[BLKG_RWSTAT_WRITE]),
+		   (unsigned long long)div_u64(wait_time.cnt[BLKG_RWSTAT_READ] +
+					       wait_time.cnt[BLKG_RWSTAT_WRITE],
+					       NSEC_PER_MSEC),
+		   (unsigned long long)time,
+		   (unsigned long long)(serviced.cnt[BLKG_RWSTAT_READ] +
+					serviced.cnt[BLKG_RWSTAT_WRITE]),
+		   (unsigned long long)sectors,
+		   (unsigned long long)serviced.cnt[BLKG_RWSTAT_READ],
+		   (unsigned long long)serviced.cnt[BLKG_RWSTAT_WRITE]);
+	return 0;
+}
+
+void blkcg_show_ub_iostat(struct cgroup *cgrp, struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_ub_iostat,
+			  &blkcg_policy_cfq, 0, false);
+}
+#endif
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
@@ -2357,6 +2421,11 @@ static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
 	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
+#ifdef CONFIG_BC_IO_PRIORITY
+	if (get_exec_ub()->ub_bound_css[UB_BLKIO_CGROUP] !=
+	    &(RQ_CFQG(req))->pd.blkg->blkcg->css)
+		ub_writeback_io(0, bio_sectors(bio));
+#endif
 }
 
 static void
@@ -3171,6 +3240,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
 		__cfq_set_active_queue(cfqd, cfqq);
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+		touch_nmi_watchdog();
 	}
 
 	BUG_ON(cfqd->busy_queues);
@@ -3502,8 +3572,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
-					 GFP_ATOMIC);
+		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -3574,13 +3643,12 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) {
 
 static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-		     struct bio *bio, gfp_t gfp_mask)
+		     struct bio *bio)
 {
 	struct blkcg *blkcg;
-	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	struct cfq_queue *cfqq;
 	struct cfq_group *cfqg;
 
-retry:
 	rcu_read_lock();
 
 	blkcg = bio_blkcg(bio);
@@ -3597,27 +3665,9 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	 * originally, since it should just be a temporary situation.
 	 */
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = NULL;
-		if (new_cfqq) {
-			cfqq = new_cfqq;
-			new_cfqq = NULL;
-		} else if (gfp_mask & __GFP_WAIT) {
-			rcu_read_unlock();
-			spin_unlock_irq(cfqd->queue->queue_lock);
-			new_cfqq = kmem_cache_alloc_node(cfq_pool,
-					gfp_mask | __GFP_ZERO,
-					cfqd->queue->node);
-			spin_lock_irq(cfqd->queue->queue_lock);
-			if (new_cfqq)
-				goto retry;
-			else
-				return &cfqd->oom_cfqq;
-		} else {
-			cfqq = kmem_cache_alloc_node(cfq_pool,
-					gfp_mask | __GFP_ZERO,
-					cfqd->queue->node);
-		}
-
+		cfqq = kmem_cache_alloc_node(cfq_pool,
+					     GFP_ATOMIC | __GFP_ZERO,
+					     cfqd->queue->node);
 		if (cfqq) {
 			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
 			cfq_init_prio_data(cfqq, cic);
@@ -3627,9 +3677,6 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 			cfqq = &cfqd->oom_cfqq;
 	}
 out:
-	if (new_cfqq)
-		kmem_cache_free(cfq_pool, new_cfqq);
-
 	rcu_read_unlock();
 	return cfqq;
 }
@@ -3654,29 +3701,30 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-	      struct bio *bio, gfp_t gfp_mask)
+	      struct bio *bio)
 {
 	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
 	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
-	struct cfq_queue **async_cfqq = NULL;
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_queue **async_cfqq;
+	struct cfq_queue *cfqq;
 
 	if (!is_sync) {
 		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
 		cfqq = *async_cfqq;
+		if (cfqq)
+			goto out;
 	}
 
-	if (!cfqq)
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+	cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
-	if (!is_sync && !(*async_cfqq)) {
+	if (!is_sync && cfqq != &cfqd->oom_cfqq) {
 		cfqq->ref++;
 		*async_cfqq = cfqq;
 	}
-
+out:
 	cfqq->ref++;
 	return cfqq;
 }
@@ -3942,6 +3990,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	cfq_add_rq_rb(rq);
 	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
 				 rq->cmd_flags);
+
+#ifdef CONFIG_BC_IO_PRIORITY
+	if (get_exec_ub()->ub_bound_css[UB_BLKIO_CGROUP] !=
+	    &(RQ_CFQG(rq))->pd.blkg->blkcg->css)
+		ub_writeback_io(1, blk_rq_sectors(rq));
+#endif
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, q);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -4211,8 +4266,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
 
-	might_sleep_if(gfp_mask & __GFP_WAIT);
-
 	spin_lock_irq(q->queue_lock);
 
 	check_ioprio_changed(cic, bio);
@@ -4220,7 +4273,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+		if (cfqq)
+			cfq_put_queue(cfqq);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/rbtree.h>
+#include <bc/io_acct.h>
 
 /*
  * See Documentation/block/deadline-iosched.txt
@@ -108,6 +109,8 @@ deadline_add_request(struct request_queue *q, struct request *rq)
 	 */
 	rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
 	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+	ub_writeback_io(1, blk_rq_sectors(rq));
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, q);
 }
 
 /*
@@ -186,6 +189,12 @@ deadline_merged_requests(struct request_queue *q, struct request *req,
 	deadline_remove_request(q, next);
 }
 
+static void deadline_bio_merged(struct request_queue *q, struct request *req,
+				struct bio *bio)
+{
+	ub_writeback_io(0, bio_sectors(bio));
+}
+
 /*
  * move request from sort list to dispatch queue.
  */
@@ -445,6 +454,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_merge_fn = 		deadline_merge,
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
+		.elevator_bio_merged_fn =	deadline_bio_merged,
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
 		.elevator_add_req_fn =		deadline_add_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
 #include <linux/log2.h>
 #include <linux/pm_runtime.h>
 #include <linux/badblocks.h>
+#include <linux/device_cgroup.h>
 
 #include "blk.h"
 
@@ -35,7 +36,7 @@ struct kobject *block_depr;
 static DEFINE_SPINLOCK(ext_devt_lock);
 static DEFINE_IDR(ext_devt_idr);
 
-static struct device_type disk_type;
+struct device_type disk_type;
 
 static void disk_check_events(struct disk_events *ev,
 			      unsigned int *clearing_ptr);
@@ -260,8 +261,12 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 
 	if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&block_class_lock);
-		for (dp = major_names[offset]; dp; dp = dp->next)
+		for (dp = major_names[offset]; dp; dp = dp->next) {
+			if (!devcgroup_device_visible(S_IFBLK, dp->major,
+						0, INT_MAX))
+				continue;
 			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
+		}
 		mutex_unlock(&block_class_lock);
 	}
 }
@@ -884,11 +889,15 @@ static int show_partition(struct seq_file *seqf, void *v)
 
 	/* show the full disk and all non-0 size partitions of it */
 	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
-	while ((part = disk_part_iter_next(&piter)))
-		seq_printf(seqf, "%4d  %7d %10llu %s\n",
-			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
+	while ((part = disk_part_iter_next(&piter))) {
+		unsigned int major = MAJOR(part_devt(part));
+		unsigned int minor = MINOR(part_devt(part));
+
+		if (devcgroup_device_visible(S_IFBLK, major, minor, 1))
+			seq_printf(seqf, "%4d  %7d %10llu %s\n", major, minor,
 			   (unsigned long long)part_nr_sects_read(part) >> 1,
 			   disk_name(sgp, part->partno, buf));
+	}
 	disk_part_iter_exit(&piter);
 
 	return 0;
@@ -1141,6 +1150,7 @@ static void disk_release(struct device *dev)
 struct class block_class = {
 	.name		= "block",
 };
+EXPORT_SYMBOL(block_class);
 
 static char *block_devnode(struct device *dev, umode_t *mode,
 			   kuid_t *uid, kgid_t *gid)
@@ -1152,12 +1162,13 @@ static char *block_devnode(struct device *dev, umode_t *mode,
 	return NULL;
 }
 
-static struct device_type disk_type = {
+struct device_type disk_type = {
 	.name		= "disk",
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
 	.devnode	= block_devnode,
 };
+EXPORT_SYMBOL(disk_type);
 
 #ifdef CONFIG_PROC_FS
 /*
@@ -1232,7 +1243,7 @@ static const struct file_operations proc_diskstats_operations = {
 static int __init proc_genhd_init(void)
 {
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
+	proc_create("partitions", S_ISVTX, NULL, &proc_partitions_operations);
 	return 0;
 }
 module_init(proc_genhd_init);
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -140,12 +140,27 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			}
 			disk_part_iter_exit(&piter);
 			part_nr_sects_write(part, (sector_t)length);
-			i_size_write(bdevp->bd_inode, p.length);
+			bd_write_size(bdevp, p.length);
 			mutex_unlock(&bdevp->bd_mutex);
 			mutex_unlock(&bdev->bd_mutex);
 			bdput(bdevp);
 			disk_put_part(part);
 			return 0;
+		case BLKPG_GET_PARTITION:
+			mutex_lock(&bdev->bd_mutex);
+			part = disk_get_part(disk, partno);
+			if (!part) {
+				mutex_unlock(&bdev->bd_mutex);
+				return -ENXIO;
+			}
+			p.start = part->start_sect << 9;
+			p.length = part->nr_sects << 9;
+			disk_put_part(part);
+			mutex_unlock(&bdev->bd_mutex);
+			if (copy_to_user(a.data, &p, sizeof(struct blkpg_partition)))
+				return -EFAULT;
+			return 0;
 		default:
 			return -EINVAL;
 	}
@@ -580,6 +595,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return blkdev_pr_preempt(bdev, argp, true);
 	case IOC_PR_CLEAR:
 		return blkdev_pr_clear(bdev, argp);
+	case BLKCBTSTART:
+	case BLKCBTSTOP:
+	case BLKCBTGET:
+	case BLKCBTSET:
+	case BLKCBTCLR:
+		return blk_cbt_ioctl(bdev, cmd, (char __user *)arg);
 	default:
 		return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 	}
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -43,6 +43,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 	return buf;
 }
+EXPORT_SYMBOL(disk_name);
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -379,6 +379,7 @@ static int crypto_init_ablkcipher_ops(struct crypto_tfm *tfm, u32 type,
 	}
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
@@ -460,6 +461,7 @@ static int crypto_init_givcipher_ops(struct crypto_tfm *tfm, u32 type,
 	crt->givdecrypt = alg->givdecrypt ?: no_givdecrypt;
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -130,13 +130,16 @@ EXPORT_SYMBOL_GPL(af_alg_release);
 void af_alg_release_parent(struct sock *sk)
 {
 	struct alg_sock *ask = alg_sk(sk);
-	bool last;
+	unsigned int nokey = ask->nokey_refcnt;
+	bool last = nokey && !ask->refcnt;
 
 	sk = ask->parent;
 	ask = alg_sk(sk);
 
 	lock_sock(sk);
-	last = !--ask->refcnt;
+	ask->nokey_refcnt -= nokey;
+	if (!last)
+		last = !--ask->refcnt;
 	release_sock(sk);
 
 	if (last)
@@ -179,7 +182,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	err = -EBUSY;
 	lock_sock(sk);
-	if (ask->refcnt)
+	if (ask->refcnt | ask->nokey_refcnt)
 		goto unlock;
 
 	swap(ask->type, type);
@@ -259,8 +262,8 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	struct alg_sock *ask = alg_sk(sk);
 	const struct af_alg_type *type;
 	struct sock *sk2;
+	unsigned int nokey;
 	int err;
-	bool nokey;
 
 	lock_sock(sk);
 	type = ask->type;
@@ -291,8 +294,10 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 
 	if (nokey || !ask->refcnt++)
 		sock_hold(sk);
+	ask->nokey_refcnt += nokey;
 	alg_sk(sk2)->parent = sk;
 	alg_sk(sk2)->type = type;
+	alg_sk(sk2)->nokey_refcnt = nokey;
 
 	newsock->ops = type->ops;
 	newsock->state = SS_CONNECTED;
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -68,8 +68,9 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk)
 	struct scatterlist *sg;
 
 	sg = walk->sg;
-	walk->pg = sg_page(sg);
 	walk->offset = sg->offset;
+	walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT);
+	walk->offset = offset_in_page(walk->offset);
 	walk->entrylen = sg->length;
 
 	if (walk->entrylen > walk->total)
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -56,7 +56,8 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 
 	lock_sock(sk);
 	if (!ctx->more) {
-		err = crypto_ahash_init(&ctx->req);
+		err = af_alg_wait_for_completion(crypto_ahash_init(&ctx->req),
+						&ctx->completion);
 		if (err)
 			goto unlock;
 	}
@@ -136,6 +137,7 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page,
 	} else {
 		if (!ctx->more) {
 			err = crypto_ahash_init(&ctx->req);
+			err = af_alg_wait_for_completion(err, &ctx->completion);
 			if (err)
 				goto unlock;
 		}
@@ -253,22 +255,23 @@ static struct proto_ops algif_hash_ops = {
 
 static int hash_check_key(struct socket *sock)
 {
-	int err;
+	int err = 0;
 	struct sock *psk;
 	struct alg_sock *pask;
 	struct algif_hash_tfm *tfm;
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 
+	lock_sock(sk);
 	if (ask->refcnt)
-		return 0;
+		goto unlock_child;
 
 	psk = ask->parent;
 	pask = alg_sk(ask->parent);
 	tfm = pask->private;
 
 	err = -ENOKEY;
-	lock_sock(psk);
+	lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
 	if (!tfm->has_key)
 		goto unlock;
 
@@ -282,6 +285,8 @@ static int hash_check_key(struct socket *sock)
 
 unlock:
 	release_sock(psk);
+unlock_child:
+	release_sock(sk);
 
 	return err;
 }
@@ -395,7 +400,7 @@ static int hash_setkey(void *private, const u8 *key, unsigned int keylen)
 	return err;
 }
 
-static void hash_sock_destruct_common(struct sock *sk)
+static void hash_sock_destruct(struct sock *sk)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
@@ -403,33 +408,10 @@ static void hash_sock_destruct_common(struct sock *sk)
 	sock_kfree_s(sk, ctx->result,
 		     crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)));
 	sock_kfree_s(sk, ctx, ctx->len);
-}
-
-static void hash_sock_destruct(struct sock *sk)
-{
-	hash_sock_destruct_common(sk);
-	af_alg_release_parent(sk);
-}
-
-static void hash_release_parent_nokey(struct sock *sk)
-{
-	struct alg_sock *ask = alg_sk(sk);
-
-	if (!ask->refcnt) {
-		sock_put(ask->parent);
-		return;
-	}
-
 	af_alg_release_parent(sk);
 }
 
-static void hash_sock_destruct_nokey(struct sock *sk)
-{
-	hash_sock_destruct_common(sk);
-	hash_release_parent_nokey(sk);
-}
-
-static int hash_accept_parent_common(void *private, struct sock *sk)
+static int hash_accept_parent_nokey(void *private, struct sock *sk)
 {
 	struct hash_ctx *ctx;
 	struct alg_sock *ask = alg_sk(sk);
@@ -472,21 +454,7 @@ static int hash_accept_parent(void *private, struct sock *sk)
 	if (!tfm->has_key && crypto_ahash_has_setkey(tfm->hash))
 		return -ENOKEY;
 
-	return hash_accept_parent_common(private, sk);
-}
-
-static int hash_accept_parent_nokey(void *private, struct sock *sk)
-{
-	int err;
-
-	err = hash_accept_parent_common(private, sk);
-	if (err)
-		goto out;
-
-	sk->sk_destruct = hash_sock_destruct_nokey;
-
-out:
-	return err;
+	return hash_accept_parent_nokey(private, sk);
 }
 
 static const struct af_alg_type algif_type_hash = {
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -448,13 +448,6 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 		char __user *from = iov->iov_base;
 
 		while (seglen) {
-			sgl = list_first_entry(&ctx->tsgl,
-					       struct skcipher_sg_list, list);
-			sg = sgl->sg;
-
-			while (!sg->length)
-				sg++;
-
 			used = ctx->used;
 			if (!used) {
 				err = skcipher_wait_for_data(sk, flags);
@@ -476,6 +469,13 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 			if (!used)
 				goto free;
 
+			sgl = list_first_entry(&ctx->tsgl,
+					       struct skcipher_sg_list, list);
+			sg = sgl->sg;
+
+			while (!sg->length)
+				sg++;
+
 			ablkcipher_request_set_crypt(&ctx->req, sg,
 						     ctx->rsgl.sg, used,
 						     ctx->iv);
@@ -553,22 +553,23 @@ static struct proto_ops algif_skcipher_ops = {
 
 static int skcipher_check_key(struct socket *sock)
 {
-	int err;
+	int err = 0;
 	struct sock *psk;
 	struct alg_sock *pask;
 	struct ablkcipher_tfm_keycheck *tfm;
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 
+	lock_sock(sk);
 	if (ask->refcnt)
-		return 0;
+		goto unlock_child;
 
 	psk = ask->parent;
 	pask = alg_sk(ask->parent);
 	tfm = pask->private;
 
 	err = -ENOKEY;
-	lock_sock(psk);
+	lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
 	if (!tfm->has_key)
 		goto unlock;
 
@@ -582,6 +583,8 @@ static int skcipher_check_key(struct socket *sock)
 
 unlock:
 	release_sock(psk);
+unlock_child:
+	release_sock(sk);
 
 	return err;
 }
@@ -683,7 +686,7 @@ static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
 	return err;
 }
 
-static void skcipher_sock_destruct_common(struct sock *sk)
+static void skcipher_sock_destruct(struct sock *sk)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	struct skcipher_ctx *ctx = ask->private;
@@ -692,33 +695,10 @@ static void skcipher_sock_destruct_common(struct sock *sk)
 	skcipher_free_sgl(sk);
 	sock_kfree_s(sk, ctx->iv, crypto_ablkcipher_ivsize(tfm));
 	sock_kfree_s(sk, ctx, ctx->len);
-}
-
-static void skcipher_sock_destruct(struct sock *sk)
-{
-	skcipher_sock_destruct_common(sk);
 	af_alg_release_parent(sk);
 }
 
-static void skcipher_release_parent_nokey(struct sock *sk)
-{
-	struct alg_sock *ask = alg_sk(sk);
-
-	if (!ask->refcnt) {
-		sock_put(ask->parent);
-		return;
-	}
-
-	af_alg_release_parent(sk);
-}
-
-static void skcipher_sock_destruct_nokey(struct sock *sk)
-{
-	skcipher_sock_destruct_common(sk);
-	skcipher_release_parent_nokey(sk);
-}
-
-static int skcipher_accept_parent_common(void *private, struct sock *sk)
+static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
 {
 	struct skcipher_ctx *ctx;
 	struct alg_sock *ask = alg_sk(sk);
@@ -768,21 +748,7 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 	if (!tfm->has_key && calg->cra_u.ablkcipher.max_keysize)
 		return -ENOKEY;
 
-	return skcipher_accept_parent_common(private, sk);
-}
-
-static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
-{
-	int err;
-
-	err = skcipher_accept_parent_common(private, sk);
-	if (err)
-		goto out;
-
-	sk->sk_destruct = skcipher_sock_destruct_nokey;
-
-out:
-	return err;
+	return skcipher_accept_parent_nokey(private, sk);
 }
 
 static const struct af_alg_type algif_type_skcipher = {
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -238,6 +238,8 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc,
 		return blkcipher_walk_done(desc, walk, -EINVAL);
 	}
 
+	bsize = min(walk->blocksize, n);
+
 	walk->flags &= ~(BLKCIPHER_WALK_SLOW | BLKCIPHER_WALK_COPY |
 			 BLKCIPHER_WALK_DIFF);
 	if (!scatterwalk_aligned(&walk->in, alignmask) ||
@@ -250,7 +252,6 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc,
 		}
 	}
 
-	bsize = min(walk->blocksize, n);
 	n = scatterwalk_clamp(&walk->in, n);
 	n = scatterwalk_clamp(&walk->out, n);
 
@@ -458,6 +459,7 @@ static int crypto_init_blkcipher_ops_async(struct crypto_tfm *tfm)
 	}
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -585,9 +585,14 @@ static int cryptd_hash_export(struct ahash_request *req, void *out)
 
 static int cryptd_hash_import(struct ahash_request *req, const void *in)
 {
-	struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct shash_desc *desc = cryptd_shash_desc(req);
+
+	desc->tfm = ctx->child;
+	desc->flags = req->base.flags;
 
-	return crypto_shash_import(&rctx->desc, in);
+	return crypto_shash_import(desc, in);
 }
 
 static int cryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -519,6 +519,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (link->dump == NULL)
 			return -EINVAL;
 
+		down_read(&crypto_alg_sem);
 		list_for_each_entry(alg, &crypto_alg_list, cra_list)
 			dump_alloc += CRYPTO_REPORT_MAXSIZE;
 
@@ -528,8 +529,11 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				.done = link->done,
 				.min_dump_alloc = dump_alloc,
 			};
-			return netlink_dump_start(crypto_nlsk, skb, nlh, &c);
+			err = netlink_dump_start(crypto_nlsk, skb, nlh, &c);
 		}
+		up_read(&crypto_alg_sem);
+
+		return err;
 	}
 
 	err = nlmsg_parse(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX,
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -716,7 +716,9 @@ static struct crypto_instance *crypto_gcm_alloc_common(struct rtattr **tb,
 
 	ghash_alg = crypto_find_alg(ghash_name, &crypto_ahash_type,
 				    CRYPTO_ALG_TYPE_HASH,
-				    CRYPTO_ALG_TYPE_AHASH_MASK);
+				    CRYPTO_ALG_TYPE_AHASH_MASK |
+				    crypto_requires_sync(algt->type,
+							 algt->mask));
 	if (IS_ERR(ghash_alg))
 		return ERR_CAST(ghash_alg);
 
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -68,7 +68,8 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out,
 
 void scatterwalk_done(struct scatter_walk *walk, int out, int more)
 {
-	if (!(scatterwalk_pagelen(walk) & (PAGE_SIZE - 1)) || !more)
+	if (!more || walk->offset >= walk->sg->offset + walk->sg->length ||
+	    !(walk->offset & (PAGE_SIZE - 1)))
 		scatterwalk_pagedone(walk, out, more);
 }
 EXPORT_SYMBOL_GPL(scatterwalk_done);
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -353,11 +353,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
 	crt->final = shash_async_final;
 	crt->finup = shash_async_finup;
 	crt->digest = shash_async_digest;
+	crt->setkey = shash_async_setkey;
+
+	crt->has_setkey = alg->setkey != shash_no_setkey;
 
-	if (alg->setkey) {
-		crt->setkey = shash_async_setkey;
-		crt->has_setkey = true;
-	}
 	if (alg->export)
 		crt->export = shash_async_export;
 	if (alg->import)
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
 obj-$(CONFIG_PARIDE) 		+= block/paride/
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= block/ploop/
 obj-$(CONFIG_TC)		+= tc/
 obj-$(CONFIG_UWB)		+= uwb/
 obj-$(CONFIG_USB_PHY)		+= usb/
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -43,6 +43,7 @@
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_dbg.h>
 #include "../scsi/scsi_transport_api.h"
+#include "../scsi/scsi_dbg.h"
 
 #include <linux/libata.h>
 
@@ -686,6 +687,7 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap,
 				 * Successfully complete it.
 				 */
 				scmd->retries = scmd->allowed;
+				scsi_debug_log_cmnd(ATA_SCSI_CMD_ERROR_HANDLER_CALLS_EH_FINISH, scmd);
 				scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
 			}
 		}
@@ -1002,6 +1004,7 @@ void ata_std_sched_eh(struct ata_port *ap)
 		return;
 
 	ata_eh_set_pending(ap, 1);
+	scsi_debug_log_shost(ATA_STD_SCHED_EH_CALLS_SCHEDULE_EH, ap->scsi_host);
 	scsi_schedule_eh(ap->scsi_host);
 
 	DPRINTK("port EH scheduled\n");
@@ -1024,6 +1027,7 @@ void ata_std_end_eh(struct ata_port *ap)
 {
 	struct Scsi_Host *host = ap->scsi_host;
 
+	scsi_debug_log_shost(ATA_STD_END_EH_ZERO_EH_SCHEDULED, host);
 	host->host_eh_scheduled = 0;
 }
 EXPORT_SYMBOL(ata_std_end_eh);
@@ -1299,6 +1303,7 @@ static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
 	WARN_ON(ata_tag_valid(qc->tag));
 	spin_unlock_irqrestore(ap->lock, flags);
 
+	scsi_debug_log_cmnd(ATA_EH_QC_COMPLETE_CALLS_EH_FINISH, scmd);
 	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
 }
 
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1029,7 +1029,7 @@ static void device_remove_sys_dev_entry(struct device *dev)
 
 	if (kobj) {
 		format_dev_t(devt_str, dev->devt);
-		sysfs_remove_link(kobj, devt_str);
+		sysfs_delete_link(kobj, &dev->kobj, devt_str);
 	}
 }
 
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -21,9 +21,9 @@
 #include <linux/fs.h>
 #include <linux/shmem_fs.h>
 #include <linux/ramfs.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/ve.h>
 #include "base.h"
 
 static struct task_struct *thread;
@@ -54,9 +54,64 @@ static int __init mount_param(char *str)
 }
 __setup("devtmpfs.mount=", mount_param);
 
+#ifdef CONFIG_VE
+static int ve_test_dev_sb(struct super_block *s, void *p)
+{
+	return get_exec_env()->dev_sb == s;
+}
+
+static int ve_set_dev_sb(struct super_block *s, void *p)
+{
+	struct ve_struct *ve = get_exec_env();
+	int error;
+
+	error = set_anon_super(s, p);
+	if (!error) {
+		BUG_ON(ve->dev_sb);
+		ve->dev_sb = s;
+		atomic_inc(&s->s_active);
+	}
+	return error;
+}
+
+static struct dentry *ve_dev_mount(struct file_system_type *fs_type, int flags,
+		      const char *dev_name, void *data)
+{
+	int (*fill_super)(struct super_block *, void *, int);
+	struct super_block *s;
+	int error;
+
+#ifdef CONFIG_TMPFS
+	fill_super = shmem_fill_super;
+#else
+	fill_super = ramfs_fill_super;
+#endif
+	s = sget(fs_type, ve_test_dev_sb, ve_set_dev_sb, flags, NULL);
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	if (!s->s_root) {
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			return ERR_PTR(error);
+		}
+		s->s_flags |= MS_ACTIVE;
+	}
+	return dget(s->s_root);
+}
+#endif /* CONFIG_VE */
+
 static struct dentry *dev_mount(struct file_system_type *fs_type, int flags,
 		      const char *dev_name, void *data)
 {
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return ve_dev_mount(fs_type, flags, dev_name, data);
+#endif
 #ifdef CONFIG_TMPFS
 	return mount_single(fs_type, flags, data, shmem_fill_super);
 #else
@@ -68,6 +123,7 @@ static struct file_system_type dev_fs_type = {
 	.name = "devtmpfs",
 	.mount = dev_mount,
 	.kill_sb = kill_litter_super,
+	.fs_flags = FS_VIRTUALIZED | FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
 };
 
 #ifdef CONFIG_BLOCK
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -272,7 +272,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
  * global one. Requires architecture specific get_dev_cma_area() helper
  * function.
  */
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int align)
 {
 	unsigned long mask, pfn, pageno, start = 0;
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -281,6 +281,22 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+config BLK_DEV_PLOOP
+	tristate "Virtuozzo loopback device support"
+	depends on BLK_DEV_CBT
+	---help---
+	  Saying Y here will allow you to use a regular file as a block
+	  device; you can then create a file system on that block device and
+	  mount it just as you would mount other block devices such as hard
+	  drive partitions, CD-ROM drives or floppy drives. The loop devices
+	  are block special device files with major number 182 and typically
+	  called /dev/ploop0, /dev/ploop1 etc.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ploop.
+
+	  Most users will answer N here.
+
 source "drivers/block/drbd/Kconfig"
 
 config BLK_DEV_NBD
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -616,7 +616,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_SPECIAL;
+		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
 
 		/* Check again after getting mutex back.  */
--- /dev/null
+++ b/drivers/block/ploop/Makefile
@@ -0,0 +1,23 @@
+#
+# Makefile for Virtuozzo loop device
+#
+# Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+#
+
+CFLAGS_io_direct.o = -I$(src)
+CFLAGS_ploop_events.o = -I$(src)
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= ploop.o
+ploop-objs := dev.o map.o io.o sysfs.o tracker.o freeblks.o ploop_events.o discard.o push_backup.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_ploop1.o
+pfmt_ploop1-objs := fmt_ploop1.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_raw.o
+pfmt_raw-objs := fmt_raw.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_direct.o
+pio_direct-objs := io_direct.o io_direct_map.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_kaio.o
+pio_kaio-objs := io_kaio.o io_kaio_map.o
--- /dev/null
+++ b/drivers/block/ploop/dev.c
@@ -0,0 +1,5556 @@
+/*
+ *  drivers/block/ploop/dev.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop_events.h"
+#include "freeblks.h"
+#include "discard.h"
+#include "push_backup.h"
+
+/* Structures and terms:
+ *
+ * ploop_device is root of everything.
+ *	Normally we use local variable "plo" to refer to it.
+ *
+ * ploop_device -> list of ploop_delta's.
+ *	Head of list is "top delta", tail of list is "root delta".
+ *	"top delta" is delta, where all the modifications are written,
+ *	"root delta" is base image. "Level" is distance from root.
+ *
+ * ploop_delta  -> { ops, priv } refers to a particular format of delta.
+ *		-> ploop_io refers to image on disk.
+ *
+ * ploop_io	-> list of ploop_file, each file maps an area in image.
+ *	*** What follows is the "ideal" design; right now we support only one
+ *	*** ploop_file and do not support creation of new ploop_file's.
+ *		-> { ops , priv } generic image ops, mostly creation
+ *		   of new chunks.
+ *
+ * ploop_file	-> { file, ops, priv } how we do real IO on this file.
+ */
+
+static int ploop_max __read_mostly = PLOOP_DEVICE_RANGE;
+static int ploop_major __read_mostly = PLOOP_DEVICE_MAJOR;
+int max_map_pages __read_mostly;
+
+static long root_threshold __read_mostly = 2L * 1024 * 1024; /* 2GB in KB */
+static long user_threshold __read_mostly = 4L * 1024 * 1024; /* 4GB in KB */
+
+static int large_disk_support __read_mostly = 1; /* true */
+
+static struct rb_root ploop_devices_tree = RB_ROOT;
+static DEFINE_MUTEX(ploop_devices_mutex);
+
+static LIST_HEAD(ploop_formats);
+static DEFINE_MUTEX(ploop_formats_mutex);
+
+int ploop_register_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_add(&ops->list, &ploop_formats);
+	mutex_unlock(&ploop_formats_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_format);
+
+void ploop_unregister_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_formats_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_format);
+
+struct ploop_delta_ops * ploop_format_get(unsigned int id)
+{
+	struct ploop_delta_ops * ops;
+
+	mutex_lock(&ploop_formats_mutex);
+	list_for_each_entry(ops, &ploop_formats, list) {
+		if (ops->id == id && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_formats_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_formats_mutex);
+	return NULL;
+}
+
+void ploop_format_put(struct ploop_delta_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+void ploop_msg_once(struct ploop_device *plo, const char *fmt, ...)
+{
+	va_list args;
+
+	if (test_and_set_bit(PLOOP_S_ONCE, &plo->state))
+		return;
+
+	va_start(args, fmt);
+	printk("ploop(%d): ", plo->index);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+EXPORT_SYMBOL(ploop_msg_once);
+
+static void mitigation_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    (!list_empty(&plo->entry_queue) ||
+	     ((plo->bio_head || !bio_list_empty(&plo->bio_discard_list)) &&
+	      !list_empty(&plo->free_list))) &&
+	      waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void freeze_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->freeze_waitq))
+		wake_up_interruptible(&plo->freeze_waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void ploop_congest(struct ploop_device *plo)
+{
+	if (!test_bit(PLOOP_S_CONGESTED, &plo->state) &&
+	    PLOOP_CONGESTED(plo) > plo->tune.congestion_high_watermark)
+		set_bit(PLOOP_S_CONGESTED, &plo->state);
+}
+
+static void ploop_uncongest(struct ploop_device *plo)
+{
+	if (PLOOP_CONGESTED(plo) <= plo->tune.congestion_low_watermark &&
+	    test_and_clear_bit(PLOOP_S_CONGESTED, &plo->state)) {
+		struct backing_dev_info *bdi = &plo->queue->backing_dev_info;
+
+		if (waitqueue_active(&bdi->cong_waitq))
+			wake_up_all(&bdi->cong_waitq);
+	}
+}
+
+static struct ploop_request *
+ploop_alloc_request(struct ploop_device * plo)
+{
+	struct ploop_request * preq;
+
+	/* We allow only a finite number of requests in flight.
+	 * If the caller keeps congesting us, we force it to wait.
+	 *
+	 * _XXX_ I am afraid this logic is flawed. The justification is
+	 * that conventional devices, using request queues, do a similar
+	 * thing, blocking in add_request(), but I am still not sure that
+	 * logic applies here.
+	 */
+	if (list_empty(&plo->free_list)) {
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&plo->req_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+			if (!list_empty(&plo->free_list))
+				break;
+			plo->st.bio_full++;
+			spin_unlock_irq(&plo->lock);
+			io_schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&plo->req_waitq, &_wait);
+	}
+
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+	plo->free_qlen--;
+	ploop_congest(plo);
+	return preq;
+}
+
+static void ploop_grab_iocontext(struct bio *bio)
+{
+	struct io_context **ioc_pp = (struct io_context **)(&bio->bi_bdev);
+	if (current->io_context) {
+		ioc_task_link(current->io_context);
+		*ioc_pp = current->io_context;
+		set_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	}
+}
+
+/* always called with plo->lock held */
+static inline void preq_unlink(struct ploop_request * preq,
+			       struct list_head *drop_list)
+{
+	list_del(&preq->list);
+	ploop_entry_qlen_dec(preq);
+	list_add(&preq->list, drop_list);
+}
+
+static void ploop_set_blockable(struct ploop_device *plo,
+				struct ploop_request *preq)
+{
+	if (!test_and_set_bit(PLOOP_REQ_BLOCKABLE, &preq->state))
+		plo->blockable_reqs++;
+}
+
+static void ploop_test_and_clear_blockable(struct ploop_device *plo,
+					   struct ploop_request *preq)
+{
+	if (test_and_clear_bit(PLOOP_REQ_BLOCKABLE, &preq->state))
+		plo->blockable_reqs--;
+}
+
+/* always called with plo->lock released */
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list)
+{
+	struct ploop_request * preq;
+	int drop_qlen = 0;
+
+	list_for_each_entry(preq, drop_list, list) {
+		if (preq->ioc) {
+			atomic_dec(&preq->ioc->nr_tasks);
+			put_io_context_active(preq->ioc);
+			preq->ioc = NULL;
+		}
+
+		BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state));
+		ploop_test_and_clear_blockable(plo, preq);
+		drop_qlen++;
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	list_splice_init(drop_list, plo->free_list.prev);
+	plo->free_qlen += drop_qlen;
+	if (waitqueue_active(&plo->req_waitq))
+		wake_up(&plo->req_waitq);
+	else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+		waitqueue_active(&plo->waitq) &&
+		(plo->bio_head || !bio_list_empty(&plo->bio_discard_list)))
+		wake_up_interruptible(&plo->waitq);
+
+	ploop_uncongest(plo);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static void merge_rw_flags_to_req(unsigned long rw,
+				  struct ploop_request * preq)
+{
+	if (rw & REQ_FLUSH)
+		preq->req_rw |= REQ_FLUSH;
+	if (rw & REQ_FUA)
+		preq->req_rw |= REQ_FUA;
+}
+
+static void preq_set_sync_bit(struct ploop_request * preq)
+{
+	if (!test_bit(PLOOP_REQ_SYNC, &preq->state)) {
+		if (!(preq->req_rw & WRITE) || (preq->req_rw & (REQ_FLUSH|REQ_FUA))) {
+			preq->plo->read_sync_reqs++;
+			__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		}
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	}
+}
+
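+/*
+ * @preq just grew forward: absorb a directly adjacent request @preq1 and
+ * take any requests that now overlap @preq out of the coalescing tree.
+ */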
+static void overlap_forward(struct ploop_device * plo,
+			    struct ploop_request * preq,
+			    struct ploop_request * preq1,
+			    struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq->req_sector + preq->req_size == preq1->req_sector) {
+		preq->bl.tail->bi_next = preq1->bl.head;
+		preq->bl.tail = preq1->bl.tail;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq1->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mforw++;
+	}
+
+	while ((n = rb_next(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq->req_sector + preq->req_size <= preq1->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oforw++;
+	}
+}
+
+static void overlap_backward(struct ploop_device * plo,
+			     struct ploop_request * preq,
+			     struct ploop_request * preq1,
+			     struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq1->req_sector + preq1->req_size == preq->req_sector) {
+		preq1->bl.tail->bi_next = preq->bl.head;
+		preq->bl.head = preq1->bl.head;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		preq->req_sector = preq1->req_sector;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mback++;
+	}
+
+	while ((n = rb_prev(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq1->req_sector + preq1->req_size <= preq->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oback++;
+	}
+}
+
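+/*
+ * Try to glue @bio to the head or tail of an already queued request;
+ * on success the neighbouring tree entries are checked for further
+ * coalescing.  Returns 1 if the bio was merged, 0 otherwise.
+ */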
+static int try_merge(struct ploop_device *plo, struct ploop_request * preq,
+		     struct bio * bio, struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	/* Merge to tail */
+	if (bio->bi_sector == preq->req_sector + preq->req_size) {
+		preq->bl.tail->bi_next = bio;
+		preq->bl.tail = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->tstamp = jiffies;
+		if (bio->bi_rw & REQ_SYNC)
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		plo->st.coal_forw++;
+		n = rb_next(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector + preq->req_size >= preq1->req_sector)
+				overlap_forward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (bio->bi_sector + (bio->bi_size >> 9) == preq->req_sector) {
+		bio->bi_next = preq->bl.head;
+		preq->bl.head = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->req_sector = bio->bi_sector;
+		preq->tstamp = jiffies;
+		plo->st.coal_back++;
+		if (bio->bi_rw & REQ_SYNC)
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		n = rb_prev(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	return 0;
+}
+
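+/*
+ * Insert @preq0 into the coalescing tree, ordered by cluster and sector
+ * range.  Returns NULL on success, or the already queued request it
+ * overlaps so the caller can try to merge the two.
+ */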
+static struct ploop_request *
+tree_insert(struct rb_root *root, struct ploop_request * preq0)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ploop_request * preq;
+
+	while (*p) {
+		parent = *p;
+		preq = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq0->req_cluster < preq->req_cluster)
+			p = &(*p)->rb_left;
+		else if (preq0->req_cluster > preq->req_cluster)
+			p = &(*p)->rb_right;
+		else if (preq0->req_sector + preq0->req_size < preq->req_sector)
+			p = &(*p)->rb_left;
+		else if (preq0->req_sector > preq->req_sector + preq->req_size)
+			p = &(*p)->rb_right;
+		else
+			return preq;
+	}
+
+	rb_link_node(&preq0->lockout_link, parent, p);
+	rb_insert_color(&preq0->lockout_link, root);
+	__set_bit(PLOOP_REQ_SORTED, &preq0->state);
+	return NULL;
+}
+
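+/*
+ * Queue-time coalescing: if the new request clashes with one already in
+ * the tree and the two are exactly adjacent, splice the bios into the
+ * existing request and drop the new one.  Returns 1 when merged.
+ */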
+static int
+insert_entry_tree(struct ploop_device * plo, struct ploop_request * preq0,
+		  struct list_head *drop_list)
+{
+	struct ploop_request * clash;
+	struct rb_node * n;
+
+	clash = tree_insert(&plo->entry_tree[preq0->req_rw & WRITE], preq0);
+	if (!clash)
+		return 0;
+
+	if (preq0->req_sector == clash->req_sector + clash->req_size) {
+		clash->bl.tail->bi_next = preq0->bl.head;
+		clash->bl.tail = preq0->bl.tail;
+		clash->req_size += preq0->req_size;
+		clash->tstamp = jiffies;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+		plo->st.coal_forw2++;
+
+		n = rb_next(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector + clash->req_size >= preq1->req_sector)
+				overlap_forward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (clash->req_sector == preq0->req_sector + preq0->req_size) {
+		preq0->bl.tail->bi_next = clash->bl.head;
+		clash->bl.head = preq0->bl.head;
+		clash->req_size += preq0->req_size;
+		clash->req_sector = preq0->req_sector;
+		clash->tstamp = jiffies;
+		plo->st.coal_back2++;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+
+		n = rb_prev(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	plo->st.coal_overlap++;
+
+	return 0;
+}
+
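+/*
+ * Turn an incoming bio into a ploop_request taken from the free list.
+ * Discard bios are aligned to cluster boundaries and handed to the
+ * discard machinery; everything else goes to the entry queue and, for
+ * bios with payload, into the coalescing tree.
+ */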
+static void
+ploop_bio_queue(struct ploop_device * plo, struct bio * bio,
+		struct list_head *drop_list, int account_blockable)
+{
+	struct ploop_request * preq;
+
+	BUG_ON(list_empty(&plo->free_list));
+	BUG_ON(plo->free_qlen <= 0);
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+	plo->free_qlen--;
+
+	preq->req_cluster = bio->bi_sector >> plo->cluster_log;
+	bio->bi_next = NULL;
+	preq->req_sector = bio->bi_sector;
+	preq->req_size = bio->bi_size >> 9;
+	preq->req_rw = bio->bi_rw;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = 0;
+	preq->ppb_state = 0;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+	preq->prealloc_size = 0;
+
+	if (account_blockable && (bio->bi_rw & REQ_WRITE) && bio->bi_size &&
+	    ploop_pb_check_and_clear_bit(plo->pbd, preq->req_cluster))
+		ploop_set_blockable(plo, preq);
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+		int clu_size = 1 << plo->cluster_log;
+		int i = (clu_size - 1) & bio->bi_sector;
+		int err = 0;
+
+		if (i) {
+			preq->req_cluster++;
+			if (preq->req_size >= clu_size)
+				preq->req_size -= clu_size - i;
+		}
+
+		if (preq->req_size < clu_size ||
+		    (err = ploop_discard_add_bio(plo->fbd, bio))) {
+			if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+				struct io_context *ioc;
+				ioc = (struct io_context *)(bio->bi_bdev);
+				atomic_dec(&ioc->nr_tasks);
+				put_io_context_active(ioc);
+
+				bio->bi_bdev = plo->bdev;
+				clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+			}
+			BIO_ENDIO(plo->queue, bio, err);
+			list_add(&preq->list, &plo->free_list);
+			plo->free_qlen++;
+			plo->bio_discard_qlen--;
+			plo->bio_total--;
+			return;
+		}
+
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_DISCARD);
+		preq->dst_iblock = 0;
+		preq->bl.head = preq->bl.tail = NULL;
+	} else
+		preq->bl.head = preq->bl.tail = bio;
+
+	if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+		    preq->ioc = (struct io_context *)(bio->bi_bdev);
+		    bio->bi_bdev = plo->bdev;
+		    clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	} else {
+		preq->ioc = NULL;
+	}
+
+	if (unlikely(bio->bi_rw & REQ_SYNC))
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	if (unlikely(bio == plo->bio_sync)) {
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+		plo->bio_sync = NULL;
+	}
+
+	__TRACE("A %p %u\n", preq, preq->req_cluster);
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD))
+		plo->bio_discard_qlen--;
+	else
+		plo->bio_qlen--;
+	ploop_entry_add(plo, preq);
+
+	if (bio->bi_size && !(bio->bi_rw & REQ_DISCARD))
+		insert_entry_tree(plo, preq, drop_list);
+
+	trace_bio_queue(preq);
+}
+
+static inline struct ploop_request *
+ploop_get_request(struct ploop_device * plo, struct list_head * list)
+{
+	struct ploop_request * preq;
+
+	if (unlikely(list_empty(list)))
+		return NULL;
+
+	preq = list_first_entry(list, struct ploop_request, list);
+	list_del_init(&preq->list);
+	return preq;
+}
+
+static struct ploop_delta * find_delta(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		if (delta->level == level)
+			return delta;
+	}
+
+	return NULL;
+}
+
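+/* Completion callback for fast-path bios: complete the original bio and
+ * update the fast-path accounting under plo->lock.
+ */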
+DEFINE_BIO_CB(ploop_fast_end_io)
+{
+	unsigned long flags;
+	struct ploop_device * plo;
+	struct bio * orig = bio->bi_private;
+
+	plo = orig->bi_bdev->bd_disk->private_data;
+
+	BIO_ENDIO(plo->queue, orig, err);
+
+	/* Completion of a fast-path bio wakes up the main process only
+	 * when it could mean an exit from the ATTENTION state.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	plo->active_reqs--;
+	plo->fastpath_reqs--;
+	plo->bio_total--;
+
+	if (plo->active_reqs == 0 &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq) &&
+	    (test_bit(PLOOP_S_EXITING, &plo->state) ||
+	     !list_empty(&plo->entry_queue)))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	bio_put(bio);
+}
+END_BIO_CB(ploop_fast_end_io)
+
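+/* Translate sector @sec of the ploop device into a sector inside a delta
+ * image using only the cached mapping.  Returns the delta to submit to
+ * and fills *isec, or returns NULL if the fast path cannot be used.
+ */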
+static struct ploop_delta *
+ploop_fast_lookup(struct ploop_device * plo, sector_t sec,
+		  unsigned long rw, sector_t * isec)
+{
+	struct ploop_delta * top_delta, * delta;
+	int level;
+	cluster_t bio_cluster = sec >> plo->cluster_log;
+	iblock_t iblk;
+
+	level = ploop_fastmap(&plo->map, bio_cluster, &iblk);
+	if (level < 0)
+		return NULL;
+
+	top_delta = ploop_top_delta(plo);
+	delta = top_delta;
+
+	if (level != top_delta->level) {
+		/* _XXX_ Here is a problem.  At merge_bvec() time we do
+		 * not know whether this bio is a read or a write.  If it is
+		 * a read, we should check the backing map.  This is a
+		 * tradeoff: either we direct reads to the slow path, or we
+		 * do not aggregate writes, which makes COW much slower.
+		 * For now we choose to optimize COW.
+		 */
+		if (rw & REQ_WRITE)
+			return NULL;
+
+		delta = find_delta(plo, level);
+	}
+	if (delta) {
+		*isec = ((sector_t)iblk << plo->cluster_log) +
+			(sec & ((1 << plo->cluster_log) - 1));
+	}
+	return delta;
+}
+
+/* Got a bio which is mapped 1-1 to the block device.
+ * But there is a problem: this bio could have bypassed the device merge
+ * functions, because we skipped them in our own merge_fn.
+ *
+ * We cannot split the bio in the fast path, but we can revalidate it.
+ *
+ * q->max_phys_segments and q->max_hw_segments must be set to the minimum
+ * over all participating backing devices.
+ */
+
+static int
+bio_fast_map(struct ploop_device * plo, struct bio * orig_bio, struct bio * bio)
+{
+	struct ploop_delta * delta;
+	sector_t isector;
+
+	if (orig_bio->bi_size == 0)
+		delta = ploop_top_delta(plo);
+	else
+		delta = ploop_fast_lookup(plo, orig_bio->bi_sector,
+					  orig_bio->bi_rw, &isector);
+	if (delta == NULL) {
+		plo->st.fast_neg_nomap++;
+		return 1;
+	}
+
+	if (delta->io.ops->fastmap == NULL)
+		return 1;
+
+	return delta->io.ops->fastmap(&delta->io, orig_bio, bio, isector);
+}
+
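+/* Number of page vectors needed to cover one whole cluster. */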
+static inline unsigned int block_vecs(struct ploop_device * plo)
+{
+	return 1 << (plo->cluster_log + 9 - PAGE_SHIFT);
+}
+
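+/* Returns 1 iff @preq describes exactly one whole, cluster-aligned block. */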
+static int whole_block(struct ploop_device * plo, struct ploop_request *preq)
+{
+	if (preq->req_size != (1<<plo->cluster_log))
+		return 0;
+	return !(preq->req_sector & ((1<<plo->cluster_log) - 1));
+}
+
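+/* Reuse the per-device cached bio if it has enough vecs for @orig_bio;
+ * otherwise allocate a new bio big enough for either @orig_bio or a
+ * whole cluster, whichever is larger.
+ */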
+static struct bio *
+preallocate_bio(struct bio * orig_bio, struct ploop_device * plo)
+{
+	struct bio * nbio = NULL;
+
+	if (plo->cached_bio) {
+		spin_lock_irq(&plo->lock);
+		nbio = plo->cached_bio;
+		if (nbio) {
+			if (orig_bio->bi_vcnt <= nbio->bi_max_vecs)
+				plo->cached_bio = NULL;
+			else
+				nbio = NULL;
+		}
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (nbio == NULL)
+		nbio = bio_alloc(GFP_NOIO, max(orig_bio->bi_max_vecs, block_vecs(plo)));
+	return nbio;
+}
+
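+/* Dequeue one bio from plo->bio_head and queue it as a preq.  Under
+ * push_backup, writes may be detained by the push_backup machinery
+ * instead of being queued when the free preq list is running low.
+ */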
+static void process_bio_queue_one(struct ploop_device * plo,
+				  struct list_head *drop_list,
+				  int check_push_backup)
+{
+	struct bio *bio = plo->bio_head;
+
+	BUG_ON (!plo->bio_tail);
+	plo->bio_head = plo->bio_head->bi_next;
+	if (!plo->bio_head)
+		plo->bio_tail = NULL;
+
+	if (check_push_backup &&
+	    (bio->bi_rw & REQ_WRITE) && bio->bi_size &&
+	    plo->free_qlen <= plo->free_qmax / 2 &&
+	    plo->blockable_reqs > plo->free_qmax / 4 &&
+	    ploop_pb_bio_detained(plo->pbd, bio))
+		plo->blocked_bios++;
+	else
+		ploop_bio_queue(plo, bio, drop_list, check_push_backup);
+}
+
+static void process_bio_queue_optional(struct ploop_device * plo,
+				       struct list_head *drop_list)
+{
+	while (plo->bio_head && !list_empty(&plo->free_list) &&
+	       (!test_bit(PLOOP_S_PUSH_BACKUP, &plo->state) ||
+		plo->free_qlen > plo->free_qmax / 2))
+		process_bio_queue_one(plo, drop_list, 0);
+}
+
+static void process_bio_queue_main(struct ploop_device * plo,
+				   struct list_head *drop_list)
+{
+	int check = test_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+
+	while (plo->bio_head && !list_empty(&plo->free_list))
+		process_bio_queue_one(plo, drop_list, check);
+}
+
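+/* blk_plug callback: on unplug, mark the last pending bio (or the last
+ * entry-queue preq) as sync and wake up the main thread if it has work.
+ */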
+static void ploop_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct ploop_device *plo = cb->data;
+
+	clear_bit(PLOOP_S_SYNC, &plo->state);
+
+	/* And kick our "soft" queue too in case mitigation timer is in effect */
+	spin_lock_irq(&plo->lock);
+	if (plo->bio_head) {
+		BUG_ON (!plo->bio_tail);
+		/* another way would be: bio_tail->bi_rw |= BIO_RW_SYNCIO; */
+		plo->bio_sync = plo->bio_tail;
+	} else if (!list_empty(&plo->entry_queue)) {
+		struct ploop_request * preq = list_entry(plo->entry_queue.prev,
+							 struct ploop_request,
+							 list);
+		preq_set_sync_bit(preq);
+	}
+
+	if ((!list_empty(&plo->entry_queue) ||
+	     (plo->bio_head && !list_empty(&plo->free_list))) &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+
+	kfree(cb);
+}
+
+static void
+process_discard_bio_queue(struct ploop_device * plo, struct list_head *drop_list)
+{
+	bool discard = test_bit(PLOOP_S_DISCARD, &plo->state);
+
+	while (!list_empty(&plo->free_list)) {
+		struct bio *tmp;
+
+		/* Only one discard bio can be handled concurrently */
+		if (discard && ploop_discard_is_inprogress(plo->fbd))
+			return;
+
+		tmp = bio_list_pop(&plo->bio_discard_list);
+		if (tmp == NULL)
+			break;
+
+		/* If PLOOP_S_DISCARD isn't set, ploop_bio_queue
+		 * will complete it with a proper error.
+		 */
+		ploop_bio_queue(plo, tmp, drop_list, 0);
+	}
+}
+
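+/* Main make_request entry point: account the bio, split it at cluster
+ * boundaries, try merging it into a pending preq, then try the fast path;
+ * otherwise queue the bio for the ploop thread.
+ */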
+static void ploop_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct bio * nbio;
+	struct ploop_device * plo = q->queuedata;
+	unsigned long rw = bio_data_dir(bio);
+	struct hd_struct *part;
+	int cpu;
+	LIST_HEAD(drop_list);
+
+	trace_make_request(bio);
+
+	plo->st.bio_in++;
+
+	BUG_ON(bio->bi_idx);
+	BUG_ON(bio->bi_size & 511);
+
+	cpu = part_stat_lock();
+	part = disk_map_sector_rcu(plo->disk, bio->bi_sector);
+	part_stat_inc(cpu, part, ios[rw]);
+	part_stat_add(cpu, part, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+
+	if (unlikely(bio->bi_size == 0)) {
+		/* Is it possible? This makes sense if the request is
+		 * marked as FLUSH, otherwise just warn and complete. */
+		if (!(bio->bi_rw & REQ_FLUSH)) {
+			WARN_ON(1);
+			BIO_ENDIO(q, bio, 0);
+			return;
+		}
+		/* useless to pass this bio further */
+		if (!plo->tune.pass_flushes) {
+			ploop_acc_ff_in(plo, bio->bi_rw);
+			BIO_ENDIO(q, bio, 0);
+			return;
+		}
+	}
+
+	/* This is crazy.  The pattern is borrowed from raid0.c:
+	 * the bio layer assumes that it can prepare a single-page bio
+	 * regardless of any alignment constraints.  So be it.
+	 */
+	if (!(bio->bi_rw & REQ_DISCARD) && bio->bi_size &&
+	    (bio->bi_sector >> plo->cluster_log) !=
+	    ((bio->bi_sector + (bio->bi_size >> 9) - 1) >> plo->cluster_log)) {
+		struct bio_pair *bp;
+		unsigned int first_sectors = (1<<plo->cluster_log)
+			- (bio->bi_sector & ((1<<plo->cluster_log) - 1));
+
+		plo->st.bio_splits++;
+
+		BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
+
+		bp = bio_split(bio, first_sectors);
+		ploop_make_request(q, &bp->bio1);
+		ploop_make_request(q, &bp->bio2);
+		bio_pair_release(bp);
+		return;
+	}
+
+	rw = bio->bi_rw;
+	if (unlikely((bio->bi_rw & REQ_FLUSH) &&
+		     !plo->tune.pass_flushes))
+		bio->bi_rw &= ~REQ_FLUSH;
+	if (unlikely((bio->bi_rw & REQ_FUA) &&
+		     !plo->tune.pass_fuas))
+		bio->bi_rw &= ~REQ_FUA;
+
+	/* Allocate new bio now. */
+	nbio = preallocate_bio(bio, plo);
+
+	if (!current->io_context) {
+		struct io_context *ioc;
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc)
+			put_io_context(ioc);
+	}
+
+	spin_lock_irq(&plo->lock);
+	ploop_acc_ff_in_locked(plo, rw);
+	plo->bio_total++;
+
+	/* Device is aborted, everything is in error. This should not happen. */
+	if (unlikely(!test_bit(PLOOP_S_RUNNING, &plo->state) ||
+		     ((bio->bi_rw & REQ_WRITE) &&
+		      test_bit(PLOOP_S_ABORT, &plo->state)))) {
+		plo->bio_total--;
+		spin_unlock_irq(&plo->lock);
+
+		BIO_ENDIO(q, bio, -EIO);
+		if (nbio)
+			bio_put(nbio);
+		return;
+	}
+
+	if (bio->bi_rw & REQ_DISCARD) {
+		bio_list_add(&plo->bio_discard_list, bio);
+		plo->bio_discard_qlen++;
+		goto queued;
+	}
+
+	/* Write tracking in fast path does not work at the moment. */
+	if (unlikely(test_bit(PLOOP_S_TRACK, &plo->state) &&
+		     (bio->bi_rw & WRITE)))
+		goto queue;
+
+	/* No fast path, when maintenance is in progress.
+	 * (PLOOP_S_TRACK was checked immediately above) */
+	if (FAST_PATH_DISABLED(plo->maintenance_type))
+		goto queue;
+
+	/* Attention state, always queue */
+	if (unlikely(test_bit(PLOOP_S_ATTENTION, &plo->state)))
+		goto queue;
+
+	/* Some barriers have been already enqueued, always queue */
+	if (unlikely(plo->barrier_reqs))
+		goto queue;
+
+	if (unlikely(nbio == NULL))
+		goto queue;
+
+	/* Try to merge before checking for fastpath. Maybe, this
+	 * is not wise.
+	 */
+	if (!RB_EMPTY_ROOT(&plo->entry_tree[bio->bi_rw & WRITE]) &&
+	    bio->bi_size) {
+		struct ploop_request * preq;
+		struct rb_node * n = plo->entry_tree[bio->bi_rw & WRITE].rb_node;
+		u32 bio_cluster = bio->bi_sector >> plo->cluster_log;
+
+		while (n) {
+			preq = rb_entry(n, struct ploop_request, lockout_link);
+
+			if (bio_cluster < preq->req_cluster)
+				n = n->rb_left;
+			else if (bio_cluster > preq->req_cluster)
+				n = n->rb_right;
+			else if (bio->bi_sector + (bio->bi_size >> 9) < preq->req_sector)
+				n = n->rb_left;
+			else if (bio->bi_sector > preq->req_sector + preq->req_size)
+				n = n->rb_right;
+			else
+				break;
+		}
+
+		if (n && try_merge(plo, preq, bio, &drop_list))
+			goto out;
+	}
+
+	/* Try fast path. If all the mappings are available
+	 * and bio can be remapped without split, just do it.
+	 */
+	if (!bio_fast_map(plo, bio, nbio)) {
+		/* Here is a little problem.  It would be really good
+		 * to remap the original bio and return 1, which is how
+		 * a make_request() engine is supposed to work.
+		 * Nevertheless, that logic is flawed:
+		 *
+		 * we cannot return the remapped bio, because we would lose
+		 * track of it and have no way to wait for the end of its IO,
+		 * e.g. to start a snapshot or to replace the image file.
+		 */
+		trace_bio_fast_map(bio);
+		nbio->bi_private = bio;
+		nbio->bi_end_io = ploop_fast_end_io;
+		plo->active_reqs++;
+		plo->fastpath_reqs++;
+		plo->st.bio_fast++;
+		ploop_acc_ff_out_locked(plo, nbio->bi_rw);
+
+		spin_unlock_irq(&plo->lock);
+
+		generic_make_request(nbio);
+		return;
+	}
+
+	/* Otherwise: queue */
+
+queue:
+	BUG_ON (bio->bi_bdev != plo->bdev && bio_sectors(bio));
+	if (bio->bi_bdev == plo->bdev) {
+		BUG_ON (test_bit(BIO_BDEV_REUSED, &bio->bi_flags));
+		ploop_grab_iocontext(bio);
+	}
+
+	BUG_ON (bio->bi_next);
+	if (plo->bio_tail) {
+		BUG_ON (!plo->bio_head);
+		BUG_ON (plo->bio_tail->bi_next);
+		plo->bio_tail->bi_next = bio;
+		plo->bio_tail = bio;
+	} else {
+		BUG_ON (plo->bio_head);
+		plo->bio_head = plo->bio_tail = bio;
+	}
+	plo->bio_qlen++;
+	ploop_congest(plo);
+
+	/* second chance to merge requests */
+	process_bio_queue_optional(plo, &drop_list);
+
+queued:
+	/* If main thread is waiting for requests, wake it up.
+	 * But try to mitigate wakeups, delaying wakeup for some short
+	 * time.
+	 */
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state)) {
+		/* Synchronous requests are not batched. */
+		if (plo->entry_qlen > plo->tune.batch_entry_qlen ||
+			(bio->bi_rw & (REQ_FLUSH|REQ_FUA)) ||
+			(!bio_list_empty(&plo->bio_discard_list) &&
+			 !list_empty(&plo->free_list)) ||
+			!current->plug) {
+			wake_up_interruptible(&plo->waitq);
+		} else if (!timer_pending(&plo->mitigation_timer)) {
+			mod_timer(&plo->mitigation_timer,
+				  jiffies + plo->tune.batch_entry_delay);
+		}
+	}
+out:
+	if (nbio) {
+		if (!plo->cached_bio)
+			plo->cached_bio = nbio;
+		else
+			bio_put(nbio);
+	}
+	spin_unlock_irq(&plo->lock);
+
+	blk_check_plugged(ploop_unplug, plo, sizeof(struct blk_plug_cb));
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list);
+
+	return;
+}
+
+/* q->merge_bvec_fn
+ *
+ * According to the API, this function returns the length we are able
+ * to merge, but nobody actually uses that value, so we return either 0
+ * or bvec->bv_len.
+ */
+
+static int
+ploop_merge_bvec(struct request_queue *q, struct bvec_merge_data *bm_data,
+		 struct bio_vec *bvec)
+{
+	struct ploop_device *plo = q->queuedata;
+	struct ploop_delta * delta;
+	sector_t sec;
+	sector_t isector;
+	unsigned int len, ret;
+	unsigned long flags;
+
+	sec = bm_data->bi_sector + get_start_sect(bm_data->bi_bdev);
+	len = bm_data->bi_size + bvec->bv_len;
+	ret = bvec->bv_len;
+
+	/* Always allow adding the first bvec. */
+	if (!bm_data->bi_size)
+		return ret;
+
+	/* Is this possible? It would not contradict anything. */
+	BUG_ON(len & 511);
+
+	len >>= 9;
+
+	if ((sec >> plo->cluster_log) !=
+	    ((sec + len - 1) >> plo->cluster_log)) {
+		plo->st.merge_neg_cluster++;
+		return 0;
+	}
+
+	/* We could return ret right now; the further action is an optimization
+	 * to avoid splitting overhead and to enable the fast path.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	delta = ploop_fast_lookup(plo, sec, 0, &isector);
+	if (delta &&
+	    delta->io.ops->disable_merge &&
+	    delta->io.ops->disable_merge(&delta->io, isector, len)) {
+		plo->st.merge_neg_disable++;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	/* If no mapping is available, merge up to cluster boundary */
+	return ret;
+}
+
+static int ploop_congested2(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+
+	if (test_bit(PLOOP_S_CONGESTED, &plo->state))
+		return bits;
+
+	return 0;
+}
+
+static int ploop_congested(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+	struct ploop_delta * top_delta;
+	int ret = 0;
+
+	top_delta = ploop_top_delta(plo);
+	if (top_delta->io.ops->congested)
+		ret |= top_delta->io.ops->congested(&top_delta->io, bits);
+
+	return ret;
+}
+
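+/* A preq whose cluster is already locked out (being processed by another
+ * preq in the lockout tree) is delayed on the owner's delay_list and
+ * re-queued when the owner completes.
+ */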
+static int __check_lockout(struct ploop_request *preq, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node * n = pb ? plo->lockout_pb_tree.rb_node :
+				  plo->lockout_tree.rb_node;
+	struct ploop_request * p;
+	int lockout_bit = pb ? PLOOP_REQ_PB_LOCKOUT : PLOOP_REQ_LOCKOUT;
+
+	if (n == NULL)
+		return 0;
+
+	if (test_bit(lockout_bit, &preq->state))
+		return 0;
+
+	while (n) {
+		if (pb)
+			p = rb_entry(n, struct ploop_request, lockout_pb_link);
+		else
+			p = rb_entry(n, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster < p->req_cluster)
+			n = n->rb_left;
+		else if (preq->req_cluster > p->req_cluster)
+			n = n->rb_right;
+		else {
+			list_add_tail(&preq->list, &p->delay_list);
+			plo->st.bio_lockouts++;
+			trace_preq_lockout(preq, p);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int check_lockout(struct ploop_request *preq)
+{
+	if (__check_lockout(preq, false))
+		return 1;
+
+	/* push_backup passes READs intact */
+	if (!(preq->req_rw & REQ_WRITE))
+		return 0;
+
+	if (__check_lockout(preq, true))
+		return 1;
+
+	return 0;
+}
+
+static int __ploop_add_lockout(struct ploop_request *preq, int try, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+	struct rb_node *link;
+	struct rb_root *tree;
+	int lockout_bit;
+
+	if (pb) {
+		link = &preq->lockout_pb_link;
+		tree = &plo->lockout_pb_tree;
+		lockout_bit = PLOOP_REQ_PB_LOCKOUT;
+	} else {
+		link = &preq->lockout_link;
+		tree = &plo->lockout_tree;
+		lockout_bit = PLOOP_REQ_LOCKOUT;
+	}
+
+	if (test_bit(lockout_bit, &preq->state))
+		return 0;
+
+	p = &tree->rb_node;
+	while (*p) {
+		parent = *p;
+		if (pb)
+			pr = rb_entry(parent, struct ploop_request, lockout_pb_link);
+		else
+			pr = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster == pr->req_cluster) {
+			if (try)
+				return 1;
+			BUG();
+		}
+
+		if (preq->req_cluster < pr->req_cluster)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	trace_add_lockout(preq);
+
+	rb_link_node(link, parent, p);
+	rb_insert_color(link, tree);
+	__set_bit(lockout_bit, &preq->state);
+	return 0;
+}
+
+int ploop_add_lockout(struct ploop_request *preq, int try)
+{
+	return __ploop_add_lockout(preq, try, false);
+}
+EXPORT_SYMBOL(ploop_add_lockout);
+
+static void ploop_add_pb_lockout(struct ploop_request *preq)
+{
+	__ploop_add_lockout(preq, 0, true);
+}
+
+static void __del_lockout(struct ploop_request *preq, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node *link;
+	struct rb_root *tree;
+	int lockout_bit;
+
+	if (pb) {
+		link = &preq->lockout_pb_link;
+		tree = &plo->lockout_pb_tree;
+		lockout_bit = PLOOP_REQ_PB_LOCKOUT;
+	} else {
+		link = &preq->lockout_link;
+		tree = &plo->lockout_tree;
+		lockout_bit = PLOOP_REQ_LOCKOUT;
+	}
+
+	if (!test_and_clear_bit(lockout_bit, &preq->state))
+		return;
+
+	trace_del_lockout(preq);
+
+	rb_erase(link, tree);
+}
+
+void del_lockout(struct ploop_request *preq)
+{
+	__del_lockout(preq, false);
+}
+
+static void del_pb_lockout(struct ploop_request *preq)
+{
+	__del_lockout(preq, true);
+}
+
+static void ploop_discard_wakeup(struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo = preq->plo;
+
+	if (err || !ploop_fb_get_n_free(plo->fbd)) {
+		/* Only one discard request is processed */
+		ploop_fb_reinit(plo->fbd, err);
+	} else
+		set_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state) ||
+		    !test_bit(PLOOP_S_DISCARD, &plo->state))
+			complete(&plo->maintenance_comp);
+}
+
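+/* Final completion of a preq: end all attached bios with preq->error,
+ * handle maintenance bookkeeping (reloc/merge/discard), drop lockouts and
+ * map references, and return the preq to the free list.
+ */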
+static void ploop_complete_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	int nr_completed = 0;
+	struct io_context *ioc;
+
+	trace_complete_request(preq);
+
+	__TRACE("Z %p %u\n", preq, preq->req_cluster);
+
+	while (preq->bl.head) {
+		struct bio * bio = preq->bl.head;
+		preq->bl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, preq->error);
+		nr_completed++;
+	}
+	preq->bl.tail = NULL;
+
+	WARN_ON(!preq->error && test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
+
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_N, &preq->state)) {
+		if (preq->error)
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+		if (!preq->error) {
+			if (plo->merge_ptr < plo->trans_map->max_index) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map) {
+					map_release(preq->map);
+					preq->map = NULL;
+				}
+				if (preq->trans_map) {
+					map_release(preq->trans_map);
+					preq->trans_map = NULL;
+				}
+
+				del_lockout(preq);
+
+				preq->req_cluster = ~0U;
+
+				if (!list_empty(&preq->delay_list))
+					list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+				plo->active_reqs--;
+
+				preq->eng_state = PLOOP_E_ENTRY;
+				ploop_entry_add(plo, preq);
+				spin_unlock_irq(&plo->lock);
+				return;
+			}
+		} else
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_DISCARD, &preq->state))
+		ploop_discard_wakeup(preq, preq->error);
+
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct page *page = bio->bi_io_vec[i].bv_page;
+			if (page != ZERO_PAGE(0))
+				put_page(page);
+		}
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+	del_pb_lockout(preq); /* preq may die via ploop_fail_immediate() */
+	ploop_test_and_clear_blockable(plo, preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	ioc = preq->ioc;
+	preq->ioc = NULL;
+
+	plo->active_reqs--;
+
+	if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+		ploop_fb_put_zero_request(plo->fbd, preq);
+	} else {
+		ploop_uncongest(plo);
+		list_add(&preq->list, &plo->free_list);
+		plo->free_qlen++;
+		if (waitqueue_active(&plo->req_waitq))
+			wake_up(&plo->req_waitq);
+		else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+			 waitqueue_active(&plo->waitq) &&
+			 (plo->bio_head ||
+			  !bio_list_empty(&plo->bio_discard_list)))
+			wake_up_interruptible(&plo->waitq);
+	}
+	plo->bio_total -= nr_completed;
+
+	if (plo->tune.congestion_detection &&
+	    plo->entry_qlen + plo->active_reqs - plo->fastpath_reqs
+	    <= plo->tune.max_requests/2) {
+		if (test_and_clear_bit(PLOOP_S_WRITE_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, WRITE);
+		if (test_and_clear_bit(PLOOP_S_READ_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, READ);
+	}
+
+	spin_unlock_irq(&plo->lock);
+
+	if (ioc) {
+		atomic_dec(&ioc->nr_tasks);
+		put_io_context_active(ioc);
+	}
+}
+
+void ploop_fail_request(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_req_set_error(preq, err);
+
+	spin_lock_irq(&plo->lock);
+	if (err == -ENOSPC) {
+		set_bit(PLOOP_S_ENOSPC_EVENT, &plo->state);
+		list_add(&preq->list, &plo->ready_queue);
+		if (waitqueue_active(&plo->event_waitq))
+			wake_up_interruptible(&plo->event_waitq);
+	} else {
+		set_bit(PLOOP_S_ABORT, &plo->state);
+		list_add_tail(&preq->list, &plo->ready_queue);
+	}
+	spin_unlock_irq(&plo->lock);
+}
+EXPORT_SYMBOL(ploop_fail_request);
+
+void ploop_fail_immediate(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_req_set_error(preq, err);
+
+	set_bit(PLOOP_S_ABORT, &plo->state);
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+}
+
+#define PLOOP_REQ_FAIL_IMMEDIATE(preq, err)		\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_fail_immediate(preq, err);	\
+	} while (0)
+
+void ploop_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	unsigned long flags;
+
+	spin_lock_irqsave(&plo->lock, flags);
+	__TRACE("C %p %u\n", preq, preq->req_cluster);
+	if (preq->error)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+
+	list_add_tail(&preq->list, &plo->ready_queue);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+EXPORT_SYMBOL(ploop_complete_io_state);
+
+
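+/* Attach freshly allocated pages to @bio so that it covers the whole
+ * cluster @blk (bi_sector and bi_size are set accordingly).
+ */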
+static int fill_bio(struct ploop_device *plo, struct bio * bio, cluster_t blk)
+{
+	int pages = block_vecs(plo);
+
+	for (; bio->bi_vcnt < pages; bio->bi_vcnt++) {
+		bio->bi_io_vec[bio->bi_vcnt].bv_page = alloc_page(GFP_NOFS);
+		if (bio->bi_io_vec[bio->bi_vcnt].bv_page == NULL)
+			return -ENOMEM;
+		bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+		bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+	}
+	bio->bi_sector = blk << plo->cluster_log;
+	bio->bi_size = (1 << (plo->cluster_log + 9));
+	return 0;
+}
+
+/* Not generic.  We assume that dst is aligned properly, i.e. it is an
+ * array of whole pages starting at a cluster boundary.
+ */
+static void bio_bcopy(struct bio *dst, struct bio *src, struct ploop_device *plo)
+{
+	int i;
+	unsigned int doff, soff, bv_off;
+
+	doff = (src->bi_sector & ((1<<plo->cluster_log) - 1)) << 9;
+	soff = 0;
+	bv_off = 0;
+	i = 0;
+
+	while (soff < src->bi_size) {
+		struct bio_vec * bv = src->bi_io_vec + i;
+		unsigned int copy;
+		int didx;
+		int poff;
+		void * ksrc;
+
+		if (bv_off >= bv->bv_len) {
+			i++;
+			bv++;
+			bv_off = 0;
+		}
+
+		didx = doff / PAGE_SIZE;
+		poff = doff & (PAGE_SIZE-1);
+		copy = bv->bv_len - bv_off;
+		if (copy > PAGE_SIZE - poff)
+			copy = PAGE_SIZE - poff;
+
+		ksrc = kmap_atomic(bv->bv_page);
+		memcpy(page_address(dst->bi_io_vec[didx].bv_page) + poff,
+		       ksrc + bv->bv_offset + bv_off,
+		       copy);
+		kunmap_atomic(ksrc);
+
+		bv_off += copy;
+		doff += copy;
+		soff += copy;
+	}
+}
+
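+/* Returns 1 iff all data in the bio list is zero (ZERO_PAGE vecs are
+ * trivially zero and skipped).
+ */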
+int check_zeros(struct bio_list * bl)
+{
+	struct bio * bio;
+
+	bio_list_for_each(bio, bl) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct bio_vec * bv = bio->bi_io_vec + i;
+			unsigned long * ptr;
+			void * kaddr;
+			int k;
+
+			if (bv->bv_page == ZERO_PAGE(0))
+				continue;
+
+			kaddr = kmap_atomic(bv->bv_page);
+			ptr = kaddr + bv->bv_offset;
+			k = bv->bv_len/sizeof(unsigned long);
+			while (k) {
+				if (*ptr)
+					break;
+				ptr++;
+				k--;
+			}
+			kunmap_atomic(kaddr);
+			if (k)
+				return 0;
+		}
+	}
+	return 1;
+}
+
+static int prepare_merge_req(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	u32 iblk;
+	int res;
+
+	BUG_ON (preq->trans_map == NULL);
+
+	if (trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+		u32 cluster = preq->req_cluster;
+
+		preq->req_cluster = ~0U;
+
+		if (cluster + 1 != plo->merge_ptr)
+			goto drop_map;
+
+		do {
+			cluster++;
+
+			if (cluster >= plo->trans_map->max_index)
+				goto drop_map;
+
+			if (cluster > map_get_mn_end(preq->trans_map)) {
+				plo->merge_ptr = cluster;
+				goto drop_map;
+			}
+		} while (trans_map_get_index(preq, cluster, &iblk));
+
+		preq->req_cluster = cluster;
+		plo->merge_ptr = cluster + 1;
+	}
+
+	spin_lock_irq(&plo->lock);
+	res = ploop_add_lockout(preq, 1);
+	spin_unlock_irq(&plo->lock);
+	return res;
+
+drop_map:
+	spin_lock_irq(&plo->lock);
+	map_release(preq->trans_map);
+	preq->trans_map = NULL;
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	spin_unlock_irq(&plo->lock);
+	return 1;
+}
+
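+/* Allocate a special ZERO preq that will zero the index of cluster @clu,
+ * queue it on the ready queue, and park @orig_preq on its delay_list so
+ * that orig_preq resumes once the index has been zeroed.
+ */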
+void ploop_queue_zero_request(struct ploop_device *plo,
+			      struct ploop_request *orig_preq, cluster_t clu)
+{
+	struct ploop_request * preq;
+
+	spin_lock_irq(&plo->lock);
+
+	preq = ploop_fb_get_zero_request(plo->fbd);
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = clu;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_ZERO);
+	if (test_bit(PLOOP_REQ_SYNC, &orig_preq->state))
+		preq->state |= (1 << PLOOP_REQ_SYNC);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &orig_preq->state)) {
+		if (orig_preq->dst_iblock == ~0U)
+			orig_preq->eng_state = PLOOP_E_RELOC_COMPLETE;
+	} else {
+		orig_preq->eng_state = orig_preq->iblock ?
+			PLOOP_E_DELTA_ZERO_INDEX : PLOOP_E_ZERO_INDEX;
+	}
+	orig_preq->iblock = 0;
+	INIT_LIST_HEAD(&preq->delay_list);
+	list_add_tail(&orig_preq->list, &preq->delay_list);
+
+	list_add(&preq->list, &plo->ready_queue);
+	plo->active_reqs++;
+
+	spin_unlock_irq(&plo->lock);
+}
+
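+/* Lock out the cluster being relocated (or get delayed if it is already
+ * locked out) and schedule a synchronous read of its whole block into
+ * preq->aux_bio; completion continues in PLOOP_E_RELOC_DATA_READ.
+ */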
+static void
+ploop_reloc_sched_read(struct ploop_request *preq, iblock_t iblk)
+{
+	struct ploop_device *plo   = preq->plo;
+	struct ploop_delta  *delta = ploop_top_delta(plo);
+	struct bio_list sbl;
+
+	spin_lock_irq(&plo->lock);
+	if (check_lockout(preq)) {
+		__TRACE("l2 %p %u\n", preq, preq->req_cluster);
+		spin_unlock_irq(&plo->lock);
+		return;
+	}
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&plo->lock);
+
+	if (!preq->aux_bio) {
+		preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+			return;
+		}
+	}
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_RELOC_DATA_READ;
+	sbl.head = sbl.tail = preq->aux_bio;
+	delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+			      &sbl, iblk, 1<<plo->cluster_log);
+}
+
+/*
+ * Returns 0 if and only if a free block was successfully reused
+ */
+static int
+ploop_reuse_free_block(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	iblock_t  iblk;
+	cluster_t clu;
+	int	  rc;
+	unsigned long pin_state;
+
+	if (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+	    plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -1;
+
+	rc = ploop_fb_get_free_block(plo->fbd, &clu, &iblk);
+
+	/* simple case - no free blocks left */
+	if (rc < 0)
+		return rc;
+
+	/* a free block to reuse requires zeroing index */
+	if (rc > 0) {
+		ploop_queue_zero_request(plo, preq, clu);
+		return 0;
+	}
+
+	/* 'rc == 0' - use iblk as a lost block */
+	pin_state = preq->iblock ? PLOOP_E_DELTA_ZERO_INDEX :
+				   PLOOP_E_ZERO_INDEX;
+	preq->iblock = iblk;
+
+	/* is some reloc request already processing iblk?  If so, pin preq to it */
+	if (ploop_fb_check_reloc_req(plo->fbd, preq, pin_state))
+		return 0;
+
+	/* iblk is a lost block and nobody is relocating it now */
+	preq->eng_state = PLOOP_E_DATA_WBI;
+	__TRACE("T2 %p %u\n", preq, preq->req_cluster);
+	plo->st.bio_out++;
+
+	if (pin_state == PLOOP_E_ZERO_INDEX) {
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &preq->bl, preq->iblock,
+					  preq->req_size);
+	} else { /* PLOOP_E_DELTA_READ */
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				      &sbl, preq->iblock, 1<<plo->cluster_log);
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if zero preq was successfully processed
+ */
+static int
+ploop_entry_zero_req(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	iblock_t iblk = 0;
+	int	 err;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, *clu);
+			return 0;
+		}
+		return err;
+	}
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level != top_delta->level) {
+		printk("Can't zero index on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       iblk, preq->iblock);
+		return -EIO;
+	}
+
+	ploop_index_update(preq);
+	return 0;
+}
+
+#define MAP_MAX_IND(preq) min(map_get_mn_end(preq->map),	\
+			      preq->plo->map.max_index - 1)
+
+/*
+ * Returns 0 if and only if a RELOC_A preq was successfully processed.
+ *
+ * Advances preq->req_cluster until it points to an *iblk in the grow range.
+ * When returning 0, *iblk is always set to a meaningful value: either zero
+ * (if preq->req_cluster went out of the allowed range or the map is being
+ * read) or the iblock in the grow range that preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_a_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	cluster_t           *clu       = &preq->req_cluster;
+	int level;
+	int err;
+	BUG_ON (*clu == ~0U);
+
+	while(*clu < plo->map.max_index) {
+		err = ploop_find_map(&plo->map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("m %p %u\n", preq, *clu);
+				*iblk = 0;
+				return 0;
+			}
+			return err;
+		}
+		BUG_ON (preq->map == NULL);
+
+		for (; *clu <= MAP_MAX_IND(preq); (*clu)++) {
+			level = map_get_index(preq, *clu, iblk);
+			if (level == top_delta->level &&
+			    *iblk >= plo->grow_start &&
+			    *iblk <= plo->grow_end)
+				break;
+		}
+
+		if (*clu <= MAP_MAX_IND(preq))
+			break;
+
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (*clu >= plo->map.max_index) {
+		preq->eng_state = PLOOP_E_COMPLETE;
+		ploop_complete_request(preq);
+		*iblk = 0;
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if a RELOC_S preq was successfully processed.
+ *
+ * Sets preq->req_cluster to the block we are going to relocate.
+ * When returning 0, *iblk is always set to a meaningful value: either
+ * zero (if there are no more blocks to relocate, or the block to relocate
+ * is free and a zero-index op has been scheduled, or the map is being read)
+ * or the iblock that preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_s_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+
+	cluster_t from_clu, to_clu;
+	iblock_t from_iblk, to_iblk;
+	u32 free;
+	int level;
+	int err;
+
+	*iblk = 0;
+
+	if (preq->req_cluster == ~0U) {
+		cluster_t zero_cluster;
+
+		BUG_ON (preq->error);
+		err = ploop_fb_get_reloc_block(plo->fbd, &from_clu, &from_iblk,
+					       &to_clu, &to_iblk, &free);
+		if (err < 0) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return 0;
+		}
+
+		preq->req_cluster = from_clu;
+		preq->src_iblock  = from_iblk;
+		ploop_fb_add_reloc_req(plo->fbd, preq);
+
+		if (free) {
+			preq->dst_iblock  = ~0U;
+			preq->dst_cluster = ~0U;
+			zero_cluster = preq->req_cluster;
+		} else {
+			preq->dst_iblock  = to_iblk;
+			preq->dst_cluster = to_clu;
+			zero_cluster = preq->dst_cluster;
+		}
+
+		ploop_queue_zero_request(plo, preq, zero_cluster);
+		return 0;
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, *clu);
+			return 0;
+		}
+		return err;
+	}
+	BUG_ON (preq->map == NULL);
+
+	level = map_get_index(preq, preq->req_cluster, iblk);
+	if (level != top_delta->level) {
+		printk("Can't relocate block on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       *iblk, preq->iblock);
+		return -EIO;
+	}
+	if (preq->src_iblock != *iblk) {
+		printk("Can't relocate block due to wrong mapping: "
+		       "req_cluster=%u should point to iblk=%u while "
+		       "map_get_index() calculated iblk=%u\n",
+		       preq->req_cluster, preq->src_iblock, *iblk);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/* dummy wrapper around ploop_entry_reloc_[a|s]_req() */
+static int
+ploop_entry_reloc_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state))
+		return ploop_entry_reloc_a_req(preq, iblk);
+	else if (test_bit(PLOOP_REQ_RELOC_S, &preq->state))
+		return ploop_entry_reloc_s_req(preq, iblk);
+	else
+		BUG();
+}
+
+static void fill_zero_bio(struct ploop_device *plo, struct bio * bio)
+{
+	int pages = block_vecs(plo);
+
+	for (; bio->bi_vcnt < pages; bio->bi_vcnt++) {
+		bio->bi_io_vec[bio->bi_vcnt].bv_page = ZERO_PAGE(0);
+		bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+		bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+	}
+	bio->bi_sector = 0;
+	bio->bi_size = (1 << (plo->cluster_log + 9));
+}
+
+/*
+ * Returns 0 if and only if a RELOC_N preq was successfully scheduled.
+ *
+ * Nullify the block preq->iblock in the grow range by writing a whole
+ * cluster of zero pages to it; the write is followed by a forced flush
+ * when the io supports issue_flush.
+ */
+static int
+ploop_entry_nullify_req(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	struct bio_list sbl;
+
+	if (!preq->aux_bio) {
+		preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+		if (!preq->aux_bio)
+			return -ENOMEM;
+		fill_zero_bio(plo, preq->aux_bio);
+	}
+
+	sbl.head = sbl.tail = preq->aux_bio;
+	preq->eng_state = PLOOP_E_RELOC_NULLIFY;
+	list_del_init(&preq->list);
+
+	/*
+	 * One might think that the sync of nullified blocks is done by the
+	 * format driver via an fsync of the image before the header update.
+	 * But we write this data directly into the underlying device,
+	 * bypassing EXT4 by using the extent map tree (see dio_submit()),
+	 * so an fsync of the EXT4 image does not help us.  We have to
+	 * force the sync of nullified blocks ourselves.
+	 */
+	if (top_delta->io.ops->issue_flush) {
+		preq->eng_io = &top_delta->io;
+		set_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
+	}
+
+	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				  &sbl, preq->iblock, 1<<plo->cluster_log);
+	return 0;
+}
+
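+/* Look up the iblock of preq->req_cluster in the top delta; preq->iblock
+ * is left zero if the cluster is not mapped at the top level.
+ */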
+static int discard_get_index(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	int	 err;
+
+	preq->iblock = 0;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err)
+		return err;
+
+	level = map_get_index(preq, preq->req_cluster, &preq->iblock);
+	if (level != top_delta->level)
+		preq->iblock = 0;
+
+	if (preq->map) {
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	return 0;
+}
+
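+/* Walk all clusters covered by the discard request and register contiguous
+ * runs of top-delta iblocks as free extents via ploop_fb_add_free_extent().
+ */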
+static int ploop_entry_discard_req(struct ploop_request *preq)
+{
+	int err = 0;
+	struct ploop_device * plo = preq->plo;
+	unsigned int len = 0;
+	cluster_t last_clu;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+
+	BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+
+	last_clu = (preq->req_sector + preq->req_size) >> plo->cluster_log;
+
+	for (; preq->req_cluster < last_clu; preq->req_cluster++) {
+		len = preq->req_cluster - preq->dst_cluster;
+
+		err = discard_get_index(preq);
+		if (err) {
+			if (err == 1)
+				return 0;
+			goto err;
+		}
+
+		if (preq->dst_iblock &&
+		    (!preq->iblock || preq->dst_iblock + len != preq->iblock)) {
+			err = ploop_fb_add_free_extent(plo->fbd,
+							preq->dst_cluster,
+							preq->dst_iblock, len);
+			preq->dst_iblock = 0;
+			if (err) {
+				if (err == -EINVAL) {
+					printk("ploop_entry_discard_req1: "
+					       "(%lu %u; %u %u; %u %u)\n",
+					       preq->req_sector, preq->req_size,
+					       preq->req_cluster, preq->iblock,
+					       preq->dst_cluster, preq->dst_iblock);
+					WARN_ONCE(1, "add_free_extent failed\n");
+				}
+				goto err;
+			}
+		}
+
+		if (!preq->dst_iblock && preq->iblock) {
+			preq->dst_cluster = preq->req_cluster;
+			preq->dst_iblock = preq->iblock;
+		}
+	}
+
+	if (preq->dst_iblock) {
+		len = preq->req_cluster - preq->dst_cluster;
+		err = ploop_fb_add_free_extent(plo->fbd, preq->dst_cluster,
+						preq->dst_iblock, len);
+		if (err == -EINVAL) {
+			printk("ploop_entry_discard_req2: "
+			       "(%lu %u; %u %u; %u %u)\n",
+			       preq->req_sector, preq->req_size,
+			       preq->req_cluster, preq->iblock,
+			       preq->dst_cluster, preq->dst_iblock);
+			WARN_ONCE(1, "add_free_extent failed\n");
+		}
+	}
+
+err:
+	preq->error = err;
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+
+	return 0;
+}
+
+/* Main preq state machine */
+
+static inline bool preq_is_special(struct ploop_request * preq)
+{
+	unsigned long state = READ_ONCE(preq->state);
+
+	return state & (PLOOP_REQ_MERGE_FL |
+			PLOOP_REQ_RELOC_A_FL |
+			PLOOP_REQ_RELOC_S_FL |
+			PLOOP_REQ_RELOC_N_FL |
+			PLOOP_REQ_DISCARD_FL |
+			PLOOP_REQ_ZERO_FL);
+}
+
+void ploop_add_req_to_fsync_queue(struct ploop_request * preq)
+{
+	struct ploop_device * plo       = preq->plo;
+	struct ploop_delta  * top_delta = ploop_top_delta(plo);
+	struct ploop_io     * top_io    = &top_delta->io;
+
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &top_io->fsync_queue);
+	top_io->fsync_qlen++;
+	if (waitqueue_active(&top_io->fsync_waitq))
+		wake_up_interruptible(&top_io->fsync_waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
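+/* First-stage processing of a preq in PLOOP_E_ENTRY state: handle special
+ * (maintenance) requests, flushes, lockouts and push_backup, then look up
+ * the mapping and either submit IO to a delta or start COW/allocation.
+ */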
+static void
+ploop_entry_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo       = preq->plo;
+	struct ploop_delta  * top_delta = ploop_top_delta(plo);
+	struct ploop_io     * top_io    = &top_delta->io;
+	struct ploop_delta  * delta;
+	int level;
+	int err;
+	iblock_t iblk;
+
+	if (!preq_is_special(preq)) {
+		/* Control request */
+		if (unlikely(preq->bl.head == NULL)) {
+			complete(plo->quiesce_comp);
+			wait_for_completion(&plo->relax_comp);
+			ploop_complete_request(preq);
+			complete(&plo->relaxed_comp);
+			return;
+		}
+
+		/* Need to fsync before start handling FLUSH */
+		if ((preq->req_rw & REQ_FLUSH) &&
+		    test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state) &&
+		    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
+			ploop_add_req_to_fsync_queue(preq);
+			return;
+		}
+
+		/* Empty flush or unknown zero-size request */
+		if (preq->req_size == 0) {
+			if (preq->req_rw & REQ_FLUSH &&
+			    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
+				preq->eng_state = PLOOP_E_COMPLETE;
+				if (top_io->ops->issue_flush) {
+					top_io->ops->issue_flush(top_io, preq);
+					return;
+				}
+			}
+
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return;
+		}
+	}
+
+	if (unlikely(test_bit(PLOOP_REQ_SYNC, &preq->state) &&
+		     !(preq->req_rw & REQ_SYNC)))
+		preq->req_rw |= REQ_SYNC;
+
+restart:
+	if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+		err = ploop_entry_discard_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_ZERO, &preq->state)) {
+		err = ploop_entry_zero_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+		   test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		err = ploop_entry_reloc_req(preq, &iblk);
+		if (err)
+			goto error;
+		if (iblk)
+			ploop_reloc_sched_read(preq, iblk);
+		return;
+	} else if (test_bit(PLOOP_REQ_RELOC_N, &preq->state)) {
+		err = ploop_entry_nullify_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (preq->req_cluster == ~0U) {
+		BUG_ON(!test_bit(PLOOP_REQ_MERGE, &preq->state));
+		BUG_ON(preq->trans_map);
+		BUG_ON(preq->map);
+
+		preq->req_cluster = plo->merge_ptr;
+		plo->merge_ptr++;
+		if (preq->req_cluster >= plo->trans_map->max_index) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return;
+		}
+	}
+
+	if (check_lockout(preq)) {
+		__TRACE("l %p %u\n", preq, preq->req_cluster);
+		return;
+	}
+
+	/* push_backup special processing */
+	if (!test_bit(PLOOP_REQ_PB_LOCKOUT, &preq->state) &&
+	    (preq->req_rw & REQ_WRITE) && preq->req_size &&
+	    ploop_pb_check_bit(plo->pbd, preq->req_cluster)) {
+		if (ploop_pb_preq_add_pending(plo->pbd, preq)) {
+			/* already reported by userspace push_backup */
+			ploop_pb_clear_bit(plo->pbd, preq->req_cluster);
+		} else {
+			/* needn't lock because only ploop_thread accesses */
+			ploop_add_pb_lockout(preq);
+			ploop_set_blockable(plo, preq);
+			/*
+			 * preq IN: preq is in ppb_pending tree waiting for
+			 * out-of-band push_backup processing by userspace ...
+			 */
+			return;
+		}
+	} else if (test_bit(PLOOP_REQ_PB_LOCKOUT, &preq->state) &&
+		   test_and_clear_bit(PLOOP_REQ_PUSH_BACKUP, &preq->ppb_state)) {
+		/*
+		 * preq OUT: out-of-band push_backup processing by
+		 * userspace done; preq was re-scheduled
+		 */
+		ploop_pb_clear_bit(plo->pbd, preq->req_cluster);
+		ploop_test_and_clear_blockable(plo, preq);
+
+		del_pb_lockout(preq);
+		spin_lock_irq(&plo->lock);
+		if (!list_empty(&preq->delay_list))
+			list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (plo->trans_map) {
+		err = ploop_find_trans_map(plo->trans_map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("tm %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+			goto error;
+		}
+
+		if (preq->trans_map &&
+		    !(preq->req_rw & REQ_WRITE) &&
+		    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+			delta = map_top_delta(plo->trans_map);
+			preq->iblock = iblk;
+			preq->eng_state = PLOOP_E_COMPLETE;
+			plo->st.bio_out++;
+			__TRACE("tS %p %u\n", preq, preq->req_cluster);
+			delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+					      iblk, preq->req_size);
+			return;
+		}
+
+		if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+			if (prepare_merge_req(preq))
+				goto restart;
+		}
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return;
+		}
+		goto error;
+	}
+
+	if (preq->trans_map &&
+	    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+		struct bio_list sbl;
+
+		/* Read requests were served earlier. */
+		BUG_ON(!(preq->req_rw & REQ_WRITE));
+
+		spin_lock_irq(&plo->lock);
+		ploop_add_lockout(preq, 0);
+		spin_unlock_irq(&plo->lock);
+
+		if (whole_block(plo, preq)) {
+			set_bit(PLOOP_REQ_TRANS, &preq->state);
+			plo->st.bio_trans_whole++;
+			goto delta_io;
+		}
+
+		plo->st.bio_cows++;
+
+		if (!preq->aux_bio)
+			preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+			return;
+		}
+
+		delta = map_top_delta(plo->trans_map);
+
+		__TRACE("tDR %p %u\n", preq, preq->req_cluster);
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_TRANS_DELTA_READ;
+		sbl.head = sbl.tail = preq->aux_bio;
+		delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+				      &sbl, iblk, 1<<plo->cluster_log);
+		plo->st.bio_trans_copy++;
+		return;
+	}
+
+delta_io:
+	BUG_ON(test_bit(PLOOP_REQ_MERGE, &preq->state));
+
+	delta = top_delta;
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level < 0) {
+		delta = NULL;
+	} else if (level != top_delta->level) {
+		delta = find_delta(plo, level);
+		if (!delta) {
+			err = -EIO;
+			goto error;
+		}
+	}
+
+	if (!(preq->req_rw & REQ_WRITE)) {
+		/* Read direction.  If we found an existing block in some
+		 * delta, we direct the bio there.  If we did not, this
+		 * location was never written before.  We return zero fill
+		 * and, probably, should log an alert.
+		 */
+		if (!delta) {
+			struct bio * bio;
+
+			if (map_index_fault(preq) == 0) {
+				__TRACE("i %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+
+			__TRACE("X %p %u\n", preq, preq->req_cluster);
+			bio_list_for_each(bio, &preq->bl) {
+				zero_fill_bio(bio);
+			}
+			ploop_complete_request(preq);
+			plo->st.bio_rzero++;
+			return;
+		}
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_COMPLETE;
+		plo->st.bio_out++;
+		__TRACE("S %p %u\n", preq, preq->req_cluster);
+		delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				      iblk, preq->req_size);
+	} else {
+		if (delta) {
+			if (delta == top_delta) {
+				/* Block exists in top delta. Good. */
+				if (plo->maintenance_type == PLOOP_MNTN_GROW ||
+				    plo->maintenance_type == PLOOP_MNTN_RELOC) {
+					spin_lock_irq(&plo->lock);
+					ploop_add_lockout(preq, 0);
+					spin_unlock_irq(&plo->lock);
+				}
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_COMPLETE;
+				__TRACE("T %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_out++;
+				delta->io.ops->submit(&delta->io, preq, preq->req_rw,
+						      &preq->bl, iblk, preq->req_size);
+			} else if (whole_block(plo, preq)) {
+				__TRACE("O1 %p %u\n", preq, preq->req_cluster);
+				/* The block does not exist in the top delta,
+				 * but it exists in some lower delta.
+				 * By plain luck we have the full block
+				 * and can skip the read stage.
+				 */
+				plo->st.bio_whole_cows++;
+
+				/* About lockout. Reads could proceed
+				 * without lockout.
+				 */
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				if (likely(ploop_reuse_free_block(preq)))
+					top_delta->ops->allocate(top_delta,
+								 preq, &preq->bl,
+								 preq->req_size);
+			} else {
+				struct bio_list sbl;
+
+				plo->st.bio_cows++;
+
+				if (!preq->aux_bio)
+					preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+				if (!preq->aux_bio ||
+				    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+					PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+					return;
+				}
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				__TRACE("DR %p %u\n", preq, preq->req_cluster);
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_DELTA_READ;
+				sbl.head = sbl.tail = preq->aux_bio;
+				delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+						      &sbl, iblk, 1<<plo->cluster_log);
+			}
+		} else {
+			if (!whole_block(plo, preq) && map_index_fault(preq) == 0) {
+					__TRACE("f %p %u\n", preq, preq->req_cluster);
+					return;
+			}
+
+			if (plo->tune.check_zeros && check_zeros(&preq->bl)) {
+				if (map_index_fault(preq) == 0) {
+					__TRACE("f %p %u\n", preq, preq->req_cluster);
+					return;
+				}
+				preq->eng_state = PLOOP_E_COMPLETE;
+				/* Not ploop_complete_request().
+				 * This can be TRANS request.
+				 */
+				ploop_complete_io_state(preq);
+				if (whole_block(plo, preq))
+					plo->st.bio_alloc_whole++;
+				plo->st.bio_wzero++;
+				return;
+			}
+			if (whole_block(plo, preq))
+				plo->st.bio_alloc_whole++;
+
+			spin_lock_irq(&plo->lock);
+			ploop_add_lockout(preq, 0);
+			spin_unlock_irq(&plo->lock);
+
+			/* Block does not exist. */
+			if (likely(ploop_reuse_free_block(preq))) {
+				__TRACE("K %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_alloc++;
+				top_delta->ops->allocate(top_delta, preq,
+							 &preq->bl,
+							 preq->req_size);
+			}
+		}
+	}
+	return;
+
+error:
+	PLOOP_REQ_FAIL_IMMEDIATE(preq, err);
+}
+
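+/* Drive one preq through the state machine according to preq->eng_state,
+ * running in the io_context (and beancounter) of the task that submitted
+ * the original bio when one was saved.
+ */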
+static void ploop_req_state_process(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta;
+	struct io_context * saved_ioc = NULL;
+	int release_ioc = 0;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter * uninitialized_var(saved_ub);
+#endif
+
+	trace_req_state_process(preq);
+
+	if (preq->ioc) {
+		saved_ioc = current->io_context;
+		current->io_context = preq->ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		saved_ub = set_exec_ub(preq->ioc->ioc_ub);
+#endif
+		atomic_long_inc(&preq->ioc->refcount);
+		release_ioc = 1;
+	}
+
+	if (preq->eng_state != PLOOP_E_COMPLETE &&
+	    test_bit(PLOOP_REQ_SYNC, &preq->state))
+		set_bit(PLOOP_S_SYNC, &plo->state);
+
+	if (test_bit(PLOOP_REQ_TRACK, &preq->state)) {
+		sector_t sec;
+		clear_bit(PLOOP_REQ_TRACK, &preq->state);
+
+		sec = (sector_t)preq->track_cluster << plo->cluster_log;
+		if (sec < plo->track_end)
+			ploop_tracker_notify(plo, sec);
+	}
+
+	/* trick: preq->prealloc_size is actually the new position of EOF */
+	if (unlikely(preq->prealloc_size && !preq->error)) {
+		struct ploop_io *io = &ploop_top_delta(plo)->io;
+		int log = preq->plo->cluster_log + 9;
+
+		BUG_ON(preq != io->prealloc_preq);
+		io->prealloc_preq = NULL;
+
+		io->prealloced_size = preq->prealloc_size -
+				      ((loff_t)io->alloc_head << log);
+		preq->prealloc_size = 0; /* only for sanity */
+	}
+
+	if (test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {
+		preq->eng_io->ops->post_submit(preq->eng_io, preq);
+		clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+		preq->eng_io = NULL;
+	}
+
+	if (test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state)) {
+		preq->eng_io->ops->issue_flush(preq->eng_io, preq);
+		clear_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
+		preq->eng_io = NULL;
+		goto out;
+	}
+
+restart:
+	BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state));
+	__TRACE("ST %p %u %lu\n", preq, preq->req_cluster, preq->eng_state);
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		/* First entry */
+		if (preq->error ||
+		    ((preq->req_rw & REQ_WRITE) &&
+		     test_bit(PLOOP_S_ABORT, &plo->state))) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		ploop_entry_request(preq);
+		break;
+
+	case PLOOP_E_RELOC_COMPLETE:
+		BUG_ON (!test_bit(PLOOP_REQ_RELOC_S, &preq->state));
+		if (!preq->error) {
+			ploop_fb_relocate_req_completed(plo->fbd);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+			spin_lock_irq(&plo->lock);
+			if (!list_empty(&preq->delay_list)) {
+				struct ploop_request *pr;
+				pr = list_entry(preq->delay_list.next,
+						struct ploop_request, list);
+				list_splice_init(&preq->delay_list,
+						 plo->ready_queue.prev);
+			}
+			spin_unlock_irq(&plo->lock);
+			preq->req_cluster = ~0U;
+			preq->src_iblock  = ~0U; /* redundant */
+			preq->dst_cluster = ~0U; /* redundant */
+			preq->dst_iblock  = ~0U; /* redundant */
+			preq->eng_state = PLOOP_E_ENTRY;
+			goto restart;
+		}
+		/* drop down to PLOOP_E_COMPLETE case ... */
+	case PLOOP_E_COMPLETE:
+		if (unlikely(test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
+			     preq->error)) {
+			printk("RELOC_S completed with err %d"
+			       " (%u %u %u %u %u)\n",
+			       preq->error, preq->req_cluster, preq->iblock,
+			       preq->src_iblock, preq->dst_cluster,
+			       preq->dst_iblock);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+		}
+
+		if (!preq->error &&
+		    test_bit(PLOOP_REQ_TRANS, &preq->state)) {
+			u32 iblk;
+
+			__clear_bit(PLOOP_REQ_TRANS, &preq->state);
+			BUG_ON(!preq->trans_map);
+			if (!trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map)
+					map_release(preq->map);
+				preq->map = preq->trans_map;
+				preq->trans_map = NULL;
+				spin_unlock_irq(&plo->lock);
+				preq->iblock = 0;
+				top_delta = map_top_delta(plo->trans_map);
+				top_delta->ops->allocate_complete(top_delta, preq);
+				plo->st.bio_trans_index++;
+				break;
+			}
+		}
+
+		ploop_complete_request(preq);
+		/* All done. */
+		break;
+
+	case PLOOP_E_DELTA_READ:
+	{
+		struct bio * b;
+
+		/* preq was scheduled for read from delta. bio is a bio
+		 * covering full block of data. Now we should copy data
+		 * and proceed with write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		/* Fall through ... */
+	}
+	case PLOOP_E_DELTA_COPIED:
+	{
+		if (likely(ploop_reuse_free_block(preq))) {
+			struct bio_list sbl;
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta = ploop_top_delta(plo);
+			top_delta->ops->allocate(top_delta, preq,
+						 &sbl, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_ZERO_INDEX:
+	{
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		if (whole_block(plo, preq)) {
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &preq->bl, preq->iblock,
+						  preq->req_size);
+		} else {
+			struct bio_list sbl;
+			struct bio * b;
+			int i;
+
+			if (!preq->aux_bio)
+				preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+			if (!preq->aux_bio ||
+			    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+				PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+				break;
+			}
+
+			for (i = 0; i < preq->aux_bio->bi_vcnt; i++)
+				memset(page_address(preq->aux_bio->bi_io_vec[i].bv_page),
+				       0, PAGE_SIZE);
+
+			bio_list_for_each(b, &preq->bl) {
+				bio_bcopy(preq->aux_bio, b, plo);
+			}
+
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, preq->iblock, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_DELTA_ZERO_INDEX:
+	{
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		sbl.head = sbl.tail = preq->aux_bio;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &sbl, preq->iblock,
+					  1<<plo->cluster_log);
+		break;
+	}
+	case PLOOP_E_RELOC_DATA_READ:
+	{
+		struct bio_list sbl;
+
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		/* The relocated data write requires a sync before the BAT
+		 * update; this happens inside index_update */
+
+		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+			preq->eng_state = PLOOP_E_DATA_WBI;
+			plo->st.bio_out++;
+			preq->iblock = preq->dst_iblock;
+			top_delta->io.ops->submit(&top_delta->io, preq,
+						  preq->req_rw, &sbl,
+						  preq->iblock,
+						  1<<plo->cluster_log);
+		} else {
+			top_delta->ops->allocate(top_delta, preq, &sbl,
+						 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_RELOC_NULLIFY:
+	{
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		if (++plo->grow_relocated > plo->grow_end - plo->grow_start) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			break;
+		}
+
+		del_lockout(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->iblock++;
+		goto restart;
+	}
+	case PLOOP_E_TRANS_DELTA_READ:
+	{
+		struct bio * b;
+		struct bio_list sbl;
+		u32 iblk;
+
+		/* preq was scheduled for a read from a delta.  The bio
+		 * covers a full block of data.  Now we should copy the
+		 * data and proceed with the write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		__set_bit(PLOOP_REQ_TRANS, &preq->state);
+		if (map_get_index(preq, preq->req_cluster, &iblk) != top_delta->level) {
+			/*
+			 * we can be here only if merge is in progress and
+			 * merge can't happen concurrently with ballooning
+			 */
+			top_delta->ops->allocate(top_delta, preq, &sbl, 1<<plo->cluster_log);
+			plo->st.bio_trans_alloc++;
+		} else {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			preq->iblock = iblk;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, iblk, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_INDEX_READ:
+	case PLOOP_E_TRANS_INDEX_READ:
+		/* It was an index read. */
+		map_read_complete(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		goto restart;
+
+	case PLOOP_E_DATA_WBI:
+		/* Data written. Index must be updated. */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		top_delta = ploop_top_delta(plo);
+		top_delta->ops->allocate_complete(top_delta, preq);
+		break;
+
+	case PLOOP_E_INDEX_WB:
+		/* Index write completed. */
+		ploop_index_wb_complete(preq);
+		break;
+
+	case PLOOP_E_FSYNC_PENDED:
+		/* fsync done */
+		ploop_index_wb_proceed(preq);
+		break;
+
+	default:
+		BUG();
+	}
+out:
+	if (release_ioc) {
+		struct io_context * ioc = current->io_context;
+		current->io_context = saved_ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		set_exec_ub(saved_ub);
+#endif
+		put_io_context(ioc);
+	}
+}
+
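+/* Sleep until the ploop thread has work to do: something on the ready
+ * queue, a startable request on the entry queue, or pending bios while
+ * a free preq is available.  plo->lock is dropped and the block plug is
+ * flushed around schedule(); called and returns with plo->lock held.
+ */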
+static void ploop_wait(struct ploop_device * plo, int once, struct blk_plug *plug)
+{
+	DEFINE_WAIT(_wait);
+	for (;;) {
+		prepare_to_wait(&plo->waitq, &_wait, TASK_INTERRUPTIBLE);
+
+		/* This is obvious. */
+		if (!list_empty(&plo->ready_queue))
+			break;
+
+		/* This is not. If we have something in entry queue... */
+		if (!list_empty(&plo->entry_queue)) {
+			/* And the entry queue is not suspended due to a
+			 * barrier, or all active requests have completed,
+			 * so that we can start/finish barrier processing
+			 */
+			if (!once &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		} else if (plo->bio_head ||
+			   (!bio_list_empty(&plo->bio_discard_list) &&
+			    !ploop_discard_is_inprogress(plo->fbd))) {
+			/* ready_queue and entry_queue are empty, but the
+			 * bio list is not.  Obviously, we'd rather process
+			 * the bio list than sleep */
+			if (!list_empty(&plo->free_list) &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		}
+
+		if (kthread_should_stop() && !plo->active_reqs)
+			break;
+
+		set_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+		if (kthread_should_stop())
+			set_bit(PLOOP_S_EXITING, &plo->state);
+		once = 0;
+		spin_unlock_irq(&plo->lock);
+		blk_finish_plug(plug);
+		schedule();
+		blk_start_plug(plug);
+		spin_lock_irq(&plo->lock);
+		clear_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+	}
+	finish_wait(&plo->waitq, &_wait);
+}
+
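+/* A request failed with -ENOSPC.  Arm the freeze timer, sleep on
+ * freeze_waitq, then drop the request's auxiliary bio, lockout and map
+ * references, move delayed requests back to the ready queue and reset
+ * the request to PLOOP_E_ENTRY so it will be retried.
+ */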
+static void ploop_handle_enospc_req(struct ploop_request *preq)
+{
+	struct ploop_device * plo = preq->plo;
+	DEFINE_WAIT(_wait);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return;
+
+	mod_timer(&plo->freeze_timer, jiffies + HZ * 10);
+
+	prepare_to_wait(&plo->freeze_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+
+	finish_wait(&plo->freeze_waitq, &_wait);
+
+	spin_unlock_irq(&plo->lock);
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct page *page = bio->bi_io_vec[i].bv_page;
+			if (page != ZERO_PAGE(0))
+				put_page(page);
+		}
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+}
+
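+/* Pull queued push-backup bios into regular preqs while free preqs are
+ * plentiful (free queue more than half full, or few blockable requests
+ * outstanding).
+ */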
+static void
+process_pending_bios(struct ploop_device * plo, struct list_head *drop_list)
+{
+	while (!ploop_pb_bio_list_empty(plo->pbd) &&
+	       !list_empty(&plo->free_list) &&
+	       (plo->free_qlen > plo->free_qmax / 2 ||
+		plo->blockable_reqs <= plo->free_qmax / 4)) {
+		struct bio *bio = ploop_pb_bio_get(plo->pbd);
+
+		ploop_bio_queue(plo, bio, drop_list, 1);
+		plo->blocked_bios--;
+	}
+}
+
+/* Main processing thread.  Processes the queues in the proper order,
+ * handling pre-barrier flushes and suspending the queue while a
+ * barrier is being processed.
+ */
+static int ploop_thread(void * data)
+{
+	int once = 0;
+	struct ploop_device * plo = data;
+	struct blk_plug plug;
+	LIST_HEAD(drop_list);
+
+	set_user_nice(current, -20);
+
+	blk_start_plug(&plug);
+	for (;;) {
+		/* Convert bios to preqs early (at least before processing
+		 * entry queue) to increase chances of bio merge
+		 */
+		cond_resched();
+		spin_lock_irq(&plo->lock);
+		BUG_ON (!list_empty(&drop_list));
+
+		process_pending_bios(plo, &drop_list);
+		process_bio_queue_main(plo, &drop_list);
+		process_discard_bio_queue(plo, &drop_list);
+
+		if (!list_empty(&drop_list)) {
+			spin_unlock_irq(&plo->lock);
+			ploop_preq_drop(plo, &drop_list);
+			continue;
+		}
+
+		if (!list_empty(&plo->ready_queue)) {
+			struct ploop_request * preq;
+			preq = ploop_get_request(plo, &plo->ready_queue);
+			if (preq->error == -ENOSPC)
+				ploop_handle_enospc_req(preq);
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+
+			continue;
+		}
+
+		/* Now ready_queue is empty */
+
+		if (plo->active_reqs == 0)
+			clear_bit(PLOOP_S_ATTENTION, &plo->state);
+
+		if (!list_empty(&plo->entry_queue) &&
+		    !test_bit(PLOOP_S_ATTENTION, &plo->state)) {
+			struct ploop_request * preq;
+
+			preq = ploop_get_request(plo, &plo->entry_queue);
+
+			if (test_bit(PLOOP_REQ_BARRIER, &preq->state)) {
+				set_bit(PLOOP_S_ATTENTION, &plo->state);
+				if (plo->active_reqs) {
+					list_add(&preq->list, &plo->entry_queue);
+					spin_unlock_irq(&plo->lock);
+					continue;
+				}
+				plo->barrier_reqs--;
+			} else {
+				if (!plo->read_sync_reqs &&
+				    plo->active_reqs > plo->tune.max_active_requests &&
+				    plo->active_reqs > plo->entry_qlen &&
+				    time_before(jiffies, preq->tstamp + plo->tune.batch_entry_delay) &&
+				    !kthread_should_stop()) {
+					list_add(&preq->list, &plo->entry_queue);
+					once = 1;
+					mod_timer(&plo->mitigation_timer, preq->tstamp + plo->tune.batch_entry_delay);
+					goto wait_more;
+				}
+			}
+
+			plo->active_reqs++;
+			ploop_entry_qlen_dec(preq);
+
+			if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+				BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+				atomic_inc(&plo->maintenance_cnt);
+			}
+
+			if (test_bit(PLOOP_REQ_SORTED, &preq->state)) {
+				rb_erase(&preq->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+				__clear_bit(PLOOP_REQ_SORTED, &preq->state);
+			}
+			preq->eng_state = PLOOP_E_ENTRY;
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+			continue;
+		}
+
+		/* Termination condition: stop requested,
+		 * no requests are in process or in entry queue
+		 */
+		if (kthread_should_stop() && !plo->active_reqs &&
+		    list_empty(&plo->entry_queue) && !plo->bio_head &&
+		    bio_list_empty(&plo->bio_discard_list) &&
+		    ploop_pb_bio_list_empty(plo->pbd))
+			break;
+
+wait_more:
+		ploop_wait(plo, once, &plug);
+		spin_unlock_irq(&plo->lock);
+		once = 0;
+	}
+
+	spin_unlock_irq(&plo->lock);
+	blk_finish_plug(&plug);
+
+	if (current->io_context)
+		exit_io_context(current);
+
+	return 0;
+}
+
+
+/* block device operations */
+static int ploop_open(struct block_device *bdev, fmode_t fmode)
+{
+	struct ploop_device * plo = bdev->bd_disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	BUG_ON (plo->bdev && plo->bdev != bdev);
+	if (!plo->bdev)
+		plo->bdev = bdev;
+
+	atomic_inc(&plo->open_count);
+	mutex_unlock(&plo->ctl_mutex);
+
+	check_disk_change(bdev);
+
+	return 0;
+}
+
+static void ploop_release(struct gendisk *disk, fmode_t fmode)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+	if (atomic_dec_and_test(&plo->open_count)) {
+		ploop_pb_destroy(plo, NULL);
+		ploop_tracker_stop(plo, 1);
+		plo->bdev = NULL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+}
+
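+/* Allocate and initialise a ploop_delta of the requested format.
+ * level < 0 means "stack a new delta on top of the current top delta"
+ * (or create the base delta if none exists yet); level >= 0 must match
+ * an existing delta and is used when replacing a delta in place.
+ */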
+static struct ploop_delta *
+init_delta(struct ploop_device * plo, struct ploop_ctl * ctl, int level)
+{
+	struct ploop_delta * delta;
+	struct ploop_delta_ops * ops;
+	int err;
+
+	ops = ploop_format_get(ctl->pctl_format);
+	if (ops == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (level < 0 && !list_empty(&plo->map.delta_list)) {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+		err = -EINVAL;
+		if (top_delta->level >= 127)
+			goto out_err;
+		level = top_delta->level + 1;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (!(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	} else if (level >= 0) {
+		struct ploop_delta * delta = find_delta(plo, level);
+		err = -EINVAL;
+		if (delta == NULL)
+			goto out_err;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (level && !(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	}
+
+	if (level < 0)
+		level = 0;
+
+	err = -ENOMEM;
+	delta = kzalloc(sizeof(struct ploop_delta), GFP_KERNEL);
+	if (delta == NULL)
+		goto out_err;
+
+	__module_get(THIS_MODULE);
+
+	delta->level = level;
+	delta->cluster_log = ctl->pctl_cluster_log;
+	delta->plo = plo;
+	delta->ops = ops;
+	delta->flags = ctl->pctl_flags & PLOOP_FMT_FLAGS;
+	delta->max_delta_size = ULLONG_MAX;
+
+	KOBJECT_INIT(&delta->kobj, &ploop_delta_ktype);
+	return delta;
+
+out_err:
+	ploop_format_put(ops);
+	return ERR_PTR(err);
+}
+
+
+static int ploop_set_max_delta_size(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta * top_delta = ploop_top_delta(plo);
+	u64 max_delta_size;
+
+	if (copy_from_user(&max_delta_size, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (top_delta == NULL)
+		return -EINVAL;
+
+	top_delta->max_delta_size = max_delta_size;
+
+	return 0;
+}
+
+static int ploop_add_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_COOKIE) && !plo->cookie[0] &&
+	    copy_from_user(plo->cookie, (void*)arg + sizeof(struct ploop_ctl) +
+			   sizeof(struct ploop_ctl_chunk),
+			   PLOOP_COOKIE_SIZE - 1))
+		return -EFAULT;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list))
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list)) {
+		plo->cluster_log = delta->cluster_log;
+	} else {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+
+		err = -EINVAL;
+		if (!(top_delta->flags & PLOOP_FMT_RDONLY))
+			goto out_close;
+	}
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	list_add(&delta->list, &plo->map.delta_list);
+	mutex_unlock(&plo->sysfs_mutex);
+	set_bit(PLOOP_S_CHANGED, &plo->state);
+
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+static int ploop_replace_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * old_delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	old_delta = find_delta(plo, ctl.pctl_level);
+	if (old_delta == NULL)
+		return -ENOENT;
+
+	if ((old_delta->flags ^ ctl.pctl_flags) & PLOOP_FMT_RDONLY)
+		return -EINVAL;
+
+	delta = init_delta(plo, &ctl, ctl.pctl_level);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	kobject_del(&old_delta->kobj);
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	kobject_put(&plo->kobj);
+
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	ploop_quiesce(plo);
+	ploop_map_destroy(&plo->map);
+	list_replace_init(&old_delta->list, &delta->list);
+	ploop_relax(plo);
+
+	old_delta->ops->stop(old_delta);
+	old_delta->ops->destroy(old_delta);
+	kobject_put(&old_delta->kobj);
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+
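+/* Quiesce the device: inject a sync barrier request into the entry
+ * queue and wait until the ploop thread has drained all other requests
+ * and signals quiesce_comp.  Must be paired with ploop_relax().
+ */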
+void ploop_quiesce(struct ploop_device * plo)
+{
+	struct completion qcomp;
+	struct ploop_request * preq;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	preq = ploop_alloc_request(plo);
+	preq->bl.head = preq->bl.tail = NULL;
+	preq->req_size = 0;
+	preq->req_rw = 0;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_BARRIER);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+
+	init_completion(&qcomp);
+	init_completion(&plo->relax_comp);
+	init_completion(&plo->relaxed_comp);
+	plo->quiesce_comp = &qcomp;
+
+	ploop_entry_add(plo, preq);
+	plo->barrier_reqs++;
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+
+	wait_for_completion(&qcomp);
+	plo->quiesce_comp = NULL;
+}
+
+void ploop_relax(struct ploop_device * plo)
+{
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	complete(&plo->relax_comp);
+	wait_for_completion(&plo->relaxed_comp);
+}
+
+/* search disk for first partition bdev with mounted fs and freeze it */
+static struct super_block *find_and_freeze_bdev(struct ploop_device *plo,
+						struct block_device ** bdev_pp)
+{
+	struct super_block  * sb   = NULL;
+	struct block_device * bdev = NULL;
+	struct gendisk *disk = plo->disk;
+	int i;
+
+	bdev = ploop_get_dm_crypt_bdev(plo);
+	if (bdev) {
+		sb = freeze_bdev(bdev);
+		goto out;
+	}
+
+	for (i = 0; i <= (*bdev_pp)->bd_part_count; i++) {
+		bdev = bdget_disk(disk, i);
+		if (!bdev)
+			break;
+
+		sb = freeze_bdev(bdev);
+		if (sb)
+			break;
+
+		thaw_bdev(bdev, sb);
+		bdput(bdev);
+		bdev = NULL;
+	}
+
+out:
+	if (IS_ERR(sb))
+		bdput(bdev);
+	else
+		*bdev_pp = bdev;
+	return sb;
+}
+
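+/* Online snapshot: create and open a new delta, then switch it in as
+ * the top delta while the device is quiesced.  With PLOOP_FLAG_FS_SYNC
+ * the mounted filesystem is frozen around the switch so the snapshot
+ * is filesystem-consistent.
+ */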
+static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
+			  struct block_device * bdev)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * top_delta;
+	struct ploop_snapdata snapdata;
+	struct super_block * sb;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return ploop_add_delta(plo, arg);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err)
+		goto out_close;
+
+	top_delta = ploop_top_delta(plo);
+
+	err = top_delta->ops->prepare_snapshot(top_delta, &snapdata);
+	if (err)
+		goto out_close2;
+
+	/* _XXX_ only one mounted fs per ploop-device is supported */
+	sb = NULL;
+	if (ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) {
+		/* freeze_bdev() may trigger ploop_bd_full() */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		sb = find_and_freeze_bdev(plo, &bdev);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		if (IS_ERR(sb)) {
+			err = PTR_ERR(sb);
+			fput(snapdata.file);
+			goto out_close2;
+		}
+	}
+
+	ploop_quiesce(plo);
+	err = top_delta->ops->complete_snapshot(top_delta, &snapdata);
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_add(&delta->list, &plo->map.delta_list);
+		clear_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		mutex_unlock(&plo->sysfs_mutex);
+	}
+	ploop_relax(plo);
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) && bdev) {
+		/* Drop ctl_mutex in order to avoid reverse-order locking:
+		   thaw_bdev() -> kill_sb() -> blkdev_put() -> bd_mutex */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		thaw_bdev(bdev, sb);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		bdput(bdev);
+	}
+
+	if (err)
+		goto out_close2;
+
+	return 0;
+
+out_close2:
+	kobject_del(&delta->kobj);
+out_close:
+	kobject_put(&plo->kobj);
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+static void renumber_deltas(struct ploop_device * plo)
+{
+	struct ploop_delta * delta;
+	int level = 0;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		delta->level = level++;
+	}
+
+	if (level == 1) {
+		delta = ploop_top_delta(plo);
+		if (delta->level == 0 &&
+		    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+			set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+	}
+}
+
+static void rename_deltas(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		int err;
+
+		if (delta->level < level)
+			continue;
+#if 0
+		/* Oops, kobject_rename() is not exported! */
+		sprintf(nname, "%d", delta->level);
+		err = kobject_rename(&delta->kobj, nname);
+#else
+		kobject_del(&delta->kobj);
+		err = KOBJECT_ADD(&delta->kobj, &plo->kobj,
+				  "%d", delta->level);
+#endif
+		if (err)
+			printk("rename_deltas: %d %d %d\n", err, level, delta->level);
+	}
+}
+
+/* Delete a delta. Obviously, removing an arbitrary delta will destroy
+ * all the data unless the delta is empty, its data are completely
+ * covered by a higher delta, or a lower delta contains a full copy of
+ * the delta being deleted. The driver does not check this.
+ *
+ * Some cases, e.g. removing a writable top delta, are never valid,
+ * because the caller has no way to ensure that new data do not emerge.
+ * Nevertheless, we do _NOT_ prohibit this operation, assuming that
+ * the caller has some knowledge which we cannot verify.  E.g. the
+ * virtual machine using the device was stopped, the device was synced
+ * and the data were copied to a lower delta.  Even so, this is a bad
+ * idea; it should be a separate ioctl.
+ */
+
+static int ploop_del_delta(struct ploop_device * plo, unsigned long arg)
+{
+	__u32 level;
+	struct ploop_delta * delta, * next;
+
+	if (copy_from_user(&level, (void*)arg, 4))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (level == 0 && test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		printk(KERN_INFO "Can't del base delta on running ploop%d\n",
+		       plo->index);
+		return -EBUSY;
+	}
+
+	delta = find_delta(plo, level);
+
+	if (delta == NULL)
+		return -ENOENT;
+
+	kobject_del(&delta->kobj);
+	kobject_put(&plo->kobj);
+
+	ploop_quiesce(plo);
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+	list_del(&delta->list);
+	if (list_empty(&plo->map.delta_list))
+		plo->cookie[0] = 0;
+	if (level != 0)
+		next->ops->refresh(next);
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_remove_delta(&plo->map, level);
+	renumber_deltas(plo);
+	ploop_relax(plo);
+	rename_deltas(plo, level);
+
+	delta->ops->stop(delta);
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state) &&
+	       list_empty(&plo->map.delta_list));
+	return 0;
+}
+
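+/* Feed the entry queue with sync PLOOP_REQ_MERGE requests (at most
+ * fsync_max, capped at half of max_requests); maintenance_cnt counts
+ * them and maintenance_comp is completed when the last one finishes.
+ */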
+static void ploop_merge_process(struct ploop_device * plo)
+{
+	int num_reqs;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->merge_ptr = 0;
+
+	init_completion(&plo->maintenance_comp);
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	for (; num_reqs; num_reqs--) {
+		struct ploop_request * preq;
+
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U;
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_MERGE);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
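+/* Wait (interruptibly, with ctl_mutex dropped) for the current
+ * maintenance operation to finish.  The wait error is propagated only
+ * if maintenance requests are still outstanding.
+ */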
+int ploop_maintenance_wait(struct ploop_device * plo)
+{
+	int err;
+
+	mutex_unlock(&plo->ctl_mutex);
+
+	err = wait_for_completion_interruptible(&plo->maintenance_comp);
+
+	mutex_lock(&plo->ctl_mutex);
+
+	return atomic_read(&plo->maintenance_cnt) ? err : 0;
+}
+
+static void ploop_update_fmt_version(struct ploop_device * plo)
+{
+	struct ploop_delta * delta = ploop_top_delta(plo);
+
+	if (delta->level == 0 &&
+	    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL)) {
+		ploop_map_destroy(&plo->map);
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+	}
+}
+
+static void ploop_merge_cleanup(struct ploop_device * plo,
+				struct ploop_map * map,
+				struct ploop_delta * delta, int err)
+{
+	ploop_quiesce(plo);
+	mutex_lock(&plo->sysfs_mutex);
+	list_del(&delta->list);
+
+	if (err)
+		list_add(&delta->list, &plo->map.delta_list);
+	else
+		ploop_update_fmt_version(plo);
+
+	plo->trans_map = NULL;
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	mutex_unlock(&plo->sysfs_mutex);
+	ploop_map_destroy(map);
+	ploop_relax(plo);
+}
+
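+/* PLOOP_IOC_MERGE: merge the top delta into the delta below it.  The
+ * old top delta is parked in a transient map (plo->trans_map) while
+ * MERGE requests copy its data down; on success it is stopped and
+ * destroyed.
+ */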
+static int ploop_merge(struct ploop_device * plo)
+{
+	int err;
+	struct ploop_map * map;
+	struct ploop_delta * delta, * next;
+	struct ploop_snapdata sd;
+
+	if (plo->maintenance_type == PLOOP_MNTN_MERGE)
+		goto already;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	BUG_ON (plo->trans_map);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level == 0)
+		return -ENOENT;
+
+	map = kzalloc(sizeof(struct ploop_map), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	map_init(plo, map);
+	ploop_map_start(map, plo->bd_size);
+
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+
+	err = next->ops->prepare_merge(next, &sd);
+	if (err) {
+		printk(KERN_WARNING "prepare_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		goto out;
+	}
+
+	ploop_quiesce(plo);
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_destroy(&plo->map);
+
+	err = next->ops->start_merge(next, &sd);
+
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		list_add(&delta->list, &map->delta_list);
+		delta->level = 0;
+		plo->trans_map = map;
+		plo->maintenance_type = PLOOP_MNTN_MERGE;
+		mutex_unlock(&plo->sysfs_mutex);
+	} else {
+		/* Yes. All transient obstacles must be resolved
+		 * in prepare_merge.  A failed start_merge means
+		 * the device is aborted.
+		 */
+		printk(KERN_WARNING "start_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	}
+
+	ploop_relax(plo);
+
+	if (err)
+		goto out;
+
+	ploop_merge_process(plo);
+
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_MERGE)
+		return -EALREADY;
+
+	map = plo->trans_map;
+	BUG_ON (!map);
+
+	delta = map_top_delta(plo->trans_map);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		printk(KERN_WARNING "merge for ploop%d failed (state ABORT)\n",
+		       plo->index);
+		err = -EIO;
+	}
+
+	ploop_merge_cleanup(plo, map, delta, err);
+
+	if (!err) {
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->stop(delta);
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+out:
+	kfree(map);
+	return err;
+}
+
+static int ploop_truncate(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_truncate_ctl ctl;
+	struct ploop_delta * delta;
+	struct file * file;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_truncate_ctl)))
+		return -EFAULT;
+
+	if (ctl.fd < 0)
+		return -EBADF;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = find_delta(plo, ctl.level);
+	if (delta == NULL)
+		return -ENOENT;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY))
+		return -EBUSY;
+
+	if (delta->ops->truncate == NULL)
+		return -EOPNOTSUPP;
+
+	file = fget(ctl.fd);
+	if (file == NULL)
+		return -EBADF;
+
+	ploop_quiesce(plo);
+
+	ploop_map_destroy(&plo->map);
+
+	err = delta->ops->truncate(delta, file, ctl.alloc_head);
+	if (!err)
+		delta->io.prealloced_size = 0;
+
+	ploop_relax(plo);
+
+	fput(file);
+
+	return err;
+}
+
+#define FUSE_SUPER_MAGIC 0x65735546
+#define IS_PSTORAGE(sb) (sb->s_magic == FUSE_SUPER_MAGIC && \
+			 (!strcmp(sb->s_subtype, "pstorage") || \
+			  !strcmp(sb->s_subtype, "vstorage")))
+
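+/* bd_full_fn callback: report whether writing 'nr' more bytes would
+ * push free space on the host filesystem backing the top delta below
+ * the configured root/user threshold, in which case the container
+ * will see -ENOSPC.
+ */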
+static int ploop_bd_full(struct backing_dev_info *bdi, long long nr, int root)
+{
+	struct ploop_device *plo      = bdi->congested_data;
+	u64		     reserved = 0;
+	int		     rc	      = 0;
+
+	if (root) {
+		if (!plo->tune.disable_root_threshold)
+			reserved = (u64)root_threshold * 1024;
+	} else {
+		if (!plo->tune.disable_user_threshold)
+			reserved = (u64)user_threshold * 1024;
+	}
+
+	if (reserved) {
+		struct kstatfs buf;
+		int	       ret;
+
+		struct ploop_delta *top_delta;
+		struct file	   *file;
+		struct super_block *sb;
+		void		   *jctx = current->journal_info;
+
+		mutex_lock(&plo->sysfs_mutex);
+		top_delta = ploop_top_delta(plo);
+		file	  = top_delta->io.files.file;
+		sb	  = F_DENTRY(file)->d_inode->i_sb;
+
+		/* bd_full can be unsupported or not needed */
+		if (IS_PSTORAGE(sb) || sb->s_op->statfs == simple_statfs ||
+		    top_delta->flags & PLOOP_FMT_PREALLOCATED) {
+			mutex_unlock(&plo->sysfs_mutex);
+			return 0;
+		}
+
+		get_file(file);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		current->journal_info = NULL;
+		ret = sb->s_op->statfs(F_DENTRY(file), &buf);
+		if (ret || buf.f_bfree * buf.f_bsize < reserved + nr) {
+			static unsigned long full_warn_time;
+
+			if (printk_timed_ratelimit(&full_warn_time, 60*60*HZ))
+				printk(KERN_WARNING
+				       "ploop%d: host disk is almost full "
+				       "(%llu < %llu); CT sees -ENOSPC !\n",
+				       plo->index, buf.f_bfree * buf.f_bsize,
+				       reserved + nr);
+
+			rc = 1;
+		}
+
+		fput(file);
+		current->journal_info = jctx;
+	}
+
+	return rc;
+}
+
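+/* PLOOP_IOC_START: preallocate the request pool, start all deltas,
+ * wire up the request queue callbacks and launch the per-device
+ * ploop thread.
+ */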
+static int ploop_start(struct ploop_device * plo, struct block_device *bdev)
+{
+	int err;
+	struct ploop_delta * top_delta, * delta;
+	int i;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			break;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &plo->free_list);
+		plo->free_qlen++;
+		plo->free_qmax++;
+	}
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		err = delta->ops->start(delta);
+		if (err)
+			return err;
+	}
+
+	ploop_map_start(&plo->map, plo->bd_size);
+
+	top_delta = ploop_top_delta(plo);
+
+	if (top_delta->level == 0 &&
+	    (top_delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+
+	/* Deltas are ready. Enable block device. */
+	set_device_ro(bdev, (top_delta->flags & PLOOP_FMT_RDONLY) != 0);
+
+	blk_queue_make_request(plo->queue, ploop_make_request);
+	plo->queue->queuedata = plo;
+	plo->queue->backing_dev_info.congested_fn = ploop_congested;
+	plo->queue->backing_dev_info.congested_fn2 = ploop_congested2;
+	plo->queue->backing_dev_info.bd_full_fn = ploop_bd_full;
+	plo->queue->backing_dev_info.congested_data = plo;
+
+	blk_queue_merge_bvec(plo->queue, ploop_merge_bvec);
+	blk_queue_flush(plo->queue, REQ_FLUSH);
+
+	if (top_delta->io.ops->queue_settings)
+		top_delta->io.ops->queue_settings(&top_delta->io, plo->queue);
+
+	blk_queue_max_discard_sectors(plo->queue, INT_MAX);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, plo->queue);
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+	set_blocksize(bdev, PAGE_SIZE);
+
+	plo->thread = kthread_create(ploop_thread, plo, "ploop%d",
+				     plo->index);
+	if (IS_ERR(plo->thread)) {
+		err = PTR_ERR(plo->thread);
+		goto out_err;
+	}
+
+	wake_up_process(plo->thread);
+	set_bit(PLOOP_S_RUNNING, &plo->state);
+	BUG_ON(list_empty(&plo->map.delta_list));
+	return 0;
+
+out_err:
+	plo->thread = NULL;
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+	return err;
+}
+
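+/* PLOOP_IOC_STOP: refuse if the device is still open, held or under
+ * maintenance; otherwise stop the ploop thread (which drains the
+ * queues), stop all deltas and free the preallocated requests.
+ */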
+static int ploop_stop(struct ploop_device * plo, struct block_device *bdev)
+{
+	int p;
+	struct ploop_delta * delta;
+	int cnt;
+
+	if (bdev != bdev->bd_contains) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (wrong bdev)\n",
+			       plo->index);
+		return -ENODEV;
+	}
+
+	if (bdev->bd_contains->bd_holders) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (holders=%d)\n",
+			       plo->index, bdev->bd_contains->bd_holders);
+		return -EBUSY;
+	}
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (list_empty(&plo->map.delta_list)) {
+		printk(KERN_INFO "stop ploop%d failed (no deltas)\n",
+		       plo->index);
+		return -ENOENT;
+	}
+
+	cnt = atomic_read(&plo->open_count);
+	if (cnt > 1) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (cnt=%d)\n",
+			       plo->index, cnt);
+		return -EBUSY;
+	}
+
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	if (plo->freeze_state != PLOOP_F_NORMAL) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (freeze_state=%d)\n",
+			       plo->index, plo->freeze_state);
+		return -EBUSY;
+	}
+
+	clear_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+	ploop_pb_stop(plo->pbd, true);
+
+	for (p = plo->disk->minors - 1; p > 0; p--)
+		invalidate_partition(plo->disk, p);
+	invalidate_partition(plo->disk, 0);
+
+	clear_bit(PLOOP_S_RUNNING, &plo->state);
+
+	del_timer_sync(&plo->mitigation_timer);
+	del_timer_sync(&plo->freeze_timer);
+
+	/* This will wait for queue drain */
+	kthread_stop(plo->thread);
+	plo->thread = NULL;
+
+	/* queue drained, no more ENOSPC */
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->event_waitq))
+		wake_up_interruptible(&plo->event_waitq);
+	spin_unlock_irq(&plo->lock);
+
+	BUG_ON(plo->entry_qlen);
+	BUG_ON(plo->active_reqs);
+	BUG_ON(plo->barrier_reqs);
+	BUG_ON(plo->fastpath_reqs);
+	BUG_ON(plo->read_sync_reqs);
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		delta->ops->stop(delta);
+	}
+
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+
+	if (plo->cached_bio) {
+		bio_put(plo->cached_bio);
+		plo->cached_bio = NULL;
+	}
+
+	while (!list_empty(&plo->free_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&plo->free_list, struct ploop_request, list);
+		list_del(&preq->list);
+		plo->free_qlen--;
+		plo->free_qmax--;
+		kfree(preq);
+	}
+	BUG_ON(plo->free_qlen);
+
+	ploop_map_destroy(&plo->map);
+	if (plo->trans_map)
+		ploop_map_destroy(plo->trans_map);
+
+	return 0;
+}
+
+static int ploop_sync(struct ploop_device * plo, struct block_device *bdev)
+{
+	struct ploop_delta * delta;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+
+	if (delta->ops->sync == NULL)
+		return 0;
+
+	return delta->ops->sync(delta);
+}
+
+static void destroy_deltas(struct ploop_device * plo, struct ploop_map * map)
+{
+	while (!list_empty(&map->delta_list)) {
+		struct ploop_delta * delta;
+		delta = list_entry(map->delta_list.next, struct ploop_delta, list);
+
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+
+	plo->cookie[0] = 0;
+}
+
+static int ploop_clear(struct ploop_device * plo, struct block_device * bdev)
+{
+	int cnt;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (RUNNING)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	if (plo->maintenance_type == PLOOP_MNTN_TRACK) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (TRACK)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+	clear_bit(PLOOP_S_NULLIFY, &plo->state);
+
+	destroy_deltas(plo, &plo->map);
+
+	if (plo->trans_map) {
+		struct ploop_map * map;
+		destroy_deltas(plo, plo->trans_map);
+		map = plo->trans_map;
+		plo->trans_map = NULL;
+		kfree(map);
+	}
+
+	ploop_fb_fini(plo->fbd, 0);
+	ploop_pb_fini(plo->pbd);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	plo->bd_size = 0;
+	plo->state = (1 << PLOOP_S_CHANGED);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state));
+	return 0;
+}
+
+static int ploop_index_update_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_index_update_ctl ctl;
+	struct reloc_map *map;
+	int i;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg,
+			   sizeof(struct ploop_index_update_ctl)))
+		return -EFAULT;
+
+	if (!ctl.n_maps)
+		return 0;
+
+	map = kzalloc(sizeof(*map) * ctl.n_maps, GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	if (copy_from_user(map, (u8*)arg + sizeof(ctl),
+			   sizeof(*map) * ctl.n_maps)) {
+		kfree(map);
+		return -EFAULT;
+	}
+
+	ploop_quiesce(plo);
+
+	for (i = 0; i < ctl.n_maps; i++)
+		ploop_update_map(&plo->map, ctl.level,
+				 map[i].req_cluster, map[i].iblk);
+
+	ploop_relax(plo);
+
+	kfree(map);
+	return 0;
+}
+
+enum {
+	PLOOP_GROW_RELOC = 0,
+	PLOOP_GROW_NULLIFY,
+	PLOOP_GROW_MAX,
+};
+
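+/* Kick off one stage of the grow sequence by queueing a single sync
+ * RELOC_A (relocate) or RELOC_N (nullify) request; completion is
+ * tracked via maintenance_cnt/maintenance_comp.
+ */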
+static void ploop_relocate(struct ploop_device * plo, int grow_stage)
+{
+	struct ploop_request * preq;
+	int reloc_type = (grow_stage == PLOOP_GROW_RELOC) ?
+		PLOOP_REQ_RELOC_A : PLOOP_REQ_RELOC_N;
+
+	BUG_ON(grow_stage != PLOOP_GROW_RELOC &&
+	       grow_stage != PLOOP_GROW_NULLIFY);
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->grow_relocated = 0;
+
+	if (grow_stage == PLOOP_GROW_NULLIFY)
+		set_bit(PLOOP_S_NULLIFY, &plo->state);
+
+	init_completion(&plo->maintenance_comp);
+
+	preq = ploop_alloc_request(plo);
+
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = 0;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << reloc_type);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = (reloc_type == PLOOP_REQ_RELOC_A) ? 0 : plo->grow_start;
+	preq->prealloc_size = 0;
+
+	atomic_inc(&plo->maintenance_cnt);
+
+	ploop_entry_add(plo, preq);
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int ploop_grow(struct ploop_device *plo, struct block_device *bdev,
+		      unsigned long arg)
+{
+	u64 new_size;
+	struct ploop_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	int reloc = 0; /* 'relocation needed' flag */
+	int err;
+	int grow_stage = PLOOP_GROW_RELOC;
+
+	if (!delta)
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_GROW) {
+		if (test_bit(PLOOP_S_NULLIFY, &plo->state))
+			grow_stage = PLOOP_GROW_NULLIFY;
+		goto already;
+	}
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+
+	if (ctl.pctl_cluster_log != plo->cluster_log)
+		return -EINVAL;
+
+	if (ctl.pctl_flags & PLOOP_FLAG_CLUBLKS)
+		new_size = (u64)ctl.pctl_size << plo->cluster_log;
+	else
+		new_size = ctl.pctl_size;
+
+	if (plo->bd_size > new_size) /* online shrink not supported */
+		return -EINVAL;
+
+	if (plo->bd_size == new_size) /* nothing to do */
+		return 0;
+
+	if (!delta->ops->prepare_grow)
+		return -EINVAL;
+
+	ploop_quiesce(plo);
+	err = delta->ops->prepare_grow(delta, &new_size, &reloc);
+	if (err)
+		goto grow_failed;
+
+	plo->grow_new_size = new_size;
+
+	/* prepare_grow() succeeded, but more actions needed */
+	if (reloc) {
+		plo->maintenance_type = PLOOP_MNTN_GROW;
+		ploop_relax(plo);
+		for (; grow_stage < PLOOP_GROW_MAX; grow_stage++) {
+			ploop_relocate(plo, grow_stage);
+already:
+			err = ploop_maintenance_wait(plo);
+			if (err)
+				return err;
+
+			BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+			if (plo->maintenance_type != PLOOP_MNTN_GROW)
+				return -EALREADY;
+
+			if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+				clear_bit(PLOOP_S_NULLIFY, &plo->state);
+				plo->maintenance_type = PLOOP_MNTN_OFF;
+				return -EIO;
+			}
+		}
+
+		ploop_quiesce(plo);
+		new_size = plo->grow_new_size;
+		clear_bit(PLOOP_S_NULLIFY, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+
+	/* Update bdev size and friends */
+	if (delta->ops->complete_grow) {
+		err = delta->ops->complete_grow(delta, new_size);
+		if (err)
+			goto grow_failed;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	plo->bd_size = new_size;
+	plo->map.max_index = (plo->bd_size + (1 << plo->cluster_log) - 1)
+			     >> plo->cluster_log;
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+
+	mutex_unlock(&plo->sysfs_mutex);
+grow_failed:
+	ploop_relax(plo);
+	return err;
+}
+
+static int ploop_balloon_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_balloon_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (ctl.inflate && ctl.keep_intact)
+		return -EINVAL;
+
+	switch (plo->maintenance_type) {
+	case PLOOP_MNTN_DISCARD:
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			break;
+
+		ploop_quiesce(plo);
+		clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		ploop_fb_lost_range_init(plo->fbd, delta->io.alloc_head);
+		ploop_relax(plo);
+		/* fall through */
+	case PLOOP_MNTN_FBLOADED:
+	case PLOOP_MNTN_RELOC:
+		BUG_ON (!plo->fbd);
+		ctl.alloc_head = ploop_fb_get_alloc_head(plo->fbd);
+		ctl.level      = ploop_fb_get_freezed_level(plo->fbd);
+		break;
+	case PLOOP_MNTN_OFF:
+		if (ctl.inflate) {
+			if (delta->ops->id != PLOOP_FMT_PLOOP1)
+				return -EOPNOTSUPP;
+
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_BALLOON;
+			ploop_relax(plo);
+		}
+		break;
+	case PLOOP_MNTN_BALLOON :
+		if (!ctl.inflate && !ctl.keep_intact) {
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_OFF;
+			ploop_relax(plo);
+		}
+	}
+	ctl.mntn_type = plo->maintenance_type;
+
+	return copy_to_user((void*)arg, &ctl, sizeof(ctl));
+}
+
+static int ploop_freeblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta;
+	struct ploop_freeblks_ctl ctl;
+	struct ploop_freeblks_ctl_extent __user *extents;
+	struct ploop_freeblks_desc *fbd;
+	int i;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_OFF)
+		return -EINVAL;
+	if (plo->maintenance_type != PLOOP_MNTN_BALLOON)
+		return -EBUSY;
+	BUG_ON (plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level != ctl.level) {
+		rc = -EINVAL;
+		goto exit;
+	}
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd) {
+		rc = -ENOMEM;
+		goto exit;
+	}
+
+	extents = (void __user *)(arg + sizeof(ctl));
+
+	for (i = 0; i < ctl.n_extents; i++) {
+		struct ploop_freeblks_ctl_extent extent;
+
+		if (copy_from_user(&extent, &extents[i],
+					sizeof(extent))) {
+			rc = -EFAULT;
+			ploop_fb_fini(fbd, rc);
+			goto exit;
+		}
+
+		rc = ploop_fb_add_free_extent(fbd, extent.clu,
+					extent.iblk, extent.len);
+		if (rc) {
+			if (rc == -EINVAL) {
+				printk("ploop_freeblks_ioc: n=%d\n", ctl.n_extents);
+				for (i = 0; i < ctl.n_extents; i++) {
+					if (copy_from_user(&extent, &extents[i],
+							   sizeof(extent))) {
+						printk("copy failed: i=%d\n", i);
+						break;
+					}
+					printk("ploop_freeblks_ioc: i=%d: %u %u %u\n",
+					       i, extent.clu, extent.iblk, extent.len);
+				}
+				WARN_ONCE(1, "add_free_extent failed\n");
+			}
+			ploop_fb_fini(fbd, rc);
+			goto exit;
+		}
+	}
+
+	ploop_quiesce(plo);
+
+	ctl.alloc_head = delta->io.alloc_head;
+	if (copy_to_user((void*)arg, &ctl, sizeof(ctl))) {
+		rc = -EFAULT;
+		ploop_fb_fini(fbd, rc);
+	} else {
+		iblock_t a_h = delta->io.alloc_head;
+		/* make fbd visible to ploop engine */
+		plo->fbd = fbd;
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		BUG_ON (a_h != ctl.alloc_head); /* quiesce sanity */
+		ploop_fb_lost_range_init(fbd, a_h);
+		ploop_fb_set_freezed_level(fbd, delta->level);
+	}
+
+	ploop_relax(plo);
+exit:
+	return rc;
+}
+
+static int ploop_fbget_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_freeblks_ctl ctl;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_copy_freeblks_to_user(plo->fbd, (void*)arg, &ctl);
+	ploop_relax(plo);
+
+	return rc;
+}
+
+static int ploop_fbfilter_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	int rc = 0;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD ||
+	    !test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+		return -EINVAL;
+
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_filter_freeblks(plo->fbd, arg);
+	ploop_relax(plo);
+
+	return rc;
+}
+
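+/* Queue a batch of sync PLOOP_REQ_RELOC_S requests (at most fsync_max,
+ * capped at half of max_requests) for the RELOCBLKS maintenance pass;
+ * completion is tracked via maintenance_cnt/maintenance_comp.
+ */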
+static void ploop_relocblks_process(struct ploop_device *plo)
+{
+	int num_reqs;
+	struct ploop_request *preq;
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+
+	init_completion(&plo->maintenance_comp);
+
+	for (; num_reqs; num_reqs--) {
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U; /* uninitialized */
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int release_fbd(struct ploop_device *plo, int err)
+{
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_quiesce(plo);
+	ploop_fb_fini(plo->fbd, err);
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	ploop_relax(plo);
+
+	return err;
+}
+
+static void ploop_discard_restart(struct ploop_device *plo, int err)
+{
+	if (!err && test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		ploop_fb_reinit(plo->fbd, 0);
+		atomic_set(&plo->maintenance_cnt, 0);
+		init_completion(&plo->maintenance_comp);
+		plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	} else {
+		clear_bit(PLOOP_S_DISCARD, &plo->state);
+		ploop_fb_fini(plo->fbd, err);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+}
+
+static int ploop_fbdrop_ioc(struct ploop_device *plo)
+{
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	ploop_discard_restart(plo, 0);
+	ploop_relax(plo);
+
+	return 0;
+}
+
+static int ploop_relocblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	struct ploop_relocblks_ctl ctl;
+	struct ploop_freeblks_desc *fbd = plo->fbd;
+	int i;
+	int err = 0;
+	int n_free;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!fbd || (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+		     plo->maintenance_type != PLOOP_MNTN_RELOC))
+		return -EINVAL;
+
+	BUG_ON(test_bit(PLOOP_S_DISCARD_LOADED, &plo->state));
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (delta->level != ctl.level ||
+	    ploop_fb_get_freezed_level(plo->fbd) != ctl.level ||
+	    ploop_fb_get_alloc_head(plo->fbd) != ctl.alloc_head) {
+		return -EINVAL;
+	}
+
+	if (plo->maintenance_type == PLOOP_MNTN_RELOC)
+		goto already;
+
+	if (ctl.n_extents) {
+		struct ploop_relocblks_ctl_extent __user *extents;
+
+		extents = (void __user *)(arg + sizeof(ctl));
+
+		for (i = 0; i < ctl.n_extents; i++) {
+			struct ploop_relocblks_ctl_extent extent;
+
+			if (copy_from_user(&extent, &extents[i],
+						sizeof(extent)))
+				return release_fbd(plo, -EFAULT);
+
+			/* this extent is also present in freemap */
+			err = ploop_fb_add_reloc_extent(fbd, extent.clu,
+					extent.iblk, extent.len, extent.free);
+			if (err)
+				return release_fbd(plo, err);
+		}
+	}
+
+	ploop_quiesce(plo);
+
+	/* alloc_head must never decrease */
+	BUG_ON (delta->io.alloc_head < ploop_fb_get_alloc_head(plo->fbd));
+	n_free = ploop_fb_get_n_free(plo->fbd);
+
+	/*
+	 * before relocation start, freeblks engine could provide only
+	 * free blocks
+	 */
+	BUG_ON (delta->io.alloc_head > ploop_fb_get_alloc_head(plo->fbd) &&
+		n_free);
+	ploop_fb_relocation_start(plo->fbd, ctl.n_scanned);
+
+	if (!n_free || !ctl.n_extents)
+		goto truncate;
+
+	plo->maintenance_type = PLOOP_MNTN_RELOC;
+
+	ploop_relax(plo);
+
+	ploop_relocblks_process(plo);
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -EALREADY;
+
+	fbd = plo->fbd;
+	BUG_ON (!fbd);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		clear_bit(PLOOP_S_DISCARD,&plo->state);
+
+		ploop_fb_fini(plo->fbd, -EIO);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		return -EIO;
+	}
+
+	if (ploop_fb_get_n_relocated(fbd) != ploop_fb_get_n_relocating(fbd))
+		return release_fbd(plo, -EIO);
+
+	/* time to truncate */
+	ploop_quiesce(plo);
+truncate:
+	if (ploop_fb_get_lost_range_len(plo->fbd) != 0) {
+		BUG_ON (delta->io.alloc_head >
+			ploop_fb_get_alloc_head(plo->fbd));
+		err = delta->ops->truncate(delta, NULL,
+					   ploop_fb_get_first_lost_iblk(plo->fbd));
+		if (!err) {
+			delta->io.prealloced_size = 0;
+			ctl.alloc_head = ploop_fb_get_lost_range_len(plo->fbd);
+			err = copy_to_user((void*)arg, &ctl, sizeof(ctl));
+		}
+	} else {
+		ctl.alloc_head = 0;
+		err = copy_to_user((void*)arg, &ctl, sizeof(ctl));
+	}
+
+	ploop_discard_restart(plo, err);
+
+	ploop_relax(plo);
+	return err;
+}
+
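+/* Report the minor number of the first free ploop device slot: either
+ * a gap in the device tree or a device with no deltas attached.
+ */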
+static int ploop_getdevice_ioc(unsigned long arg)
+{
+	int err;
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_getdevice_ctl ctl = {};
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n), index++) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+		if (plo->index != index || list_empty(&plo->map.delta_list))
+			break;
+	}
+	mutex_unlock(&ploop_devices_mutex);
+
+	ctl.minor = index << PLOOP_PART_SHIFT;
+	if (ctl.minor & ~MINORMASK)
+		return -ERANGE;
+	err = copy_to_user((void*)arg, &ctl, sizeof(ctl));
+	return err;
+}
+
+static int ploop_push_backup_init(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_push_backup_init_ctl ctl;
+	struct ploop_pushbackup_desc *pbd = NULL;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EINVAL;
+
+	BUG_ON(plo->pbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	pbd = ploop_pb_alloc(plo);
+	if (!pbd) {
+		rc = -ENOMEM;
+		goto pb_init_done;
+	}
+
+	ploop_quiesce(plo);
+
+	rc = ploop_pb_init(pbd, ctl.cbt_uuid, !ctl.cbt_mask_addr);
+	if (rc) {
+		ploop_relax(plo);
+		goto pb_init_done;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	plo->pbd = pbd;
+	mutex_unlock(&plo->sysfs_mutex);
+
+	atomic_set(&plo->maintenance_cnt, 0);
+	plo->maintenance_type = PLOOP_MNTN_PUSH_BACKUP;
+	set_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+
+	ploop_relax(plo);
+
+	if (ctl.cbt_mask_addr)
+		rc = ploop_pb_copy_cbt_to_user(pbd, (char *)ctl.cbt_mask_addr);
+pb_init_done:
+	if (rc)
+		ploop_pb_fini(pbd);
+	return rc;
+}
+
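+/* Common helper for PUSH_BACKUP_IO read/peek: fill up to n_extents
+ * (cluster, length) pairs from the supplied getter and copy both the
+ * updated control structure and the extent array back to userspace.
+ */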
+static int ploop_push_backup_io_get(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl,
+		int (*get)(struct ploop_pushbackup_desc *, cluster_t *,
+			   cluster_t *, unsigned))
+{
+	struct ploop_push_backup_ctl_extent *e;
+	unsigned n_extents = 0;
+	int rc = 0;
+	cluster_t clu = 0;
+	cluster_t len = 0;
+
+	e = kmalloc(sizeof(*e) * ctl->n_extents, GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	while (n_extents < ctl->n_extents) {
+		rc = get(plo->pbd, &clu, &len, n_extents);
+		if (rc == -ENOENT && n_extents)
+			break;
+		else if (rc)
+			goto io_get_done;
+
+		e[n_extents].clu = clu;
+		e[n_extents].len = len;
+		n_extents++;
+	}
+
+	rc = -EFAULT;
+	ctl->n_extents = n_extents;
+	if (copy_to_user((void*)arg, ctl, sizeof(*ctl)))
+		goto io_get_done;
+	if (n_extents &&
+	    copy_to_user((void*)(arg + sizeof(*ctl)), e,
+			 n_extents * sizeof(*e)))
+			goto io_get_done;
+	rc = 0;
+
+io_get_done:
+	kfree(e);
+	return rc;
+}
+
+static int ploop_push_backup_io_read(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl)
+{
+	return ploop_push_backup_io_get(plo, arg, ctl, ploop_pb_get_pending);
+}
+
+static int ploop_push_backup_io_peek(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl)
+{
+	int rc;
+
+	rc = ploop_push_backup_io_get(plo, arg, ctl, ploop_pb_peek);
+
+	if (rc == -ENOENT) {
+		ctl->n_extents = 0;
+		if (copy_to_user((void*)arg, ctl, sizeof(*ctl)))
+			rc = -EFAULT;
+		else
+			rc = 0;
+	}
+
+	return rc;
+}
+
+static int ploop_push_backup_io_write(struct ploop_device *plo, unsigned long arg,
+				      struct ploop_push_backup_io_ctl *ctl)
+{
+	struct ploop_push_backup_ctl_extent *e;
+	unsigned i;
+	int rc = 0;
+
+	e = kmalloc(sizeof(*e) * ctl->n_extents, GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	rc = -EFAULT;
+	if (copy_from_user(e, (void*)(arg + sizeof(*ctl)),
+			   ctl->n_extents * sizeof(*e)))
+		goto io_write_done;
+
+	rc = 0;
+	for (i = 0; i < ctl->n_extents; i++)
+		ploop_pb_put_reported(plo->pbd, e[i].clu, e[i].len);
+
+io_write_done:
+	kfree(e);
+	return rc;
+}
+
+static int ploop_push_backup_io(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_push_backup_io_ctl ctl;
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_PUSH_BACKUP)
+		return -EINVAL;
+
+	BUG_ON (!pbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (!ctl.n_extents)
+		return -EINVAL;
+
+	if (ploop_pb_check_uuid(pbd, ctl.cbt_uuid)) {
+		printk("ploop(%d): PUSH_BACKUP_IO uuid mismatch\n",
+		       plo->index);
+		return -EINVAL;
+	}
+
+	switch(ctl.direction) {
+	case PLOOP_READ:
+		return ploop_push_backup_io_read(plo, arg, &ctl);
+	case PLOOP_WRITE:
+		return ploop_push_backup_io_write(plo, arg, &ctl);
+	case PLOOP_PEEK:
+		return ploop_push_backup_io_peek(plo, arg, &ctl);
+	}
+
+	return -EINVAL;
+}
+
+static int ploop_push_backup_stop(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+	struct ploop_push_backup_stop_ctl ctl;
+	int ret;
+
+	if (plo->maintenance_type != PLOOP_MNTN_PUSH_BACKUP)
+		return -EINVAL;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (pbd && ploop_pb_check_uuid(pbd, ctl.cbt_uuid)) {
+		printk("ploop(%d): PUSH_BACKUP_STOP uuid mismatch\n",
+		       plo->index);
+		return -EINVAL;
+	}
+
+	ret = ploop_pb_destroy(plo, &ctl.status);
+	if (ret)
+		return ret;
+
+	return copy_to_user((void*)arg, &ctl, sizeof(ctl));
+}
+
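+/* PLOOP_IOC_FREEZE: freeze the filesystem mounted on the ploop (or its
+ * dm-crypt) block device via freeze_bdev(); it stays frozen until
+ * PLOOP_IOC_THAW calls ploop_thaw().
+ */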
+static int ploop_freeze(struct ploop_device *plo, struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (plo->freeze_state == PLOOP_F_FROZEN)
+		return 0;
+
+	if (plo->freeze_state == PLOOP_F_THAWING)
+		return -EBUSY;
+
+	if (plo->dm_crypt_bdev)
+		bdev = plo->dm_crypt_bdev;
+
+	bdgrab(bdev);
+	sb = freeze_bdev(bdev);
+	if (sb && IS_ERR(sb)) {
+		bdput(bdev);
+		return PTR_ERR(sb);
+	}
+
+	plo->frozen_bdev = bdev;
+	plo->freeze_state = PLOOP_F_FROZEN;
+	return 0;
+}
+
+static int ploop_thaw(struct ploop_device *plo)
+{
+	struct block_device *bdev = plo->frozen_bdev;
+	struct super_block *sb = bdev ? bdev->bd_super : NULL;
+	int err;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (plo->freeze_state == PLOOP_F_NORMAL)
+		return 0;
+
+	if (plo->freeze_state == PLOOP_F_THAWING)
+		return -EBUSY;
+
+	plo->frozen_bdev = NULL;
+	plo->freeze_state = PLOOP_F_THAWING;
+
+	mutex_unlock(&plo->ctl_mutex);
+	err = thaw_bdev(bdev, sb);
+	bdput(bdev);
+	mutex_lock(&plo->ctl_mutex);
+
+	BUG_ON(plo->freeze_state != PLOOP_F_THAWING);
+
+	if (!err)
+		plo->freeze_state = PLOOP_F_NORMAL;
+	else
+		plo->freeze_state = PLOOP_F_FROZEN;
+
+	return err;
+}
+
+static int ploop_ioctl(struct block_device *bdev, fmode_t fmode, unsigned int cmd,
+		       unsigned long arg)
+{
+	struct ploop_device *plo = bdev->bd_disk->private_data;
+	int err = -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	if (plo->maintenance_type == PLOOP_MNTN_SNAPSHOT) {
+		mutex_unlock(&plo->ctl_mutex);
+		return -EBUSY;
+	}
+
+	switch(cmd) {
+	case PLOOP_IOC_ADD_DELTA:
+		err = ploop_add_delta(plo, arg);
+		break;
+	case PLOOP_IOC_DEL_DELTA:
+		err = ploop_del_delta(plo, arg);
+		break;
+	case PLOOP_IOC_REPLACE_DELTA:
+		err = ploop_replace_delta(plo, arg);
+		break;
+	case PLOOP_IOC_SNAPSHOT:
+		err = ploop_snapshot(plo, arg, bdev);
+		break;
+	case PLOOP_IOC_CLEAR:
+		err = ploop_clear(plo, bdev);
+		break;
+	case PLOOP_IOC_STOP:
+		err = ploop_stop(plo, bdev);
+		break;
+	case PLOOP_IOC_START:
+		err = ploop_start(plo, bdev);
+		break;
+	case PLOOP_IOC_SYNC:
+		err = ploop_sync(plo, bdev);
+		break;
+
+	case PLOOP_IOC_TRACK_INIT:
+		err = ploop_tracker_init(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_SETPOS:
+		err = ploop_tracker_setpos(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_STOP:
+		err = ploop_tracker_stop(plo, 0);
+		break;
+	case PLOOP_IOC_TRACK_ABORT:
+		err = ploop_tracker_stop(plo, 1);
+		break;
+	case PLOOP_IOC_TRACK_READ:
+		err = ploop_tracker_read(plo, arg);
+		break;
+
+	case PLOOP_IOC_MERGE:
+		err = ploop_merge(plo);
+		break;
+	case PLOOP_IOC_TRUNCATE:
+		err = ploop_truncate(plo, arg);
+		break;
+	case PLOOP_IOC_UPDATE_INDEX:
+		err = ploop_index_update_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GROW:
+		err = ploop_grow(plo, bdev, arg);
+		break;
+	case PLOOP_IOC_BALLOON:
+		err = ploop_balloon_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FREEBLKS:
+		err = ploop_freeblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBGET:
+		err = ploop_fbget_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBFILTER:
+		err = ploop_fbfilter_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBDROP:
+		err = ploop_fbdrop_ioc(plo);
+		break;
+	case PLOOP_IOC_RELOCBLKS:
+		err = ploop_relocblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GETDEVICE:
+		err = ploop_getdevice_ioc(arg);
+		break;
+
+	case PLOOP_IOC_DISCARD_INIT:
+		err = ploop_discard_init_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_FINI:
+		err = ploop_discard_fini_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_WAIT:
+		err = ploop_discard_wait_ioc(plo);
+		break;
+	case PLOOP_IOC_MAX_DELTA_SIZE:
+		err = ploop_set_max_delta_size(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_INIT:
+		err = ploop_push_backup_init(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_IO:
+		err = ploop_push_backup_io(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_STOP:
+		err = ploop_push_backup_stop(plo, arg);
+		break;
+	case PLOOP_IOC_FREEZE:
+		err = ploop_freeze(plo, bdev);
+		break;
+	case PLOOP_IOC_THAW:
+		err = ploop_thaw(plo);
+		break;
+	default:
+		err = -EINVAL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+	return err;
+}
+
+static int ploop_media_changed(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	return test_bit(PLOOP_S_CHANGED, &plo->state);
+}
+
+static int ploop_revalidate(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	clear_bit(PLOOP_S_CHANGED, &plo->state);
+	return 0;
+}
+
+static struct block_device_operations ploop_dev_fops = {
+	.owner =		THIS_MODULE,
+	.open =			ploop_open,
+	.release =		ploop_release,
+	.ioctl =		ploop_ioctl,
+	.media_changed =	ploop_media_changed,
+	.revalidate_disk =	ploop_revalidate,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo loopback device driver");
+MODULE_ALIAS_BLOCKDEV_MAJOR(PLOOP_DEVICE_MAJOR);
+
+atomic_t plo_count = ATOMIC_INIT(0);
+
+static struct sysfs_ops ploop_sysfs_ops = { };
+
+static void ploop_obj_release(struct kobject *kobj)
+{
+	struct ploop_device *plo = container_of(kobj, struct ploop_device, kobj);
+	kfree(plo);
+	atomic_dec(&plo_count);
+}
+
+static struct kobj_type ploop_ktype = {
+	.sysfs_ops	= &ploop_sysfs_ops,
+	.release	= ploop_obj_release,
+};
+
+static struct ploop_device *__ploop_dev_alloc(int index)
+{
+	struct ploop_device *plo;
+	struct gendisk *dk;
+
+	plo = kzalloc(sizeof(*plo), GFP_KERNEL);
+	if (!plo)
+		goto out;
+
+	plo->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!plo->queue)
+		goto out_mem;
+
+	dk = plo->disk = alloc_disk(PLOOP_PART_MAX);
+	if (!plo->disk)
+		goto out_queue;
+
+	spin_lock_init(&plo->lock);
+	spin_lock_init(&plo->dummy_lock);
+	plo->queue->queue_lock = &plo->dummy_lock;
+	mutex_init(&plo->ctl_mutex);
+	mutex_init(&plo->sysfs_mutex);
+	plo->index = index;
+	plo->state = 0;
+	atomic_set(&plo->open_count, 0);
+	init_timer(&plo->mitigation_timer);
+	plo->mitigation_timer.function = mitigation_timeout;
+	plo->mitigation_timer.data = (unsigned long)plo;
+	init_timer(&plo->freeze_timer);
+	plo->freeze_timer.function = freeze_timeout;
+	plo->freeze_timer.data = (unsigned long)plo;
+	INIT_LIST_HEAD(&plo->entry_queue);
+	plo->entry_tree[0] = plo->entry_tree[1] = RB_ROOT;
+	plo->lockout_tree = RB_ROOT;
+	plo->lockout_pb_tree = RB_ROOT;
+	INIT_LIST_HEAD(&plo->ready_queue);
+	INIT_LIST_HEAD(&plo->free_list);
+	init_waitqueue_head(&plo->waitq);
+	init_waitqueue_head(&plo->req_waitq);
+	init_waitqueue_head(&plo->freeze_waitq);
+	init_waitqueue_head(&plo->event_waitq);
+	plo->tune = DEFAULT_PLOOP_TUNE;
+	map_init(plo, &plo->map);
+	track_init(plo);
+	KOBJECT_INIT(&plo->kobj, &ploop_ktype);
+	atomic_inc(&plo_count);
+	bio_list_init(&plo->bio_discard_list);
+
+	dk->major		= ploop_major;
+	dk->first_minor		= index << PLOOP_PART_SHIFT;
+	dk->minors		= PLOOP_PART_MAX;
+	dk->fops		= &ploop_dev_fops;
+	dk->private_data	= plo;
+	dk->queue		= plo->queue;
+	snprintf(dk->disk_name, sizeof(dk->disk_name), "ploop%d", index);
+	return plo;
+
+out_queue:
+	blk_cleanup_queue(plo->queue);
+out_mem:
+	kfree(plo);
+out:
+	return NULL;
+}
+
+static void ploop_dev_del(struct ploop_device *plo)
+{
+	ploop_tracker_destroy(plo, 1);
+	ploop_sysfs_uninit(plo);
+	del_gendisk(plo->disk);
+	blk_cleanup_queue(plo->queue);
+	put_disk(plo->disk);
+	rb_erase(&plo->link, &ploop_devices_tree);
+	ploop_fb_fini(plo->fbd, 0);
+	kobject_put(&plo->kobj);
+}
+
+static void ploop_dev_insert(struct ploop_device *plo)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_device * pl;
+
+	p = &ploop_devices_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pl = rb_entry(parent, struct ploop_device, link);
+		BUG_ON (plo->index == pl->index);
+
+		if (plo->index < pl->index)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&plo->link, parent, p);
+	rb_insert_color(&plo->link, &ploop_devices_tree);
+}
+
+static struct ploop_device *ploop_dev_search(int index)
+{
+	struct rb_node *n = ploop_devices_tree.rb_node;
+
+	while(n) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+
+		if (index < plo->index)
+			n = n->rb_left;
+		else if (index > plo->index)
+			n = n->rb_right;
+		else
+			return plo;
+	}
+
+	return NULL;
+}
+
+static struct ploop_device *ploop_dev_init(int index)
+{
+	struct ploop_device *plo = ploop_dev_search(index);
+
+	if (plo) {
+		BUG_ON(list_empty(&plo->map.delta_list) &&
+		       test_bit(PLOOP_S_NULLIFY, &plo->state));
+		return plo;
+	}
+
+	plo = __ploop_dev_alloc(index);
+	if (plo) {
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	return plo;
+}
+
+static struct kobject *ploop_dev_probe(dev_t dev, int *part, void *data)
+{
+	struct kobject *kobj;
+	struct ploop_device *plo;
+
+	*part = dev & (PLOOP_PART_MAX - 1);
+	mutex_lock(&ploop_devices_mutex);
+	plo = ploop_dev_init((dev & MINORMASK) >> PLOOP_PART_SHIFT);
+	if (!plo)
+		kobj = ERR_PTR(-ENOMEM);
+	else
+		kobj = get_disk(plo->disk);
+	mutex_unlock(&ploop_devices_mutex);
+
+	return kobj;
+}
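The minor-number layout used by ploop_dev_probe() above (partition bits in the low part of the minor, the ploop index above them) can be checked with a small stand-alone sketch. PLOOP_PART_SHIFT = 4 (so PLOOP_PART_MAX = 16) is an assumed value used only for this illustration; the driver's real constants may differ.

#include <stdio.h>

#define EX_PART_SHIFT 4                  /* assumed PLOOP_PART_SHIFT */
#define EX_PART_MAX   (1 << EX_PART_SHIFT)

int main(void)
{
	unsigned minor = (10003 << EX_PART_SHIFT) | 2;   /* partition 2 of ploop10003 */
	unsigned part  = minor & (EX_PART_MAX - 1);      /* low bits: partition */
	unsigned index = minor >> EX_PART_SHIFT;         /* high bits: device index */

	printf("minor %u -> ploop%u, partition %u\n", minor, index, part);
	return 0;
}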
+
+/* Functions to service /proc/vz/ploop_minor */
+
+static int ploop_minor_show(struct seq_file *m, void *v)
+{
+	struct ploop_device *plo = m->private;
+	seq_printf(m, "%d\n", plo->index << PLOOP_PART_SHIFT);
+	return 0;
+}
+
+/* Returns a random index in the range 10000..65535 */
+static unsigned ploop_random_index(void)
+{
+	unsigned int n;
+
+	get_random_bytes(&n, sizeof(n));
+
+	return 10000 + n % (65536 - 10000);
+}
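As a stand-alone illustration (not driver code), the same range arithmetic can be exercised in user space with rand() in place of get_random_bytes():

#include <stdio.h>
#include <stdlib.h>

/* Same arithmetic as ploop_random_index() above: the result is always in [10000, 65535]. */
static unsigned random_index(void)
{
	unsigned n = (unsigned)rand();

	return 10000 + n % (65536 - 10000);
}

int main(void)
{
	int i;

	for (i = 0; i < 5; i++)
		printf("%u\n", random_index());
	return 0;
}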
+
+static int ploop_minor_open(struct inode *inode, struct file *file)
+{
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_device *plo = NULL;
+	int found = 0;
+	int ret;
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n)) {
+		plo = rb_entry(n, struct ploop_device, link);
+		if (list_empty(&plo->map.delta_list) &&
+		    !test_bit(PLOOP_S_LOCKED, &plo->locking_state)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		int i = 0;
+
+		index = ploop_random_index();
+		plo = ploop_dev_search(index);
+
+		while (plo) {
+			for (n = &plo->link; n; n = rb_next(n), index++) {
+				plo = rb_entry(n, struct ploop_device, link);
+				if (plo->index != index ||
+				    (list_empty(&plo->map.delta_list) &&
+				     !test_bit(PLOOP_S_LOCKED, &plo->locking_state)))
+					break;
+			}
+
+			BUG_ON (plo->index == index);
+
+			/* not more than two iterations */
+			if (i++ == 2)
+				break;
+
+			if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+				index = 0;
+				plo = ploop_dev_search(index);
+			} else
+				plo = NULL;
+		}
+		
+		if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ERANGE;
+		}
+
+		plo = __ploop_dev_alloc(index);
+		if (!plo) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ENOMEM;
+		}
+
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	BUG_ON(test_bit(PLOOP_S_NULLIFY, &plo->state));
+	set_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	mutex_unlock(&ploop_devices_mutex);
+
+	ret = single_open(file, ploop_minor_show, plo);
+	if (ret)
+		clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return ret;
+}
+
+static int ploop_minor_release(struct inode *inode, struct file *filp)
+{
+	struct ploop_device *plo = ((struct seq_file *)filp->private_data)->private;
+	clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations proc_ploop_minor = {
+	.owner          = THIS_MODULE,
+	.open		= ploop_minor_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ploop_minor_release,
+};
+
+module_param(ploop_max, int, 0);
+MODULE_PARM_DESC(ploop_max, "Maximum number of ploop devices");
+module_param(ploop_major, int, 0);
+MODULE_PARM_DESC(ploop_major, "Major number of ploop device");
+module_param(max_map_pages, int, 0644);
+MODULE_PARM_DESC(max_map_pages, "Maximum number of pages used by the map cache");
+module_param(root_threshold, long, 0644);
+MODULE_PARM_DESC(root_threshold, "Disk space reserved for root (in kilobytes)");
+module_param(user_threshold, long, 0644);
+MODULE_PARM_DESC(user_threshold, "Disk space reserved for user (in kilobytes)");
+module_param(large_disk_support, int, 0444);
+MODULE_PARM_DESC(large_disk_support, "Support for large disks (>2TB)");
+
+static int __init ploop_mod_init(void)
+{
+	int err;
+
+	/* _XXX_ should be estimated from available RAM */
+	if (max_map_pages == 0)
+		max_map_pages = 1024;
+
+	err = ploop_map_init();
+	if (err)
+		goto out_err;
+
+	err = -EBUSY;
+	if (register_blkdev(ploop_major, "ploop"))
+		goto out_err;
+
+	blk_register_region(MKDEV(ploop_major, 0), ploop_max,
+			THIS_MODULE, ploop_dev_probe, NULL, NULL);
+
+	if (!proc_create("ploop_minor", 0440,
+			 proc_vz_dir, &proc_ploop_minor))
+		goto out_err2;
+
+	printk(KERN_INFO "ploop_dev: module loaded\n");
+	return 0;
+
+out_err2:
+	err = -ENOMEM;
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+out_err:
+	ploop_map_exit();
+	return err;
+}
+
+static void __exit ploop_mod_exit(void)
+{
+	struct rb_node * n;
+
+	remove_proc_entry("ploop_minor", proc_vz_dir);
+	while ((n = rb_first(&ploop_devices_tree)) != NULL)
+		ploop_dev_del(rb_entry(n, struct ploop_device, link));
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+	ploop_map_exit();
+	WARN_ON(atomic_read(&plo_count));
+}
+module_init(ploop_mod_init);
+module_exit(ploop_mod_exit);
--- /dev/null
+++ b/drivers/block/ploop/discard.c
@@ -0,0 +1,115 @@
+/*
+ *  drivers/block/ploop/discard.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/bio.h>
+
+#include <linux/ploop/ploop.h>
+#include "discard.h"
+#include "freeblks.h"
+
+int ploop_discard_init_ioc(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (delta == NULL)
+		return -EINVAL;
+
+	if (delta->ops->id != PLOOP_FMT_PLOOP1)
+		return -EOPNOTSUPP;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd)
+		return -ENOMEM;
+
+	ploop_quiesce(plo);
+
+	ploop_fb_set_freezed_level(fbd, delta->level);
+
+	plo->fbd = fbd;
+
+	atomic_set(&plo->maintenance_cnt, 0);
+	init_completion(&plo->maintenance_comp);
+	plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	set_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_relax(plo);
+
+	return 0;
+}
+
+int ploop_discard_fini_ioc(struct ploop_device *plo)
+{
+	int ret = 0;
+	struct ploop_request *preq, *tmp;
+	LIST_HEAD(drop_list);
+
+	if (!test_and_clear_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+
+	spin_lock_irq(&plo->lock);
+	list_for_each_entry_safe(preq, tmp, &plo->entry_queue, list)
+		if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+			list_move(&preq->list, &drop_list);
+			ploop_entry_qlen_dec(preq);
+		}
+	spin_unlock_irq(&plo->lock);
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list);
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ploop_fb_fini(plo->fbd, -EOPNOTSUPP);
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	complete(&plo->maintenance_comp);
+
+out:
+	ploop_relax(plo);
+
+	return ret;
+}
+
+int ploop_discard_wait_ioc(struct ploop_device *plo)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	if (plo->maintenance_type == PLOOP_MNTN_FBLOADED)
+		return 1;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EINVAL;
+
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		goto out;
+
+	/* maintenance_cnt is zero when there are no discard requests;
+	 * in this case ploop_maintenance_wait() returns 0
+	 * instead of -ERESTARTSYS */
+	if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state)) {
+		err = 1;
+	} else if (signal_pending(current))
+		err = -ERESTARTSYS;
+out:
+	return err;
+}
--- /dev/null
+++ b/drivers/block/ploop/discard.h
@@ -0,0 +1,15 @@
+/*
+ *  drivers/block/ploop/discard.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_DISCARD_H_
+#define _LINUX_PLOOP_DISCARD_H_
+
+extern int ploop_discard_init_ioc(struct ploop_device *plo);
+extern int ploop_discard_fini_ioc(struct ploop_device *plo);
+extern int ploop_discard_wait_ioc(struct ploop_device *plo);
+
+#endif /* _LINUX_PLOOP_DISCARD_H_ */
--- /dev/null
+++ b/drivers/block/ploop/events.h
@@ -0,0 +1,115 @@
+/*
+ *  drivers/block/ploop/events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENTS_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+
+#define PRINT_BI_RW(rw)	__print_flags(rw, "|",		\
+			{ REQ_WRITE,				"W"},	\
+			{ REQ_FAILFAST_DEV,		"FD"},	\
+			{ REQ_FAILFAST_TRANSPORT,	"FT"},	\
+			{ REQ_FAILFAST_DRIVER,		"FDRV"},\
+			{ REQ_RAHEAD,			"A"},	\
+			{ REQ_SYNC,			"S"},	\
+			{ REQ_META,			"M"},	\
+			{ REQ_DISCARD,			"D"},	\
+			{ REQ_NOIDLE,			"N"},	\
+			{ REQ_FLUSH,			"F"},	\
+			{ REQ_FUA,			"FUA"},	\
+			{ REQ_THROTTLED,		"T"})
+
+#define PRINT_PREQ_STATE(state)					\
+			__print_flags(state, "|",		\
+			{ 1 << PLOOP_REQ_LOCKOUT,	"L"},	\
+			{ 1 << PLOOP_REQ_PB_LOCKOUT,	"BL"},	\
+			{ 1 << PLOOP_REQ_SYNC,		"S"},	\
+			{ 1 << PLOOP_REQ_BARRIER,	"B"},	\
+			{ 1 << PLOOP_REQ_UNSTABLE,	"U"},	\
+			{ 1 << PLOOP_REQ_TRACK,		"TRACK"},\
+			{ 1 << PLOOP_REQ_SORTED,	"SORT"},\
+			{ 1 << PLOOP_REQ_TRANS,		"T"},	\
+			{ 1 << PLOOP_REQ_MERGE,		"M"},	\
+			{ 1 << PLOOP_REQ_RELOC_A,	"RA"},	\
+			{ 1 << PLOOP_REQ_RELOC_S,	"RS"},	\
+			{ 1 << PLOOP_REQ_RELOC_N,	"RN"},	\
+			{ 1 << PLOOP_REQ_ZERO,		"Z"},	\
+			{ 1 << PLOOP_REQ_DISCARD,	"D"})
+
+#define PREQ_FORMAT "preq=0x%p cluster=0x%x iblock=0x%x size=0x%x eng_state=0x%lx state=%s rw=%s"
+
+#define PREQ_ARGS	__entry->preq,				\
+			__entry->clu,				\
+			__entry->iblk,				\
+			__entry->size,				\
+			__entry->eng_state,			\
+			PRINT_PREQ_STATE(__entry->state),	\
+			PRINT_BI_RW(__entry->rw)
+
+DECLARE_EVENT_CLASS(preq_template,
+	TP_PROTO(struct ploop_request *preq),
+
+	TP_ARGS(preq),
+
+	TP_STRUCT__entry(
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk(PREQ_FORMAT, PREQ_ARGS)
+);
+
+DECLARE_EVENT_CLASS(bio_template,
+	TP_PROTO(struct bio *bio),
+
+	TP_ARGS(bio),
+
+	TP_STRUCT__entry(
+		__field(void *,		bio)
+		__field(sector_t,	sector)
+		__field(unsigned int,	size)
+		__field(unsigned long,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->bio		= bio;
+		__entry->sector		= bio->bi_sector;
+		__entry->size		= bio->bi_size;
+		__entry->rw		= bio->bi_rw;
+	),
+
+	TP_printk("bio=0x%p sector=0x%lx size=0x%x rw=%s",
+			__entry->bio,
+			__entry->sector,
+			__entry->size,
+			PRINT_BI_RW(__entry->rw)
+			)
+);
+
+#endif /* _TRACE_EVENTS_H */
--- /dev/null
+++ b/drivers/block/ploop/fmt_ploop1.c
@@ -0,0 +1,603 @@
+/*
+ *  drivers/block/ploop/fmt_ploop1.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop1_image.h"
+
+/* The implementation of the ploop1 (PVD) delta format, defined in ploop1_fmt.h
+ */
+
+#define INDEX_PER_PAGE	     (PAGE_SIZE  / 4)
+#define INDEX_PER_PAGE_SHIFT (PAGE_SHIFT - 2)
+
+struct ploop1_private
+{
+	struct page	*dyn_page;
+	u64		bd_size;
+	u32		alloc_head;
+	sector_t	l1_off;
+};
+
+int ploop1_map_index(struct ploop_delta * delta, unsigned long block, sector_t *sec)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if ((u64)block << delta->plo->cluster_log >= ph->bd_size)
+		return 0;
+
+	/*
+	 * ondisk_pageno == (block + off) >> INDEX_PER_PAGE_SHIFT
+	 * sec == ondisk_pageno << (PAGE_SHIFT - 9)
+	 * (8 sectors per page, and log(8) == PAGE_SHIFT - 9)
+	 */
+	*sec = ((block + PLOOP_MAP_OFFSET) >> INDEX_PER_PAGE_SHIFT) <<
+	       (PAGE_SHIFT - 9);
+	return 1;
+}
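A stand-alone sketch of the index-page arithmetic described in the comment above, done in user space. The PLOOP_MAP_OFFSET value here is an assumption made only for the example; the driver defines the real one.

#include <stdio.h>

#define EX_PAGE_SHIFT           12
#define EX_INDEX_PER_PAGE_SHIFT (EX_PAGE_SHIFT - 2)   /* 1024 u32 indices per 4K page */
#define EX_PLOOP_MAP_OFFSET     16                    /* assumed header slots in page 0 */

int main(void)
{
	unsigned long block = 3000;
	unsigned long pageno = (block + EX_PLOOP_MAP_OFFSET) >> EX_INDEX_PER_PAGE_SHIFT;
	unsigned long sec = pageno << (EX_PAGE_SHIFT - 9);   /* 8 sectors per 4K page */

	printf("block %lu -> on-disk index page %lu -> sector %lu\n",
	       block, pageno, sec);
	return 0;
}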
+
+static void
+ploop1_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+		  struct page * page, sector_t sec)
+{
+	return delta->io.ops->read_page(&delta->io, preq, page, sec);
+}
+
+static void
+ploop1_destroy_priv(struct ploop_delta * delta)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if (ph == NULL)
+		return;
+
+	delta->priv = NULL;
+
+	if (ph->dyn_page)
+		put_page(ph->dyn_page);
+
+	kfree(ph);
+}
+
+static int ploop1_stop(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) ||
+	    test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return 0;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	pvd_header_set_disk_closed(vh);
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+ploop1_open(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop1_private * ph;
+	struct ploop_pvd_header *vh;
+	u64 i_size;
+	int version;
+
+	err = -ENOMEM;
+	ph = kzalloc(sizeof(struct ploop1_private), GFP_KERNEL);
+	if (ph == NULL)
+		return -ENOMEM;
+
+	delta->priv = ph;
+
+	ph->dyn_page = alloc_page(GFP_KERNEL);
+	if (ph->dyn_page == NULL)
+		goto out_err;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		goto out_err;
+
+	/* IO engine is ready. */
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out_err;
+
+	err = -EINVAL;
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	version = ploop1_version(vh);
+	if (version == -1 || 
+	    vh->m_Type	  != cpu_to_le32(PRL_IMAGE_COMPRESSED) ||
+	    vh->m_Sectors != cpu_to_le32(1 << delta->cluster_log))
+		goto out_err;
+
+	/* We don't support mixed configuration of V1 and V2 images */
+	if (delta->plo->fmt_version && delta->plo->fmt_version != version)
+		goto out_err;
+
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	err = -EBUSY;
+	if (pvd_header_is_disk_in_use(vh))
+		goto out_err;
+
+	err = -EINVAL;
+	i_size = delta->io.ops->i_size_read(&delta->io);
+	ph->alloc_head = i_size >> (delta->cluster_log + 9);
+	if (!(le32_to_cpu(vh->m_Sectors) << 9) ||
+	    do_div(i_size, le32_to_cpu(vh->m_Sectors) << 9))
+		goto out_err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, version);
+
+	if (delta->plo->bd_size > ph->bd_size)
+		goto out_err;
+	if (ph->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+	if (delta->plo->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		pvd_header_set_disk_in_use(vh);
+		err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+		if (err)
+			goto out_err;
+	}
+
+	delta->io.alloc_head = ph->alloc_head;
+	delta->plo->bd_size = ph->bd_size;
+	delta->plo->fmt_version = version;
+
+	/* If i_size >= max_size, no more allocations needed */
+	if ((u64)ph->alloc_head << (delta->cluster_log + 9) >=
+	    ((u64)ph->bd_size + ph->l1_off) << 9)
+		delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+
+out_err:
+	ploop1_destroy_priv(delta);
+	return err;
+}
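The PLOOP_FMT_PREALLOCATED check at the end of ploop1_open() compares the space already allocated (alloc_head cluster-blocks) against the whole virtual disk plus the index area (l1_off sectors). The sketch below reproduces that comparison in user space; cluster_log = 11 and the sample sizes are assumed values chosen only for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned cluster_log = 11;         /* assumed: 2048 sectors per cluster-block */
	uint64_t bd_size = 8388608;        /* virtual disk size in sectors (4 GiB) */
	uint64_t l1_off  = 2048;           /* index area size in sectors (1 block) */
	uint32_t alloc_head = 4097;        /* cluster-blocks already allocated */

	int prealloc = ((uint64_t)alloc_head << (cluster_log + 9)) >=
		       ((bd_size + l1_off) << 9);

	printf("preallocated: %s\n", prealloc ? "yes" : "no");
	return 0;
}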
+
+static int
+ploop1_refresh(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return 0;
+}
+
+/*
+ * The function gets a preq with a bio. The caller has checked that this
+ * bio is a write to a block which is not allocated in this delta.
+ * If the block is totally new, the bio may cover only a part of the block;
+ * if the bio is a COW from a previous delta, the function gets a bio
+ * covering the whole cluster, read from the original delta.
+ *
+ * The task of this function is to allocate a new block in the image,
+ * to copy the data there and to update the index afterwards. A lot, huh?
+ */
+
+static void
+ploop1_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	if (delta->io.alloc_head >=
+			(delta->max_delta_size >> delta->cluster_log)) {
+		PLOOP_FAIL_REQUEST(preq, -E2BIG);
+		return;
+	}
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+/* Call this when data write is complete */
+
+static void
+ploop1_allocate_complete(struct ploop_delta * delta, struct ploop_request * preq)
+{
+	ploop_index_update(preq);
+}
+
+static void
+ploop1_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+	ploop1_destroy_priv(delta);
+}
+
+static int
+ploop1_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+ploop1_sync(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		return 0;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return -EIO;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	pvd_header_set_disk_in_use(vh);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+ploop1_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->io.plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	pvd_header_set_disk_closed(vh);
+
+	/*
+	 * NB: we don't call ploop_update_map_hdr() here because the top
+	 * delta after snapshot completion should bear m_DiskInUse != 0.
+	 * Also, we rely on the fact that the new top delta (created while
+	 * snapshotting) has exactly the same PVD-header as the former top
+	 * delta. So, the first 64 bytes of the corresponding map_node page
+	 * remain valid.
+	 */
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+ploop1_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	if (pvd_header_is_disk_in_use(vh))
+		return -EBUSY;
+
+	ph->alloc_head = delta->io.ops->i_size_read(&delta->io) >>
+			 (delta->io.plo->cluster_log + 9);
+	delta->io.alloc_head = ph->alloc_head;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+ploop1_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "ploop1_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	pvd_header_set_disk_in_use(vh);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int ploop1_truncate(struct ploop_delta * delta, struct file * file,
+			   __u32 alloc_head)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	/*
+	 * Maybe we should call ploop1_refresh() here and re-read the
+	 * PVD-header from disk. This will become clear in the course of
+	 * porting ploop-shrink.c::shrink_in_place().
+	 */
+
+	ph->alloc_head = alloc_head;
+	delta->io.alloc_head = alloc_head;
+
+	return delta->io.ops->truncate(&delta->io,
+				       file ? file : delta->io.files.file,
+				       alloc_head);
+}
+
+static int
+ploop1_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	struct ploop1_private * ph = delta->priv;
+	struct ploop_pvd_header *vh;
+	int idxs_per_iblk; /* # indices in one cluster-block */
+	iblock_t bdsize;   /* block-device size measured in cluster-blocks */
+	int n_present;     /* # cluster-blocks in L2-table (existent now) */
+	int n_needed;      /* # cluster-blocks in L2-table (for new_size) */
+	int n_alloced = 0; /* # cluster-blocks we can alloc right now */
+	int err;
+	iblock_t a_h = delta->io.alloc_head;
+	int	 log = delta->io.plo->cluster_log;
+
+	if (*new_size & ((1 << delta->cluster_log) - 1))
+		return -EINVAL;
+
+	if (*new_size > ploop1_max_size(1 << delta->plo->cluster_log,
+					delta->plo->fmt_version))
+		return -EFBIG;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	n_present  = le32_to_cpu(vh->m_FirstBlockOffset) >> log;
+	BUG_ON (!n_present);
+
+	bdsize = (*new_size + (1 << log) - 1) >> log;
+
+	idxs_per_iblk = (1 << (log + 9)) / sizeof(u32);
+	n_needed = (bdsize + PLOOP_MAP_OFFSET + idxs_per_iblk - 1) /
+		   idxs_per_iblk;
+
+	if (n_needed <= n_present)
+		return 0;
+
+	if (a_h < n_needed) {
+		n_alloced = n_needed - a_h;
+		err = delta->io.ops->alloc(&delta->io,
+					   (loff_t)a_h << (log + 9),
+					   (loff_t)(n_alloced) << (log + 9));
+		if (err)
+			return err;
+	}
+
+	*reloc = n_needed - n_present - n_alloced;
+	if (*reloc) {
+		/* Feeling an irresistible infatuation to relocate ... */
+		delta->io.plo->grow_start = n_present;
+		delta->io.plo->grow_end = n_needed - n_alloced - 1;
+	}
+
+	return 0;
+}
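The L2-table sizing arithmetic above (bdsize, idxs_per_iblk, n_needed) can be checked with a small user-space calculation. cluster_log = 11 and PLOOP_MAP_OFFSET = 16 are assumed example values, not the driver's definitions.

#include <stdio.h>
#include <stdint.h>

#define EX_LOG              11     /* assumed cluster_log: 2048 sectors per block */
#define EX_PLOOP_MAP_OFFSET 16     /* assumed index slots reserved for the header */

int main(void)
{
	uint64_t new_size = 8388608;   /* requested size in 512-byte sectors (4 GiB) */
	uint64_t bdsize = (new_size + (1 << EX_LOG) - 1) >> EX_LOG;
	unsigned idxs_per_iblk = (1 << (EX_LOG + 9)) / sizeof(uint32_t);
	unsigned n_needed = (bdsize + EX_PLOOP_MAP_OFFSET + idxs_per_iblk - 1) /
			    idxs_per_iblk;

	printf("bdsize=%llu blocks, %u indices per block, n_needed=%u L2 blocks\n",
	       (unsigned long long)bdsize, idxs_per_iblk, n_needed);
	return 0;
}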
+
+static int ploop1_complete_grow(struct ploop_delta * delta, u64 new_size)
+{
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+	int err;
+	u32 vh_bsize; /* block size in sectors */
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	vh_bsize = le32_to_cpu(vh->m_Sectors);
+
+	if (vh_bsize != (1 << delta->io.plo->cluster_log)) {
+		printk("grow: vh->m_Sectors=%u != 1<<plo->cluster_log=%u\n",
+		       vh_bsize, 1 << delta->io.plo->cluster_log);
+		return -EINVAL;
+	}
+
+	generate_pvd_header(vh, new_size, vh_bsize, delta->plo->fmt_version);
+
+	vh->m_Type             = cpu_to_le32(vh->m_Type);
+	cpu_to_le_SizeInSectors(vh, delta->plo->fmt_version);
+	vh->m_Sectors          = cpu_to_le32(vh->m_Sectors);
+	vh->m_Heads            = cpu_to_le32(vh->m_Heads);
+	vh->m_Cylinders        = cpu_to_le32(vh->m_Cylinders);
+	vh->m_Size             = cpu_to_le32(vh->m_Size);
+	vh->m_FirstBlockOffset = cpu_to_le32(vh->m_FirstBlockOffset);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	ph->bd_size = new_size;
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	return 0;
+}
+
+static struct ploop_delta_ops ploop1_delta_ops =
+{
+	.id		=	PLOOP_FMT_PLOOP1,
+	.name		=	"ploop1",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE | PLOOP_FMT_CAP_DELTA,
+
+	.map_index	=	ploop1_map_index,
+	.read_index	=	ploop1_read_index,
+
+	.allocate	=	ploop1_allocate,
+	.allocate_complete =	ploop1_allocate_complete,
+
+	.compose	=	ploop1_compose,
+	.open		=	ploop1_open,
+	.destroy	=	ploop1_destroy,
+	.start		=	ploop1_start,
+	.stop		=	ploop1_stop,
+	.refresh	=	ploop1_refresh,
+	.sync		=	ploop1_sync,
+	.prepare_snapshot =	ploop1_prepare_snapshot,
+	.complete_snapshot =	ploop1_complete_snapshot,
+	.prepare_merge	=	ploop1_prepare_merge,
+	.start_merge	=	ploop1_start_merge,
+	.truncate	=	ploop1_truncate,
+	.prepare_grow	=	ploop1_prepare_grow,
+	.complete_grow	=	ploop1_complete_grow,
+};
+
+static int __init pfmt_ploop1_mod_init(void)
+{
+	return ploop_register_format(&ploop1_delta_ops);
+}
+
+static void __exit pfmt_ploop1_mod_exit(void)
+{
+	ploop_unregister_format(&ploop1_delta_ops);
+}
+
+module_init(pfmt_ploop1_mod_init);
+module_exit(pfmt_ploop1_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/fmt_raw.c
@@ -0,0 +1,269 @@
+/*
+ *  drivers/block/ploop/fmt_raw.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+
+/* An implementation of the raw linear image format.
+ *
+ * Right now it is not quite optimal because we simulate
+ * a raw image as a ploop1-like image with dummy preallocated
+ * index tables. It is optimized only for the case when we have
+ * just one raw image without any deltas on top.
+ * Probably, this is all that we need.
+ */
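The "dummy preallocated index tables" mentioned above amount to an identity mapping, which raw_read_index() below synthesizes on the fly. The sketch here shows that mapping in user space; the PLOOP_ZERO_INDEX value and the map shift are assumptions made only for this illustration.

#include <stdio.h>
#include <stdint.h>

#define EX_PLOOP_ZERO_INDEX 0xffffffffU   /* assumed sentinel: cluster 0 vs "not allocated" */
#define EX_MAP_LOG          0             /* assumed ploop_map_log() result */

static uint32_t raw_index(uint32_t clu, uint32_t alloc_head)
{
	if (clu >= alloc_head)
		return 0;                         /* beyond the image: not allocated */
	if (clu == 0)
		return EX_PLOOP_ZERO_INDEX;       /* 0 would mean "not allocated" */
	return clu << EX_MAP_LOG;                 /* identity mapping otherwise */
}

int main(void)
{
	uint32_t clu;

	for (clu = 0; clu < 5; clu++)
		printf("cluster %u -> index 0x%x\n", clu, raw_index(clu, 4));
	return 0;
}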
+
+static int raw_stop(struct ploop_delta * delta)
+{
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+raw_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+raw_open(struct ploop_delta * delta)
+{
+	int err;
+	loff_t pos;
+	int cluster_log = list_empty(&delta->plo->map.delta_list) ?
+		delta->cluster_log : delta->plo->cluster_log;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		return err;
+
+	if (delta->plo->bd_size) {
+		if (delta->plo->bd_size > (delta->io.ops->i_size_read(&delta->io) >> 9))
+			return -EINVAL;
+	} else {
+		delta->plo->bd_size = delta->io.ops->i_size_read(&delta->io) >> 9;
+	}
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	pos += (1 << (cluster_log + 9)) - 1;
+	delta->io.alloc_head = pos >> (cluster_log + 9);
+
+	/* no more allocations at all */
+	delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+}
+
+/*
+ * The sanity checks below assume that we can be called only by
+ * ploop_del_delta() or raw_start_merge(). Thus, there recently
+ * was a ploop1 delta above us. Adding a ploop1 delta on top
+ * of a raw delta is only supported if the raw delta is
+ * cluster-block aligned.
+ *
+ * Another assumption is that the size of the raw delta was either
+ * kept unchanged or grown in user space while merging.
+ */
+static int
+raw_refresh(struct ploop_delta * delta)
+{
+	loff_t pos;
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	if (pos & ((1 << (delta->plo->cluster_log + 9)) - 1)) {
+		printk("raw delta is not aligned (%llu bytes)\n", pos);
+		return -EINVAL;
+	}
+	if ((pos >> (delta->plo->cluster_log + 9)) < delta->io.alloc_head) {
+		printk("raw delta was corrupted "
+		       "(old_size=%u new_size=%llu iblocks)\n",
+		       delta->io.alloc_head,
+		       pos >> (delta->plo->cluster_log + 9));
+		return -EINVAL;
+	}
+
+	delta->io.alloc_head = pos >> (delta->plo->cluster_log + 9);
+	return 0;
+}
+
+static void
+raw_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+int raw_map_index(struct ploop_delta * delta, unsigned long index, sector_t *sec)
+{
+	*sec = index;
+	return 1;
+}
+
+static void
+raw_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+	       struct page * page, sector_t sec)
+{
+	int i;
+	u32 * ptr = page_address(page);
+	int skip = (sec == 0) ? PLOOP_MAP_OFFSET : 0;
+
+	for (i = skip; i < PAGE_SIZE/4; i++) {
+		if ((sec << delta->plo->cluster_log) >=
+		    (delta->io.alloc_head << delta->plo->cluster_log)) {
+			ptr[i] = 0;
+			sec++;
+		} else if (sec == 0) {
+			/* ptr[i]==0 would be interpreted as "iblock not allocated" */
+			ptr[i] = PLOOP_ZERO_INDEX;
+			sec++;
+		} else {
+			ptr[i] = sec++ << ploop_map_log(delta->plo);
+		}
+	}
+
+	ploop_complete_io_state(preq);
+}
+
+static void
+raw_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+}
+
+static int
+raw_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+raw_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+raw_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+raw_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+raw_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "raw_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = raw_refresh(delta);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+
+static int
+raw_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	*new_size = (*new_size + (PAGE_SIZE >> 9) - 1) &
+		    ~((PAGE_SIZE >> 9) - 1);
+	return delta->io.ops->alloc(&delta->io,
+				    delta->plo->bd_size << 9,
+				    (*new_size - delta->plo->bd_size) << 9);
+}
+
+static struct ploop_delta_ops raw_delta_ops =
+{
+	.id		=	PLOOP_FMT_RAW,
+	.name		=	"raw",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE|PLOOP_FMT_CAP_IDENTICAL,
+
+	.map_index	=	raw_map_index,
+	.read_index	=	raw_read_index,
+
+	.allocate	=	raw_allocate,
+
+	.compose	=	raw_compose,
+	.open		=	raw_open,
+	.destroy	=	raw_destroy,
+	.start		=	raw_start,
+	.stop		=	raw_stop,
+	.refresh	=	raw_refresh,
+	.prepare_snapshot =	raw_prepare_snapshot,
+	.complete_snapshot =	raw_complete_snapshot,
+	.prepare_merge	=	raw_prepare_merge,
+	.start_merge	=	raw_start_merge,
+	.prepare_grow	=	raw_prepare_grow,
+};
+
+static int __init pfmt_raw_mod_init(void)
+{
+	return ploop_register_format(&raw_delta_ops);
+}
+
+static void __exit pfmt_raw_mod_exit(void)
+{
+	ploop_unregister_format(&raw_delta_ops);
+}
+
+module_init(pfmt_raw_mod_init);
+module_exit(pfmt_raw_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/freeblks.c
@@ -0,0 +1,1110 @@
+/*
+ *  drivers/block/ploop/freeblks.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "freeblks.h"
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+struct ploop_freeblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+
+};
+
+struct ploop_relocblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+	u32	  free;	/* this extent is also present in freemap */
+};
+
+struct ploop_fextent_ptr {
+	struct ploop_freeblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_rextent_ptr {
+	struct ploop_relocblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_freeblks_desc {
+	struct ploop_device *plo;
+
+	int fbd_n_free;	       /* # free blocks remaining
+				  (i.e. "not re-used") */
+
+	/* fbd_ffb.ext->clu + fbd_ffb.off can be used as
+	 * 'clu of first free block to reuse' for WRITE ops */
+	struct ploop_fextent_ptr fbd_ffb; /* 'ffb' stands for
+					     'first free block' */
+
+	/* fbd_lfb.ext->clu + fbd_lfb.off can be used as
+	 * 'clu of first block to overwrite' (draining reloc range from end) */
+	struct ploop_fextent_ptr fbd_lfb; /* 'lfb' stands for
+					     'last free block for relocation'*/
+
+	/* fbd_lrb.ext->clu + fbd_lrb.off can be used as
+	 * 'clu of first block to relocate'
+	 * (draining reloc range from end)
+	 * NB: ffb and lfb above deal with free_list, while lrb deals with
+	 * reloc_list! */
+	struct ploop_rextent_ptr fbd_lrb; /* 'lrb' stands for
+					     'last block to relocate' */
+
+	/* counters to trace the progress of relocation */
+	int fbd_n_relocated;  /* # blocks actually relocated */
+	int fbd_n_relocating; /* # blocks whose relocation was at
+				   least started */
+
+	/* lost_range: [fbd_first_lost_iblk ..
+	 *		fbd_first_lost_iblk + fbd_lost_range_len - 1] */
+	iblock_t fbd_first_lost_iblk;
+	int	 fbd_lost_range_len;
+	int	 fbd_lost_range_addon; /* :)) */
+
+	/* any reloc request resides there while it's "in progress" */
+	struct rb_root		reloc_tree;
+
+	/* list of ploop_request-s for PLOOP_REQ_ZERO ops: firstly zero index
+	 * for PLOOP_REQ_ZERO req_cluster, then schedule ordinary request
+	 * pinned to given PLOOP_REQ_ZERO request */
+	struct list_head	free_zero_list;
+
+	/* storage for free-block extents: list for now */
+	struct list_head	fbd_free_list;
+
+	/* storage for reloc-block extents: list for now */
+	struct list_head	fbd_reloc_list;
+
+	int	 fbd_freezed_level; /* for sanity - level on
+				     * PLOOP_IOC_FREEBLKS stage */
+
+	struct bio_list	fbd_dbl; /* dbl stands for 'discard bio list' */
+};
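The fbd_ffb/fbd_lfb/fbd_lrb comments above all rely on the same (extent, offset) cursor arithmetic. A minimal stand-alone sketch, using local type and field names, shows how such a cursor resolves to a concrete cluster/iblock pair (compare ffb_clu()/ffb_iblk() further down).

#include <stdio.h>
#include <stdint.h>

struct ex_extent {
	uint32_t clu;    /* first logical cluster of the extent */
	uint32_t iblk;   /* first image block of the extent */
	uint32_t len;    /* number of blocks in the extent */
};

struct ex_cursor {
	struct ex_extent *ext;
	uint32_t off;    /* position inside *ext, 0 <= off < ext->len */
};

int main(void)
{
	struct ex_extent e = { .clu = 100, .iblk = 40, .len = 8 };
	struct ex_cursor ffb = { .ext = &e, .off = 3 };

	/* analogous to ffb_clu() / ffb_iblk() in this file */
	printf("cursor -> cluster %u, iblock %u\n",
	       ffb.ext->clu + ffb.off, ffb.ext->iblk + ffb.off);
	return 0;
}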
+
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocated;
+}
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocating;
+}
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_free;
+}
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk + fbd->fbd_lost_range_len;
+}
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lost_range_len;
+}
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk;
+}
+
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_freezed_level;
+}
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level)
+{
+	fbd->fbd_freezed_level = level;
+}
+
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+
+	if (fbd == NULL)
+		return;
+
+	p = &fbd->reloc_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, reloc_link);
+		BUG_ON (preq->src_iblock == pr->src_iblock);
+
+		if (preq->src_iblock < pr->src_iblock)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&preq->reloc_link, parent, p);
+	rb_insert_color(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	BUG_ON (fbd == NULL);
+
+	rb_erase(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd,
+			     struct ploop_request *preq,
+			     unsigned long pin_state)
+{
+	struct rb_node *n;
+	struct ploop_request * p;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (preq->iblock == 0);
+	BUG_ON (preq->iblock >= fbd->fbd_first_lost_iblk);
+
+	n = fbd->reloc_tree.rb_node;
+	if (n == NULL)
+		return 0;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, reloc_link);
+
+		if (preq->iblock < p->src_iblock)
+			n = n->rb_left;
+		else if (preq->iblock > p->src_iblock)
+			n = n->rb_right;
+		else {
+			spin_lock_irq(&fbd->plo->lock);
+			preq->eng_state = pin_state;
+			list_add_tail(&preq->list, &p->delay_list);
+			spin_unlock_irq(&fbd->plo->lock);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl)
+{
+	int   rc = 0;
+	int   n	 = 0;
+	struct ploop_freeblks_extent	 *fextent;
+	struct ploop_freeblks_ctl_extent  cext;
+
+	memset(&cext, 0, sizeof(cext));
+	list_for_each_entry(fextent, &fbd->fbd_free_list, list)
+		if (ctl->n_extents) {
+			int off = offsetof(struct ploop_freeblks_ctl,
+					   extents[n]);
+			if (n++ >= ctl->n_extents) {
+				rc = -ENOSPC;
+				break;
+			}
+
+			cext.clu  = fextent->clu;
+			cext.iblk = fextent->iblk;
+			cext.len  = fextent->len;
+
+			rc = copy_to_user((u8*)arg + off, &cext, sizeof(cext));
+			if (rc)
+				break;
+		} else {
+			n++;
+		}
+
+	if (!rc) {
+		ctl->n_extents = n;
+		rc = copy_to_user((void*)arg, ctl, sizeof(*ctl));
+	}
+
+	return rc;
+}
+
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen)
+{
+	struct ploop_freeblks_extent *fextent, *n;
+
+	list_for_each_entry_safe(fextent, n, &fbd->fbd_free_list, list)
+		if (fextent->len < minlen) {
+			list_del(&fextent->list);
+			fbd->fbd_n_free -= fextent->len;
+			kfree(fextent);
+		}
+
+	if (list_empty(&fbd->fbd_free_list))
+		fbd->fbd_ffb.ext = NULL;
+	else
+		fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next,
+						struct ploop_freeblks_extent,
+						list);
+	fbd->fbd_ffb.off = 0;
+
+	return fbd->fbd_n_free;
+}
+
+struct ploop_request *
+ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_request * preq;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (list_empty(&fbd->free_zero_list));
+
+	preq = list_entry(fbd->free_zero_list.next,
+			  struct ploop_request, list);
+	list_del(&preq->list);
+	return preq;
+}
+
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd,
+			       struct ploop_request *preq)
+{
+	list_add(&preq->list, &fbd->free_zero_list);
+}
+
+static iblock_t ffb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->iblk + fbd->fbd_ffb.off;
+}
+static cluster_t ffb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->clu + fbd->fbd_ffb.off;
+}
+static iblock_t lfb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off;
+}
+static cluster_t lfb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->clu + fbd->fbd_lfb.off;
+}
+static iblock_t lrb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lrb.ext->iblk + fbd->fbd_lrb.off;
+}
+
+static iblock_t get_first_reloc_iblk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent = list_entry(fbd->fbd_reloc_list.next,
+			      struct ploop_relocblks_extent, list);
+	return r_extent->iblk;
+}
+
+static void advance_ffb_simple(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list)
+			fbd->fbd_ffb.ext = NULL;
+		else
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+}
+
+static void advance_lrb(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t skip = 0;
+	BUG_ON (fbd->fbd_lrb.ext == NULL);
+
+	if (likely(fbd->fbd_lrb.off)) {
+		fbd->fbd_lrb.off--;
+	} else {
+		struct ploop_relocblks_extent *r_extent = fbd->fbd_lrb.ext;
+		/* here 'skip' means: [new_lrb_ext]<--skip-->[r_extent] */
+
+		if (fbd->fbd_lrb.ext->list.prev == &fbd->fbd_reloc_list) {
+			BUG_ON (fbd->fbd_lost_range_addon < 0);
+			skip = fbd->fbd_lost_range_addon;
+			fbd->fbd_lrb.ext = NULL;
+		} else {
+			fbd->fbd_lrb.ext = list_entry(fbd->fbd_lrb.ext->list.prev,
+						      struct ploop_relocblks_extent,
+						      list);
+			fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+			BUG_ON (r_extent->iblk < fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+			skip = r_extent->iblk - (fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+		}
+	}
+
+	fbd->fbd_first_lost_iblk -= 1 + skip;
+	fbd->fbd_lost_range_len	 += 1 + skip;
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+
+	BUG_ON(fbd->fbd_n_free <= 0);
+	fbd->fbd_n_free--;
+}
+
+static int split_fb_extent(struct ploop_freeblks_extent *extent, u32 *off_p,
+			   struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_freeblks_extent *new_extent;
+
+	new_extent = kzalloc(sizeof(*new_extent), GFP_KERNEL);
+	if (new_extent == NULL) {
+		printk("Can't allocate new freeblks extent for splittig!\n");
+		return -ENOMEM;
+	}
+
+	new_extent->clu	 = extent->clu	+ *off_p + 1;
+	new_extent->iblk = extent->iblk + *off_p + 1;
+	new_extent->len	 = extent->len	- *off_p - 1;
+
+	extent->len  = *off_p;
+
+	list_add(&new_extent->list, &extent->list);
+
+	(*off_p)--;
+	return 0;
+}
+
+static int advance_lfb_left(struct ploop_freeblks_desc *fbd)
+{
+	int rc = 0;
+	struct ploop_freeblks_extent *lfb_ext = fbd->fbd_lfb.ext;
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (lfb_ext == NULL);
+	BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (ffb_iblk(fbd) == lfb_iblk(fbd)) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		advance_ffb_simple(fbd);
+		return 0;
+	}
+
+	if (fbd->fbd_lfb.off) {
+		if (fbd->fbd_lfb.off == lfb_ext->len - 1) {
+			lfb_ext->len--;
+			fbd->fbd_lfb.off--;
+		} else {
+			rc = split_fb_extent(lfb_ext, &fbd->fbd_lfb.off, fbd);
+		}
+	} else {
+		BUG_ON (lfb_ext->list.prev == &fbd->fbd_free_list);
+		BUG_ON (lfb_ext == fbd->fbd_ffb.ext);
+
+		lfb_ext->clu++;
+		lfb_ext->iblk++;
+		lfb_ext->len--;
+
+		fbd->fbd_lfb.ext = list_entry(lfb_ext->list.prev,
+					      struct ploop_freeblks_extent,
+					      list);
+		fbd->fbd_lfb.off = fbd->fbd_lfb.ext->len - 1;
+
+		if (lfb_ext->len == 0) {
+			list_del(&lfb_ext->list);
+			kfree(lfb_ext);
+		}
+	}
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext == NULL);
+	BUG_ON (lfb_iblk(fbd) < ffb_iblk(fbd));
+	return rc;
+}
+
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd,
+			     cluster_t *from_clu_p, iblock_t *from_iblk_p,
+			     cluster_t *to_clu_p, iblock_t *to_iblk_p,
+			     u32 *free_p)
+{
+	cluster_t from_clu, to_clu;
+	iblock_t  from_iblk, to_iblk;
+	u32 free;
+	struct ploop_relocblks_extent *r_extent = fbd->fbd_lrb.ext;
+
+	if (!fbd)
+		return -1;
+
+	/* whole range is drained? */
+	if (r_extent == NULL)
+		return -1;
+
+	BUG_ON (fbd->fbd_lrb.off >= r_extent->len);
+
+	from_clu  = r_extent->clu  + fbd->fbd_lrb.off;
+	from_iblk = r_extent->iblk + fbd->fbd_lrb.off;
+	free	  = r_extent->free;
+
+	/* from_iblk is in the range to relocate, but it's marked as free.
+	 * This means that we only need to zero its index; no actual
+	 * relocation is needed. Such an operation doesn't consume the free
+	 * block that fbd_lfb refers to */
+	if (free) {
+		/* The block we're going to zero-index was already re-used? */
+		if (fbd->fbd_ffb.ext == NULL || ffb_iblk(fbd) > from_iblk)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+
+		to_iblk = ~0U;
+		to_clu	= ~0U;
+	} else {
+		/* Have we run out of free blocks that can be used as
+		 * destinations for relocation? */
+		if (fbd->fbd_lfb.ext == NULL)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.ext == NULL);
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+		BUG_ON (fbd->fbd_lfb.off  >= fbd->fbd_lfb.ext->len);
+		BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+		to_clu	= lfb_clu(fbd);
+		to_iblk = lfb_iblk(fbd);
+
+		if (advance_lfb_left(fbd)) {
+			/* Error implies stopping relocation */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+			return -1;
+		}
+	}
+
+	/* consume one block from the end of reloc list */
+	advance_lrb(fbd);
+
+	fbd->fbd_n_relocating++;
+
+	*from_clu_p  = from_clu;
+	*from_iblk_p = from_iblk;
+	*to_clu_p    = to_clu;
+	*to_iblk_p   = to_iblk;
+	*free_p	     = free;
+	return 0;
+}
+
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd)
+{
+	fbd->fbd_n_relocated++;
+}
+
+static void advance_lfb_right(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t iblk = get_first_reloc_iblk(fbd);
+
+	if (fbd->fbd_lfb.off < fbd->fbd_lfb.ext->len - 1) {
+		if (fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off + 1 < iblk) {
+			fbd->fbd_lfb.off++;
+		}
+	} else if (fbd->fbd_lfb.ext->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *f_extent;
+		f_extent = list_entry(fbd->fbd_lfb.ext->list.next,
+				      struct ploop_freeblks_extent,
+				      list);
+		if (f_extent->iblk < iblk) {
+			fbd->fbd_lfb.ext = f_extent;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+
+	/* invalidating ffb always implies invalidating lfb */
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL);
+
+	/* caller has just advanced ffb, but we must keep lfb intact
+	 * if the next free block (following lfb) is in the reloc range */
+	if (fbd->fbd_ffb.ext != NULL && fbd->fbd_lfb.ext != NULL &&
+	    lfb_iblk(fbd) < ffb_iblk(fbd)) {
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+}
+
+static void trim_reloc_list_one_blk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent_first;
+	iblock_t iblk = lrb_iblk(fbd);
+	int invalidate = 0;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+
+	if (r_extent_first->len > 1) {
+		fbd->fbd_lost_range_addon = 0;
+		r_extent_first->iblk++;
+		r_extent_first->clu++;
+		r_extent_first->len--;
+		if (iblk < r_extent_first->iblk) {
+			invalidate = 1;
+		} else if (r_extent_first == fbd->fbd_lrb.ext) {
+			BUG_ON (fbd->fbd_lrb.off == 0);
+			fbd->fbd_lrb.off--;
+		}
+	} else {
+		if (r_extent_first == fbd->fbd_lrb.ext) {
+			invalidate = 1;
+		} else {
+			struct ploop_relocblks_extent *r_extent;
+			BUG_ON (r_extent_first->list.next ==
+				&fbd->fbd_reloc_list);
+			r_extent = list_entry(r_extent_first->list.next,
+					      struct ploop_relocblks_extent,
+					      list);
+			fbd->fbd_lost_range_addon = r_extent->iblk -
+				(r_extent_first->iblk + r_extent_first->len);
+		}
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+	}
+
+	if (invalidate) {
+		/* invalidate both lfb and lrb */
+		fbd->fbd_lrb.ext = NULL;
+		fbd->fbd_lrb.off = 0;
+		if (fbd->fbd_lfb.ext != NULL) {
+			fbd->fbd_lfb.ext = NULL;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+}
+
+static void advance_ffb(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext != NULL && ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list) {
+			BUG_ON (fbd->fbd_lfb.ext != NULL &&
+				ffb_iblk(fbd) != lfb_iblk(fbd));
+			fbd->fbd_ffb.ext = NULL;
+		} else {
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		}
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		return;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate both ffb and lfb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+
+	/* nothing to do anymore if relocation process is completed */
+	if (fbd->fbd_lrb.ext == NULL)
+		return;
+
+	trim_reloc_list_one_blk(fbd);
+
+	/* trim could invalidate both lrb and lfb */
+	if (fbd->fbd_lrb.ext == NULL || fbd->fbd_lfb.ext == NULL)
+		return;
+
+	advance_lfb_right(fbd);
+}
+
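+/* Hand out one free block for reuse.  Returns -1 if none is available,
+ * 0 if the block comes from the lost range (only *iblk is meaningful),
+ * and 1 if it comes from the free-extent list (*clu and *iblk are set). */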
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd,
+			    cluster_t *clu, iblock_t *iblk)
+{
+	if (!fbd)
+		return -1;
+
+	if (fbd->fbd_ffb.ext == NULL) {
+		BUG_ON (fbd->fbd_lfb.ext != NULL);
+		BUG_ON (fbd->fbd_lost_range_len < 0);
+
+		if (fbd->fbd_lost_range_len == 0)
+			return -1;
+
+		*iblk = fbd->fbd_first_lost_iblk++;
+		fbd->fbd_lost_range_len--;
+
+		if (fbd->fbd_lrb.ext != NULL) {
+			/* stop relocation process */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+		}
+
+		return 0;
+	}
+
+	BUG_ON (ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk);
+	BUG_ON (fbd->fbd_n_free <= 0);
+
+	*clu  = ffb_clu(fbd);
+	*iblk = ffb_iblk(fbd);
+	fbd->fbd_n_free--;
+
+	if (fbd->plo->maintenance_type == PLOOP_MNTN_RELOC)
+		advance_ffb(fbd);
+	else
+		advance_ffb_simple(fbd);
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_n_free != 0);
+	BUG_ON (fbd->fbd_ffb.ext != NULL && fbd->fbd_n_free == 0);
+
+	return 1;
+}
+
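+/* Complete all discard bios queued on fbd with the given error and wake up
+ * waiters on plo->waitq if more discard bios are pending. */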
+static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo = fbd->plo;
+	unsigned int nr_completed = 0;
+
+	while (fbd->fbd_dbl.head) {
+		struct bio * bio = fbd->fbd_dbl.head;
+		fbd->fbd_dbl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, err);
+		nr_completed++;
+	}
+	fbd->fbd_dbl.tail = NULL;
+
+	spin_lock_irq(&plo->lock);
+	plo->bio_total -= nr_completed;
+	if (!bio_list_empty(&plo->bio_discard_list) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err)
+{
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	fbd->fbd_n_free = 0;
+	fbd->fbd_ffb.ext = NULL;
+	fbd->fbd_lfb.ext = NULL;
+	fbd->fbd_lrb.ext = NULL;
+	fbd->fbd_ffb.off = 0;
+	fbd->fbd_lfb.off = 0;
+	fbd->fbd_lrb.off = 0;
+	fbd->fbd_n_relocated = fbd->fbd_n_relocating = 0;
+	fbd->fbd_lost_range_len = 0;
+	fbd->fbd_lost_range_addon = 0;
+
+	BUG_ON(!RB_EMPTY_ROOT(&fbd->reloc_tree));
+}
+
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	int i;
+
+	fbd = kmalloc(sizeof(struct ploop_freeblks_desc), GFP_KERNEL);
+	if (fbd == NULL)
+		return NULL;
+
+	fbd->fbd_dbl.tail = fbd->fbd_dbl.head = NULL;
+	INIT_LIST_HEAD(&fbd->fbd_free_list);
+	INIT_LIST_HEAD(&fbd->fbd_reloc_list);
+	fbd->reloc_tree = RB_ROOT;
+	fbd->fbd_freezed_level = -1;
+
+	fbd->plo = plo;
+
+	ploop_fb_reinit(fbd, 0);
+
+	INIT_LIST_HEAD(&fbd->free_zero_list);
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			goto fb_init_failed;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &fbd->free_zero_list);
+	}
+
+	return fbd;
+
+fb_init_failed:
+	ploop_fb_fini(fbd, -ENOMEM);
+	return NULL;
+}
+
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo;
+
+	if (fbd == NULL)
+		return;
+
+	plo = fbd->plo;
+	BUG_ON (plo == NULL);
+
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	while (!list_empty(&fbd->free_zero_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&fbd->free_zero_list,
+					struct ploop_request,
+					list);
+		list_del(&preq->list);
+		kfree(preq);
+	}
+
+	kfree(fbd);
+	plo->fbd = NULL;
+}
+
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd,
+			     cluster_t clu, iblock_t iblk, u32 len)
+{
+	struct ploop_freeblks_extent *fblk_extent;
+	struct ploop_freeblks_extent *ex;
+
+	if (len == 0) {
+		printk("ploop_fb_add_free_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	list_for_each_entry_reverse(ex, &fbd->fbd_free_list, list)
+		if (ex->iblk < iblk)
+			break;
+
+	if (ex->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *tmp;
+		tmp = list_entry(ex->list.next, struct ploop_freeblks_extent, list);
+
+		if (iblk + len > tmp->iblk) {
+			int c = &ex->list != &fbd->fbd_free_list;
+			printk("ploop_fb_add_free_extent(): next (%u %u %u) "
+			       "intersects with (%u %u %u); ex (%u %u %d)\n",
+			       tmp->clu, tmp->iblk, tmp->len, clu, iblk, len,
+			       c ? ex->clu : 0, c ? ex->iblk : 0, c ? ex->len : -1);
+			return -EINVAL;
+		}
+	}
+
+	if (&ex->list != &fbd->fbd_free_list) {
+		if (ex->iblk + ex->len > iblk) {
+			struct ploop_freeblks_extent *t = NULL;
+			if (ex->list.next != &fbd->fbd_free_list)
+				t = list_entry(ex->list.next, struct ploop_freeblks_extent, list);
+			printk("ploop_fb_add_free_extent(): ex (%u %u %u) "
+			       "intersects with (%u %u %u); next (%u %u %d)\n",
+			       ex->clu, ex->iblk, ex->len, clu, iblk, len,
+			       t ? t->clu : 0, t ? t->iblk : 0, t ? t->len : -1);
+			return -EINVAL;
+		}
+	}
+
+	fblk_extent = kzalloc(sizeof(*fblk_extent), GFP_KERNEL);
+	if (fblk_extent == NULL)
+		return -ENOMEM;
+
+	fblk_extent->clu  = clu;
+	fblk_extent->iblk = iblk;
+	fblk_extent->len  = len;
+
+	list_add(&fblk_extent->list, &ex->list);
+
+	fbd->fbd_n_free	 += len;
+
+	fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next, struct ploop_freeblks_extent, list);
+	fbd->fbd_ffb.off = 0;
+
+	return 0;
+}
+
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd,
+			      cluster_t clu, iblock_t iblk, u32 len, u32 free)
+{
+	struct ploop_relocblks_extent *rblk_extent;
+
+	if (len == 0) {
+		printk("ploop_fb_add_reloc_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		rblk_extent = list_entry(fbd->fbd_reloc_list.prev,
+					 struct ploop_relocblks_extent, list);
+		if (rblk_extent->iblk + rblk_extent->len > iblk) {
+			printk("ploop_fb_add_reloc_extent(): extents should be sorted\n");
+			return -EINVAL;
+		}
+
+		if (rblk_extent->list.next != &fbd->fbd_reloc_list) {
+			rblk_extent = list_entry(rblk_extent->list.next,
+					 struct ploop_relocblks_extent, list);
+			if (iblk + len > rblk_extent->iblk) {
+				printk("ploop_fb_add_reloc_extent(): intersected extents\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	rblk_extent = kzalloc(sizeof(*rblk_extent), GFP_KERNEL);
+	if (rblk_extent == NULL)
+		return -ENOMEM;
+
+	rblk_extent->clu  = clu;
+	rblk_extent->iblk = iblk;
+	rblk_extent->len  = len;
+	rblk_extent->free = free;
+
+	list_add_tail(&rblk_extent->list, &fbd->fbd_reloc_list);
+
+	return 0;
+}
+
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd,
+			      iblock_t first_lost_iblk)
+{
+	fbd->fbd_first_lost_iblk = first_lost_iblk;
+	fbd->fbd_lost_range_len = 0;
+}
+
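+/* Prepare for relocation: given how many blocks ploop-balloon has just
+ * scanned, compute where the allocation head will land once relocation
+ * completes (new_a_h), trim the reloc list down to the blocks that still
+ * have to be moved, and position lrb/lfb accordingly. */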
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd,
+			       __u32 n_scanned)
+{
+	iblock_t a_h = fbd->fbd_first_lost_iblk;
+	iblock_t new_a_h; /* where a_h will be after relocation
+			     if no WRITEs intervene */
+	struct ploop_relocblks_extent *r_extent;
+	struct ploop_relocblks_extent *r_extent_first;
+	int n_free = fbd->fbd_n_free;
+	u32 l;
+	struct ploop_freeblks_extent *fextent;
+
+	BUG_ON(fbd->fbd_lost_range_len != 0);
+	if (list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_first_lost_iblk -= n_scanned;
+		fbd->fbd_lost_range_len	 += n_scanned;
+		return;
+	}
+
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+	r_extent = list_entry(fbd->fbd_reloc_list.prev,
+			      struct ploop_relocblks_extent, list);
+	new_a_h = r_extent->iblk + r_extent->len;
+
+	BUG_ON(fbd->fbd_first_lost_iblk < new_a_h);
+	fbd->fbd_lost_range_len = fbd->fbd_first_lost_iblk - new_a_h;
+	fbd->fbd_first_lost_iblk = new_a_h;
+
+	if (!n_free)
+		return;
+
+	while (1) {
+		l = MIN(n_free, r_extent->len);
+
+		n_free	-= l;
+		new_a_h -= l;
+
+		if (!n_free)
+			break;
+
+		if (r_extent->list.prev == &fbd->fbd_reloc_list) {
+			r_extent = NULL;
+			break;
+		} else {
+			r_extent = list_entry(r_extent->list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		}
+		/* skip lost blocks */
+		new_a_h = r_extent->iblk + r_extent->len;
+	}
+
+	l = 0;
+
+	/* ploop-balloon scanned exactly the range [a_h - n_scanned .. a_h - 1] */
+	if (n_free) {
+		l = r_extent_first->iblk - (a_h - n_scanned);
+	} else if (r_extent->iblk == new_a_h) {
+		if (r_extent == r_extent_first) {
+			l = r_extent->iblk - (a_h - n_scanned);
+		} else {
+			struct ploop_relocblks_extent *r_extent_prev;
+
+			BUG_ON (r_extent->list.prev == &fbd->fbd_reloc_list);
+			r_extent_prev = list_entry(r_extent->list.prev,
+						   struct ploop_relocblks_extent,
+						   list);
+			l = r_extent->iblk - (r_extent_prev->iblk +
+					      r_extent_prev->len);
+		}
+	}
+
+	new_a_h -= l;
+
+	/* let's trim reloc_list a bit based on new_a_h */
+	while (r_extent_first->iblk < new_a_h) {
+
+		if (r_extent_first->iblk + r_extent_first->len > new_a_h) {
+			l = new_a_h - r_extent_first->iblk;
+			r_extent_first->iblk += l;
+			r_extent_first->clu  += l;
+			r_extent_first->len  -= l;
+			break;
+		}
+
+		if (r_extent_first->list.next == &fbd->fbd_reloc_list) {
+			list_del(&r_extent_first->list);
+			kfree(r_extent_first);
+			break;
+		}
+
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+		r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+					    struct ploop_relocblks_extent,
+					    list);
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_lrb.ext = list_entry(fbd->fbd_reloc_list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+
+		fbd->fbd_lost_range_addon = r_extent_first->iblk - new_a_h;
+	}
+
+	/* new_a_h is calculated; now find the "last free block" position */
+	if (ffb_iblk(fbd) < new_a_h) {
+		list_for_each_entry_reverse(fextent, &fbd->fbd_free_list, list)
+			if (fextent->iblk < new_a_h)
+				break;
+
+		BUG_ON(&fextent->list == &fbd->fbd_free_list);
+	} else
+		fextent = NULL;
+
+	fbd->fbd_lfb.ext = fextent; /* NULL means
+				       "no free blocks for relocation" */
+	if (fextent != NULL)
+		fbd->fbd_lfb.off = MIN(new_a_h - fextent->iblk,
+				       fextent->len) - 1;
+}
+
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio)
+{
+	struct ploop_device *plo;
+
+	if (!fbd)
+		return -EOPNOTSUPP;
+
+	plo = fbd->plo;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return -EOPNOTSUPP;
+	if (fbd->plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EBUSY;
+	/* only one discard request can be processed at a time */
+	if (fbd->fbd_dbl.head)
+		return -EBUSY;
+
+	fbd->fbd_dbl.head = fbd->fbd_dbl.tail = bio;
+
+	return 0;
+}
+
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd)
+{
+	return fbd && fbd->fbd_dbl.head != NULL;
+}
--- /dev/null
+++ b/drivers/block/ploop/freeblks.h
@@ -0,0 +1,58 @@
+/*
+ *  drivers/block/ploop/freeblks.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __FREEBLKS_H__
+#define __FREEBLKS_H__
+
+/* freeblks API - in-kernel balloon support */
+
+/* init/fini stuff */
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo);
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err);
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err);
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len);
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len, u32 free);
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd, iblock_t first_lost_iblk);
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd, __u32 n_scanned);
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio);
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd);
+
+/* avoid direct access to freeblks internals */
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd);
+
+/* get/set freezed level (for sanity checks) */
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd);
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level);
+
+/* maintain rb-tree of "in progress" relocation requests */
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq, unsigned long pin_state);
+
+/* helper for ioctl(PLOOP_IOC_FBGET) */
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl);
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen);
+
+/* get/put "zero index" request */
+struct ploop_request *ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd);
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+
+/* get/put block to relocate */
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd, cluster_t *from_clu, iblock_t *from_iblk,
+			     cluster_t *to_clu, iblock_t *to_iblk, u32 *free);
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd);
+
+/* get free block to reuse */
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd, cluster_t *clu, iblock_t *iblk);
+
+#endif
--- /dev/null
+++ b/drivers/block/ploop/io.c
@@ -0,0 +1,150 @@
+/*
+ *  drivers/block/ploop/io.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+
+/* Generic IO routines. */
+
+static LIST_HEAD(ploop_ios);
+static DEFINE_MUTEX(ploop_ios_mutex);
+
+int ploop_register_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_add(&ops->list, &ploop_ios);
+	mutex_unlock(&ploop_ios_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_io);
+
+void ploop_unregister_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_ios_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_io);
+
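+/* Find a registered ploop_io_ops whose id matches (or any, for
+ * PLOOP_IO_AUTO) and whose autodetect() accepts this io (returns 0);
+ * take a module reference on the match. */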
+static struct ploop_io_ops * ploop_io_get(struct ploop_io *io, unsigned int id)
+{
+	struct ploop_io_ops * ops;
+
+	mutex_lock(&ploop_ios_mutex);
+	list_for_each_entry(ops, &ploop_ios, list) {
+		if ((id == ops->id || id == PLOOP_IO_AUTO) &&
+		    !ops->autodetect(io) && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_ios_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_ios_mutex);
+	return NULL;
+}
+
+void ploop_io_put(struct ploop_io_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+
+int
+ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	int err;
+
+	if (nchunks != 1)
+		return -EINVAL;
+
+	if (pc[0].pctl_offset ||
+	    pc[0].pctl_start ||
+	    pc[0].pctl_len)
+		return -EINVAL;
+
+	memset(&delta->io, 0, sizeof(struct ploop_io));
+	delta->io.plo = delta->plo;
+	delta->io.n_chunks = 1;
+
+	err = -EBADF;
+	delta->io.files.file = fget(pc[0].pctl_fd);
+	if (!delta->io.files.file)
+		goto out_err;
+
+	err = -EOPNOTSUPP;
+	delta->io.ops = ploop_io_get(&delta->io, pc[0].pctl_type);
+	if (delta->io.ops == NULL)
+		goto out_err;
+
+	err = delta->io.ops->init(&delta->io);
+	if (err)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	if (delta->io.files.file)
+		fput(delta->io.files.file);
+	delta->io.files.file = NULL;
+	if (delta->io.ops)
+		ploop_io_put(delta->io.ops);
+	delta->io.ops = NULL;
+	return err;
+}
+EXPORT_SYMBOL(ploop_io_init);
+
+int ploop_io_open(struct ploop_io * io)
+{
+	struct file * file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+	if ((file = io->files.file) == NULL)
+		return -EBADF;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) &&
+	    (io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) &&
+	    !(io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	return io->ops->open(io);
+}
+EXPORT_SYMBOL(ploop_io_open);
+
+void ploop_io_destroy(struct ploop_io * io)
+{
+	if (io->ops) {
+		io->ops->destroy(io);
+		ploop_io_put(io->ops);
+		io->ops = NULL;
+	}
+}
+EXPORT_SYMBOL(ploop_io_destroy);
+
+void ploop_io_report_fn(struct file * file, char * msg)
+{
+	char *fn = "?";
+	char *path;
+
+	path = (char *)__get_free_page(GFP_KERNEL);
+	if (path) {
+		fn = d_path(&file->f_path, path, PAGE_SIZE);
+		if (IS_ERR(fn))
+			fn = "?";
+	}
+
+	printk("%s: %s\n", msg, fn);
+
+	if (path)
+		free_page((unsigned long)path);
+}
+EXPORT_SYMBOL(ploop_io_report_fn);
--- /dev/null
+++ b/drivers/block/ploop/io_direct.c
@@ -0,0 +1,1974 @@
+/*
+ *  drivers/block/ploop/io_direct.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include <linux/falloc.h>
+#include <linux/magic.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+#include <linux/ploop/compat.h>
+#include "ploop_events.h"
+#include "io_direct_map.h"
+
+#define CREATE_TRACE_POINTS
+#include "io_direct_events.h"
+
+/* from fs/ext4/ext4.h */
+#define EXT4_EXTENTS_FL			0x00080000
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define PLOOP_MAX_PREALLOC(plo) (128 * 1024 * 1024) /* 128MB */
+
+#define PLOOP_MAX_EXTENT_MAP (64 * 1024 * 1024)    /* 64MB */
+int max_extent_map_pages __read_mostly;
+int min_extent_map_entries __read_mostly;
+
+/* total sum of m->size for all ploop_mapping structs */
+atomic_long_t ploop_io_images_size = ATOMIC_LONG_INIT(0);
+
+/* Direct IO from/to a file.
+ *
+ * Holes in the image file are not allowed.
+ */
+
+static inline sector_t
+dio_isec_to_phys(struct extent_map * em, sector_t isec)
+{
+	return (isec - em->start) + em->block_start;
+}
+
+DEFINE_BIO_CB(dio_endio_async)
+{
+	struct ploop_request * preq = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err)
+		PLOOP_REQ_SET_ERROR(preq, err);
+
+	ploop_complete_io_request(preq);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_async)
+
+struct bio_list_walk
+{
+	struct bio * cur;
+	int idx;
+	int bv_off;
+};
+
+static int cached_submit(struct ploop_io *io, iblock_t iblk,
+	      struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size, bool use_prealloc);
+
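+/* Submit async direct I/O for a request whose cluster is already mapped to
+ * iblk: walk the bio list, translate image sectors to physical sectors via
+ * the extent tree and chain bios straight to the underlying block device.
+ * A write hitting a fully uninitialized cluster falls back to
+ * cached_submit(); a partially uninitialized one fails the request. */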
+static void
+dio_submit(struct ploop_io *io, struct ploop_request * preq,
+	   unsigned long rw,
+	   struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	struct extent_map * em;
+	sector_t sec, nsec;
+	int err;
+	struct bio_list_walk bw;
+	int write = !!(rw & REQ_WRITE);
+	int delayed_fua = 0;
+
+	trace_submit(preq);
+
+	if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) {
+		/* Mark req that delayed flush required */
+		preq->req_rw |= (REQ_FLUSH | REQ_FUA);
+		delayed_fua = 1;
+	}
+
+	rw &= ~(REQ_FLUSH | REQ_FUA);
+
+
+	bio_list_init(&bl);
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	if ((rw & REQ_WRITE) &&
+	    !(io->files.file->f_mode & FMODE_WRITE)) {
+		err = -EBADF;
+		goto out;
+	}
+
+	sec = sbl->head->bi_sector;
+	sec = ((sector_t)iblk << preq->plo->cluster_log) | (sec & ((1<<preq->plo->cluster_log) - 1));
+
+	em = extent_lookup_create(io, sec, size);
+	if (IS_ERR(em))
+		goto out_em_err;
+
+	if (write && em->uninit) {
+		sector_t end = (sector_t)(iblk + 1) << preq->plo->cluster_log;
+		sec = (sector_t)iblk << preq->plo->cluster_log;
+
+		if (em->start <= sec)
+			sec = em->end;
+		ploop_extent_put(em);
+
+		while (sec < end) {
+			em = extent_lookup_create(io, sec, end - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (!em->uninit)
+				goto write_unint_fail;
+
+			sec = em->end;
+			ploop_extent_put(em);
+		}
+
+		goto write_unint;
+	}
+
+	ploop_prepare_io_request(preq);
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, sec);
+
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+
+	bio = NULL;
+
+	while (size > 0) {
+		struct bio_vec * bv;
+		int copy;
+
+		bv = bw.cur->bi_io_vec + bw.idx;
+
+		if (bw.bv_off >= bv->bv_len) {
+			bw.idx++;
+			bv++;
+			bw.bv_off = 0;
+			if (bw.idx >= bw.cur->bi_vcnt) {
+				bw.cur = bw.cur->bi_next;
+				bw.idx = 0;
+				bv = bw.cur->bi_io_vec;
+			}
+			BUG_ON(bv->bv_len & 511);
+		}
+
+		if (sec >= em->end) {
+			ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, size);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (write && em->uninit)
+				goto write_unint_fail;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (!em->uninit &&
+		     (bio == NULL ||
+		     bio->bi_sector + (bio->bi_size>>9) != nsec)) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = bv->bv_len - bw.bv_off;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+
+		if (em->uninit) {
+			void *kaddr = kmap_atomic(bv->bv_page);
+			memset(kaddr + bv->bv_offset + bw.bv_off, 0, copy);
+			kunmap_atomic(kaddr);
+		} else if (bio_add_page(bio, bv->bv_page, copy,
+				 bv->bv_offset + bw.bv_off) != copy) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * fresh bio.
+			 */
+			goto flush_bio;
+		}
+
+		bio->bi_rw |= bw.cur->bi_rw &
+			(REQ_FLUSH | (delayed_fua ? 0 : REQ_FUA));
+		bw.bv_off += copy;
+		size -= copy >> 9;
+		sec += copy >> 9;
+	}
+	ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		unsigned long rw2 = rw;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
+		submit_bio(rw2, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+write_unint:
+	spin_lock_irq(&preq->plo->lock);
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&preq->plo->lock);
+
+	err = cached_submit(io, iblk, preq, sbl, size, false);
+	goto out;
+
+write_unint_fail:
+	ploop_extent_put(em);
+	err = -EIO;
+	ploop_msg_once(io->plo, "Part of the cluster is in an uninitialized extent.");
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+
+	if (err)
+		PLOOP_FAIL_REQUEST(preq, err);
+}
+
+struct bio_iter {
+	struct bio     *bio;  /* traverses sbl */
+	struct bio_vec *bv;   /* traverses bio->bi_io_vec */
+	int             off;  /* offset in bv payload:
+			       * 0 <= off < bv->bv_len */
+};
+
+static inline void bio_iter_init(struct bio_iter *biter, struct bio_list *sbl)
+{
+	biter->bio  = sbl->head;
+	biter->bv   = biter->bio->bi_io_vec;
+	biter->off  = 0;
+}
+
+static inline void bio_iter_advance(struct bio_iter *biter, int len)
+{
+	if (biter->bv->bv_len - biter->off > len) {
+		biter->off += len;
+		return;
+	}
+
+	BUG_ON (biter->bv->bv_len - biter->off != len);
+
+	biter->bv++;
+	biter->off = 0;
+
+	if (biter->bv - biter->bio->bi_io_vec < biter->bio->bi_vcnt)
+		return;
+
+	biter->bio = biter->bio->bi_next;
+	if (biter->bio)
+		biter->bv = biter->bio->bi_io_vec;
+}
+
+static void bcopy_from_blist(struct page *page, int dst_off, /* dst */
+			     struct bio_iter *biter,         /* src */
+			     int copy_len)                   /* len */
+{
+	u8 *kdst = kmap_atomic(page);
+
+	while (copy_len > 0) {
+		u8 *ksrc;
+		int copy = MIN(copy_len, biter->bv->bv_len - biter->off);
+
+		ksrc = kmap_atomic(biter->bv->bv_page);
+		memcpy(kdst + dst_off,
+		       ksrc + biter->bv->bv_offset + biter->off,
+		       copy);
+		kunmap_atomic(ksrc);
+
+		copy_len -= copy;
+		dst_off  += copy;
+		bio_iter_advance(biter, copy);
+		BUG_ON (copy_len && !biter->bio);
+	}
+
+	kunmap_atomic(kdst);
+}
+
+static inline void bzero_page(struct page *page)
+{
+	void *kaddr = kmap_atomic(page);
+
+	memset(kaddr, 0, PAGE_SIZE);
+
+	kunmap_atomic(kaddr);
+}
+
+static void
+dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
+	       struct bio_list * sbl, unsigned int size,
+	       struct extent_map *em);
+
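+/* Write a newly allocated cluster through the backing fs.  With fallocate
+ * support the cluster is preallocated as an unwritten extent and the data
+ * goes out via dio_submit_pad(); otherwise the cluster is written through
+ * the page cache, writeback is started and the request is queued for the
+ * fsync thread. */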
+static int
+cached_submit(struct ploop_io *io, iblock_t iblk, struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size, bool use_prealloc)
+{
+	struct ploop_device * plo = preq->plo;
+	int err = 0;
+	loff_t pos, end_pos, start, end;
+	loff_t clu_siz = 1 << (plo->cluster_log + 9);
+	struct bio_iter biter;
+	loff_t new_size;
+	loff_t used_pos;
+	bool may_fallocate = io->files.file->f_op->fallocate &&
+		io->files.flags & EXT4_EXTENTS_FL;
+
+	trace_cached_submit(preq);
+
+	pos = (loff_t)iblk << (plo->cluster_log + 9);
+	end_pos = pos + clu_siz;
+	used_pos = (loff_t)(io->alloc_head - 1) << (io->plo->cluster_log + 9);
+
+	file_start_write(io->files.file);
+
+	if (use_prealloc && end_pos > used_pos && may_fallocate) {
+		if (unlikely(io->prealloced_size < clu_siz)) {
+			loff_t prealloc = end_pos;
+			if (prealloc > PLOOP_MAX_PREALLOC(plo))
+				prealloc = PLOOP_MAX_PREALLOC(plo);
+try_again:
+			err = io->files.file->f_op->fallocate(io->files.file, 0,
+							       pos, prealloc);
+			if (err) {
+				if (err == -ENOSPC && prealloc != clu_siz) {
+					prealloc = clu_siz;
+					goto try_again;
+				} else {
+					goto end_write;
+				}
+			}
+
+			/* flush new i_size to disk */
+			err = io->ops->sync(io);
+			if (err)
+				goto end_write;
+
+			io->prealloced_size = prealloc;
+		}
+
+		io->prealloced_size -= clu_siz;
+	}
+
+	if (may_fallocate) {
+		sector_t sec = (sector_t)iblk << preq->plo->cluster_log;
+		sector_t len = 1 << preq->plo->cluster_log;
+		struct extent_map * em = extent_lookup_create(io, sec, len);
+
+		if (unlikely(IS_ERR(em))) {
+			err = PTR_ERR(em);
+			goto end_write;
+		}
+
+		preq->iblock = iblk;
+		preq->eng_io = io;
+		BUG_ON(test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
+		set_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+		dio_submit_pad(io, preq, sbl, size, em);
+		err = 0;
+		goto end_write;
+	}
+
+	bio_iter_init(&biter, sbl);
+	mutex_lock(&io->files.inode->i_mutex);
+
+	start = pos + ((sbl->head->bi_sector & ((1<<plo->cluster_log)-1)) << 9);
+	end = start + (size << 9);
+	ploop_prepare_tracker(preq, start>>9);
+
+	while (pos < end_pos) {
+		struct page * page;
+		void * fsdata;
+
+		err = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (err)
+			break;
+
+		if (pos < start || pos + PAGE_CACHE_SIZE > end)
+			bzero_page(page);
+
+		if (pos < end && pos + PAGE_CACHE_SIZE > start) {
+			int dst_off = 0;
+			int copy_len = PAGE_CACHE_SIZE;
+
+			if (pos < start) {
+				dst_off = start - pos;
+				copy_len -= dst_off;
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - start;
+			} else {
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - pos;
+			}
+
+			bcopy_from_blist(page, dst_off, &biter, copy_len);
+		}
+
+		err = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+					  page, &fsdata);
+		if (err != PAGE_CACHE_SIZE) {
+			if (err >= 0)
+				err = -EIO;
+			break;
+		}
+		err = 0;
+
+		pos += PAGE_CACHE_SIZE;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_add(new_size - *io->size_ptr, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err)
+		err = filemap_fdatawrite(io->files.mapping);
+
+	if (!err) {
+		spin_lock_irq(&plo->lock);
+		ploop_acc_flush_skip_locked(plo, preq->req_rw);
+		preq->iblock = iblk;
+		list_add_tail(&preq->list, &io->fsync_queue);
+		io->fsync_qlen++;
+		plo->st.bio_syncwait++;
+		if ((test_bit(PLOOP_REQ_SYNC, &preq->state) ||
+		     io->fsync_qlen >= plo->tune.fsync_max) &&
+		    waitqueue_active(&io->fsync_waitq))
+			wake_up_interruptible(&io->fsync_waitq);
+		else if (!timer_pending(&io->fsync_timer))
+			mod_timer(&io->fsync_timer, jiffies + plo->tune.fsync_delay);
+		spin_unlock_irq(&plo->lock);
+	}
+end_write:
+	file_end_write(io->files.file);
+	return err;
+}
+
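+/* Runs after the data bios of a freshly allocated cluster have completed:
+ * convert the fallocate()-d unwritten extent to written and, if the request
+ * carried FUA, sync the image right away; otherwise the fsync is delayed. */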
+static void
+dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+{
+	struct ploop_device *plo = preq->plo;
+	sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
+	loff_t clu_siz = 1 << (preq->plo->cluster_log + 9);
+	int force_sync = preq->req_rw & REQ_FUA;
+	int err;
+
+	file_start_write(io->files.file);
+
+	if (!force_sync) {
+		/* Here io->io_count is even ... */
+		spin_lock_irq(&plo->lock);
+		io->io_count++;
+		set_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
+		spin_unlock_irq(&plo->lock);
+	}
+	err = io->files.file->f_op->fallocate(io->files.file,
+					      FALLOC_FL_CONVERT_UNWRITTEN,
+					      (loff_t)sec << 9, clu_siz);
+
+	/* highly unlikely case: FUA coming to a block not provisioned yet */
+	if (!err && force_sync)
+		err = io->ops->sync(io);
+
+	if (!force_sync) {
+		spin_lock_irq(&plo->lock);
+		io->io_count++;
+		spin_unlock_irq(&plo->lock);
+	}
+	/* and here io->io_count is even (+2) again. */
+
+	file_end_write(io->files.file);
+	if (err) {
+		PLOOP_REQ_SET_ERROR(preq, err);
+		set_bit(PLOOP_S_ABORT, &preq->plo->state);
+	}
+}
+
+/* Submit the whole cluster. If preq contains only partial data
+ * within the cluster, pad the rest of cluster with zeros.
+ */
+static void
+dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
+	       struct bio_list * sbl, unsigned int size,
+	       struct extent_map *em)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	sector_t sec, end_sec, nsec, start, end;
+	struct bio_list_walk bw;
+	int err;
+
+	bio_list_init(&bl);
+
+	/* sec..end_sec is the range which we are going to write */
+	sec = (sector_t)preq->iblock << preq->plo->cluster_log;
+	end_sec = sec + (1 << preq->plo->cluster_log);
+
+	/* start..end is data that we have. The rest must be zero padded. */
+	start = sec + (sbl->head->bi_sector & ((1<<preq->plo->cluster_log) - 1));
+	end = start + size;
+
+	if (IS_ERR(em))
+		goto out_em_err;
+
+#if 1
+	/* initialize bw up front to silence a bogus "may be used
+	 * uninitialized" warning from GCC */
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+#endif
+
+	ploop_prepare_io_request(preq);
+	ploop_prepare_tracker(preq, start);
+
+	bio = NULL;
+
+	while (sec < end_sec) {
+		struct page * page;
+		unsigned int poff, plen;
+		bool zero_page;
+
+		if (sec < start) {
+			zero_page = true;
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = start - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else if (sec >= end) {
+			zero_page = true;
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = end_sec - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else {
+			/* sec >= start && sec < end */
+			struct bio_vec * bv;
+			zero_page = false;
+
+			if (sec == start) {
+				bw.cur = sbl->head;
+				bw.idx = 0;
+				bw.bv_off = 0;
+				BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+			}
+			bv = bw.cur->bi_io_vec + bw.idx;
+
+			if (bw.bv_off >= bv->bv_len) {
+				bw.idx++;
+				bv++;
+				bw.bv_off = 0;
+				if (bw.idx >= bw.cur->bi_vcnt) {
+					bw.cur = bw.cur->bi_next;
+					bw.idx = 0;
+					bw.bv_off = 0;
+					bv = bw.cur->bi_io_vec;
+				}
+				BUG_ON(bv->bv_len & 511);
+			}
+
+			page = bv->bv_page;
+			poff = bv->bv_offset + bw.bv_off;
+			plen = (bv->bv_len - bw.bv_off) >> 9;
+		}
+
+		if (sec >= em->end) {
+			ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, end_sec - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		if (plen > em->end - sec)
+			plen = em->end - sec;
+
+		if (bio_add_page(bio, page, plen<<9, poff) != (plen<<9)) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * new bio
+			 */
+			goto flush_bio;
+		}
+
+		/* Handle FLUSH here, dio_post_submit will handle FUA */
+		if (!zero_page)
+			bio->bi_rw |= bw.cur->bi_rw & REQ_FLUSH;
+
+		bw.bv_off += (plen<<9);
+		BUG_ON(plen == 0);
+		sec += plen;
+	}
+	ploop_extent_put(em);
+
+	while (bl.head) {
+		unsigned long rw;
+		struct bio * b = bl.head;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		rw = preq->req_rw & ~(REQ_FLUSH | REQ_FUA);
+		ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
+		submit_bio(rw, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	PLOOP_FAIL_REQUEST(preq, err);
+}
+
+static struct extent_map * dio_fallocate(struct ploop_io *io, u32 iblk, int nr)
+{
+	struct extent_map * em;
+	mutex_lock(&io->files.inode->i_mutex);
+	em = map_extent_get_block(io,
+				  io->files.mapping,
+				  (sector_t)iblk << io->plo->cluster_log,
+				  1 << io->plo->cluster_log,
+				  1, mapping_gfp_mask(io->files.mapping),
+				  NULL);
+	mutex_unlock(&io->files.inode->i_mutex);
+	return em;
+}
+
+
+static void
+dio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	int err;
+	iblock_t iblk = io->alloc_head++;
+
+	trace_submit_alloc(preq);
+
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	/* io->fallocate is not a "posix" fallocate()!
+	 *
+	 * We require the backing fs to give us _uninitialized_ blocks,
+	 * otherwise it does not make sense to go this way.
+	 *
+	 * IMPORTANT: the file _grows_ and dio_submit_alloc() cannot
+	 * complete requests until i_size is committed to disk.
+	 * Read this as: there is no hope of doing this optimally,
+	 * since linux updates i_size synchronously even when O_DIRECT AIO
+	 * is requested. Even in PCSS we have to update i_size synchronously.
+	 * Obviously, we will expand the file by larger pieces
+	 * and take some measures to avoid initialization of the blocks
+	 * while preventing leakage of uninitialized data
+	 * to the user of our device.
+	 */
+	if (io->files.em_tree->_get_extent) {
+		struct extent_map * em;
+
+		em = dio_fallocate(io, iblk, 1);
+		if (unlikely(IS_ERR(em))) {
+			PLOOP_FAIL_REQUEST(preq, PTR_ERR(em));
+			return;
+		}
+
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_DATA_WBI;
+
+		dio_submit_pad(io, preq, sbl, size, em);
+		return;
+	}
+
+	err = cached_submit(io, iblk, preq, sbl, size, true);
+	if (err) {
+		if (err == -ENOSPC)
+			io->alloc_head--;
+		PLOOP_FAIL_REQUEST(preq, err);
+	}
+	preq->eng_state = PLOOP_E_DATA_WBI;
+}
+
+/* When the backing fs does not export any method to allocate new blocks
+ * without initialization, we fall back to a cached write with a subsequent
+ * fsync. Obviously, this is going to be utterly inefficient.
+ *
+ * Here is a workaround. We start writeback, but do not fsync()
+ * immediately; instead we start a timer which wakes up the ploop_sync thread.
+ *
+ * Requests are queued to ploop_sync and, when the timer expires or we
+ * have a lot of requests scheduled for sync, the thread calls the
+ * real fsync.
+ *
+ * Still not sure this is an improvement. :-)
+ */
+
+static int dio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+	u64 io_count;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		LIST_HEAD(list);
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		INIT_LIST_HEAD(&list);
+		list_splice_init(&io->fsync_queue, &list);
+		io_count = io->io_count;
+		spin_unlock_irq(&plo->lock);
+
+		/* filemap_fdatawrite() has been made already */
+		filemap_fdatawait(io->files.mapping);
+
+		err = io->ops->sync(io);
+
+		/* Do we need to invalidate page cache? Not really,
+		 * because we use it only to create full new pages,
+		 * which we overwrite completely. Probably, we should
+		 * invalidate in a non-blocking way to reclaim memory
+		 * faster than it happens with normal LRU logic.
+		 */
+
+		spin_lock_irq(&plo->lock);
+
+		if (io_count == io->io_count && !(io_count & 1))
+			clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
+
+		while (!list_empty(&list)) {
+			struct ploop_request * preq;
+			preq = list_entry(list.next, struct ploop_request, list);
+			list_del(&preq->list);
+			if (err)
+				PLOOP_REQ_SET_ERROR(preq, err);
+
+			__set_bit(PLOOP_REQ_FSYNC_DONE, &preq->state);
+			list_add_tail(&preq->list, &plo->ready_queue);
+			io->fsync_qlen--;
+		}
+		plo->st.bio_fsync++;
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+/* Invalidate the page cache. Called with the inode mutex taken
+ * and the mapping already synced. If any dirty pages remain,
+ * it will fail.
+ *
+ * The retry with fs freeze is required to work around a race (bug?)
+ * in ext3, where some blocks can be held by an uncommitted transaction.
+ * The procedure is dangerous: no other mutexes should be held, and ploop
+ * must not be quiesced.
+ */
+
+static int dio_invalidate_cache(struct ploop_io * io)
+{
+	struct address_space *mapping = io->files.mapping;
+	struct block_device  *bdev    = io->files.bdev;
+	int err;
+	int attempt2 = 0;
+
+retry:
+	err = invalidate_inode_pages2(mapping);
+	if (err) {
+		struct ploop_device *plo = io->plo;
+		struct block_device *dm_crypt_bdev;
+
+		printk("PLOOP: failed to invalidate page cache %d/%d\n", err, attempt2);
+		if (attempt2)
+			return err;
+		attempt2 = 1;
+
+		mutex_unlock(&mapping->host->i_mutex);
+
+		WARN_ONCE(!mutex_is_locked(&plo->ctl_mutex), "ctl_mutex is not held");
+		dm_crypt_bdev = __ploop_get_dm_crypt_bdev(plo);
+		if (dm_crypt_bdev)
+			bdev = dm_crypt_bdev;
+		else
+			bdgrab(bdev);
+
+		thaw_bdev(bdev, freeze_bdev(bdev));
+		bdput(bdev);
+
+		mutex_lock(&mapping->host->i_mutex);
+		goto retry;
+	}
+	return err;
+}
+
+static int dio_truncate(struct ploop_io *, struct file *, __u32);
+
+static int dio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = dio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void dio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		(void)dio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			io->files.em_tree = NULL;
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_dio_close(io, delta->flags & PLOOP_FMT_RDONLY);
+			(void)dio_invalidate_cache(io);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		del_timer_sync(&io->fsync_timer);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		if (!(delta->flags & PLOOP_FMT_RDONLY))
+			file_update_time(file);
+		fput(file);
+	}
+}
+
+static int dio_sync(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	int err = 0;
+
+	if (file)
+		err = file->f_op->fsync(file, 0, LLONG_MAX, 0);
+
+	return err;
+}
+
+static int dio_stop(struct ploop_io * io)
+{
+	return io->ops->sync(io);
+}
+
+static int dio_open(struct ploop_io * io)
+{
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err = 0;
+	struct file * file = io->files.file;
+	struct extent_map_tree * em_tree;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	err = io->ops->sync(io);
+	if (err)
+		return err;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	em_tree = ploop_dio_open(io, (delta->flags & PLOOP_FMT_RDONLY));
+	err = PTR_ERR(em_tree);
+	if (IS_ERR(em_tree))
+		goto out;
+
+	io->files.em_tree = em_tree;
+
+	err = dio_invalidate_cache(io);
+	if (err) {
+		io->files.em_tree = NULL;
+		ploop_dio_close(io, 0);
+		goto out;
+	}
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) && !io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			io->files.em_tree = NULL;
+			ploop_dio_close(io, 0);
+			goto out;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+out:
+	mutex_unlock(&io->files.inode->i_mutex);
+	return err;
+}
+
+void fsync_timeout(unsigned long data)
+{
+	struct ploop_io * io = (void*)data;
+
+	wake_up_interruptible(&io->fsync_waitq);
+}
+
+static int
+dio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+	init_timer(&io->fsync_timer);
+	io->fsync_timer.function = fsync_timeout;
+	io->fsync_timer.data = (unsigned long)io;
+
+	return 0;
+}
+
+struct dio_comp
+{
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+DEFINE_BIO_CB(dio_endio_sync)
+{
+	struct dio_comp * comp = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err && !comp->error)
+		comp->error = err;
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_sync)
+
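+/* Synchronously read or write a byte range backed by a single page: build
+ * bios against the mapped extents, submit them and wait for completion. */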
+static int
+dio_sync_io(struct ploop_io * io, int rw, struct page * page,
+	    unsigned int len, unsigned int off, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+
+	BUG_ON(len & 511);
+	BUG_ON(off & 511);
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw, b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+	      unsigned int off, sector_t pos)
+{
+	return dio_sync_io(io, READ_SYNC, page, len, off, pos);
+}
+
+static int
+dio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+	       unsigned int off, sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_io(io, WRITE_SYNC, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
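+/* Same as dio_sync_io(), but the payload is a vector of nr whole pages. */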
+static int
+dio_sync_iovec(struct ploop_io * io, int rw, struct page ** pvec,
+	       unsigned int nr, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	unsigned int len = PAGE_SIZE * nr;
+	unsigned int off;
+	struct extent_map * em;
+	int err;
+	sector_t nsec;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (off/PAGE_SIZE != (off + copy + 1)/PAGE_SIZE)
+			copy = PAGE_SIZE - (off & (PAGE_SIZE-1));
+		if (bio_add_page(bio, pvec[off/PAGE_SIZE], copy,
+				 off & (PAGE_SIZE-1) ) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw, b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		 sector_t sec)
+{
+	return dio_sync_iovec(io, READ_SYNC, pvec, nr, sec);
+}
+
+static int
+dio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		  sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_iovec(io, WRITE_SYNC, pvec, nr, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
+/*
+ * Allocate and zero a new block in the file, through the page cache.
+ * It is assumed there is no point in optimizing this: it is used
+ * (for the ploop1 format) only for allocation of index clusters.  Another
+ * use-case is growing a raw delta, but that is assumed to be rare.
+ */
+static int dio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	int err;
+	int ret;
+	struct page *pad = NULL;
+	int pad_len = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (pos + len > i_size_read(io->files.inode) &&
+	    io->files.file->f_op->fallocate) {
+		err = io->files.file->f_op->fallocate(io->files.file, 0,
+						       pos, len);
+		if (err)
+			return err;
+	}
+
+	if (pad_len) {
+		BUILD_BUG_ON(PAGE_SIZE != PAGE_CACHE_SIZE);
+
+		pad = alloc_page(GFP_NOFS);
+		if (pad == NULL)
+			return -ENOMEM;
+
+		len += pad_len;
+		pos -= pad_len;
+
+		err = dio_sync_read(io, pad, pad_len, 0, pos >> 9);
+		if (err) {
+			put_page(pad);
+			return err;
+		}
+	}
+
+	err = 0;
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	while (len > 0) {
+		struct page *page;
+		void *fsdata;
+		ret = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (ret) {
+			err = ret;
+			mutex_unlock(&io->files.inode->i_mutex);
+			goto fail;
+		}
+
+		bzero_page(page);
+
+		if (pad) {
+			memcpy(page_address(page), page_address(pad), pad_len);
+			put_page(pad);
+			pad = NULL;
+		}
+
+		ret = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE,
+					  PAGE_CACHE_SIZE, page, fsdata);
+		if (ret != PAGE_CACHE_SIZE) {
+			err = ret < 0 ? ret : -EIO;
+			mutex_unlock(&io->files.inode->i_mutex);
+			goto fail;
+		}
+
+		len -= PAGE_CACHE_SIZE;
+		pos += PAGE_CACHE_SIZE;
+	}
+
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	err = filemap_fdatawrite(io->files.mapping);
+	if (err)
+		goto fail;
+
+	err = io->ops->sync(io);
+	if (err)
+		goto fail;
+
+	err = filemap_fdatawait(io->files.mapping);
+
+fail:
+	if (pad)
+		put_page(pad);
+
+	if (!err)
+		io->alloc_head = pos >> (io->plo->cluster_log + 9);
+
+	return err;
+}
+
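+/* Asynchronously read or write one page at image sector 'sec'; completion
+ * is tracked through preq->io_count like the regular data path. */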
+static void
+dio_io_page(struct ploop_io * io, unsigned long rw,
+	    struct ploop_request * preq, struct page * page,
+	    sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	unsigned int len;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+	int off;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	ploop_prepare_io_request(preq);
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, sec);
+
+	len = PAGE_SIZE;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_async;
+		b->bi_private = preq;
+		atomic_inc(&preq->io_count);
+		ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
+		submit_bio(rw, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	PLOOP_FAIL_REQUEST(preq, err);
+}
+
+static void
+dio_read_page(struct ploop_io * io, struct ploop_request * preq,
+	      struct page * page, sector_t sec)
+{
+	dio_io_page(io, READ | REQ_SYNC, preq, page, sec);
+}
+
+static void
+dio_write_page(struct ploop_io * io, struct ploop_request * preq,
+	       struct page * page, sector_t sec, unsigned long rw)
+{
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
+}
+
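+/* Fast path: remap orig_bio directly onto the backing block device when the
+ * whole bio fits into a single cached extent.  Returns 0 on success and 1
+ * to force the slow path. */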
+static int
+dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
+	    struct bio * bio, sector_t isec)
+{
+	struct request_queue * q;
+	struct extent_map * em;
+	int i;
+
+	if (unlikely((orig_bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+		     test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)))
+		return 1;
+
+	if (orig_bio->bi_size == 0) {
+		bio->bi_vcnt   = 0;
+		bio->bi_sector = 0;
+		bio->bi_size   = 0;
+		bio->bi_idx    = 0;
+
+		bio->bi_rw   = orig_bio->bi_rw;
+		bio->bi_bdev = io->files.bdev;
+		return 0;
+	}
+
+	em = extent_lookup(io->files.em_tree, isec);
+
+	if (em == NULL) {
+		io->plo->st.fast_neg_noem++;
+		return 1;
+	}
+
+	if (isec + (orig_bio->bi_size>>9) > em->end) {
+		io->plo->st.fast_neg_shortem++;
+		ploop_extent_put(em);
+		return 1;
+	}
+
+	BUG_ON(bio->bi_max_vecs < orig_bio->bi_vcnt);
+
+	memcpy(bio->bi_io_vec, orig_bio->bi_io_vec,
+	       orig_bio->bi_vcnt * sizeof(struct bio_vec));
+
+	bio->bi_sector = dio_isec_to_phys(em, isec);
+	ploop_extent_put(em);
+
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_rw = orig_bio->bi_rw;
+	bio->bi_vcnt = orig_bio->bi_vcnt;
+	bio->bi_size = orig_bio->bi_size;
+	bio->bi_idx = orig_bio->bi_idx;
+
+	q = bdev_get_queue(bio->bi_bdev);
+
+	if (q->merge_bvec_fn == NULL)
+		return 0;
+
+	bio->bi_size = 0;
+	bio->bi_vcnt = 0;
+
+	for (i = 0; i < orig_bio->bi_vcnt; i++) {
+		struct bio_vec * bv = &bio->bi_io_vec[i];
+		struct bvec_merge_data bm_data = {
+			.bi_bdev = bio->bi_bdev,
+			.bi_sector = bio->bi_sector,
+			.bi_size = bio->bi_size,
+			.bi_rw = bio->bi_rw,
+		};
+		if (q->merge_bvec_fn(q, &bm_data, bv) < bv->bv_len) {
+			io->plo->st.fast_neg_backing++;
+			return 1;
+		}
+		bio->bi_size += bv->bv_len;
+		bio->bi_vcnt++;
+	}
+	return 0;
+}
+
+/* Merge is disabled _only_ if we _have_ a resolved mapping and
+ * we are sure the bio is going to be split in any case due to
+ * file-level fragmentation.
+ */
+static int
+dio_disable_merge(struct ploop_io * io, sector_t isector, unsigned int len)
+{
+	int ret = 0;
+	struct extent_map * em;
+
+	em = extent_lookup(io->files.em_tree, isector);
+	if (em) {
+		if (isector + len > em->end)
+			ret = 1;
+		ploop_extent_put(em);
+	}
+	return ret;
+}
+
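+/* Reopen the image file read-only in preparation for a snapshot: sync dirty
+ * data and drop the page cache, then hand the new struct file back via sd. */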
+static int dio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+	struct path	path;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDONLY|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = io->ops->sync(io);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = dio_invalidate_cache(io);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int dio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = dio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	mutex_lock(&io->files.inode->i_mutex);
+	ploop_dio_downgrade(io->files.mapping);
+	BUG_ON((loff_t)io->alloc_head << (io->plo->cluster_log + 9) !=
+	       i_size_read(io->files.inode));
+	(void)invalidate_inode_pages2(io->files.mapping);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int dio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+	struct path	path;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDWR|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = io->ops->sync(io);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	err = dio_invalidate_cache(io);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+
+	err = ploop_dio_upgrade(io);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (!io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  io->plo->index);
+		if (io->fsync_thread == NULL) {
+			fput(file);
+			return -ENOMEM;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int dio_truncate(struct ploop_io * io, struct file * file,
+			__u32 alloc_head)
+{
+	int err;
+	struct iattr newattrs;
+	loff_t new_size;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size = (u64)alloc_head << (io->plo->cluster_log + 9);
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	if (io->files.em_tree)
+		trim_extent_mappings(io->files.em_tree, newattrs.ia_size>>9);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs, NULL);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_sub(*io->size_ptr - new_size, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err) {
+		if (io->files.file == file)
+			err = io->ops->sync(io);
+		else
+			err = file->f_op->fsync(file, 0, LLONG_MAX, 0);
+	}
+
+	return err;
+}
+
+static int dio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static void dio_unplug(struct ploop_io * io)
+{
+	/* Needs more thought on how to implement unplug */
+}
+
+static int dio_congested(struct ploop_io * io, int bits)
+{
+	struct request_queue *bq;
+
+	bq = bdev_get_queue(io->files.bdev);
+
+	return bdi_congested(&bq->backing_dev_info, bits);
+}
+
+static void dio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+	blk_queue_stack_limits(q, bdev_get_queue(io->files.bdev));
+}
+
+static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (unlikely(!bio)) {
+		PLOOP_FAIL_REQUEST(preq, -ENOMEM);
+		return;
+	}
+
+	ploop_prepare_io_request(preq);
+	bio->bi_end_io = dio_endio_async;
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_private = preq;
+
+	atomic_inc(&preq->io_count);
+	ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw);
+	submit_bio(WRITE_FLUSH, bio);
+	ploop_complete_io_request(preq);
+}
+
+static int dio_dump(struct ploop_io * io)
+{
+	extern void dump_extent_map(struct extent_map_tree *tree);
+
+	if (io->files.em_tree) {
+		dump_extent_map(io->files.em_tree);
+		return 0;
+	}
+	return -1;
+}
+
+static int dio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+	char         * s_id  = inode->i_sb->s_id;
+
+	int err;
+	mm_segment_t fs;
+	unsigned int flags;
+
+	if (inode->i_sb->s_magic != EXT4_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (inode->i_sb->s_bdev == NULL) {
+		printk("File on FS EXT(%s) without backing device\n", s_id);
+		return -1;
+	}
+
+	if (!file->f_op->fallocate)
+		ploop_io_report_fn(file, KERN_WARNING
+					"File on FS w/o fallocate");
+
+	if (!file->f_op->unlocked_ioctl) {
+		printk("Cannot run on EXT4(%s): no unlocked_ioctl\n", s_id);
+		return -1;
+	}
+
+	if (!file->f_op->fsync) {
+		printk("Cannot run on EXT4(%s): no fsync\n", s_id);
+		return -1;
+	}
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	flags = 0;
+	err = file->f_op->unlocked_ioctl(file, FS_IOC_GETFLAGS, (long)&flags);
+	set_fs(fs);
+
+	if (err != 0) {
+		printk("Cannot run on EXT4(%s): failed FS_IOC_GETFLAGS (%d)\n",
+		       s_id, err);
+		return -1;
+	}
+
+	io->files.flags = flags;
+	if (!(flags & EXT4_EXTENTS_FL))
+		ploop_io_report_fn(file, KERN_WARNING "File w/o extents");
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_direct =
+{
+	.id		=	PLOOP_IO_DIRECT,
+	.name		=	"direct",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	dio_unplug,
+	.congested	=	dio_congested,
+
+	.alloc		=	dio_alloc_sync,
+	.submit		=	dio_submit,
+	.submit_alloc	=	dio_submit_alloc,
+	.post_submit	=	dio_post_submit,
+	.disable_merge	=	dio_disable_merge,
+	.fastmap	=	dio_fastmap,
+	.read_page	=	dio_read_page,
+	.write_page	=	dio_write_page,
+	.sync_read	=	dio_sync_read,
+	.sync_write	=	dio_sync_write,
+	.sync_readvec	=	dio_sync_readvec,
+	.sync_writevec	=	dio_sync_writevec,
+
+	.init		=	dio_init,
+	.destroy	=	dio_destroy,
+	.open		=	dio_open,
+	.sync		=	dio_sync,
+	.stop		=	dio_stop,
+	.prepare_snapshot =	dio_prepare_snapshot,
+	.complete_snapshot =	dio_complete_snapshot,
+	.prepare_merge  =	dio_prepare_merge,
+	.start_merge	=	dio_start_merge,
+	.truncate	=	dio_truncate,
+
+	.queue_settings	=	dio_queue_settings,
+	.issue_flush	=	dio_issue_flush,
+
+	.dump		=	dio_dump,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       dio_autodetect,
+};
+
+module_param(max_extent_map_pages, int, 0644);
+MODULE_PARM_DESC(max_extent_map_pages, "Maximum number of pages used by all extent map caches");
+module_param(min_extent_map_entries, int, 0644);
+MODULE_PARM_DESC(min_extent_map_entries, "Minimum number of entries in a single extent map cache");
+
+static int __init pio_direct_mod_init(void)
+{
+	int err;
+
+	if (max_extent_map_pages == 0)
+		max_extent_map_pages = PLOOP_MAX_EXTENT_MAP >> PAGE_SHIFT;
+
+	if (min_extent_map_entries == 0)
+		min_extent_map_entries = 64;
+
+	err = ploop_extent_map_init();
+	if (!err) {
+		err = ploop_register_io(&ploop_io_ops_direct);
+		if (err)
+			ploop_extent_map_exit();
+	}
+
+	return err;
+}
+
+static void __exit pio_direct_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_direct);
+	ploop_extent_map_exit();
+	BUG_ON(atomic_long_read(&ploop_io_images_size));
+}
+
+module_init(pio_direct_mod_init);
+module_exit(pio_direct_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/io_direct_events.h
@@ -0,0 +1,49 @@
+/*
+ *  drivers/block/ploop/io_direct_events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_IO_DIRECT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_DIRECT_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "io_direct_map.h"
+
+TRACE_EVENT(add_extent_mapping,
+	TP_PROTO(struct extent_map *em),
+
+	TP_ARGS(em),
+
+	TP_STRUCT__entry(
+		__field(sector_t,  start)
+		__field(sector_t,  end)
+		__field(sector_t,  bstart)
+	),
+
+	TP_fast_assign(
+		__entry->start	= em->start;
+		__entry->end	= em->end;
+		__entry->bstart	= em->block_start;
+	),
+
+	TP_printk("start=0x%lx end=0x%lx block_start=0x%lx",
+			__entry->start, __entry->end, __entry->bstart)
+);
+
+#endif /* _TRACE_PLOOP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE io_direct_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+++ b/drivers/block/ploop/io_direct_map.c
@@ -0,0 +1,863 @@
+/*
+ *  drivers/block/ploop/io_direct_map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/ploop/ploop_if.h>
+#include "io_direct_events.h"
+#include "io_direct_map.h"
+
+/* Part of io_direct shared between all the devices.
+ * This code is not pretty, but it is the best we can do without
+ * modifying the core kernel.
+ *
+ * Keep track of images opened by ploop. Maintain shared extent
+ * maps for shared images, which are open read-only. Top level
+ * deltas, which are open for write, are open exclusively.
+ *
+ * Also takes care of setting/clearing S_SWAPFILE and of restricting
+ * the mapping gfp mask to GFP_NOFS.
+ */
+
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers;
+	unsigned long		saved_gfp_mask;
+	loff_t			size;
+
+	struct extent_map_tree	extent_root;
+};
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+/* total number of extent_map structures */
+static atomic_t ploop_extent_maps_count = ATOMIC_INIT(0);
+
+static void extent_map_tree_init(struct extent_map_tree *tree);
+static int drop_extent_map(struct extent_map_tree *tree);
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+
+extern atomic_long_t ploop_io_images_size;
+
+/*
+ * ploop_dio_* functions must be called with i_mutex taken.
+ */
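+/*
+ * Sketch of the expected caller-side pattern (illustration only, not part
+ * of this file; "io", "rdonly" and "tree" are the caller's own variables):
+ *
+ *	mutex_lock(&inode->i_mutex);
+ *	tree = ploop_dio_open(io, rdonly);
+ *	mutex_unlock(&inode->i_mutex);
+ *	if (IS_ERR(tree))
+ *		return PTR_ERR(tree);
+ *	...
+ *	mutex_lock(&inode->i_mutex);
+ *	ploop_dio_close(io, rdonly);
+ *	mutex_unlock(&inode->i_mutex);
+ */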
+
+struct extent_map_tree *
+ploop_dio_open(struct ploop_io * io, int rdonly)
+{
+	int err;
+	struct ploop_mapping *m, *pm;
+	struct file * file = io->files.file;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	err = 0;
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+
+out_unlock:
+			spin_unlock(&ploop_mappings_lock);
+			if (pm)
+				kfree(pm);
+			if (!err)
+				io->size_ptr = &m->size;
+			return err ? ERR_PTR(err) : &m->extent_root;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	pm->mapping = mapping;
+	extent_map_tree_init(&pm->extent_root);
+	pm->extent_root.mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	mapping->host->i_flags |= S_SWAPFILE;
+	io->size_ptr = &pm->size;
+	*io->size_ptr = i_size_read(mapping->host);
+	atomic_long_add(*io->size_ptr, &ploop_io_images_size);
+
+	pm->saved_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping,
+			     pm->saved_gfp_mask & ~__GFP_FS);
+
+	spin_unlock(&ploop_mappings_lock);
+
+	if (strcmp(mapping->host->i_sb->s_type->name, "pcss") == 0) {
+		struct ploop_xops xops;
+		if (file->f_op->unlocked_ioctl) {
+			mm_segment_t fs = get_fs();
+
+			set_fs(KERNEL_DS);
+			xops.magic = 0;
+			err = file->f_op->unlocked_ioctl(file, PLOOP_IOC_INTERNAL, (long)&xops);
+			set_fs(fs);
+			if (err == 0 && xops.magic == PLOOP_INTERNAL_MAGIC)
+				pm->extent_root._get_extent = xops.get_extent;
+		}
+	}
+	return &pm->extent_root;
+}
+
+int
+ploop_dio_close(struct ploop_io * io, int rdonly)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				atomic_long_sub(*io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = 0;
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		drop_extent_map(&pm->extent_root);
+		BUG_ON(pm->extent_root.map_size);
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_dio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_dio_upgrade(struct ploop_io * io)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				loff_t new_size = i_size_read(io->files.inode);
+				atomic_long_add(new_size - *io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = new_size;
+
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
+
+
+/* The rest of the file was originally written by Jens Axboe.
+ * I only fixed a few bugs (requests not aligned to the fs block size
+ * because direct-io aligns to 512) and trimmed some unused functionality.
+ *
+ * In any case, it must be reworked: not only because of GPL, but also
+ * because it is not good.
+ */
+
+static struct kmem_cache *extent_map_cache;
+
+int __init ploop_extent_map_init(void)
+{
+	extent_map_cache = kmem_cache_create("ploop_itree",
+						sizeof(struct extent_map), 0,
+						SLAB_MEM_SPREAD, NULL
+						);
+	if (!extent_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ploop_extent_map_exit(void)
+{
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+}
+
+static void extent_map_tree_init(struct extent_map_tree *tree)
+{
+	tree->map.rb_node = NULL;
+	INIT_LIST_HEAD(&tree->lru_list);
+	tree->map_size = 0;
+	rwlock_init(&tree->lock);
+}
+
+struct extent_map *ploop_alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em;
+
+	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+	if (em) {
+		atomic_set(&em->refs, 1);
+		INIT_LIST_HEAD(&em->lru_link);
+		atomic_inc(&ploop_extent_maps_count);
+		em->uninit = false;
+	}
+	return em;
+}
+
+void ploop_extent_put(struct extent_map *em)
+{
+	if (!em)
+		return;
+	if (atomic_dec_and_test(&em->refs)) {
+		atomic_dec(&ploop_extent_maps_count);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, sector_t start,
+				   sector_t end, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct extent_map *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		if (end <= entry->start)
+			p = &(*p)->rb_left;
+		else if (start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/* Find the extent which contains "offset". If there is no such extent,
+ * *prev_ret is set to the first extent following "offset".
+ */
+static struct rb_node *__tree_search(struct rb_root *root, sector_t offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct extent_map, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset >= entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while (prev && offset >= prev_entry->end) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+/* Find the first extent which could intersect a range starting at offset.
+ * Note that it may not contain offset itself.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+static int tree_delete(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *node;
+
+	node = __tree_search(root, offset, NULL);
+	if (!node)
+		return -ENOENT;
+	rb_erase(node, root);
+	return 0;
+}
+
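+/*
+ * Two extents merge only when they are adjacent both logically and
+ * physically.  Illustrative numbers: prev = [0, 8) at block 100 and
+ * next = [8, 16) at block 108 merge into [0, 16) at block 100; if next
+ * started at block 200 instead, the extents would stay separate.
+ */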
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+	if (prev->end == next->start &&
+	    next->block_start == extent_map_block_end(prev))
+		return 1;
+	return 0;
+}
+
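+/*
+ * Purge policy, with illustrative numbers (not hard-coded anywhere): given
+ * max_entries = 2048 and ploop_io_images_size = 100 GB, an image with
+ * i_size = 10 GB gets a fair share of roughly 2048 * 10 / 100 ~= 205
+ * cached extents.  A tree is trimmed only when the global count is over
+ * budget, the tree holds more than min_extent_map_entries, and it exceeds
+ * its proportional share.
+ */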
+static inline int purge_lru_mapping(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	return atomic_read(&ploop_extent_maps_count) > max_entries &&
+	       tree->map_size > max(1, min_extent_map_entries) &&
+	       (u64)tree->map_size * atomic_long_read(&ploop_io_images_size) >
+	       (u64)max_entries * i_size_read(tree->mapping->host);
+}
+
+static inline void purge_lru_warn(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	loff_t ratio = i_size_read(tree->mapping->host) * 100;
+	do_div(ratio, atomic_long_read(&ploop_io_images_size));
+
+	printk(KERN_WARNING "Purging lru entry from extent tree for inode %ld "
+	       "(map_size=%d ratio=%lld%%)\n",
+	       tree->mapping->host->i_ino, tree->map_size, ratio);
+
+	/* Claim FS as 'too fragmented' if average_extent_size < 8MB */
+	if ((u64)max_entries * (8 * 1024 * 1024) <
+	    atomic_long_read(&ploop_io_images_size))
+		printk(KERN_WARNING "max_extent_map_pages=%d is too low for "
+		       "ploop_io_images_size=%ld bytes\n",
+		       max_extent_map_pages,
+		       atomic_long_read(&ploop_io_images_size));
+	else {
+		loff_t avg_siz = i_size_read(tree->mapping->host);
+		do_div(avg_siz, tree->map_size);
+
+		printk(KERN_WARNING "host fs is too fragmented: average extent"
+		       " size is lesser than %lld bytes\n", avg_siz);
+	}
+}
+
+/*
+ * add_extent_mapping tries a simple forward/backward merge with existing
+ * mappings.  The extent_map struct passed in will be inserted into
+ * the tree directly (no copies made, just a reference taken).
+ */
+static int add_extent_mapping(struct extent_map_tree *tree,
+			      struct extent_map *em)
+{
+	int ret = 0;
+	struct rb_node *rb;
+
+	write_lock_irq(&tree->lock);
+
+	do {
+		rb = tree_insert(&tree->map, em->start, em->end, &em->rb_node);
+		/* A part of this extent can be in tree */
+		if (rb) {
+			struct extent_map *tmp =
+				rb_entry(rb, struct extent_map, rb_node);
+			BUG_ON(tmp->block_start - tmp->start !=
+					em->block_start - em->start);
+			if (tmp->start <= em->start &&
+			    tmp->end >= em->end) {
+				ret =  -EEXIST;
+				goto out;
+			}
+			if (tmp->start < em->start) {
+				em->start = tmp->start;
+				em->block_start = tmp->block_start;
+			}
+			if (tmp->end > em->end)
+				em->end = tmp->end;
+			rb_erase(rb, &tree->map);
+			list_del_init(&tmp->lru_link);
+			tree->map_size--;
+			ploop_extent_put(tmp);
+		} else {
+			list_add_tail(&em->lru_link, &tree->lru_list);
+			tree->map_size++;
+
+			if (purge_lru_mapping(tree)) {
+				struct extent_map *victim_em;
+				static unsigned long purge_lru_time;
+
+				/* Warn about this once per hour */
+				if (printk_timed_ratelimit(&purge_lru_time,
+							   60*60*HZ))
+					purge_lru_warn(tree);
+
+				victim_em = list_entry(tree->lru_list.next,
+						       struct extent_map,
+						       lru_link);
+
+				list_del_init(&victim_em->lru_link);
+				tree->map_size--;
+				rb_erase(&victim_em->rb_node, &tree->map);
+				ploop_extent_put(victim_em);
+			}
+		}
+	} while (rb);
+
+	atomic_inc(&em->refs);
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb) {
+			struct extent_map *merge;
+
+			merge = rb_entry(rb, struct extent_map, rb_node);
+			if (mergable_maps(merge, em)) {
+				em->start = merge->start;
+				em->block_start = merge->block_start;
+				rb_erase(&merge->rb_node, &tree->map);
+				list_del_init(&merge->lru_link);
+				tree->map_size--;
+				ploop_extent_put(merge);
+			}
+		}
+	}
+	rb = rb_next(&em->rb_node);
+	if (rb) {
+		struct extent_map *merge;
+
+		merge = rb_entry(rb, struct extent_map, rb_node);
+		if (mergable_maps(em, merge)) {
+			em->end = merge->end;
+			rb_erase(&merge->rb_node, &tree->map);
+			list_del_init(&merge->lru_link);
+			tree->map_size--;
+			ploop_extent_put(merge);
+		}
+	}
+
+	trace_add_extent_mapping(em);
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+struct extent_map *
+extent_lookup(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em = NULL;
+	struct rb_node *rb_node;
+
+	read_lock(&tree->lock);
+	rb_node = __tree_search(&tree->map, start, NULL);
+	if (rb_node) {
+		em = rb_entry(rb_node, struct extent_map, rb_node);
+		atomic_inc(&em->refs);
+	}
+	read_unlock(&tree->lock);
+
+	if (em) {
+		write_lock(&tree->lock);
+		/* em cannot have been freed (we hold a reference), but it
+		 * may have been removed from the LRU list before we
+		 * re-acquired the lock */
+		if (!list_empty(&em->lru_link)) {
+			list_del(&em->lru_link);
+			list_add_tail(&em->lru_link, &tree->lru_list);
+		}
+		write_unlock(&tree->lock);
+	}
+
+	return em;
+}
+
+/*
+ * lookup_extent_mapping returns the first extent_map struct in the
+ * tree that intersects the [start, start+len) range.  There may
+ * be additional objects in the tree that intersect, so check the object
+ * returned carefully to make sure you don't need additional lookups.
+ */
+static struct extent_map *
+lookup_extent_mapping(struct extent_map_tree *tree, sector_t start, sector_t len)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+
+	read_lock_irq(&tree->lock);
+	rb_node = tree_search(&tree->map, start);
+	if (!rb_node) {
+		em = NULL;
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	if (em->end <= start || em->start >= start + len) {
+		em = NULL;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+
+/*
+ * Removes an extent_map struct from the tree.  No reference counts are
+ * dropped, and no checks are done to see if the range is in use.
+ */
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int ret;
+
+	write_lock_irq(&tree->lock);
+	ret = tree_delete(&tree->map, em->start);
+	if (!ret) {
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+	}
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+static struct extent_map *__map_extent_get_extent(struct extent_map_tree *tree,
+						  struct address_space *mapping,
+						  sector_t start, sector_t len, int create,
+						  gfp_t gfp_mask)
+{
+	struct inode *inode = mapping->host;
+	struct extent_map *em;
+	sector_t nstart, result;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		if (em->start <= start && em->end >= start + len)
+			return em;
+
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else if (!create) {
+			return em;
+		}
+		ploop_extent_put(em);
+	}
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	em = ploop_alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * FIXME if there are errors later on, we end up exposing stale
+	 * data on disk while filling holes.
+	 *
+	 * _XXX_ Danger! len is reduced above, therefore _get_extent
+	 * does not allocate all that we need. It works only with pcss
+	 * and only when cluster size <= pcss block size and allocation
+	 * is aligned. If we relax those conditions, the code must be fixed.
+	 */
+	ret = tree->_get_extent(inode, start, len, &nstart, &result, create);
+	if (ret < 0) {
+		ploop_extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	em->start = nstart;
+	em->end = nstart + ret;
+	em->block_start = result;
+
+	ret = add_extent_mapping(tree, em);
+	if (ret == -EEXIST) {
+		ploop_extent_put(em);
+		goto again;
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent_bmap(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, gfp_t gfp_mask)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+	struct inode *inode = mapping->host;
+	loff_t start_off = (loff_t)start << 9;
+	struct extent_map *em;
+	struct fiemap_extent_info fieinfo;
+	struct fiemap_extent fi_extent;
+	mm_segment_t old_fs;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else {
+			return em;
+		}
+		ploop_extent_put(em);
+	}
+
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	if (!inode->i_op->fiemap)
+		return ERR_PTR(-EINVAL);
+
+	em = ploop_alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	fieinfo.fi_extents_start = &fi_extent;
+	fieinfo.fi_extents_max = 1;
+	fieinfo.fi_flags = 0;
+	fieinfo.fi_extents_mapped = 0;
+	fi_extent.fe_flags = 0;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = inode->i_op->fiemap(inode, &fieinfo, start_off, 1);
+
+	/* chase for PSBM-26762: em->block_start == 0 */
+	if (!ret && fieinfo.fi_extents_mapped == 1 &&
+	    !(fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) &&
+	    (fi_extent.fe_physical >> 9) == 0) {
+		/* see how ext4_fill_fiemap_extents() is implemented */
+		if (!(fi_extent.fe_flags & FIEMAP_EXTENT_DELALLOC)) {
+			printk("bad fiemap(%ld,%ld) on inode=%p &fieinfo=%p"
+			" i_size=%lld\n", start, len, inode, &fieinfo,
+			i_size_read(inode));
+			BUG();
+		}
+		/* complain about the delalloc case -- ploop always fallocates
+		 * before buffered writes */
+		WARN(1, "ploop%d: delalloc extent [%lld,%lld] for [%lld,%ld];"
+			" i_size=%lld\n", io->plo->index, fi_extent.fe_logical,
+			fi_extent.fe_length, start_off, len << 9, i_size_read(inode));
+		ret = -ENOENT;
+	}
+	set_fs(old_fs);
+
+	if (ret) {
+		ploop_extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	if (fieinfo.fi_extents_mapped != 1) {
+		if (start_off < i_size_read(inode))
+			ploop_msg_once(io->plo, "a hole in image file detected"
+				       " (mapped=%d i_size=%llu off=%llu)",
+				       fieinfo.fi_extents_mapped,
+				       i_size_read(inode), start_off);
+		ploop_extent_put(em);
+		return ERR_PTR(-EINVAL);
+	}
+
+	em->start = fi_extent.fe_logical >> 9;
+	em->end = (fi_extent.fe_logical + fi_extent.fe_length) >> 9;
+	em->block_start = fi_extent.fe_physical >> 9;
+
+	if (fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
+		em->uninit = true;
+	} else {
+		ret = add_extent_mapping(tree, em);
+		if (ret == -EEXIST) {
+			ploop_extent_put(em);
+			goto again;
+		}
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, int create,
+				       gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	if (tree->_get_extent)
+		return __map_extent_get_extent(tree, mapping, start, len, create,
+					       gfp_mask);
+	if (create)
+		/* create flag not supported by bmap implementation */
+		return ERR_PTR(-EINVAL);
+
+	return __map_extent_bmap(io, mapping, start,len, gfp_mask);
+}
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map *em;
+	sector_t last;
+	sector_t map_ahead_len = 0;
+
+	em = __map_extent(io, mapping, start, len, create,
+			  gfp_mask, get_block);
+
+	/*
+	 * if we're doing a write or we found a large extent, return it
+	 */
+	if (IS_ERR(em) || !em || create || start + len < em->end) {
+		return em;
+	}
+
+	/*
+	 * otherwise, try to walk forward a bit and see if we can build
+	 * something bigger.
+	 */
+	do {
+		/* avoid race with userspace merge */
+		if (em->end >=
+		    ((sector_t)io->alloc_head << io->plo->cluster_log))
+			break;
+
+		last = em->end;
+		ploop_extent_put(em);
+		em = __map_extent(io, mapping, last, len, create,
+				  gfp_mask, get_block);
+		if (IS_ERR(em) || !em)
+			break;
+		map_ahead_len += em->end - last;
+	} while (em->start <= start && start + len <= em->end &&
+		 map_ahead_len < 1024);
+
+	/* make sure we return the extent for this range */
+	if (!em || IS_ERR(em) || em->start > start ||
+	    start + len > em->end) {
+		if (em && !IS_ERR(em))
+			ploop_extent_put(em);
+		em = __map_extent(io, mapping, start, len, create,
+				  gfp_mask, get_block);
+	}
+	return em;
+}
+
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	return map_extent_get_block(io, tree->mapping,
+				    start, len, 0, mapping_gfp_mask(tree->mapping),
+				    NULL);
+}
+
+static int drop_extent_map(struct extent_map_tree *tree)
+{
+	struct extent_map *em;
+	struct rb_node * node;
+
+	write_lock_irq(&tree->lock);
+	while ((node = tree->map.rb_node) != NULL) {
+		em = rb_entry(node, struct extent_map, rb_node);
+		rb_erase(node, &tree->map);
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+		ploop_extent_put(em);
+	}
+	write_unlock_irq(&tree->lock);
+	return 0;
+}
+
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em;
+
+	while ((em = lookup_extent_mapping(tree, start, ((sector_t)(-1ULL)) - start))) {
+		remove_extent_mapping(tree, em);
+		WARN_ON(atomic_read(&em->refs) != 2);
+		/* once for us */
+		ploop_extent_put(em);
+		/* No concurrent lookups due to ploop_quiesce(). See WARN_ON above */
+		/* once for the tree */
+		ploop_extent_put(em);
+	}
+}
+
+
+void dump_extent_map(struct extent_map_tree *tree)
+{
+	struct rb_node * r = rb_first(&tree->map);
+
+	while (r) {
+		struct extent_map *em0 = rb_entry(r, struct extent_map, rb_node);
+		printk("N=%ld %ld -> %ld\n", (long)em0->start, (long)(em0->end - em0->start), (long)em0->block_start);
+		r = rb_next(r);
+	}
+}
+
--- /dev/null
+++ b/drivers/block/ploop/io_direct_map.h
@@ -0,0 +1,68 @@
+/*
+ *  drivers/block/ploop/io_direct_map.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __INTERVAL_TREE_H__
+#define __INTERVAL_TREE_H__
+
+#include <linux/rbtree.h>
+
+struct extent_map_tree
+{
+	struct rb_root map;
+	struct list_head lru_list;
+	unsigned int map_size; /* # entries in map */
+	rwlock_t lock;
+	struct address_space * mapping;
+	int (*_get_extent)(struct inode *inode, sector_t isec,
+			   unsigned int nr, sector_t *start,
+			   sector_t *psec, int creat);
+};
+
+struct extent_map
+{
+	struct rb_node rb_node;
+	struct list_head lru_link;
+
+	sector_t	start;
+	sector_t	end;
+
+	sector_t	block_start;
+
+	atomic_t refs;
+
+	bool uninit;
+};
+
+extern int max_extent_map_pages;
+extern int min_extent_map_entries;
+
+static inline sector_t extent_map_block_end(struct extent_map *em)
+{
+	return em->block_start + (em->end - em->start);
+}
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len);
+struct extent_map *extent_lookup(struct extent_map_tree *tree,
+				 sector_t start);
+void ploop_extent_put(struct extent_map *em);
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block);
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start);
+
+int ploop_dio_close(struct ploop_io * io, int rdonly);
+struct extent_map_tree * ploop_dio_open(struct ploop_io * io, int rdonly);
+void ploop_dio_downgrade(struct address_space * mapping);
+int ploop_dio_upgrade(struct ploop_io * io);
+
+int __init ploop_extent_map_init(void);
+void ploop_extent_map_exit(void);
+
+#endif
--- /dev/null
+++ b/drivers/block/ploop/io_kaio.c
@@ -0,0 +1,1056 @@
+/*
+ *  drivers/block/ploop/io_kaio.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+#include <linux/aio.h>
+
+#include <linux/ploop/ploop.h>
+
+/* from fs/fuse/inode.c */
+#define FUSE_SUPER_MAGIC 0x65735546
+
+#define KAIO_PREALLOC (128 * 1024 * 1024) /* 128 MB */
+
+#define KAIO_MAX_PAGES_PER_REQ 32	  /* 128 KB */
+
+/* This is used only as a flag meaning "ploop_kaio_open() succeeded" */
+static struct extent_map_tree
+{
+} dummy_em_tree;
+
+int ploop_kaio_open(struct file * file, int rdonly);
+int ploop_kaio_close(struct address_space * mapping, int rdonly);
+void ploop_kaio_downgrade(struct address_space * mapping);
+int ploop_kaio_upgrade(struct address_space * mapping);
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos);
+static int kaio_truncate(struct ploop_io * io, struct file * file, __u32 a_h);
+
+static void __kaio_queue_fsync_req(struct ploop_request * preq, int prio)
+{
+	struct ploop_device * plo   = preq->plo;
+	struct ploop_delta  * delta = ploop_top_delta(plo);
+	struct ploop_io     * io    = &delta->io;
+
+	if (prio)
+		list_add(&preq->list, &io->fsync_queue);
+	else
+		list_add_tail(&preq->list, &io->fsync_queue);
+
+	io->fsync_qlen++;
+	if (waitqueue_active(&io->fsync_waitq))
+		wake_up_interruptible(&io->fsync_waitq);
+}
+
+static void kaio_queue_fsync_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 0);
+}
+
+static void kaio_queue_trunc_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 1);
+}
+
+static void kaio_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo   = preq->plo;
+	unsigned long flags;
+	int post_fsync = 0;
+	int need_fua = !!(preq->req_rw & REQ_FUA);
+	unsigned long state = READ_ONCE(preq->state);
+	int reloc = !!(state & (PLOOP_REQ_RELOC_A_FL|
+				PLOOP_REQ_RELOC_S_FL|
+				PLOOP_REQ_RELOC_N_FL));
+
+	if (preq->error || !(preq->req_rw & REQ_FUA) ||
+	    preq->eng_state == PLOOP_E_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_DELTA_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_DELTA_READ) {
+		ploop_complete_io_state(preq);
+		return;
+	}
+
+	/* Convert requested fua to fsync */
+	if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
+	    (need_fua && !ploop_req_delay_fua_possible(preq)) ||
+	    (reloc && ploop_req_delay_fua_possible(preq))) {
+		post_fsync = 1;
+		preq->req_rw &= ~REQ_FUA;
+	}
+
+	if (post_fsync) {
+		spin_lock_irqsave(&plo->lock, flags);
+		kaio_queue_fsync_req(preq);
+		plo->st.bio_syncwait++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	} else {
+		ploop_complete_io_state(preq);
+	}
+}
+
+static void kaio_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		kaio_complete_io_state(preq);
+}
+
+struct kaio_req {
+	struct ploop_request *preq;
+	struct bio_vec	      bvecs[0];
+};
+
+static void kaio_rw_aio_complete(u64 data, long res)
+{
+	struct ploop_request * preq = (struct ploop_request *)data;
+
+	if (unlikely(res < 0)) {
+		struct bio *b = preq->aux_bio;
+		printk("kaio_rw_aio_complete: kaio failed with err=%ld "
+		       "(rw=%s; state=%ld/0x%lx; clu=%d; iblk=%d; aux=%ld)\n",
+		       res, (preq->req_rw & REQ_WRITE) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, preq->req_cluster,
+		       preq->iblock, b ? b->bi_sector : -1);
+		bio_list_for_each(b, &preq->bl)
+			printk(" bio=%p: bi_sector=%ld bi_size=%d\n",
+			       b, b->bi_sector, b->bi_size);
+		PLOOP_REQ_SET_ERROR(preq, res);
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void kaio_rw_kreq_complete(u64 data, long res)
+{
+	struct kaio_req *kreq = (struct kaio_req *)data;
+	struct ploop_request *preq = kreq->preq;
+
+	kfree(kreq);
+	kaio_rw_aio_complete((u64)preq, res);
+}
+
+static struct kaio_req *kaio_kreq_alloc(struct ploop_request *preq, int *nr_p)
+{
+	static const int nr = KAIO_MAX_PAGES_PER_REQ;
+	struct kaio_req *kreq;
+
+	kreq = kmalloc(offsetof(struct kaio_req, bvecs[nr]), GFP_NOFS);
+	if (kreq) {
+		*nr_p = nr;
+		kreq->preq = preq;
+	}
+
+	return kreq;
+}
+
+static int kaio_kernel_submit(struct file *file, struct kaio_req *kreq,
+		size_t nr_segs, size_t count, loff_t pos, unsigned long rw)
+{
+	struct kiocb *iocb;
+	unsigned short op;
+	struct iov_iter iter;
+	int err;
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	if (rw & REQ_WRITE)
+		op = IOCB_CMD_WRITE_ITER;
+	else
+		op = IOCB_CMD_READ_ITER;
+
+	iov_iter_init_bvec(&iter, kreq->bvecs, nr_segs, count, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_kreq_complete, (u64)kreq);
+
+	err = aio_kernel_submit(iocb);
+	if (err)
+		printk("kaio_kernel_submit: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld; len=%ld)\n",
+		       err, (rw & REQ_WRITE) ? "WRITE" : "READ",
+		       kreq->preq->eng_state, kreq->preq->state, pos, count);
+	return err;
+}
+
+/*
+ * Pack as many bios from the list pointed to by '*bio_pp' into kreq as
+ * possible, but no more than 'size' bytes. Returns 'copy' equal to the
+ * number of bytes copied.
+ *
+ * <*bio_pp, *idx_p> plays the role of an iterator to walk through the bio
+ * list.  NB: the iterator is valid only while 'size' > 'copy'.
+ *
+ * NB: at entry, '*nr_segs' holds the capacity of kreq;
+ *     at return, it holds the actual payload.
+ */
+static size_t kaio_kreq_pack(struct kaio_req *kreq, int *nr_segs,
+			     struct bio **bio_pp, int *idx_p, size_t size)
+{
+	int kreq_nr_max = *nr_segs;
+	struct bio *b = *bio_pp;
+	int idx = *idx_p;
+	struct bio_vec *src_bv = b->bi_io_vec + idx;
+	struct bio_vec *dst_bv = kreq->bvecs;
+	size_t copy = 0;
+
+	BUG_ON(b->bi_idx);
+
+	while (1) {
+		int nr = min_t(int, kreq_nr_max, b->bi_vcnt - idx);
+		BUG_ON(!nr);
+
+		memcpy(dst_bv, src_bv, nr * sizeof(struct bio_vec));
+
+		copy += bvec_length(dst_bv, nr);
+		if (copy >= size) {
+			*nr_segs = dst_bv - kreq->bvecs + nr;
+			return size;
+		}
+
+		dst_bv += nr;
+		src_bv += nr;
+		idx += nr;
+
+		if (b->bi_vcnt == idx) {
+			b = b->bi_next;
+			BUG_ON(!b);
+			src_bv = b->bi_io_vec;
+			idx = 0;
+		}
+
+		kreq_nr_max -= nr;
+		if (kreq_nr_max == 0)
+			break;
+	}
+
+	*bio_pp = b;
+	*idx_p = idx;
+	return copy;
+}
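+/*
+ * Illustration only (assumed numbers): packing a 3-segment bio followed by
+ * a 2-segment bio into a kreq with capacity 4 copies three bvecs from the
+ * first bio and one from the second; the iterator is left at
+ * <second bio, idx 1> for the next kreq, provided 'size' was not reached
+ * earlier.
+ */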
+
+/*
+ * WRITE case:
+ *
+ * sbl is the list of bios; the first bio in the list and iblk specify the
+ * destination file offset; the content of the bios in sbl is the scattered
+ * source buffer.
+ *
+ * The goal is to write the source buffer to the file at the given offset.
+ * We do it by stuffing as many bvecs from the source into kreqs as possible
+ * and submitting the kreqs to in-kernel aio.
+ *
+ * READ case:
+ *
+ * The same as WRITE, but here the file plays the role of the source and the
+ * content of the bios in sbl plays the role of the destination.
+ */
+static void kaio_sbl_submit(struct file *file, struct ploop_request *preq,
+			    unsigned long rw, struct bio_list *sbl,
+			    iblock_t iblk, size_t size)
+{
+	struct bio *bio = sbl->head;
+	int idx = 0;
+
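+	/*
+	 * Keep the in-cluster offset of the first bio and substitute the
+	 * image block number for the cluster part.  Illustrative numbers:
+	 * with cluster_log = 11 (1 MB clusters), bi_sector = 0x12345 and
+	 * iblk = 7, off becomes (7 << 11) | 0x345 = 0x3b45 sectors.
+	 */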
+	loff_t off = bio->bi_sector;
+	off = ((loff_t)iblk << preq->plo->cluster_log) |
+		(off & ((1<<preq->plo->cluster_log) - 1));
+
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, off);
+
+	off <<= 9;
+	/* from now on, 'off' is the byte position in the file to transfer */
+
+	WARN_ONCE(!(file->f_flags & O_DIRECT), "File opened w/o O_DIRECT");
+
+	ploop_prepare_io_request(preq);
+
+	size <<= 9;
+	while (size > 0) {
+		struct kaio_req *kreq;
+		int nr_segs;
+		size_t copy;
+		int err;
+
+		kreq = kaio_kreq_alloc(preq, &nr_segs);
+		if (!kreq) {
+			PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+			break;
+		}
+
+		copy = kaio_kreq_pack(kreq, &nr_segs, &bio, &idx, size);
+
+		atomic_inc(&preq->io_count);
+		err = kaio_kernel_submit(file, kreq, nr_segs, copy, off, rw);
+		if (err) {
+			PLOOP_REQ_SET_ERROR(preq, err);
+			ploop_complete_io_request(preq);
+			kfree(kreq);
+			break;
+		}
+
+		off += copy;
+		size -= copy;
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void
+kaio_submit(struct ploop_io *io, struct ploop_request * preq,
+	     unsigned long rw,
+	     struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	if (rw & REQ_FLUSH) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	kaio_sbl_submit(io->files.file, preq, rw, sbl, iblk, size);
+}
+
+/* returns non-zero if and only if preq was resubmitted */
+static int kaio_resubmit(struct ploop_request * preq)
+{
+	struct ploop_delta * delta = ploop_top_delta(preq->plo);
+
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		return 0;
+	case PLOOP_E_COMPLETE:
+	case PLOOP_E_RELOC_NULLIFY:
+	case PLOOP_E_DATA_WBI:
+		if (preq->aux_bio) {
+			struct bio_list tbl;
+			tbl.head = tbl.tail = preq->aux_bio;
+			kaio_submit(&delta->io, preq, preq->req_rw, &tbl,
+				    preq->iblock, 1<<preq->plo->cluster_log);
+		} else {
+			kaio_submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				    preq->iblock, preq->req_size);
+		}
+		break;
+	case PLOOP_E_TRANS_DELTA_READ:
+		/* BUG_ON below guarantees that 'case PLOOP_E_DELTA_COPIED'
+		 * is equivalent to the part of 'case PLOOP_E_TRANS_DELTA_READ'
+		 * after bio_bcopy(). This is not trivial. */
+		BUG_ON(!test_bit(PLOOP_REQ_TRANS, &preq->state));
+		/* Fall through ... */
+	case PLOOP_E_DELTA_READ:
+		preq->eng_state = PLOOP_E_DELTA_COPIED; /* skip bcopy() */
+		return 0;
+	default:
+		printk("Resubmit bad state %lu\n", preq->eng_state);
+		BUG();
+	}
+
+	return 1;
+}
+
+static inline int io2level(struct ploop_io * io)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	return delta->level;
+}
+
+static int kaio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		struct ploop_request * preq;
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		preq = list_entry(io->fsync_queue.next, struct ploop_request, list);
+		list_del(&preq->list);
+		io->fsync_qlen--;
+		if (!preq->prealloc_size)
+			plo->st.bio_fsync++;
+		spin_unlock_irq(&plo->lock);
+
+		/* trick: preq->prealloc_size is actually the new EOF position */
+		if (preq->prealloc_size) {
+			err = kaio_truncate(io, io->files.file,
+					    preq->prealloc_size >> (plo->cluster_log + 9));
+			if (err)
+				PLOOP_REQ_SET_ERROR(preq, -EIO);
+		} else {
+			struct file *file = io->files.file;
+			err = vfs_fsync(file, 1);
+			if (err) {
+				printk("kaio_fsync_thread: vfs_fsync failed "
+				       "with err=%d (i_ino=%ld of level=%d "
+				       "on ploop%d)\n",
+				       err, io->files.inode->i_ino,
+				       io2level(io), plo->index);
+				PLOOP_REQ_SET_ERROR(preq, -EIO);
+			} else if (preq->req_rw & REQ_FLUSH) {
+				BUG_ON(!preq->req_size);
+				preq->req_rw &= ~REQ_FLUSH;
+				if (kaio_resubmit(preq)) {
+					spin_lock_irq(&plo->lock);
+					continue;
+				}
+			}
+		}
+
+		spin_lock_irq(&plo->lock);
+		list_add_tail(&preq->list, &plo->ready_queue);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+static void
+kaio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	iblock_t iblk;
+	int log = preq->plo->cluster_log + 9;
+	loff_t clu_siz = 1 << log;
+
+	if (delta->flags & PLOOP_FMT_RDONLY) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	iblk = io->alloc_head;
+
+	if (unlikely(preq->req_rw & REQ_FLUSH)) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	BUG_ON(preq->prealloc_size);
+
+	if (unlikely(io->prealloced_size < clu_siz)) {
+		if (!io->prealloc_preq) {
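+			/* Round the end of the cluster being allocated up to
+			 * the next KAIO_PREALLOC (128 MB) boundary; e.g.
+			 * iblk = 0 with 1 MB clusters yields pos = 128 MB. */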
+			loff_t pos = (((loff_t)(iblk + 1)  << log) |
+				      (KAIO_PREALLOC - 1)) + 1;
+
+			BUG_ON(preq->prealloc_size);
+			preq->prealloc_size = pos;
+			io->prealloc_preq   = preq;
+
+			spin_lock_irq(&io->plo->lock);
+			kaio_queue_trunc_req(preq);
+			io->plo->st.bio_syncwait++;
+			spin_unlock_irq(&io->plo->lock);
+			return;
+		} else { /* we're not first */
+			list_add_tail(&preq->list,
+				      &io->prealloc_preq->delay_list);
+			return;
+		}
+	}
+
+	io->prealloced_size -= clu_siz;
+	io->alloc_head++;
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_DATA_WBI;
+
+	kaio_sbl_submit(io->files.file, preq, REQ_WRITE, sbl, iblk, size);
+}
+
+static int kaio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = kaio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void
+kaio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		(void)kaio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_kaio_close(io->files.mapping, delta->flags & PLOOP_FMT_RDONLY);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		fput(file);
+	}
+}
+
+static int
+kaio_sync(struct ploop_io * io)
+{
+	struct file *file = io->files.file;
+
+	return vfs_fsync(file, 0);
+}
+
+static int
+kaio_stop(struct ploop_io * io)
+{
+	return 0;
+}
+
+static int
+kaio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+
+	return 0;
+}
+
+static void
+kaio_io_page(struct ploop_io * io, int op, struct ploop_request * preq,
+	     struct page * page, sector_t sec)
+{
+
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	int err;
+
+	ploop_prepare_io_request(preq);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb) {
+		PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+		goto out;
+	}
+
+	iov_iter_init_page(&iter, page, PAGE_SIZE, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_aio_complete, (u64)preq);
+
+	atomic_inc(&preq->io_count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_io_page: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, pos);
+		PLOOP_REQ_SET_ERROR(preq, err);
+	}
+
+out:
+	ploop_complete_io_request(preq);
+}
+
+static void
+kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
+		struct page * page, sector_t sec)
+{
+	kaio_io_page(io, IOCB_CMD_READ_ITER, preq, page, sec);
+}
+
+static void
+kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
+		 struct page * page, sector_t sec, unsigned long rw)
+{
+	ploop_prepare_tracker(preq, sec);
+
+	/* There is no FUA in kaio, so convert it to fsync. We don't care
+	   about REQ_FLUSH here: only io_direct relies on it;
+	   io_kaio implements delay_fua in another way... */
+	if (rw & REQ_FUA)
+		set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
+
+	kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
+}
+
+static int
+kaio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		   sector_t sec)
+{
+	return -EINVAL;
+}
+
+static int
+kaio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		    sector_t sec)
+{
+	return -EINVAL;
+}
+
+struct kaio_comp {
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+static inline void kaio_comp_init(struct kaio_comp * c)
+{
+	init_completion(&c->comp);
+	atomic_set(&c->count, 1);
+	c->error = 0;
+}
+
+static void kaio_sync_io_complete(u64 data, long err)
+{
+
+	struct kaio_comp *comp = (struct kaio_comp *) data;
+
+	if (unlikely(err < 0)) {
+		if (!comp->error)
+			comp->error = err;
+	}
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+}
+
+static int
+kaio_sync_io(struct ploop_io * io, int op, struct page * page,
+	     unsigned int len, unsigned int off, sector_t sec)
+{
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	struct bio_vec bvec;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	struct kaio_comp comp;
+	int err;
+
+	kaio_comp_init(&comp);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	bvec.bv_page = page;
+	bvec.bv_len = len;
+	bvec.bv_offset = off;
+
+	iov_iter_init_bvec(&iter, &bvec, 1, bvec_length(&bvec, 1), 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_sync_io_complete, (u64)&comp);
+
+	atomic_inc(&comp.count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_sync_io: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; pos=%lld; len=%d off=%d)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (!err && comp.error)
+		printk("kaio_sync_io: kaio failed with err=%d "
+		       "(rw=%s; pos=%lld; len=%d off=%d)\n",
+		       comp.error,
+		       (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+
+	return comp.error;
+}
+
+static int
+kaio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+		unsigned int off, sector_t sec)
+{
+	return kaio_sync_io(io, IOCB_CMD_READ_ITER, page, len, off, sec);
+}
+
+static int
+kaio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+		 unsigned int off, sector_t sec)
+{
+	int ret;
+
+	ret = kaio_sync_io(io, IOCB_CMD_WRITE_ITER, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return ret;
+}
+
+static int kaio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	return __kaio_truncate(io, io->files.file, pos + len);
+}
+
+static int kaio_open(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = ploop_kaio_open(file, delta->flags & PLOOP_FMT_RDONLY);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err)
+		return err;
+
+	io->files.em_tree = &dummy_em_tree;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		io->fsync_thread = kthread_create(kaio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		if (io->fsync_thread == NULL) {
+			ploop_kaio_close(io->files.mapping, 0);
+			return -ENOMEM;
+		}
+
+		wake_up_process(io->fsync_thread);
+	}
+
+	return 0;
+}
+
+static int kaio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	struct path   path;
+	int err;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDONLY|O_LARGEFILE|O_DIRECT,
+			   current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = vfs_fsync(file, 0);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int kaio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = kaio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	ploop_kaio_downgrade(io->files.mapping);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int kaio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	struct path   path;
+	int err;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDWR|O_LARGEFILE|O_DIRECT, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		err = -EINVAL;
+		goto prep_merge_done;
+	}
+
+	err = vfs_fsync(file, 0);
+	if (err)
+		goto prep_merge_done;
+
+	err = ploop_kaio_upgrade(io->files.mapping);
+	if (err)
+		goto prep_merge_done;
+
+	io->fsync_thread = kthread_create(kaio_fsync_thread,
+					  io, "ploop_fsync%d",
+					  io->plo->index);
+	if (io->fsync_thread == NULL) {
+		err = -ENOMEM;
+		goto prep_merge_done;
+	}
+
+	wake_up_process(io->fsync_thread);
+
+	sd->file = file;
+
+prep_merge_done:
+	if (err)
+		fput(file);
+	return err;
+}
+
+static int kaio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos)
+{
+	int err;
+	struct iattr newattrs;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size  = pos;
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs, NULL);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): notify_change failed with err=%d "
+		       "(i_size=%lld)\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err, i_size_read(io->files.inode));
+		return err;
+	}
+
+	err = vfs_fsync(file, 0);
+
+	if (err)
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): vfs_fsync failed with err=%d\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err);
+
+	return err;
+}
+
+static int kaio_truncate(struct ploop_io * io, struct file * file,
+			  __u32 alloc_head)
+{
+	return __kaio_truncate(io, file,
+			       (u64)alloc_head << (io->plo->cluster_log + 9));
+}
+
+static void kaio_unplug(struct ploop_io * io)
+{
+	/* Needs more thought on how to implement unplug */
+}
+
+static void kaio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+	blk_set_stacking_limits(&q->limits);
+	blk_queue_max_write_same_sectors(q, 0);
+}
+
+static void kaio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+
+	preq->req_rw &= ~REQ_FLUSH;
+
+	spin_lock_irq(&io->plo->lock);
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		list_add_tail(&preq->list, &io->plo->ready_queue);
+	else
+		kaio_queue_fsync_req(preq);
+
+	spin_unlock_irq(&io->plo->lock);
+}
+
+static int kaio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+
+	if (inode->i_sb->s_magic != FUSE_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (!(file->f_flags & O_DIRECT)) {
+		ploop_io_report_fn(file, "File opened w/o O_DIRECT");
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_bvec == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_bvec\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_page == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_page\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_kaio =
+{
+	.id		=	PLOOP_IO_KAIO,
+	.name		=	"kaio",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	kaio_unplug,
+
+	.alloc		=	kaio_alloc_sync,
+	.submit		=	kaio_submit,
+	.submit_alloc	=	kaio_submit_alloc,
+	.read_page	=	kaio_read_page,
+	.write_page	=	kaio_write_page,
+	.sync_read	=	kaio_sync_read,
+	.sync_write	=	kaio_sync_write,
+	.sync_readvec	=	kaio_sync_readvec,
+	.sync_writevec	=	kaio_sync_writevec,
+
+	.init		=	kaio_init,
+	.destroy	=	kaio_destroy,
+	.open		=	kaio_open,
+	.sync		=	kaio_sync,
+	.stop		=	kaio_stop,
+	.prepare_snapshot =	kaio_prepare_snapshot,
+	.complete_snapshot =	kaio_complete_snapshot,
+	.prepare_merge	=	kaio_prepare_merge,
+	.start_merge	=	kaio_start_merge,
+	.truncate	=	kaio_truncate,
+
+	.queue_settings	=	kaio_queue_settings,
+	.issue_flush	=	kaio_issue_flush,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       kaio_autodetect,
+};
+
+static int __init pio_kaio_mod_init(void)
+{
+	return ploop_register_io(&ploop_io_ops_kaio);
+}
+
+static void __exit pio_kaio_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_kaio);
+}
+
+module_init(pio_kaio_mod_init);
+module_exit(pio_kaio_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/io_kaio_map.c
@@ -0,0 +1,133 @@
+/*
+ *  drivers/block/ploop/io_kaio_map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ploop/ploop.h>
+
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers;
+};
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+int ploop_kaio_open(struct file * file, int rdonly)
+{
+	int err = 0;
+	struct ploop_mapping *m, *pm;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+			goto kaio_open_done;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto kaio_open_done;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto kaio_open_done;
+	}
+
+	pm->mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	pm = NULL;
+	mapping->host->i_flags |= S_SWAPFILE;
+
+kaio_open_done:
+	spin_unlock(&ploop_mappings_lock);
+	if (pm)
+		kfree(pm);
+	return err;
+}
+
+int ploop_kaio_close(struct address_space * mapping, int rdonly)
+{
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_kaio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_kaio_upgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
--- /dev/null
+++ b/drivers/block/ploop/map.c
@@ -0,0 +1,1348 @@
+/*
+ *  drivers/block/ploop/map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Generic engine for mapping virtual blocks (cluster_t) to indices
+ * in image (iblock_t).
+ *
+ * The mapping is global: it is defined not for some particular delta,
+ * but for the whole disk. Therefore it is abstract and does not depend
+ * on a particular virtual disk format. Of course, for some disk types
+ * it may not be so easy to fetch/update the backing store. In practice,
+ * this engine is tightly bound to the organization of index tables in ploop1.
+ *
+ * Technically, it is just an array of pages with some metainformation
+ * attached to each page. The array may be highly sparse, so it is kept in
+ * an rbtree keyed by array index cluster_no / (PAGE_SIZE / sizeof(map_index)).
+ *
+ * Sadly, it is completely similar to linux page cache for a virtual
+ * mapping. "Sadly" is because linux page cache provides only a crippled
+ * implementation of asynchronous read/writeback, which requires synchronous
+ * waits for completions and does not make any callbacks on completion.
+ * Therefore we have to redo all the work here.
+ *
+ * Two words about synchronization. All the updates to the map are
+ * made from a single thread. Lookups can happen in an unserialized context,
+ * therefore we protect all critical updates with a spinlock. RCU could be
+ * used too.
+ *
+ * A mapping is UPTODATE when it is in sync with the top delta.
+ * When a mapping is accessed for the first time and there is no mapping in
+ * the top delta, we search the lower level deltas. We could create an empty
+ * mapping instead, which would have an advantage: when whole blocks are
+ * rewritten we would not even need the lower deltas (_XXX_).
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <linux/ploop/ploop.h>
+
+/* This defines a slot in the mapping page. Right now it is 32 bit
+ * and therefore directly matches the ploop1 structure. */
+typedef u32 map_index_t;
+
+#define INDEX_PER_PAGE	(PAGE_SIZE / sizeof(map_index_t))
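+
+/*
+ * Illustration (a sketch, assuming 4 KiB pages): sizeof(map_index_t) == 4,
+ * so INDEX_PER_PAGE == 1024, i.e. one in-core map page covers 1024 clusters.
+ * The first on-disk index page also carries the 64-byte PVD header (see
+ * ploop1_image.h), which occupies PLOOP_MAP_OFFSET slots (16 with these
+ * sizes), so that page maps only 1024 - 16 == 1008 clusters.
+ */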
+
+static struct kmem_cache * ploop_map_cache;
+
+static LIST_HEAD(map_lru);
+static DEFINE_SPINLOCK(map_lru_lock);
+static atomic_t map_pages_nr = ATOMIC_INIT(0);
+
+/*
+ * Additional information kept for each page:
+ * 1. rb tree link
+ * 2. Page
+ * 3. mn_start, mn_end - the first and the last cluster index
+ *    (respectively) which the page maps to iblocks.
+ * 4. lru linkage
+ * 5. delta level of the whole page, i.e. the delta where this page
+ *    is backed.
+ * 6. Array of delta levels for each map_index in the page.
+ *    If the page is backed at level N, those levels cannot be >N.
+ *    If all the levels == N, the array of levels is not allocated.
+ *    When at least one level < N, it is stored in the array.
+ *    Note that in this case exporting the page to disk implies
+ *    clearing the irrelevant entries.
+ */
+
+struct map_node
+{
+	struct rb_node		rb_link;
+	cluster_t		mn_start;
+	cluster_t		mn_end;
+	unsigned long		state;
+	atomic_t		refcnt;
+	struct ploop_map	*parent;
+
+	struct page		*page;
+	struct list_head	lru;
+	u8			*levels;
+
+	/* List of preqs blocking on this mapping.
+	 *
+	 * We queue here several kinds of requests:
+	 * 1. If the mapping is not uptodate, all the requests which need
+	 *    this mapping are queued here. preq state is ENTRY.
+	 * 2. If a preq requires an index update which is delayed
+	 *    because writeback is in progress. preq state is INDEX_DELAY,
+	 *    the new index is kept in preq->iblock.
+	 * 3. If a preq has started an index update, preq state is INDEX_WB,
+	 *    new indices are sent to io, but they are not inserted
+	 *    into the mapping until writeback is complete.
+	 */
+	struct list_head	io_queue;
+};
+
+cluster_t map_get_mn_end(struct map_node *m)
+{
+	return m->mn_end;
+}
+
+#define MAP_LEVEL(m)		((m)->state & 0xFF)
+#define MAP_SET_LEVEL(m, l)	((m)->state = ((m)->state & ~0xFF) | (l))
+
+#define MAP_UPTODATE(m)		(((m)->state >> 8) & 0xFFUL)
+#define MAP_SET_UPTODATE(m, l)	((m)->state = ((m)->state & ~0xFF00UL) | ((l)<<8))
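+
+/*
+ * A sketch of the state word layout, as implied by the macros above and
+ * the flag bits below:
+ *
+ *   bits  0..7  - MAP_LEVEL:    delta level backing the whole page
+ *   bits  8..15 - MAP_UPTODATE: lowest delta level already merged into
+ *                               the page (see map_index_fault())
+ *   bits 16..   - flag bits (PLOOP_MAP_UPTODATE, _READ, _WRITEBACK, _ERROR)
+ */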
+
+enum {
+	PLOOP_MAP_UPTODATE	= 16,	/* Mapping is in sync with top_delta,
+					 * we can write the index. But zero entries
+					 * still require reading lower delta indices.
+					 */
+	PLOOP_MAP_READ		= 17,	/* Mapping read is scheduled */
+	PLOOP_MAP_WRITEBACK	= 18,	/* Mapping is under writeback */
+	PLOOP_MAP_ERROR		= 19,	/* Mapping is baaad */
+};
+
+void map_init(struct ploop_device * plo, struct ploop_map * map)
+{
+	INIT_LIST_HEAD(&map->delta_list);
+	map->flags = 0;
+	map->last_activity = jiffies;
+	map->plo = plo;
+	map->rb_root = RB_ROOT;
+	map->lru_buffer_ptr = 0;
+	init_waitqueue_head(&map->destroy_waitq);
+}
+
+/* Deliver a batch of LRU updates from the buffer to the global LRU.
+ * Everything which has zero refcnt is added to the LRU or moved to the tail
+ * of the LRU. Everything which has non-zero refcnt is removed from the LRU.
+ */
+static void flush_lru_buffer(struct ploop_map * map)
+{
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&map_lru_lock, flags);
+	for (i = 0; i < map->lru_buffer_ptr; i++) {
+		struct map_node * m = map->lru_buffer[i];
+		if (atomic_dec_and_test(&m->refcnt))
+			list_move_tail(&m->lru, &map_lru);
+		else
+			list_del_init(&m->lru);
+	}
+	spin_unlock_irqrestore(&map_lru_lock, flags);
+
+	map->lru_buffer_ptr = 0;
+}
+
+/*
+ * map_release() must be called under plo->lock, because
+ * the pair atomic_read & atomic_dec_and_test is not atomic.
+ */
+void map_release(struct map_node * m)
+{
+	struct ploop_map * map = m->parent;
+
+	if (atomic_read(&m->refcnt) == 1) {
+		if (!list_empty(&m->lru))
+			return;
+		if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+			flush_lru_buffer(map);
+		map->lru_buffer[map->lru_buffer_ptr++] = m;
+		return;
+	}
+	if (atomic_dec_and_test(&m->refcnt))
+		BUG();
+}
+
+static inline void cond_flush_lru_buffer(struct ploop_map * map)
+{
+	if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+		flush_lru_buffer(map);
+}
+
+
+static struct map_node * map_lookup(struct ploop_map * map, cluster_t block)
+{
+	struct rb_node * n = map->rb_root.rb_node;
+	struct map_node * m;
+
+	while (n) {
+		m = rb_entry(n, struct map_node, rb_link);
+
+		if (block < m->mn_start)
+			n = n->rb_left;
+		else if (block > m->mn_end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+	return NULL;
+}
+
+/* Lookup mapping atomically. */
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t blk;
+
+	if (unlikely(block >= map->max_index))
+		return -1;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags)) {
+		*result = block;
+		return 0;
+	}
+
+	m = map_lookup(map, block);
+	if (m == NULL)
+		return -1;
+
+	if (atomic_read(&m->refcnt) == 0) {
+		cond_flush_lru_buffer(map);
+		if (atomic_read(&m->refcnt) == 0) {
+			atomic_inc(&m->refcnt);
+			map->lru_buffer[map->lru_buffer_ptr++] = m;
+		}
+	}
+	map->last_activity = jiffies;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return -1;
+
+	idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx] >>
+	       ploop_map_log(map->plo);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+static void map_node_destroy(struct map_node *m)
+{
+	rb_erase(&m->rb_link, &m->parent->rb_root);
+	list_del_init(&m->lru);
+	BUG_ON(atomic_read(&m->refcnt));
+	BUG_ON(!list_empty(&m->io_queue));
+	if (m->page)
+		put_page(m->page);
+	if (m->levels)
+		kfree(m->levels);
+	m->parent->pages--;
+	atomic_dec(&map_pages_nr);
+	kmem_cache_free(ploop_map_cache, m);
+}
+
+static void map_lru_scan(void)
+{
+	int max_loops = atomic_read(&map_pages_nr);
+
+	while (atomic_read(&map_pages_nr) > max_map_pages &&
+	       --max_loops >= 0) {
+		struct ploop_map * map;
+		struct map_node * candidate = NULL;
+
+		spin_lock_irq(&map_lru_lock);
+		if (!list_empty(&map_lru)) {
+			candidate = list_first_entry(&map_lru, struct map_node, lru);
+			atomic_inc(&candidate->refcnt);
+		}
+		spin_unlock_irq(&map_lru_lock);
+
+		if (!candidate)
+			break;
+
+		map = candidate->parent;
+
+		spin_lock_irq(&map->plo->lock);
+		spin_lock(&map_lru_lock);
+
+		if (waitqueue_active(&map->destroy_waitq)) {
+			atomic_dec(&candidate->refcnt);
+			wake_up(&map->destroy_waitq);
+			spin_unlock(&map_lru_lock);
+			spin_unlock_irq(&map->plo->lock);
+			return;
+		}
+
+		list_del_init(&candidate->lru);
+
+		if (atomic_dec_and_test(&candidate->refcnt)) {
+			/* This instance is within its limits, just
+			 * re-add the node to the tail of the lru.
+			 */
+			if (map->pages <= map->plo->tune.min_map_pages &&
+			    time_after(map->last_activity +
+				       map->plo->tune.max_map_inactivity, jiffies) &&
+			    !test_bit(PLOOP_MAP_DEAD, &map->flags)) {
+				list_add_tail(&candidate->lru, &map_lru);
+			} else {
+				map_node_destroy(candidate);
+			}
+		}
+		spin_unlock(&map_lru_lock);
+		spin_unlock_irq(&map->plo->lock);
+
+		if (!(max_loops & 16))
+			cond_resched();
+	}
+}
+
+static struct map_node *
+map_create(struct ploop_map * map, cluster_t block)
+{
+	struct ploop_device * plo = map->plo;
+	struct rb_node **p, *parent;
+	struct map_node * m;
+	cluster_t ondisk_pageno = (block + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE;
+
+	m = kmem_cache_alloc(ploop_map_cache, GFP_NOFS);
+	if (unlikely(m == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	m->page = alloc_page(GFP_NOFS);
+	if (unlikely(m->page == NULL)) {
+		kmem_cache_free(ploop_map_cache, m);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (ondisk_pageno == 0) {
+		m->mn_start = 0;
+		m->mn_end = INDEX_PER_PAGE - PLOOP_MAP_OFFSET - 1;
+	} else {
+		m->mn_start = ondisk_pageno * INDEX_PER_PAGE - PLOOP_MAP_OFFSET;
+		m->mn_end = m->mn_start + INDEX_PER_PAGE - 1;
+	}
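+
+	/*
+	 * A worked example (assuming INDEX_PER_PAGE == 1024 and
+	 * PLOOP_MAP_OFFSET == 16): for block == 5000, ondisk_pageno ==
+	 * (5000 + 16) / 1024 == 4, hence mn_start == 4 * 1024 - 16 == 4080
+	 * and mn_end == 4080 + 1023 == 5103, so block 5000 falls inside
+	 * [mn_start, mn_end] as map_lookup() expects.
+	 */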
+
+	INIT_LIST_HEAD(&m->io_queue);
+	INIT_LIST_HEAD(&m->lru);
+	m->levels = NULL;
+	m->state = 0;
+	atomic_set(&m->refcnt, 1);
+	m->parent = map;
+
+	spin_lock_irq(&plo->lock);
+
+	p = &map->rb_root.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		struct map_node * entry;
+		parent = *p;
+		entry = rb_entry(parent, struct map_node, rb_link);
+
+		/* Nodes can be deleted by any of the ploop threads,
+		 * but they are inserted only by the ploop thread.
+		 * Before calling map_create() we checked that the node
+		 * is absent, therefore:
+		 */
+		BUG_ON(ondisk_pageno ==
+		       (entry->mn_start + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE);
+
+		if (block < entry->mn_start)
+			p = &(*p)->rb_left;
+		else if (block > entry->mn_end)
+			p = &(*p)->rb_right;
+		else
+			printk("map_create: Oops! block=%u; mn_range=[%u..%u]\n",
+			       block, entry->mn_start, entry->mn_end);
+	}
+
+	rb_link_node(&m->rb_link, parent, p);
+	rb_insert_color(&m->rb_link, &map->rb_root);
+
+	map->pages++;
+	atomic_inc(&map_pages_nr);
+	spin_unlock_irq(&plo->lock);
+
+	if (atomic_read(&map_pages_nr) > max_map_pages)
+		map_lru_scan();
+
+	return m;
+}
+
+/* helper for trans_map_get_index() and map_get_index() */
+static iblock_t
+cluster2iblock(struct ploop_request *preq, struct map_node *m, cluster_t block,
+	       u32 *idx)
+{
+	iblock_t iblk;
+	char *fmt;
+
+	BUG_ON (block < INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start != 0);
+	BUG_ON (block >= INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start !=
+		((block + PLOOP_MAP_OFFSET) &
+		 ~(INDEX_PER_PAGE - 1)) - PLOOP_MAP_OFFSET);
+
+	*idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	iblk = ((map_index_t *)page_address(m->page))[*idx];
+
+	if (likely(iblk != PLOOP_ZERO_INDEX))
+		iblk >>= ploop_map_log(preq->plo);
+
+	if (m == preq->trans_map)
+		fmt = "tmgi %u %d %u [ %u %u ]\n";
+	else if (m == preq->map)
+		fmt = "mgi %u %d %u [ %u %u ]\n";
+	else
+		BUG();
+
+	__TRACE(fmt, block, *idx, iblk,
+		((map_index_t *)page_address(m->page))[0],
+		((map_index_t *)page_address(m->page))[1]);
+
+	return iblk;
+}
+
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->trans_map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL)
+		return -1;
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		return 0;
+	}
+	return -1;
+}
+
+
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL) {
+		*result = block;
+		return 0;
+	}
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+int map_index_fault(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	struct map_node * m = preq->map;
+	int uptodate_level;
+	sector_t pos;
+	int err;
+
+	uptodate_level = MAP_UPTODATE(m);
+
+	/* All the levels are read, mapping is absent. */
+	if (uptodate_level == 0) {
+		__TRACE("MAP E %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	top_delta = ploop_top_delta(plo);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &plo->map.delta_list, list) {
+		int rc;
+
+		if (ndelta->level >= uptodate_level)
+			continue;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+
+		MAP_SET_UPTODATE(m, ndelta->level);
+		__TRACE("MAP SKIP %u %d\n", preq->req_cluster, ndelta->level);
+	}
+
+	/* Not found anywhere. */
+	if (!delta) {
+		__TRACE("MAP NF %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	/* Mapping is present in lower delta, start merge */
+	spin_lock_irq(&plo->lock);
+	ploop_add_lockout(preq, 0);
+
+	if (test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+		__TRACE("r %p %u %p\n", preq, preq->req_cluster, m);
+		list_add_tail(&preq->list, &m->io_queue);
+		plo->st.merge_lockouts++;
+		spin_unlock_irq(&plo->lock);
+		/* Someone already scheduled read. */
+		return 0;
+	}
+	spin_unlock_irq(&plo->lock);
+
+	err = -EIO;
+	if (test_bit(PLOOP_MAP_ERROR, &m->state))
+		goto err_out;
+
+	err = -ENOMEM;
+	preq->sinfo.ri.tpage = alloc_page(GFP_NOFS);
+	if (preq->sinfo.ri.tpage == NULL)
+		goto err_out;
+
+	preq->sinfo.ri.level = delta->level;
+	preq->eng_state = PLOOP_E_INDEX_READ;
+
+	plo->st.map_merges++;
+	delta->ops->read_index(delta, preq, preq->sinfo.ri.tpage, pos);
+	return 0;
+
+err_out:
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	PLOOP_FAIL_REQUEST(preq, err);
+	return 0;
+}
+
+static void map_read_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head * n, *pn;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&plo->lock);
+
+	if (!preq->error) {
+		set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	} else {
+		set_bit(PLOOP_MAP_ERROR, &m->state);
+	}
+	clear_bit(PLOOP_MAP_READ, &m->state);
+
+	__TRACE(">E %p %u %p\n", preq, preq->req_cluster, m);
+
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void map_merge_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head *n, *pn;
+	LIST_HEAD(list);
+	int i;
+	u32 * map;
+	u32 * merged;
+	int skip = m->mn_start == 0 ? PLOOP_MAP_OFFSET : 0;
+
+	__TRACE(">M %p %u %p\n", preq, preq->req_cluster, m);
+
+	if (unlikely(preq->error))
+		goto abort_update;
+
+	map = page_address(m->page);
+	merged = page_address(preq->sinfo.ri.tpage);
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (map[i] != 0)
+			continue;
+		if (merged[i] == 0)
+			continue;
+		if (preq->sinfo.ri.level != MAP_LEVEL(m)) {
+			if (!m->levels) {
+				m->levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+				if (unlikely(m->levels == NULL)) {
+					preq->error = -ENOMEM;
+					goto abort_update;
+				}
+				memset(m->levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			}
+			m->levels[i] = preq->sinfo.ri.level;
+		}
+		map[i] = merged[i];
+	}
+
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	MAP_SET_UPTODATE(m, preq->sinfo.ri.level);
+	__TRACE("MAP U %u %d\n", preq->req_cluster, preq->sinfo.ri.level);
+	preq->eng_state = PLOOP_E_ENTRY;
+
+flush_queue:
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+
+abort_update:
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+	preq->eng_state = PLOOP_E_COMPLETE;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_ERROR, &m->state);
+	goto flush_queue;
+}
+
+
+void map_read_complete(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (preq->eng_state == PLOOP_E_TRANS_INDEX_READ)
+		m = preq->trans_map;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		map_read_endio(preq, m);
+	else
+		map_merge_endio(preq, m);
+}
+
+static int
+ploop_map_start_read(struct ploop_map * map, struct ploop_request * preq,
+		     struct map_node * m)
+{
+	struct ploop_device * plo = map->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	sector_t pos;
+
+	top_delta = map_top_delta(map);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &map->delta_list, list) {
+		int rc;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+	}
+
+	if (delta) {
+		__TRACE("MAP R0 %u %d %lu %d\n", preq->req_cluster, delta->level, pos, m->index);
+		/* We know delta, we know position. We can read. */
+		MAP_SET_LEVEL(m, delta->level);
+		MAP_SET_UPTODATE(m, delta->level);
+		if (map == &plo->map)
+			preq->eng_state = PLOOP_E_INDEX_READ;
+		else
+			preq->eng_state = PLOOP_E_TRANS_INDEX_READ;
+		delta->ops->read_index(delta, preq, m->page, pos);
+		plo->st.map_reads++;
+		return 1;
+	}
+
+	/* Otherwise mapping does not exist. */
+	memset(page_address(m->page), 0, PAGE_SIZE);
+	__TRACE("MAP R1 %u %d\n", preq->req_cluster, top_delta->level);
+	MAP_SET_LEVEL(m, top_delta->level);
+	MAP_SET_UPTODATE(m, 0);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	return 0;
+}
+
+static int ploop_read_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = (map == &plo->map) ? preq->map : preq->trans_map;
+	int err = 0;
+
+	spin_lock_irq(&plo->lock);
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state)) {
+		if (test_bit(PLOOP_MAP_ERROR, &m->state)) {
+			err = -EIO;
+			goto out;
+		}
+
+		if (!test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+			spin_unlock_irq(&plo->lock);
+
+			return ploop_map_start_read(map, preq, m);
+		} else {
+			__TRACE("g %p %u %p\n", preq, preq->req_cluster, m);
+			plo->st.map_lockouts++;
+			list_add_tail(&preq->list, &m->io_queue);
+			err = 1;
+		}
+	}
+
+out:
+	spin_unlock_irq(&plo->lock);
+	return err;
+}
+
+void ploop_update_map(struct ploop_map * map, int level,
+		      cluster_t block, iblock_t iblk)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t *p;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, block);
+	if (!m || !test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		goto out;
+
+	p = (map_index_t *)page_address(m->page);
+	idx = (block  + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+	if (p[idx]) {
+		int lvl = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+		if (lvl == level)
+			p[idx] = iblk << ploop_map_log(map->plo);
+		else if (lvl < level)
+			printk("Unexpected condition: uptodate map_node %p "
+			       "covering range %u..%u maps %u to %u on level "
+			       "%d, while user-space merge detected mapping "
+			       "on level %d\n", m, m->mn_start, m->mn_end,
+			       block, p[idx] >> map->plo->cluster_log, lvl,
+			       level);
+	}
+out:
+	spin_unlock_irq(&map->plo->lock);
+}
+
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size)
+{
+	struct map_node * m;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, 0);
+	if (m && test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		memcpy(page_address(m->page), hdr, hdr_size);
+
+	spin_unlock_irq(&map->plo->lock);
+}
+EXPORT_SYMBOL(ploop_update_map_hdr);
+
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	map->last_activity = jiffies;
+
+	m = preq->trans_map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			struct ploop_delta * mdelta = map_top_delta(map);
+			sector_t sec;
+			if (mdelta->ops->map_index(mdelta, block, &sec) == 0)
+				return 0;
+
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->trans_map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+/* Find mapping for this request. The mapping may not be uptodate. */
+
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags))
+		return 0;
+
+	map->last_activity = jiffies;
+
+	m = preq->map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+
+/* Blank out entries which refer to another delta.
+ * _XXX_ it takes a little more brain stress to detect the case when we do
+ * not have such entries. Also, the copy cries for an optimization.
+ */
+
+static void copy_index_for_wb(struct page * page, struct map_node * m, int level)
+{
+	int i;
+	u32 * s = page_address(m->page);
+	u32 * d = page_address(page);
+	int skip = 0;
+
+	if (m->mn_start == 0) {
+		skip = PLOOP_MAP_OFFSET;
+		memcpy(d, s, skip * sizeof(u32));
+	}
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (level != (m->levels ? m->levels[i] : MAP_LEVEL(m)))
+			d[i] = 0;
+		else
+			d[i] = s[i];
+	}
+}
+
+
+void ploop_index_wb_proceed(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	struct page * page = preq->sinfo.wi.tpage;
+	unsigned long rw = preq->req_index_update_rw;
+	sector_t sec;
+
+	preq->eng_state = PLOOP_E_INDEX_WB;
+
+	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
+
+	__TRACE("wbi-proceed %p %u %p\n", preq, preq->req_cluster, m);
+	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
+
+	put_page(page);
+}
+
+static void ploop_index_wb_proceed_or_delay(struct ploop_request * preq,
+					    int do_fsync_if_delayed)
+{
+	if (do_fsync_if_delayed) {
+		struct map_node * m = preq->map;
+		struct ploop_delta * top_delta = map_top_delta(m->parent);
+		struct ploop_io * top_io = &top_delta->io;
+
+		if (test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state)) {
+			preq->eng_state = PLOOP_E_FSYNC_PENDED;
+			ploop_add_req_to_fsync_queue(preq);
+			return;
+		}
+	}
+
+	ploop_index_wb_proceed(preq);
+}
+
+/* Data write is committed. Now we need to update the index. */
+
+void ploop_index_update(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	u32 idx;
+	map_index_t blk;
+	int old_level;
+	struct page * page;
+	unsigned long state = READ_ONCE(preq->state);
+	int do_fsync_if_delayed = 0;
+
+	/* No way back, we are going to initiate index write. */
+
+	idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx]  >> ploop_map_log(plo);
+	old_level = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+	if (top_delta->level != old_level) {
+		if (m->levels == NULL) {
+			u8 * levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+			if (levels == NULL)
+				goto enomem;
+			memset(levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			m->levels = levels;
+		}
+	}
+
+	BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state) && preq->iblock);
+	if (test_bit(PLOOP_REQ_ZERO, &preq->state) && !blk) {
+		printk("Either map_node is corrupted or bug in "
+		       "ploop-balloon (%u)\n", preq->req_cluster);
+		PLOOP_REQ_SET_ERROR(preq, -EIO);
+		goto corrupted;
+	}
+
+	if (blk == preq->iblock && top_delta->level == old_level)
+		goto out;
+
+	if (test_and_set_bit(PLOOP_MAP_WRITEBACK, &m->state)) {
+		preq->eng_state = PLOOP_E_INDEX_DELAY;
+		list_add_tail(&preq->list, &m->io_queue);
+		__TRACE("d %p %u %p\n", preq, preq->req_cluster, m);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page == NULL) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		goto enomem;
+	}
+
+	copy_index_for_wb(page, m, top_delta->level);
+
+	((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+	get_page(page);
+	preq->sinfo.wi.tpage = page;
+
+	__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
+	plo->st.map_single_writes++;
+
+	preq->req_index_update_rw = (preq->req_rw & (REQ_FUA | REQ_FLUSH));
+
+	/* We've just set REQ_FLUSH in rw, ->write_page() below
+	   will do the FLUSH */
+	preq->req_rw &= ~REQ_FLUSH;
+
+	/* Relocate requires consistent index update */
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
+		preq->req_index_update_rw |= (REQ_FLUSH | REQ_FUA);
+		do_fsync_if_delayed = 1;
+	}
+
+	ploop_index_wb_proceed_or_delay(preq, do_fsync_if_delayed);
+	return;
+
+enomem:
+	PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+corrupted:
+	set_bit(PLOOP_S_ABORT, &plo->state);
+out:
+	preq->eng_state = PLOOP_E_COMPLETE;
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+}
+EXPORT_SYMBOL(ploop_index_update);
+
+
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec)
+{
+	return delta->ops->map_index(delta, preq->map->mn_start, sec);
+}
+EXPORT_SYMBOL(map_index);
+
+struct ploop_delta * map_writable_delta(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (m == NULL)
+		return ploop_top_delta(preq->plo);
+	else
+		return map_top_delta(m->parent);
+}
+EXPORT_SYMBOL(map_writable_delta);
+
+static void map_idx_swap(struct map_node *m, unsigned int idx,
+			 iblock_t *iblk, int log)
+{
+	iblock_t iblk2 = ((map_index_t*)page_address(m->page))[idx] >> log;
+	((map_index_t*)page_address(m->page))[idx] = *iblk << log;
+	*iblk = iblk2;
+}
+
+static inline void requeue_req(struct ploop_request *preq,
+			       unsigned long new_eng_state)
+{
+	preq->eng_state = new_eng_state;
+	spin_lock_irq(&preq->plo->lock);
+	list_del(&preq->list);
+	list_add_tail(&preq->list, &preq->plo->ready_queue);
+	spin_unlock_irq(&preq->plo->lock);
+}
+
+/*
+ * Index write-back for given preq happened, map_wb_complete()
+ * found preq in m->io_queue in PLOOP_E_INDEX_WB eng_state and
+ * updated in-core page of L2-table with preq->iblock. Now, it's
+ * time to either finalize preq (main case) setting eng_state to
+ * PLOOP_E_COMPLETE or process it further (RELOC_[A|S] case)
+ */
+static void map_wb_complete_post_process(struct ploop_map *map,
+					 struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo       = map->plo;
+
+	if (likely(err ||
+		   (!test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
+		    !test_bit(PLOOP_REQ_RELOC_S, &preq->state)))) {
+
+		requeue_req(preq, PLOOP_E_COMPLETE);
+		return;
+	}
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		spin_lock_irq(&plo->lock);
+		del_lockout(preq);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+
+		requeue_req(preq, PLOOP_E_RELOC_COMPLETE);
+		return;
+	}
+
+	BUG_ON (!test_bit(PLOOP_REQ_RELOC_A, &preq->state));
+	BUG_ON (!preq->aux_bio);
+
+	if (++plo->grow_relocated > plo->grow_end - plo->grow_start) {
+		requeue_req(preq, PLOOP_E_COMPLETE);
+		return;
+	}
+
+	del_lockout(preq);
+	preq->req_cluster++;
+	requeue_req(preq, PLOOP_E_ENTRY);
+}
+
+static void map_wb_complete(struct map_node * m, int err)
+{
+	struct ploop_device * plo = m->parent->plo;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	struct list_head * cursor, * tmp;
+	struct ploop_request * main_preq;
+	struct page * page = NULL;
+	int delayed = 0;
+	unsigned int idx;
+	unsigned long rw;
+	int do_fsync_if_delayed = 0;
+
+	/* First, complete processing of written back indices,
+	 * finally instantiate indices in mapping cache.
+	 */
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_ENTRY:
+			break;
+		case PLOOP_E_INDEX_WB:
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+			if (!err) {
+				struct ploop_request *pr = preq;
+				int do_levels_update = 0;
+
+				if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+					BUG_ON (list_empty(&preq->delay_list));
+					pr = list_first_entry(&preq->delay_list,
+							      struct ploop_request,
+							      list);
+				}
+
+				if (m->levels &&  m->levels[idx] != top_delta->level) {
+					spin_lock_irq(&plo->lock);
+					do_levels_update = 1;
+				}
+
+				if (unlikely(test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+					     test_bit(PLOOP_REQ_ZERO, &preq->state)))
+					map_idx_swap(m, idx, &pr->iblock,
+						     ploop_map_log(plo));
+				else
+					((map_index_t*)page_address(m->page))[idx] =
+						pr->iblock << ploop_map_log(plo);
+
+				if (m->levels) {
+					if (do_levels_update) {
+						m->levels[idx] = top_delta->level;
+						spin_unlock_irq(&plo->lock);
+					}
+				} else {
+					BUG_ON(MAP_LEVEL(m) != top_delta->level);
+				}
+			} else {
+				PLOOP_REQ_SET_ERROR(preq, err);
+			}
+			put_page(preq->sinfo.wi.tpage);
+			preq->sinfo.wi.tpage = NULL;
+			map_wb_complete_post_process(m->parent, preq, err);
+			break;
+		case PLOOP_E_INDEX_DELAY:
+			if (err) {
+				PLOOP_REQ_SET_ERROR(preq, err);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &preq->plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+			} else {
+				delayed++;
+			}
+			break;
+		}
+	}
+
+	if (!delayed) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page)
+		copy_index_for_wb(page, m, top_delta->level);
+
+	main_preq = NULL;
+	rw = 0;
+
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+		unsigned long state;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_INDEX_DELAY:
+			if (page == NULL) {
+				PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+				break;
+			}
+
+			rw |= (preq->req_rw & (REQ_FLUSH | REQ_FUA));
+
+			/* We've just set REQ_FLUSH in rw, ->write_page() below
+			   will do the FLUSH */
+			preq->req_rw &= ~REQ_FLUSH;
+
+			state = READ_ONCE(preq->state);
+			/* Relocate requires consistent index update */
+			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
+				rw |= (REQ_FLUSH | REQ_FUA);
+				do_fsync_if_delayed = 1;
+			}
+
+			preq->eng_state = PLOOP_E_INDEX_WB;
+			get_page(page);
+			preq->sinfo.wi.tpage = page;
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+			((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+			if (!main_preq) {
+				main_preq = preq;
+				list_del_init(&main_preq->list);
+			}
+			plo->st.map_multi_updates++;
+		}
+	}
+
+	if (!page) {
+		/* Writes are discarded */
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	__TRACE("wbi2 %p %u %p\n", main_preq, main_preq->req_cluster, m);
+	plo->st.map_multi_writes++;
+
+	main_preq->req_index_update_rw = rw;
+	ploop_index_wb_proceed_or_delay(main_preq, do_fsync_if_delayed);
+}
+
+void
+ploop_index_wb_complete(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &m->io_queue);
+	spin_unlock_irq(&plo->lock);
+
+	map_wb_complete(m, preq->error);
+}
+
+void ploop_map_start(struct ploop_map * map, u64 bd_size)
+{
+	struct ploop_device * plo = map->plo;
+
+	map->max_index = (bd_size + (1 << plo->cluster_log) - 1 ) >> plo->cluster_log;
+	map->flags = 0;
+}
+
+
+static void map_wait(struct ploop_map * map)
+{
+	DEFINE_WAIT(_wait);
+	prepare_to_wait(&map->destroy_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	io_schedule();
+	spin_lock_irq(&map->plo->lock);
+	spin_lock(&map_lru_lock);
+
+	finish_wait(&map->destroy_waitq, &_wait);
+}
+
+void ploop_map_destroy(struct ploop_map * map)
+{
+	int i;
+	struct rb_node * node;
+
+	spin_lock_irq(&map->plo->lock);
+	set_bit(PLOOP_MAP_DEAD, &map->flags);
+
+	for (i = 0; i < map->lru_buffer_ptr; i++)
+		atomic_dec(&map->lru_buffer[i]->refcnt);
+
+	map->lru_buffer_ptr = 0;
+
+	spin_lock(&map_lru_lock);
+	while ((node = map->rb_root.rb_node) != NULL) {
+		struct map_node * m = rb_entry(node, struct map_node, rb_link);
+		/* refcnt can be non-zero if and only if this node is grabbed
+		 * by map_lru_scan() and is in flight between releasing
+		 * map_lru_lock and taking plo->lock. We can skip this entry
+		 * since it will be destroyed by map_lru_scan(), because we
+		 * set PLOOP_MAP_DEAD.
+		 */
+		if (atomic_read(&m->refcnt) == 0)
+			map_node_destroy(m);
+		else
+			map_wait(map);
+	}
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	BUG_ON(map->pages);
+}
+
+void ploop_map_remove_delta(struct ploop_map * map, int level)
+{
+	/* For now. */
+	ploop_map_destroy(map);
+}
+
+
+int __init ploop_map_init(void)
+{
+	ploop_map_cache = kmem_cache_create("ploop_map",
+						sizeof(struct map_node), 0,
+						SLAB_MEM_SPREAD, NULL
+						);
+	if (!ploop_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ploop_map_exit(void)
+{
+	if (ploop_map_cache)
+		kmem_cache_destroy(ploop_map_cache);
+}
--- /dev/null
+++ b/drivers/block/ploop/ploop1_image.h
@@ -0,0 +1,429 @@
+/*
+ *  drivers/block/ploop/ploop1_image.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __PLOOP1_IMAGE_H__
+#define __PLOOP1_IMAGE_H__ 1
+
+/* Definition of PVD (Parallels Virtual Disk) format
+ *
+ * 1. All the data are in little-endian format.
+ * 2. All the data except for the first cluster are aligned and padded
+ *    to size of cluster. First cluster is exception - it combines
+ *    PVD header (first 64 bytes of the cluster) with L2 index table
+ *    (L2 index table is an array of indices of blocks)
+ * 3. Image size must be a multiple of the cluster size. If it is not,
+ *    we assume it is the result of an image extension that failed in the
+ *    middle of a transaction, therefore new allocations start at the
+ *    size rounded down to the cluster size.
+ * 4. Update of indices must be done only after data clusters
+ *    are committed to reliable storage. If we fail to update index,
+ *    we can get an unused and, maybe, uninitialized or partially
+ *    initialized data cluster. It is lost, forgotten and ignored
+ *    until repair or image rebuild.
+ */
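+
+/*
+ * A rough sketch of the resulting layout (the length of the L2 index table
+ * depends on the image size, see GetHeaderSize() below):
+ *
+ *   clusters 0..k  : 64-byte PVD header immediately followed by the L2
+ *                    index table, padded up to a cluster boundary
+ *   clusters k+1.. : data clusters, in allocation order
+ */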
+
+/*
+ * copy/paste of IMAGE_PARAMETERS from DiskImageComp.h
+ */
+#pragma pack(push,1)
+struct ploop_pvd_header
+{
+	__u8  m_Sig[16];          /* Signature */
+	__u32 m_Type;             /* Disk type */
+	__u32 m_Heads;            /* heads count */
+	__u32 m_Cylinders;        /* tracks count */
+	__u32 m_Sectors;          /* Sectors per track count */
+	__u32 m_Size;             /* Size of disk in tracks */
+	union {                   /* Size of disk in 512-byte sectors */
+		struct {
+			__u32 m_SizeInSectors_v1;
+			__u32 Unused;
+		};
+		__u64 m_SizeInSectors_v2;
+	};
+	__u32 m_DiskInUse;        /* Disk in use */
+	__u32 m_FirstBlockOffset; /* First data block offset (in sectors) */
+	__u32 m_Flags;            /* Misc flags */
+	__u8  m_Reserved[8];      /* Reserved */
+};
+#pragma pack(pop)
+
+/* Compressed disk (version 1) */
+#define PRL_IMAGE_COMPRESSED            2
+
+/* Compressed disk v1 signature */
+#define SIGNATURE_STRUCTURED_DISK_V1 "WithoutFreeSpace"
+
+/* Compressed disk v2 signature */
+#define SIGNATURE_STRUCTURED_DISK_V2 "WithouFreSpacExt"
+
+/* Signature indicating the disk is in the "in use" state */
+#define SIGNATURE_DISK_IN_USE		0x746F6E59
+
+/* Disk was closed by software which conformed to specification 2.0 */
+#define SIGNATURE_DISK_CLOSED_V20	0x0
+
+/* Disk was closed by software which conformed to specification 2.1 */
+#define SIGNATURE_DISK_CLOSED_V21	0x312e3276
+
+/**
+ * Compressed disk image flags
+ */
+#define	CIF_NoFlags		0x00000000 /* No flags */
+#define	CIF_Empty		0x00000001 /* No data was written */
+#define	CIF_Invalid		0xFFFFFFFF /* Invalid flag */
+
+
+#define PLOOP1_SECTOR_LOG	9
+#define PLOOP1_DEF_CLUSTER_LOG	9 /* 256K cluster-block */
+#define CLUSTER (1UL << (PLOOP1_DEF_CLUSTER_LOG + PLOOP1_SECTOR_LOG))
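+
+/*
+ * Worked out: with PLOOP1_DEF_CLUSTER_LOG == 9 a cluster is
+ * 2^9 sectors * 512 bytes/sector = 256 KiB, matching the comment above.
+ */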
+
+/* Helpers to generate PVD-header based on requested bdsize */
+
+#define DEFAULT_HEADS_COUNT   16
+#define DEFAULT_SECTORS_COUNT 63
+#define SECTOR_SIZE (1 << 9)
+
+struct CHSData
+{
+	__u32 Sectors;
+	__u32 Heads;
+	__u32 Cylinders;
+};
+
+#ifdef __KERNEL__
+# define ploop_do_div(n, base) do_div(n, base)
+#else
+# define ploop_do_div(n, base) ({		\
+	__u32 __rem = n % base;			\
+	n /= base;				\
+	__rem;					\
+ })
+#endif
+/*
+ * Try to compute the disk's sectors-per-track value
+ */
+static inline __u32
+CalcSectors(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine sector count */
+	if (!ploop_do_div(size, DEFAULT_SECTORS_COUNT))
+		return DEFAULT_SECTORS_COUNT;
+
+	if (!(uiSize % 32))
+		return 32;
+
+	if (!(uiSize % 16))
+		return 16;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	return ~0;
+}
+
+/*
+ * Try to compute the disk's heads count
+ */
+static inline __u32
+CalcHeads(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine heads count */
+	if (!ploop_do_div(size, DEFAULT_HEADS_COUNT))
+		return DEFAULT_HEADS_COUNT;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	if (!(uiSize % 4))
+		return 4;
+
+	if (!(uiSize % 2))
+		return 2;
+
+	return ~0;
+}
+
+/*
+ * Convert size to CHS for disks from 504 MB to 8 GB
+ */
+static inline void
+ConvertToCHSLow(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = DEFAULT_SECTORS_COUNT;
+	chs->Heads = DEFAULT_HEADS_COUNT;
+	ploop_do_div(From, DEFAULT_SECTORS_COUNT * DEFAULT_HEADS_COUNT);
+	chs->Cylinders = From;
+}
+
+/*
+ * Convert size to pure LBA config
+ */
+static inline void
+ConvertToPureLBA(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = 1;
+	chs->Heads = 1;
+	chs->Cylinders = From;
+}
+
+static inline void
+ConvertToCHS(__u64 From, struct CHSData *chs)
+{
+	__u64 Size;
+
+	/*
+	 * According to ATA2 specs:
+	 *  - If the device is above 1,032,192 sectors then the value should be 63.
+	 *    This value does not exceed 63 (3Fh). But note, that if device size
+	 *    above 16,777,216 the HDD reports proper 'magic' number in CHS values,
+	 *    so the situation in the middle must be handled separately
+	 */
+	if ((From > 1032192) && (From < 16777216))
+	{
+		ConvertToCHSLow(From, chs);
+		return;
+	}
+
+	Size = From;
+
+	/* Store size */
+	chs->Sectors = CalcSectors(Size);
+
+	if (chs->Sectors == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Sectors);
+
+	chs->Heads = CalcHeads(Size);
+
+	if (chs->Heads == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Heads);
+
+	chs->Cylinders = Size;
+
+	return;
+
+PureLBA:
+	ConvertToPureLBA(From, chs);
+}
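+
+/*
+ * A worked example: for a 1 GiB disk, From == 2097152 sectors, which lies
+ * between 1032192 and 16777216, so ConvertToCHSLow() is taken:
+ * Sectors == 63, Heads == 16, Cylinders == 2097152 / (63 * 16) == 2080.
+ */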
+
+static inline __u32
+GetHeaderSize(__u32 m_Size)
+{
+	__u32 Size = sizeof(struct ploop_pvd_header);
+
+	/* Add BAT */
+	Size += m_Size * sizeof(__u32);
+	/* Align to size of sector */
+	Size = (Size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE - 1);
+
+	return Size;
+}
+
+static inline char *
+ploop1_signature(int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return SIGNATURE_STRUCTURED_DISK_V1;
+	case PLOOP_FMT_V2:
+		return SIGNATURE_STRUCTURED_DISK_V2;
+#ifdef __KERNEL__
+	default:
+		BUG();
+#endif
+	}
+
+	return NULL;
+}
+
+static inline int
+ploop1_version(struct ploop_pvd_header *vh)
+{
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V1, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V1;
+
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V2, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V2;
+
+	return -1;
+}
+
+static inline __u64
+ploop1_max_size(__u32 blocksize, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return (__u32)-1;
+	case PLOOP_FMT_V2:
+		return 0xffffffffUL * blocksize;
+	}
+
+	return 0;
+}
+
+#ifdef __KERNEL__
+static inline u64
+get_SizeInSectors_from_le(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return le32_to_cpu(vh->m_SizeInSectors_v1);
+	case PLOOP_FMT_V2:
+		return le64_to_cpu(vh->m_SizeInSectors_v2);
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static inline void
+put_SizeInSectors(u64 SizeInSectors, struct ploop_pvd_header *vh,
+		  int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = SizeInSectors;
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = SizeInSectors;
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void
+cpu_to_le_SizeInSectors(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = cpu_to_le32(vh->m_SizeInSectors_v1);
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = cpu_to_le64(vh->m_SizeInSectors_v2);
+		break;
+	default:
+		BUG();
+	}
+}
+#endif
+
+/*
+ * Returns: "size to fill" (in bytes)
+ *
+ * NB: m_Flags and m_DiskInUse are being kept as is; our caller
+ * should take care of them.
+ *
+ * NB: Both bdsize and blocksize are measured in sectors.
+ */
+static inline __u32
+generate_pvd_header(struct ploop_pvd_header *vh, __u64 bdsize, __u32 blocksize,
+		    int version)
+{
+	struct CHSData chs;
+	__u32 SizeToFill;
+	__u32 uiAlignmentSize;
+	__u64 SizeInSectors;
+
+	memcpy(vh->m_Sig, ploop1_signature(version) , sizeof(vh->m_Sig));
+	vh->m_Type = PRL_IMAGE_COMPRESSED;
+
+	/* Round up to block size */
+	SizeInSectors = bdsize + blocksize - 1;
+	ploop_do_div(SizeInSectors, blocksize);
+	SizeInSectors *= blocksize;
+	put_SizeInSectors(SizeInSectors, vh, version);
+
+	ConvertToCHS(SizeInSectors, &chs);
+
+	vh->m_Sectors = blocksize;
+	vh->m_Heads = chs.Heads;
+	vh->m_Cylinders = chs.Cylinders;
+
+	ploop_do_div(SizeInSectors, blocksize);
+	vh->m_Size = SizeInSectors;
+
+	uiAlignmentSize = blocksize << 9;
+	SizeToFill = GetHeaderSize(vh->m_Size);
+	/* Align to block size */
+	if (SizeToFill % uiAlignmentSize)
+		SizeToFill += uiAlignmentSize - (SizeToFill % uiAlignmentSize);
+
+	vh->m_FirstBlockOffset = SizeToFill >> 9;
+
+	return SizeToFill;
+}
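+
+/*
+ * A worked (non-normative) example: bdsize == 2097152 sectors (1 GiB) and
+ * blocksize == 512 sectors (256 KiB clusters) give:
+ *   SizeInSectors      == 2097152 (already block-aligned)
+ *   m_Size             == 4096 clusters
+ *   GetHeaderSize      == 64 + 4096 * 4 == 16448, sector-aligned to 16896
+ *   SizeToFill         == 262144 (16896 rounded up to the 256 KiB block)
+ *   m_FirstBlockOffset == 512 sectors
+ */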
+
+static inline bool pvd_header_is_disk_in_use(struct ploop_pvd_header *vh)
+{
+	return (vh->m_DiskInUse == cpu_to_le32(SIGNATURE_DISK_IN_USE)) ?
+		true : false;
+}
+
+static inline void pvd_header_set_disk_in_use(struct ploop_pvd_header *vh)
+{
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
+}
+
+static inline void pvd_header_set_disk_closed(struct ploop_pvd_header *vh)
+{
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_CLOSED_V20);
+}
+
+/* Translation of sector number to offset in image */
+
+#if 0
+
+/* These functions are not really used */
+
+/* Calculate virtual cluster number from virtual sector number */
+
+static inline __u32
+ploop1_cluster(struct ploop_img_header * info, __u64 sector)
+{
+	return sector >> info->cluster_log;
+}
+
+/* Get the number of clusters covered by one L2 table, 32K by default,
+ * which can map 4G of data
+ */
+static inline __u32
+ploop1_clusters_per_l2(struct ploop_img_header * info)
+{
+	return 1 << (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L1 table mapping a cluster. */
+
+static inline __u32
+ploop1_l1_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster >> (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L2 table mapping a cluster. */
+
+static inline __u32
+ploop1_l2_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster & (ploop1_clusters_per_l2(info) - 1);
+}
+
+/* That's all, simple and stupid */
+
+#endif
+
+#endif /* __PLOOP1_IMAGE_H__ */
--- /dev/null
+++ b/drivers/block/ploop/ploop_events.c
@@ -0,0 +1,16 @@
+/*
+ *  drivers/block/ploop/ploop_events.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#define CREATE_TRACE_POINTS
+#include "ploop_events.h"
+
+EXPORT_TRACEPOINT_SYMBOL(submit);
+EXPORT_TRACEPOINT_SYMBOL(submit_alloc);
+EXPORT_TRACEPOINT_SYMBOL(cached_submit);
--- /dev/null
+++ b/drivers/block/ploop/ploop_events.h
@@ -0,0 +1,100 @@
+/*
+ *  drivers/block/ploop/ploop_events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_PLOOP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PLOOP_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "events.h"
+
+DEFINE_EVENT(preq_template, submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, submit_alloc,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, cached_submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, complete_request,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, req_state_process,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, bio_queue,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, add_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, del_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+TRACE_EVENT(preq_lockout,
+	TP_PROTO(struct ploop_request *preq,
+		struct ploop_request *ppreq),
+
+	TP_ARGS(preq, ppreq),
+
+	TP_STRUCT__entry(
+		__field(void *,		ppreq)
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->ppreq		= ppreq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk("ppreq=%p "PREQ_FORMAT, __entry->ppreq, PREQ_ARGS)
+);
+
+DEFINE_EVENT(bio_template, make_request,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+DEFINE_EVENT(bio_template, bio_fast_map,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+#endif /* _TRACE_PLOOP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE ploop_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+++ b/drivers/block/ploop/push_backup.c
@@ -0,0 +1,1106 @@
+/*
+ *  drivers/block/ploop/push_backup.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "push_backup.h"
+
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE  (1UL << (PAGE_SHIFT + 3))
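+
+/*
+ * Illustration (assuming 4 KiB pages): BITS_PER_PAGE == 32768, i.e. one
+ * bitmap page tracks 32768 clusters; with 256 KiB clusters a 1 TiB device
+ * has 4194304 clusters and thus needs NR_PAGES(4194304) == 128 pages.
+ */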
+
+struct pb_set {
+	struct rb_root	   tree;
+	struct list_head   list;
+	struct timer_list  timer;
+	char		  *name;
+	struct ploop_pushbackup_desc *pbd; /* points to parent pbd */
+};
+
+enum {
+	PLOOP_PB_ALIVE,
+	PLOOP_PB_STOPPING,
+	PLOOP_PB_DEAD,
+};
+
+struct ploop_pushbackup_desc {
+	struct ploop_device *plo;
+	struct page **cbt_map; /* a 'snapshot' copy of CBT mask */
+	blkcnt_t cbt_block_max;
+	blkcnt_t cbt_block_bits;
+	__u8 	 cbt_uuid[16];
+
+	struct page **ppb_map; /* Ploop Push Backup mask */
+	struct page **reported_map; /* what userspace reported as backed up */
+	cluster_t ppb_block_max; /* first invalid index in ppb_map */
+
+	spinlock_t	      ppb_lock;
+	struct completion     ppb_comp;
+	bool                  ppb_waiting;
+
+	struct pb_set	      pending_set;
+	struct pb_set	      reported_set;
+
+	struct bio_list	      bio_pending_list;
+
+	struct task_struct   *health_monitor_thread;
+	wait_queue_head_t     ppb_waitq;
+	int		      ppb_state; /* see enum above */
+};
+
+int ploop_pb_check_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid)
+{
+	if (memcmp(pbd->cbt_uuid, uuid, sizeof(pbd->cbt_uuid)))
+		return -1;
+	return 0;
+}
+
+int ploop_pb_get_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid)
+{
+	if (!pbd)
+		return -1;
+
+	memcpy(uuid, pbd->cbt_uuid, sizeof(pbd->cbt_uuid));
+	return 0;
+}
+
+static struct page **ploop_pb_map_alloc(unsigned long block_max)
+{
+	unsigned long npages = NR_PAGES(block_max);
+	struct page **map = vmalloc(npages * sizeof(void *));
+	unsigned long i;
+
+	if (!map)
+		return NULL;
+
+	memset(map, 0, npages * sizeof(void *));
+
+	for (i = 0; i < npages; i++) {
+		map[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!map[i]) {
+			/* i is unsigned, so "--i >= 0" would never terminate */
+			while (i--)
+				__free_page(map[i]);
+			vfree(map);
+			return NULL;
+		}
+	}
+
+	return map;
+}
+
+static void ploop_pb_map_free(struct page **map, unsigned long block_max)
+{
+	if (map) {
+		unsigned long i;
+		for (i = 0; i < NR_PAGES(block_max); i++)
+			if (map[i])
+				__free_page(map[i]);
+
+		vfree(map);
+	}
+}
+
+int ploop_pb_cbt_map_release(struct ploop_pushbackup_desc *pbd, bool do_merge)
+{
+	int ret = 0;
+
+	if (pbd->cbt_map == NULL)
+		return 0;
+
+	if (do_merge) {
+		ret = blk_cbt_map_merge(pbd->plo->queue,
+					pbd->cbt_uuid,
+					pbd->cbt_map,
+					pbd->cbt_block_max,
+					pbd->cbt_block_bits);
+		if (ret)
+			printk("ploop(%d): blk_cbt_map_merge() failed with "
+			       "%d\n", pbd->plo->index, ret);
+	}
+
+	ploop_pb_map_free(pbd->cbt_map, pbd->cbt_block_max);
+	pbd->cbt_map = NULL;
+	return ret;
+}
+
+static void ploop_pb_timeout_func(unsigned long data);
+
+static void ploop_pbs_init(struct pb_set *pbs,
+		struct ploop_pushbackup_desc *pbd, char *name)
+{
+	pbs->pbd = pbd;
+	pbs->name = name;
+	pbs->tree = RB_ROOT;
+	INIT_LIST_HEAD(&pbs->list);
+
+	init_timer(&pbs->timer);
+	pbs->timer.function = ploop_pb_timeout_func;
+	pbs->timer.data = (unsigned long)pbs;
+}
+
+static void ploop_pbs_fini(struct pb_set *pbs)
+{
+	del_timer_sync(&pbs->timer);
+}
+
+struct ploop_pushbackup_desc *ploop_pb_alloc(struct ploop_device *plo)
+{
+	struct ploop_pushbackup_desc *pbd;
+
+	pbd = kzalloc(sizeof(struct ploop_pushbackup_desc), GFP_KERNEL);
+	if (pbd == NULL)
+		return NULL;
+
+	pbd->ppb_block_max = (plo->bd_size + (1 << plo->cluster_log) - 1)
+		>> plo->cluster_log;
+
+	pbd->ppb_map = ploop_pb_map_alloc(pbd->ppb_block_max);
+	if (!pbd->ppb_map) {
+		kfree(pbd);
+		return NULL;
+	}
+
+	pbd->reported_map = ploop_pb_map_alloc(pbd->ppb_block_max);
+	if (!pbd->reported_map) {
+		ploop_pb_map_free(pbd->ppb_map, pbd->ppb_block_max);
+		kfree(pbd);
+		return NULL;
+	}
+
+	spin_lock_init(&pbd->ppb_lock);
+	init_completion(&pbd->ppb_comp);
+	ploop_pbs_init(&pbd->pending_set, pbd, "pending");
+	ploop_pbs_init(&pbd->reported_set, pbd, "reported");
+	init_waitqueue_head(&pbd->ppb_waitq);
+	bio_list_init(&pbd->bio_pending_list);
+	pbd->plo = plo;
+
+	return pbd;
+}
+
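+/*
+ * Find the first set bit in the paged bitmap 'map' at position >= *blk_p.
+ * On success the absolute bit number is stored in *blk_p and 0 is returned;
+ * -1 means no set bit was found before map_max.
+ */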
+static int find_first_blk_in_map(struct page **map, u64 map_max, u64 *blk_p)
+{
+	u64 blk = *blk_p;
+	unsigned long idx = blk >> (PAGE_SHIFT + 3);
+
+	while (blk < map_max) {
+		unsigned long off = blk & (BITS_PER_PAGE -1);
+		unsigned long next_bit;
+		struct page *page = map[idx];
+
+		if (!page)
+			goto next;
+
+		next_bit = find_next_bit(page_address(page), BITS_PER_PAGE, off);
+		if (next_bit != BITS_PER_PAGE) {
+			*blk_p = ((u64)idx << (PAGE_SHIFT + 3)) + next_bit;
+			return 0;
+		}
+
+	next:
+		idx++;
+		blk = (u64)idx << (PAGE_SHIFT + 3);
+	}
+
+	return -1;
+}
+
+enum {
+	SET_BIT,
+	CLEAR_BIT,
+	CHECK_BIT,
+};
+
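+/* Set, clear or test a single bit in a paged bitmap; 'blk' must be < map_max */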
+static bool do_bit_in_map(struct page **map, u64 map_max, u64 blk, int action)
+{
+	unsigned long idx = blk >> (PAGE_SHIFT + 3);
+	unsigned long off = blk & (BITS_PER_PAGE -1);
+	struct page *page = map[idx];
+
+	BUG_ON(blk >= map_max);
+
+	switch (action) {
+	case SET_BIT:
+		__set_bit(off, page_address(page));
+		break;
+	case CLEAR_BIT:
+		__clear_bit(off, page_address(page));
+		break;
+	case CHECK_BIT:
+		return test_bit(off, page_address(page));
+	default:
+		BUG();
+	}
+
+	return false;
+}
+
+static void set_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	do_bit_in_map(map, map_max, blk, SET_BIT);
+}
+
+static void clear_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	do_bit_in_map(map, map_max, blk, CLEAR_BIT);
+}
+
+static bool check_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	return do_bit_in_map(map, map_max, blk, CHECK_BIT);
+}
+
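+/*
+ * Set 'cnt' consecutive bits starting at 'blk', using whole 32-bit stores
+ * where the range is aligned. Out-of-range extents are reported and ignored.
+ */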
+static void set_bits_in_map(struct page **map, u64 map_max, u64 blk, u64 cnt)
+{
+	if (blk + cnt > map_max) {
+		printk("set_bits_in_map: extent [%llu, %llu) is out of range"
+		       " [0, %llu)\n", blk, blk + cnt, map_max);
+		return;
+	}
+
+	while (cnt) {
+		unsigned long idx = blk >> (PAGE_SHIFT + 3);
+		unsigned long off = blk & (BITS_PER_PAGE -1);
+		unsigned long len;
+		void *addr = page_address(map[idx]);
+
+		len = min_t(unsigned long, BITS_PER_PAGE - off, cnt);
+		cnt -= len;
+		blk += len;
+
+		while (len) {
+			if ((off & 31) == 0 && len >= 32) {
+				*(u32 *)(addr + (off >> 3)) = -1;
+				off += 32;
+				len -= 32;
+			} else {
+				__set_bit(off, addr);
+				off += 1;
+				len -= 1;
+			}
+		}
+	}
+}
+
+/* intentionally lockless */
+void ploop_pb_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu)
+{
+	BUG_ON(!pbd);
+	clear_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu);
+}
+
+/* intentionally lockless */
+bool ploop_pb_check_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu)
+{
+	if (!pbd)
+		return false;
+
+	return check_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu);
+}
+
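+/*
+ * Translate the CBT bitmap (cbt_block_bits granularity) into the push-backup
+ * bitmap (cluster granularity): any dirty CBT block marks the cluster that
+ * covers it.
+ */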
+static int convert_map_to_map(struct ploop_pushbackup_desc *pbd)
+{
+	struct page **from_map = pbd->cbt_map;
+	blkcnt_t from_max = pbd->cbt_block_max;
+	blkcnt_t from_bits = pbd->cbt_block_bits;
+
+	struct page **to_map = pbd->ppb_map;
+	cluster_t to_max = pbd->ppb_block_max;
+	int to_bits = pbd->plo->cluster_log + 9;
+
+	u64 from_blk, to_blk;
+
+	if ((u64)from_max << from_bits != (u64)to_max << to_bits) {
+		printk("mismatch in map convert: %lu %lu ---> %u %d\n",
+		       from_max, from_bits, to_max, to_bits);
+		return -EINVAL;
+	}
+
+	for (from_blk = 0; from_blk < from_max;
+	     from_blk = (++to_blk << to_bits) >> from_bits) {
+
+		if (find_first_blk_in_map(from_map, from_max, &from_blk))
+			break;
+
+		to_blk = (from_blk << from_bits) >> to_bits;
+		set_bit_in_map(to_map, to_max, to_blk);
+	}
+
+	return 0;
+
+}
+
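+/*
+ * Kernel thread: sleeps until ppb_state becomes PLOOP_PB_STOPPING (or the
+ * thread is asked to stop), then finalizes push backup via ploop_pb_stop().
+ */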
+static int ploop_pb_health_monitor(void * data)
+{
+	struct ploop_pushbackup_desc *pbd = data;
+	struct ploop_device	     *plo = pbd->plo;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	while (!kthread_should_stop() || pbd->ppb_state == PLOOP_PB_STOPPING) {
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&pbd->ppb_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (pbd->ppb_state == PLOOP_PB_STOPPING ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&pbd->ppb_lock);
+			schedule();
+			spin_lock_irq(&pbd->ppb_lock);
+		}
+		finish_wait(&pbd->ppb_waitq, &_wait);
+
+		if (pbd->ppb_state == PLOOP_PB_STOPPING) {
+			spin_unlock_irq(&pbd->ppb_lock);
+			mutex_lock(&plo->ctl_mutex);
+			ploop_pb_stop(pbd, true);
+			mutex_unlock(&plo->ctl_mutex);
+			spin_lock_irq(&pbd->ppb_lock);
+		}
+	}
+	spin_unlock_irq(&pbd->ppb_lock);
+	return 0;
+}
+
+int ploop_pb_init(struct ploop_pushbackup_desc *pbd, __u8 *uuid, bool full)
+{
+	struct task_struct *ts;
+
+	memcpy(pbd->cbt_uuid, uuid, sizeof(pbd->cbt_uuid));
+
+	if (full) {
+		int i, off;
+		for (i = 0; i < NR_PAGES(pbd->ppb_block_max); i++)
+			memset(page_address(pbd->ppb_map[i]), 0xff, PAGE_SIZE);
+
+		/* nullify bits beyond [0, pbd->ppb_block_max) range */
+		off = pbd->ppb_block_max & (BITS_PER_PAGE -1);
+		i = pbd->ppb_block_max >> (PAGE_SHIFT + 3);
+		while (off && off < BITS_PER_PAGE) {
+			__clear_bit(off, page_address(pbd->ppb_map[i]));
+			off++;
+		}
+	} else {
+		int rc = blk_cbt_map_copy_once(pbd->plo->queue,
+					       uuid,
+					       &pbd->cbt_map,
+					       &pbd->cbt_block_max,
+					       &pbd->cbt_block_bits);
+		if (rc)
+			return rc;
+
+		rc = convert_map_to_map(pbd);
+		if (rc)
+			return rc;
+	}
+
+	ts = kthread_create(ploop_pb_health_monitor, pbd, "ploop_pb_hm%d",
+			    pbd->plo->index);
+	if (IS_ERR(ts))
+		return PTR_ERR(ts);
+
+	pbd->health_monitor_thread = ts;
+	wake_up_process(ts);
+	return 0;
+}
+
+void ploop_pb_fini(struct ploop_pushbackup_desc *pbd)
+{
+	if (pbd == NULL)
+		return;
+
+	if (!RB_EMPTY_ROOT(&pbd->pending_set.tree))
+		printk("ploop_pb_fini: pending_tree is not empty!\n");
+	if (!RB_EMPTY_ROOT(&pbd->reported_set.tree))
+		printk("ploop_pb_fini: reported_tree is not empty!\n");
+
+	if (pbd->health_monitor_thread) {
+		kthread_stop(pbd->health_monitor_thread);
+		pbd->health_monitor_thread = NULL;
+	}
+
+	if (pbd->plo) {
+		struct ploop_device *plo = pbd->plo;
+		mutex_lock(&plo->sysfs_mutex);
+		plo->pbd = NULL;
+		mutex_unlock(&plo->sysfs_mutex);
+	}
+
+	ploop_pb_cbt_map_release(pbd, true);
+	ploop_pb_map_free(pbd->ppb_map, pbd->ppb_block_max);
+	ploop_pb_map_free(pbd->reported_map, pbd->ppb_block_max);
+
+	kfree(pbd);
+}
+
+int ploop_pb_copy_cbt_to_user(struct ploop_pushbackup_desc *pbd, char *user_addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < NR_PAGES(pbd->cbt_block_max); i++) {
+		struct page *page = pbd->cbt_map[i] ? : ZERO_PAGE(0);
+
+		if (copy_to_user(user_addr, page_address(page), PAGE_SIZE))
+			return -EFAULT;
+
+		user_addr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
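+/*
+ * Insert preq into the set: rb-tree keyed by req_cluster plus a FIFO list
+ * (oldest first). The timeout timer is armed when the first entry is added
+ * while push backup is alive.
+ */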
+static void ploop_pb_add_req_to_tree(struct ploop_request *preq,
+				     struct pb_set *pbs)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct rb_node ** p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+	unsigned long timeout = preq->plo->tune.push_backup_timeout * HZ;
+
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, reloc_link);
+		BUG_ON (preq->req_cluster == pr->req_cluster);
+
+		if (preq->req_cluster < pr->req_cluster)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	preq->tstamp = jiffies;
+	if (timeout && list_empty(&pbs->list) &&
+	    pbs->pbd->ppb_state == PLOOP_PB_ALIVE)
+		mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+
+	if (pbs->list.prev->next != &pbs->list) {
+		printk("list_add corruption. pbs->list.prev->next should be "
+		       "&pbs->list (%p), but was %p. (pbs->list.prev=%p)."
+		       " preq=%p\n",
+		       &pbs->list, pbs->list.prev->next, pbs->list.prev, preq);
+		BUG();
+	}
+	list_add_tail(&preq->list, &pbs->list);
+
+	rb_link_node(&preq->reloc_link, parent, p);
+	rb_insert_color(&preq->reloc_link, tree);
+}
+
+static void ploop_pb_add_req_to_pending(struct ploop_pushbackup_desc *pbd,
+					struct ploop_request *preq)
+{
+	ploop_pb_add_req_to_tree(preq, &pbd->pending_set);
+}
+
+static void ploop_pb_add_req_to_reported(struct ploop_pushbackup_desc *pbd,
+					 struct ploop_request *preq)
+{
+	ploop_pb_add_req_to_tree(preq, &pbd->reported_set);
+}
+
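+/*
+ * Remove preq from the set; if it was the oldest entry, re-arm the timer
+ * for the next oldest one.
+ */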
+static void remove_req_from_pbs(struct pb_set *pbs,
+					 struct ploop_request *preq)
+{
+	unsigned long timeout = preq->plo->tune.push_backup_timeout * HZ;
+	bool oldest_deleted = false;
+
+	if (preq == list_first_entry(&pbs->list, struct ploop_request, list))
+		oldest_deleted = true;
+
+	rb_erase(&preq->reloc_link, &pbs->tree);
+	list_del_init(&preq->list);
+
+	if (timeout && oldest_deleted && !list_empty(&pbs->list) &&
+	    pbs->pbd->ppb_state == PLOOP_PB_ALIVE) {
+		preq = list_first_entry(&pbs->list, struct ploop_request,
+					list);
+		mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+	}
+}
+
+
+static inline bool preq_match(struct ploop_request *preq, cluster_t clu,
+			      cluster_t len)
+{
+	return preq &&
+		clu <= preq->req_cluster &&
+		preq->req_cluster < clu + len;
+}
+
+/*
+ * Find and remove the leftmost preq with clu <= req_cluster < clu + len.
+ * *npreq is set to the next preq in the tree (or NULL); NULL is returned
+ * if no preq falls into the range.
+ */
+static struct ploop_request *ploop_pb_get_req_from_tree(struct pb_set *pbs,
+						cluster_t clu, cluster_t len,
+						struct ploop_request **npreq)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct rb_node *n = tree->rb_node;
+	struct ploop_request *p = NULL;
+
+	*npreq = NULL;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, reloc_link);
+
+		if (clu < p->req_cluster)
+			n = n->rb_left;
+		else if (clu > p->req_cluster)
+			n = n->rb_right;
+		else { /* perfect match */
+			n = rb_next(n);
+			if (n)
+				*npreq = rb_entry(n, struct ploop_request,
+						  reloc_link);
+			remove_req_from_pbs(pbs, p);
+			return p;
+		}
+	}
+	/* here p is not perfect, but it's closest */
+
+	if (p && p->req_cluster < clu) {
+		n = rb_next(&p->reloc_link);
+		if (n)
+			p = rb_entry(n, struct ploop_request, reloc_link);
+	}
+
+	if (preq_match(p, clu, len)) {
+		n = rb_next(&p->reloc_link);
+		if (n)
+			*npreq = rb_entry(n, struct ploop_request, reloc_link);
+		remove_req_from_pbs(pbs, p);
+		return p;
+	}
+
+	return NULL;
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_tree(struct pb_set *pbs,
+				 struct ploop_request **npreq)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct ploop_request *p;
+	struct rb_node *n = rb_first(tree);
+
+	if (!n)
+		return NULL;
+
+	if (npreq) {
+		struct rb_node *next = rb_next(n);
+		if (next)
+			*npreq = rb_entry(next, struct ploop_request,
+					  reloc_link);
+		else
+			*npreq = NULL;
+	}
+
+	p = rb_entry(n, struct ploop_request, reloc_link);
+	remove_req_from_pbs(pbs, p);
+	return p;
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_pending(struct ploop_pushbackup_desc *pbd)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->pending_set, NULL);
+}
+
+static struct ploop_request *
+ploop_pb_get_first_reqs_from_pending(struct ploop_pushbackup_desc *pbd,
+				     struct ploop_request **npreq)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->pending_set, npreq);
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_reported(struct ploop_pushbackup_desc *pbd)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->reported_set, NULL);
+}
+
+int ploop_pb_preq_add_pending(struct ploop_pushbackup_desc *pbd,
+			       struct ploop_request *preq)
+{
+	BUG_ON(!pbd);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -ESTALE;
+	}
+
+	if (!test_bit(PLOOP_S_PUSH_BACKUP, &pbd->plo->state)) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -EINTR;
+	}
+
+	if (check_bit_in_map(pbd->reported_map, pbd->ppb_block_max,
+			     preq->req_cluster)) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -EALREADY;
+	}
+
+	ploop_pb_add_req_to_pending(pbd, preq);
+
+	if (pbd->ppb_waiting)
+		complete(&pbd->ppb_comp);
+
+	spin_unlock_irq(&pbd->ppb_lock);
+	return 0;
+}
+
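+/*
+ * Returns true if the cluster must be detained for push backup (its ppb_map
+ * bit is set, the backup is alive and the cluster is not yet reported).
+ * Otherwise the stale ppb_map bit, if any, is cleared and false is returned.
+ */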
+bool ploop_pb_check_and_clear_bit(struct ploop_pushbackup_desc *pbd,
+				  cluster_t clu)
+{
+	if (!pbd)
+		return false;
+
+	if (!check_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu))
+		return false;
+
+	spin_lock(&pbd->ppb_lock);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE ||
+	    check_bit_in_map(pbd->reported_map, pbd->ppb_block_max, clu)) {
+		spin_unlock(&pbd->ppb_lock);
+		ploop_pb_clear_bit(pbd, clu);
+		return false;
+	}
+
+	spin_unlock(&pbd->ppb_lock);
+	return true;
+}
+
+static void return_bios_back_to_plo(struct ploop_device *plo,
+				    struct bio_list *bl)
+{
+	if (!bl->head)
+		return;
+
+	if (plo->bio_tail)
+		plo->bio_tail->bi_next = bl->head;
+	else
+		plo->bio_head = bl->head;
+
+	plo->bio_tail = bl->tail;
+
+	bio_list_init(bl);
+}
+
+/* Always serialized by plo->ctl_mutex */
+unsigned long ploop_pb_stop(struct ploop_pushbackup_desc *pbd, bool do_merge)
+{
+	unsigned long ret = 0;
+	int merge_status = 0;
+	LIST_HEAD(drop_list);
+
+	if (pbd == NULL)
+		return 0;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	if (pbd->ppb_state == PLOOP_PB_DEAD) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return 0;
+	}
+	pbd->ppb_state = PLOOP_PB_DEAD;
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	ploop_pbs_fini(&pbd->pending_set);
+	ploop_pbs_fini(&pbd->reported_set);
+
+	merge_status = ploop_pb_cbt_map_release(pbd, do_merge);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	while (!RB_EMPTY_ROOT(&pbd->pending_set.tree)) {
+		struct ploop_request *preq =
+			ploop_pb_get_first_req_from_pending(pbd);
+		list_add(&preq->list, &drop_list);
+		ret++;
+	}
+
+	while (!RB_EMPTY_ROOT(&pbd->reported_set.tree)) {
+		struct ploop_request *preq =
+			ploop_pb_get_first_req_from_reported(pbd);
+		list_add(&preq->list, &drop_list);
+		ret++;
+	}
+
+	if (pbd->ppb_waiting)
+		complete(&pbd->ppb_comp);
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	if (!list_empty(&drop_list) || !ploop_pb_bio_list_empty(pbd)) {
+		struct ploop_device *plo = pbd->plo;
+
+		BUG_ON(!plo);
+		spin_lock_irq(&plo->lock);
+		list_splice_init(&drop_list, plo->ready_queue.prev);
+		return_bios_back_to_plo(plo, &pbd->bio_pending_list);
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+		spin_unlock_irq(&plo->lock);
+	}
+
+	return merge_status ? : ret;
+}
+
+int ploop_pb_get_pending(struct ploop_pushbackup_desc *pbd,
+			 cluster_t *clu_p, cluster_t *len_p, unsigned n_done)
+{
+	bool blocking  = !n_done;
+	struct ploop_request *preq, *npreq;
+	int err = 0;
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	preq = ploop_pb_get_first_reqs_from_pending(pbd, &npreq);
+	if (!preq) {
+		struct ploop_device *plo = pbd->plo;
+
+		if (!blocking) {
+			err = -ENOENT;
+			goto get_pending_unlock;
+		}
+
+                /* blocking case */
+		if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+			err = -ESTALE;
+			goto get_pending_unlock;
+		}
+		if (unlikely(pbd->ppb_waiting)) {
+			/* Other task is already waiting for event */
+			err = -EBUSY;
+			goto get_pending_unlock;
+		}
+wait_again:
+		pbd->ppb_waiting = true;
+		spin_unlock_irq(&pbd->ppb_lock);
+
+		mutex_unlock(&plo->ctl_mutex);
+		err = wait_for_completion_interruptible(&pbd->ppb_comp);
+		mutex_lock(&plo->ctl_mutex);
+
+		if (plo->pbd != pbd)
+			return -EINTR;
+
+		spin_lock_irq(&pbd->ppb_lock);
+		pbd->ppb_waiting = false;
+		init_completion(&pbd->ppb_comp);
+
+		preq = ploop_pb_get_first_reqs_from_pending(pbd, &npreq);
+		if (!preq) {
+			if (!test_bit(PLOOP_S_PUSH_BACKUP, &plo->state))
+				err = -EINTR;
+			else if (pbd->ppb_state != PLOOP_PB_ALIVE)
+				err =  -ESTALE;
+			else if (signal_pending(current))
+				err = -ERESTARTSYS;
+			else
+				goto wait_again;
+
+			goto get_pending_unlock;
+		}
+	}
+
+	ploop_pb_add_req_to_reported(pbd, preq);
+
+	*clu_p = preq->req_cluster;
+	*len_p = 1;
+
+	while (npreq && npreq->req_cluster == *clu_p + *len_p) {
+		struct rb_node *next = rb_next(&npreq->reloc_link);
+
+		preq = npreq;
+		if (next)
+			npreq = rb_entry(next, struct ploop_request,
+					 reloc_link);
+		else
+			npreq = NULL;
+
+		remove_req_from_pbs(&pbd->pending_set, preq);
+		ploop_pb_add_req_to_reported(pbd, preq);
+
+		(*len_p)++;
+	}
+
+get_pending_unlock:
+	spin_unlock_irq(&pbd->ppb_lock);
+	return err;
+}
+
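+/*
+ * Compute one page of "needs backup and not yet reported" bits:
+ * ppb_map[idx] & ~reported_map[idx].
+ */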
+static void fill_page_to_backup(struct ploop_pushbackup_desc *pbd,
+				unsigned long idx, struct page *page)
+{
+	u32 *dst = page_address(page);
+	u32 *fin = page_address(page) + PAGE_SIZE;
+	u32 *map = page_address(pbd->ppb_map[idx]);
+	u32 *rep = page_address(pbd->reported_map[idx]);
+
+	while (dst < fin) {
+		*dst = *map & ~*rep;
+		dst++;
+		map++;
+		rep++;
+	}
+}
+
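+/*
+ * Find the next contiguous extent of clusters that still need backup
+ * (set in ppb_map, clear in reported_map), starting right after the
+ * previously returned extent (*clu_p + *len_p).
+ */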
+int ploop_pb_peek(struct ploop_pushbackup_desc *pbd,
+		  cluster_t *clu_p, cluster_t *len_p, unsigned n_done)
+{
+	unsigned long block = *clu_p + *len_p;
+	unsigned long idx = block >> (PAGE_SHIFT + 3);
+	unsigned long clu = 0;
+	unsigned long len = 0;
+	unsigned long off, off2;
+	struct page *page;
+	bool found = false;
+
+	if (block >= pbd->ppb_block_max)
+		return -ENOENT;
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE)
+		return -ESTALE;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	while (block < pbd->ppb_block_max) {
+		fill_page_to_backup(pbd, idx, page);
+		off = block & (BITS_PER_PAGE -1);
+
+		if (!found) {
+			clu = find_next_bit(page_address(page),
+					       BITS_PER_PAGE, off);
+			if (clu == BITS_PER_PAGE)
+				goto next;
+
+			off = clu;
+			clu += idx << (PAGE_SHIFT + 3);
+			found = true;
+		}
+
+		if (found) {
+			off2 = find_next_zero_bit(page_address(page),
+						  BITS_PER_PAGE, off);
+			len += off2 - off;
+			if (off2 != BITS_PER_PAGE)
+				break;
+		}
+
+	next:
+		idx++;
+		block = idx << (PAGE_SHIFT + 3);
+	}
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	__free_page(page);
+
+	if (!found)
+		return -ENOENT;
+
+	*clu_p = clu;
+	*len_p = len;
+	return 0;
+}
+
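+/*
+ * Move every preq whose req_cluster falls into [clu, clu + len) from the
+ * given set to ready_list, marking it with PLOOP_REQ_PUSH_BACKUP.
+ */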
+static void ploop_pb_process_extent(struct pb_set *pbs, cluster_t clu,
+				    cluster_t len, struct list_head *ready_list,
+				    int *n_found)
+{
+	struct ploop_request *preq, *npreq;
+
+	preq = ploop_pb_get_req_from_tree(pbs, clu, len, &npreq);
+
+	while (preq) {
+		struct rb_node *n;
+
+		set_bit(PLOOP_REQ_PUSH_BACKUP, &preq->ppb_state);
+		list_add(&preq->list, ready_list);
+
+		if (n_found)
+			(*n_found)++;
+
+		if (!preq_match(npreq, clu, len))
+			break;
+
+		preq = npreq;
+		n = rb_next(&preq->reloc_link);
+		if (n)
+			npreq = rb_entry(n, struct ploop_request, reloc_link);
+		else
+			npreq = NULL;
+		remove_req_from_pbs(pbs, preq);
+	}
+}
+
+void ploop_pb_put_reported(struct ploop_pushbackup_desc *pbd,
+			   cluster_t clu, cluster_t len)
+{
+	int n_found = 0;
+	LIST_HEAD(ready_list);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	ploop_pb_process_extent(&pbd->reported_set, clu, len, &ready_list, &n_found);
+	ploop_pb_process_extent(&pbd->pending_set, clu, len, &ready_list, NULL);
+
+	/*
+	 * If no preq was found above, this is an unsolicited report. In
+	 * that case it is enough to have the corresponding bit set in
+	 * reported_map: if any WRITE request arrives afterwards,
+	 * ploop_pb_preq_add_pending() fails and the ploop thread clears
+	 * the corresponding bit in ppb_map -- see "push_backup special
+	 * processing" in ploop_entry_request() for details.
+	 */
+	set_bits_in_map(pbd->reported_map, pbd->ppb_block_max, clu, len);
+
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	if (!list_empty(&ready_list)) {
+		struct ploop_device *plo = pbd->plo;
+
+		spin_lock_irq(&plo->lock);
+		list_splice(&ready_list, plo->ready_queue.prev);
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+		spin_unlock_irq(&plo->lock);
+	}
+}
+
+int ploop_pb_destroy(struct ploop_device *plo, __u32 *status)
+{
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+	unsigned long ret;
+	bool do_merge;
+
+	if (!test_and_clear_bit(PLOOP_S_PUSH_BACKUP, &plo->state))
+		return -EINVAL;
+
+	BUG_ON (!pbd);
+	do_merge = status ? *status : true;
+	ret = ploop_pb_stop(pbd, do_merge);
+
+	if (status)
+		*status = ret;
+
+	ploop_quiesce(plo);
+	ploop_pb_fini(plo->pbd);
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	ploop_relax(plo);
+
+	return 0;
+}
+
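+/*
+ * Returns true (and logs a warning) if push backup is alive and the oldest
+ * preq in the set has been waiting longer than push_backup_timeout;
+ * otherwise re-arms the timer for the oldest entry, if any.
+ */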
+static bool ploop_pb_set_expired(struct pb_set *pbs)
+{
+	struct ploop_pushbackup_desc *pbd = pbs->pbd;
+	struct ploop_device          *plo = pbd->plo;
+	unsigned long timeout = plo->tune.push_backup_timeout * HZ;
+	unsigned long tstamp = 0;
+	cluster_t clu = 0;
+	bool ret = false;
+	unsigned long flags;
+
+	if (!timeout)
+		return false;
+
+	spin_lock_irqsave(&pbd->ppb_lock, flags);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+		spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+		return false;
+	}
+
+	/* No need to scan the whole list: the first preq is the oldest! */
+	if (!list_empty(&pbs->list)) {
+		struct ploop_request *preq = list_first_entry(&pbs->list,
+							      struct ploop_request, list);
+		if (time_before(preq->tstamp + timeout, jiffies)) {
+			tstamp = preq->tstamp;
+			clu = preq->req_cluster;
+			ret = true;
+		} else
+			mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+	}
+
+	spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+
+	if (ret)
+		printk(KERN_WARNING "Abort push_backup for ploop%d: found "
+		       "preq (clu=%u) in %s tree delayed for %u msecs\n",
+		       plo->index, clu, pbs->name,
+		       jiffies_to_msecs(jiffies - tstamp));
+
+	return ret;
+}
+
+static void ploop_pb_timeout_func(unsigned long data)
+{
+	struct pb_set                *pbs = (void*)data;
+	struct ploop_pushbackup_desc *pbd = pbs->pbd;
+	struct ploop_device          *plo = pbd->plo;
+	unsigned long flags;
+
+	if (!plo->tune.push_backup_timeout ||
+	    !test_bit(PLOOP_S_RUNNING, &plo->state) ||
+	    !test_bit(PLOOP_S_PUSH_BACKUP, &plo->state) ||
+	    !ploop_pb_set_expired(pbs))
+		return;
+
+	spin_lock_irqsave(&pbd->ppb_lock, flags);
+	if (pbd->ppb_state == PLOOP_PB_ALIVE) {
+		pbd->ppb_state = PLOOP_PB_STOPPING;
+		if (waitqueue_active(&pbd->ppb_waitq))
+			wake_up_interruptible(&pbd->ppb_waitq);
+	}
+	spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+}
+
+/* Return true if bio was detained, false otherwise */
+bool ploop_pb_bio_detained(struct ploop_pushbackup_desc *pbd, struct bio *bio)
+{
+	cluster_t   clu = bio->bi_sector >> pbd->plo->cluster_log;
+
+	if (ploop_pb_check_and_clear_bit(pbd, clu)) {
+		bio_list_add(&pbd->bio_pending_list, bio);
+		return true;
+	}
+
+	return false;
+}
+
+/* Return true if no detained bios are present, false otherwise */
+bool ploop_pb_bio_list_empty(struct ploop_pushbackup_desc *pbd)
+{
+	return !pbd || bio_list_empty(&pbd->bio_pending_list);
+}
+
+struct bio *ploop_pb_bio_get(struct ploop_pushbackup_desc *pbd)
+{
+	return bio_list_pop(&pbd->bio_pending_list);
+}
+
+void ploop_pb_bio_list_merge(struct ploop_pushbackup_desc *pbd,
+			     struct bio_list *tmp)
+{
+	bio_list_merge(&pbd->bio_pending_list, tmp);
+}
--- /dev/null
+++ b/drivers/block/ploop/push_backup.h
@@ -0,0 +1,37 @@
+/*
+ *  drivers/block/ploop/push_backup.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+struct ploop_pushbackup_desc;
+
+struct ploop_pushbackup_desc *ploop_pb_alloc(struct ploop_device *plo);
+int ploop_pb_init(struct ploop_pushbackup_desc *pbd, __u8 *uuid, bool full);
+void ploop_pb_fini(struct ploop_pushbackup_desc *pbd);
+int ploop_pb_copy_cbt_to_user(struct ploop_pushbackup_desc *pbd, char *user_addr);
+unsigned long ploop_pb_stop(struct ploop_pushbackup_desc *pbd, bool do_merge);
+int ploop_pb_check_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid);
+int ploop_pb_get_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid);
+
+int ploop_pb_get_pending(struct ploop_pushbackup_desc *pbd,
+			 cluster_t *clu_p, cluster_t *len_p, unsigned n_done);
+int ploop_pb_peek(struct ploop_pushbackup_desc *pbd,
+		  cluster_t *clu_p, cluster_t *len_p, unsigned n_done);
+void ploop_pb_put_reported(struct ploop_pushbackup_desc *pbd,
+			   cluster_t clu, cluster_t len);
+
+void ploop_pb_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+bool ploop_pb_check_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+bool ploop_pb_check_and_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+
+int ploop_pb_preq_add_pending(struct ploop_pushbackup_desc *pbd,
+			       struct ploop_request *preq);
+
+int ploop_pb_destroy(struct ploop_device *plo, __u32 *status);
+
+bool ploop_pb_bio_detained(struct ploop_pushbackup_desc *pbd, struct bio *bio);
+bool ploop_pb_bio_list_empty(struct ploop_pushbackup_desc *pbd);
+struct bio *ploop_pb_bio_get(struct ploop_pushbackup_desc *pbd);
+void ploop_pb_bio_list_merge(struct ploop_pushbackup_desc *pbd, struct bio_list *tmp);
--- /dev/null
+++ b/drivers/block/ploop/sysfs.c
@@ -0,0 +1,713 @@
+/*
+ *  drivers/block/ploop/sysfs.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+#include "push_backup.h"
+
+struct delta_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct ploop_delta *, char *);
+	ssize_t (*store)(struct ploop_delta *, const char *, size_t);
+};
+
+static ssize_t
+delta_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(delta, page);
+}
+
+static ssize_t
+delta_attr_store(struct kobject *kobj, struct attribute *attr,
+		 const char *page, size_t length)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->store)
+		return -EIO;
+
+	return entry->store(delta, page, length);
+}
+
+
+static struct sysfs_ops delta_sysfs_ops = {
+	.show	= delta_attr_show,
+	.store	= delta_attr_store,
+};
+
+static void release_delta(struct kobject *kobj)
+{
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (delta->ops)
+		ploop_format_put(delta->ops);
+	module_put(THIS_MODULE);
+	kfree(delta);
+}
+
+static ssize_t
+delta_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+delta_string_show(char * str, char *page)
+{
+	return sprintf(page, "%s\n", str);
+}
+
+static ssize_t delta_level_show(struct ploop_delta *delta, char *page)
+{
+	return delta_var_show(delta->level, page);
+}
+
+static ssize_t delta_image_show(struct ploop_delta *delta, char *page)
+{
+	char * res;
+	int len = -ENOENT;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (delta->io.files.file) {
+		res = d_path(&delta->io.files.file->f_path, page, PAGE_SIZE-1);
+		len = PTR_ERR(res);
+		if (!IS_ERR(res)) {
+			len = strlen(res);
+			if (res != page)
+				memmove(page, res, len);
+			page[len] = '\n';
+			len++;
+		}
+	}
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return len;
+}
+
+static ssize_t delta_image_info_show(struct ploop_delta *delta, char *page)
+{
+	int len = -ENOENT;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (delta->io.files.file) {
+		struct inode *inode = file_inode(delta->io.files.file);
+		len = snprintf(page, PAGE_SIZE, "ino:%lu\nsdev:%u:%u\n",
+				inode->i_ino,
+				MAJOR(inode->i_sb->s_dev),
+				MINOR(inode->i_sb->s_dev));
+	}
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return len;
+}
+
+static ssize_t delta_format_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->ops->name, page);
+}
+
+static ssize_t delta_io_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->io.ops->name, page);
+}
+
+static ssize_t delta_ro_show(struct ploop_delta *delta, char *page)
+{
+	return sprintf(page, "%d\n", !!(delta->flags & PLOOP_FMT_RDONLY));
+}
+
+static ssize_t delta_trans_show(struct ploop_delta *delta, char *page)
+{
+	struct ploop_device * plo = delta->plo;
+	int trans = 0;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (plo->trans_map && map_top_delta(plo->trans_map) == delta)
+		trans = 1;
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return sprintf(page, "%d\n", trans);
+}
+
+static ssize_t delta_dump(struct ploop_delta *delta, char *page)
+{
+	int ret = delta->io.ops->dump ? delta->io.ops->dump(&delta->io) : -1;
+	return sprintf(page, "%d\n", ret);
+}
+
+static struct delta_sysfs_entry delta_level_entry = {
+	.attr = {.name = "level", .mode = S_IRUGO },
+	.show = delta_level_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_image_entry = {
+	.attr = {.name = "image", .mode = S_IRUGO },
+	.show = delta_image_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_image_info_entry = {
+	.attr = {.name = "image_info", .mode = S_IRUGO },
+	.show = delta_image_info_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_format_entry = {
+	.attr = {.name = "format", .mode = S_IRUGO },
+	.show = delta_format_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_io_entry = {
+	.attr = {.name = "io", .mode = S_IRUGO },
+	.show = delta_io_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_ro_entry = {
+	.attr = {.name = "ro", .mode = S_IRUGO },
+	.show = delta_ro_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_trans_entry = {
+	.attr = {.name = "transparent", .mode = S_IRUGO },
+	.show = delta_trans_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_dump_entry = {
+	.attr = {.name = "dump", .mode = S_IRUGO },
+	.show = delta_dump,
+};
+
+static struct attribute *default_attrs[] = {
+	&delta_level_entry.attr,
+	&delta_image_entry.attr,
+	&delta_image_info_entry.attr,
+	&delta_format_entry.attr,
+	&delta_io_entry.attr,
+	&delta_ro_entry.attr,
+	&delta_trans_entry.attr,
+	&delta_dump_entry.attr,
+	NULL,
+};
+
+struct kobj_type ploop_delta_ktype = {
+	.sysfs_ops	= &delta_sysfs_ops,
+	.default_attrs	= default_attrs,
+	.release	= release_delta,
+};
+
+
+static struct {
+#define __DO(_at)	struct attribute _at;
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+} _attr_arr = {
+#define __DO(_at)	._at = { .name = __stringify(_at), .mode = S_IRUGO|S_IWUSR, },
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+};
+
+static struct attribute *stats_attributes[] = {
+#define __DO(_at) &_attr_arr._at,
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+	NULL
+};
+
+static const struct attribute_group stats_group = {
+	.attrs = stats_attributes,
+};
+
+
+
+#define to_disk(obj) dev_to_disk(container_of(obj,struct device,kobj))
+
+static ssize_t pstat_show(struct kobject *kobj, struct attribute *attr,
+			  char *page)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	int n;
+
+	n = attr - (struct attribute *)&_attr_arr;
+
+	return sprintf(page, "%u\n", ((u32*)&plo->st)[n]);
+}
+
+static ssize_t pstat_store(struct kobject * kobj, struct attribute * attr,
+			   const char *page, size_t count)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int n;
+
+	var = simple_strtoul(p, &p, 10);
+
+	n = attr - (struct attribute *)&_attr_arr;
+	((u32*)&plo->st)[n] = var;
+	return count;
+}
+
+static u32 show_block_size(struct ploop_device * plo)
+{
+	return 1 << plo->cluster_log;
+}
+
+static u32 show_fmt_version(struct ploop_device * plo)
+{
+	return plo->fmt_version;
+}
+
+static u32 show_total_bios(struct ploop_device * plo)
+{
+	return plo->bio_total;
+}
+
+static u32 show_queued_bios(struct ploop_device * plo)
+{
+	return plo->bio_qlen;
+}
+
+static u32 show_discard_bios(struct ploop_device * plo)
+{
+	return plo->bio_discard_qlen;
+}
+
+static u32 show_active_reqs(struct ploop_device * plo)
+{
+	return plo->active_reqs;
+}
+
+static u32 show_entry_read_sync_reqs(struct ploop_device * plo)
+{
+	return plo->read_sync_reqs;
+}
+
+static u32 show_entry_reqs(struct ploop_device * plo)
+{
+	return plo->entry_qlen;
+}
+
+static u32 show_barrier_reqs(struct ploop_device * plo)
+{
+	return plo->barrier_reqs;
+}
+
+static u32 show_fsync_reqs(struct ploop_device * plo)
+{
+	u32 qlen = 0;
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		qlen = ploop_top_delta(plo)->io.fsync_qlen;
+	mutex_unlock(&plo->sysfs_mutex);
+	return qlen;
+}
+
+static u32 show_fastpath_reqs(struct ploop_device * plo)
+{
+	return plo->fastpath_reqs;
+}
+
+static u32 show_map_pages(struct ploop_device * plo)
+{
+	return plo->map.pages;
+}
+
+static u32 show_running(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_RUNNING, &plo->state);
+}
+
+static u32 show_locked(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_LOCKED, &plo->locking_state);
+}
+
+static u32 show_aborted(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_ABORT, &plo->state);
+}
+
+static int store_aborted(struct ploop_device * plo, u32 val)
+{
+	printk(KERN_INFO "ploop: Force %s aborted state for ploop%d\n",
+	       val ? "set" : "clear", plo->index);
+
+	if (val)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	else
+		clear_bit(PLOOP_S_ABORT, &plo->state);
+	return 0;
+}
+
+static u32 show_top(struct ploop_device * plo)
+{
+	int top = -1;
+
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		top = ploop_top_delta(plo)->level;
+	if (plo->trans_map)
+		top++;
+	mutex_unlock(&plo->sysfs_mutex);
+	return (u32)top;
+}
+
+static inline u32 get_event_locked(struct ploop_device * plo)
+{
+	if (test_and_clear_bit(PLOOP_S_ENOSPC_EVENT, &plo->state))
+		return PLOOP_EVENT_ENOSPC;
+	else if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return PLOOP_EVENT_ABORTED;
+	else if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return PLOOP_EVENT_STOPPED;
+
+	return 0;
+}
+
+static u32 show_event(struct ploop_device * plo)
+{
+	u32 ret;
+
+	DEFINE_WAIT(_wait);
+	spin_lock_irq(&plo->lock);
+
+	ret = get_event_locked(plo);
+	if (ret) {
+		spin_unlock_irq(&plo->lock);
+		return ret;
+	}
+
+	prepare_to_wait(&plo->event_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+	finish_wait(&plo->event_waitq, &_wait);
+
+	ret = get_event_locked(plo);
+
+	spin_unlock_irq(&plo->lock);
+	return ret;
+}
+
+static u32 show_open_count(struct ploop_device * plo)
+{
+	return atomic_read(&plo->open_count);
+}
+
+static ssize_t print_cookie(struct ploop_device * plo, char * page)
+{
+	return sprintf(page, "%s\n", plo->cookie);
+}
+
+static ssize_t print_push_backup_uuid(struct ploop_device * plo, char * page)
+{
+	__u8 uuid[16];
+	int err;
+
+	mutex_lock(&plo->sysfs_mutex);
+	err = ploop_pb_get_uuid(plo->pbd, uuid);
+	mutex_unlock(&plo->sysfs_mutex);
+
+	page[0] = '\0';
+	if (err)
+		return 0;
+
+	return snprintf(page, PAGE_SIZE, "%pUB\n", uuid);
+}
+
+static u32 show_free_reqs(struct ploop_device * plo)
+{
+	return plo->free_qlen;
+}
+
+static u32 show_free_qmax(struct ploop_device * plo)
+{
+	return plo->free_qmax;
+}
+
+static u32 show_blockable_reqs(struct ploop_device * plo)
+{
+	return plo->blockable_reqs;
+}
+
+static u32 show_blocked_bios(struct ploop_device * plo)
+{
+	return plo->blocked_bios;
+}
+
+static u32 show_freeze_state(struct ploop_device * plo)
+{
+	return plo->freeze_state;
+}
+
+#define _TUNE_U32(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return plo->tune._name;				\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = val;				\
+	return 0;					\
+}
+
+#define _TUNE_JIFFIES(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return (plo->tune._name * 1000) / HZ;		\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = (val * HZ) / 1000;		\
+	return 0;					\
+}
+
+#define _TUNE_BOOL	_TUNE_U32
+
+_TUNE_U32(max_requests);
+_TUNE_U32(batch_entry_qlen);
+_TUNE_JIFFIES(batch_entry_delay);
+_TUNE_U32(fsync_max);
+_TUNE_JIFFIES(fsync_delay);
+_TUNE_BOOL(pass_flushes);
+_TUNE_BOOL(pass_fuas);
+_TUNE_BOOL(congestion_detection);
+_TUNE_BOOL(check_zeros);
+_TUNE_U32(min_map_pages);
+_TUNE_JIFFIES(max_map_inactivity);
+_TUNE_BOOL(disable_root_threshold);
+_TUNE_BOOL(disable_user_threshold);
+_TUNE_U32(congestion_high_watermark);
+_TUNE_U32(congestion_low_watermark);
+_TUNE_U32(max_active_requests);
+_TUNE_U32(push_backup_timeout);
+
+
+struct pattr_sysfs_entry {
+	struct attribute attr;
+	u32 (*show)(struct ploop_device *);
+	int (*store)(struct ploop_device *, __u32 val);
+	ssize_t (*print)(struct ploop_device *, char *page);
+};
+
+#define _A(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .show = show_##_name, }).attr
+
+#define _A2(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO|S_IWUSR }, .show = show_##_name, .store = store_##_name, }).attr
+
+#define _A3(_name)							\
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .print = print_##_name, }).attr
+
+static struct attribute *state_attributes[] = {
+	_A(block_size),
+	_A(fmt_version),
+	_A(total_bios),
+	_A(queued_bios),
+	_A(discard_bios),
+	_A(active_reqs),
+	_A(entry_reqs),
+	_A(entry_read_sync_reqs),
+	_A(barrier_reqs),
+	_A(fastpath_reqs),
+	_A(fsync_reqs),
+	_A(map_pages),
+	_A(running),
+	_A(locked),
+	_A2(aborted),
+	_A(top),
+	_A(event),
+	_A3(cookie),
+	_A3(push_backup_uuid),
+	_A(open_count),
+	_A(free_reqs),
+	_A(free_qmax),
+	_A(blockable_reqs),
+	_A(blocked_bios),
+	_A(freeze_state),
+	NULL
+};
+
+static struct attribute *tune_attributes[] = {
+	_A2(max_requests),
+	_A2(batch_entry_qlen),
+	_A2(batch_entry_delay),
+	_A2(fsync_max),
+	_A2(fsync_delay),
+	_A2(min_map_pages),
+	_A2(max_map_inactivity),
+	_A2(pass_flushes),
+	_A2(pass_fuas),
+	_A2(congestion_detection),
+	_A2(check_zeros),
+	_A2(disable_root_threshold),
+	_A2(disable_user_threshold),
+	_A2(congestion_high_watermark),
+	_A2(congestion_low_watermark),
+	_A2(max_active_requests),
+	_A2(push_backup_timeout),
+	NULL
+};
+
+static const struct attribute_group state_group = {
+	.attrs = state_attributes,
+};
+
+static const struct attribute_group tune_group = {
+	.attrs = tune_attributes,
+};
+
+static ssize_t
+pattr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	u32 val;
+
+	if (entry->print)
+		return entry->print(plo, page);
+
+	if (!entry->show)
+		return -EIO;
+	val = entry->show(plo);
+	return sprintf(page, "%u\n", val);
+}
+
+static ssize_t
+pattr_store(struct kobject *kobj, struct attribute *attr,
+	    const char *page, size_t length)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int err;
+
+	if (!entry->store)
+		return -EIO;
+
+	var = simple_strtoul(p, &p, 10);
+
+	err = entry->store(plo, var);
+	return err ? : length;
+}
+
+static struct sysfs_ops pattr_sysfs_ops = {
+	.show	= &pattr_show,
+	.store	= &pattr_store,
+};
+
+static struct sysfs_ops pstat_sysfs_ops = {
+	.show	= &pstat_show,
+	.store	= &pstat_store,
+};
+
+static void pattr_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type pattr_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pattr_sysfs_ops,
+};
+
+static struct kobj_type pstat_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pstat_sysfs_ops,
+};
+
+struct kobject *kobject_add_attr(struct gendisk *gd, const char *name,
+				 struct kobj_type * type)
+{
+	struct kobject *k;
+	int err;
+	struct kobject * parent = &disk_to_dev(gd)->kobj;
+
+	k = kzalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return NULL;
+
+	kobject_init(k, type);
+
+	err = kobject_add(k, parent, "%s", name);
+	if (err) {
+		kobject_put(k);
+		return NULL;
+	}
+	return k;
+}
+
+void ploop_sysfs_init(struct ploop_device * plo)
+{
+	plo->pstat_dir = kobject_add_attr(plo->disk, "pstat", &pstat_ktype);
+	if (plo->pstat_dir) {
+		if (sysfs_create_group(plo->pstat_dir, &stats_group))
+			printk("ploop: failed to create pstat group\n");
+	}
+	plo->pstate_dir = kobject_add_attr(plo->disk, "pstate", &pattr_ktype);
+	if (plo->pstate_dir) {
+		if (sysfs_create_group(plo->pstate_dir, &state_group))
+			printk("ploop: failed to create pstate group\n");
+	}
+	plo->ptune_dir = kobject_add_attr(plo->disk, "ptune", &pattr_ktype);
+	if (plo->ptune_dir) {
+		if (sysfs_create_group(plo->ptune_dir, &tune_group))
+			printk("ploop: failed to create ptune group\n");
+	}
+
+	if (kobject_add(&plo->kobj, kobject_get(&disk_to_dev(plo->disk)->kobj), "%s", "pdelta"))
+		printk("ploop: failed to create pdelta dir\n");
+}
+
+void ploop_sysfs_uninit(struct ploop_device * plo)
+{
+	if (plo->pstat_dir) {
+		sysfs_remove_group(plo->pstat_dir, &stats_group);
+		kobject_del(plo->pstat_dir);
+		kobject_put(plo->pstat_dir);
+		plo->pstat_dir = NULL;
+	}
+	if (plo->pstate_dir) {
+		sysfs_remove_group(plo->pstate_dir, &state_group);
+		kobject_del(plo->pstate_dir);
+		kobject_put(plo->pstate_dir);
+		plo->pstate_dir = NULL;
+	}
+	if (plo->ptune_dir) {
+		sysfs_remove_group(plo->ptune_dir, &tune_group);
+		kobject_del(plo->ptune_dir);
+		kobject_put(plo->ptune_dir);
+		plo->ptune_dir = NULL;
+	}
+	kobject_del(&plo->kobj);
+
+	kobject_put(&disk_to_dev(plo->disk)->kobj);
+}
--- /dev/null
+++ b/drivers/block/ploop/tracker.c
@@ -0,0 +1,293 @@
+/*
+ *  drivers/block/ploop/tracker.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/* Tracker engine detects and records changed clusters.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+
+struct track_record
+{
+	struct rb_node	rb_node;
+	u32		start;
+	u32		end;
+};
+
+static int tree_insert(struct rb_root *root, struct track_record *m)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct track_record * entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct track_record, rb_node);
+
+		if (m->start < entry->start)
+			p = &(*p)->rb_left;
+		else if (m->start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&m->rb_node, parent, p);
+	rb_insert_color(&m->rb_node, root);
+	return 0;
+}
+
+void ploop_tracker_notify(struct ploop_device * plo, sector_t sec)
+{
+	struct track_record * m;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return;
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return;
+
+	sec >>= plo->cluster_log;
+
+	m = kmalloc(sizeof(struct track_record), GFP_NOFS);
+	if (m == NULL) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		return;
+	}
+
+	m->start = sec;
+	m->end = sec + 1;
+
+	spin_lock(&plo->track_lock);
+	if (tree_insert(&plo->track_tree, m)) {
+		kfree(m);
+	} else {
+		struct rb_node * rb;
+		struct track_record * merge;
+
+		if (m->start != 0) {
+			rb = rb_prev(&m->rb_node);
+			if (rb) {
+				merge = rb_entry(rb, struct track_record, rb_node);
+				if (m->start == merge->end) {
+					m->start = merge->start;
+					rb_erase(&merge->rb_node, &plo->track_tree);
+					kfree(merge);
+				}
+			}
+		}
+
+		rb = rb_next(&m->rb_node);
+		if (rb) {
+			merge = rb_entry(rb, struct track_record, rb_node);
+			if (m->end == merge->start) {
+				m->end = merge->end;
+				rb_erase(&merge->rb_node, &plo->track_tree);
+				kfree(merge);
+			}
+		}
+	}
+	spin_unlock(&plo->track_lock);
+}
+EXPORT_SYMBOL(ploop_tracker_notify);
+
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg)
+{
+	struct ploop_track_extent e;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	ploop_quiesce(plo);
+
+	e.start = 0;
+	e.end = (u64)ploop_top_delta(plo)->io.alloc_head << (plo->cluster_log + 9);
+	if (copy_to_user((void*)arg, &e, sizeof(struct ploop_track_extent))) {
+		ploop_relax(plo);
+		return -EFAULT;
+	}
+
+	set_bit(PLOOP_S_TRACK, &plo->state);
+	plo->maintenance_type = PLOOP_MNTN_TRACK;
+	plo->track_end = 0;
+	plo->track_ptr = 0;
+	ploop_relax(plo);
+	return 0;
+}
+
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg)
+{
+	u64 pos;
+
+	if (copy_from_user(&pos, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	pos >>= 9;
+
+	if (pos < plo->track_end) {
+		/* _XXX_ It would be good to trim the tail of the track
+		 * tree and rewind tracking here. We will implement this
+		 * if it turns out to be really useful.
+		 */
+		if (pos)
+			return -EINVAL;
+
+		ploop_quiesce(plo);
+
+		clear_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		ploop_tracker_destroy(plo, 1);
+
+		plo->track_end = pos;
+		plo->track_ptr = 0;
+
+		ploop_relax(plo);
+	} else
+		plo->track_end = pos;
+
+	return 0;
+}
+
+static struct track_record * find_record(struct rb_root * root, u32 start)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node * prev = NULL;
+
+	while (n) {
+		struct track_record * m;
+
+		m = rb_entry(n, struct track_record, rb_node);
+		prev = n;
+
+		if (start < m->start)
+			n = n->rb_left;
+		else if (start >= m->end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+
+	while (prev && start >= rb_entry(prev, struct track_record, rb_node)->end)
+		prev = rb_next(prev);
+
+	if (!prev)
+		return NULL;
+
+	return rb_entry(prev, struct track_record, rb_node);
+}
+
+
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg)
+{
+	u64 ptr;
+	struct track_record * m;
+	struct ploop_delta * delta;
+	struct ploop_track_extent e;
+	int err;
+
+	if (copy_from_user(&ptr, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state)) {
+		ploop_tracker_destroy(plo, 1);
+		return -ECONNABORTED;
+	}
+
+	delta = ploop_top_delta(plo);
+
+	spin_lock(&plo->track_lock);
+	m = find_record(&plo->track_tree, plo->track_ptr);
+	if (m == NULL) {
+		if (plo->track_end >= ((sector_t)delta->io.alloc_head << plo->cluster_log) &&
+		    plo->track_ptr)
+			m = find_record(&plo->track_tree, 0);
+	}
+
+	if (m) {
+		rb_erase(&m->rb_node, &plo->track_tree);
+		plo->track_ptr = m->end;
+	} else {
+		plo->track_ptr = 0;
+	}
+	spin_unlock(&plo->track_lock);
+
+	err = -EAGAIN;
+	if (m) {
+		e.start = (u64)m->start << (plo->cluster_log + 9);
+		e.end = (u64)m->end << (plo->cluster_log + 9);
+		kfree(m);
+		err = 0;
+	} else if (plo->track_end < ((sector_t)delta->io.alloc_head << plo->cluster_log)) {
+		e.start = (u64)plo->track_end << 9;
+		e.end = (u64)delta->io.alloc_head << (plo->cluster_log + 9);
+		err = 0;
+	}
+
+	if (!err && copy_to_user((void *)arg, &e, sizeof(e))) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		err = -EFAULT;
+	}
+
+	return err;
+}
+
+int ploop_tracker_stop(struct ploop_device * plo, int force)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		force = 1;
+	err = ploop_tracker_destroy(plo, force);
+	if (!err) {
+		clear_bit(PLOOP_S_TRACK, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+	ploop_relax(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return -ECONNABORTED;
+	return err;
+}
+
+int ploop_tracker_destroy(struct ploop_device *plo, int force)
+{
+	struct rb_node * n;
+
+	if (RB_EMPTY_ROOT(&plo->track_tree))
+		return 0;
+
+	if (!force)
+		return -EBUSY;
+
+	spin_lock(&plo->track_lock);
+	while ((n = rb_first(&plo->track_tree)) != NULL) {
+		rb_erase(n, &plo->track_tree);
+		kfree(rb_entry(n, struct track_record, rb_node));
+	}
+	spin_unlock(&plo->track_lock);
+	return 0;
+}
+
+void track_init(struct ploop_device * plo)
+{
+	plo->track_tree = RB_ROOT;
+	spin_lock_init(&plo->track_lock);
+}
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1804,6 +1804,11 @@ static ssize_t random_write(struct file *file, const char __user *buffer,
 {
 	size_t ret;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return count;
+#endif
+
 	ret = write_pool(&input_pool, buffer, count);
 	if (ret)
 		return ret;
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -50,266 +50,240 @@ static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer)
 	return (struct cn_msg *)(buffer + 4);
 }
 
-static atomic_t proc_event_num_listeners = ATOMIC_INIT(0);
 static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
 
-/* proc_event_counts is used as the sequence number of the netlink message */
-static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
-
-static inline void get_seq(__u32 *ts, int *cpu)
+static inline void get_seq(struct ve_struct *ve, __u32 *ts, int *cpu)
 {
 	preempt_disable();
-	*ts = __this_cpu_inc_return(proc_event_counts) - 1;
+	*ts = __this_cpu_inc_return(*ve->cn->proc_event_counts) - 1;
 	*cpu = smp_processor_id();
 	preempt_enable();
 }
 
-void proc_fork_connector(struct task_struct *task)
+static struct cn_msg *cn_msg_fill(__u8 *buffer, struct ve_struct *ve,
+				  struct task_struct *task,
+				  int what, int cookie,
+				  bool (*fill_event)(struct proc_event *ev,
+						     struct ve_struct *ve,
+						     struct task_struct *task,
+						     int cookie))
 {
 	struct cn_msg *msg;
 	struct proc_event *ev;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
 	struct timespec ts;
-	struct task_struct *parent;
-
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
 
 	msg = buffer_to_cn_msg(buffer);
 	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_FORK;
-	rcu_read_lock();
-	parent = rcu_dereference(task->real_parent);
-	ev->event_data.fork.parent_pid = parent->pid;
-	ev->event_data.fork.parent_tgid = parent->tgid;
-	rcu_read_unlock();
-	ev->event_data.fork.child_pid = task->pid;
-	ev->event_data.fork.child_tgid = task->tgid;
 
+	get_seq(ve, &msg->seq, &ev->cpu);
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
 	msg->flags = 0; /* not used */
-	/*  If cn_netlink_send() failed, the data is not sent */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
+	ktime_get_ts(&ts); /* get high res monotonic timestamp */
+	ev->timestamp_ns = timespec_to_ns(&ts);
+	ev->what = what;
+
+	return fill_event(ev, ve, task, cookie) ? msg : NULL;
 }
 
-void proc_exec_connector(struct task_struct *task)
+static int proc_event_num_listeners(struct ve_struct *ve)
+{
+	if (ve->cn)
+		return atomic_read(&ve->cn->proc_event_num_listeners);
+	return 0;
+}
+
+static void proc_event_connector_ve(struct task_struct *task,
+				    struct ve_struct *ve,
+				    int what, int cookie,
+				    bool (*fill_event)(struct proc_event *ev,
+						       struct ve_struct *ve,
+						       struct task_struct *task,
+						       int cookie))
 {
 	struct cn_msg *msg;
-	struct proc_event *ev;
-	struct timespec ts;
 	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
+	if (proc_event_num_listeners(ve) < 1)
 		return;
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_EXEC;
-	ev->event_data.exec.process_pid = task->pid;
-	ev->event_data.exec.process_tgid = task->tgid;
+	msg = cn_msg_fill(buffer, ve, task, what, cookie, fill_event);
+	if (!msg)
+		return;
 
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	/*  If cn_netlink_send() failed, the data is not sent */
+	cn_netlink_send_ve(ve, msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
-void proc_id_connector(struct task_struct *task, int which_id)
+static void proc_event_connector(struct task_struct *task,
+				 int what, int cookie,
+				 bool (*fill_event)(struct proc_event *ev,
+						    struct ve_struct *ve,
+						    struct task_struct *task,
+						    int cookie))
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
-	struct timespec ts;
-	const struct cred *cred;
+	struct ve_struct *ve = task->task_ve;
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+	if (!ve_is_super(ve))
+		proc_event_connector_ve(task, ve, what, cookie, fill_event);
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	ev->what = which_id;
-	ev->event_data.id.process_pid = task->pid;
-	ev->event_data.id.process_tgid = task->tgid;
+	proc_event_connector_ve(task, get_ve0(), what, cookie, fill_event);
+}
+
+static bool fill_fork_event(struct proc_event *ev, struct ve_struct *ve,
+			    struct task_struct *task, int unused)
+{
+	struct task_struct *parent;
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
+
+	rcu_read_lock();
+	parent = rcu_dereference(task->real_parent);
+	ev->event_data.fork.parent_pid = task_pid_nr_ns(parent, pid_ns);
+	ev->event_data.fork.parent_tgid = task_tgid_nr_ns(parent, pid_ns);
+	rcu_read_unlock();
+	ev->event_data.fork.child_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.fork.child_tgid = task_tgid_nr_ns(task, pid_ns);
+	return true;
+}
+
+void proc_fork_connector(struct task_struct *task)
+{
+	proc_event_connector(task, PROC_EVENT_FORK, 0, fill_fork_event);
+}
+
+static bool fill_exec_event(struct proc_event *ev, struct ve_struct *ve,
+			    struct task_struct *task, int unused)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
+
+	ev->event_data.exec.process_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.exec.process_tgid = task_tgid_nr_ns(task, pid_ns);
+	return true;
+}
+
+void proc_exec_connector(struct task_struct *task)
+{
+	proc_event_connector(task, PROC_EVENT_EXEC, 0, fill_exec_event);
+}
+
+static bool fill_id_event(struct proc_event *ev, struct ve_struct *ve,
+			  struct task_struct *task, int which_id)
+{
+	const struct cred *cred;
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
+	struct user_namespace *user_ns = ve->init_cred->user_ns;
+
+	ev->event_data.id.process_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.id.process_tgid = task_tgid_nr_ns(task, pid_ns);
 	rcu_read_lock();
 	cred = __task_cred(task);
 	if (which_id == PROC_EVENT_UID) {
-		ev->event_data.id.r.ruid = from_kuid_munged(&init_user_ns, cred->uid);
-		ev->event_data.id.e.euid = from_kuid_munged(&init_user_ns, cred->euid);
+		ev->event_data.id.r.ruid = from_kuid_munged(user_ns, cred->uid);
+		ev->event_data.id.e.euid = from_kuid_munged(user_ns, cred->euid);
 	} else if (which_id == PROC_EVENT_GID) {
-		ev->event_data.id.r.rgid = from_kgid_munged(&init_user_ns, cred->gid);
-		ev->event_data.id.e.egid = from_kgid_munged(&init_user_ns, cred->egid);
+		ev->event_data.id.r.rgid = from_kgid_munged(user_ns, cred->gid);
+		ev->event_data.id.e.egid = from_kgid_munged(user_ns, cred->egid);
 	} else {
 		rcu_read_unlock();
-		return;
+		return false;
 	}
 	rcu_read_unlock();
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	return true;
 }
 
-void proc_sid_connector(struct task_struct *task)
+void proc_id_connector(struct task_struct *task, int which_id)
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	struct timespec ts;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
-
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+	proc_event_connector(task, which_id, which_id, fill_id_event);
+}
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_SID;
-	ev->event_data.sid.process_pid = task->pid;
-	ev->event_data.sid.process_tgid = task->tgid;
+static bool fill_sid_event(struct proc_event *ev, struct ve_struct *ve,
+			   struct task_struct *task, int unused)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
 
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	ev->event_data.sid.process_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.sid.process_tgid = task_tgid_nr_ns(task, pid_ns);
+	return true;
 }
 
-void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
+void proc_sid_connector(struct task_struct *task)
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	struct timespec ts;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
+	proc_event_connector(task, PROC_EVENT_SID, 0, fill_sid_event);
+}
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+static bool fill_ptrace_event(struct proc_event *ev, struct ve_struct *ve,
+			      struct task_struct *task, int ptrace_id)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_PTRACE;
-	ev->event_data.ptrace.process_pid  = task->pid;
-	ev->event_data.ptrace.process_tgid = task->tgid;
+	ev->event_data.ptrace.process_pid  = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.ptrace.process_tgid = task_tgid_nr_ns(task, pid_ns);
 	if (ptrace_id == PTRACE_ATTACH) {
-		ev->event_data.ptrace.tracer_pid  = current->pid;
-		ev->event_data.ptrace.tracer_tgid = current->tgid;
+		ev->event_data.ptrace.tracer_pid  = task_pid_nr_ns(current, pid_ns);
+		ev->event_data.ptrace.tracer_tgid = task_tgid_nr_ns(current, pid_ns);
 	} else if (ptrace_id == PTRACE_DETACH) {
 		ev->event_data.ptrace.tracer_pid  = 0;
 		ev->event_data.ptrace.tracer_tgid = 0;
 	} else
-		return;
-
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+		return false;
+	return true;
 }
 
-void proc_comm_connector(struct task_struct *task)
+void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	struct timespec ts;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
+	proc_event_connector(task, PROC_EVENT_PTRACE, ptrace_id,
+			     fill_ptrace_event);
+}
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+static bool fill_comm_event(struct proc_event *ev, struct ve_struct *ve,
+			    struct task_struct *task, int unused)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_COMM;
-	ev->event_data.comm.process_pid  = task->pid;
-	ev->event_data.comm.process_tgid = task->tgid;
+	ev->event_data.comm.process_pid  = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.comm.process_tgid = task_tgid_nr_ns(task, pid_ns);
 	get_task_comm(ev->event_data.comm.comm, task);
-
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	return true;
 }
 
-void proc_coredump_connector(struct task_struct *task)
+void proc_comm_connector(struct task_struct *task)
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
-	struct timespec ts;
-
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+	proc_event_connector(task, PROC_EVENT_COMM, 0, fill_comm_event);
+}
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_COREDUMP;
-	ev->event_data.coredump.process_pid = task->pid;
-	ev->event_data.coredump.process_tgid = task->tgid;
+static bool fill_coredump_event(struct proc_event *ev, struct ve_struct *ve,
+				struct task_struct *task, int unused)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
 
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	ev->event_data.coredump.process_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.coredump.process_tgid = task_tgid_nr_ns(task, pid_ns);
+	return true;
 }
 
-void proc_exit_connector(struct task_struct *task)
+void proc_coredump_connector(struct task_struct *task)
 {
-	struct cn_msg *msg;
-	struct proc_event *ev;
-	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
-	struct timespec ts;
+	proc_event_connector(task, PROC_EVENT_COREDUMP, 0, fill_coredump_event);
+}
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
-		return;
+static bool fill_exit_event(struct proc_event *ev, struct ve_struct *ve,
+			    struct task_struct *task, int unused)
+{
+	struct pid_namespace *pid_ns = ve->ve_ns->pid_ns;
 
-	msg = buffer_to_cn_msg(buffer);
-	ev = (struct proc_event *)msg->data;
-	memset(&ev->event_data, 0, sizeof(ev->event_data));
-	get_seq(&msg->seq, &ev->cpu);
-	ktime_get_ts(&ts); /* get high res monotonic timestamp */
-	ev->timestamp_ns = timespec_to_ns(&ts);
-	ev->what = PROC_EVENT_EXIT;
-	ev->event_data.exit.process_pid = task->pid;
-	ev->event_data.exit.process_tgid = task->tgid;
+	ev->event_data.exit.process_pid = task_pid_nr_ns(task, pid_ns);
+	ev->event_data.exit.process_tgid = task_tgid_nr_ns(task, pid_ns);
 	ev->event_data.exit.exit_code = task->exit_code;
 	ev->event_data.exit.exit_signal = task->exit_signal;
+	return true;
+}
 
-	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
-	msg->ack = 0; /* not used */
-	msg->len = sizeof(*ev);
-	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+void proc_exit_connector(struct task_struct *task)
+{
+	proc_event_connector(task, PROC_EVENT_EXIT, 0, fill_exit_event);
 }
 
 /*
@@ -320,14 +294,14 @@ void proc_exit_connector(struct task_struct *task)
  * values because it's not being returned via syscall return
  * mechanisms.
  */
-static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
+static void cn_proc_ack(struct ve_struct *ve, int err, int rcvd_seq, int rcvd_ack)
 {
 	struct cn_msg *msg;
 	struct proc_event *ev;
 	__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
 	struct timespec ts;
 
-	if (atomic_read(&proc_event_num_listeners) < 1)
+	if (proc_event_num_listeners(ve) < 1)
 		return;
 
 	msg = buffer_to_cn_msg(buffer);
@@ -343,7 +317,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
 	msg->ack = rcvd_ack + 1;
 	msg->len = sizeof(*ev);
 	msg->flags = 0; /* not used */
-	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+	cn_netlink_send_ve(ve, msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
 /**
@@ -354,6 +328,7 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
 			      struct netlink_skb_parms *nsp)
 {
 	enum proc_cn_mcast_op *mc_op = NULL;
+	struct ve_struct *ve = get_exec_env();
 	int err = 0;
 
 	if (msg->len != sizeof(*mc_op))
@@ -364,12 +339,12 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
 	 * and user namespaces so ignore requestors from
 	 * other namespaces.
 	 */
-	if ((current_user_ns() != &init_user_ns) ||
-	    (task_active_pid_ns(current) != &init_pid_ns))
+	if (!current_user_ns_initial() ||
+	    (task_active_pid_ns(current) != ve->ve_ns->pid_ns))
 		return;
 
 	/* Can only change if privileged. */
-	if (!__netlink_ns_capable(nsp, &init_user_ns, CAP_NET_ADMIN)) {
+	if (!__netlink_ns_capable(nsp, ve_init_user_ns(), CAP_NET_ADMIN)) {
 		err = EPERM;
 		goto out;
 	}
@@ -377,10 +352,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
 	mc_op = (enum proc_cn_mcast_op *)msg->data;
 	switch (*mc_op) {
 	case PROC_CN_MCAST_LISTEN:
-		atomic_inc(&proc_event_num_listeners);
+		atomic_inc(&ve->cn->proc_event_num_listeners);
 		break;
 	case PROC_CN_MCAST_IGNORE:
-		atomic_dec(&proc_event_num_listeners);
+		atomic_dec(&ve->cn->proc_event_num_listeners);
 		break;
 	default:
 		err = EINVAL;
@@ -388,24 +363,31 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
 	}
 
 out:
-	cn_proc_ack(err, msg->seq, msg->ack);
+	cn_proc_ack(ve, err, msg->seq, msg->ack);
 }
 
-/*
- * cn_proc_init - initialization entry point
- *
- * Adds the connector callback to the connector driver.
- */
-static int __init cn_proc_init(void)
+int cn_proc_init_ve(struct ve_struct *ve)
 {
-	int err = cn_add_callback(&cn_proc_event_id,
+	int err;
+
+	ve->cn->proc_event_counts = alloc_percpu(u32);
+	if (!ve->cn->proc_event_counts)
+		return -ENOMEM;
+
+	err = cn_add_callback_ve(ve, &cn_proc_event_id,
 				  "cn_proc",
 				  &cn_proc_mcast_ctl);
 	if (err) {
-		pr_warn("cn_proc failed to register\n");
+		pr_warn("VE#%d: cn_proc failed to register\n", ve->veid);
+		free_percpu(ve->cn->proc_event_counts);
 		return err;
 	}
+	atomic_set(&ve->cn->proc_event_num_listeners, 0);
 	return 0;
 }
 
-module_init(cn_proc_init);
+void cn_proc_fini_ve(struct ve_struct *ve)
+{
+	cn_del_callback_ve(ve, &cn_proc_event_id);
+	free_percpu(ve->cn->proc_event_counts);
+}
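
The per-VE proc connector keeps the standard proc-connector wire protocol; only the netlink socket that carries it becomes per-VE, and cn_proc_mcast_ctl() still requires CAP_NET_ADMIN (now checked against the VE's user namespace). A listener inside a container therefore subscribes exactly as it would on the host. Minimal sketch, assuming only the standard UAPI headers and omitting error handling:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/connector.h>
	#include <linux/cn_proc.h>

	int main(void)
	{
		char buf[NLMSG_SPACE(sizeof(struct cn_msg) +
				     sizeof(enum proc_cn_mcast_op))];
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
		struct cn_msg *msg = NLMSG_DATA(nlh);
		enum proc_cn_mcast_op *op = (enum proc_cn_mcast_op *)msg->data;
		struct sockaddr_nl sa = {
			.nl_family = AF_NETLINK,
			.nl_groups = CN_IDX_PROC,
			.nl_pid    = getpid(),
		};
		int fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);

		bind(fd, (struct sockaddr *)&sa, sizeof(sa));

		/* subscribe: one cn_msg carrying PROC_CN_MCAST_LISTEN */
		memset(buf, 0, sizeof(buf));
		nlh->nlmsg_len  = NLMSG_LENGTH(sizeof(*msg) + sizeof(*op));
		nlh->nlmsg_type = NLMSG_DONE;
		nlh->nlmsg_pid  = getpid();
		msg->id.idx = CN_IDX_PROC;
		msg->id.val = CN_VAL_PROC;
		msg->len    = sizeof(*op);
		*op = PROC_CN_MCAST_LISTEN;
		send(fd, buf, nlh->nlmsg_len, 0);

		/* ... recv() loop; each payload is a struct proc_event ... */
		return 0;
	}
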
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -38,10 +38,6 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
 MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector.");
 MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_CONNECTOR);
 
-static struct cn_dev cdev;
-
-static int cn_already_initialized;
-
 /*
  * msg->seq and msg->ack are used to determine message genealogy.
  * When someone sends message it puts there locally unique sequence
@@ -63,14 +59,20 @@ static int cn_already_initialized;
  * a new message.
  *
  */
-int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
+
+static struct cn_dev *get_cdev(struct ve_struct *ve)
+{
+	return &ve->cn->cdev;
+}
+
+int cn_netlink_send_ve(struct ve_struct *ve, struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
 {
 	struct cn_callback_entry *__cbq;
 	unsigned int size;
 	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
 	struct cn_msg *data;
-	struct cn_dev *dev = &cdev;
+	struct cn_dev *dev = get_cdev(ve);
 	u32 group = 0;
 	int found = 0;
 
@@ -115,6 +117,11 @@ int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
 
 	return netlink_broadcast(dev->nls, skb, 0, group, gfp_mask);
 }
+
+int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
+{
+	return cn_netlink_send_ve(get_ve0(), msg, __group, gfp_mask);
+}
 EXPORT_SYMBOL_GPL(cn_netlink_send);
 
 /*
@@ -123,7 +130,7 @@ EXPORT_SYMBOL_GPL(cn_netlink_send);
 static int cn_call_callback(struct sk_buff *skb)
 {
 	struct cn_callback_entry *i, *cbq = NULL;
-	struct cn_dev *dev = &cdev;
+	struct cn_dev *dev = get_cdev(skb->sk->sk_net->owner_ve);
 	struct cn_msg *msg = nlmsg_data(nlmsg_hdr(skb));
 	struct netlink_skb_parms *nsp = &NETLINK_CB(skb);
 	int err = -ENODEV;
@@ -154,31 +161,44 @@ static int cn_call_callback(struct sk_buff *skb)
  *
  * It checks skb, netlink header and msg sizes, and calls callback helper.
  */
-static void cn_rx_skb(struct sk_buff *__skb)
+static void cn_rx_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh;
-	struct sk_buff *skb;
 	int len, err;
 
-	skb = skb_get(__skb);
-
 	if (skb->len >= NLMSG_HDRLEN) {
 		nlh = nlmsg_hdr(skb);
 		len = nlmsg_len(nlh);
 
 		if (len < (int)sizeof(struct cn_msg) ||
 		    skb->len < nlh->nlmsg_len ||
-		    len > CONNECTOR_MAX_MSG_SIZE) {
-			kfree_skb(skb);
+		    len > CONNECTOR_MAX_MSG_SIZE)
 			return;
-		}
 
-		err = cn_call_callback(skb);
+		err = cn_call_callback(skb_get(skb));
 		if (err < 0)
 			kfree_skb(skb);
 	}
 }
 
+int cn_add_callback_ve(struct ve_struct *ve,
+		       struct cb_id *id, const char *name,
+		       void (*callback)(struct cn_msg *,
+					struct netlink_skb_parms *))
+{
+	int err;
+	struct cn_dev *dev = get_cdev(ve);
+
+	if (!ve->cn->cn_already_initialized)
+		return -EAGAIN;
+
+	err = cn_queue_add_callback(dev->cbdev, name, id, callback);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 /*
  * Callback add routing - adds callback with given ID and name.
  * If there is registered callback with the same ID it will not be added.
@@ -189,19 +209,16 @@ int cn_add_callback(struct cb_id *id, const char *name,
 		    void (*callback)(struct cn_msg *,
 				     struct netlink_skb_parms *))
 {
-	int err;
-	struct cn_dev *dev = &cdev;
-
-	if (!cn_already_initialized)
-		return -EAGAIN;
+	return cn_add_callback_ve(get_ve0(), id, name, callback);
+}
+EXPORT_SYMBOL_GPL(cn_add_callback);
 
-	err = cn_queue_add_callback(dev->cbdev, name, id, callback);
-	if (err)
-		return err;
+void cn_del_callback_ve(struct ve_struct *ve, struct cb_id *id)
+{
+	struct cn_dev *dev = get_cdev(ve);
 
-	return 0;
+	cn_queue_del_callback(dev->cbdev, id);
 }
-EXPORT_SYMBOL_GPL(cn_add_callback);
 
 /*
  * Callback remove routing - removes callback
@@ -213,15 +230,13 @@ EXPORT_SYMBOL_GPL(cn_add_callback);
  */
 void cn_del_callback(struct cb_id *id)
 {
-	struct cn_dev *dev = &cdev;
-
-	cn_queue_del_callback(dev->cbdev, id);
+	cn_del_callback_ve(get_ve0(), id);
 }
 EXPORT_SYMBOL_GPL(cn_del_callback);
 
 static int cn_proc_show(struct seq_file *m, void *v)
 {
-	struct cn_queue_dev *dev = cdev.cbdev;
+	struct cn_queue_dev *dev = get_cdev(get_exec_env())->cbdev;
 	struct cn_callback_entry *cbq;
 
 	seq_printf(m, "Name            ID\n");
@@ -253,45 +268,102 @@ static const struct file_operations cn_file_ops = {
 	.release = single_release
 };
 
-static struct cn_dev cdev = {
-	.input   = cn_rx_skb,
-};
-
-static int cn_init(void)
+static int cn_init_ve(void *data)
 {
-	struct cn_dev *dev = &cdev;
+	struct ve_struct *ve = data;
+	struct cn_dev *dev;
 	struct netlink_kernel_cfg cfg = {
 		.groups	= CN_NETLINK_USERS + 0xf,
-		.input	= dev->input,
+		.input	= cn_rx_skb,
 	};
+	struct net *net = ve->ve_netns;
+	int err;
 
-	dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg);
-	if (!dev->nls)
-		return -EIO;
+	ve->cn = kzalloc(sizeof(*ve->cn), GFP_KERNEL);
+	if (!ve->cn)
+		return -ENOMEM;
+
+	dev = &ve->cn->cdev;
+
+	dev->nls = netlink_kernel_create(net, NETLINK_CONNECTOR, &cfg);
+	if (!dev->nls) {
+		err = -EIO;
+		goto free_cn;
+	}
 
 	dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls);
 	if (!dev->cbdev) {
-		netlink_kernel_release(dev->nls);
-		return -EINVAL;
+		err = -EINVAL;
+		goto netlink_release;
 	}
 
-	cn_already_initialized = 1;
+	ve->cn->cn_already_initialized = 1;
 
-	proc_create("connector", S_IRUGO, init_net.proc_net, &cn_file_ops);
+	if (!proc_net_create("connector", S_IRUGO, net->proc_net, &cn_file_ops)) {
+		err = -ENOMEM;
+		goto free_cdev;
+	}
+
+	err = cn_proc_init_ve(ve);
+	if (err)
+		goto remove_proc;
 
 	return 0;
+
+remove_proc:
+	remove_proc_entry("connector", net->proc_net);
+free_cdev:
+	cn_queue_free_dev(dev->cbdev);
+netlink_release:
+	netlink_kernel_release(dev->nls);
+free_cn:
+	kfree(ve->cn);
+	ve->cn = NULL;
+	return err;
 }
 
-static void cn_fini(void)
+static void cn_fini_ve(void *data)
 {
-	struct cn_dev *dev = &cdev;
+	struct ve_struct *ve = data;
+	struct cn_dev *dev = get_cdev(ve);
+	struct net *net = ve->ve_netns;
 
-	cn_already_initialized = 0;
+	ve->cn->cn_already_initialized = 0;
 
-	remove_proc_entry("connector", init_net.proc_net);
+	cn_proc_fini_ve(ve);
+
+	remove_proc_entry("connector", net->proc_net);
 
 	cn_queue_free_dev(dev->cbdev);
 	netlink_kernel_release(dev->nls);
+
+	kfree(ve->cn);
+	ve->cn = NULL;
+}
+
+static struct ve_hook cn_ss_hook = {
+	.init = cn_init_ve,
+	.fini = cn_fini_ve,
+	.priority = HOOK_PRIO_DEFAULT,
+	.owner = THIS_MODULE,
+};
+
+static int cn_init(void)
+{
+	int err;
+
+	err = cn_init_ve(get_ve0());
+	if (err)
+		return err;
+
+	ve_hook_register(VE_SS_CHAIN, &cn_ss_hook);
+	return 0;
+}
+
+static void cn_fini(void)
+{
+	ve_hook_unregister(&cn_ss_hook);
+	cn_fini_ve(get_ve0());
 }
 
 subsys_initcall(cn_init);
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -122,18 +122,6 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
-static int get_loadavg(void)
-{
-	unsigned long this = this_cpu_load();
-
-
-	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
-}
-
 static inline int which_bucket(unsigned int duration)
 {
 	int bucket = 0;
@@ -173,7 +161,7 @@ static inline int performance_multiplier(void)
 
 	/* for higher loadavg, we are more reluctant */
 
-	mult += 2 * get_loadavg();
+	mult += 10 * nr_active_cpu();
 
 	/* for IO wait tasks (per cpu!) we add 5x each */
 	mult += 10 * nr_iowait_cpu(smp_processor_id());
--- a/drivers/gpu/drm/drm_backport.c
+++ b/drivers/gpu/drm/drm_backport.c
@@ -8,41 +8,6 @@
 
 #include <drm/drm_backport.h>
 
-/*
- * shrinker
- */
-
-#undef shrinker
-#undef register_shrinker
-#undef unregister_shrinker
-
-static int shrinker2_shrink(struct shrinker *shrinker, struct shrink_control *sc)
-{
-	struct shrinker2 *s2 = container_of(shrinker, struct shrinker2, compat);
-	int count;
-
-	s2->scan_objects(s2, sc);
-	count = s2->count_objects(s2, sc);
-	shrinker->seeks = s2->seeks;
-
-	return count;
-}
-
-int register_shrinker2(struct shrinker2 *s2)
-{
-	s2->compat.shrink = shrinker2_shrink;
-	s2->compat.seeks = s2->seeks;
-	register_shrinker(&s2->compat);
-	return 0;
-}
-EXPORT_SYMBOL(register_shrinker2);
-
-void unregister_shrinker2(struct shrinker2 *s2)
-{
-	unregister_shrinker(&s2->compat);
-}
-EXPORT_SYMBOL(unregister_shrinker2);
-
 int __init drm_backport_init(void)
 {
 	return 0;
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -389,6 +389,10 @@ static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free,
  *
  * XXX: (dchinner) Deadlock warning!
  *
+ * ttm_page_pool_free() does memory allocation using GFP_KERNEL.  That means
+ * this can deadlock when called with a sc->gfp_mask that is not equal to
+ * GFP_KERNEL.
+ *
  * This code is crying out for a shrinker per pool....
  */
 static unsigned long
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -1003,6 +1003,11 @@ EXPORT_SYMBOL_GPL(ttm_dma_unpopulate);
  *
  * XXX: (dchinner) Deadlock warning!
  *
+ * ttm_dma_page_pool_free() does GFP_KERNEL memory allocation, and so attention
+ * needs to be paid to sc->gfp_mask to determine if this can be done or not.
+ * GFP_KERNEL memory allocation in a GFP_ATOMIC reclaim context would be
+ * really bad.
+ *
  * I'm getting sadder as I hear more pathetical whimpers about needing per-pool
  * shrinkers
  */
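
Both XXX notes above make the same point: the pool-free paths allocate with GFP_KERNEL, so running them from a reclaim context that forbids __GFP_IO/__GFP_FS (or from atomic context) can deadlock. The usual defensive shape, sketched here only as an illustration of the comments and not as the TTM fix itself (example_pool_free() is a hypothetical helper):

	#include <linux/gfp.h>
	#include <linux/shrinker.h>

	/* hypothetical helper: frees up to nr pool pages, returns the count */
	unsigned long example_pool_free(unsigned long nr);

	static unsigned long example_pool_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
	{
		/*
		 * The free path needs GFP_KERNEL; bail out if reclaim is
		 * running with anything more restricted and let it retry
		 * from a safer context.
		 */
		if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
			return SHRINK_STOP;

		return example_pool_free(sc->nr_to_scan);
	}
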
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -73,10 +73,6 @@ enum hv_cpuid_function {
 /* Define version of the synthetic interrupt controller. */
 #define HV_SYNIC_VERSION		(1)
 
-/* Define synthetic interrupt controller message constants. */
-#define HV_MESSAGE_SIZE			(256)
-#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
-#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30)
 #define HV_ANY_VP			(0xFFFFFFFF)
 
 /* Define synthetic interrupt controller flag constants. */
@@ -84,48 +80,9 @@ enum hv_cpuid_function {
 #define HV_EVENT_FLAGS_BYTE_COUNT	(256)
 #define HV_EVENT_FLAGS_DWORD_COUNT	(256 / sizeof(u32))
 
-/* Define hypervisor message types. */
-enum hv_message_type {
-	HVMSG_NONE			= 0x00000000,
-
-	/* Memory access messages. */
-	HVMSG_UNMAPPED_GPA		= 0x80000000,
-	HVMSG_GPA_INTERCEPT		= 0x80000001,
-
-	/* Timer notification messages. */
-	HVMSG_TIMER_EXPIRED			= 0x80000010,
-
-	/* Error messages. */
-	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020,
-	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021,
-	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022,
-
-	/* Trace buffer complete messages. */
-	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040,
-
-	/* Platform-specific processor intercept messages. */
-	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000,
-	HVMSG_X64_MSR_INTERCEPT		= 0x80010001,
-	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002,
-	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003,
-	HVMSG_X64_APIC_EOI			= 0x80010004,
-	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
-};
-
-#define HV_SYNIC_STIMER_COUNT		(4)
-
 /* Define invalid partition identifier. */
 #define HV_PARTITION_ID_INVALID		((u64)0x0)
 
-/* Define port identifier type. */
-union hv_port_id {
-	u32 asu32;
-	struct {
-		u32 id:24;
-		u32 reserved:8;
-	} u ;
-};
-
 /* Define port type. */
 enum hv_port_type {
 	HVPORT_MSG	= 1,
@@ -173,27 +130,6 @@ struct hv_connection_info {
 	};
 };
 
-/* Define synthetic interrupt controller message flags. */
-union hv_message_flags {
-	u8 asu8;
-	struct {
-		u8 msg_pending:1;
-		u8 reserved:7;
-	};
-};
-
-/* Define synthetic interrupt controller message header. */
-struct hv_message_header {
-	u32 message_type;
-	u8 payload_size;
-	union hv_message_flags message_flags;
-	u8 reserved[2];
-	union {
-		u64 sender;
-		union hv_port_id port;
-	};
-};
-
 /*
  * Timer configuration register.
  */
@@ -210,31 +146,9 @@ union hv_timer_config {
 	};
 };
 
-
-/* Define timer message payload structure. */
-struct hv_timer_message_payload {
-	u32 timer_index;
-	u32 reserved;
-	u64 expiration_time;	/* When the timer expired */
-	u64 delivery_time;	/* When the message was delivered */
-};
-
-/* Define synthetic interrupt controller message format. */
-struct hv_message {
-	struct hv_message_header header;
-	union {
-		u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
-	} u ;
-};
-
 /* Define the number of message buffers associated with each port. */
 #define HV_PORT_MESSAGE_BUFFER_COUNT	(16)
 
-/* Define the synthetic interrupt message page layout. */
-struct hv_message_page {
-	struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
-};
-
 /* Define the synthetic interrupt controller event flags format. */
 union hv_synic_event_flags {
 	u8 flags8[HV_EVENT_FLAGS_BYTE_COUNT];
@@ -347,12 +261,6 @@ struct hv_monitor_page {
 	u8 rsvdz4[1984];
 };
 
-/* Declare the various hypercall operations. */
-enum hv_call_code {
-	HVCALL_POST_MESSAGE	= 0x005c,
-	HVCALL_SIGNAL_EVENT	= 0x005d,
-};
-
 /* Definition of the hv_post_message hypercall input structure. */
 struct hv_input_post_message {
 	union hv_connection_id connectionid;
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -678,6 +678,20 @@ static struct cpuidle_state dnv_cstates[] = {
 		.enter = NULL }
 };
 
+static int force_auto_demotion = 0;
+
+static int __init parse_intel_auto_demotion(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+	if (strcmp(arg, "force") == 0)
+		force_auto_demotion = 1;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("intel_auto_demotion", parse_intel_auto_demotion);
+
 /**
  * intel_idle
  * @dev: cpuidle_device
@@ -1113,7 +1127,7 @@ static int intel_idle_cpu_init(int cpu)
 		return -EIO;
 	}
 
-	if (icpu->auto_demotion_disable_flags)
+	if (icpu->auto_demotion_disable_flags && !force_auto_demotion)
 		smp_call_function_single(cpu, auto_demotion_disable, NULL, 1);
 
 	if (icpu->disable_promotion_to_c1e)
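
Runtime effect of the intel_idle change: booting with intel_auto_demotion=force leaves hardware C-state auto-demotion enabled on CPUs whose table entry carries auto_demotion_disable_flags (the disabling MSR write is simply skipped); any other value is rejected by parse_intel_auto_demotion(). Illustrative kernel command line:

	linux /boot/vmlinuz root=/dev/sda1 ro quiet intel_auto_demotion=force
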
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -46,7 +46,7 @@ static LIST_HEAD(input_handler_list);
  * be mutually exclusive which simplifies locking in drivers implementing
  * input handlers.
  */
-static DEFINE_MUTEX(input_mutex);
+DEFINE_MUTEX(input_mutex);
 
 static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 };
 
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -613,24 +613,19 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	return 0;
 }
 
-static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 	struct btree *b, *t;
 	unsigned long i, nr = sc->nr_to_scan;
+	unsigned long freed = 0;
 
 	if (c->shrinker_disabled)
-		return 0;
+		return SHRINK_STOP;
 
 	if (c->try_harder)
-		return 0;
-
-	/*
-	 * If nr == 0, we're supposed to return the number of items we have
-	 * cached. Not allowed to return -1.
-	 */
-	if (!nr)
-		return mca_can_free(c) * c->btree_pages;
+		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
 	if (sc->gfp_mask & __GFP_IO)
@@ -643,14 +638,14 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	i = 0;
 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
-		if (!nr)
+		if (freed >= nr)
 			break;
 
 		if (++i > 3 &&
 		    !mca_reap(b, NULL, 0)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
-			--nr;
+			freed++;
 		}
 	}
 
@@ -661,7 +656,7 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	if (list_empty(&c->btree_cache))
 		goto out;
 
-	for (i = 0; nr && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
 		b = list_first_entry(&c->btree_cache, struct btree, list);
 		list_rotate_left(&c->btree_cache);
 
@@ -670,14 +665,27 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
-			--nr;
+			freed++;
 		} else
 			b->accessed = 0;
 	}
 out:
-	nr = mca_can_free(c) * c->btree_pages;
 	mutex_unlock(&c->bucket_lock);
-	return nr;
+	return freed;
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+
+	if (c->shrinker_disabled)
+		return 0;
+
+	if (c->try_harder)
+		return 0;
+
+	return mca_can_free(c) * c->btree_pages;
 }
 
 void bch_btree_cache_free(struct cache_set *c)
@@ -746,7 +754,8 @@ int bch_btree_cache_alloc(struct cache_set *c)
 		c->verify_data = NULL;
 #endif
 
-	c->shrink.shrink = bch_mca_shrink;
+	c->shrink.count_objects = bch_mca_count;
+	c->shrink.scan_objects = bch_mca_scan;
 	c->shrink.seeks = 4;
 	c->shrink.batch = c->btree_pages * 2;
 	register_shrinker(&c->shrink);
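
This hunk, and the dm-bufio and raid5 hunks further down, port old-style ->shrink callbacks to the split count_objects/scan_objects shrinker API: count_objects() only returns a cheap size estimate (0 means "nothing to do"), while scan_objects() frees up to sc->nr_to_scan objects and returns how many it actually freed, or SHRINK_STOP when it cannot make progress in the current context. A generic sketch of the converted shape (all demo_* names are illustrative, not from the patch):

	#include <linux/mutex.h>
	#include <linux/shrinker.h>

	static DEFINE_MUTEX(demo_lock);
	static unsigned long demo_nr_cached;	/* whatever the cache tracks */

	static unsigned long demo_count(struct shrinker *s,
					struct shrink_control *sc)
	{
		/* cheap estimate only; may be stale */
		return demo_nr_cached;
	}

	static unsigned long demo_scan(struct shrinker *s,
				       struct shrink_control *sc)
	{
		unsigned long freed = 0;

		if (!mutex_trylock(&demo_lock))
			return SHRINK_STOP;	/* back off, don't block reclaim */

		while (freed < sc->nr_to_scan && demo_nr_cached) {
			demo_nr_cached--;	/* stand-in for freeing one object */
			freed++;
		}
		mutex_unlock(&demo_lock);

		return freed;			/* objects actually freed */
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	/* register_shrinker(&demo_shrinker) at init,
	   unregister_shrinker(&demo_shrinker) at teardown */
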
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -541,7 +541,7 @@ STORE(__bch_cache_set)
 		struct shrink_control sc;
 		sc.gfp_mask = GFP_KERNEL;
 		sc.nr_to_scan = strtoul_or_return(buf);
-		c->shrink.shrink(&c->shrink, &sc);
+		c->shrink.scan_objects(&c->shrink, &sc);
 	}
 
 	sysfs_strtoul(congested_read_threshold_us,
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1549,8 +1549,8 @@ static unsigned get_retain_buffers(struct dm_bufio_client *c)
         return retain_bytes / c->block_size;
 }
 
-static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-		   struct shrink_control *sc)
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+			    gfp_t gfp_mask)
 {
 	int l;
 	struct dm_buffer *b, *tmp;
@@ -1560,37 +1560,48 @@ static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 
 	for (l = 0; l < LIST_SIZE; l++) {
 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
-			if (__try_evict_buffer(b, sc->gfp_mask))
+			if (__try_evict_buffer(b, gfp_mask))
 				freed++;
 			if (!--nr_to_scan || ((count - freed) <= retain_target))
-				return;
+				return freed;
 			cond_resched();
 		}
 	}
+	return freed;
 }
 
-static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
+static unsigned long
+dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	struct dm_bufio_client *c =
-	    container_of(shrinker, struct dm_bufio_client, shrinker);
-	unsigned long r;
-	unsigned long nr_to_scan = sc->nr_to_scan;
+	struct dm_bufio_client *c;
+	unsigned long freed;
 
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
 	if (sc->gfp_mask & __GFP_FS)
 		dm_bufio_lock(c);
 	else if (!dm_bufio_trylock(c))
-		return !nr_to_scan ? 0 : -1;
+		return SHRINK_STOP;
 
-	if (nr_to_scan)
-		__scan(c, nr_to_scan, sc);
+	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
+	dm_bufio_unlock(c);
+	return freed;
+}
 
-	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
-	if (r > INT_MAX)
-		r = INT_MAX;
+static unsigned long
+dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct dm_bufio_client *c;
+	unsigned long count;
 
-	dm_bufio_unlock(c);
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
+	if (sc->gfp_mask & __GFP_FS)
+		dm_bufio_lock(c);
+	else if (!dm_bufio_trylock(c))
+		return 0;
 
-	return r;
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+	dm_bufio_unlock(c);
+	return count;
 }
 
 /*
@@ -1687,7 +1698,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	__cache_size_refresh();
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	c->shrinker.shrink = shrink;
+	c->shrinker.count_objects = dm_bufio_shrink_count;
+	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
 	register_shrinker(&c->shrinker);
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/key.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
@@ -28,11 +29,14 @@
 #include <crypto/hash.h>
 #include <crypto/md5.h>
 #include <crypto/algapi.h>
+#include <keys/user-type.h>
 
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
 
+#include <linux/ploop/ploop.h>
+#include "dm.h"
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -1487,15 +1491,95 @@ static int crypt_setkey(struct crypt_config *cc)
 	return err;
 }
 
+#ifdef CONFIG_KEYS
+static struct key *crypt_decode_get_keyring_key(char *key_desc)
+{
+	int ret;
+	struct key *key;
+	char *decoded_key_desc;
+	int key_desc_size = strlen(key_desc) >> 1;
+
+	decoded_key_desc = kmalloc(key_desc_size + 1, GFP_KERNEL);
+	if (!decoded_key_desc)
+		return ERR_PTR(-ENOMEM);
+
+	if (crypt_decode_key(decoded_key_desc, key_desc, key_desc_size) < 0) {
+		kfree(decoded_key_desc);
+		return ERR_PTR(-EINVAL);
+	}
+
+	decoded_key_desc[key_desc_size] = '\0';
+
+	key = request_key(&key_type_user, decoded_key_desc, NULL);
+	kfree(decoded_key_desc);
+	if (IS_ERR(key))
+		return key;
+
+	ret = key_validate(key);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	return key;
+}
+
+static int crypt_set_keyring_key(struct crypt_config *cc, char *key_desc)
+{
+	int ret = 0;
+	struct key *key;
+	const struct user_key_payload *ukp;
+
+	key = crypt_decode_get_keyring_key(key_desc);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	rcu_read_lock();
+	ukp = user_key_payload(key);
+	if (cc->key_size != ukp->datalen) {
+		ret = -EINVAL;
+		goto out;
+	}
+	memcpy(cc->key, ukp->data, cc->key_size);
+out:
+	rcu_read_unlock();
+	key_put(key);
+	return ret;
+}
+
+static int get_key_size(char *key_desc)
+{
+	int ret;
+	struct key *key;
+
+	if (key_desc[0] != ':')
+		return strlen(key_desc) >> 1;
+
+	key = crypt_decode_get_keyring_key(key_desc + 1);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	rcu_read_lock();
+	ret = user_key_payload(key)->datalen;
+	rcu_read_unlock();
+	key_put(key);
+	return ret;
+}
+#else
+static int crypt_set_keyring_key(struct crypt_config *cc, char *key_desc)
+{
+	return -EINVAL;
+}
+
+static int get_key_size(const char *key)
+{
+	return strlen(key) >> 1;
+}
+#endif
+
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
 	int r = -EINVAL;
 	int key_string_len = strlen(key);
 
-	/* The key size may not be changed. */
-	if (cc->key_size != (key_string_len >> 1))
-		goto out;
-
 	/* Hyphen (which gives a key_size of zero) means there is no key. */
 	if (!cc->key_size && strcmp(key, "-"))
 		goto out;
@@ -1503,8 +1587,19 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
 	/* clear the flag since following operations may invalidate previously valid key */
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
-		goto out;
+	/* ':' means that the key is in kernel keyring */
+	if (key[0] == ':') {
+		if (crypt_set_keyring_key(cc, key + 1))
+			goto out;
+	} else {
+		/* The key size may not be changed. */
+		if (cc->key_size != (key_string_len >> 1))
+			goto out;
+
+		if (cc->key_size &&
+			crypt_decode_key(cc->key, key, cc->key_size) < 0)
+			goto out;
+	}
 
 	r = crypt_setkey(cc);
 	if (!r)
@@ -1730,12 +1825,13 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 
 /*
  * Construct an encryption mapping:
- * <cipher> <key> <iv_offset> <dev_path> <start>
+ * <cipher> [<key>|:<key description>] <iv_offset> <dev_path> <start>
  */
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size, opt_params;
+	int key_size;
+	unsigned int opt_params;
 	unsigned long long tmpll;
 	int ret;
 	size_t iv_size_padding;
@@ -1752,7 +1848,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -EINVAL;
 	}
 
-	key_size = strlen(argv[1]) >> 1;
+	key_size = get_key_size(argv[1]);
+	if (key_size < 0) {
+		ti->error = "Cannot get the key";
+		return -EINVAL;
+	}
 
 	cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
 	if (!cc) {
@@ -2076,6 +2176,24 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	limits->max_segment_size = PAGE_SIZE;
 }
 
+static void crypt_ploop_modify(struct dm_target *ti, int action)
+{
+	struct crypt_config *cc = ti->private;
+
+	if (cc && cc->dev)
+		switch (action) {
+		case DM_PLOOP_ATTACH:
+			ploop_set_dm_crypt_bdev(cc->dev->bdev,
+				dm_md_get_bdev(dm_table_get_md(ti->table)));
+			break;
+		case DM_PLOOP_DETACH:
+			ploop_set_dm_crypt_bdev(cc->dev->bdev, NULL);
+			break;
+		default:
+			BUG();
+		}
+}
+
 static struct target_type crypt_target = {
 	.name   = "crypt",
 	.version = {1, 14, 1},
@@ -2091,6 +2209,7 @@ static struct target_type crypt_target = {
 	.merge  = crypt_merge,
 	.iterate_devices = crypt_iterate_devices,
 	.io_hints = crypt_io_hints,
+	.ploop_modify = crypt_ploop_modify,
 };
 
 static int __init dm_crypt_init(void)
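
With the keyring support above, the <key> field of a "crypt" table line either stays a plain hex-encoded key or starts with ':' followed by what the code treats as the hex-encoded description of a "user"-type key already present in the kernel keyring: crypt_set_keyring_key() hex-decodes the description, request_key()s it and copies the key payload into cc->key, so the payload length must match the cipher's key size. A hedged usage sketch (key name "cryptkey", whose hex encoding is 63727970746b6579; device, sizes and key file are made up):

	# make the key available to the process that will load the table
	keyctl padd user cryptkey @s < /tmp/key.bin

	# 204800 sectors mapped onto /dev/sdb, key taken from the keyring
	dmsetup create cr0 --table \
	  "0 204800 crypt aes-xts-plain64 :63727970746b6579 0 /dev/sdb 0"
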
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,14 +36,6 @@ struct hash_cell {
 	struct dm_table *new_map;
 };
 
-/*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
-	int undefined__;
-};
-
 struct vers_iter {
     size_t param_size;
     struct dm_target_versions *vers, *old_vers;
@@ -1037,6 +1029,9 @@ static int do_resume(struct dm_ioctl *param)
 			return PTR_ERR(old_map);
 		}
 
+		dm_table_ploop_modify(old_map, DM_PLOOP_DETACH);
+		dm_table_ploop_modify(new_map, DM_PLOOP_ATTACH);
+
 		if (dm_table_get_mode(new_map) & FMODE_WRITE)
 			set_disk_ro(dm_disk(md), 0);
 		else
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1812,3 +1812,18 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
 
+void dm_table_ploop_modify(struct dm_table *t, int action)
+{
+	unsigned int i;
+
+	if (!t)
+		return;
+
+	/* attach or detach the targets */
+	for (i = 0; i < t->num_targets; i++) {
+		struct dm_target *tgt = t->targets + i;
+
+		if (tgt->type->ploop_modify)
+			tgt->type->ploop_modify(tgt, action);
+	}
+}
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -165,6 +165,12 @@ static unsigned dm_get_numa_node(void)
 					 DM_NUMA_NODE, num_online_nodes() - 1);
 }
 
+struct block_device *dm_md_get_bdev(struct mapped_device *md)
+{
+	return md->bdev;
+}
+EXPORT_SYMBOL_GPL(dm_md_get_bdev);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -1861,7 +1867,7 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	bd_write_size(md->bdev, (loff_t)size << SECTOR_SHIFT);
 }
 
 /*
@@ -2200,7 +2206,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		       dm_device_name(md), atomic_read(&md->holders));
 
 	dm_sysfs_exit(md);
-	dm_table_destroy(__unbind(md));
+	map = __unbind(md);
+	dm_table_ploop_modify(map, DM_PLOOP_DETACH);
+	dm_table_destroy(map);
 	free_dev(md);
 }
 
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,9 +72,12 @@ bool dm_table_request_based(struct dm_table *t);
 bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+void dm_table_ploop_modify(struct dm_table *t, int action);
 
 int dm_queue_merge_is_compulsory(struct request_queue *q);
 
+struct block_device *dm_md_get_bdev(struct mapped_device *md);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6652,14 +6652,14 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
 				      struct shrink_control *sc)
 {
 	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
-	unsigned long ret = ~0UL; /* SHRINK_STOP */
+	unsigned long ret = SHRINK_STOP;
 
 	if (mutex_trylock(&conf->cache_size_mutex)) {
 		ret= 0;
 		while (ret < sc->nr_to_scan &&
 		       conf->max_nr_stripes > conf->min_nr_stripes) {
 			if (drop_one_stripe(conf) == 0) {
-				ret = ~0UL; /* SHRINK_STOP */
+				ret = SHRINK_STOP;
 				break;
 			}
 			ret++;
@@ -6680,14 +6680,6 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
 	return conf->max_nr_stripes - conf->min_nr_stripes;
 }
 
-static int raid5_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
-{
-	if (sc->nr_to_scan)
-		(void) raid5_cache_scan(shrink, sc);
-
-	return raid5_cache_count(shrink, sc);
-}
-
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
 	struct r5conf *conf;
@@ -6880,8 +6872,10 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	 * So set it rather large, scaled by number of devices.
 	 */
 	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
-	conf->shrinker.shrink = raid5_cache_shrink;
+	conf->shrinker.scan_objects = raid5_cache_scan;
+	conf->shrinker.count_objects = raid5_cache_count;
 	conf->shrinker.batch = 128;
+	conf->shrinker.flags = 0;
 	register_shrinker(&conf->shrinker);
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -26,6 +26,9 @@ obj-$(CONFIG_VXLAN) += vxlan.o
 obj-$(CONFIG_GENEVE) += geneve.o
 obj-$(CONFIG_NLMON) += nlmon.o
 
+obj-$(CONFIG_VE_NETDEV) += vznetdev.o
+vznetdev-objs := venetdev.o veip_mgmt.o
+
 #
 # Networking Drivers
 #
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -285,7 +285,7 @@ void bond_create_proc_entry(struct bonding *bond)
 	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
 
 	if (bn->proc_dir) {
-		bond->proc_entry = proc_create_data(bond_dev->name,
+		bond->proc_entry = proc_net_create_data(bond_dev->name,
 						    S_IRUGO, bn->proc_dir,
 						    &bond_info_fops, bond);
 		if (bond->proc_entry == NULL)
@@ -314,7 +314,8 @@ void bond_remove_proc_entry(struct bonding *bond)
 void __net_init bond_create_proc_dir(struct bond_net *bn)
 {
 	if (!bn->proc_dir) {
-		bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net);
+		bn->proc_dir = proc_net_mkdir(bn->net, DRV_NAME,
+					      bn->net->proc_net);
 		if (!bn->proc_dir)
 			pr_warn("Warning: Cannot create /proc/net/%s\n",
 				DRV_NAME);
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -134,6 +134,7 @@ static void dummy_setup(struct net_device *dev)
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
 	dev->features	|= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
 	dev->features	|= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
+	dev->features	|= NETIF_F_VIRTUAL;
 	eth_hw_addr_random(dev);
 }
 
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@@ -104,7 +104,6 @@
 MODULE_AUTHOR("Advanced Micro Devices, Inc.");
 MODULE_DESCRIPTION ("AMD8111 based 10/100 Ethernet Controller. Driver Version "MODULE_VERS);
 MODULE_LICENSE("GPL");
-MODULE_DEVICE_TABLE(pci, amd8111e_pci_tbl);
 module_param_array(speed_duplex, int, NULL, 0);
 MODULE_PARM_DESC(speed_duplex, "Set device speed and duplex modes, 0: Auto Negotiate, 1: 10Mbps Half Duplex, 2: 10Mbps Full Duplex, 3: 100Mbps Half Duplex, 4: 100Mbps Full Duplex");
 module_param_array(coalesce, bool, NULL, 0);
@@ -119,6 +118,7 @@ static const struct pci_device_id amd8111e_pci_tbl[] = {
 	{ 0, }
 
 };
+MODULE_DEVICE_TABLE(pci, amd8111e_pci_tbl);
 /*
 This function will read the PHY registers.
 */
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -26,7 +26,6 @@
 #include <net/vxlan.h>
 
 MODULE_VERSION(DRV_VER);
-MODULE_DEVICE_TABLE(pci, be_dev_ids);
 MODULE_DESCRIPTION(DRV_DESC " " DRV_VER);
 MODULE_AUTHOR("Emulex Corporation");
 MODULE_LICENSE("GPL");
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -400,119 +400,87 @@ struct ixgbe_q_vector {
 	char name[IFNAMSIZ + 9];
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int state;
-#define IXGBE_QV_STATE_IDLE        0
-#define IXGBE_QV_STATE_NAPI	   1     /* NAPI owns this QV */
-#define IXGBE_QV_STATE_POLL	   2     /* poll owns this QV */
-#define IXGBE_QV_STATE_DISABLED	   4     /* QV is disabled */
-#define IXGBE_QV_OWNED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
-#define IXGBE_QV_LOCKED (IXGBE_QV_OWNED | IXGBE_QV_STATE_DISABLED)
-#define IXGBE_QV_STATE_NAPI_YIELD  8     /* NAPI yielded this QV */
-#define IXGBE_QV_STATE_POLL_YIELD  16    /* poll yielded this QV */
-#define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD)
-#define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD)
-	spinlock_t lock;
 #endif  /* CONFIG_NET_RX_BUSY_POLL */
+	atomic_t state;
 
 	/* for dynamic allocation of rings associated with this q_vector */
 	struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp;
 };
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
+enum ixgbe_qv_state_t {
+	IXGBE_QV_STATE_IDLE = 0,
+	IXGBE_QV_STATE_NAPI,
+	IXGBE_QV_STATE_POLL,
+	IXGBE_QV_STATE_DISABLE
+};
+
 static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
 {
-
-	spin_lock_init(&q_vector->lock);
-	q_vector->state = IXGBE_QV_STATE_IDLE;
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from the device poll routine to get ownership of a q_vector */
 static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_LOCKED) {
-		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
-		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_NAPI);
 #ifdef BP_EXTENDED_STATS
+	if (rc != IXGBE_QV_STATE_IDLE)
 		q_vector->tx.ring->stats.yields++;
 #endif
-	} else {
-		/* we don't care if someone yielded */
-		q_vector->state = IXGBE_QV_STATE_NAPI;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
-/* returns true is someone tried to get the qv while napi had it */
+/* called from the device poll routine to release ownership of a q_vector */
-static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL |
-			       IXGBE_QV_STATE_NAPI_YIELD));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_NAPI);
+
+	/* flush any outstanding Rx frames */
+	if (q_vector->napi.gro_list)
+		napi_gro_flush(&q_vector->napi, false);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from ixgbe_low_latency_poll() */
 static inline bool ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if ((q_vector->state & IXGBE_QV_LOCKED)) {
-		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_POLL);
 #ifdef BP_EXTENDED_STATS
+	if (rc != IXGBE_QV_STATE_IDLE)
 		q_vector->rx.ring->stats.yields++;
 #endif
-	} else {
-		/* preserve yield marks */
-		q_vector->state |= IXGBE_QV_STATE_POLL;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
-/* returns true if someone tried to get the qv while it was locked */
+/* releases the q_vector taken in ixgbe_qv_lock_poll() */
-static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_NAPI));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_POLL);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* true if a socket is polling, even if it did not get the lock */
 static inline bool ixgbe_qv_busy_polling(struct ixgbe_q_vector *q_vector)
 {
-	WARN_ON(!(q_vector->state & IXGBE_QV_OWNED));
-	return q_vector->state & IXGBE_QV_USER_PEND;
+	return atomic_read(&q_vector->state) == IXGBE_QV_STATE_POLL;
 }
 
 /* false if QV is currently owned */
 static inline bool ixgbe_qv_disable(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_OWNED)
-		rc = false;
-	q_vector->state |= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-
-	return rc;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_DISABLE);
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -853,6 +853,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 	netif_napi_add(adapter->netdev, &q_vector->napi,
 		       ixgbe_poll, 64);
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* initialize busy poll */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_DISABLE);
+
+#endif
 	/* tie q_vector and adapter together */
 	adapter->q_vector[v_idx] = q_vector;
 	q_vector->adapter = adapter;
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -58,6 +58,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/ve.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -75,6 +76,12 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	struct pcpu_lstats *lb_stats;
 	int len;
 
+#ifdef CONFIG_VE
+	if (unlikely(dev_net(dev)->owner_ve->disable_net)) {
+		kfree_skb(skb);
+		return 0;
+	}
+#endif
 	skb_orphan(skb);
 
 	/* Before queueing this packet to netif_rx(),
@@ -179,6 +186,7 @@ static void loopback_setup(struct net_device *dev)
 		| NETIF_F_HIGHDMA
 		| NETIF_F_LLTX
 		| NETIF_F_NETNS_LOCAL
+		| NETIF_F_VIRTUAL
 		| NETIF_F_VLAN_CHALLENGED
 		| NETIF_F_LOOPBACK;
 	dev->ethtool_ops	= &loopback_ethtool_ops;
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -54,6 +54,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+#include <linux/ve.h>
+#include <uapi/linux/vzcalluser.h>
+
 #define PPP_VERSION	"2.4.2"
 
 /*
@@ -379,8 +382,10 @@ static int ppp_open(struct inode *inode, struct file *file)
 	/*
 	 * This could (should?) be enforced by the permissions on /dev/ppp.
 	 */
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(current_user_ns(), CAP_NET_ADMIN))
 		return -EPERM;
+	if (!net_generic(current->nsproxy->net_ns, ppp_net_id)) /* no VE_FEATURE_PPP */
+		return -EACCES;
 	return 0;
 }
 
@@ -879,6 +884,9 @@ static __net_init int ppp_init_net(struct net *net)
 {
 	struct ppp_net *pn = net_generic(net, ppp_net_id);
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, ppp_net_id, NULL);
+
 	idr_init(&pn->units_idr);
 	mutex_init(&pn->all_ppp_mutex);
 
@@ -894,6 +902,9 @@ static __net_exit void ppp_exit_net(struct net *net)
 {
 	struct ppp_net *pn = net_generic(net, ppp_net_id);
 
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
 	idr_destroy(&pn->units_idr);
 }
 
@@ -1080,7 +1091,7 @@ static void ppp_setup(struct net_device *dev)
 	dev->tx_queue_len = 3;
 	dev->type = ARPHRD_PPP;
 	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
-	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_VIRTUAL;
 	netif_keep_dst(dev);
 }
 
@@ -2213,12 +2224,14 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
 	struct channel *pch;
 	struct ppp_net *pn;
 
+	pn = ppp_pernet(net);
+	if (!pn)
+		return -EACCES;
+
 	pch = kzalloc(sizeof(struct channel), GFP_KERNEL);
 	if (!pch)
 		return -ENOMEM;
 
-	pn = ppp_pernet(net);
-
 	pch->ppp = NULL;
 	pch->chan = chan;
 	pch->chan_net = net;
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -77,6 +77,8 @@
 #include <linux/file.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -283,6 +285,8 @@ static void pppoe_flush_dev(struct net_device *dev)
 	int i;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
 	write_lock_bh(&pn->hash_lock);
 	for (i = 0; i < PPPOE_HASH_SIZE; i++) {
 		struct pppox_sock *po = pn->hash_table[i];
@@ -439,6 +443,8 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto drop;
 
 	/* Note that get_item does a sock_hold(), so sk_pppox(po)
 	 * is known to be safe.
@@ -497,6 +503,9 @@ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto abort;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto abort;
+
 	po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
 	if (po) {
 		struct sock *sk = sk_pppox(po);
@@ -551,6 +560,9 @@ static int pppoe_create(struct net *net, struct socket *sock)
 {
 	struct sock *sk;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
 	if (!sk)
 		return -ENOMEM;
@@ -1149,9 +1161,12 @@ static __net_init int pppoe_init_net(struct net *net)
 	struct pppoe_net *pn = pppoe_pernet(net);
 	struct proc_dir_entry *pde;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, pppoe_net_id, NULL);
+
 	rwlock_init(&pn->hash_lock);
 
-	pde = proc_create("pppoe", S_IRUGO, net->proc_net, &pppoe_seq_fops);
+	pde = proc_net_create("pppoe", S_IRUGO, net->proc_net, &pppoe_seq_fops);
 #ifdef CONFIG_PROC_FS
 	if (!pde)
 		return -ENOMEM;
@@ -1162,6 +1177,12 @@ static __net_init int pppoe_init_net(struct net *net)
 
 static __net_exit void pppoe_exit_net(struct net *net)
 {
+	struct pppoe_net *pn;
+
+	pn = net_generic(net, pppoe_net_id);
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
 	remove_proc_entry("pppoe", net->proc_net);
 }
 
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,10 @@
 
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+#include <linux/vznetstat.h>
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -218,6 +222,9 @@ struct tun_struct {
 	u32 flow_count;
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	struct venet_stat *vestat;
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
 };
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -1436,6 +1443,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb, 0);
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat) {
+		venet_acct_classify_add_outgoing(tun->vestat, skb);
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	rxhash = __skb_get_hash_symmetric(skb);
 #ifndef CONFIG_4KSTACKS
 	tun_rx_batched(tun, tfile, skb, more);
@@ -1597,6 +1610,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	u64_stats_update_end(&stats->syncp);
 	put_cpu_ptr(tun->pcpu_stats);
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat) {
+		venet_acct_classify_add_incoming(tun->vestat, skb);
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	return total;
 }
 
@@ -1702,6 +1721,14 @@ static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
+
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat) {
+		venet_acct_put_stat(tun->vestat);
+		tun->vestat = NULL;
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	free_netdev(dev);
 }
 
@@ -1981,7 +2008,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
 				   NETIF_F_HW_VLAN_STAG_TX;
-		dev->features = dev->hw_features | NETIF_F_LLTX;
+		dev->features = dev->hw_features | NETIF_F_LLTX |
+				   NETIF_F_VIRTUAL;
 		dev->vlan_features = dev->features &
 				     ~(NETIF_F_HW_VLAN_CTAG_TX |
 				       NETIF_F_HW_VLAN_STAG_TX);
@@ -2153,11 +2181,38 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 	return ret;
 }
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+/* setacctid_ioctl should be called under rtnl_lock */
+static int tun_set_acctid(struct net *net, struct ifreq *ifr)
+{
+	struct net_device *dev;
+	struct tun_struct *tun;
+
+	dev = __dev_get_by_name(net, ifr->ifr_name);
+	if (dev == NULL)
+		return -ENOENT;
+
+	/* This check may be dropped to allow tun devices */
+	if (dev->netdev_ops != &tap_netdev_ops)
+		return -EINVAL;
+
+	tun = netdev_priv(dev);
+	if (tun->vestat) {
+		venet_acct_put_stat(tun->vestat);
+	}
+	tun->vestat = venet_acct_find_create_stat(ifr->ifr_acctid);
+	if (tun->vestat == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun;
+	struct tun_struct *tun = NULL;
 	void __user* argp = (void __user*)arg;
 	struct ifreq ifr;
 	kuid_t owner;
@@ -2168,7 +2223,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	int le;
 	int ret;
 
-	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
+	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || cmd == TUNSETACCTID ||
+			_IOC_TYPE(cmd) == 0x89) {
 		if (copy_from_user(&ifr, argp, ifreq_len))
 			return -EFAULT;
 	} else {
@@ -2187,6 +2243,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	ret = 0;
 	rtnl_lock();
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (cmd == TUNSETACCTID) {
+		ret = tun_set_acctid(tfile->net, &ifr);
+		goto unlock;
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	tun = __tun_get(tfile);
 	if (cmd == TUNSETIFF && !tun) {
 		ifr.ifr_name[IFNAMSIZ-1] = '\0';
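
The TUNSETACCTID handling above binds an existing tap device to a venet_stat accounting context looked up by id. A userspace sketch of driving it follows; TUNSETACCTID and the ifr_acctid field come from Virtuozzo uapi headers added elsewhere in this series, so treat those exact names as assumptions:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	static int tap_set_acctid(const char *ifname, unsigned int acctid)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return -1;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_acctid = acctid;		/* assumed field name */

		if (ioctl(fd, TUNSETACCTID, &ifr) < 0) {	/* assumed ioctl */
			close(fd);
			return -1;
		}

		close(fd);
		return 0;
	}
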
--- /dev/null
+++ b/drivers/net/veip_mgmt.c
@@ -0,0 +1,174 @@
+/*
+ *  drivers/net/veip_mgmt.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Virtual Networking device used to change VE ownership on packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <linux/inet.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <linux/venet.h>
+#include <linux/ve.h>
+
+static void veip_free(struct veip_struct *veip)
+{
+	kfree(veip);
+}
+
+static void veip_release(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = ve->veip;
+	ve->veip = NULL;
+	barrier();
+	veip_put(veip);
+}
+
+static int veip_create(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = veip_findcreate(ve->veid);
+	if (veip == NULL)
+		return -ENOMEM;
+	if (IS_ERR(veip))
+		return PTR_ERR(veip);
+
+	ve->veip = veip;
+	return 0;
+}
+
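+/*
+ * Extract the IPv4/IPv6 address the ownership decision is based on:
+ * the destination address for host->VE traffic (dir != 0) or the
+ * source address for VE->host traffic (dir == 0).  Non-IP packets
+ * yield -EAFNOSUPPORT.
+ */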
+static int skb_extract_addr(struct sk_buff *skb,
+		struct ve_addr_struct *addr, int dir)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		addr->family = AF_INET;
+		addr->key[0] = 0;
+		addr->key[1] = 0;
+		addr->key[2] = 0;
+		addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
+		return 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+		addr->family = AF_INET6;
+		memcpy(&addr->key, dir ?
+				ipv6_hdr(skb)->daddr.s6_addr32 :
+				ipv6_hdr(skb)->saddr.s6_addr32,
+				sizeof(addr->key));
+		return 0;
+#endif
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+static struct ve_struct *venet_find_ve(struct ve_addr_struct *addr, int dir)
+{
+	struct ip_entry_struct *entry;
+	struct ve_struct *ve = NULL;
+
+	entry = venet_entry_lookup(addr);
+	if (entry != NULL)
+		ve = ACCESS_ONCE(entry->active_env);
+
+	return ve;
+}
+
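+/*
+ * Decide which VE should receive this packet.  For traffic leaving a
+ * VE the source address must belong to that VE (or be one of its
+ * "external" addresses) and the packet is handed to VE0; for traffic
+ * sent by the host the packet goes to the VE owning the destination
+ * address.  Returns an ERR_PTR() if the packet must be dropped.
+ */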
+static struct ve_struct *
+veip_lookup(struct ve_struct *ve_old, struct sk_buff *skb)
+{
+	struct ve_struct *ve;
+	int dir;
+	struct ve_addr_struct addr;
+
+	dir = ve_is_super(ve_old);
+	if (skb_extract_addr(skb, &addr, dir) < 0)
+		goto out_drop_nolock;
+
+	rcu_read_lock();
+	if (!dir) {
+		/* from VE to host */
+		ve = venet_find_ve(&addr, 0);
+		if (ve == NULL) {
+			if (!venet_ext_lookup(ve_old, &addr))
+				goto out_drop;
+		} else {
+			if (ve != ve_old)
+				goto out_source;
+		}
+
+		ve = get_ve0();
+	} else {
+		/* from host to VE */
+		ve = venet_find_ve(&addr, 1);
+		if (ve == NULL)
+			goto out_drop;
+	}
+	rcu_read_unlock();
+
+	return ve;
+
+out_drop:
+	rcu_read_unlock();
+out_drop_nolock:
+	return ERR_PTR(-ESRCH);
+
+out_source:
+	rcu_read_unlock();
+	if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
+		printk(KERN_WARNING "Dropped packet, source wrong "
+		       "veid=%s src-IP=%u.%u.%u.%u "
+		       "dst-IP=%u.%u.%u.%u\n",
+		       ve_name(ve_old),
+		       NIPQUAD(ip_hdr(skb)->saddr),
+		       NIPQUAD(ip_hdr(skb)->daddr));
+	}
+	return ERR_PTR(-EACCES);
+}
+
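+/*
+ * Module unload path: drop every remaining IP entry and veip.
+ */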
+void veip_cleanup(void)
+{
+	int i;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		while (!hlist_empty(ip_entry_hash_table + i)) {
+			struct ip_entry_struct *entry;
+
+			entry = hlist_entry(ip_entry_hash_table[i].first,
+					struct ip_entry_struct, ip_hash);
+			hlist_del(&entry->ip_hash);
+			list_del(&entry->ve_list);
+			kfree(entry);
+		}
+
+	/* vzredir may leave some veip-s behind */
+	while (!list_empty(&veip_lh)) {
+		veip = list_first_entry(&veip_lh, struct veip_struct, list);
+		veip_put(veip);
+	}
+	spin_unlock(&veip_lock);
+}
+
+static struct veip_pool_ops open_pool_ops = {
+	.veip_create = veip_create,
+	.veip_release = veip_release,
+	.veip_free = veip_free,
+	.veip_lookup = veip_lookup,
+};
+
+struct veip_pool_ops *veip_pool_ops = &open_pool_ops;
+EXPORT_SYMBOL(veip_pool_ops);
--- /dev/null
+++ b/drivers/net/venetdev.c
@@ -0,0 +1,1258 @@
+/*
+ *  drivers/net/venetdev.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Common part for Virtuozzo virtual network devices
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/addrconf.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h>	/* For the statistics structure. */
+#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
+#include <linux/ethtool.h>
+#include <linux/venet.h>
+#include <linux/ve_proto.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzctl_venet.h>
+#include <linux/ve.h>
+#include <linux/venet-netlink.h>
+
+struct hlist_head ip_entry_hash_table[VEIP_HASH_SZ];
+DEFINE_SPINLOCK(veip_lock);
+LIST_HEAD(veip_lh);
+static struct rtnl_link_ops venet_link_ops;
+
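+/*
+ * Hash on key[3], which holds the whole IPv4 address or the last
+ * 32 bits of an IPv6 address.
+ */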
+#define ip_entry_hash_function(ip)  (ntohl(ip) & (VEIP_HASH_SZ - 1))
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
+{
+	hlist_add_head_rcu(&entry->ip_hash,
+			ip_entry_hash_table +
+			ip_entry_hash_function(entry->addr.key[3]));
+	list_add(&entry->ve_list, &veip->ip_lh);
+}
+
+static void ip_entry_free(struct rcu_head *rcu)
+{
+	struct ip_entry_struct *e;
+
+	e = container_of(rcu, struct ip_entry_struct, rcu);
+	kfree(e);
+}
+
+void ip_entry_unhash(struct ip_entry_struct *entry)
+{
+	list_del(&entry->ve_list);
+	hlist_del_rcu(&entry->ip_hash);
+	call_rcu(&entry->rcu, ip_entry_free);
+}
+
+static void veip_free(struct rcu_head *rcu)
+{
+	struct veip_struct *veip;
+
+	veip = container_of(rcu, struct veip_struct, rcu);
+	veip_pool_ops->veip_free(veip);
+}
+
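+/*
+ * Drop the veip once no IP, source or destination entries reference
+ * it.  Returns 1 if the veip was unlinked and scheduled for freeing,
+ * 0 otherwise.
+ */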
+int veip_put(struct veip_struct *veip)
+{
+	if (!list_empty(&veip->ip_lh))
+		return 0;
+	if (!list_empty(&veip->src_lh))
+		return 0;
+	if (!list_empty(&veip->dst_lh))
+		return 0;
+
+	list_del(&veip->list);
+	call_rcu(&veip->rcu, veip_free);
+	return 1;
+}
+
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry;
+
+	hlist_for_each_entry_rcu(entry, ip_entry_hash_table +
+			ip_entry_hash_function(addr->key[3]), ip_hash)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry;
+	struct veip_struct *veip;
+
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		return NULL;
+
+	list_for_each_entry_rcu (entry, &veip->ext_lh, list)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+static int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry, *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	spin_lock(&veip_lock);
+	err = -EADDRINUSE;
+	found = venet_ext_lookup(ve, addr);
+	if (found != NULL)
+		goto out_unlock;
+
+	entry->addr = *addr;
+	list_add_rcu(&entry->list, &ve->veip->ext_lh);
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+	if (entry != NULL)
+		kfree(entry);
+	return err;
+}
+
+static void venet_ext_free(struct rcu_head *rcu)
+{
+	struct ext_entry_struct *e;
+
+	e = container_of(rcu, struct ext_entry_struct, rcu);
+	kfree(e);
+}
+
+static void venet_ext_release(struct ext_entry_struct *e)
+{
+	list_del_rcu(&e->list);
+	call_rcu(&e->rcu, venet_ext_free);
+}
+
+static int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_ext_lookup(ve, addr);
+	if (found == NULL)
+		goto out;
+
+	venet_ext_release(found);
+	err = 0;
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static void __venet_ext_clean(struct ve_struct *ve)
+{
+	struct ext_entry_struct *entry, *tmp;
+
+	list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list)
+		venet_ext_release(entry);
+}
+
+static struct veip_struct *veip_find(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	list_for_each_entry(ptr, &veip_lh, list) {
+		if (ptr->veid != veid)
+			continue;
+		return ptr;
+	}
+	return NULL;
+}
+
+struct veip_struct *veip_findcreate(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	ptr = veip_find(veid);
+	if (ptr != NULL)
+		return ERR_PTR(-EEXIST);
+
+	ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC);
+	if (ptr == NULL)
+		return NULL;
+	memset(ptr, 0, sizeof(struct veip_struct));
+	INIT_LIST_HEAD(&ptr->ip_lh);
+	INIT_LIST_HEAD(&ptr->src_lh);
+	INIT_LIST_HEAD(&ptr->dst_lh);
+	INIT_LIST_HEAD(&ptr->ext_lh);
+	ptr->veid = veid;
+	list_add(&ptr->list, &veip_lh);
+	return ptr;
+}
+
+static int veip_start(struct ve_struct *ve)
+{
+	int err, get;
+
+	spin_lock(&veip_lock);
+
+	get = ve->veip == NULL;
+	err = veip_pool_ops->veip_create(ve);
+	if (!err && get && !ve_is_super(ve))
+		__module_get(THIS_MODULE);
+
+	spin_unlock(&veip_lock);
+
+	return err;
+}
+
+static void __veip_stop(struct ve_struct *ve)
+{
+	struct list_head *p, *tmp;
+
+	list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
+		struct ip_entry_struct *ptr;
+		ptr = list_entry(p, struct ip_entry_struct, ve_list);
+		ptr->active_env = NULL;
+
+		if (ptr->tgt_veip == NULL)
+			ip_entry_unhash(ptr);
+	}
+
+	veip_pool_ops->veip_release(ve);
+	if (!ve_is_super(ve))
+		module_put(THIS_MODULE);
+}
+
+static void veip_stop(struct ve_struct *ve)
+{
+	spin_lock(&veip_lock);
+	if (ve->veip)
+		__veip_stop(ve);
+	spin_unlock(&veip_lock);
+}
+
+static int veip_entry_conflict(struct ip_entry_struct *entry, struct ve_struct *ve)
+{
+	if (entry->active_env != NULL)
+		return -EADDRINUSE;
+	if (entry->tgt_veip && entry->tgt_veip->veid != ve->veid)
+		return -EADDRNOTAVAIL;
+
+	entry->active_env = ve;
+	return 0;
+}
+
+static int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry, *found;
+	int err;
+
+	entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	if (ve->veip == NULL) {
+		/* This can happen if we load venet AFTER ve was started */
+		err = veip_start(ve);
+		if (err < 0)
+			goto out;
+	}
+
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found != NULL) {
+		err = veip_entry_conflict(found, ve);
+		goto out_unlock;
+	}
+
+	entry->active_env = ve;
+	entry->addr = *addr;
+	ip_entry_hash(entry, ve->veip);
+
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+out:
+	if (entry != NULL)
+		kfree(entry);
+
+	return err;
+}
+
+static int veip_entry_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *found;
+	int err;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found == NULL)
+		goto out;
+	if (found->active_env == NULL)
+		goto out;
+	if (found->active_env->veid != ve->veid)
+		goto out;
+
+	err = 0;
+	found->active_env = NULL;
+
+	if (found->tgt_veip == NULL)
+		ip_entry_unhash(found);
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static int convert_sockaddr(struct sockaddr *addr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in *)addr;
+		veaddr->family = AF_INET;
+		veaddr->key[0] = 0;
+		veaddr->key[1] = 0;
+		veaddr->key[2] = 0;
+		veaddr->key[3] = sin->sin_addr.s_addr;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in6))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in6 *)addr;
+		veaddr->family = AF_INET6;
+		memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key));
+		break;
+	}
+	default:
+		err = -EAFNOSUPPORT;
+	}
+	return err;
+}
+
+int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+	char addr[MAX_SOCK_ADDR];
+
+	err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr_storage *)&addr);
+	if (err < 0)
+		goto out;
+
+	err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr);
+out:
+	return err;
+}
+
+int in4_to_veaddr(const char *addr, struct ve_addr_struct *veaddr)
+{
+	veaddr->family = AF_INET;
+	if (!in4_pton(addr, -1, (u8 *)(&veaddr->key[3]), -1, NULL))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(in4_to_veaddr);
+
+int in6_to_veaddr(const char *addr, struct ve_addr_struct *veaddr)
+{
+	veaddr->family = AF_INET6;
+	if (!in6_pton(addr, -1, (u8 *)(veaddr->key), -1, NULL))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(in6_to_veaddr);
+
+void veaddr_print(char *str, int len, struct ve_addr_struct *a)
+{
+	if (a->family == AF_INET)
+		snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3]));
+	else
+		snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x",
+				ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF,
+				ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF,
+				ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF,
+				ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF
+			);
+}
+
+/*
+ * Device functions
+ */
+
+static int venet_open(struct net_device *dev)
+{
+	if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE))
+		return -EBUSY;
+	return 0;
+}
+
+static int venet_close(struct net_device *master)
+{
+	if (!ve_is_super(get_exec_env()))
+		module_put(THIS_MODULE);
+	return 0;
+}
+
+void (*venet_free_stat)(struct ve_struct *) = NULL;
+EXPORT_SYMBOL(venet_free_stat);
+
+static void venet_destructor(struct net_device *dev)
+{
+	struct venet_stats *stats = (struct venet_stats *)dev->ml_priv;
+
+	if (venet_free_stat)
+		venet_free_stat(dev->nd_net->owner_ve);
+
+	free_percpu(stats->real_stats);
+	kfree(stats);
+	free_netdev(dev);
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *stats;
+	struct net_device *rcv = NULL;
+	struct ve_struct *ve;
+	int length;
+
+	stats = venet_stats(dev, smp_processor_id());
+	ve = dev_net(dev)->owner_ve;
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	if (skb->protocol == __constant_htons(ETH_P_IP)) {
+		struct iphdr *iph;
+		iph = ip_hdr(skb);
+		if (ipv4_is_multicast(iph->daddr))
+			goto outf;
+	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h;
+		ip6h = ipv6_hdr(skb);
+		if (ipv6_addr_is_multicast(&ip6h->daddr))
+			goto outf;
+		skb_orphan(skb);
+	} else {
+		goto outf;
+	}
+
+	ve = veip_pool_ops->veip_lookup(ve, skb);
+	if (IS_ERR(ve))
+		goto outf;
+
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	rcv = ve->_venet_dev;
+	if (!rcv)
+		/* VE going down */
+		goto outf;
+
+	dev_hold(rcv);
+
+	if (!(rcv->flags & IFF_UP))
+		/* Target VE does not want to receive packets */
+		goto outf;
+
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = rcv;
+
+	/*
+	 * If there is not enough headroom for the header, allocate it.
+	 * Remember that traffic can reach a VE from the outside world,
+	 * so we have to clean up the MAC address of such a packet.  The
+	 * same applies to traffic coming from inside a VE: if TUN is used
+	 * and the traffic gets fragmented, we may reach a point where
+	 * there is no L2 header at all and hard_header_len is simply
+	 * ignored (this parameter is only a hint for upper net layers,
+	 * never a guarantee that the header will be provided).  To unify
+	 * how packets look after passing venet, we always produce an L2
+	 * header with a zeroed MAC.
+	 */
+	if (unlikely(skb_headroom(skb) < dev->hard_header_len)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (!skb2)
+			goto outf;
+
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
+
+	skb_reset_mac_header(skb);
+	memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
+
+	nf_reset(skb);
+	length = skb->len;
+
+	if (unlikely(netif_rx(skb) != NET_RX_SUCCESS))
+		goto dropped;
+
+	stats->tx_bytes += length;
+	stats->tx_packets++;
+	if (rcv) {
+		struct net_device_stats *rcv_stats;
+
+		rcv_stats = venet_stats(rcv, smp_processor_id());
+		rcv_stats->rx_bytes += length;
+		rcv_stats->rx_packets++;
+		dev_put(rcv);
+	}
+
+	return 0;
+
+outf:
+	kfree_skb(skb);
+dropped:
+	if (rcv)
+		dev_put(rcv);
+	++stats->tx_dropped;
+	return 0;
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+	int i;
+	struct venet_stats *stats;
+
+	stats = (struct venet_stats *)dev->ml_priv;
+	memset(&stats->stats, 0, sizeof(struct net_device_stats));
+	for_each_possible_cpu(i) {
+		struct net_device_stats *dev_stats;
+
+		dev_stats = venet_stats(dev, i);
+		stats->stats.rx_bytes   += dev_stats->rx_bytes;
+		stats->stats.tx_bytes   += dev_stats->tx_bytes;
+		stats->stats.rx_packets += dev_stats->rx_packets;
+		stats->stats.tx_packets += dev_stats->tx_packets;
+		stats->stats.tx_dropped += dev_stats->tx_dropped;
+	}
+
+	return &stats->stats;
+}
+
+/* Initialize the rest of the venet device. */
+static int venet_init_dev(struct net_device *dev)
+{
+	struct venet_stats *stats;
+
+	stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL);
+	if (stats == NULL)
+		goto fail;
+	stats->real_stats = alloc_percpu(struct net_device_stats);
+	if (stats->real_stats == NULL)
+		goto fail_free;
+	dev->ml_priv = stats;
+
+	/*
+	 *	Fill in the generic fields of the device structure.
+	 */
+	dev->type		= ARPHRD_VOID;
+	dev->hard_header_len 	= ETH_HLEN;
+	dev->mtu		= 1500; /* eth_mtu */
+	dev->tx_queue_len	= 0;
+
+	memset(dev->broadcast, 0xFF, ETH_ALEN);
+
+	/* New-style flags. */
+	dev->flags		= IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
+	return 0;
+
+fail_free:
+	kfree(stats);
+fail:
+	return -ENOMEM;
+}
+
+static netdev_features_t common_features;
+static const struct net_device_ops venet_netdev_ops;
+
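+/*
+ * ndo_set_features for venet: remember the new feature set in
+ * common_features (inherited by venet devices created later, see
+ * venet_setup()) and propagate it to every existing venet device in
+ * all network namespaces.
+ */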
+static int venet_set_features(struct net_device *dev,
+			      netdev_features_t features)
+{
+	struct net *net;
+
+	common_features = features;
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			if (dev->netdev_ops == &venet_netdev_ops)
+				dev->features = features;
+		}
+	}
+	return 0;
+}
+#define DRV_NAME	"vz-venet"
+#define DRV_VERSION	"1.0"
+
+/*
+ * ethtool interface
+ */
+
+static struct {
+	const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+	{ "ifindex" },
+};
+
+static int venet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	cmd->supported		= 0;
+	cmd->advertising	= 0;
+	ethtool_cmd_speed_set(cmd, SPEED_10000);
+	cmd->duplex		= DUPLEX_FULL;
+	cmd->port		= PORT_TP;
+	cmd->phy_address	= 0;
+	cmd->transceiver	= XCVR_INTERNAL;
+	cmd->autoneg		= AUTONEG_DISABLE;
+	cmd->maxtxpkt		= 0;
+	cmd->maxrxpkt		= 0;
+	return 0;
+}
+
+static void venet_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+}
+
+static void venet_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	switch(stringset) {
+	case ETH_SS_STATS:
+		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
+		break;
+	}
+}
+
+static int venet_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(ethtool_stats_keys);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void venet_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, u64 *data)
+{
+	/*
+	 * TODO: copy proper statistics here.
+	 */
+	data[0] = dev->ifindex;
+}
+
+static const struct ethtool_ops venet_ethtool_ops = {
+	.get_settings		= venet_get_settings,
+	.get_drvinfo		= venet_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_strings		= venet_get_strings,
+	.get_sset_count		= venet_get_sset_count,
+	.get_ethtool_stats	= venet_get_ethtool_stats,
+};
+
+static const struct net_device_ops venet_netdev_ops = {
+	.ndo_start_xmit = venet_xmit,
+	.ndo_get_stats = get_stats,
+	.ndo_open = venet_open,
+	.ndo_stop = venet_close,
+	.ndo_init = venet_init_dev,
+	.ndo_set_features = venet_set_features,
+};
+
+static void venet_setup(struct net_device *dev)
+{
+	/*
+	 * No other features are set, because:
+	 *  - checksumming is required, and nobody else will do our job
+	 */
+	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
+	       NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
+
+	dev->netdev_ops = &venet_netdev_ops;
+	dev->destructor = venet_destructor;
+
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
+
+	dev->features |= common_features;
+
+	SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
+}
+
+static void veip_shutdown(struct ve_struct *ve)
+{
+	spin_lock(&veip_lock);
+	if (ve->veip) {
+		__venet_ext_clean(ve);
+		__veip_stop(ve);
+	}
+	spin_unlock(&veip_lock);
+}
+
+static void venet_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct ve_struct *env = dev->nd_net->owner_ve;
+
+	/* We check ve_netns to avoid races with the veip SHUTDOWN hook,
+	 * which is called from ve_exit_ns().
+	 * Also, the veip SHUTDOWN hook skips veip destruction if the
+	 * container has VE_FEATURE_NFS enabled, so in that case we have to
+	 * destroy the veip here.
+	 */
+	if (env->ve_netns)
+		veip_shutdown(env);
+
+	env->_venet_dev = NULL;
+	unregister_netdevice_queue(dev, head);
+}
+
+static int venet_newlink(struct net *src_net, struct net_device *dev,
+		  struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ve_struct *env = src_net->owner_ve;
+	int err;
+
+	if (!env->ve_netns)
+		return -EBUSY;
+
+	if (src_net != env->ve_netns)
+		/* Don't create venet-s in sub net namespaces */
+		return -ENOSYS;
+
+	if (env->veip)
+		return -EEXIST;
+
+	err = veip_start(env);
+	if (err)
+		return err;
+
+	dev->features |= NETIF_F_NETNS_LOCAL;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto err_stop;
+
+	env->_venet_dev = dev;
+	return 0;
+
+err_stop:
+	veip_stop(env);
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve)
+{
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		goto unlock;
+	list_for_each_entry (entry, &veip->ip_lh, ve_list) {
+		char addr[40];
+
+		if (entry->active_env == NULL)
+			continue;
+
+		veaddr_print(addr, sizeof(addr), &entry->addr);
+		if (entry->addr.family == AF_INET)
+			seq_printf(m, " %15s", addr);
+		else
+			seq_printf(m, " %39s", addr);
+	}
+unlock:
+	spin_unlock(&veip_lock);
+}
+
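+/*
+ * /proc/vz/veip iterator: walks ip_entry_hash_table under RCU,
+ * keeping the index of the next hash bucket to scan in m->private.
+ */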
+static void *veip_seq_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t l;
+	struct ip_entry_struct *s;
+	int i;
+
+	l = *pos;
+	rcu_read_lock();
+	if (l == 0) {
+		m->private = (void *)0;
+		return SEQ_START_TOKEN;
+	}
+
+	for (i = 0; i < VEIP_HASH_SZ; i++) {
+		hlist_for_each_entry_rcu(s, ip_entry_hash_table + i, ip_hash) {
+			if (--l == 0) {
+				m->private = (void *)(long)(i + 1);
+				return &s->ip_hash;
+			}
+		}
+	}
+	return NULL;
+}
+
+static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hlist_node *p;
+	int i;
+
+	if (v == SEQ_START_TOKEN)
+		goto find;
+
+	p = rcu_dereference(((struct hlist_node *)v)->next);
+	if (p != NULL)
+		goto found;
+
+find:
+	for (i = (int)(long)m->private; i < VEIP_HASH_SZ; i++) {
+		p = rcu_dereference(ip_entry_hash_table[i].first);
+		if (p != NULL) {
+			m->private = (void *)(long)(i + 1);
+found:
+			(*pos)++;
+			return p;
+		}
+	}
+
+	return NULL;
+}
+
+static void veip_seq_stop(struct seq_file *m, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int veip_seq_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *p;
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+	char s[40];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "Version: 2.5\n");
+		return 0;
+	}
+
+	p = (struct hlist_node *)v;
+	entry = hlist_entry(p, struct ip_entry_struct, ip_hash);
+	veaddr_print(s, sizeof(s), &entry->addr);
+	veip = ACCESS_ONCE(entry->tgt_veip);
+	seq_printf(m, "%39s %10u\n", s, veip == NULL ? 0 : veip->veid);
+	return 0;
+}
+
+static struct seq_operations veip_seq_op = {
+	.start	= veip_seq_start,
+	.next	= veip_seq_next,
+	.stop	= veip_seq_stop,
+	.show	= veip_seq_show,
+};
+
+static int veip_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veip_seq_op);
+}
+
+static struct file_operations proc_veip_operations = {
+	.open		= veip_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
+static int do_ve_ip_map(struct ve_struct *ve, int op, struct ve_addr_struct *addr)
+{
+	int err;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	down_read(&ve->op_sem);
+	switch (op)
+	{
+		case VE_IP_ADD:
+			/*
+			 * FIXME We should check that the VE is either
+			 * running or in restore state instead of
+			 * allowing addresses to be added arbitrarily.
+			 */
+			err = veip_entry_add(ve, addr);
+			break;
+
+		case VE_IP_DEL:
+			err = veip_entry_del(ve, addr);
+			break;
+		case VE_IP_EXT_ADD:
+			err = venet_ext_add(ve, addr);
+			break;
+		case VE_IP_EXT_DEL:
+			err = venet_ext_del(ve, addr);
+			break;
+		default:
+			err = -EINVAL;
+	}
+	up_read(&ve->op_sem);
+	return err;
+}
+
+static int real_ve_ip_map(envid_t veid, int op,
+			  struct sockaddr __user *uaddr, int addrlen)
+{
+	int err;
+	struct ve_addr_struct addr;
+	struct ve_struct *ve;
+
+	err = sockaddr_to_veaddr(uaddr, addrlen, &addr);
+	if (err < 0)
+		return err;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -ESRCH;
+
+	err = do_ve_ip_map(ve, op, &addr);
+	put_ve(ve);
+	return err;
+}
+
+int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch(cmd) {
+	case VENETCTL_VE_IP_MAP: {
+		struct vzctl_ve_ip_map s;
+		err = -EFAULT;
+		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+			break;
+		err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
+		break;
+	}
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch(cmd) {
+	case VENETCTL_COMPAT_VE_IP_MAP: {
+		struct compat_vzctl_ve_ip_map cs;
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr),
+				cs.addrlen);
+		break;
+	}
+	default:
+		err = venet_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo venetcalls = {
+	.type		= VENETCTLTYPE,
+	.ioctl		= venet_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_venet_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+static int ve_ip_access_write(struct cgroup *cgrp, struct cftype *cft,
+			      const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cgrp);
+	struct ve_addr_struct addr;
+	int ret;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	memset(&addr, 0, sizeof(addr));
+	if (strncmp(cft->name, "ip6", 3)) {
+		if ((ret = in4_to_veaddr(buffer, &addr)) != 0)
+			return ret;
+	} else {
+		if ((ret = in6_to_veaddr(buffer, &addr)) != 0)
+			return ret;
+	}
+
+	return do_ve_ip_map(ve, cft->private, &addr);
+}
+
+static int ve_ip_access_seq_read(struct cgroup *cgrp, struct cftype *cft,
+				 struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cgrp);
+	struct ip_entry_struct *s;
+	char buf[40];
+	int family = strncmp(cft->name, "ip6", 3) ? AF_INET : AF_INET6;
+	int i;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	rcu_read_lock();
+	for (i = 0; i < VEIP_HASH_SZ; i++) {
+		hlist_for_each_entry_rcu(s, ip_entry_hash_table + i,
+					 ip_hash) {
+			if (s->addr.family == family &&
+			    s->active_env && s->active_env->veid == ve->veid) {
+				veaddr_print(buf, sizeof(buf), &s->addr);
+				seq_printf(m, "%s\n", buf);
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static struct cftype venet_cftypes[] = {
+	{
+		.name = "ip_allow",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_ADD,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip_deny",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_DEL,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip_list",
+		.read_seq_string = ve_ip_access_seq_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_allow",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_ADD,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_deny",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_DEL,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_list",
+		.read_seq_string = ve_ip_access_seq_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }
+};
+
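+/*
+ * Netlink (changelink) interface for managing venet addresses from the
+ * host: a VENET_INFO_CMD attribute carrying a venetaddrmsg is mapped
+ * onto do_ve_ip_map(VE_IP_ADD/VE_IP_DEL).
+ */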
+static int venet_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[])
+{
+	struct venetaddrmsg *vamp;
+	struct nlattr *nla_addr;
+	struct ve_struct *ve;
+	struct ve_addr_struct addr;
+	int cmd;
+
+	ve = dev_net(dev)->owner_ve;
+	if (ve_is_super(ve))
+		return -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	if (!data[VENET_INFO_CMD])
+		return -EINVAL;
+
+	nla_addr = data[VENET_INFO_CMD];
+	vamp = nla_data(nla_addr);
+
+	memset(&addr, 0, sizeof(addr));
+	addr.family = vamp->va_family;
+
+	if (addr.family == AF_INET)
+		memcpy(&addr.key[3], &vamp->va_addr[0], 4);
+	else if (addr.family == AF_INET6)
+		memcpy(&addr.key[0], &vamp->va_addr[0], sizeof(addr.key));
+	else
+		return -EINVAL;
+
+	if (vamp->va_cmd == VENET_IP_ADD)
+		cmd = VE_IP_ADD;
+	else if (vamp->va_cmd == VENET_IP_DEL)
+		cmd = VE_IP_DEL;
+	else
+		return -EINVAL;
+
+	return do_ve_ip_map(ve, cmd, &addr);
+}
+
+static const struct nla_policy venet_policy[VENET_INFO_MAX + 1] = {
+	[VENET_INFO_CMD]	= { .len = sizeof(struct venetaddrmsg) },
+};
+
+static struct rtnl_link_ops venet_link_ops = {
+	.kind		= "venet",
+	.priv_size	= sizeof(struct veip_struct),
+	.newlink	= venet_newlink,
+	.dellink	= venet_dellink,
+	.setup		= venet_setup,
+	.changelink	= venet_changelink,
+	.policy		= venet_policy,
+	.maxtype	= VENET_INFO_MAX,
+};
+
+static void veip_shutdown_fini(void *data)
+{
+	veip_shutdown(data);
+}
+
+static struct ve_hook veip_shutdown_hook = {
+	.fini		= veip_shutdown_fini,
+	.priority	= HOOK_PRIO_FINISHING,
+	.owner		= THIS_MODULE,
+};
+
+__init int venet_init(void)
+{
+	struct proc_dir_entry *de;
+	int i, err;
+
+	if (get_ve0()->_venet_dev != NULL)
+		return -EEXIST;
+
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		INIT_HLIST_HEAD(ip_entry_hash_table + i);
+
+	de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_veip_operations);
+	if (!de)
+		return -EINVAL;
+
+	err = cgroup_add_cftypes(&ve_subsys, venet_cftypes);
+	if (err)
+		goto err_proc;
+
+	vzioctl_register(&venetcalls);
+	vzmon_register_veaddr_print_cb(veaddr_seq_print);
+	ve_hook_register(VE_SHUTDOWN_CHAIN, &veip_shutdown_hook);
+
+	return rtnl_link_register(&venet_link_ops);
+
+err_proc:
+	remove_proc_entry("veip", proc_vz_dir);
+	return err;
+}
+
+__exit void venet_exit(void)
+{
+	cgroup_rm_cftypes(&ve_subsys, venet_cftypes);
+	vzmon_unregister_veaddr_print_cb(veaddr_seq_print);
+	vzioctl_unregister(&venetcalls);
+	remove_proc_entry("veip", proc_vz_dir);
+	veip_cleanup();
+
+	/* Ensure there are no outstanding rcu callbacks */
+	rcu_barrier();
+
+	BUG_ON(!list_empty(&veip_lh));
+	rtnl_link_unregister(&venet_link_ops);
+}
+
+module_init(venet_init);
+module_exit(venet_exit);
+
+MODULE_AUTHOR("Parallels <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("vznet");
+
+EXPORT_SYMBOL(veip_lock);
+EXPORT_SYMBOL(ip_entry_hash);
+EXPORT_SYMBOL(ip_entry_unhash);
+EXPORT_SYMBOL(sockaddr_to_veaddr);
+EXPORT_SYMBOL(veaddr_print);
+EXPORT_SYMBOL(venet_entry_lookup);
+EXPORT_SYMBOL(veip_findcreate);
+EXPORT_SYMBOL(veip_put);
+EXPORT_SYMBOL(venet_ext_lookup);
+EXPORT_SYMBOL(veip_lh);
+EXPORT_SYMBOL(ip_entry_hash_table);
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,6 +19,7 @@
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include "../../net/bridge/br_private.h"
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
@@ -106,6 +107,29 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_ethtool_stats	= veth_get_ethtool_stats,
 };
 
+static int vzethdev_filter(struct sk_buff *skb, struct net_device *dev, struct net_device *rcv)
+{
+	/*
+	 * MAC filtering for "fixed address" veth pairs: frames going from
+	 * the host to a VE must be addressed to the receiving device's MAC
+	 * (or be multicast), and frames going from a VE to the host must
+	 * carry the sending device's MAC as source.  Return 1 to accept
+	 * the frame, 0 to drop it.
+	 */
+	if (ve_is_super(dev_net(dev)->owner_ve) &&
+	    dev->features & NETIF_F_FIXED_ADDR) {
+		/* from VE0 to VEX */
+		if (ve_is_super(dev_net(rcv)->owner_ve))
+			return 1;
+		if (is_multicast_ether_addr(
+					((struct ethhdr *)skb->data)->h_dest))
+			return 1;
+		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, rcv->dev_addr))
+				return 0;
+	} else if (!ve_is_super(dev_net(dev)->owner_ve) &&
+		   dev->features & NETIF_F_FIXED_ADDR) {
+		/* from VEX to VE0 */
+		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source, dev->dev_addr))
+				return 0;
+	}
+
+	return 1;
+}
+
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -119,6 +143,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
+
+	if (dev->features & NETIF_F_VENET && !vzethdev_filter(skb, dev, rcv)) {
+		kfree_skb(skb);
+		goto drop;
+	}
+
 	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
@@ -276,6 +306,59 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+static int veth_mac_addr(struct net_device *dev, void *p)
+{
+	if (dev->features & NETIF_F_VENET &&
+	    dev->features & NETIF_F_FIXED_ADDR)
+		return -EPERM;
+	return eth_mac_addr(dev, p);
+}
+
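+/*
+ * SIOCSVENET marks both ends of the pair with NETIF_F_VENET, enabling
+ * the MAC filtering in vzethdev_filter(); SIOCSFIXEDADDR sets or
+ * clears NETIF_F_FIXED_ADDR, which also makes the MAC immutable via
+ * veth_mac_addr().
+ */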
+static int vzethdev_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case SIOCSVENET:
+	{
+		struct veth_priv *priv = netdev_priv(dev);
+		struct net_device *rcv;
+
+		rcu_read_lock();
+		rcv = rcu_dereference(priv->peer);
+		if (rcv)
+			rcv->features |= NETIF_F_VENET;
+		dev->features |= NETIF_F_VENET;
+		rcu_read_unlock();
+
+		return 0;
+	}
+	case SIOCSFIXEDADDR:
+		if (ifr->ifr_ifru.ifru_flags)
+			dev->features |= NETIF_F_FIXED_ADDR;
+		else
+			dev->features &= ~NETIF_F_FIXED_ADDR;
+		return 0;
+	}
+	return -ENOTTY;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void veth_poll_controller(struct net_device *dev)
+{
+	/* veth only receives frames when its peer sends one.
+	 * Since it's a synchronous operation, we are guaranteed
+	 * never to have pending data when we poll for it, so
+	 * there is nothing to do here.
+	 *
+	 * We need this though so netpoll recognizes us as an interface that
+	 * supports polling, which enables bridge devices in virt setups to
+	 * still use netconsole.
+	 */
+}
+#endif	/* CONFIG_NET_POLL_CONTROLLER */
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -283,10 +366,14 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_start_xmit      = veth_xmit,
 	.ndo_change_mtu      = veth_change_mtu,
 	.ndo_get_stats64     = veth_get_stats64,
-	.ndo_set_mac_address = eth_mac_addr,
+	.ndo_set_mac_address = veth_mac_addr,
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_size		= sizeof(struct net_device_ops),
 	.extended.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_do_ioctl        = vzethdev_net_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= veth_poll_controller,
+#endif
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -307,7 +394,7 @@ static void veth_setup(struct net_device *dev)
 	dev->netdev_ops = &veth_netdev_ops;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
-	dev->features |= VETH_FEATURES;
+	dev->features |= VETH_FEATURES | NETIF_F_VIRTUAL;
 	dev->vlan_features = dev->features &
 			     ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX);
 	dev->destructor = veth_dev_free;
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2526,6 +2526,7 @@ static void vxlan_setup(struct net_device *dev)
 
 	dev->vlan_features = dev->features;
 	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+	dev->features |= NETIF_F_VIRTUAL;
 	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
 	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -49,7 +49,6 @@ static unsigned int be_iopoll_budget = 10;
 static unsigned int be_max_phys_size = 64;
 static unsigned int enable_msix = 1;
 
-MODULE_DEVICE_TABLE(pci, beiscsi_pci_id_table);
 MODULE_DESCRIPTION(DRV_DESC " " BUILD_STR);
 MODULE_VERSION(BUILD_STR);
 MODULE_AUTHOR("Emulex Corporation");
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -40,6 +40,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 
 static atomic_t scsi_host_next_hn = ATOMIC_INIT(0);	/* host_no for next new host */
@@ -140,12 +141,13 @@ int scsi_host_set_state(struct Scsi_Host *shost, enum scsi_host_state state)
 	return 0;
 
  illegal:
-	SCSI_LOG_ERROR_RECOVERY(1,
-				shost_printk(KERN_ERR, shost,
-					     "Illegal host state transition"
-					     "%s->%s\n",
-					     scsi_host_state_name(oldstate),
-					     scsi_host_state_name(state)));
+	shost_printk(KERN_ERR, shost,
+		     "Illegal host state transition"
+		     "%s->%s\n",
+		     scsi_host_state_name(oldstate),
+		     scsi_host_state_name(state));
+	dump_stack();
+
 	return -EINVAL;
 }
 EXPORT_SYMBOL(scsi_host_set_state);
@@ -358,6 +360,7 @@ static void scsi_host_dev_release(struct device *dev)
 
 	if (parent)
 		put_device(parent);
+	kfree(SHOST_TO_SDBG(shost));
 	kfree(shost);
 }
 
@@ -388,6 +391,7 @@ static struct device_type scsi_host_type = {
 struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 {
 	struct Scsi_Host *shost;
+	struct scsi_host_dbg *sdbg;
 	gfp_t gfp_mask = GFP_KERNEL;
 
 	if (sht->unchecked_isa_dma && privsize)
@@ -397,6 +401,15 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	if (!shost)
 		return NULL;
 
+	sdbg = kzalloc(sizeof(struct scsi_host_dbg), gfp_mask);
+	if (!sdbg) {
+		kfree(shost);
+		return NULL;
+	}
+
+	SHOST_TO_SDBG(shost) = sdbg;
+	spin_lock_init(&sdbg->sdbg_lock);
+
 	shost->host_lock = &shost->default_lock;
 	spin_lock_init(shost->host_lock);
 	shost->shost_state = SHOST_CREATED;
@@ -514,6 +527,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
  fail_kthread:
 	kthread_stop(shost->ehandler);
  fail_kfree:
+	kfree(SHOST_TO_SDBG(shost));
 	kfree(shost);
 	return NULL;
 }
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -41,6 +41,7 @@
 #include "../scsi_sas_internal.h"
 #include "../scsi_transport_api.h"
 #include "../scsi_priv.h"
+#include "../scsi_dbg.h"
 
 #include <linux/err.h>
 #include <linux/blkdev.h>
@@ -234,6 +235,7 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
 	 * handler done list, this also takes it off the
 	 * error handler pending list.
 	 */
+	scsi_debug_log_cmnd(SAS_EH_FINISH_CMD_CALLS_EH_FINISH, cmd);
 	scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q);
 }
 
@@ -465,6 +467,7 @@ static int sas_queue_reset(struct domain_device *dev, int reset_type, int lun, i
 			set_bit(SAS_DEV_EH_PENDING, &dev->state);
 			set_bit(reset_type, &dev->state);
 			int_to_scsilun(lun, &dev->ssp_dev.reset_lun);
+			scsi_debug_log_shost(SAS_QUEUE_RESET_CALLS_SCHEDULE_EH, ha->core.shost);
 			scsi_schedule_eh(ha->core.shost);
 		}
 		spin_unlock_irq(&ha->lock);
@@ -789,6 +792,7 @@ void sas_scsi_recover_host(struct Scsi_Host *shost)
 	/* check if any new eh work was scheduled during the last run */
 	spin_lock_irq(&ha->lock);
 	if (ha->eh_active == 0) {
+		scsi_debug_log_shost(SAS_SCSI_RECOVER_HOST_ZERO_EH_SCHEDULED, shost);
 		shost->host_eh_scheduled = 0;
 		retry = false;
 	}
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -53,6 +53,8 @@
 #include "qla_target.h"
 #include "tcm_qla2xxx.h"
 
+#define TF_CIT_TMPL(tf) (&(tf)->tf_cit_tmpl)
+
 struct workqueue_struct *tcm_qla2xxx_free_wq;
 struct workqueue_struct *tcm_qla2xxx_cmd_wq;
 
@@ -709,7 +711,7 @@ static int tcm_qla2xxx_queue_status(struct se_cmd *se_cmd)
 	return qlt_xmit_response(cmd, xmit_type, se_cmd->scsi_status);
 }
 
-static int tcm_qla2xxx_queue_tm_rsp(struct se_cmd *se_cmd)
+static void tcm_qla2xxx_queue_tm_rsp(struct se_cmd *se_cmd)
 {
 	struct se_tmr_req *se_tmr = se_cmd->se_tmr_req;
 	struct qla_tgt_mgmt_cmd *mcmd = container_of(se_cmd,
@@ -741,8 +743,20 @@ static int tcm_qla2xxx_queue_tm_rsp(struct se_cmd *se_cmd)
 	 * CTIO response packet.
 	 */
 	qlt_xmit_tm_rsp(mcmd);
+}
 
-	return 0;
+static void tcm_qla2xxx_aborted_task(struct se_cmd *se_cmd)
+{
+	struct qla_tgt_cmd *cmd = container_of(se_cmd,
+				struct qla_tgt_cmd, se_cmd);
+	struct scsi_qla_host *vha = cmd->vha;
+	struct qla_hw_data *ha = vha->hw;
+
+	if (!cmd->sg_mapped)
+		return;
+
+	pci_unmap_sg(ha->pdev, cmd->sg, cmd->sg_cnt, cmd->dma_data_direction);
+	cmd->sg_mapped = 0;
 }
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -1424,7 +1438,7 @@ static int tcm_qla2xxx_check_initiator_node_acl(
 	}
 	se_tpg = &tpg->se_tpg;
 
-	se_sess = transport_init_session();
+	se_sess = transport_init_session(TARGET_PROT_NORMAL);
 	if (IS_ERR(se_sess)) {
 		pr_err("Unable to initialize struct se_session\n");
 		return PTR_ERR(se_sess);
@@ -1761,6 +1775,7 @@ static struct target_core_fabric_ops tcm_qla2xxx_ops = {
 	.queue_data_in			= tcm_qla2xxx_queue_data_in,
 	.queue_status			= tcm_qla2xxx_queue_status,
 	.queue_tm_rsp			= tcm_qla2xxx_queue_tm_rsp,
+	.aborted_task			= tcm_qla2xxx_aborted_task,
 	/*
 	 * Setup function pointers for generic logic in
 	 * target_core_fabric_configfs.c
@@ -1808,6 +1823,7 @@ static struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = {
 	.queue_data_in			= tcm_qla2xxx_queue_data_in,
 	.queue_status			= tcm_qla2xxx_queue_status,
 	.queue_tm_rsp			= tcm_qla2xxx_queue_tm_rsp,
+	.aborted_task			= tcm_qla2xxx_aborted_task,
 	/*
 	 * Setup function pointers for generic logic in
 	 * target_core_fabric_configfs.c
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -68,6 +68,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/scsi.h>
@@ -684,6 +685,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
 	struct scsi_driver *drv;
 	unsigned int good_bytes;
 
+	scsi_debug_log_cmnd(SCSI_FINISH_COMMAND_CALLS_UNBUSY, cmd);
 	scsi_device_unbusy(sdev);
 
 	/*
--- /dev/null
+++ b/drivers/scsi/scsi_dbg.h
@@ -0,0 +1,133 @@
+/*
+ *  drivers/scsi/scsi_dbg.h
+ *
+ *  Copyright (c) 2016 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _SCSI_DBG_H
+#define _SCSI_DBG_H
+
+#include <scsi/scsi_cmnd.h>
+
+/*
+ * Temporary debug code to chase down missed error handler wakeups.
+ */
+
+#define SCSI_HOST_DBG_N_ENTRIES 45 /* fit in one page */
+
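+/*
+ * Points in the SCSI submission and error-handling paths at which an
+ * event is recorded.
+ */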
+enum scsi_dbg_type {
+	SCSI_HOST_QUEUE_READY_INC_HOST_BUSY = 1, /* scsi_host_queue_ready() */
+	SCSI_HOST_QUEUE_READY_DEC_HOST_BUSY,
+	SCSI_KILL_REQUEST_INC_HOST_BUSY,
+	SCSI_QUEUE_RQ_DEC_HOST_BUSY,
+	SCSI_FINISH_COMMAND_CALLS_UNBUSY,
+	SCSI_QUEUE_INSERT_CALLS_UNBUSY,
+	SCSI_EH_SCMD_ADD_INC_HOST_FAILED,
+	ATA_SCSI_CMD_ERROR_HANDLER_CALLS_EH_FINISH,
+	ATA_EH_QC_COMPLETE_CALLS_EH_FINISH,
+	SAS_EH_FINISH_CMD_CALLS_EH_FINISH,
+	SCSI_EH_GET_SENSE_CALLS_EH_FINISH,
+	SCSI_EH_TEST_DEVICES_CALLS_EH_FINISH,
+	SCSI_EH_ABORT_CMDS_CALLS_EH_FINISH,
+	SCSI_EH_STU_CALLS_EH_FINISH,
+	SCSI_EH_BUS_DEVICE_RESET_CALLS_EH_FINISH,
+	SCSI_EH_TARGET_RESET_CALLS_EH_FINISH,
+	SCSI_EH_BUS_RESET_CALLS_EH_FINISH,
+	SCSI_EH_HOST_RESET_CALLS_EH_FINISH,
+	SCSI_EH_OFFLINE_SDEVS_CALLS_EH_FINISH,
+	ATA_STD_END_EH_ZERO_EH_SCHEDULED,
+	SAS_SCSI_RECOVER_HOST_ZERO_EH_SCHEDULED,
+	ATA_STD_SCHED_EH_CALLS_SCHEDULE_EH,
+	SAS_QUEUE_RESET_CALLS_SCHEDULE_EH,
+	SCSI_EH_WAKEUP_EHANDLER,
+	SCSI_SCHEDULE_EH_CALLS_EH_WAKEUP,
+	SCSI_DEVICE_UNBUSY_CALLS_EH_WAKEUP,
+	SCSI_ERROR_HANDLER_SLEEP,
+	SCSI_ERROR_HANDLER_WAKEUP,
+	SCSI_ERROR_HANDLER_CALLS_HANDLER,
+};
+
+struct scsi_host_log_entry {
+	enum scsi_dbg_type sle_type;
+	enum scsi_host_state sle_shost_state;
+
+	int sle_host_failed;
+	int sle_host_busy;
+	int sle_host_blocked;
+	int sle_host_eh_scheduled;
+
+	struct task_struct *sle_task;
+	char sle_comm[TASK_COMM_LEN];
+
+	struct scsi_device *sle_sdev;
+	struct scsi_cmnd   *sle_cmnd;
+	struct request     *sle_req;
+
+	ktime_t sle_ktime;
+	u64     sle_jiffies;
+};
+
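+/*
+ * Per-host ring buffer of the most recent debug events, protected by
+ * sdbg_lock.
+ */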
+struct scsi_host_dbg {
+	spinlock_t		   sdbg_lock;
+	struct scsi_host_log_entry sdbg_entries[SCSI_HOST_DBG_N_ENTRIES];
+	int                        sdbg_next_entry;
+};
+
+#define SHOST_TO_SDBG(shost) (shost)->scsi_mq_reserved3
+
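+/*
+ * Record one event in the host's ring buffer, together with a snapshot
+ * of the host_busy/host_failed/host_eh_scheduled counters and the
+ * current task.
+ */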
+static inline void
+scsi_debug_log(struct Scsi_Host *shost, enum scsi_dbg_type type,
+	       struct scsi_device *sdev, struct scsi_cmnd *cmnd,
+	       struct request *req)
+{
+	struct scsi_host_dbg *s = SHOST_TO_SDBG(shost);
+	struct scsi_host_log_entry *e;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&s->sdbg_lock, irq_flags);
+	e = &s->sdbg_entries[s->sdbg_next_entry];
+
+	e->sle_type = type;
+	e->sle_sdev = sdev;
+	e->sle_cmnd = cmnd;
+	e->sle_req  = req;
+
+	e->sle_shost_state       = shost->shost_state;
+	e->sle_host_failed       = shost->host_failed;
+	e->sle_host_busy         = atomic_read(&shost->host_busy);
+	e->sle_host_blocked      = atomic_read(&shost->host_blocked);
+	e->sle_host_eh_scheduled = shost->host_eh_scheduled;
+
+	e->sle_task = current;
+	memcpy(e->sle_comm, current->comm, TASK_COMM_LEN);
+
+	e->sle_ktime   = ktime_get();
+	e->sle_jiffies = jiffies;
+
+	s->sdbg_next_entry++;
+	if (s->sdbg_next_entry == SCSI_HOST_DBG_N_ENTRIES)
+		s->sdbg_next_entry = 0;
+	spin_unlock_irqrestore(&s->sdbg_lock, irq_flags);
+}
+
+static inline void
+scsi_debug_log_cmnd(enum scsi_dbg_type type, struct scsi_cmnd *cmnd)
+{
+	scsi_debug_log(cmnd->device->host, type, cmnd->device, cmnd,
+		       cmnd->request);
+}
+
+static inline void
+scsi_debug_log_shost(enum scsi_dbg_type type, struct Scsi_Host *shost)
+{
+	scsi_debug_log(shost, type, NULL, NULL, NULL);
+}
+
+static inline void
+scsi_debug_log_sdev(enum scsi_dbg_type type, struct scsi_device *sdev)
+{
+	scsi_debug_log(sdev->host, type, sdev, NULL, NULL);
+}
+
+#endif /* _SCSI_DBG_H */
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -38,6 +38,7 @@
 #include <scsi/scsi_ioctl.h>
 
 #include "scsi_priv.h"
+#include "scsi_dbg.h"
 #include "scsi_logging.h"
 #include "scsi_transport_api.h"
 
@@ -61,6 +62,7 @@ void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
 	if (atomic_read(&shost->host_busy) == shost->host_failed) {
 		trace_scsi_eh_wakeup(shost);
+		scsi_debug_log_shost(SCSI_EH_WAKEUP_EHANDLER, shost);
 		wake_up_process(shost->ehandler);
 		SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
 			"Waking error handler thread\n"));
@@ -82,6 +84,7 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
 		shost->host_eh_scheduled++;
+		scsi_debug_log_shost(SCSI_SCHEDULE_EH_CALLS_EH_WAKEUP, shost);
 		scsi_eh_wakeup(shost);
 	}
 
@@ -247,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
 		eh_flag &= ~SCSI_EH_CANCEL_CMD;
 	scmd->eh_eflags |= eh_flag;
 	list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
+	scsi_debug_log_cmnd(SCSI_EH_SCMD_ADD_INC_HOST_FAILED, scmd);
 	shost->host_failed++;
 	scsi_eh_wakeup(shost);
  out_unlock:
@@ -1215,6 +1219,7 @@ int scsi_eh_get_sense(struct list_head *work_q,
 		else if (rtn != NEEDS_RETRY)
 			continue;
 
+		scsi_debug_log_cmnd(SCSI_EH_GET_SENSE_CALLS_EH_FINISH, scmd);
 		scsi_eh_finish_cmd(scmd, done_q);
 	}
 
@@ -1299,8 +1304,10 @@ static int scsi_eh_test_devices(struct list_head *cmd_list,
 			if (scmd->device == sdev) {
 				if (finish_cmds &&
 				    (try_stu ||
-				     scsi_eh_action(scmd, SUCCESS) == SUCCESS))
+				     scsi_eh_action(scmd, SUCCESS) == SUCCESS)) {
+					scsi_debug_log_cmnd(SCSI_EH_TEST_DEVICES_CALLS_EH_FINISH, scmd);
 					scsi_eh_finish_cmd(scmd, done_q);
+				}
 				else
 					list_move_tail(&scmd->eh_entry, work_q);
 			}
@@ -1354,9 +1361,10 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 			return list_empty(work_q);
 		}
 		scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
-		if (rtn == FAST_IO_FAIL)
+		if (rtn == FAST_IO_FAIL) {
+			scsi_debug_log_cmnd(SCSI_EH_ABORT_CMDS_CALLS_EH_FINISH, scmd);
 			scsi_eh_finish_cmd(scmd, done_q);
-		else
+		} else
 			list_move_tail(&scmd->eh_entry, &check_list);
 	}
 
@@ -1434,8 +1442,10 @@ static int scsi_eh_stu(struct Scsi_Host *shost,
 				list_for_each_entry_safe(scmd, next,
 							  work_q, eh_entry) {
 					if (scmd->device == sdev &&
-					    scsi_eh_action(scmd, SUCCESS) == SUCCESS)
+					    scsi_eh_action(scmd, SUCCESS) == SUCCESS) {
+						scsi_debug_log_cmnd(SCSI_EH_STU_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd, done_q);
+					}
 				}
 			}
 		} else {
@@ -1499,9 +1509,11 @@ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
 				list_for_each_entry_safe(scmd, next,
 							 work_q, eh_entry) {
 					if (scmd->device == sdev &&
-					    scsi_eh_action(scmd, rtn) != FAILED)
+					    scsi_eh_action(scmd, rtn) != FAILED) {
+						scsi_debug_log_cmnd(SCSI_EH_BUS_DEVICE_RESET_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd,
 								   done_q);
+					}
 				}
 			}
 		} else {
@@ -1568,9 +1580,10 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 
 			if (rtn == SUCCESS)
 				list_move_tail(&scmd->eh_entry, &check_list);
-			else if (rtn == FAST_IO_FAIL)
+			else if (rtn == FAST_IO_FAIL) {
+				scsi_debug_log_cmnd(SCSI_EH_TARGET_RESET_CALLS_EH_FINISH, scmd);
 				scsi_eh_finish_cmd(scmd, done_q);
-			else
+			} else
 				/* push back on work queue for further processing */
 				list_move(&scmd->eh_entry, work_q);
 		}
@@ -1633,10 +1646,11 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
 				if (channel == scmd_channel(scmd)) {
-					if (rtn == FAST_IO_FAIL)
+					if (rtn == FAST_IO_FAIL) {
+						scsi_debug_log_cmnd(SCSI_EH_BUS_RESET_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd,
 								   done_q);
-					else
+					} else
 						list_move_tail(&scmd->eh_entry,
 							       &check_list);
 				}
@@ -1679,6 +1693,7 @@ static int scsi_eh_host_reset(struct Scsi_Host *shost,
 			list_splice_init(work_q, &check_list);
 		} else if (rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
+				scsi_debug_log_cmnd(SCSI_EH_HOST_RESET_CALLS_EH_FINISH, scmd);
 					scsi_eh_finish_cmd(scmd, done_q);
 			}
 		} else {
@@ -1710,6 +1725,7 @@ static void scsi_eh_offline_sdevs(struct list_head *work_q,
 			 * FIXME: Handle lost cmds.
 			 */
 		}
+		scsi_debug_log_cmnd(SCSI_EH_OFFLINE_SDEVS_CALLS_EH_FINISH, scmd);
 		scsi_eh_finish_cmd(scmd, done_q);
 	}
 	return;
@@ -2198,7 +2214,9 @@ int scsi_error_handler(void *data)
 				shost_printk(KERN_INFO, shost,
 					     "scsi_eh_%d: sleeping\n",
 					     shost->host_no));
+			scsi_debug_log_shost(SCSI_ERROR_HANDLER_SLEEP, shost);
 			schedule();
+			scsi_debug_log_shost(SCSI_ERROR_HANDLER_WAKEUP, shost);
 			continue;
 		}
 
@@ -2223,6 +2241,7 @@ int scsi_error_handler(void *data)
 			continue;
 		}
 
+		scsi_debug_log_shost(SCSI_ERROR_HANDLER_CALLS_HANDLER, shost);
 		if (shost->transportt->eh_strategy_handler)
 			shost->transportt->eh_strategy_handler(shost);
 		else
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -36,6 +36,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 
 struct kmem_cache *scsi_sdb_cache;
@@ -118,8 +119,10 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
 	 * Decrement the counters, since these commands are no longer
 	 * active on the host/device.
 	 */
-	if (unbusy)
+	if (unbusy) {
+		scsi_debug_log_cmnd(SCSI_QUEUE_INSERT_CALLS_UNBUSY, cmd);
 		scsi_device_unbusy(device);
+	}
 
 	/*
 	 * Requeue this command.  It will go before all other commands
@@ -280,12 +283,13 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 	if (starget->can_queue > 0)
 		atomic_dec(&starget->target_busy);
 
+	spin_lock_irqsave(shost->host_lock, flags);
 	if (unlikely(scsi_host_in_recovery(shost) &&
 		     (shost->host_failed || shost->host_eh_scheduled))) {
-		spin_lock_irqsave(shost->host_lock, flags);
+		scsi_debug_log_shost(SCSI_DEVICE_UNBUSY_CALLS_EH_WAKEUP, shost);
 		scsi_eh_wakeup(shost);
-		spin_unlock_irqrestore(shost->host_lock, flags);
 	}
+	spin_unlock_irqrestore(shost->host_lock, flags);
 
 	atomic_dec(&sdev->device_busy);
 }
@@ -1431,6 +1435,8 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
 	if (scsi_host_in_recovery(shost))
 		return 0;
 
+	scsi_debug_log_sdev(SCSI_HOST_QUEUE_READY_INC_HOST_BUSY, sdev);
+
 	busy = atomic_inc_return(&shost->host_busy) - 1;
 	if (atomic_read(&shost->host_blocked) > 0) {
 		if (busy)
@@ -1468,7 +1474,15 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
 		list_add_tail(&sdev->starved_entry, &shost->starved_list);
 	spin_unlock_irq(shost->host_lock);
 out_dec:
+	scsi_debug_log_sdev(SCSI_HOST_QUEUE_READY_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
+
+	spin_lock_irq(shost->host_lock);
+	if (unlikely(scsi_host_in_recovery(shost) &&
+		     (shost->host_failed || shost->host_eh_scheduled)))
+		scsi_eh_wakeup(shost);
+	spin_unlock_irq(shost->host_lock);
+
 	return 0;
 }
 
@@ -1527,6 +1541,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	cmd->result = DID_NO_CONNECT << 16;
 	atomic_inc(&cmd->device->iorequest_cnt);
 
+	scsi_debug_log_cmnd(SCSI_KILL_REQUEST_INC_HOST_BUSY, cmd);
 	/*
 	 * SCSI request completion path will do scsi_device_unbusy(),
 	 * bump busy counts.  To bump the counters, we need to dance
@@ -1856,7 +1871,15 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 
 out_dec_host_busy:
+	scsi_debug_log_sdev(SCSI_QUEUE_RQ_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
+
+	spin_lock_irq(shost->host_lock);
+	if (unlikely(scsi_host_in_recovery(shost) &&
+		     (shost->host_failed || shost->host_eh_scheduled)))
+		scsi_eh_wakeup(shost);
+	spin_unlock_irq(shost->host_lock);
+
 out_dec_target_busy:
 	if (scsi_target(sdev)->can_queue > 0)
 		atomic_dec(&scsi_target(sdev)->target_busy);
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -3693,7 +3693,7 @@ iscsi_if_rx(struct sk_buff *skb)
 		uint32_t group;
 
 		nlh = nlmsg_hdr(skb);
-		if (nlh->nlmsg_len < sizeof(*nlh) ||
+		if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) ||
 		    skb->len < nlh->nlmsg_len) {
 			break;
 		}
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -62,6 +62,13 @@ struct virtio_scsi_vq {
 	struct virtqueue *vq;
 };
 
+#define __check_ret(val) do {				\
+		if (val == FAILED) {			\
+			printk("virtscsi_failure");	\
+			dump_stack();			\
+		}					\
+	} while(0)
+
 /*
  * Per-target queue state.
  *
@@ -490,6 +497,7 @@ static int virtscsi_add_cmd(struct virtqueue *vq,
 	return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_ATOMIC);
 }
 
+
 static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
 			     struct virtio_scsi_cmd *cmd,
 			     size_t req_size, size_t resp_size)
@@ -634,6 +642,7 @@ static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 	virtscsi_poll_requests(vscsi);
 
 out:
+	__check_ret(ret);
 	mempool_free(cmd, virtscsi_cmd_pool);
 	return ret;
 }
@@ -645,8 +654,10 @@ static int virtscsi_device_reset(struct scsi_cmnd *sc)
 
 	sdev_printk(KERN_INFO, sc->device, "device reset\n");
 	cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-	if (!cmd)
+	if (!cmd) {
+		__check_ret(FAILED);
 		return FAILED;
+	}
 
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sc = sc;
@@ -688,11 +699,12 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
 	struct virtio_scsi *vscsi = shost_priv(sc->device->host);
 	struct virtio_scsi_cmd *cmd;
 
-	scmd_printk(KERN_INFO, sc, "abort\n");
+	scmd_printk(KERN_INFO, sc, "%s abort\n", __FUNCTION__);
 	cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-	if (!cmd)
+	if (!cmd) {
+		__check_ret(FAILED);
 		return FAILED;
-
+	}
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sc = sc;
 	cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -339,29 +339,28 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 /*
- * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
+ * ashmem_shrink - our cache shrinker, called from mm/vmscan.c
  *
- * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
- * many objects (pages) we have in total.
+ * 'nr_to_scan' is the number of objects to scan for freeing.
  *
  * 'gfp_mask' is the mask of the allocation that got us into this mess.
  *
- * Return value is the number of objects (pages) remaining, or -1 if we cannot
+ * Return value is the number of objects freed or SHRINK_STOP if we cannot
  * proceed without risk of deadlock (due to gfp_mask).
  *
  * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
  * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
  * pages freed.
  */
-static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
+static unsigned long
+ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ashmem_range *range, *next;
+	unsigned long freed = 0;
 
 	/* We might recurse into filesystem code, so bail out if necessary */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
-	if (!sc->nr_to_scan)
-		return lru_count;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
 
 	mutex_lock(&ashmem_mutex);
 	list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
@@ -374,17 +373,32 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
 		range->purged = ASHMEM_WAS_PURGED;
 		lru_del(range);
 
-		sc->nr_to_scan -= range_size(range);
-		if (sc->nr_to_scan <= 0)
+		freed += range_size(range);
+		if (--sc->nr_to_scan <= 0)
 			break;
 	}
 	mutex_unlock(&ashmem_mutex);
+	return freed;
+}
 
+static unsigned long
+ashmem_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	/*
+	 * Note that lru_count is a count of pages on the LRU, not a count
+	 * of objects on the list.  This means the scan function needs to
+	 * return the number of pages freed, not the number of objects
+	 * scanned.
+	 */
 	return lru_count;
 }
 
 static struct shrinker ashmem_shrinker = {
-	.shrink = ashmem_shrink,
+	.count_objects = ashmem_shrink_count,
+	.scan_objects = ashmem_shrink_scan,
+	/*
+	 * XXX (dchinner): I wish people would comment on why they need
+	 * significant changes to the default value here
+	 */
 	.seeks = DEFAULT_SEEKS * 4,
 };
 
@@ -690,11 +704,10 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (capable(CAP_SYS_ADMIN)) {
 			struct shrink_control sc = {
 				.gfp_mask = GFP_KERNEL,
-				.nr_to_scan = 0,
+				.nr_to_scan = LONG_MAX,
 			};
-			ret = ashmem_shrink(&ashmem_shrinker, &sc);
-			sc.nr_to_scan = ret;
-			ashmem_shrink(&ashmem_shrinker, &sc);
+
+			ashmem_shrink_scan(&ashmem_shrinker, &sc);
 		}
 		break;
 	}
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -66,11 +66,20 @@ static unsigned long lowmem_deathpending_timeout;
 			pr_info(x);			\
 	} while (0)
 
-static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
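+/*
+ * Shrinker 'count' callback: report the total number of anonymous and
+ * file LRU pages as the pool this shrinker works against.
+ */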
+static unsigned long lowmem_count(struct shrinker *s,
+				  struct shrink_control *sc)
+{
+	return global_page_state(NR_ACTIVE_ANON) +
+		global_page_state(NR_ACTIVE_FILE) +
+		global_page_state(NR_INACTIVE_ANON) +
+		global_page_state(NR_INACTIVE_FILE);
+}
+
+static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 {
 	struct task_struct *tsk;
 	struct task_struct *selected = NULL;
-	int rem = 0;
+	unsigned long rem = 0;
 	int tasksize;
 	int i;
 	short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
@@ -92,19 +101,17 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 			break;
 		}
 	}
-	if (sc->nr_to_scan > 0)
-		lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
-				sc->nr_to_scan, sc->gfp_mask, other_free,
-				other_file, min_score_adj);
-	rem = global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE) +
-		global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE);
-	if (sc->nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
-		lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
-			     sc->nr_to_scan, sc->gfp_mask, rem);
-		return rem;
+
+	lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
+			sc->nr_to_scan, sc->gfp_mask, other_free,
+			other_file, min_score_adj);
+
+	if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
+		lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
+			     sc->nr_to_scan, sc->gfp_mask);
+		return 0;
 	}
+
 	selected_oom_score_adj = min_score_adj;
 
 	rcu_read_lock();
@@ -152,18 +159,25 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 			     selected->pid, selected->comm,
 			     selected_oom_score_adj, selected_tasksize);
 		lowmem_deathpending_timeout = jiffies + HZ;
+		/*
+		 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
+		 * infrastructure. There is no real reason why the selected
+		 * task should have access to the memory reserves.
+		 */
+		mark_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
-		set_tsk_thread_flag(selected, TIF_MEMDIE);
-		rem -= selected_tasksize;
+		rem += selected_tasksize;
 	}
-	lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
+
+	lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
 		     sc->nr_to_scan, sc->gfp_mask, rem);
 	rcu_read_unlock();
 	return rem;
 }
 
 static struct shrinker lowmem_shrinker = {
-	.shrink = lowmem_shrink,
+	.scan_objects = lowmem_scan,
+	.count_objects = lowmem_count,
 	.seeks = DEFAULT_SEEKS * 16
 };
 
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -961,7 +961,7 @@ static int zcache_get_swap_cache_page(int type, pgoff_t offset,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 		/* FIXME: is it possible to get here without err==-ENOMEM?
 		 * If not, we can dispense with the do loop, use goto retry */
 	} while (err != -ENOMEM);
@@ -1495,7 +1495,7 @@ int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph)
  * to translate in-kernel semantics to zcache semantics.
  */
 
-static void zcache_cleancache_put_page(int pool_id,
+static int zcache_cleancache_put_page(int pool_id,
 					struct cleancache_filekey key,
 					pgoff_t index, struct page *page)
 {
@@ -1504,11 +1504,12 @@ static void zcache_cleancache_put_page(int pool_id,
 
 	if (!disable_cleancache_ignore_nonactive && !PageWasActive(page)) {
 		inc_zcache_eph_nonactive_puts_ignored();
-		return;
+		return 0;
 	}
 	if (likely(ind == index))
-		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index,
+		return !zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index,
 					page, PAGE_SIZE, false, 1);
+	return 0;
 }
 
 static int zcache_cleancache_get_page(int pool_id,
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -43,5 +43,6 @@ source "drivers/target/loopback/Kconfig"
 source "drivers/target/tcm_fc/Kconfig"
 source "drivers/target/iscsi/Kconfig"
 source "drivers/target/sbp/Kconfig"
+source "drivers/target/mhvtl/Kconfig"
 
 endif
--- a/drivers/target/Makefile
+++ b/drivers/target/Makefile
@@ -29,3 +29,6 @@ obj-$(CONFIG_LOOPBACK_TARGET)	+= loopback/
 obj-$(CONFIG_TCM_FC)		+= tcm_fc/
 obj-$(CONFIG_ISCSI_TARGET)	+= iscsi/
 obj-$(CONFIG_SBP_TARGET)	+= sbp/
+
+# mhvtl module
+obj-$(CONFIG_MHVTL)		+= mhvtl/
--- /dev/null
+++ b/drivers/target/mhvtl/Kconfig
@@ -0,0 +1,8 @@
+config MHVTL
+	tristate "A Virtual Tape module"
+	help
+	  Say Y here to enable the Virtual Tape module.
+	  mhvtl is basically a stripped-down scsi_debug kernel module plus a
+	  character-device 'back end' that passes SCSI commands through to
+	  user-space daemons. It is the user-space daemons' responsibility to
+	  respond to and process the SCSI commands.
--- /dev/null
+++ b/drivers/target/mhvtl/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_MHVTL)	+= mhvtl.o
--- /dev/null
+++ b/drivers/target/mhvtl/fetch27.c
@@ -0,0 +1,141 @@
+/**
+ * vtl_sg_copy_user - Copy data between a user-space linear buffer and an SG list
+ * @sgl:	The SG list
+ * @nents:	Number of SG entries
+ * @buf:	The user-space buffer to copy to or from
+ * @buflen:	The number of bytes to copy
+ * @to_buffer:	Transfer direction (non-zero == from the SG list to the buffer,
+ *		0 == from the buffer to the SG list)
+ *
+ * Returns the number of copied bytes.
+ *
+ * Adapted from sg_copy_buffer() in lib/scatterlist.c.
+ */
+
+static size_t vtl_sg_copy_user(struct scatterlist *sgl, unsigned int nents,
+				void __user *buf, size_t buflen, int to_buffer)
+{
+	unsigned int offset = 0;
+	struct sg_mapping_iter miter;
+	/* Do not use SG_MITER_ATOMIC flag on the sg_miter_start() call */
+	unsigned int sg_flags = 0;
+	unsigned int rem;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,30)
+	if (to_buffer)
+		sg_flags |= SG_MITER_FROM_SG;
+	else
+		sg_flags |= SG_MITER_TO_SG;
+#endif
+
+	sg_miter_start(&miter, sgl, nents, sg_flags);
+
+	while (sg_miter_next(&miter) && offset < buflen) {
+		unsigned int len;
+
+		len = min(miter.length, buflen - offset);
+
+		if (to_buffer)
+			rem = copy_to_user(buf + offset, miter.addr, len);
+		else {
+			rem = copy_from_user(miter.addr, buf + offset, len);
+			flush_kernel_dcache_page(miter.page);
+		}
+		if (rem)
+			printk(KERN_DEBUG "mhvtl: %s(): "
+				"copy_%s_user() failed, rem %ld, buf 0x%llx, "
+				"miter.addr 0x%llx, len %d\n",
+				__func__, (to_buffer) ? "to" : "from",
+				(long)rem,
+				(long long unsigned int)(buf + offset),
+				(long long unsigned int)miter.addr, len);
+
+		offset += len;
+	}
+
+	sg_miter_stop(&miter);
+
+	return offset;
+}
+
+static size_t vtl_copy_from_user(struct scatterlist *sgl, unsigned int nents,
+			char __user *buf, size_t buflen)
+{
+	return vtl_sg_copy_user(sgl, nents, buf, buflen, 0);
+}
+
+static size_t vtl_copy_to_user(struct scatterlist *sgl, unsigned int nents,
+			char __user *buf, size_t buflen)
+{
+	return vtl_sg_copy_user(sgl, nents, buf, buflen, 1);
+}
+
+/*
+ * Copy data from SCSI command buffer to device buffer
+ *  (SCSI command buffer -> user space)
+ *
+ * Returns number of bytes fetched into 'arr' or -1 if error.
+ */
+static int fetch_to_dev_buffer(struct scsi_cmnd *scp, char __user *arr, int len)
+{
+	struct scsi_data_buffer *sdb = scsi_out(scp);
+
+	if (!scsi_bufflen(scp))
+		return 0;
+	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
+		return -1;
+
+	return vtl_copy_to_user(sdb->table.sgl, sdb->table.nents, arr, len);
+}
+
+/*
+ * fill_from_user_buffer - Retrieve data from user space into SCSI
+ * buffer(s).
+ *
+ * Returns 0 if ok else (DID_ERROR << 16). Sets scp->resid.
+ */
+static int fill_from_user_buffer(struct scsi_cmnd *scp, char __user *arr,
+				int arr_len)
+{
+	int act_len;
+	struct scsi_data_buffer *sdb = scsi_in(scp);
+
+	if (!sdb->length)
+		return 0;
+	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
+		return DID_ERROR << 16;
+
+	act_len = vtl_copy_from_user(sdb->table.sgl, sdb->table.nents,
+					arr, arr_len);
+	if (sdb->resid)
+		sdb->resid -= act_len;
+	else
+		sdb->resid = scsi_bufflen(scp) - act_len;
+
+	return 0;
+}
+
+/* Returns 0 if ok else (DID_ERROR << 16). Sets scp->resid . */
+static int fill_from_dev_buffer(struct scsi_cmnd *scp, unsigned char *arr,
+				int arr_len)
+{
+	int act_len;
+	struct scsi_data_buffer *sdb = scsi_in(scp);
+
+	if (!sdb->length)
+		return 0;
+	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
+		return DID_ERROR << 16;
+
+	act_len = sg_copy_from_buffer(sdb->table.sgl, sdb->table.nents,
+					arr, arr_len);
+	if (sdb->resid)
+		sdb->resid -= act_len;
+	else
+		sdb->resid = scsi_bufflen(scp) - act_len;
+
+	return 0;
+}
--- /dev/null
+++ b/drivers/target/mhvtl/mhvtl.c
@@ -0,0 +1,1667 @@
+/*
+ *  drivers/target/mhvtl/mhvtl.c
+ * vvvvvvvvvvvvvvvvvvvvvvv Original vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+ *  Copyright (C) 1992  Eric Youngdale
+ *  Simulate a host adapter with 2 disks attached.  Do a lot of checking
+ *  to make sure that we are not getting blocks mixed up, and PANIC if
+ *  anything out of the ordinary is seen.
+ * ^^^^^^^^^^^^^^^^^^^^^^^ Original ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ *
+ *  For documentation see http://sg.danny.cz/sg/sdebug26.html
+ *
+ *   D. Gilbert (dpg) work for Magneto-Optical device test [20010421]
+ *   dpg: work for devfs large number of disks [20010809]
+ *        forked for lk 2.5 series [20011216, 20020101]
+ *        use vmalloc() more inquiry+mode_sense [20020302]
+ *   Patrick Mansfield <patmans@us.ibm.com> max_luns+scsi_level [20021031]
+ *   Mike Anderson <andmike@us.ibm.com> sysfs work [20021118]
+ *   dpg: change style of boot options to "vtl.num_tgts=2" and
+ *        module options to "modprobe vtl num_tgts=2" [20021221]
+ *
+ *	Mark Harvey 2005-6-1
+ *
+ *	markh794@gmail.com
+ *	  or
+ *	Current work email address: mark.harvey@veritas.com
+ *
+ *	Pinched wholesale from scsi_debug.[ch]
+ *
+ *	Hacked to represent SCSI tape drives & Library.
+ *
+ *	Registered char driver to handle data to user space daemon.
+ *	Idea is for user space daemons (vtltape & vtllibrary) to emulate
+ *	and process the SCSI SSC/SMC device command set.
+ *
+ *	I've used it for testing NetBackup - but there is no reason any
+ *	other backup utility could not use it as well.
+ *
+ * Modification History:
+ *    2010-04-18 hstadler - some source code revision in mhvtl_init,
+ *			    mhvtl_exit, some return code checking
+ *
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <linux/blkdev.h>
+#include <linux/cdev.h>
+
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsicam.h>
+
+#include <linux/stat.h>
+
+#ifndef LINUX_VERSION_CODE
+#include <linux/version.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+#include <linux/smp_lock.h>
+#endif
+
+#ifndef _SCSI_H
+#define _SCSI_H
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi.h>
+
+struct Scsi_Host;
+struct scsi_cmnd;
+struct scsi_device;
+struct scsi_target;
+struct scatterlist;
+
+#endif /* _SCSI_H */
+
+#include "vtl_common.h"
+
+#include <scsi/scsi_driver.h>
+#include <scsi/scsi_ioctl.h>
+
+/* version of scsi_debug I started from
+ #define VTL_VERSION "1.75"
+*/
+#ifndef MHVTL_VERSION
+#define MHVTL_VERSION "0.18.17"
+#endif
+static const char *vtl_version_date = "20150414-0";
+static const char vtl_driver_name[] = "mhvtl";
+
+/* Additional Sense Code (ASC) used */
+#define INVALID_FIELD_IN_CDB 0x24
+
+#define VTL_TAGGED_QUEUING 0 /* 0 | MSG_SIMPLE_TAG | MSG_ORDERED_TAG */
+
+#ifndef SCSI_MAX_SG_CHAIN_SEGMENTS
+	#define SCSI_MAX_SG_CHAIN_SEGMENTS SG_ALL
+#endif
+
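+/* Queued commands are timed out (value in jiffies) if the user-space daemon
+ * has not responded within this period; see q_cmd() and timer_intr_handler().
+ */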
+#define	TIMEOUT_FOR_USER_DAEMON	50000
+
+/* Default values for driver parameters */
+#define DEF_NUM_HOST	1
+#define DEF_NUM_TGTS	0
+#define DEF_MAX_LUNS	32
+#define DEF_OPTS	1		/* Default to verbose logging */
+
+/* bit mask values for vtl_opts */
+#define VTL_OPT_NOISE	3
+
+#ifndef MHVTL_DEBUG
+
+#define MHVTL_DBG(lvl, s...)
+#define MHVTL_DBG_PRT_CDB(lvl, s...)
+
+#else
+
+#define MHVTL_DBG(lvl, format, arg...) {				\
+	if ((vtl_opts & VTL_OPT_NOISE) >= (lvl))			\
+			printk(KERN_DEBUG "%s: %s: " format,		\
+				vtl_driver_name, __func__, ## arg);	\
+}
+
+#define MHVTL_DBG_PRT_CDB(lvl, sn, cdb, len)				\
+	{								\
+		if ((vtl_opts & VTL_OPT_NOISE) >= (lvl)) {		\
+			mhvtl_prt_cdb("CDB", (sn), (cdb), (len));	\
+		}							\
+	}
+
+#endif	/* MHVTL_DEBUG */
+
+/* If REPORT LUNS has luns >= 256 it can choose "flat space" (value 1)
+ * or "peripheral device" addressing (value 0) */
+#define SAM2_LUN_ADDRESS_METHOD 0
+
+/* Major number assigned to vtl driver => 0 means to ask for one */
+static int vtl_major = 0;
+
+#define DEF_MAX_MINOR_NO 1024	/* Max number of minor nos. this driver will handle */
+
+#define VTL_CANQUEUE	255	/* needs to be >= 1 */
+#define VTL_MAX_CMD_LEN 16
+
+static int vtl_add_host = DEF_NUM_HOST;
+static int vtl_max_luns = DEF_MAX_LUNS;
+static int vtl_num_tgts = DEF_NUM_TGTS; /* targets per host */
+static int vtl_opts = DEF_OPTS;
+
+static int vtl_cmnd_count = 0;
+
+static atomic_t serial_number;
+
+struct vtl_lu_info {
+	struct list_head lu_sibling;
+	unsigned char sense_buff[SENSE_BUF_SIZE];	/* weak nexus */
+	unsigned int channel;
+	unsigned int target;
+	unsigned int lun;
+	unsigned int minor;
+	struct vtl_hba_info *vtl_hba;
+	struct scsi_device *sdev;
+
+	char reset;
+
+	struct list_head cmd_list; /* list of outstanding cmds for this lu */
+	spinlock_t cmd_list_lock;
+};
+
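+/* Logical units indexed by char-device minor number */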
+static struct vtl_lu_info *devp[DEF_MAX_MINOR_NO];
+
+struct vtl_hba_info {
+	struct list_head hba_sibling; /* List of adapters */
+	struct list_head lu_list; /* List of lu */
+	struct Scsi_Host *shost;
+	struct device dev;
+};
+
+#define to_vtl_hba(d) \
+	container_of(d, struct vtl_hba_info, dev)
+
+static LIST_HEAD(vtl_hba_list);	/* dll of adapters */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static spinlock_t vtl_hba_list_lock = __SPIN_LOCK_UNLOCKED(vtl_hba_list_lock);
+#else
+static spinlock_t vtl_hba_list_lock = SPIN_LOCK_UNLOCKED;
+#endif
+
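+/* SCSI mid-layer completion callback, saved with each queued command */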
+typedef void (*done_funct_t) (struct scsi_cmnd *);
+
+/* vtl_queued_cmd-> state */
+enum cmd_state {
+	CMD_STATE_FREE = 0,
+	CMD_STATE_QUEUED,
+	CMD_STATE_IN_USE,
+};
+
+struct vtl_queued_cmd {
+	int state;
+	struct timer_list cmnd_timer;
+	done_funct_t done_funct;
+	struct scsi_cmnd *a_cmnd;
+	int scsi_result;
+	struct vtl_header op_header;
+
+	struct list_head queued_sibling;
+};
+
+static int num_aborts = 0;
+static int num_dev_resets = 0;
+static int num_bus_resets = 0;
+static int num_host_resets = 0;
+
+static int vtl_driver_probe(struct device *);
+static int vtl_driver_remove(struct device *);
+static struct bus_type pseudo_lld_bus;
+
+static struct device_driver vtl_driverfs_driver = {
+	.name		= vtl_driver_name,
+	.bus		= &pseudo_lld_bus,
+	.probe		= vtl_driver_probe,
+	.remove		= vtl_driver_remove,
+};
+
+static const int check_condition_result =
+		(DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
+
+/* function declarations */
+static int resp_report_luns(struct scsi_cmnd *SCpnt, struct vtl_lu_info *lu);
+static int fill_from_user_buffer(struct scsi_cmnd *scp, char __user *arr,
+				int arr_len);
+static int fill_from_dev_buffer(struct scsi_cmnd *scp, unsigned char *arr,
+				int arr_len);
+static void timer_intr_handler(unsigned long);
+static struct vtl_lu_info *devInfoReg(struct scsi_device *sdp);
+static void mk_sense_buffer(struct vtl_lu_info *lu, int key, int asc, int asq);
+static void stop_all_queued(void);
+static int do_create_driverfs_files(void);
+static void do_remove_driverfs_files(void);
+
+static int vtl_add_adapter(void);
+static void vtl_remove_adapter(void);
+
+static int vtl_slave_alloc(struct scsi_device *);
+static int vtl_slave_configure(struct scsi_device *);
+static void vtl_slave_destroy(struct scsi_device *);
+#if LINUX_VERSION_CODE != KERNEL_VERSION(2,6,9)
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,19,0) || LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33))
+static int vtl_change_queue_depth(struct scsi_device *sdev, int qdepth);
+#else
+static int vtl_change_queue_depth(struct scsi_device *sdev, int qdepth,
+					int reason);
+#endif
+#endif
+static int vtl_queuecommand_lck(struct scsi_cmnd *,
+					 void (*done) (struct scsi_cmnd *));
+static int vtl_b_ioctl(struct scsi_device *, int, void __user *);
+static long vtl_c_ioctl(struct file *, unsigned int, unsigned long);
+static int vtl_c_ioctl_bkl(struct inode *, struct file *, unsigned int, unsigned long);
+static int vtl_abort(struct scsi_cmnd *);
+static int vtl_bus_reset(struct scsi_cmnd *);
+static int vtl_device_reset(struct scsi_cmnd *);
+static int vtl_host_reset(struct scsi_cmnd *);
+static const char * vtl_info(struct Scsi_Host *);
+static int vtl_open(struct inode *, struct file *);
+static int vtl_release(struct inode *, struct file *);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+static DEF_SCSI_QCMD(vtl_queuecommand)
+#endif
+
+static struct device pseudo_primary;
+
+static struct scsi_host_template vtl_driver_template = {
+	.name =			"VTL",
+	.info =			vtl_info,
+	.slave_alloc =		vtl_slave_alloc,
+	.slave_configure =	vtl_slave_configure,
+	.slave_destroy =	vtl_slave_destroy,
+	.ioctl =		vtl_b_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+	.queuecommand =		vtl_queuecommand,
+#else
+	.queuecommand =		vtl_queuecommand_lck,
+#endif
+#if LINUX_VERSION_CODE != KERNEL_VERSION(2,6,9)
+	.change_queue_depth =	vtl_change_queue_depth,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+	.ordered_tag =		1,
+#endif
+#endif
+	.eh_abort_handler =	vtl_abort,
+	.eh_bus_reset_handler = vtl_bus_reset,
+	.eh_device_reset_handler = vtl_device_reset,
+	.eh_host_reset_handler = vtl_host_reset,
+	.can_queue =		VTL_CANQUEUE,
+	.this_id =		15,
+	.sg_tablesize =		SCSI_MAX_SG_CHAIN_SEGMENTS,
+	.cmd_per_lun =		32,
+	.max_sectors =		4096,
+	.unchecked_isa_dma =	0,
+	.use_clustering =	ENABLE_CLUSTERING,
+	.module =		THIS_MODULE,
+};
+
+static const struct file_operations vtl_fops = {
+	.owner		= THIS_MODULE,
+#if defined(HAVE_UNLOCKED_IOCTL)
+	.unlocked_ioctl	= vtl_c_ioctl,
+#else
+	.ioctl		= vtl_c_ioctl_bkl,
+#endif
+	.open		= vtl_open,
+	.release	= vtl_release,
+};
+
+
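+/* Pull in the kernel-version-specific scatter-gather/user-space copy helpers */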
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+ #include "fetch27.c"
+#elif LINUX_VERSION_CODE == KERNEL_VERSION(2,6,26)
+ #include "fetch26.c"
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+ #include "fetch24.c"
+#else
+ #include "fetch.c"
+#endif
+
+
+#ifdef MHVTL_DEBUG
+static void mhvtl_prt_cdb(char *f, uint64_t sn, uint8_t *s, int l)
+{
+	int i;
+
+	if (sn)
+		printk(KERN_DEBUG "mhvtl: %s (%llu) %d bytes\n",
+				f, (long long unsigned)sn, l);
+	else
+		printk(KERN_DEBUG "mhvtl: %s (%d bytes)\n", f, l);
+
+	for (i = 0; i < l; i += 2)
+		printk(KERN_DEBUG " %02x %02x", s[i], s[i + 1]);
+
+	printk(KERN_DEBUG "\n");
+}
+#endif /* MHVTL_DEBUG */
+
+/**********************************************************************
+ *                misc functions to handle queuing SCSI commands
+ **********************************************************************/
+
+/*
+ * schedule_resp() - handle SCSI commands that are processed from the
+ *                   queuecommand() interface. i.e. No callback to done()
+ *                   outside the queuecommand() function.
+ *
+ *                   Any SCSI command handled directly by the kernel driver
+ *                   will use this.
+ */
+static int schedule_resp(struct scsi_cmnd *SCpnt,
+			 struct vtl_lu_info *lu,
+			 done_funct_t done, int scsi_result)
+{
+	if ((VTL_OPT_NOISE & vtl_opts) && SCpnt) {
+		if (scsi_result) {
+#ifdef MHVTL_DEBUG
+			struct scsi_device *sdp = SCpnt->device;
+#endif
+
+			MHVTL_DBG(1, " <%u %u %u %llu> non-zero result=0x%x\n",
+				sdp->host->host_no,
+				sdp->channel, sdp->id,
+				(unsigned long long int)sdp->lun, scsi_result);
+		}
+	}
+	if (SCpnt && lu) {
+		/* simulate autosense by this driver */
+		if (SAM_STAT_CHECK_CONDITION == (scsi_result & 0xff))
+			memcpy(SCpnt->sense_buffer, lu->sense_buff,
+				(SCSI_SENSE_BUFFERSIZE > SENSE_BUF_SIZE) ?
+				SENSE_BUF_SIZE : SCSI_SENSE_BUFFERSIZE);
+	}
+	if (SCpnt)
+		SCpnt->result = scsi_result;
+	if (done)
+		done(SCpnt);
+	return 0;
+}
+
+/**********************************************************************
+ *                SCSI data handling routines
+ **********************************************************************/
+static int resp_write_to_user(struct scsi_cmnd *SCpnt,
+				void __user *up, int count)
+{
+	int fetched;
+
+	fetched = fetch_to_dev_buffer(SCpnt, up, count);
+
+	if (fetched < count) {
+		MHVTL_DBG(1, " cdb indicated=%d, IO sent=%d bytes\n",
+				count, fetched);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void debug_queued_list(struct vtl_lu_info *lu)
+{
+	unsigned long iflags = 0;
+	struct vtl_queued_cmd *sqcp, *n;
+	int k = 0;
+
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+	list_for_each_entry_safe(sqcp, n, &lu->cmd_list, queued_sibling) {
+		if (sqcp->state) {
+			if (sqcp->a_cmnd) {
+				MHVTL_DBG(2, "%d entry in use "
+					"SCpnt: %p, SCSI result: %d, done: %p, "
+					"Serial No: %ld\n",
+					k, sqcp->a_cmnd, sqcp->scsi_result,
+					sqcp->done_funct,
+					sqcp->a_cmnd->serial_number);
+			} else {
+				MHVTL_DBG(2, "%d entry in use "
+					"SCpnt: %p, SCSI result: %d, done: %p\n",
+					k, sqcp->a_cmnd, sqcp->scsi_result,
+					sqcp->done_funct);
+			}
+		} else
+			MHVTL_DBG(2, "entry free %d\n", k);
+		k++;
+	}
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+	MHVTL_DBG(2, "found %d entr%s\n", k, (k == 1) ? "y" : "ies");
+}
+
+static struct vtl_hba_info *vtl_get_hba_entry(void)
+{
+	struct vtl_hba_info *vtl_hba;
+
+	spin_lock(&vtl_hba_list_lock);
+	if (list_empty(&vtl_hba_list))
+		vtl_hba = NULL;
+	else
+		vtl_hba = list_entry(vtl_hba_list.prev,
+					struct vtl_hba_info, hba_sibling);
+	spin_unlock(&vtl_hba_list_lock);
+	return vtl_hba;
+}
+
+static void dump_queued_list(void)
+{
+	struct vtl_lu_info *lu;
+
+	struct vtl_hba_info *vtl_hba;
+
+	vtl_hba = vtl_get_hba_entry();
+	if (!vtl_hba)
+		return;
+
+	/* Now that the work list is split per lu, we have to check each
+	 * lu to see if we can find the serial number in question
+	 */
+	list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling) {
+		MHVTL_DBG(2, "Channel %d, ID %d, LUN %d\n",
+				lu->channel, lu->target, lu->lun);
+		debug_queued_list(lu);
+	}
+}
+
+/*********************************************************
+ * Generic interface to queue SCSI cmd to userspace daemon
+ *********************************************************/
+/*
+ * q_cmd returns success if we successfully added the SCSI
+ * cmd to the queued_list
+ *
+ * - Set state to indicate that the SCSI cmnd is ready for processing.
+ */
+static int q_cmd(struct scsi_cmnd *scp,
+				done_funct_t done,
+				struct vtl_lu_info *lu)
+{
+	unsigned long iflags;
+	struct vtl_header *vheadp;
+	struct vtl_queued_cmd *sqcp;
+
+	sqcp = kmalloc(sizeof(*sqcp), GFP_ATOMIC);
+	if (!sqcp) {
+		printk(KERN_WARNING "mhvtl: %s kmalloc failed\n", __func__);
+		return 1;
+	}
+
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+	init_timer(&sqcp->cmnd_timer);
+	list_add_tail(&sqcp->queued_sibling, &lu->cmd_list);
+	sqcp->a_cmnd = scp;
+	sqcp->scsi_result = 0;
+	sqcp->done_funct = done;
+	sqcp->cmnd_timer.function = timer_intr_handler;
+	sqcp->cmnd_timer.data = scp->serial_number;
+	sqcp->cmnd_timer.expires = jiffies + TIMEOUT_FOR_USER_DAEMON;
+	add_timer(&sqcp->cmnd_timer);
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+	if (VTL_OPT_NOISE & vtl_opts)
+		dump_queued_list();
+
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+
+	vheadp = &sqcp->op_header;
+	vheadp->serialNo = scp->serial_number;
+	memcpy(vheadp->cdb, scp->cmnd, scp->cmd_len);
+
+	/* Set flag.
+	 * Next ioctl() poll by user-daemon will check this state.
+	 */
+	sqcp->state = CMD_STATE_QUEUED;
+
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+
+	return 0;
+}
+
+/**********************************************************************
+ *                Main interface from SCSI mid level
+ **********************************************************************/
+static int vtl_queuecommand_lck(struct scsi_cmnd *SCpnt, done_funct_t done)
+{
+	unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
+	int errsts = 0;
+	struct vtl_lu_info *lu = NULL;
+
+	if (done == NULL)
+		return 0;	/* assume mid level reprocessing command */
+
+	if (cmd)
+		MHVTL_DBG_PRT_CDB(1, SCpnt->serial_number, cmd, SCpnt->cmd_len);
+
+	if (SCpnt->device->id == vtl_driver_template.this_id) {
+		printk(KERN_INFO "mhvtl: initiator's id used as target!\n");
+		return schedule_resp(SCpnt, NULL, done, DID_NO_CONNECT << 16);
+	}
+
+	if (SCpnt->device->lun >= vtl_max_luns) {
+		printk(KERN_INFO "mhvtl: %s max luns exceeded\n", __func__);
+		return schedule_resp(SCpnt, NULL, done, DID_NO_CONNECT << 16);
+	}
+
+	atomic_inc(&serial_number);
+	/* atomic_read(&serial_number); */
+
+	lu = devInfoReg(SCpnt->device);
+	if (NULL == lu) {
+		printk(KERN_INFO "mhvtl: %s could not find lu\n", __func__);
+		return schedule_resp(SCpnt, NULL, done, DID_NO_CONNECT << 16);
+	}
+
+	switch (*cmd) {
+	case REPORT_LUNS:	/* mandatory, ignore unit attention */
+		errsts = resp_report_luns(SCpnt, lu);
+		break;
+
+	/* All commands down the list are handled by a user-space daemon */
+	default:	/* Pass on to user space daemon to process */
+		errsts = q_cmd(SCpnt, done, lu);
+		if (!errsts)
+			return 0;
+		break;
+	}
+	return schedule_resp(SCpnt, lu, done, errsts);
+}
+
+/* FIXME: I don't know in which kernel version this inline routine was introduced */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+
+/* RedHat 4 appears to define 'scsi_get_tag_type' but doesn't understand
+ * change_queue_depth
+ * Disabling for kernel 2.6.9 (RedHat AS 4)
+ */
+
+#define MSG_SIMPLE_TAG	0x20
+#define MSG_ORDERED_TAG	0x22
+
+/**
+ * scsi_get_tag_type - get the type of tag the device supports
+ * @sdev:	the scsi device
+ *
+ * Notes:
+ *	If the drive only supports simple tags, returns MSG_SIMPLE_TAG
+ *	if it supports all tag types, returns MSG_ORDERED_TAG.
+ */
+static inline int scsi_get_tag_type(struct scsi_device *sdev)
+{
+	if (!sdev->tagged_supported)
+		return 0;
+	if (sdev->ordered_tags)
+		return MSG_ORDERED_TAG;
+	if (sdev->simple_tags)
+		return MSG_SIMPLE_TAG;
+	return 0;
+}
+
+#endif
+
+/* RedHat 4 appears to define 'scsi_get_tag_type' but doesn't understand
+ * change_queue_depth
+ * Disabling for kernel 2.6.9 (RedHat AS 4)
+ */
+#if LINUX_VERSION_CODE != KERNEL_VERSION(2,6,9)
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,19,0) || LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33))
+static int vtl_change_queue_depth(struct scsi_device *sdev, int qdepth)
+#else
+static int vtl_change_queue_depth(struct scsi_device *sdev, int qdepth,
+					int reason)
+#endif
+{
+	printk(KERN_INFO "mhvtl %s(%d)\n", __func__, qdepth);
+
+	if (qdepth < 1)
+		qdepth = 1;
+	else if (qdepth > sdev->host->cmd_per_lun)
+		qdepth = sdev->host->cmd_per_lun;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+	scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth);
+#else
+	scsi_change_queue_depth(sdev, qdepth);
+#endif
+	return sdev->queue_depth;
+}
+#endif
+
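+/* Find the queued command matching 'serialNo' on this lu, or NULL if none */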
+static struct vtl_queued_cmd *lookup_sqcp(struct vtl_lu_info *lu,
+						unsigned long serialNo)
+{
+	unsigned long iflags;
+	struct vtl_queued_cmd *sqcp;
+
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+	list_for_each_entry(sqcp, &lu->cmd_list, queued_sibling) {
+		if (sqcp->state && (sqcp->a_cmnd->serial_number == serialNo)) {
+			spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+			return sqcp;
+		}
+	}
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+	return NULL;
+}
+
+/*
+ * Block device ioctl
+ */
+static int vtl_b_ioctl(struct scsi_device *sdp, int cmd, void __user *arg)
+{
+	MHVTL_DBG(3, "ioctl: cmd=0x%x\n", cmd);
+
+	return -ENOTTY;
+}
+
+#define MHVTL_RLUN_ARR_SZ 128
+
+static int resp_report_luns(struct scsi_cmnd *scp, struct vtl_lu_info *lu)
+{
+	unsigned int alloc_len;
+	int lun_cnt, i, upper;
+	unsigned char *cmd = (unsigned char *)scp->cmnd;
+	int select_report = (int)cmd[2];
+	struct scsi_lun *one_lun;
+	unsigned char arr[MHVTL_RLUN_ARR_SZ];
+
+	alloc_len = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
+	if ((alloc_len < 16) || (select_report > 2)) {
+		mk_sense_buffer(lu, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0);
+		return check_condition_result;
+	}
+	/* can produce response with up to 16k luns (lun 0 to lun 16383) */
+	memset(arr, 0, MHVTL_RLUN_ARR_SZ);
+	lun_cnt = vtl_max_luns;
+	arr[2] = ((sizeof(struct scsi_lun) * lun_cnt) >> 8) & 0xff;
+	arr[3] = (sizeof(struct scsi_lun) * lun_cnt) & 0xff;
+	lun_cnt = min((int)((MHVTL_RLUN_ARR_SZ - 8) /
+				sizeof(struct scsi_lun)), lun_cnt);
+	one_lun = (struct scsi_lun *) &arr[8];
+	for (i = 0; i < lun_cnt; i++) {
+		upper = (i >> 8) & 0x3f;
+		if (upper)
+			one_lun[i].scsi_lun[0] =
+				(upper | (SAM2_LUN_ADDRESS_METHOD << 6));
+		one_lun[i].scsi_lun[1] = i & 0xff;
+	}
+	return fill_from_dev_buffer(scp, arr, min((int)alloc_len, MHVTL_RLUN_ARR_SZ));
+}
+
+static void __remove_sqcp(struct vtl_queued_cmd *sqcp)
+{
+	list_del(&sqcp->queued_sibling);
+	kfree(sqcp);
+}
+
+static void remove_sqcp(struct vtl_lu_info *lu, struct vtl_queued_cmd *sqcp)
+{
+	unsigned long iflags;
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+	__remove_sqcp(sqcp);
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+}
+
+/* Called when a queued command's timer expires, i.e. the user-space daemon
+ * did not respond in time. */
+static void timer_intr_handler(unsigned long indx)
+{
+	struct vtl_queued_cmd *sqcp = NULL;
+	struct vtl_lu_info *lu;
+
+	struct vtl_hba_info *vtl_hba;
+
+	vtl_hba = vtl_get_hba_entry();
+	if (!vtl_hba)
+		return;
+
+	/* Now that the work list is split per lu, we have to check each
+	 * lu to see if we can find the serial number in question
+	 */
+	list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling) {
+		sqcp = lookup_sqcp(lu, indx);
+		if (sqcp)
+			break;
+	}
+
+	if (!sqcp) {
+		printk(KERN_ERR "mhvtl: %s: Unexpected interrupt, indx %ld\n",
+					 __func__, indx);
+		return;
+	}
+
+	sqcp->state = CMD_STATE_FREE;
+	if (sqcp->done_funct) {
+		sqcp->a_cmnd->result = sqcp->scsi_result;
+		sqcp->done_funct(sqcp->a_cmnd); /* callback to mid level */
+	}
+	sqcp->done_funct = NULL;
+	remove_sqcp(lu, sqcp);
+}
+
+static int vtl_slave_alloc(struct scsi_device *sdp)
+{
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu = (struct vtl_lu_info *)sdp->hostdata;
+
+	MHVTL_DBG(2, "slave_alloc <%u %u %u %llu>\n",
+			sdp->host->host_no, sdp->channel, sdp->id,
+			(unsigned long long int)sdp->lun);
+
+	if (lu)
+		return 0;
+
+	vtl_hba = *(struct vtl_hba_info **) sdp->host->hostdata;
+	if (!vtl_hba) {
+		printk(KERN_ERR "mhvtl: Host info NULL\n");
+		return -1;
+	}
+
+	list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling) {
+		if ((lu->channel == sdp->channel) &&
+			(lu->target == sdp->id) &&
+			(lu->lun == sdp->lun)) {
+				MHVTL_DBG(3, "line %d found matching lu\n", __LINE__);
+				return 0;
+		}
+	}
+	return -1;
+}
+
+static int vtl_slave_configure(struct scsi_device *sdp)
+{
+	struct vtl_lu_info *lu;
+
+	MHVTL_DBG(2, "slave_configure <%u %u %u %llu>\n",
+			sdp->host->host_no, sdp->channel, sdp->id,
+			(unsigned long long int)sdp->lun);
+	if (sdp->host->max_cmd_len != VTL_MAX_CMD_LEN)
+		sdp->host->max_cmd_len = VTL_MAX_CMD_LEN;
+	lu = devInfoReg(sdp);
+	sdp->hostdata = lu;
+	if (sdp->host->cmd_per_lun)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+		scsi_adjust_queue_depth(sdp, VTL_TAGGED_QUEUING,
+					sdp->host->cmd_per_lun);
+#else
+		scsi_change_queue_depth(sdp, sdp->host->cmd_per_lun);
+#endif
+	return 0;
+}
+
+static void vtl_slave_destroy(struct scsi_device *sdp)
+{
+	struct vtl_lu_info *lu = (struct vtl_lu_info *)sdp->hostdata;
+
+	MHVTL_DBG(2, "slave_destroy <%u %u %u %llu>\n",
+			sdp->host->host_no, sdp->channel, sdp->id,
+			(unsigned long long int)sdp->lun);
+	if (lu) {
+		MHVTL_DBG(2, "Removing lu structure, minor %d\n", lu->minor);
+		/* make this slot available for re-use */
+		devp[lu->minor] = NULL;
+		kfree(sdp->hostdata);
+		sdp->hostdata = NULL;
+	}
+}
+
+static struct vtl_lu_info *devInfoReg(struct scsi_device *sdp)
+{
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu = (struct vtl_lu_info *)sdp->hostdata;
+
+	if (lu)
+		return lu;
+
+	vtl_hba = *(struct vtl_hba_info **) sdp->host->hostdata;
+	if (!vtl_hba) {
+		printk(KERN_ERR "mhvtl: %s Host info NULL\n", __func__);
+		return NULL;
+	}
+
+	list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling) {
+		if ((lu->channel == sdp->channel) &&
+			(lu->target == sdp->id) &&
+			(lu->lun == sdp->lun))
+				return lu;
+	}
+
+	return NULL;
+}
+
+static void mk_sense_buffer(struct vtl_lu_info *lu, int key, int asc, int asq)
+{
+	unsigned char *sbuff;
+
+	sbuff = lu->sense_buff;
+	memset(sbuff, 0, SENSE_BUF_SIZE);
+	sbuff[0] = 0x70;	/* fixed, current */
+	sbuff[2] = key;
+	sbuff[7] = 0xa;		/* implies 18 byte sense buffer */
+	sbuff[12] = asc;
+	sbuff[13] = asq;
+	MHVTL_DBG(1, " [key,asc,ascq]: [0x%x,0x%x,0x%x]\n", key, asc, asq);
+}
+
+static int vtl_device_reset(struct scsi_cmnd *SCpnt)
+{
+	struct vtl_lu_info *lu;
+
+	MHVTL_DBG(3, "Device reset called\n");
+	++num_dev_resets;
+	if (SCpnt) {
+		lu = devInfoReg(SCpnt->device);
+		if (lu)
+			lu->reset = 1;
+	}
+	return SUCCESS;
+}
+
+static int vtl_bus_reset(struct scsi_cmnd *SCpnt)
+{
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu;
+	struct scsi_device *sdp;
+	struct Scsi_Host *hp;
+
+	MHVTL_DBG(3, "Bus reset called\n");
+	++num_bus_resets;
+	if (SCpnt && ((sdp = SCpnt->device)) && ((hp = sdp->host))) {
+		vtl_hba = *(struct vtl_hba_info **) hp->hostdata;
+		if (vtl_hba) {
+			list_for_each_entry(lu, &vtl_hba->lu_list,
+						lu_sibling)
+				lu->reset = 1;
+		}
+	}
+	return SUCCESS;
+}
+
+static int vtl_host_reset(struct scsi_cmnd *SCpnt)
+{
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu;
+
+	MHVTL_DBG(3, "Host reset called\n");
+	++num_host_resets;
+	spin_lock(&vtl_hba_list_lock);
+	list_for_each_entry(vtl_hba, &vtl_hba_list, hba_sibling) {
+		list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling)
+			lu->reset = 1;
+	}
+	spin_unlock(&vtl_hba_list_lock);
+	stop_all_queued();
+	return SUCCESS;
+}
+
+/* Returns 1 if found 'cmnd' and deleted its timer. else returns 0 */
+static int stop_queued_cmnd(struct scsi_cmnd *SCpnt)
+{
+	int found = 0;
+	unsigned long iflags;
+	struct vtl_queued_cmd *sqcp, *n;
+	struct vtl_lu_info *lu;
+
+	lu = devInfoReg(SCpnt->device);
+	if (!lu)	/* no queued commands for an unknown device */
+		return 0;
+
+	spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+	list_for_each_entry_safe(sqcp, n, &lu->cmd_list, queued_sibling) {
+		if (sqcp->state && (SCpnt == sqcp->a_cmnd)) {
+			del_timer_sync(&sqcp->cmnd_timer);
+			sqcp->state = CMD_STATE_FREE;
+			sqcp->a_cmnd = NULL;
+			found = 1;
+			__remove_sqcp(sqcp);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+	return found;
+}
+
+/* Deletes (stops) timers of all queued commands */
+static void stop_all_queued(void)
+{
+	unsigned long iflags;
+	struct vtl_queued_cmd *sqcp, *n;
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu;
+
+	vtl_hba = vtl_get_hba_entry();
+	if (!vtl_hba)
+		return;
+
+	list_for_each_entry(lu, &vtl_hba->lu_list, lu_sibling) {
+		spin_lock_irqsave(&lu->cmd_list_lock, iflags);
+		list_for_each_entry_safe(sqcp, n, &lu->cmd_list,
+			queued_sibling) {
+			if (sqcp->state && sqcp->a_cmnd) {
+				del_timer_sync(&sqcp->cmnd_timer);
+				sqcp->state = CMD_STATE_FREE;
+				sqcp->a_cmnd = NULL;
+				__remove_sqcp(sqcp);
+			}
+		}
+		spin_unlock_irqrestore(&lu->cmd_list_lock, iflags);
+	}
+}
+
+static int vtl_abort(struct scsi_cmnd *SCpnt)
+{
+	MHVTL_DBG(3, "Abort called\n");
+	++num_aborts;
+	stop_queued_cmnd(SCpnt);
+	return SUCCESS;
+}
+
+/* SLES 9 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,6)
+struct scsi_device *__scsi_add_device(struct Scsi_Host *hpnt, uint channel, uint id, uint lun, char *p )
+{
+	return scsi_add_device(hpnt, channel, id, lun);
+}
+#endif
+
+/*
+ * According to scsi_mid_low_api.txt
+ *
+ * A call from LLD scsi_add_device() will result in SCSI mid layer
+ *   -> slave_alloc()
+ *   -> slave_configure()
+ */
+static int vtl_add_device(unsigned int minor, struct vtl_ctl *ctl)
+{
+	struct Scsi_Host *hpnt;
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu;
+	int error = 0;
+
+	if (devp[minor]) {
+		MHVTL_DBG(2, "device struct already in place\n");
+		return error;
+	}
+
+	vtl_hba = vtl_get_hba_entry();
+	if (!vtl_hba) {
+		MHVTL_DBG(1, "vtl_hba info struct is NULL\n");
+		return -ENOTTY;
+	}
+	MHVTL_DBG(2, "vtl_hba_info struct is %p\n", vtl_hba);
+
+	hpnt = vtl_hba->shost;
+	if (!hpnt) {
+		MHVTL_DBG(1, "scsi host structure is NULL\n");
+		return -ENOTTY;
+	}
+	MHVTL_DBG(2, "scsi_host struct is %p\n", hpnt);
+
+	lu = kmalloc(sizeof(*lu), GFP_KERNEL);
+	if (!lu) {
+		printk(KERN_ERR "mhvtl: %s line %d - out of memory\n",
+						__func__, __LINE__);
+		return -ENOMEM;
+	}
+	memset(lu, 0, sizeof(*lu));
+	list_add_tail(&lu->lu_sibling, &vtl_hba->lu_list);
+
+	lu->minor = minor;
+	lu->channel = ctl->channel;
+	lu->target = ctl->id;
+	lu->lun = ctl->lun;
+	lu->vtl_hba = vtl_hba;
+	lu->reset = 0;
+	spin_lock_init(&lu->cmd_list_lock);
+
+	/* List of queued SCSI op codes associated with this device */
+	INIT_LIST_HEAD(&lu->cmd_list);
+
+	lu->sense_buff[0] = 0x70;
+	lu->sense_buff[7] = 0xa;
+	devp[minor] = lu;
+	MHVTL_DBG(1, "Added lu: %p to devp[%d]\n", lu, minor);
+
+	lu->sdev = __scsi_add_device(hpnt, ctl->channel, ctl->id, ctl->lun, NULL);
+	if (IS_ERR(lu->sdev)) {
+		lu->sdev = NULL;
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/* Set 'perm' (4th argument) to 0 to disable module_param's definition
+ * of sysfs parameters (which module_param doesn't yet support).
+ * Sysfs parameters defined explicitly below.
+ */
+module_param_named(opts, vtl_opts, int, 0); /* perm=0644 */
+
+MODULE_AUTHOR("Eric Youngdale + Douglas Gilbert + Mark Harvey");
+MODULE_DESCRIPTION("SCSI vtl adapter driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MHVTL_VERSION);
+
+MODULE_PARM_DESC(opts, "1->noise, 2->medium_error, 4->...");
+
+
+static char vtl_parm_info[256];
+
+static const char *vtl_info(struct Scsi_Host *shp)
+{
+	sprintf(vtl_parm_info, "mhvtl: version %s [%s], "
+		"opts=0x%x", MHVTL_VERSION,
+		vtl_version_date, vtl_opts);
+	return vtl_parm_info;
+}
+
+static ssize_t vtl_opts_show(struct device_driver *ddp, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", vtl_opts);
+}
+
+static ssize_t vtl_opts_store(struct device_driver *ddp,
+				 const char *buf, size_t count)
+{
+	int opts;
+	char work[20];
+
+	if (1 == sscanf(buf, "%10s", work)) {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
+		if (0 == strncasecmp(work, "0x", 2)) {
+#else
+		if (0 == strnicmp(work, "0x", 2)) {
+#endif
+			if (1 == sscanf(&work[2], "%x", &opts))
+				goto opts_done;
+		} else {
+			if (1 == sscanf(work, "%d", &opts))
+				goto opts_done;
+		}
+	}
+	return -EINVAL;
+opts_done:
+	vtl_opts = opts;
+	vtl_cmnd_count = 0;
+	return count;
+}
+static DRIVER_ATTR(opts, S_IRUGO|S_IWUSR, vtl_opts_show, vtl_opts_store);
+
+static ssize_t vtl_major_show(struct device_driver *ddp, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n", vtl_major);
+}
+static DRIVER_ATTR(major, S_IRUGO, vtl_major_show, NULL);
+
+static ssize_t vtl_add_lu_action(struct device_driver *ddp,
+					const char *buf, size_t count)
+{
+	int retval;
+	unsigned int minor;
+	struct vtl_ctl ctl;
+	char str[512];
+
+	if (strncmp(buf, "add", 3)) {
+		printk(KERN_ERR "mhvtl: %s Invalid command: %s\n",
+				__func__, buf);
+		return count;
+	}
+
+	retval = sscanf(buf, "%511s %u %u %u %u",
+			str, &minor, &ctl.channel, &ctl.id, &ctl.lun);
+	if (retval != 5)	/* need all five values to add a device */
+		return -EINVAL;
+
+	MHVTL_DBG(2, "Calling 'vtl_add_device(minor: %u,"
+			" Channel: %d, ID: %d, LUN: %d)\n",
+			minor, ctl.channel, ctl.id, ctl.lun);
+
+	retval = vtl_add_device(minor, &ctl);
+
+	return count;
+}
+static DRIVER_ATTR(add_lu, S_IWUSR|S_IWGRP, NULL, vtl_add_lu_action);
+
+static int do_create_driverfs_files(void)
+{
+	int	ret;
+	ret = driver_create_file(&vtl_driverfs_driver, &driver_attr_add_lu);
+	ret |= driver_create_file(&vtl_driverfs_driver, &driver_attr_opts);
+	ret |= driver_create_file(&vtl_driverfs_driver, &driver_attr_major);
+	return ret;
+}
+
+static void do_remove_driverfs_files(void)
+{
+	driver_remove_file(&vtl_driverfs_driver, &driver_attr_major);
+	driver_remove_file(&vtl_driverfs_driver, &driver_attr_opts);
+	driver_remove_file(&vtl_driverfs_driver, &driver_attr_add_lu);
+}
+
+static int __init mhvtl_init(void)
+{
+	int ret;
+
+	memset(&devp, 0, sizeof(devp));
+
+	vtl_major = register_chrdev(vtl_major, "mhvtl", &vtl_fops);
+	if (vtl_major < 0) {
+		printk(KERN_ERR "mhvtl: can't get major number\n");
+		goto register_chrdev_error;
+	}
+
+	ret = device_register(&pseudo_primary);
+	if (ret < 0) {
+		printk(KERN_ERR "mhvtl: device_register error: %d\n", ret);
+		goto device_register_error;
+	}
+
+	ret = bus_register(&pseudo_lld_bus);
+	if (ret < 0) {
+		printk(KERN_ERR "mhvtl: bus_register error: %d\n", ret);
+		goto bus_register_error;
+	}
+
+	ret = driver_register(&vtl_driverfs_driver);
+	if (ret < 0) {
+		printk(KERN_ERR "mhvtl: driver_register error: %d\n", ret);
+		goto driver_register_error;
+	}
+
+	ret = do_create_driverfs_files();
+	if (ret < 0) {
+		printk(KERN_ERR "mhvtl: driver_create_file error: %d\n", ret);
+		goto do_create_driverfs_error;
+	}
+
+	vtl_driver_template.proc_name = (char *)vtl_driver_name;
+
+	vtl_add_host = 0;
+
+	if (vtl_add_adapter()) {
+		printk(KERN_ERR "mhvtl: %s vtl_add_adapter failed\n", __func__);
+		goto vtl_add_adapter_error;
+	}
+
+	MHVTL_DBG(1, "Built %d host%s\n",
+			vtl_add_host, (vtl_add_host == 1) ? "" : "s");
+
+	return 0;
+
+vtl_add_adapter_error:
+	do_remove_driverfs_files();
+
+do_create_driverfs_error:
+	driver_unregister(&vtl_driverfs_driver);
+
+driver_register_error:
+	bus_unregister(&pseudo_lld_bus);
+
+bus_register_error:
+	device_unregister(&pseudo_primary);
+
+device_register_error:
+	unregister_chrdev(vtl_major, "mhvtl");
+
+register_chrdev_error:
+
+	return -EFAULT;
+}
+
+static void __exit vtl_exit(void)
+{
+	int k;
+
+	stop_all_queued();
+
+	for (k = vtl_add_host; k; k--)
+		vtl_remove_adapter();
+
+	if (vtl_add_host != 0)
+		printk(KERN_ERR "mhvtl %s: vtl_remove_adapter "
+			"error at line %d\n", __func__, __LINE__);
+
+	do_remove_driverfs_files();
+	driver_unregister(&vtl_driverfs_driver);
+	bus_unregister(&pseudo_lld_bus);
+	device_unregister(&pseudo_primary);
+	unregister_chrdev(vtl_major, "mhvtl");
+}
+
+device_initcall(mhvtl_init);
+module_exit(vtl_exit);
+
+static void pseudo_9_release(struct device *dev)
+{
+	MHVTL_DBG(1, "%s() called\n", __func__);
+}
+
+static struct device pseudo_primary = {
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,30)
+	.init_name	= "pseudo_9",
+#else
+	.bus_id		= "pseudo_9",
+#endif
+	.release	= pseudo_9_release,
+};
+
+static int pseudo9_lld_bus_match(struct device *dev,
+				 struct device_driver *dev_driver)
+{
+	return 1;
+}
+
+static struct bus_type pseudo_lld_bus = {
+	.name = "pseudo9",
+	.match = pseudo9_lld_bus_match,
+};
+
+static void vtl_release_adapter(struct device *dev)
+{
+	struct vtl_hba_info *vtl_hba;
+
+	vtl_hba = to_vtl_hba(dev);
+	kfree(vtl_hba);
+}
+
+/* Simplified from original.
+ *
+ * Changed so it only adds one hba instance and no logical units
+ */
+static int vtl_add_adapter(void)
+{
+	int error = 0;
+	struct vtl_hba_info *vtl_hba;
+
+	vtl_hba = kmalloc(sizeof(*vtl_hba), GFP_KERNEL);
+
+	if (!vtl_hba) {
+		printk(KERN_ERR "%s: out of memory at line %d\n",
+						__func__, __LINE__);
+		return -ENOMEM;
+	}
+
+	memset(vtl_hba, 0, sizeof(*vtl_hba));
+	INIT_LIST_HEAD(&vtl_hba->lu_list);
+
+	spin_lock(&vtl_hba_list_lock);
+	list_add_tail(&vtl_hba->hba_sibling, &vtl_hba_list);
+	spin_unlock(&vtl_hba_list_lock);
+
+	vtl_hba->dev.bus = &pseudo_lld_bus;
+	vtl_hba->dev.parent = &pseudo_primary;
+	vtl_hba->dev.release = &vtl_release_adapter;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,30)
+	dev_set_name(&vtl_hba->dev, "adapter%d", vtl_add_host);
+#else
+	sprintf(vtl_hba->dev.bus_id, "adapter%d", vtl_add_host);
+#endif
+
+	error = device_register(&vtl_hba->dev);
+	if (error) {
+		kfree(vtl_hba);
+		return error;
+	}
+
+	vtl_add_host++;
+
+	return error;
+}
+
+static void vtl_remove_adapter(void)
+{
+	struct vtl_hba_info *vtl_hba = NULL;
+
+	spin_lock(&vtl_hba_list_lock);
+	if (!list_empty(&vtl_hba_list)) {
+		vtl_hba = list_entry(vtl_hba_list.prev,
+					struct vtl_hba_info, hba_sibling);
+		list_del(&vtl_hba->hba_sibling);
+	}
+	spin_unlock(&vtl_hba_list_lock);
+
+	if (!vtl_hba)
+		return;
+
+	device_unregister(&vtl_hba->dev);
+	--vtl_add_host;
+}
+
+static int vtl_driver_probe(struct device *dev)
+{
+	int error = 0;
+	struct vtl_hba_info *vtl_hba;
+	struct Scsi_Host *hpnt;
+
+	vtl_hba = to_vtl_hba(dev);
+
+	hpnt = scsi_host_alloc(&vtl_driver_template, sizeof(*vtl_hba));
+	if (NULL == hpnt) {
+		printk(KERN_ERR "%s: scsi_register failed\n", __func__);
+		error = -ENODEV;
+		return error;
+	}
+
+	vtl_hba->shost = hpnt;
+	*((struct vtl_hba_info **)hpnt->hostdata) = vtl_hba;
+	if ((hpnt->this_id >= 0) && (vtl_num_tgts > hpnt->this_id))
+		hpnt->max_id = vtl_num_tgts + 1;
+	else
+		hpnt->max_id = vtl_num_tgts;
+	hpnt->max_lun = vtl_max_luns;
+
+	error = scsi_add_host(hpnt, &vtl_hba->dev);
+	if (error) {
+		printk(KERN_ERR "%s: scsi_add_host failed\n", __func__);
+		error = -ENODEV;
+		scsi_host_put(hpnt);
+	} else
+		scsi_scan_host(hpnt);
+
+	return error;
+}
+
+static int vtl_driver_remove(struct device *dev)
+{
+	struct list_head *lh, *lh_sf;
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu;
+
+	vtl_hba = to_vtl_hba(dev);
+
+	if (!vtl_hba) {
+		printk(KERN_ERR "%s: Unable to locate host info\n", __func__);
+		return -ENODEV;
+	}
+
+	scsi_remove_host(vtl_hba->shost);
+
+	list_for_each_safe(lh, lh_sf, &vtl_hba->lu_list) {
+		lu = list_entry(lh, struct vtl_lu_info,
+					lu_sibling);
+		list_del(&lu->lu_sibling);
+		kfree(lu);
+	}
+
+	scsi_host_put(vtl_hba->shost);
+	vtl_hba->shost = NULL;
+	return 0;
+}
+
+/*
+ *******************************************************************
+ * Char device driver routines
+ *******************************************************************
+ */
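+/*
+ * VTL_GET_DATA: copy the data of a queued write-type command out to the
+ * user-space daemon's buffer.
+ */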
+static int get_user_data(unsigned int minor, char __user *arg)
+{
+	struct vtl_queued_cmd *sqcp = NULL;
+	struct vtl_ds ds;
+	int ret = 0;
+	unsigned char __user *up;
+	size_t sz;
+
+	if (copy_from_user((u8 *)&ds, (u8 *)arg, sizeof(struct vtl_ds)))
+		return -EFAULT;
+
+	MHVTL_DBG(2, " data Cmd S/No : %ld\n", (long)ds.serialNo);
+	MHVTL_DBG(2, " data pointer     : %p\n", ds.data);
+	MHVTL_DBG(2, " data sz          : %d\n", ds.sz);
+	MHVTL_DBG(2, " SAM status       : %d (0x%02x)\n",
+					ds.sam_stat, ds.sam_stat);
+	up = ds.data;
+	sz = ds.sz;
+	sqcp = lookup_sqcp(devp[minor], ds.serialNo);
+	if (!sqcp)
+		return -ENOTTY;
+
+	ret = resp_write_to_user(sqcp->a_cmnd, up, sz);
+
+	return ret;
+}
+
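+/*
+ * VTL_PUT_DATA: complete a queued command using the data and (optional)
+ * autosense supplied by the user-space daemon.
+ */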
+static int put_user_data(unsigned int minor, char __user *arg)
+{
+	struct vtl_queued_cmd *sqcp = NULL;
+	struct vtl_ds ds;
+	int ret = 0;
+	uint8_t *s;
+
+	if (copy_from_user((u8 *)&ds, (u8 *)arg, sizeof(struct vtl_ds))) {
+		ret = -EFAULT;
+		goto give_up;
+	}
+	MHVTL_DBG(2, " data Cmd S/No : %ld\n", (long)ds.serialNo);
+	MHVTL_DBG(2, " data pointer     : %p\n", ds.data);
+	MHVTL_DBG(2, " data sz          : %d\n", ds.sz);
+	MHVTL_DBG(2, " SAM status       : %d (0x%02x)\n",
+						ds.sam_stat, ds.sam_stat);
+	sqcp = lookup_sqcp(devp[minor], ds.serialNo);
+	if (!sqcp) {
+		printk(KERN_ERR "%s: callback function not found for "
+				"SCSI cmd s/no. %ld\n",
+				__func__, (long)ds.serialNo);
+		ret = 1;	/* report busy to mid level */
+		goto give_up;
+	}
+	ret = fill_from_user_buffer(sqcp->a_cmnd, ds.data, ds.sz);
+	if (ds.sam_stat) { /* Auto-sense */
+		sqcp->a_cmnd->result = ds.sam_stat;
+		if (copy_from_user(sqcp->a_cmnd->sense_buffer,
+						ds.sense_buf, SENSE_BUF_SIZE))
+			printk(KERN_ERR "Failed to retrieve autosense data\n");
+		sqcp->a_cmnd->sense_buffer[0] |= 0x70; /* force valid sense */
+		s = sqcp->a_cmnd->sense_buffer;
+		MHVTL_DBG(2, "Auto-Sense returned [key/ASC/ASCQ] "
+				"[%02x %02x %02x]\n",
+				s[2],
+				s[12],
+				s[13]);
+	} else
+		sqcp->a_cmnd->result = DID_OK << 16;
+	del_timer_sync(&sqcp->cmnd_timer);
+	if (sqcp->done_funct)
+		sqcp->done_funct(sqcp->a_cmnd);
+	else
+		printk(KERN_ERR
+			"%s FATAL, line %d: SCSI done_funct callback => NULL\n",
+						__func__, __LINE__);
+	remove_sqcp(devp[minor], sqcp);
+
+	ret = 0;
+
+give_up:
+	return ret;
+}
+
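+/*
+ * VTL_POLL_AND_GET_HEADER: hand the oldest queued CDB header to the daemon,
+ * mark that command in use, and return VTL_QUEUE_CMD if one was passed up.
+ */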
+static int send_vtl_header(unsigned int minor, char __user *arg)
+{
+	struct vtl_header *vheadp;
+	struct vtl_queued_cmd *sqcp;
+	int ret = 0;
+
+	list_for_each_entry(sqcp, &devp[minor]->cmd_list, queued_sibling) {
+		if (sqcp->state == CMD_STATE_QUEUED) {
+			vheadp = &sqcp->op_header;
+			if (copy_to_user((u8 *)arg, (u8 *)vheadp,
+						sizeof(struct vtl_header))) {
+				ret = -EFAULT;
+				goto give_up;
+			}
+			/* Found an outstanding cmd to send */
+			sqcp->state = CMD_STATE_IN_USE;
+			ret = VTL_QUEUE_CMD;
+			/* Can only send one header at a time */
+			goto give_up;
+		}
+	}
+
+give_up:
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+static DEFINE_SEMAPHORE(tmp_mutex);
+#else
+static DECLARE_MUTEX(tmp_mutex);
+#endif
+
+static int vtl_remove_lu(unsigned int minor, char __user *arg)
+{
+	struct vtl_ctl ctl;
+	struct vtl_hba_info *vtl_hba;
+	struct vtl_lu_info *lu, *n;
+	struct scsi_device *baksdev;
+	int ret = -ENODEV;
+
+	down(&tmp_mutex);
+
+	if (copy_from_user((u8 *)&ctl, (u8 *)arg, sizeof(ctl))) {
+		ret = -EFAULT;
+		goto give_up;
+	}
+
+	vtl_hba = vtl_get_hba_entry();
+	if (!vtl_hba) {
+		ret = 0;
+		goto give_up;
+	}
+
+	MHVTL_DBG(1, "ioctl to remove device <c t l> "
+		"<%02d %02d %02d>, hba: %p\n",
+			ctl.channel, ctl.id, ctl.lun, vtl_hba);
+
+	list_for_each_entry_safe(lu, n, &vtl_hba->lu_list, lu_sibling) {
+		if ((lu->channel == ctl.channel) && (lu->target == ctl.id) &&
+						(lu->lun == ctl.lun)) {
+			MHVTL_DBG(2, "line %d found matching lu\n", __LINE__);
+			list_del(&lu->lu_sibling);
+			devp[minor] = NULL;
+			baksdev = lu->sdev;
+			scsi_remove_device(lu->sdev);
+			scsi_device_put(baksdev);
+		}
+	}
+
+	ret = 0;
+
+give_up:
+	up(&tmp_mutex);
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static DEFINE_MUTEX(ioctl_mutex);
+#endif
+
+static long vtl_c_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+	struct inode *inode = file->f_dentry->d_inode;
+#else
+	struct inode *inode = file_inode(file);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+	mutex_lock(&ioctl_mutex);
+#else
+	lock_kernel();
+#endif
+	ret = vtl_c_ioctl_bkl(inode, file, cmd, arg);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+	mutex_unlock(&ioctl_mutex);
+#else
+	unlock_kernel();
+#endif
+
+	return ret;
+}
+
+/*
+ * char device ioctl entry point
+ */
+static int vtl_c_ioctl_bkl(struct inode *inode, struct file *file,
+					unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor = iminor(inode);
+	int ret;
+
+	if (minor >= DEF_MAX_MINOR_NO) {	/* Check limit minor no. */
+		return -ENODEV;
+	}
+
+	ret = 0;
+
+	switch (cmd) {
+
+	case VTL_POLL_AND_GET_HEADER:
+		if (!devp[minor]) {
+			put_user(0, (unsigned int *)arg);
+			ret = 0;
+			break;
+		}
+		ret = send_vtl_header(minor, (char __user *)arg);
+		break;
+
+	case VTL_GET_DATA:
+		MHVTL_DBG(3, "ioctl(VTL_GET_DATA)\n");
+		ret = get_user_data(minor, (char __user *)arg);
+		break;
+
+	case VTL_PUT_DATA:
+		MHVTL_DBG(3, "ioctl(VTL_PUT_DATA)\n");
+		ret = put_user_data(minor, (char __user *)arg);
+		break;
+
+	case VTL_REMOVE_LU:
+		MHVTL_DBG(3, "ioctl(VTL_REMOVE_LU)\n");
+		ret = vtl_remove_lu(minor, (char __user *)arg);
+		break;
+
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	return ret;
+}
+
+static int vtl_release(struct inode *inode, struct file *filp)
+{
+#ifdef MHVTL_DEBUG
+	unsigned int minor = iminor(inode);
+#endif
+	MHVTL_DBG(1, "lu for minor %u Release\n", minor);
+	return 0;
+}
+
+static int vtl_open(struct inode *inode, struct file *filp)
+{
+#ifdef MHVTL_DEBUG
+	unsigned int minor = iminor(inode);
+#endif
+	MHVTL_DBG(1, "mhvtl%u: opened\n", minor);
+	return 0;
+}
--- /dev/null
+++ b/drivers/target/mhvtl/vtl_common.h
@@ -0,0 +1,58 @@
+/* Common definitions shared by the kernel module and user-space programs */
+#ifndef VTL_COMMON_H
+#define VTL_COMMON_H
+
+#define SENSE_BUF_SIZE	96
+/* Max cdb size */
+#define MAX_COMMAND_SIZE	16
+
+#define VTL_IDLE		0x00
+#define VTL_QUEUE_CMD		0xfe
+
+/* ioctl defines */
+#define VX_TAPE_ONLINE		0x80
+#define VTL_POLL_AND_GET_HEADER	0x200
+#define VTL_GET_DATA		0x201
+#define VTL_PUT_DATA		0x203
+#define VTL_REMOVE_LU		0x205
+
+#define VENDOR_ID_LEN	8
+#define PRODUCT_ID_LEN	16
+#define PRODUCT_REV_LEN	4
+
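+/* Per-command header handed to the user-space daemon: serial number plus CDB */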
+struct	vtl_header {
+	unsigned long long serialNo;
+	unsigned char cdb[MAX_COMMAND_SIZE];
+	unsigned char *buf;
+};
+
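+/* Data descriptor exchanged via the VTL_GET_DATA / VTL_PUT_DATA ioctls */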
+struct vtl_ds {
+	void *data;
+	unsigned int sz;
+	unsigned long long serialNo;
+	void *sense_buf;
+	unsigned char sam_stat;
+};
+
+struct vtl_sn_inquiry {
+	char sn[32];
+	char vendor_id[VENDOR_ID_LEN + 2];
+	char product_id[PRODUCT_ID_LEN + 2];
+};
+
+struct vtl_ctl {
+	unsigned int channel;
+	unsigned int id;
+	unsigned int lun;
+};
+
+#if !defined(FALSE)
+  #define FALSE 0
+#endif
+
+#if !defined(TRUE)
+  #define TRUE 1
+#endif
+
+#endif /* VTL_COMMON_H */
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -158,7 +158,6 @@ struct n_hdlc {
  */
 static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list,
 						struct n_hdlc_buf *buf);
-static void n_hdlc_buf_list_init(struct n_hdlc_buf_list *list);
 static void n_hdlc_buf_put(struct n_hdlc_buf_list *list,
 			   struct n_hdlc_buf *buf);
 static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *list);
@@ -844,11 +843,16 @@ static struct n_hdlc *n_hdlc_alloc(void)
 
 	memset(n_hdlc, 0, sizeof(*n_hdlc));
 
-	n_hdlc_buf_list_init(&n_hdlc->rx_free_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->tx_free_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->rx_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->tx_buf_list);
-	
+	spin_lock_init(&n_hdlc->rx_free_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->tx_free_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->rx_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->tx_buf_list.spinlock);
+
+	INIT_LIST_HEAD(&n_hdlc->rx_free_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->tx_free_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->rx_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->tx_buf_list.list);
+
 	/* allocate free rx buffer list */
 	for(i=0;i<DEFAULT_RX_BUF_COUNT;i++) {
 		buf = kmalloc(N_HDLC_BUF_SIZE, GFP_KERNEL);
@@ -875,17 +879,6 @@ static struct n_hdlc *n_hdlc_alloc(void)
 	
 }	/* end of n_hdlc_alloc() */
 
-/**
- * n_hdlc_buf_list_init - initialize specified HDLC buffer list
- * @list - pointer to buffer list
- */
-static void n_hdlc_buf_list_init(struct n_hdlc_buf_list *list)
-{
-	memset(list, 0, sizeof(*list));
-	spin_lock_init(&list->spinlock);
-	INIT_LIST_HEAD(&list->list);
-}	/* end of n_hdlc_buf_list_init() */
-
 /**
  * n_hdlc_buf_return - put the HDLC buffer after the head of the specified list
  * @buf_list - pointer to the buffer list
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -50,6 +50,7 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ratelimit.h>
+#include <linux/ve.h>
 
 
 /* number of characters left in xmit buffer before select has we have room */
@@ -2041,7 +2042,12 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 			retval = -ERESTARTSYS;
 			break;
 		}
+#ifdef CONFIG_VE
+		if (tty_hung_up_p(file) ||
+		    (tty->link && !tty->link->count && !vtty_is_master(tty->link))) {
+#else
 		if (tty_hung_up_p(file) || (tty->link && !tty->link->count)) {
+#endif
 			retval = -EIO;
 			break;
 		}
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -13,7 +13,6 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/major.h>
 #include <linux/mm.h>
@@ -25,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 
+#include <bc/misc.h>
 
 #ifdef CONFIG_UNIX98_PTYS
 static struct tty_driver *ptm_driver;
@@ -35,6 +35,8 @@ static DEFINE_MUTEX(devpts_mutex);
 static void pty_close(struct tty_struct *tty, struct file *filp)
 {
 	BUG_ON(!tty);
+
+	ub_pty_uncharge(tty);
 	if (tty->driver->subtype == PTY_TYPE_MASTER)
 		WARN_ON(tty->count > 1);
 	else {
@@ -242,9 +244,12 @@ static void pty_flush_buffer(struct tty_struct *tty)
 
 static int pty_open(struct tty_struct *tty, struct file *filp)
 {
+	int retval;
+
 	if (!tty || !tty->link)
 		return -ENODEV;
 
+	retval = -EIO;
 	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
 		goto out;
 	if (test_bit(TTY_PTY_LOCK, &tty->link->flags))
@@ -252,6 +257,10 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
 	if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1)
 		goto out;
 
+	retval = -ENOMEM;
+	if (ub_pty_charge(tty))
+		goto out;
+
 	clear_bit(TTY_IO_ERROR, &tty->flags);
 	clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
 	set_bit(TTY_THROTTLED, &tty->flags);
@@ -259,7 +268,7 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
 
 out:
 	set_bit(TTY_IO_ERROR, &tty->flags);
-	return -EIO;
+	return retval;
 }
 
 static void pty_set_termios(struct tty_struct *tty,
@@ -538,6 +547,7 @@ static void __init legacy_pty_init(void)
 	if (tty_register_driver(pty_slave_driver))
 		panic("Couldn't register pty slave driver");
 }
+
 #else
 static inline void legacy_pty_init(void) { }
 #endif
@@ -607,20 +617,13 @@ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver,
 	return tty;
 }
 
-/* We have no need to install and remove our tty objects as devpts does all
-   the work for us */
-
 static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty)
 {
 	return pty_common_install(driver, tty, false);
 }
 
-static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
-{
-}
-
 /* this is called once with whichever end is closed last */
-static void pty_unix98_shutdown(struct tty_struct *tty)
+static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
 {
 	struct pts_fs_info *fsi;
 
@@ -628,8 +631,11 @@ static void pty_unix98_shutdown(struct tty_struct *tty)
 		fsi = tty->driver_data;
 	else
 		fsi = tty->link->driver_data;
-	devpts_kill_index(fsi, tty->index);
-	devpts_put_ref(fsi);
+
+	if (fsi) {
+		devpts_kill_index(fsi, tty->index);
+		devpts_put_ref(fsi);
+	}
 }
 
 static const struct tty_operations ptm_unix98_ops = {
@@ -646,7 +652,6 @@ static const struct tty_operations ptm_unix98_ops = {
 	.set_termios = pty_set_termios,
 	.ioctl = pty_unix98_ioctl,
 	.resize = pty_resize,
-	.shutdown = pty_unix98_shutdown,
 	.cleanup = pty_cleanup
 };
 
@@ -662,7 +667,6 @@ static const struct tty_operations pty_unix98_ops = {
 	.chars_in_buffer = pty_chars_in_buffer,
 	.unthrottle = pty_unthrottle,
 	.set_termios = pty_set_termios,
-	.shutdown = pty_unix98_shutdown,
 	.cleanup = pty_cleanup,
 };
 
@@ -828,10 +832,535 @@ static void __init unix98_pty_init(void)
 static inline void unix98_pty_init(void) { }
 #endif
 
+#if defined(CONFIG_VE)
+
+/*
+ * VTTY architecture overview.
+ *
+ * With VTTY we make /dev/console and /dev/tty[X] virtualized
+ * per container (note that the real names may vary, because the
+ * kernel itself uses major:minor numbers to distinguish
+ * devices and doesn't care how they are named inside /dev).
+ * /dev/console stands for TTYAUX_MAJOR:1 while /dev/tty[X]
+ * stands for TTY_MAJOR:[0:12]. That said, from inside a
+ * container /dev/console is the same as /dev/tty0.
+ *
+ * For every container there is a tty map represented by
+ * vtty_map_t. It carries the @veid of the VE and the associated
+ * slave tty peers.
+ *
+ * map
+ *  veid -> CTID
+ *    vttys -> [ 0 ]
+ *               `- @slave -> link -> @master
+ *             [ 1 ]
+ *               `- @slave -> link -> @master
+ */
+
+#include <linux/ve.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+
+static struct tty_driver *vttym_driver;
+static struct tty_driver *vttys_driver;
+static DEFINE_IDR(vtty_idr);
+
+static struct file_operations vtty_fops;
+
+#define vtty_match_index(idx)	((idx) >= 0 && (idx) < MAX_NR_VTTY_CONSOLES)
+
+bool vtty_is_master(struct tty_struct *tty)
+{
+	return tty->driver == vttym_driver;
+}
+
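+/*
+ * Per-container vtty map: slave tty peers indexed by console number.
+ * The map is looked up, modified and freed under tty_mutex.
+ */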
+typedef struct {
+	envid_t			veid;
+	struct tty_struct	*vttys[MAX_NR_VTTY_CONSOLES];
+} vtty_map_t;
+
+static vtty_map_t *vtty_map_lookup(envid_t veid)
+{
+	lockdep_assert_held(&tty_mutex);
+	return idr_find(&vtty_idr, veid);
+}
+
+static void vtty_map_set(vtty_map_t *map, struct tty_struct *tty)
+{
+	lockdep_assert_held(&tty_mutex);
+	WARN_ON(map->vttys[tty->index]);
+
+	tty->driver_data = tty->link->driver_data = map;
+	map->vttys[tty->index] = tty;
+}
+
+static void vtty_map_free(vtty_map_t *map)
+{
+	lockdep_assert_held(&tty_mutex);
+	idr_remove(&vtty_idr, map->veid);
+	kfree(map);
+}
+
+static void vtty_map_clear(struct tty_struct *tty)
+{
+	vtty_map_t *map = tty->driver_data;
+
+	lockdep_assert_held(&tty_mutex);
+	if (map) {
+		struct tty_struct *p = map->vttys[tty->index];
+		int i;
+
+		WARN_ON(p != (tty->driver == vttys_driver ? tty : tty->link));
+		map->vttys[tty->index] = NULL;
+		tty->driver_data = tty->link->driver_data = NULL;
+
+		for (i = 0; i < MAX_NR_VTTY_CONSOLES; i++) {
+			if (map->vttys[i])
+				break;
+		}
+
+		if (i >= MAX_NR_VTTY_CONSOLES)
+			vtty_map_free(map);
+	}
+}
+
+static vtty_map_t *vtty_map_alloc(envid_t veid)
+{
+	vtty_map_t *map = kzalloc(sizeof(*map), GFP_KERNEL);
+
+	lockdep_assert_held(&tty_mutex);
+	if (map) {
+		int id;
+
+		map->veid = veid;
+		id = idr_alloc(&vtty_idr, map, veid, veid + 1, GFP_KERNEL);
+		if (id < 0) {
+			kfree(map);
+			return ERR_PTR(id);
+		}
+	} else
+		map = ERR_PTR(-ENOMEM);
+	return map;
+}
+
+/*
+ * vttys are never supposed to be opened from inside
+ * VE0 except via a special ioctl call, so treat zero
+ * as the "unused" sign.
+ */
+static envid_t vtty_context_veid;
+
+static void vtty_set_context(envid_t veid)
+{
+	lockdep_assert_held(&tty_mutex);
+	WARN_ON(!veid);
+	vtty_context_veid = veid;
+}
+
+static void vtty_drop_context(void)
+{
+	lockdep_assert_held(&tty_mutex);
+	vtty_context_veid = 0;
+}
+
+static envid_t vtty_get_context(void)
+{
+	lockdep_assert_held(&tty_mutex);
+	return vtty_context_veid ?: get_exec_env()->veid;
+}
+
+static struct tty_struct *vtty_lookup(struct tty_driver *driver,
+				      struct inode *inode, int idx)
+{
+	vtty_map_t *map = vtty_map_lookup(vtty_get_context());
+	struct tty_struct *tty;
+
+	if (!vtty_match_index(idx))
+		return ERR_PTR(-EIO);
+
+	/*
+	 * Nothing has been opened yet: a new tty map together
+	 * with both peers will be allocated from scratch in the
+	 * install procedure.
+	 */
+	if (!map)
+		return NULL;
+
+	tty = map->vttys[idx];
+	if (tty) {
+		if (driver == vttym_driver)
+			tty = tty->link;
+		WARN_ON(!tty);
+	}
+	return tty;
+}
+
+static void vtty_standard_install(struct tty_driver *driver,
+				  struct tty_struct *tty)
+{
+	WARN_ON(tty_init_termios(tty));
+
+	tty_driver_kref_get(driver);
+	tty_port_init(tty->port);
+	tty->port->itty = tty;
+}
+
+static struct tty_struct *vtty_install_peer(struct tty_driver *driver,
+					    struct tty_port *port, int index)
+{
+	struct tty_struct *tty;
+
+	tty = alloc_tty_struct(driver, index);
+	if (!tty)
+		return ERR_PTR(-ENOMEM);
+	tty->port = port;
+	vtty_standard_install(driver, tty);
+	return tty;
+}
+
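+/*
+ * Called on the first open of a slave console: allocate ports for
+ * both peers, install the master/slave pair and record it in the
+ * container's map.
+ */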
+static int vtty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+	envid_t veid = vtty_get_context();
+	struct tty_port *peer_port;
+	struct tty_struct *peer;
+	vtty_map_t *map;
+	int ret;
+
+	WARN_ON_ONCE(driver != vttys_driver);
+
+	map = vtty_map_lookup(veid);
+	if (!map) {
+		map = vtty_map_alloc(veid);
+		if (IS_ERR(map))
+			return PTR_ERR(map);
+	}
+
+	tty->port = kzalloc(sizeof(*tty->port), GFP_KERNEL);
+	peer_port = kzalloc(sizeof(*peer_port), GFP_KERNEL);
+	if (!tty->port || !peer_port) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	peer = vtty_install_peer(vttym_driver, peer_port, tty->index);
+	if (IS_ERR(peer)) {
+		ret = PTR_ERR(peer);
+		goto err_free;
+	}
+
+	vtty_standard_install(vttys_driver, tty);
+	tty->count++;
+
+	tty->link = peer;
+	peer->link = tty;
+
+	vtty_map_set(map, tty);
+	return 0;
+
+err_free:
+	kfree(tty->port);
+	kfree(peer_port);
+	return ret;
+}
+
+static int vtty_open(struct tty_struct *tty, struct file *filp)
+{
+	set_bit(TTY_THROTTLED, &tty->flags);
+	return 0;
+}
+
+static void vtty_close(struct tty_struct *tty, struct file *filp)
+{
+	if (tty->count <= ((tty->driver == vttys_driver) ? 2 : 1)) {
+		wake_up_interruptible(&tty->read_wait);
+		wake_up_interruptible(&tty->write_wait);
+
+		wake_up_interruptible(&tty->link->read_wait);
+		wake_up_interruptible(&tty->link->write_wait);
+	}
+}
+
+static void vtty_shutdown(struct tty_struct *tty)
+{
+	vtty_map_clear(tty);
+}
+
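+/* Data written to one peer is pushed into the other peer's flip buffer. */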
+static int vtty_write(struct tty_struct *tty,
+		      const unsigned char *buf, int count)
+{
+	struct tty_struct *peer = tty->link;
+
+	if (tty->stopped)
+		return 0;
+
+	if (count > 0) {
+		count = tty_insert_flip_string(peer->port, buf, count);
+		if (count) {
+			tty_flip_buffer_push(peer->port);
+			tty_wakeup(tty);
+		} else {
+			/*
+			 * Flush the slave side if no reader is
+			 * actually hooked on; otherwise wait until
+			 * the reader fetches all the data.
+			 */
+			if (peer->count <
+			    ((tty->driver == vttym_driver) ? 2 : 1))
+				tty_perform_flush(peer, TCIFLUSH);
+		}
+	}
+
+	return count;
+}
+
+static int vtty_write_room(struct tty_struct *tty)
+{
+	struct tty_struct *peer = tty->link;
+
+	if (tty->stopped)
+		return 0;
+
+	if (peer->count <
+	    ((tty->driver == vttym_driver) ? 2 : 1))
+		return 2048;
+
+	return pty_space(peer);
+}
+
+static void vtty_remove(struct tty_driver *driver, struct tty_struct *tty)
+{
+}
+
+static int vtty_resize(struct tty_struct *tty, struct winsize *ws)
+{
+	if (tty->driver == vttym_driver)
+		return pty_resize(tty, ws);
+	return tty_do_resize(tty, ws);
+}
+
+static const struct tty_operations vtty_ops = {
+	.lookup		= vtty_lookup,
+	.install	= vtty_install,
+	.open		= vtty_open,
+	.close		= vtty_close,
+	.shutdown	= vtty_shutdown,
+	.cleanup	= pty_cleanup,
+	.write		= vtty_write,
+	.write_room	= vtty_write_room,
+	.chars_in_buffer= pty_chars_in_buffer,
+	.set_termios	= pty_set_termios,
+	.unthrottle	= pty_unthrottle,
+	.flush_buffer	= pty_flush_buffer,
+	.remove		= vtty_remove,
+	.resize		= vtty_resize,
+};
+
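+/* A container's /dev/console is backed by vtty slave index 0. */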
+struct tty_driver *vtty_console_driver(int *index)
+{
+	*index = 0;
+	return vttys_driver;
+}
+
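+/* Resolve a TTY_MAJOR minor opened inside a container to the slave driver. */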
+struct tty_driver *vtty_driver(dev_t dev, int *index)
+{
+	if (MAJOR(dev) == TTY_MAJOR &&
+	    MINOR(dev) <= MAX_NR_VTTY_CONSOLES) {
+		if (MINOR(dev))
+			*index = MINOR(dev) - 1;
+		else
+			*index = 0;
+		return vttys_driver;
+	}
+	return NULL;
+}
+
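+/*
+ * Hook for tty_release(): adjust the closing decisions so the master
+ * peer is not shut down while its slave side is still in use.
+ */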
+void vtty_release(struct tty_struct *tty, struct tty_struct *o_tty,
+		  int *tty_closing, int *o_tty_closing)
+{
+	int pty_master;
+	lockdep_assert_held(&tty_mutex);
+
+	if (tty->driver != vttym_driver &&
+	    tty->driver != vttys_driver)
+		return;
+
+	pty_master = (tty->driver == vttym_driver);
+
+	/*
+	 * Do not close master while slave is active.
+	 */
+	if (!*o_tty_closing && pty_master)
+		*tty_closing = 0;
+
+	/*
+	 * Do not close the master if we are closing a slave
+	 * that is not the last one, even if there are no
+	 * readers on the master.
+	 */
+	if (*o_tty_closing && !*tty_closing && !pty_master)
+		*o_tty_closing = 0;
+}
+
+static int __init vtty_init(void)
+{
+#define VTTY_DRIVER_ALLOC_FLAGS			\
+	(TTY_DRIVER_REAL_RAW		|	\
+	 TTY_DRIVER_RESET_TERMIOS	|	\
+	 TTY_DRIVER_DYNAMIC_DEV		|	\
+	 TTY_DRIVER_INSTALLED		|	\
+	 TTY_DRIVER_DEVPTS_MEM)
+
+	vttym_driver = tty_alloc_driver(MAX_NR_VTTY_CONSOLES,
+					VTTY_DRIVER_ALLOC_FLAGS);
+	if (IS_ERR(vttym_driver))
+		panic(pr_fmt("Can't allocate master vtty driver\n"));
+
+	vttys_driver = tty_alloc_driver(MAX_NR_VTTY_CONSOLES,
+					VTTY_DRIVER_ALLOC_FLAGS);
+	if (IS_ERR(vttys_driver))
+		panic(pr_fmt("Can't allocate slave vtty driver\n"));
+
+	vttym_driver->driver_name		= "vtty_master";
+	vttym_driver->name			= "vttym";
+	vttym_driver->name_base			= 0;
+	vttym_driver->major			= 0;
+	vttym_driver->minor_start		= 0;
+	vttym_driver->type			= TTY_DRIVER_TYPE_PTY;
+	vttym_driver->subtype			= PTY_TYPE_MASTER;
+	vttym_driver->init_termios		= tty_std_termios;
+	vttym_driver->init_termios.c_iflag	= 0;
+	vttym_driver->init_termios.c_oflag	= 0;
+
+	/* 38400 baud rate, 8 bit char size, enable receiver */
+	vttym_driver->init_termios.c_cflag	= B38400 | CS8 | CREAD;
+	vttym_driver->init_termios.c_lflag	= 0;
+	tty_set_operations(vttym_driver, &vtty_ops);
+
+	vttys_driver->driver_name		= "vtty_slave";
+	vttys_driver->name			= "vttys";
+	vttys_driver->name_base			= 0;
+	vttys_driver->major			= 0;
+	vttys_driver->minor_start		= 0;
+	vttys_driver->type			= TTY_DRIVER_TYPE_PTY;
+	vttys_driver->subtype			= PTY_TYPE_SLAVE;
+	vttys_driver->init_termios		= tty_std_termios;
+	vttys_driver->init_termios.c_cflag	= B38400 | CS8 | CREAD;
+	tty_set_operations(vttys_driver, &vtty_ops);
+
+	if (tty_register_driver(vttym_driver))
+		panic(pr_fmt("Can't register master vtty driver\n"));
+
+	if (tty_register_driver(vttys_driver))
+		panic(pr_fmt("Can't register slave vtty driver\n"));
+
+	tty_default_fops(&vtty_fops);
+	return 0;
+}
+
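+/*
+ * Open the master peer of console @idx in container @veid and return
+ * it to the ve0 caller as a new file descriptor.
+ */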
+int vtty_open_master(envid_t veid, int idx)
+{
+	struct tty_struct *tty;
+	struct file *file;
+	char devname[64];
+	int fd, ret;
+
+	if (!vtty_match_index(idx))
+		return -EIO;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	snprintf(devname, sizeof(devname), "v%utty%d", veid, idx);
+	file = anon_inode_getfile(devname, &vtty_fops, NULL, O_RDWR);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	nonseekable_open(NULL, file);
+
+	ret = tty_alloc_file(file);
+	if (ret)
+		goto err_fput;
+
+	/*
+	 * The open comes from the ve0 context, so set up
+	 * the VE's context until the master is fetched.
+	 * This is done under @tty_mutex so no one else
+	 * can access it while we're holding the lock.
+	 */
+	mutex_lock(&tty_mutex);
+	vtty_set_context(veid);
+
+	tty = vtty_lookup(vttym_driver, NULL, idx);
+	if (!tty) {
+		/*
+		 * FIXME: Previously we tested for the
+		 * TTY_CLOSING bit, which is no longer
+		 * here. Review and handle.
+		 */
+		/*
+		 * The previous connection is about to
+		 * be closed so drop it from the map and
+		 * allocate a new one.
+		 */
+		if (tty)
+			vtty_map_clear(tty);
+		tty = tty_init_dev(vttys_driver, idx);
+		if (IS_ERR(tty))
+			goto err_install;
+		tty->count--;
+		tty_unlock(tty);
+		tty = tty->link;
+	}
+
+	/* One master at a time */
+	if (tty->count >= 1) {
+		ret = -EBUSY;
+		goto err_install;
+	}
+
+	vtty_drop_context();
+
+	/* FIXME: code will be dropped anyway
+	 * WARN_ON(!test_bit(TTY_LDISC, &tty->flags));
+	 */
+
+	/*
+	 * We're the master peer, so increment the
+	 * slave counter as well.
+	 */
+	tty_add_file(tty, file);
+	tty->count++;
+	tty->link->count++;
+	fd_install(fd, file);
+	vtty_open(tty, file);
+
+	mutex_unlock(&tty_mutex);
+	ret = fd;
+out:
+	return ret;
+
+err_install:
+	vtty_drop_context();
+	mutex_unlock(&tty_mutex);
+	tty_free_file(file);
+err_fput:
+	file->f_op = NULL;
+	fput(file);
+err_put_unused_fd:
+	put_unused_fd(fd);
+	goto out;
+}
+EXPORT_SYMBOL(vtty_open_master);
+#else
+static inline void vtty_init(void) { }
+#endif /* CONFIG_VE */
+
 static int __init pty_init(void)
 {
 	legacy_pty_init();
 	unix98_pty_init();
+	vtty_init();
 	return 0;
 }
 module_init(pty_init);
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -45,6 +45,9 @@
 #include <linux/moduleparam.h>
 #include <linux/jiffies.h>
 #include <linux/rcupdate.h>
+#include <linux/ve.h>
+
+#include <bc/vmpages.h>
 
 #include <asm/ptrace.h>
 #include <asm/irq_regs.h>
@@ -360,7 +363,7 @@ static struct sysrq_key_op sysrq_term_op = {
 static void moom_callback(struct work_struct *ignored)
 {
 	out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
-		      0, NULL, true);
+		      0, NULL);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
@@ -1039,10 +1042,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
 {
 	if (count) {
 		char c;
+		struct ve_struct *cur = get_exec_env();
+		static int pnum = 10;
 
 		if (get_user(c, buf))
 			return -EFAULT;
-		__handle_sysrq(c, false);
+		if (ve_is_super(cur))
+			__handle_sysrq(c, false);
+		else if (pnum--)
+			printk("SysRq: CT#%s sent '%c' magic key.\n",
+				cur->ve_name, c);
 	}
 
 	return count;
@@ -1055,7 +1064,7 @@ static const struct file_operations proc_sysrq_trigger_operations = {
 
 static void sysrq_init_procfs(void)
 {
-	if (!proc_create("sysrq-trigger", S_IWUSR, NULL,
+	if (!proc_create("sysrq-trigger", S_ISVTX | S_IWUSR, NULL,
 			 &proc_sysrq_trigger_operations))
 		pr_err("Failed to register proc interface\n");
 }
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -69,7 +69,6 @@
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
@@ -104,6 +103,7 @@
 
 #include <linux/kmod.h>
 #include <linux/nsproxy.h>
+#include <linux/ve.h>
 
 #undef TTY_DEBUG_HANGUP
 
@@ -1555,7 +1555,7 @@ void tty_free_termios(struct tty_struct *tty)
 	/* Stash the termios data */
 	tp = tty->driver->termios[idx];
 	if (tp == NULL) {
-		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
+		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_ACCOUNT);
 		if (tp == NULL) {
 			pr_warn("tty: no memory to save termios state.\n");
 			return;
@@ -1602,13 +1602,14 @@ static void release_one_tty(struct work_struct *work)
 	struct tty_struct *tty =
 		container_of(work, struct tty_struct, hangup_work);
 	struct tty_driver *driver = tty->driver;
+	struct module *owner = driver->owner;
 
 	if (tty->ops->cleanup)
 		tty->ops->cleanup(tty);
 
 	tty->magic = 0;
 	tty_driver_kref_put(driver);
-	module_put(driver->owner);
+	module_put(owner);
 
 	spin_lock(&tty_files_lock);
 	list_del_init(&tty->tty_files);
@@ -1791,6 +1792,15 @@ int tty_release(struct inode *inode, struct file *filp)
 	while (1) {
 		do_sleep = 0;
 
+		/*
+		 * FIXME: Need to figure out how to prevent closing
+		 * peers while one is still active; unlike traditional
+		 * PTYs, we don't close the master when the slave is closed.
+		 */
+#if 0
+		vtty_release(tty, o_tty, &tty_closing, &o_tty_closing);
+#endif
+
 		if (tty->count <= 1) {
 			if (waitqueue_active(&tty->read_wait)) {
 				wake_up_poll(&tty->read_wait, POLLIN);
@@ -1951,6 +1961,19 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
 {
 	struct tty_driver *driver;
 
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		driver = vtty_driver(device, index);
+		if (driver) {
+			if (MINOR(device) == 0)
+				*noctty = 1;
+			return tty_driver_kref_get(driver);
+		}
+	}
+#endif
+
 	switch (device) {
 #ifdef CONFIG_VT
 	case MKDEV(TTY_MAJOR, 0): {
@@ -1963,6 +1986,17 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
 #endif
 	case MKDEV(TTYAUX_MAJOR, 1): {
 		struct tty_driver *console_driver = console_device(index);
+#ifdef CONFIG_VE
+		if (!ve_is_super(ve)) {
+			console_driver = vtty_console_driver(index);
+			/*
+			 * Reset fops; sometimes console_fops may have
+			 * been picked up from inode->i_cdev in
+			 * chrdev_open().
+			 */
+			filp->f_op = &tty_fops;
+		}
+#endif
 		if (console_driver) {
 			driver = tty_driver_kref_get(console_driver);
 			if (driver) {
@@ -2599,6 +2633,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
 	return ret;
 }
 
+/**
+ *	tiocgetd	-	get line discipline
+ *	@tty: tty device
+ *	@p: pointer to user data
+ *
+ *	Retrieves the line discipline id directly from the ldisc.
+ *
+ *	Locking: waits for ldisc reference (in case the line discipline
+ *		is changing or the tty is being hungup)
+ */
+
+static int tiocgetd(struct tty_struct *tty, int __user *p)
+{
+	struct tty_ldisc *ld;
+	int ret;
+
+	ld = tty_ldisc_ref_wait(tty);
+	ret = put_user(ld->ops->num, p);
+	tty_ldisc_deref(ld);
+	return ret;
+}
+
 /**
  *	send_break	-	performed time break
  *	@tty: device to break on
@@ -2813,7 +2869,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGSID:
 		return tiocgsid(tty, real_tty, p);
 	case TIOCGETD:
-		return put_user(tty->ldisc->ops->num, (int __user *)p);
+		return tiocgetd(tty, p);
 	case TIOCSETD:
 		return tiocsetd(tty, p);
 	case TIOCVHANGUP:
@@ -2869,6 +2925,11 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			break;
 		}
 		break;
+	case TIOSAK:
+		if (real_tty == tty && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		__do_SAK(real_tty);
+		return 0;
 	}
 	if (tty->ops->ioctl) {
 		retval = (tty->ops->ioctl)(tty, cmd, arg);
@@ -3038,7 +3099,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
 {
 	struct tty_struct *tty;
 
-	tty = kzalloc(sizeof(*tty), GFP_KERNEL);
+	tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT);
 	if (!tty)
 		return NULL;
 
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -42,6 +42,7 @@
 #include <linux/notifier.h>
 #include <linux/jiffies.h>
 #include <linux/uaccess.h>
+#include <linux/device.h>
 
 #include <asm/irq_regs.h>
 
@@ -1423,7 +1424,7 @@ static bool kbd_match(struct input_handler *handler, struct input_dev *dev)
  * likes it, it can open it and get events from it. In this (kbd_connect)
  * function, we should decide which VT to bind that keyboard to initially.
  */
-static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+static int __kbd_connect(struct input_handler *handler, struct input_dev *dev,
 			const struct input_device_id *id)
 {
 	struct input_handle *handle;
@@ -1454,13 +1455,82 @@ static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
 	return error;
 }
 
-static void kbd_disconnect(struct input_handle *handle)
+static void __kbd_disconnect(struct input_handle *handle)
 {
 	input_close_device(handle);
 	input_unregister_handle(handle);
 	kfree(handle);
 }
 
+extern struct mutex input_mutex;
+/*
+ * To unbind a keyboard, write "unbind" to kbd_bind.
+ * To bind a keyboard to all TTYs (the default), write "all" to kbd_bind.
+ * To bind a keyboard to a specified TTY... (not implemented)
+ */
+static ssize_t kbd_bind_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t len)
+{
+	struct list_head *node;
+	int ret = -EINVAL;
+	struct input_dev *idev;
+	char *s;
+
+	if (buf[len] != '\0')
+		return -EINVAL;
+
+/*	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+*/
+	s = strchr(buf, '\n');
+	if (s)
+		*s = '\0';
+
+	mutex_lock(&input_mutex);
+	if (!strcmp(buf, "unbind")) {
+		list_for_each(node, &kbd_handler.h_list) {
+			struct input_handle *handle = container_of(node,
+					struct input_handle, h_node);
+			idev = handle->dev;
+			if (&idev->dev == dev) {
+				__kbd_disconnect(handle);
+				ret = len;
+				break;
+			}
+		}
+	} else if (!strcmp(buf, "all")) {
+		idev = container_of(dev, struct input_dev, dev);
+		ret = __kbd_connect(&kbd_handler, idev, NULL);
+		if (!ret)
+			ret = len;
+	}
+	mutex_unlock(&input_mutex);
+
+	return ret;
+}
+
+static DEVICE_ATTR(kbd_bind, S_IWUSR, NULL, kbd_bind_store);
+
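+/* Wrap connect/disconnect so every keyboard gets a kbd_bind sysfs attribute. */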
+static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+			const struct input_device_id *id)
+{
+	int error;
+	error = device_create_file(&dev->dev, &dev_attr_kbd_bind);
+	if (error)
+		return error;
+	error = __kbd_connect(handler, dev, id);
+	if (error)
+		device_remove_file(&dev->dev, &dev_attr_kbd_bind);
+	return error;
+}
+
+static void kbd_disconnect(struct input_handle *handle)
+{
+	device_remove_file(&handle->dev->dev, &dev_attr_kbd_bind);
+	__kbd_disconnect(handle);
+}
+
 /*
  * Start keyboard handler on the new keyboard by refreshing LED state to
  * match the rest of the system.
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -32,12 +32,12 @@
 #include <linux/kbd_kern.h>
 #include <linux/console.h>
 #include <linux/device.h>
-#include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <linux/notifier.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -742,6 +742,8 @@ static void visual_init(struct vc_data *vc, int num, int init)
 	__module_get(vc->vc_sw->owner);
 	vc->vc_num = num;
 	vc->vc_display_fg = &master_display_fg;
+	if (vc->vc_uni_pagedir_loc)
+		con_free_unimap(vc);
 	vc->vc_uni_pagedir_loc = &vc->vc_uni_pagedir;
 	vc->vc_uni_pagedir = 0;
 	vc->vc_hi_font_mask = 0;
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -30,7 +30,7 @@
 
 #include "vhost.h"
 
-static int experimental_zcopytx;
+static int experimental_zcopytx = 1;
 module_param(experimental_zcopytx, int, 0444);
 MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 		                       " 1 -Enable; 0 - Disable");
@@ -862,7 +862,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	}
 	r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
-		kfree(n);
+		vhost_net_free(n);
 		kfree(vqs);
 		return r;
 	}
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -759,7 +759,8 @@ static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
 			goto out;
 		iov_iter_init(&t, (const struct iovec *)vq->iotlb_iov,
 			      ret, size, 0);
-		ret = memcpy_toiovecend(t.iov, (unsigned char *)from, 0, size);
+		ret = memcpy_toiovecend((struct iovec *)t.data,
+					(unsigned char *)from, 0, size);
 	}
 out:
 	return ret;
@@ -797,7 +798,8 @@ static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 		}
 		iov_iter_init(&f, (const struct iovec *)vq->iotlb_iov,
 			      ret, size, 0);
-		ret = memcpy_fromiovecend((unsigned char *)to, f.iov, 0, size);
+		ret = memcpy_fromiovecend((unsigned char *)to,
+					  (struct iovec *)f.data, 0, size);
 	}
 
 out:
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -168,7 +168,7 @@ static int xen_tmem_destroy_pool(u32 pool_id)
 
 /* cleancache ops */
 
-static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+static int tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
 				     pgoff_t index, struct page *page)
 {
 	u32 ind = (u32) index;
@@ -176,11 +176,11 @@ static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
 	unsigned long pfn = page_to_pfn(page);
 
 	if (pool < 0)
-		return;
+		return 0;
 	if (ind != index)
-		return;
+		return 0;
 	mb(); /* ensure page is quiescent; tmem may address it with an alias */
-	(void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+	return !xen_tmem_put_page((u32)pool, oid, ind, pfn);
 }
 
 static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
@@ -397,13 +397,15 @@ static int xen_tmem_init(void)
 #ifdef CONFIG_CLEANCACHE
 	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
 	if (tmem_enabled && cleancache) {
-		char *s = "";
-		struct cleancache_ops *old_ops =
-			cleancache_register_ops(&tmem_cleancache_ops);
-		if (old_ops)
-			s = " (WARNING: cleancache_ops overridden)";
-		pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n",
-			s);
+		int err;
+
+		err = cleancache_register_ops(&tmem_cleancache_ops);
+		if (err)
+			pr_warn("xen-tmem: failed to enable cleancache: %d\n",
+				err);
+		else
+			pr_info("cleancache enabled, RAM provided by "
+				"Xen Transcendent Memory\n");
 	}
 #endif
 #ifdef CONFIG_XEN_SELFBALLOONING
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -320,32 +320,26 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			retval = posix_acl_equiv_mode(acl, &mode);
-			if (retval < 0)
+			struct iattr iattr;
+
+			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+			if (retval)
 				goto err_out;
-			else {
-				struct iattr iattr;
-				if (retval == 0) {
-					/*
-					 * ACL can be represented
-					 * by the mode bits. So don't
-					 * update ACL.
-					 */
-					acl = NULL;
-					value = NULL;
-					size = 0;
-				}
-				/* Updte the mode bits */
-				iattr.ia_mode = ((mode & S_IALLUGO) |
-						 (inode->i_mode & ~S_IALLUGO));
-				iattr.ia_valid = ATTR_MODE;
-				/* FIXME should we update ctime ?
-				 * What is the following setxattr update the
-				 * mode ?
+			if (!acl) {
+				/*
+				 * ACL can be represented
+				 * by the mode bits. So don't
+				 * update ACL.
 				 */
-				v9fs_vfs_setattr_dotl(dentry, &iattr);
+				value = NULL;
+				size = 0;
 			}
+			iattr.ia_valid = ATTR_MODE;
+			/* FIXME should we update ctime ?
+			 * Will the following setxattr update the
+			 * mode ?
+			 */
+			v9fs_vfs_setattr_dotl(dentry, &iattr);
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -574,7 +574,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD),
+					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -482,7 +482,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	if (invalidate && (total > 0)) {
 		pg_start = origin >> PAGE_CACHE_SHIFT;
 		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
-		if (inode->i_mapping && inode->i_mapping->nrpages)
+		if (inode->i_mapping)
 			invalidate_inode_pages2_range(inode->i_mapping,
 						      pg_start, pg_end);
 		*offset += total;
@@ -688,7 +688,7 @@ v9fs_direct_write(struct file *filp, const char __user * data,
 	 * about to write.  We do this *before* the write so that if we fail
 	 * here we fall back to buffered write
 	 */
-	if (mapping->nrpages) {
+	{
 		pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
 		pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
 
@@ -735,7 +735,6 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -270,7 +270,7 @@ static int init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -138,7 +138,7 @@ static int init_inodecache(void)
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
 	afs_inode_cachep = kmem_cache_create("afs_inode_cache",
 					     sizeof(struct afs_vnode),
 					     0,
-					     SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					     afs_i_init_once);
 	if (!afs_inode_cachep) {
 		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
@@ -122,14 +123,9 @@ struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+	struct ve_struct	*ve;
 };
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
@@ -526,6 +522,9 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
 static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	struct ve_struct *ve = ctx->ve;
+
+	put_ve(ve);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -602,8 +601,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
+	struct ve_struct *ve = get_exec_env();
 	int err = -ENOMEM;
 
+	/* The kernel has done this since e1bdd5f27a5b, and CRIU is tuned for that */
+	nr_events *= 2;
+
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -611,7 +614,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
+	if (!nr_events || (unsigned long)nr_events > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -619,6 +622,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = nr_events;
+	ctx->ve = get_ve(ve);
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -639,14 +643,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > aio_max_nr ||
-	    aio_nr + nr_events < aio_nr) {
-		spin_unlock(&aio_nr_lock);
+	spin_lock(&ve->aio_nr_lock);
+	if (ve->aio_nr + ctx->nr_events > ve->aio_max_nr ||
+	    ve->aio_nr + ctx->nr_events < ve->aio_nr) {
+		spin_unlock(&ve->aio_nr_lock);
 		goto out_cleanup;
 	}
-	aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	ve->aio_nr += ctx->nr_events;
+	spin_unlock(&ve->aio_nr_lock);
 
 	/* now link into global list. */
 	spin_lock(&mm->ioctx_lock);
@@ -667,6 +671,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 out_freectx:
+	put_ve(ctx->ve);
 	mutex_unlock(&ctx->ring_lock);
 	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
@@ -699,6 +704,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		struct completion *requests_done)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
+		struct ve_struct *ve = ctx->ve;
+
 		spin_lock(&mm->ioctx_lock);
 		hlist_del_rcu(&ctx->list);
 		spin_unlock(&mm->ioctx_lock);
@@ -710,10 +717,10 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		 * -EAGAIN with no ioctxs actually in use (as far as userspace
 		 *  could tell).
 		 */
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-		aio_nr -= ctx->max_reqs;
-		spin_unlock(&aio_nr_lock);
+		spin_lock(&ve->aio_nr_lock);
+		BUG_ON(ve->aio_nr - ctx->nr_events > ve->aio_nr);
+		ve->aio_nr -= ctx->nr_events;
+		spin_unlock(&ve->aio_nr_lock);
 
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -940,6 +947,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 		atomic_set(&iocb->ki_users, 0);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
+	} else if (is_kernel_kiocb(iocb)) {
+		iocb->ki_obj.complete(iocb->ki_user_data, res);
+		aio_kernel_free(iocb);
+		return;
 	}
 
 	/*
@@ -1381,6 +1392,51 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
 	return 0;
 }
 
+static ssize_t aio_read_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	if (unlikely(!is_kernel_kiocb(iocb)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	ret = security_file_permission(file, MAY_READ);
+	if (unlikely(ret))
+		return ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+}
+
+static ssize_t aio_write_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	if (unlikely(!is_kernel_kiocb(iocb)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	ret = security_file_permission(file, MAY_WRITE);
+	if (unlikely(ret))
+		return ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	file_start_write(file);
+	ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+	file_end_write(file);
+	return ret;
+}
+
 /*
  * aio_setup_iocb:
  *	Performs the initial checks and aio retry method
@@ -1432,6 +1488,14 @@ static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
 		ret = aio_rw_vect_retry(req, rw, rw_op);
 		break;
 
+	case IOCB_CMD_READ_ITER:
+		ret = aio_read_iter(req);
+		break;
+
+	case IOCB_CMD_WRITE_ITER:
+		ret = aio_write_iter(req);
+		break;
+
 	case IOCB_CMD_FDSYNC:
 		if (!file->f_op->aio_fsync)
 			return -EINVAL;
@@ -1466,6 +1530,89 @@ static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
 	return 0;
 }
 
+/*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit().  From that point forward progress is
+ * guaranteed by the file system aio method.  Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special.  They don't have a context, we don't limit the
+ * number pending, they can't be canceled, and can't be retried.  In the short
+ * term callers need to be careful not to call operations which might retry by
+ * only calling new ops which never add retry support.  In the long term
+ * retry-based AIO should be removed.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+	struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp);
+	if (iocb)
+		iocb->ki_ctx = (void *)-1;
+	return iocb;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+	kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * The iter count must be set before calling here.  Some filesystems use
+ * iocb->ki_left as an indicator of the size of an IO.
+ */
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off)
+{
+	iocb->ki_filp = filp;
+	iocb->ki_iter = iter;
+	iocb->ki_opcode = op;
+	iocb->ki_pos = off;
+	iocb->ki_nbytes = iov_iter_count(iter);
+	iocb->ki_left = iocb->ki_nbytes;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_iter);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data)
+{
+	iocb->ki_obj.complete = complete;
+	iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called.  The caller must not
+ * reference it.  This comes from aio_setup_iocb() modifying the iocb.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function.  The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+	int ret;
+
+	BUG_ON(!is_kernel_kiocb(iocb));
+	BUG_ON(!iocb->ki_obj.complete);
+	BUG_ON(!iocb->ki_filp);
+
+	ret = aio_run_iocb(iocb, 0);
+
+	if (ret)
+		aio_kernel_free(iocb);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
@@ -1711,3 +1858,73 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	}
 	return ret;
 }
+
+#ifdef CONFIG_VE
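+/* True while @ctx still has submitted requests that have not completed. */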
+static bool has_reqs_active(struct kioctx *ctx)
+{
+	unsigned long flags;
+	unsigned nr;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	nr = atomic_read(&ctx->reqs_active);
+	nr -= ctx->completed_events;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return !!nr;
+}
+
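+/*
+ * Wait until all in-flight AIO requests of @p's mm have completed
+ * (the VE_AIO_IOC_WAIT_ACTIVE ioctl).
+ */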
+static int ve_aio_wait_inflight_reqs(struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct kioctx *ctx;
+	int ret;
+
+	if (p->flags & PF_KTHREAD)
+		return -EINVAL;
+
+	task_lock(p);
+	mm = p->mm;
+	if (mm)
+		atomic_inc(&mm->mm_count);
+	task_unlock(p);
+	if (!mm)
+		return -ESRCH;
+
+again:
+	spin_lock_irq(&mm->ioctx_lock);
+	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
+		if (!has_reqs_active(ctx))
+			continue;
+
+		atomic_inc(&ctx->users);
+		spin_unlock_irq(&mm->ioctx_lock);
+
+		ret = wait_event_interruptible(ctx->wait, !has_reqs_active(ctx));
+		put_ioctx(ctx);
+
+		if (ret)
+			goto mmdrop;
+		goto again;
+	}
+	spin_unlock_irq(&mm->ioctx_lock);
+	ret = 0;
+mmdrop:
+	mmdrop(mm);
+	return ret;
+}
+
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	switch (cmd) {
+		case VE_AIO_IOC_WAIT_ACTIVE:
+			ret = ve_aio_wait_inflight_reqs(task);
+			break;
+		default:
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+#endif
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -628,7 +628,7 @@ static int _autofs_dev_ioctl(unsigned int command,
 	int err = 0;
 
 	/* only root can play with this */
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -21,6 +21,7 @@ static struct file_system_type autofs_fs_type = {
 	.name		= "autofs",
 	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("autofs");
 
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -75,6 +75,10 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		return 0;
 
 	seq_printf(m, ",fd=%d", sbi->pipefd);
+	if (sbi->pipe)
+		seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+	else
+		seq_printf(m, ",pipe_ino=-1");
 	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
 		seq_printf(m, ",uid=%u",
 			from_kuid_munged(&init_user_ns, root_inode->i_uid));
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -624,7 +624,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *p_ino;
 
 	/* This allows root to remove symlinks */
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (atomic_dec_and_test(&ino->count)) {
@@ -883,7 +883,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -121,7 +121,7 @@ static unsigned long bad_file_get_unmapped_area(struct file *file,
 	return -EIO;
 }
 
-static int bad_file_check_flags(int flags)
+static int bad_file_set_flags(struct file *file, int flags)
 {
 	return -EIO;
 }
@@ -166,7 +166,7 @@ static const struct file_operations bad_file_ops =
 	.lock		= bad_file_lock,
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
-	.check_flags	= bad_file_check_flags,
+	.set_flags	= bad_file_set_flags,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -439,7 +439,7 @@ befs_init_inodecache(void)
 	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
 					      sizeof (struct befs_inode_info),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -271,7 +271,7 @@ static int init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -296,12 +296,12 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
 		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
 		{
-			printk(KERN_NOTICE "executable not page aligned\n");
+			ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
 		}
 
 		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING
 			       "fd_offset is not page aligned. Please convert program: %s\n",
 			       bprm->file->f_path.dentry->d_name.name);
 		}
@@ -390,7 +390,7 @@ static int load_aout_library(struct file *file)
 	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
 		if (printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING
 			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
 			       file->f_path.dentry->d_name.name);
 		}
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -34,8 +34,8 @@
 #include <linux/elf-randomize.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
-#include <linux/sched.h>
 #include <linux/dax.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1539,20 +1539,12 @@ static void do_thread_regset_writeback(struct task_struct *task,
 		regset->writeback(task, regset, 1);
 }
 
-#ifndef PR_REG_SIZE
-#define PR_REG_SIZE(S) sizeof(S)
-#endif
-
 #ifndef PRSTATUS_SIZE
-#define PRSTATUS_SIZE(S) sizeof(S)
-#endif
-
-#ifndef PR_REG_PTR
-#define PR_REG_PTR(S) (&((S)->pr_reg))
+#define PRSTATUS_SIZE(S, R) sizeof(S)
 #endif
 
 #ifndef SET_PR_FPVALID
-#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
 #endif
 
 static int fill_thread_core_info(struct elf_thread_core_info *t,
@@ -1560,6 +1552,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 				 long signr, size_t *total)
 {
 	unsigned int i;
+	unsigned int regset_size = view->regsets[0].n * view->regsets[0].size;
 
 	/*
 	 * NT_PRSTATUS is the one special case, because the regset data
@@ -1568,12 +1561,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
 	fill_prstatus(&t->prstatus, t->task, signr);
-	(void) view->regsets[0].get(t->task, &view->regsets[0],
-				    0, PR_REG_SIZE(t->prstatus.pr_reg),
-				    PR_REG_PTR(&t->prstatus), NULL);
+	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size,
+				    &t->prstatus.pr_reg, NULL);
 
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
-		  PRSTATUS_SIZE(t->prstatus), &t->prstatus);
+		  PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus);
 	*total += notesize(&t->notes[0]);
 
 	do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1603,7 +1595,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 						  regset->core_note_type,
 						  size, data);
 				else {
-					SET_PR_FPVALID(&t->prstatus, 1);
+					SET_PR_FPVALID(&t->prstatus,
+							1, regset_size);
 					fill_note(&t->notes[i], "CORE",
 						  NT_PRFPREG, size, data);
 				}
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -18,7 +18,6 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
 #include <linux/magic.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
@@ -30,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -37,9 +37,6 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
-static LIST_HEAD(entries);
-static int enabled = 1;
-
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1<<31)
 #define MISC_FMT_OPEN_BINARY (1<<30)
@@ -57,22 +54,30 @@ typedef struct {
 	struct dentry *dentry;
 } Node;
 
-static DEFINE_RWLOCK(entries_lock);
 static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
+
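+/*
+ * Per-VE binfmt_misc state; replaces the former file-scope
+ * entries/enabled/bm_mnt/entry_count globals.
+ */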
+struct binfmt_misc {
+	struct list_head entries;
+	int enabled;
+
+	rwlock_t entries_lock;
+	struct vfsmount *bm_mnt;
+	int entry_count;
+};
+
+#define BINFMT_MISC(sb)		(((struct ve_struct *)(sb)->s_fs_info)->binfmt_misc)
 
 /* 
  * Check if we support the binfmt
  * if we do, return the node, else NULL
  * locking is done in load_misc_binary
  */
-static Node *check_file(struct linux_binprm *bprm)
+static Node *check_file(struct binfmt_misc *bm_data, struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
 	struct list_head *l;
 
-	list_for_each(l, &entries) {
+	list_for_each(l, &bm_data->entries) {
 		Node *e = list_entry(l, Node, list);
 		char *s;
 		int j;
@@ -113,17 +118,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
+	struct binfmt_misc *bm_data = get_exec_env()->binfmt_misc;
 
 	retval = -ENOEXEC;
-	if (!enabled)
+	if (!bm_data || !bm_data->enabled)
 		goto _ret;
 
 	/* to keep locking time low, we copy the interpreter string */
-	read_lock(&entries_lock);
-	fmt = check_file(bprm);
+	read_lock(&bm_data->entries_lock);
+	fmt = check_file(bm_data, bprm);
 	if (fmt)
 		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
-	read_unlock(&entries_lock);
+	read_unlock(&bm_data->entries_lock);
 	if (!fmt)
 		goto _ret;
 
@@ -490,23 +496,23 @@ static void bm_evict_inode(struct inode *inode)
 	kfree(inode->i_private);
 }
 
-static void kill_node(Node *e)
+static void kill_node(struct binfmt_misc *bm_data, Node *e)
 {
 	struct dentry *dentry;
 
-	write_lock(&entries_lock);
+	write_lock(&bm_data->entries_lock);
 	dentry = e->dentry;
 	if (dentry) {
 		list_del_init(&e->list);
 		e->dentry = NULL;
 	}
-	write_unlock(&entries_lock);
+	write_unlock(&bm_data->entries_lock);
 
 	if (dentry) {
 		drop_nlink(dentry->d_inode);
 		d_drop(dentry);
 		dput(dentry);
-		simple_release_fs(&bm_mnt, &entry_count);
+		simple_release_fs(&bm_data->bm_mnt, &bm_data->entry_count);
 	}
 }
 
@@ -536,16 +542,18 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 	struct dentry *root;
 	Node *e = file_inode(file)->i_private;
 	int res = parse_command(buffer, count);
+	struct super_block *sb = file->f_path.dentry->d_sb;
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
 
 	switch (res) {
 		case 1: clear_bit(Enabled, &e->flags);
 			break;
 		case 2: set_bit(Enabled, &e->flags);
 			break;
-		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
+		case 3: root = dget(sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
-			kill_node(e);
+			kill_node(bm_data, e);
 
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
@@ -570,6 +578,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	struct inode *inode;
 	struct dentry *root, *dentry;
 	struct super_block *sb = file->f_path.dentry->d_sb;
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
 	int err = 0;
 
 	e = create_entry(buffer, count);
@@ -594,7 +603,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_data->bm_mnt, &bm_data->entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
@@ -606,9 +615,9 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
-	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
-	write_unlock(&entries_lock);
+	write_lock(&bm_data->entries_lock);
+	list_add(&e->list, &bm_data->entries);
+	write_unlock(&bm_data->entries_lock);
 
 	err = 0;
 out2:
@@ -634,7 +643,8 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct binfmt_misc *bm_data = BINFMT_MISC(file->f_dentry->d_sb);
+	char *s = bm_data->enabled ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
@@ -642,17 +652,19 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 		size_t count, loff_t *ppos)
 {
+	struct binfmt_misc *bm_data = BINFMT_MISC(file->f_dentry->d_sb);
 	int res = parse_command(buffer, count);
 	struct dentry *root;
 
 	switch (res) {
-		case 1: enabled = 0; break;
-		case 2: enabled = 1; break;
+		case 1: bm_data->enabled = 0; break;
+		case 2: bm_data->enabled = 1; break;
 		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
-			while (!list_empty(&entries))
-				kill_node(list_entry(entries.next, Node, list));
+			while (!list_empty(&bm_data->entries))
+				kill_node(bm_data, list_first_entry(
+					&bm_data->entries, Node, list));
 
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
@@ -669,9 +681,19 @@ static const struct file_operations bm_status_operations = {
 
 /* Superblock handling */
 
+static void bm_put_super(struct super_block *sb)
+{
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
+	struct ve_struct *ve = sb->s_fs_info;
+
+	bm_data->enabled = 0;
+	put_ve(ve);
+}
+
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
 	.evict_inode	= bm_evict_inode,
+	.put_super	= bm_put_super,
 };
 
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -681,16 +703,41 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
-	int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
-	if (!err)
-		sb->s_op = &s_ops;
-	return err;
+	struct ve_struct *ve = data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+	int err;
+
+	if (!bm_data) {
+		bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!bm_data)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&bm_data->entries);
+		rwlock_init(&bm_data->entries_lock);
+
+		ve->binfmt_misc = bm_data;
+	}
+
+	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
+	if (err)
+		return err;
+
+	sb->s_op = &s_ops;
+
+	bm_data->enabled = 1;
+	get_ve(ve);
+
+	return 0;
 }
 
 static struct dentry *bm_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_single(fs_type, flags, data, bm_fill_super);
+	if (!current_user_ns_initial() && !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	return mount_ns(fs_type, flags, get_exec_env(), bm_fill_super);
 }
 
 static struct linux_binfmt misc_format = {
@@ -703,19 +750,46 @@ static struct file_system_type bm_fs_type = {
 	.name		= "binfmt_misc",
 	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
+	.fs_flags	= FS_VIRTUALIZED | FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("binfmt_misc");
 
+static void ve_binfmt_fini(void *data)
+{
+	struct ve_struct *ve = data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+
+	if (!bm_data)
+		return;
+
+	/*
+	 * XXX: Note we don't take any locks here. This is safe as long as
+	 * nobody uses binfmt_misc outside the owner ve.
+	 */
+	while (!list_empty(&bm_data->entries))
+		kill_node(bm_data, list_first_entry(
+			&bm_data->entries, Node, list));
+}
+
+static struct ve_hook ve_binfmt_hook = {
+	.fini		= ve_binfmt_fini,
+	.priority	= HOOK_PRIO_DEFAULT,
+	.owner		= THIS_MODULE,
+};
+
 static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
-	if (!err)
+	if (!err) {
 		insert_binfmt(&misc_format);
+		ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook);
+	}
 	return err;
 }
 
 static void __exit exit_misc_binfmt(void)
 {
+	ve_hook_unregister(&ve_binfmt_hook);
 	unregister_binfmt(&misc_format);
 	unregister_filesystem(&bm_fs_type);
 }
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1347,6 +1347,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 		offset = uaddr & ~PAGE_MASK;
 		for (j = cur_page; j < page_limit; j++) {
 			unsigned int bytes = PAGE_SIZE - offset;
+			unsigned short prev_bi_vcnt = bio->bi_vcnt;
 
 			if (len <= 0)
 				break;
@@ -1361,6 +1362,13 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 					    bytes)
 				break;
 
+			/*
+			 * check whether this vector was merged with the
+			 * previous one; if so, drop the extra page reference
+			 */
+			if (bio->bi_vcnt == prev_bi_vcnt)
+				put_page(pages[j]);
+
 			len -= bytes;
 			offset = 0;
 		}
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -107,12 +107,12 @@ void invalidate_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0)
-		return;
-
-	invalidate_bh_lrus();
-	lru_add_drain_all();	/* make sure all lru add caches are flushed */
-	invalidate_mapping_pages(mapping, 0, -1);
+	/* FIXME: Shouldn't we add '|| mapping->nrexceptional' ? */
+	if (mapping->nrpages) {
+		invalidate_bh_lrus();
+		lru_add_drain_all();	/* make sure all lru add caches are flushed */
+		invalidate_mapping_pages(mapping, 0, -1);
+	}
 	/* 99% of the time, we don't need to flush the cleancache on the bdev.
 	 * But, for the strange corners, lets be cautious
 	 */
@@ -256,7 +256,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		 * thaw_bdev drops it.
 		 */
 		sb = get_super(bdev);
-		drop_super(sb);
+		if (sb)
+			drop_super(sb);
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return sb;
 	}
@@ -670,7 +671,7 @@ void __init bdev_cache_init(void)
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-				SLAB_MEM_SPREAD|SLAB_PANIC),
+				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
 			init_once);
 	err = register_filesystem(&bd_type);
 	if (err)
@@ -1226,12 +1227,19 @@ int check_disk_change(struct block_device *bdev)
 
 EXPORT_SYMBOL(check_disk_change);
 
+void bd_write_size(struct block_device *bdev, loff_t size)
+{
+	i_size_write(bdev->bd_inode, size);
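+	/* keep the changed block tracking (CBT) data in sync with the new size */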
+	blk_cbt_update_size(bdev);
+}
+EXPORT_SYMBOL(bd_write_size);
+
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
 	mutex_lock(&bdev->bd_inode->i_mutex);
-	i_size_write(bdev->bd_inode, size);
+	bd_write_size(bdev, size);
 	mutex_unlock(&bdev->bd_inode->i_mutex);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
@@ -1264,6 +1272,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		perm |= MAY_READ;
 	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
+	if (mode & FMODE_MOUNT)
+		perm |= MAY_MOUNT;
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2191,7 +2191,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 static const struct vm_operations_struct btrfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= btrfs_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9439,7 +9439,8 @@ int btrfs_init_cachep(void)
 {
 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+			init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -612,6 +612,11 @@ static void __set_page_dirty(struct page *page,
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
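+		/* account this dirtying to the beancounter I/O statistics */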
+		if (mapping_cap_account_dirty(mapping) &&
+				!radix_tree_prev_tag_get(
+					&mapping->page_tree,
+					PAGECACHE_TAG_DIRTY))
+			ub_io_account_dirty(mapping);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -2945,6 +2950,7 @@ void guard_bio_eod(int rw, struct bio *bio)
 
 	/* Truncate the bio.. */
 	bio->bi_size -= truncated_bytes;
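+	/* truncation must fit in the last bvec, or bv_len would wrap */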
+	BUG_ON(truncated_bytes > bvec->bv_len);
 	bvec->bv_len -= truncated_bytes;
 
 	/* ..and clear the end of the buffer for reads */
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1648,7 +1648,6 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 static struct vm_operations_struct ceph_vmops = {
 	.fault		= ceph_filemap_fault,
 	.page_mkwrite	= ceph_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -40,8 +40,8 @@
  */
 static size_t dio_get_pagev_size(const struct iov_iter *it)
 {
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
+    const struct iovec *iov = iov_iter_iovec(it);
+    size_t total = iov_iter_count(it);
     size_t size;
 
     size = iov->iov_len - it->iov_offset;
@@ -50,8 +50,10 @@ static size_t dio_get_pagev_size(const struct iov_iter *it)
      * and the next base are page aligned.
      */
     while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
+           PAGE_ALIGNED(((iov++)->iov_base))) {
+	    size_t n =  min(iov->iov_len, total);
+	    size += n;
+	    total -= n;
     }
     dout("dio_get_pagevlen len = %zu\n", size);
     return size;
@@ -71,7 +73,7 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
 	struct page **pages;
 	int ret = 0, idx, npages;
 
-	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+	align = (unsigned long)(iov_iter_iovec(it)->iov_base + it->iov_offset) &
 		(PAGE_SIZE - 1);
 	npages = calc_pages_for(align, nbytes);
 	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
@@ -82,10 +84,11 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
 	}
 
 	for (idx = 0; idx < npages; ) {
-		void __user *data = tmp_it.iov->iov_base + tmp_it.iov_offset;
+		struct iovec *tmp_iov = iov_iter_iovec(&tmp_it);
+		void __user *data = tmp_iov->iov_base + tmp_it.iov_offset;
 		size_t off = (unsigned long)data & (PAGE_SIZE - 1);
 		size_t len = min_t(size_t, nbytes,
-				   tmp_it.iov->iov_len - tmp_it.iov_offset);
+				   tmp_iov->iov_len - tmp_it.iov_offset);
 		int n = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		ret = get_user_pages_fast((unsigned long)data, n, write,
 					   pages + idx);
@@ -522,10 +525,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 		size_t left = len = ret;
 
 		while (left) {
-			void __user *data = i->iov[0].iov_base +
-					    i->iov_offset;
-			l = min(i->iov[0].iov_len - i->iov_offset,
-				left);
+			struct iovec *iov = (struct iovec *)i->data;
+			void __user *data = iov->iov_base + i->iov_offset;
+			l = min(iov->iov_len - i->iov_offset, left);
 
 			ret = ceph_copy_page_vector_to_user(&pages[k],
 							    data, off, l);
@@ -1120,8 +1122,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
 			zero_user_segment(inline_page, inline_len, end);
 
 		while (left) {
-			void __user *udata = i->iov->iov_base + i->iov_offset;
-			size_t n = min(i->iov->iov_len - i->iov_offset, left);
+			struct iovec *iov = iov_iter_iovec(i);
+			void __user *udata = iov->iov_base;
+			size_t n = min(iov->iov_len - i->iov_offset, left);
 
 			if (__copy_to_user(udata, kdata, n)) {
 				ret = -EFAULT;
@@ -1138,8 +1141,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
 		size_t left = min_t(loff_t, iocb->ki_pos + len, i_size) - pos;
 
 		while (left) {
-			void __user *udata = i->iov->iov_base + i->iov_offset;
-			size_t n = min(i->iov->iov_len - i->iov_offset, left);
+			struct iovec *iov = (struct iovec *)i->data;
+			void __user *udata = iov->iov_base;
+			size_t n = min(iov->iov_len - i->iov_offset, left);
 
 			if (__clear_user(udata, n)) {
 				ret = -EFAULT;
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -634,8 +634,8 @@ static int __init init_caches(void)
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 				      sizeof(struct ceph_inode_info),
 				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				      SLAB_ACCOUNT, ceph_inode_init_once);
 	if (ceph_inode_cachep == NULL)
 		return -ENOMEM;
 
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -21,6 +21,7 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/tty.h>
+#include <linux/device_cgroup.h>
 
 #include "internal.h"
 
@@ -72,8 +73,12 @@ void chrdev_show(struct seq_file *f, off_t offset)
 
 	if (offset < CHRDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&chrdevs_lock);
-		for (cd = chrdevs[offset]; cd; cd = cd->next)
+		for (cd = chrdevs[offset]; cd; cd = cd->next) {
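+			/* hide devices the caller's device cgroup cannot access */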
+			if (!devcgroup_device_visible(S_IFCHR, cd->major,
+						cd->baseminor, cd->minorct))
+				continue;
 			seq_printf(f, "%3d %s\n", cd->major, cd->name);
+		}
 		mutex_unlock(&chrdevs_lock);
 	}
 }
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1074,7 +1074,7 @@ cifs_init_inodecache(void)
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
 					      sizeof(struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2497,8 +2497,9 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
 	save_len = cur_len;
 	for (i = 0; i < nr_pages; i++) {
 		bytes = min_t(const size_t, cur_len, PAGE_SIZE);
-		copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+		copied = iov_iter_copy_from_user(wdata->pages[i], from, 0, bytes);
 		cur_len -= copied;
+		iov_iter_advance(from, copied);
 		/*
 		 * If we didn't copy as much as we expected, then that
 		 * may mean we trod into an unmapped area. Stop copying
@@ -2915,8 +2916,10 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 	for (i = 0; i < rdata->nr_pages; i++) {
 		struct page *page = rdata->pages[i];
 		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
-		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		size_t written = iov_iter_copy_to_user(page, iter, 0, copy);
+
 		remaining -= written;
+		iov_iter_advance(iter, written);
 		if (written < copy && iov_iter_count(iter) > 0)
 			break;
 	}
@@ -3315,7 +3318,6 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static struct vm_operations_struct cifs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = cifs_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -22,7 +22,6 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/freezer.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -1874,7 +1873,7 @@ cifs_invalidate_mapping(struct inode *inode)
 static int
 cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -76,9 +76,9 @@ static void init_once(void *foo)
 int coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
-				sizeof(struct coda_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct coda_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -48,6 +48,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/aio.h>
+#include <linux/device_cgroup.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -68,6 +69,18 @@ int compat_printk(const char *fmt, ...)
 	return ret;
 }
 
+int ve_compat_printk(int dst, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = ve_vprintk(dst, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -333,9 +346,16 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
  */
 asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(kdev, &sbuf);
 	if (err)
 		return err;
 
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1,6 +1,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/freezer.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
@@ -32,6 +33,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -45,7 +47,6 @@
 #include <trace/events/sched.h>
 
 int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 
 struct core_name {
@@ -152,7 +153,7 @@ static int cn_print_exe_file(struct core_name *cn)
 static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 {
 	const struct cred *cred = current_cred();
-	const char *pat_ptr = core_pattern;
+	const char *pat_ptr = get_exec_env()->core_pattern;
 	int ispipe = (*pat_ptr == '|');
 	int pid_in_pattern = 0;
 	int err = 0;
@@ -388,7 +389,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	if (core_waiters > 0) {
 		struct core_thread *ptr;
 
+		freezer_do_not_count();
 		wait_for_completion(&core_state->startup);
+		freezer_count();
 		/*
 		 * Wait for all the threads to become inactive, so that
 		 * all the thread context (extended register state, like
@@ -530,6 +533,9 @@ void do_coredump(siginfo_t *siginfo)
 	if (!__get_dumpable(cprm.mm_flags))
 		goto fail;
 
+	/* Avoid dumping sensitive tasks */
+	if (mm->vps_dumpable != VD_PTRACE_COREDUMP)
+		goto fail;
 	cred = prepare_creds();
 	if (!cred)
 		goto fail;
@@ -557,7 +563,6 @@ void do_coredump(siginfo_t *siginfo)
 	if (ispipe) {
 		int dump_count;
 		char **helper_argv;
-		struct subprocess_info *sub_info;
 
 		if (ispipe < 0) {
 			printk(KERN_WARNING "format_corename failed\n");
@@ -605,12 +610,9 @@ void do_coredump(siginfo_t *siginfo)
 		}
 
 		retval = -ENOMEM;
-		sub_info = call_usermodehelper_setup(helper_argv[0],
-						helper_argv, NULL, GFP_KERNEL,
-						umh_pipe_setup, NULL, &cprm);
-		if (sub_info)
-			retval = call_usermodehelper_exec(sub_info,
-							  UMH_WAIT_EXEC);
+		retval = call_usermodehelper_fns_ve(get_exec_env(), helper_argv[0],
+		                                    helper_argv, NULL, UMH_WAIT_EXEC,
+						    umh_pipe_setup, NULL, &cprm);
 
 		argv_free(helper_argv);
 		if (retval) {
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -643,7 +643,6 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		       RADIX_DAX_ENTRY_LOCK);
 	if (hole_fill) {
 		__delete_from_page_cache(entry, NULL);
-		mem_cgroup_uncharge_page(entry);
 		/* Drop pagecache reference */
 		page_cache_release(entry);
 		error = radix_tree_insert(page_tree, index, new_entry);
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,11 +35,16 @@
 #include <linux/hardirq.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rculist_bl.h>
+#include <linux/kasan.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <linux/vzstat.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include "mount.h"
 
+
 /*
  * Usage:
  * dcache->d_inode->i_lock protects:
@@ -48,7 +53,7 @@
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
  *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
  *   - d_flags
@@ -63,7 +68,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
  *     s_anon lock
  *
@@ -81,7 +86,8 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
+int sysctl_vfs_cache_min_ratio __read_mostly = 2;
+
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
@@ -117,23 +123,47 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -265,22 +295,38 @@ static void __d_free(struct rcu_head *head)
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
+static void dentry_free(struct dentry *dentry)
+{
+	struct rcu_head *p = (struct rcu_head *)&dentry->d_alias;
+
+	/* if dentry was never visible to RCU, immediate free is OK */
+	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+		__d_free(p);
+	else
+		call_rcu(p, __d_free);
+}
+
 /*
  * no locks, please.
  */
 static void d_free(struct dentry *dentry)
 {
-	struct rcu_head *p = (struct rcu_head *)&dentry->d_alias;
+	bool can_free = true;
+
 	BUG_ON((int)dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
-	/* if dentry was never visible to RCU, immediate free is OK */
-	if (!(dentry->d_flags & DCACHE_RCUACCESS))
-		__d_free(p);
-	else
-		call_rcu(p, __d_free);
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		dentry->d_flags |= DCACHE_MAY_FREE;
+		can_free = false;
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (likely(can_free))
+		dentry_free(dentry);
 }
 
 /**
@@ -347,86 +393,81 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
-		spin_lock(&dcache_lru_lock);
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	list_del_init(&dentry->d_lru);
 	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
-	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called under the global LRU lock, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 {
-	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_lru_isolate(lru, &dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
-	if (list_empty(&dentry->d_lru)) {
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add_tail(&dentry->d_lru, list);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-	} else {
-		list_move_tail(&dentry->d_lru, list);
-	}
-	spin_unlock(&dcache_lru_lock);
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
-/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
+/*
+ * dentry_lru_(add|del) must be called with d_lock held.
  */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
-	__releases(dentry->d_lock)
-	__releases(parent->d_lock)
-	__releases(dentry->d_inode->i_lock)
+static void dentry_lru_add(struct dentry *dentry)
 {
-	__list_del_entry(&dentry->d_u.d_child);
-	/*
-	 * Inform d_walk() that we are no longer attached to the
-	 * dentry tree
-	 */
-	dentry->d_flags |= DCACHE_DENTRY_KILLED;
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	dentry_iput(dentry);
-	/*
-	 * dentry_iput drops the locks, at which point nobody (except
-	 * transient RCU lookups) can reach this dentry.
-	 */
-	d_free(dentry);
-	return parent;
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
 }
 
 /*
@@ -482,34 +523,12 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static inline struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
 {
-	struct inode *inode;
-	struct dentry *parent;
+	struct dentry *parent = NULL;
 
-	inode = dentry->d_inode;
-	if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
-		return dentry; /* try again with same dentry */
-	}
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
+	if (!IS_ROOT(dentry))
 		parent = dentry->d_parent;
-	if (parent && !spin_trylock(&parent->d_lock)) {
-		if (inode)
-			spin_unlock(&inode->i_lock);
-		goto relock;
-	}
 
 	/*
 	 * The dentry is now unrecoverably dead to the world.
@@ -523,10 +542,91 @@ static inline struct dentry *dentry_kill(struct dentry *dentry)
 	if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
 		dentry->d_op->d_prune(dentry);
 
-	dentry_lru_del(dentry);
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+			d_lru_del(dentry);
+	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	return d_kill(dentry, parent);
+	__list_del_entry(&dentry->d_u.d_child);
+	/*
+	 * Inform d_walk() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	dentry_iput(dentry);
+	/*
+	 * dentry_iput drops the locks, at which point nobody (except
+	 * transient RCU lookups) can reach this dentry.
+	 */
+	d_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+	__releases(dentry->d_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto failed;
+
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		if (unlikely(!spin_trylock(&parent->d_lock))) {
+			if (inode)
+				spin_unlock(&inode->i_lock);
+			goto failed;
+		}
+	}
+
+	__dentry_kill(dentry);
+	return parent;
+
+failed:
+	spin_unlock(&dentry->d_lock);
+	cpu_relax();
+	return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		return NULL;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return parent;
+	spin_unlock(&dentry->d_lock);
+	rcu_read_lock();
+again:
+	parent = ACCESS_ONCE(dentry->d_parent);
+	spin_lock(&parent->d_lock);
+	/*
+	 * We can't blindly lock dentry until we are sure
+	 * that we won't violate the locking order.
+	 * Any changes of dentry->d_parent must have
+	 * been done with parent->d_lock held, so
+	 * spin_lock() above is enough of a barrier
+	 * for checking if it's still our child.
+	 */
+	if (unlikely(parent != dentry->d_parent)) {
+		spin_unlock(&parent->d_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+	if (parent != dentry)
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	else
+		parent = NULL;
+	return parent;
 }
 
 /* 
@@ -739,129 +839,190 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static void try_prune_one_dentry(struct dentry *dentry)
-	__releases(dentry->d_lock)
+static void shrink_dentry_list(struct list_head *list)
 {
-	struct dentry *parent;
+	struct dentry *dentry, *parent;
 
-	parent = dentry_kill(dentry);
-	/*
-	 * If dentry_kill returns NULL, we have nothing more to do.
-	 * if it returns the same dentry, trylocks failed. In either
-	 * case, just loop again.
-	 *
-	 * Otherwise, we need to prune ancestors too. This is necessary
-	 * to prevent quadratic behavior of shrink_dcache_parent(), but
-	 * is also expected to be beneficial in reducing dentry cache
-	 * fragmentation.
-	 */
-	if (!parent)
-		return;
-	if (parent == dentry)
-		return;
+	while (!list_empty(list)) {
+		struct inode *inode;
+		dentry = list_entry(list->prev, struct dentry, d_lru);
+		spin_lock(&dentry->d_lock);
+		parent = lock_parent(dentry);
 
-	/* Prune ancestors. */
-	dentry = parent;
-	while (dentry) {
-		if (lockref_put_or_lock(&dentry->d_lockref))
-			return;
-		dentry = dentry_kill(dentry);
-	}
-}
+		/*
+		 * The dispose list is isolated and dentries are not accounted
+		 * to the LRU here, so we can simply remove it from the list
+		 * here regardless of whether it is referenced or not.
+		 */
+		d_shrink_del(dentry);
 
-static void shrink_dentry_list(struct list_head *list)
-{
-	struct dentry *dentry;
+		/*
+		 * We found an inuse dentry which was not removed from
+		 * the LRU because of laziness during lookup. Do not free it.
+		 */
+		if ((int)dentry->d_lockref.count > 0) {
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			continue;
+		}
 
-	rcu_read_lock();
-	for (;;) {
-		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
-		if (&dentry->d_lru == list)
-			break; /* empty */
-		spin_lock(&dentry->d_lock);
-		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
+
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			if (can_free)
+				dentry_free(dentry);
 			continue;
 		}
 
-		/*
-		 * We found an inuse dentry which was not removed from
-		 * the LRU because of laziness during lookup.  Do not free
-		 * it - just keep it off the LRU list.
-		 */
-		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
+		inode = dentry->d_inode;
+		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
+			d_shrink_add(dentry, list);
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			continue;
 		}
 
-		rcu_read_unlock();
+		__dentry_kill(dentry);
+
+		/*
+		 * We need to prune ancestors too. This is necessary to prevent
+		 * quadratic behavior of shrink_dcache_parent(), but is also
+		 * expected to be beneficial in reducing dentry cache
+		 * fragmentation.
+		 */
+		dentry = parent;
+		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+			parent = lock_parent(dentry);
+			if (dentry->d_lockref.count != 1) {
+				dentry->d_lockref.count--;
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				break;
+			}
+			inode = dentry->d_inode;	/* can't be NULL */
+			if (unlikely(!spin_trylock(&inode->i_lock))) {
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				cpu_relax();
+				continue;
+			}
+			__dentry_kill(dentry);
+			dentry = parent;
+		}
+	}
+}
+
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
 
-		try_prune_one_dentry(dentry);
 
-		rcu_read_lock();
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(lru, dentry);
+		spin_unlock(&dentry->d_lock);
+		return LRU_REMOVED;
 	}
-	rcu_read_unlock();
+
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
+
+		/*
+		 * The list move itself will be made by the common LRU code. At
+		 * this point, we've dropped the dentry->d_lock but keep the
+		 * lru lock. This is safe to do, since every list movement is
+		 * protected by the lru lock even if both locks are held.
+		 *
+		 * This is guaranteed by the fact that all LRU management
+		 * functions are intermediated by the LRU API calls like
+		 * list_lru_add and list_lru_del. List movement in this file
+		 * only ever occur through this functions or through callbacks
+		 * only ever occurs through these functions or through callbacks
+		 *
+		 * The only exceptions to this are functions like
+		 * shrink_dentry_list, and code that first checks for the
+		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+		 * operating only with stack provided lists after they are
+		 * properly isolated from the main list.  It is thus, always a
+		 * local access.
+		 */
+		return LRU_ROTATE;
+	}
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
 }
 
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @count: number of entries to try to free
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
-	struct dentry *dentry;
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
-
-relock:
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		dentry = list_entry(sb->s_dentry_lru.prev,
-				struct dentry, d_lru);
-		BUG_ON(dentry->d_sb != sb);
-
-		if (!spin_trylock(&dentry->d_lock)) {
-			spin_unlock(&dcache_lru_lock);
-			cpu_relax();
-			goto relock;
-		}
+	LIST_HEAD(dispose);
+	long freed;
 
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
-			list_move(&dentry->d_lru, &referenced);
-			spin_unlock(&dentry->d_lock);
-		} else {
-			list_move_tail(&dentry->d_lru, &tmp);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			spin_unlock(&dentry->d_lock);
-			if (!--count)
-				break;
-		}
-		cond_resched_lock(&dcache_lru_lock);
-	}
-	if (!list_empty(&referenced))
-		list_splice(&referenced, &sb->s_dentry_lru);
-	spin_unlock(&dcache_lru_lock);
+	KSTAT_PERF_ENTER(shrink_dcache);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
+	shrink_dentry_list(&dispose);
+	KSTAT_PERF_LEAVE(shrink_dcache);
+	return freed;
+}
+
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
 
-	shrink_dentry_list(&tmp);
+	return LRU_REMOVED;
 }
 
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -871,16 +1032,17 @@ void prune_dcache_sb(struct super_block *sb, int count)
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	LIST_HEAD(tmp);
+	long freed;
 
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-		spin_unlock(&dcache_lru_lock);
-		shrink_dentry_list(&tmp);
-		spin_lock(&dcache_lru_lock);
-	}
-	spin_unlock(&dcache_lru_lock);
+	do {
+		LIST_HEAD(dispose);
+
+		freed = list_lru_walk(&sb->s_dentry_lru,
+			dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+		this_cpu_sub(nr_dentry_unused, freed);
+		shrink_dentry_list(&dispose);
+	} while (freed > 0);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -914,7 +1076,9 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			    !d_unhashed(dentry))
 				dentry->d_op->d_prune(dentry);
 
-			dentry_lru_del(dentry);
+			WARN_ON_ONCE(dentry->d_flags & DCACHE_SHRINK_LIST);
+			if (dentry->d_flags & DCACHE_LRU_LIST)
+				d_lru_del(dentry);
 			__d_shrink(dentry);
 
 			if (dentry->d_lockref.count != 0) {
@@ -1261,29 +1425,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 	if (data->start == dentry)
 		goto out;
 
-	/*
-	 * move only zero ref count dentries to the dispose list.
-	 *
-	 * Those which are presently on the shrink list, being processed
-	 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-	 * loop in shrink_dcache_parent() might not make any progress
-	 * and loop forever.
-	 */
-	if (dentry->d_lockref.count) {
-		dentry_lru_del(dentry);
-	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-		dentry_lru_move_list(dentry, &data->dispose);
-		dentry->d_flags |= DCACHE_SHRINK_LIST;
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		data->found++;
-		ret = D_WALK_NORETRY;
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!dentry->d_lockref.count) {
+			d_shrink_add(dentry, &data->dispose);
+			data->found++;
+		}
 	}
 	/*
 	 * We can return to the caller if we have found some (this
 	 * ensures forward progress). We'll be coming back to find
 	 * the rest.
 	 */
-	if (data->found && need_resched())
-		ret = D_WALK_QUIT;
+	if (!list_empty(&data->dispose))
+		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
 out:
 	return ret;
 }
@@ -1424,11 +1582,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	 */
 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
 	if (name->len > DNAME_INLINE_LEN-1) {
-		dname = kmalloc(name->len + 1, GFP_KERNEL);
+		dname = kmalloc(name->len + 1, GFP_KERNEL_ACCOUNT);
 		if (!dname) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
 		}
+		if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+			kasan_unpoison_shadow(dname,
+					round_up(name->len + 1,
+						sizeof(unsigned long)));
+
 	} else  {
 		dname = dentry->d_iname;
 	}	
@@ -2803,7 +2966,7 @@ static int prepend_path(const struct path *path,
 	struct dentry *dentry;
 	struct vfsmount *vfsmnt;
 	struct mount *mnt;
-	int error = 0;
+	int error;
 	unsigned seq, m_seq = 0;
 	char *bptr;
 	int blen;
@@ -3078,7 +3241,6 @@ static char *__dentry_path(struct dentry *d, char *buf, int buflen)
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
-		int error;
 
 		prefetch(parent);
 		error = prepend_name(&end, &len, &dentry->d_name);
@@ -3341,7 +3503,7 @@ static void __init dcache_init(void)
 	 * of the dcache. 
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -25,6 +24,7 @@
 #include <linux/parser.h>
 #include <linux/fsnotify.h>
 #include <linux/seq_file.h>
+#include <linux/ve.h>
 
 #define DEVPTS_DEFAULT_MODE 0600
 /*
@@ -141,9 +141,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
 		return inode->i_sb;
 #endif
-	if (!devpts_mnt)
-		return NULL;
-	return devpts_mnt->mnt_sb;
+	return get_exec_env()->devpts_sb;
 }
 
 #define PARSE_MOUNT	0
@@ -406,11 +404,19 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 }
 
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
+static int test_devpts_sb(struct super_block *s, void *p)
 {
-	if (devpts_mnt)
-		return devpts_mnt->mnt_sb == s;
-	return 0;
+	return get_exec_env()->devpts_sb == s;
+}
+
+static int set_devpts_sb(struct super_block *s, void *p)
+{
+	int error = set_anon_super(s, p);
+	if (!error && !get_exec_env()->devpts_sb) {
+		atomic_inc(&s->s_active);
+		get_exec_env()->devpts_sb = s;
+	}
+	return error;
 }
 
 /*
@@ -454,14 +460,14 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	/* Require newinstance for all user namespace mounts to ensure
 	 * the mount options are not changed.
 	 */
-	if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+	if (!IS_ENABLED(CONFIG_VE) &&
+	    (current_user_ns() != &init_user_ns) && !opts.newinstance)
 		return ERR_PTR(-EINVAL);
 
 	if (opts.newinstance)
-		s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+		s = sget(fs_type, NULL, set_devpts_sb, flags, NULL);
 	else
-		s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
-			 NULL);
+		s = sget(fs_type, test_devpts_sb, set_devpts_sb, flags, NULL);
 
 	if (IS_ERR(s))
 		return ERR_CAST(s);
@@ -512,7 +518,7 @@ static struct file_system_type devpts_fs_type = {
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT | FS_VIRTUALIZED,
 #endif
 };
 
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,7 @@
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
 #include <linux/aio.h>
+#include <linux/virtinfo.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -821,6 +822,8 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 {
 	int ret = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, bdev_get_queue(map_bh->b_bdev));
+
 	if (dio->rw & WRITE) {
 		/*
 		 * Read accounting is performed in submit_bio()
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,18 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-static void drop_slab(void)
-{
-	int nr_objects;
-	struct shrink_control shrink = {
-		.gfp_mask = GFP_KERNEL,
-	};
-
-	do {
-		nr_objects = shrink_slab(&shrink, 1000, 1000);
-	} while (nr_objects > 10);
-}
-
 int drop_caches_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -655,6 +655,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
+	unsigned long flags;
 	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
@@ -676,6 +677,7 @@ static struct ecryptfs_cache_info {
 		.cache = &ecryptfs_inode_info_cache,
 		.name = "ecryptfs_inode_cache",
 		.size = sizeof(struct ecryptfs_inode_info),
+		.flags = SLAB_ACCOUNT,
 		.ctor = inode_info_init_once,
 	},
 	{
@@ -747,8 +749,8 @@ static int ecryptfs_init_kmem_caches(void)
 		struct ecryptfs_cache_info *info;
 
 		info = &ecryptfs_cache_infos[i];
-		*(info->cache) = kmem_cache_create(info->name, info->size,
-				0, SLAB_HWCACHE_ALIGN, info->ctor);
+		*(info->cache) = kmem_cache_create(info->name, info->size, 0,
+				SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
 		if (!*(info->cache)) {
 			ecryptfs_free_kmem_caches();
 			ecryptfs_printk(KERN_WARNING, "%s: "
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -87,9 +87,9 @@ static void init_once(void *foo)
 static int init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
-				sizeof(struct efs_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct efs_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -312,7 +312,7 @@ ctl_table epoll_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static const struct file_operations eventpoll_fops;
+const static struct file_operations eventpoll_fops;
 
 static inline int is_file_epoll(struct file *f)
 {
@@ -879,21 +879,23 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventpoll *ep = f->private_data;
 	struct rb_node *rbp;
-	int ret = 0;
 
 	mutex_lock(&ep->mtx);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
-
-		ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
-				 epi->ffd.fd, epi->event.events,
-				 (long long)epi->event.data);
-		if (ret)
+		struct inode *inode = file_inode(epi->ffd.file);
+
+		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
+			   " pos:%lli ino:%lx sdev:%x\n",
+			   epi->ffd.fd, epi->event.events,
+			   (long long)epi->event.data,
+			   (long long)epi->ffd.file->f_pos,
+			   inode->i_ino, inode->i_sb->s_dev);
+		if (m->count == m->size)
 			break;
 	}
 	mutex_unlock(&ep->mtx);
-
-	return ret;
+	return 0;
 }
 #endif
 
@@ -915,7 +917,7 @@ static const struct file_operations eventpoll_fops = {
 void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
-	struct epitem *epi;
+	struct epitem *epi, *next;
 
 	/*
 	 * We don't want to get "file->f_lock" because it is not
@@ -931,7 +933,7 @@ void eventpoll_release_file(struct file *file)
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
-	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
+	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
 		ep = epi->ep;
 		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
@@ -999,6 +1001,50 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (epi->ffd.fd == tfd) {
+			if (toff == 0)
+				return epi;
+			else
+				toff--;
+		}
+		cond_resched();
+	}
+
+	return NULL;
+}
+
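+/* Return the target file of an epoll item (checkpoint/restore only) */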
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+				     unsigned long toff)
+{
+	struct file *file_raw;
+	struct eventpoll *ep;
+	struct epitem *epi;
+
+	if (!is_file_epoll(file))
+		return ERR_PTR(-EINVAL);
+
+	ep = file->private_data;
+
+	mutex_lock(&ep->mtx);
+	epi = ep_find_tfd(ep, tfd, toff);
+	if (epi)
+		file_raw = epi->ffd.file;
+	else
+		file_raw = ERR_PTR(-ENOENT);
+	mutex_unlock(&ep->mtx);
+
+	return file_raw;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/virtinfo.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -56,6 +57,8 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
@@ -249,9 +252,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
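+	/* charge the initial stack page to the user beancounter first */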
+	err = -ENOMEM;
+	if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags,
+				NULL, UB_SOFT))
+		goto err_charge;
+
+	bprm->vma = vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (!vma)
-		return -ENOMEM;
+		goto err_alloc;
 
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
@@ -281,7 +289,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 err:
 	up_write(&mm->mmap_sem);
 	bprm->vma = NULL;
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
+err_alloc:
+	ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL);
+err_charge:
 	return err;
 }
 
@@ -583,6 +594,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
 	struct mmu_gather tlb;
+	unsigned long moved;
+	struct vm_area_struct *prev;
 
 	BUG_ON(new_start > new_end);
 
@@ -600,12 +613,11 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	/*
-	 * move the page tables downwards, on failure we rely on
-	 * process cleanup to remove whatever mess we made.
+	 * move the page tables downwards, on failure undo changes.
 	 */
-	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length, false))
-		return -ENOMEM;
+	moved = move_page_tables(vma, old_start, vma, new_start, length, false);
+	if (length != moved)
+		goto undo;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, old_start, old_end);
@@ -633,6 +645,36 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 
 	return 0;
+
+undo:
+	/*
+	 * move the page tables back.
+	 */
+	length = move_page_tables(vma, new_start, vma, old_start, moved, false);
+	if (WARN_ON(length != moved))
+		return -EFAULT;
+
+	/*
+	 * release unused page tables.
+	 */
+	find_vma_prev(mm, vma->vm_start, &prev);
+	tlb_gather_mmu(&tlb, mm, new_start, new_end);
+	if (new_end > old_start)
+		free_pgd_range(&tlb, new_start, old_start,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	else
+		free_pgd_range(&tlb, new_start, new_end,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	tlb_finish_mmu(&tlb, new_start, new_end);
+
+	/*
+	 * shrink the vma to the old range.
+	 */
+	vma_adjust(vma, old_start, old_end, vma->vm_pgoff, NULL);
+
+	return -ENOMEM;
 }
 
 /*
@@ -823,10 +865,10 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 }
 EXPORT_SYMBOL(read_code);
 
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct linux_binprm *bprm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm, *mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -847,6 +889,9 @@ static int exec_mmap(struct mm_struct *mm)
 			return -EINTR;
 		}
 	}
+
+	mm = bprm->mm;
+	mm->vps_dumpable = VD_PTRACE_COREDUMP;
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	tsk->mm = mm;
@@ -854,6 +899,8 @@ static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
+	bprm->mm = NULL;		/* We're using it now */
+
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
@@ -1096,12 +1143,10 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 * Release all of the old mmap stuff
 	 */
 	acct_arg_size(bprm, 0);
-	retval = exec_mmap(bprm->mm);
+	retval = exec_mmap(bprm);
 	if (retval)
 		goto out;
 
-	bprm->mm = NULL;		/* We're using it now */
-
 	set_fs(USER_DS);
 	current->flags &=
 		~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
 {
 	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 				sizeof(struct exofs_i_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				exofs_init_once);
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				SLAB_ACCOUNT, exofs_init_once);
 	if (exofs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -206,15 +206,11 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		case ACL_TYPE_ACCESS:
 			name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
 			if (acl) {
-				error = posix_acl_equiv_mode(acl, &inode->i_mode);
-				if (error < 0)
+				error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+				if (error)
 					return error;
-				else {
-					inode->i_ctime = CURRENT_TIME_SEC;
-					mark_inode_dirty(inode);
-					if (error == 0)
-						acl = NULL;
-				}
+				inode->i_ctime = CURRENT_TIME_SEC;
+				mark_inode_dirty(inode);
 			}
 			break;
 
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -199,7 +199,7 @@ static int init_inodecache(void)
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1536,7 +1536,7 @@ static struct file_system_type ext2_fs_type = {
 	.name		= "ext2",
 	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("ext2");
 
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -73,7 +73,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1320,7 +1320,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	if (err)
 		ext3_std_error(dir->i_sb, err);
 	brelse(bh);
-	return 0;
+	return err;
 }
 
 /*
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -3062,7 +3062,7 @@ static struct file_system_type ext3_fs_type = {
 	.name		= "ext3",
 	.mount		= ext3_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("ext3");
 
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
 		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-		xattr_trusted.o inline.o
+		xattr_trusted.o inline.o pfcache.o
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -258,6 +258,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
 	return num_clusters_in_group(sb, block_group) - 
 		ext4_num_overhead_clusters(sb, block_group, gdp);
 }
+EXPORT_SYMBOL(ext4_get_group_desc);
 
 /*
  * The free blocks are managed by bitmaps.  A file system contains several
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -103,6 +103,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	return 1;
 }
 
+static inline int ext4_balloon(struct super_block *sb, unsigned ino)
+{
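+	/*
+	 * Return true if @ino is the balloon inode; the readdir paths below
+	 * use this to hide its directory entry.
+	 */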
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	return sbi->s_balloon_ino && (sbi->s_balloon_ino->i_ino == ino);
+}
+
 static int ext4_readdir(struct file *filp,
 			 void *dirent, filldir_t filldir)
 {
@@ -236,7 +244,8 @@ static int ext4_readdir(struct file *filp,
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
-			if (le32_to_cpu(de->inode)) {
+			if (le32_to_cpu(de->inode) &&
+			    !ext4_balloon(sb, le32_to_cpu(de->inode))) {
 				/* We might block in the next section
 				 * if the data destination is
 				 * currently swapped out.  So, use a
@@ -511,6 +520,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	}
 	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
 	while (fname) {
+		if (ext4_balloon(sb, fname->inode))
+			goto skip;
+
 		error = filldir(dirent, fname->name,
 				fname->name_len, curr_pos,
 				fname->inode,
@@ -520,6 +532,7 @@ static int call_filldir(struct file *filp, void *dirent,
 			info->extra_fname = fname;
 			return error;
 		}
+skip:
 		fname = fname->next;
 	}
 	return 0;
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
+#include <linux/pfcache.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
 #ifdef __KERNEL__
@@ -218,6 +219,12 @@ struct ext4_io_submit {
  */
 #define EXT4_LINK_MAX		65000
 
+#define EXT4_DATA_CSUM_SIZE	20
+#define EXT4_DATA_CSUM_NAME	"pfcache"
+
+#define EXT4_DIR_CSUM_VALUE	"auto"
+#define EXT4_DIR_CSUM_VALUE_LEN	4
+
 /*
  * Macro-instructions used to manage several block sizes
  */
@@ -506,6 +513,11 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+struct ext4_ioc_mfsync_info {
+	__u32 size;
+	__u32 fd[0];
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -606,6 +618,9 @@ enum {
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 #define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
+#define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
+#define EXT4_IOC_MFSYNC			_IO('f', 43)
+#define EXT4_IOC_SET_RSV_BLOCKS		_IOW('f', 44, __u64)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -950,9 +965,9 @@ struct ext4_inode_info {
 	/* extents status tree */
 	struct ext4_es_tree i_es_tree;
 	rwlock_t i_es_lock;
-	struct list_head i_es_lru;
-	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
-	unsigned long i_touch_when;	/* jiffies of last accessing */
+	struct list_head i_es_list;
+	unsigned int i_es_all_nr;	/* protected by i_es_lock */
+	unsigned int i_es_shk_nr;	/* protected by i_es_lock */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -1003,6 +1018,11 @@ struct ext4_inode_info {
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
 	__u32 i_csum_seed;
+
+	/* SHA-1 rolling data checksum state */
+	loff_t i_data_csum_end;
+	/* FIPS 180-1 digest if i_data_csum_end == -1, partial SHA-1 otherwise */
+	u8 i_data_csum[EXT4_DATA_CSUM_SIZE];
 };
 
 /*
@@ -1069,6 +1089,7 @@ struct ext4_inode_info {
 						      blocks */
 #define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
 						      file systems */
+#define EXT4_MOUNT2_PFCACHE_CSUM	0x00010000
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1335,6 +1356,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
+	unsigned int s_bd_full_ratelimit;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1356,6 +1378,8 @@ struct ext4_sb_info {
 	atomic_t s_mb_discarded;
 	atomic_t s_lock_busy;
 
+	struct inode *s_balloon_ino;
+
 	/* locality groups */
 	struct ext4_locality_group __percpu *s_locality_groups;
 
@@ -1393,17 +1417,28 @@ struct ext4_sb_info {
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_csum_seed;
 
+	bool s_err_event_sent;
+	bool s_abrt_event_sent;
+
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
-	struct list_head s_es_lru;
-	unsigned long s_es_last_sorted;
-	struct percpu_counter s_extent_cache_cnt;
-	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+	struct list_head s_es_list;
+	long s_es_nr_inode;
+	struct ext4_es_stats s_es_stats;
+	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* data checksumming */
+	struct percpu_counter s_csum_partial;
+	struct percpu_counter s_csum_complete;
+
+	spinlock_t  s_pfcache_lock;
+	struct path s_pfcache_root;
+	struct percpu_counter s_pfcache_peers;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1470,6 +1505,7 @@ enum {
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
+	EXT4_STATE_PFCACHE_CSUM,	/* Data-checksumming enabled */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -2098,6 +2134,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2785,6 +2822,16 @@ extern int ext4_check_blockref(const char *, unsigned int,
 struct ext4_ext_path;
 struct ext4_extent;
 
+enum ext4_event_type {
+     EXT4_UA_MOUNT,
+     EXT4_UA_UMOUNT,
+     EXT4_UA_REMOUNT,
+     EXT4_UA_ERROR,
+     EXT4_UA_ABORT,
+     EXT4_UA_FREEZE,
+     EXT4_UA_UNFREEZE,
+};
+
 /*
  * Maximum number of logical blocks in a file; ext4_extent's ee_block is
  * __le32.
@@ -2832,6 +2879,11 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+			     struct inode *inode2, ext4_lblk_t lblk1,
+			     ext4_lblk_t lblk2,  ext4_lblk_t count,
+			     int mark_unwritten,int *err);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2841,8 +2893,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
-extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-			    struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
@@ -2864,6 +2914,29 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 
+/* pfcache.c */
+extern int ext4_open_pfcache(struct inode *inode);
+extern int ext4_close_pfcache(struct inode *inode);
+extern int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb);
+extern long ext4_dump_pfcache(struct super_block *sb,
+					struct pfcache_dump_request __user *dump);
+extern int ext4_load_data_csum(struct inode *inode);
+extern void ext4_start_data_csum(struct inode *inode);
+extern void ext4_check_pos_data_csum(struct inode *inode, loff_t pos);
+extern void ext4_update_data_csum(struct inode *inode, loff_t pos,
+				  unsigned len, struct page* page);
+extern void ext4_commit_data_csum(struct inode *inode);
+extern void ext4_clear_data_csum(struct inode *inode);
+extern void ext4_truncate_data_csum(struct inode *inode, loff_t end);
+extern void ext4_load_dir_csum(struct inode *inode);
+extern void ext4_save_dir_csum(struct inode *inode);
+static inline int ext4_want_data_csum(struct inode *dir)
+{
+	return test_opt2(dir->i_sb, PFCACHE_CSUM) &&
+		ext4_test_inode_state(dir, EXT4_STATE_PFCACHE_CSUM);
+}
+extern struct xattr_handler ext4_xattr_trusted_csum_handler;
+
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -2918,6 +2991,49 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
 	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
 }
 
+/*
+ * Ploop support
+ */
+DECLARE_PER_CPU(unsigned long, ext4_bd_full_ratelimits);
+
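+/*
+ * Ask the backing device whether it has room for @nblocks more blocks.
+ * The (potentially expensive) ->bd_full_fn callback is rate-limited via a
+ * per-CPU counter: it is only consulted once s_bd_full_ratelimit blocks
+ * have accumulated, or on every call while the device already reports
+ * itself full.  Returns 1 when the allocation should fail with ENOSPC.
+ */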
+static inline int check_bd_full(struct inode *inode, long long nblocks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int (*bd_full_fn) (struct backing_dev_info *, long long, int);
+	unsigned long ratelimit;
+	unsigned long *p;
+
+	bd_full_fn = inode->i_sb->s_bdi->bd_full_fn;
+	if (likely(!bd_full_fn))
+		return 0;
+
+	if (unlikely(inode->i_sb->s_bdi->bd_full))
+		ratelimit = 0;
+	else
+		ratelimit = sbi->s_bd_full_ratelimit;
+
+	preempt_disable();
+
+	p =  &__get_cpu_var(ext4_bd_full_ratelimits);
+	*p += nblocks;
+	if (unlikely(*p >= ratelimit)) {
+		*p = 0;
+		preempt_enable();
+		if (unlikely(bd_full_fn(inode->i_sb->s_bdi,
+					nblocks << inode->i_blkbits,
+					uid_eq(sbi->s_resuid,
+					       current_fsuid())))) {
+			inode->i_sb->s_bdi->bd_full = 1;
+			return 1;
+		}
+		inode->i_sb->s_bdi->bd_full = 0;
+		return 0;
+	}
+
+	preempt_enable();
+	return 0;
+}
+
 #endif	/* __KERNEL__ */
 
 #define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/module.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
@@ -289,6 +290,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 	return size;
 }
 
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+			   int nofail)
+{
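+	/*
+	 * Split the extent covering @lblk at @lblk, preserving the unwritten
+	 * state of the original extent.  With @nofail set, reserved metadata
+	 * blocks may be used so the split does not fail with ENOSPC.
+	 */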
+	struct ext4_ext_path *path = *ppath;
+	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+	return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+			EXT4_EX_NOCACHE|EXT4_GET_BLOCKS_PRE_IO |
+			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+}
+
 /*
  * Calculate the number of metadata blocks needed
  * to allocate @blocks
@@ -697,15 +712,19 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
 {
-	int depth = path->p_depth;
+	int depth;
 	int i;
 
+	if (!path)
+		return;
+	depth = path->p_depth;
 	for (i = 0; i <= depth; i++, path++)
 		if (path->p_bh) {
 			brelse(path->p_bh);
 			path->p_bh = NULL;
 		}
 }
+EXPORT_SYMBOL(ext4_ext_drop_refs);
 
 /*
  * ext4_ext_binsearch_idx:
@@ -1561,7 +1580,7 @@ static int ext4_ext_search_right(struct inode *inode,
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static ext4_lblk_t
+ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
 	int depth;
@@ -2845,7 +2864,6 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		 * ext4_ext_rm_leaf().
 		 */
 		if (end >= ee_block && end < ex_end) {
-			int split_flag = 0;
 
 			/*
 			 * If we're going to split the extent, note that
@@ -2858,21 +2876,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 					-(long long) EXT4_B2C(sbi, pblk);
 			}
 
-			if (ext4_ext_is_unwritten(ex))
-				split_flag = EXT4_EXT_MARK_UNWRIT1 |
-					     EXT4_EXT_MARK_UNWRIT2;
-
 			/*
 			 * Split the extent in two so that 'end' is the last
 			 * block in the first new extent. Also we should not
 			 * fail removing space due to ENOSPC so try to use
 			 * reserved block if that happens.
 			 */
-			err = ext4_split_extent_at(handle, inode, path,
-					end + 1, split_flag,
-					EXT4_EX_NOCACHE |
-					EXT4_GET_BLOCKS_PRE_IO |
-					EXT4_GET_BLOCKS_METADATA_NOFAIL);
+			err = ext4_force_split_extent_at(handle, inode, &path,
+							 end + 1, 1);
 
 			if (err < 0)
 				goto out;
@@ -4647,7 +4658,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
 				       err ? err : allocated);
-	ext4_es_lru_add(inode);
+	ext4_es_list_add(inode);
 	return err ? err : allocated;
 }
 
@@ -4685,6 +4696,21 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 	ext4_std_error(inode->i_sb, err);
 }
 
+static int ext4_convert_unwritten(struct inode *inode, loff_t offset,
+				  loff_t len)
+{
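+	/*
+	 * Back-end for fallocate(FALLOC_FL_CONVERT_UNWRITTEN): convert
+	 * unwritten extents in the range to written ones.  Restricted to
+	 * CAP_SYS_ADMIN, presumably because marking extents written without
+	 * writing them can expose stale block contents.
+	 */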
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	mutex_lock(&inode->i_mutex);
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, len);
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 				  ext4_lblk_t len, loff_t new_size,
 				  int flags, int mode)
@@ -4933,12 +4959,21 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_CONVERT_UNWRITTEN))
 		return -EOPNOTSUPP;
 
+	/* If data is about to change we must drop csum */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    ((mode & ~FALLOC_FL_KEEP_SIZE)  || !(mode & FALLOC_FL_KEEP_SIZE)))
+		ext4_truncate_data_csum(inode, -1);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		return ext4_punch_hole(inode, offset, len);
 
+	if (mode & FALLOC_FL_CONVERT_UNWRITTEN)
+		return ext4_convert_unwritten(inode, offset, len);
+
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
 		return ret;
@@ -5026,6 +5061,12 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 	 */
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
+	/*
+	 * Protect us against freezing - AIO-DIO case. Caller didn't have to
+	 * have any protection against it
+	 */
+	sb_start_intwrite(inode->i_sb);
+
 	/*
 	 * This is somewhat ugly but the idea is clear: When transaction is
 	 * reserved, everything goes into it. Otherwise we rather start several
@@ -5070,6 +5111,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 	}
 	if (!credits)
 		ret2 = ext4_journal_stop(handle);
+	sb_end_intwrite(inode->i_sb);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -5206,7 +5248,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		error = ext4_fill_fiemap_extents(inode, start_blk,
 						 len_blks, fieinfo);
 	}
-	ext4_es_lru_add(inode);
+	ext4_es_list_add(inode);
 	return error;
 }
 
@@ -5330,7 +5372,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	struct ext4_ext_path *path;
 	int ret = 0, depth;
 	struct ext4_extent *extent;
-	ext4_lblk_t stop_block, current_block;
+	ext4_lblk_t stop_block;
 	ext4_lblk_t ex_start, ex_end;
 
 	/* Let path point to the last extent */
@@ -5385,16 +5427,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 			return -EIO;
 		}
 
-		current_block = le32_to_cpu(extent->ee_block);
-		if (start > current_block) {
+		if (start > le32_to_cpu(extent->ee_block)) {
 			/* Hole, move to the next extent */
-			ret = mext_next_extent(inode, path, &extent);
-			if (ret != 0) {
+			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+				path[depth].p_ext++;
+			} else {
+				start = ext4_ext_next_allocated_block(path);
 				ext4_ext_drop_refs(path);
 				kfree(path);
-				if (ret == 1)
-					ret = 0;
-				break;
+				continue;
 			}
 		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
@@ -5554,3 +5595,201 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:	First inode
+ * @inode2:	Second inode
+ * @lblk1:	Start block for first inode
+ * @lblk2:	Start block for second inode
+ * @count:	Number of blocks to swap
+ * @unwritten:	Mark second inode's extents as unwritten after swap
+ * @erp:	Pointer to save error value
+ *
+ * This helper routine does exactly what its name promises: it swaps extents.
+ * Everything else, such as page-cache locking consistency, bh mapping
+ * consistency or copying of extent data, must be handled by the caller.
+ * Locking:
+ * 		i_mutex is held for both inodes
+ * 		i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *		All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+		     struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, int unwritten, int *erp)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+
+	ext4_discard_preallocations(inode1);
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (unlikely(*erp))
+		return 0;
+	ext4_discard_preallocations(inode2);
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path1))) {
+			*erp = PTR_ERR(path1);
+			path1 = NULL;
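+			/*
+			 * 'finish' doubles as the common early-exit path:
+			 * zeroing count lets the cleanup at 'repeat' run and
+			 * then terminates the loop.
+			 */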
+		finish:
+			count = 0;
+			goto repeat;
+		}
+		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path2))) {
+			*erp = PTR_ERR(path2);
+			path2 = NULL;
+			goto finish;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have something to swap? */
+		if (unlikely(!ex2 || !ex1))
+			goto finish;
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Hole handling */
+		if (!in_range(lblk1, e1_blk, e1_len) ||
+		    !in_range(lblk2, e2_blk, e2_len)) {
+			ext4_lblk_t next1, next2;
+
+			/* if hole after extent, then go to next extent */
+			next1 = ext4_ext_next_allocated_block(path1);
+			next2 = ext4_ext_next_allocated_block(path2);
+			/* If hole before extent, then shift to that extent */
+			if (e1_blk > lblk1)
+				next1 = e1_blk;
+			if (e2_blk > lblk2)
+				next2 = e2_blk;
+			/* Do we have something to swap */
+			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+				goto finish;
+			/* Move to the rightmost boundary */
+			len = next1 - lblk1;
+			if (len < next2 - lblk2)
+				len = next2 - lblk2;
+			if (len > count)
+				len = count;
+			lblk1 += len;
+			lblk2 += len;
+			count -= len;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2,  lblk2, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1 + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2 + len, 0);
+			if (*erp)
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Both extents are fully inside the boundaries. Swap them now. */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = cpu_to_le16(e2_len);
+		ex2->ee_len = cpu_to_le16(e1_len);
+		if (unwritten)
+			ext4_ext_mark_unwritten(ex2);
+		if (ext4_ext_is_unwritten(&tmp_ex))
+			ext4_ext_mark_unwritten(ex1);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scary, doesn't it?  The second inode already points to
+		 * the new blocks, and it was successfully dirtied.  But luckily
+		 * an error here can only be caused by a journal error, so the
+		 * whole transaction will be aborted anyway.
+		 */
+		if (unlikely(*erp))
+			goto finish;
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+	repeat:
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+		path1 = path2 = NULL;
+	}
+	return replaced_count;
+}
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -11,6 +11,8 @@
  */
 #include <linux/rbtree.h>
 #include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include "ext4.h"
 #include "extents_status.h"
 
@@ -147,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			      ext4_lblk_t end);
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 				       int nr_to_scan);
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
-			    struct ext4_inode_info *locked_ei);
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+		       struct ext4_inode_info *locked_ei);
 
 int __init ext4_init_es(void)
 {
@@ -296,6 +298,36 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
+void ext4_es_list_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
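+	/* Unlocked fast path; the check is repeated under s_es_lock below. */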
+	if (!list_empty(&ei->i_es_list))
+		return;
+
+	spin_lock(&sbi->s_es_lock);
+	if (list_empty(&ei->i_es_list)) {
+		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
+		sbi->s_es_nr_inode++;
+	}
+	spin_unlock(&sbi->s_es_lock);
+}
+
+void ext4_es_list_del(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lock);
+	if (!list_empty(&ei->i_es_list)) {
+		list_del_init(&ei->i_es_list);
+		sbi->s_es_nr_inode--;
+		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
+	}
+	spin_unlock(&sbi->s_es_lock);
+}
+
 static struct extent_status *
 ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
 		     ext4_fsblk_t pblk)
@@ -312,20 +344,28 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
 	 * We don't count delayed extent because we never try to reclaim them
 	 */
 	if (!ext4_es_is_delayed(es)) {
-		EXT4_I(inode)->i_es_lru_nr++;
-		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+		EXT4_I(inode)->i_es_shk_nr++;
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_shk_cnt);
 	}
 
+	EXT4_I(inode)->i_es_all_nr++;
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
 	return es;
 }
 
 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 {
-	/* Decrease the lru counter when this es is not delayed */
+	EXT4_I(inode)->i_es_all_nr--;
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
+	/* Decrease the shrink counter when this es is not delayed */
 	if (!ext4_es_is_delayed(es)) {
-		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
-		EXT4_I(inode)->i_es_lru_nr--;
-		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+		EXT4_I(inode)->i_es_shk_nr--;
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_shk_cnt);
 	}
 
 	kmem_cache_free(ext4_es_cachep, es);
@@ -683,8 +723,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 		goto error;
 retry:
 	err = __es_insert_extent(inode, &newes);
-	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
-					       EXT4_I(inode)))
+	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+					  1, EXT4_I(inode)))
 		goto retry;
 	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
 		err = 0;
@@ -739,6 +779,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 			  struct extent_status *es)
 {
 	struct ext4_es_tree *tree;
+	struct ext4_es_stats *stats;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
 	int found = 0;
@@ -775,11 +816,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 	}
 
 out:
+	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
 	if (found) {
 		BUG_ON(!es1);
 		es->es_lblk = es1->es_lblk;
 		es->es_len = es1->es_len;
 		es->es_pblk = es1->es_pblk;
+		stats->es_stats_cache_hits++;
+	} else {
+		stats->es_stats_cache_misses++;
 	}
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -836,8 +881,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 				es->es_lblk = orig_es.es_lblk;
 				es->es_len = orig_es.es_len;
 				if ((err == -ENOMEM) &&
-				    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
-						     EXT4_I(inode)))
+				    __es_shrink(EXT4_SB(inode->i_sb),
+							1, EXT4_I(inode)))
 					goto retry;
 				goto out;
 			}
@@ -909,6 +954,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	end = lblk + len - 1;
 	BUG_ON(end < lblk);
 
+	/*
+	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
+	 * so that we are sure __es_shrink() is done with the inode before it
+	 * is reclaimed.
+	 */
 	write_lock(&EXT4_I(inode)->i_es_lock);
 	err = __es_remove_extent(inode, lblk, end);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -916,169 +966,270 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	return err;
 }
 
-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
-				     struct list_head *b)
-{
-	struct ext4_inode_info *eia, *eib;
-	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
-	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
-
-	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
-	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
-		return 1;
-	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
-	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
-		return -1;
-	if (eia->i_touch_when == eib->i_touch_when)
-		return 0;
-	if (time_after(eia->i_touch_when, eib->i_touch_when))
-		return 1;
-	else
-		return -1;
-}
-
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
-			    struct ext4_inode_info *locked_ei)
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+		       struct ext4_inode_info *locked_ei)
 {
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp;
-	LIST_HEAD(skipped);
-	int ret, nr_shrunk = 0;
-	int retried = 0, skip_precached = 1, nr_skipped = 0;
+	struct ext4_es_stats *es_stats;
+	ktime_t start_time;
+	u64 scan_time;
+	int nr_to_walk;
+	int nr_shrunk = 0;
+	int retried = 0, nr_skipped = 0;
 
-	spin_lock(&sbi->s_es_lru_lock);
+	es_stats = &sbi->s_es_stats;
+	start_time = ktime_get();
 
 retry:
-	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		/*
-		 * If we have already reclaimed all extents from extent
-		 * status tree, just stop the loop immediately.
-		 */
-		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
-			break;
+	spin_lock(&sbi->s_es_lock);
+	nr_to_walk = sbi->s_es_nr_inode;
+	while (nr_to_walk-- > 0) {
+		int shrunk;
 
-		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+		if (list_empty(&sbi->s_es_list)) {
+			spin_unlock(&sbi->s_es_lock);
+			goto out;
+		}
+		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+				      i_es_list);
+		/* Move the inode to the tail */
+		list_move(&ei->i_es_list, sbi->s_es_list.prev);
 
 		/*
-		 * Skip the inode that is newer than the last_sorted
-		 * time.  Normally we try hard to avoid shrinking
-		 * precached inodes, but we will as a last resort.
+		 * Normally we try hard to avoid shrinking precached inodes,
+		 * but we will as a last resort.
 		 */
-		if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
-		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
-						EXT4_STATE_EXT_PRECACHED))) {
+		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
+						EXT4_STATE_EXT_PRECACHED)) {
 			nr_skipped++;
-			list_move_tail(cur, &skipped);
 			continue;
 		}
 
-		if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
-		    !write_trylock(&ei->i_es_lock))
+		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
+			nr_skipped++;
 			continue;
+		}
+		/*
+		 * Now we hold i_es_lock, which protects us from inode reclaim
+		 * freeing the inode under us.
+		 */
+		spin_unlock(&sbi->s_es_lock);
 
-		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
-		if (ei->i_es_lru_nr == 0)
-			list_del_init(&ei->i_es_lru);
+		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
 		write_unlock(&ei->i_es_lock);
 
-		nr_shrunk += ret;
-		nr_to_scan -= ret;
+		nr_shrunk += shrunk;
+		nr_to_scan -= shrunk;
+
 		if (nr_to_scan == 0)
-			break;
+			goto out;
+		spin_lock(&sbi->s_es_lock);
 	}
-
-	/* Move the newer inodes into the tail of the LRU list. */
-	list_splice_tail(&skipped, &sbi->s_es_lru);
-	INIT_LIST_HEAD(&skipped);
+	spin_unlock(&sbi->s_es_lock);
 
 	/*
 	 * If we skipped any inodes, and we weren't able to make any
-	 * forward progress, sort the list and try again.
+	 * forward progress, try again to scan precached inodes.
 	 */
 	if ((nr_shrunk == 0) && nr_skipped && !retried) {
 		retried++;
-		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
-		sbi->s_es_last_sorted = jiffies;
-		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
-				      i_es_lru);
-		/*
-		 * If there are no non-precached inodes left on the
-		 * list, start releasing precached extents.
-		 */
-		if (ext4_test_inode_state(&ei->vfs_inode,
-					  EXT4_STATE_EXT_PRECACHED))
-			skip_precached = 0;
 		goto retry;
 	}
 
-	spin_unlock(&sbi->s_es_lru_lock);
-
 	if (locked_ei && nr_shrunk == 0)
 		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
 
+out:
+	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
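+	/* Fold the sample into an exponentially weighted moving average
+	 * (the new sample gets weight 1/4). */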
+	if (likely(es_stats->es_stats_scan_time))
+		es_stats->es_stats_scan_time = (scan_time +
+				es_stats->es_stats_scan_time*3) / 4;
+	else
+		es_stats->es_stats_scan_time = scan_time;
+	if (scan_time > es_stats->es_stats_max_scan_time)
+		es_stats->es_stats_max_scan_time = scan_time;
+	if (likely(es_stats->es_stats_shrunk))
+		es_stats->es_stats_shrunk = (nr_shrunk +
+				es_stats->es_stats_shrunk*3) / 4;
+	else
+		es_stats->es_stats_shrunk = nr_shrunk;
+
+	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
+			     nr_skipped, retried);
 	return nr_shrunk;
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long ext4_es_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	unsigned long nr;
+	struct ext4_sb_info *sbi;
+
+	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
+	return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk;
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
+	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
 
 	if (!nr_to_scan)
 		return ret;
 
-	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
-	return ret;
+	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
+	return nr_shrunk;
 }
 
-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
 {
-	INIT_LIST_HEAD(&sbi->s_es_lru);
-	spin_lock_init(&sbi->s_es_lru_lock);
-	sbi->s_es_last_sorted = 0;
-	sbi->s_es_shrinker.shrink = ext4_es_shrink;
-	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
-	register_shrinker(&sbi->s_es_shrinker);
+	return *pos ? NULL : SEQ_START_TOKEN;
 }
 
-void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	unregister_shrinker(&sbi->s_es_shrinker);
+	return NULL;
 }
 
-void ext4_es_lru_add(struct inode *inode)
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_sb_info *sbi = seq->private;
+	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+	struct ext4_inode_info *ei, *max = NULL;
+	unsigned int inode_cnt = 0;
 
-	ei->i_touch_when = jiffies;
+	if (v != SEQ_START_TOKEN)
+		return 0;
 
-	if (!list_empty(&ei->i_es_lru))
-		return;
+	/* here we just find an inode that has the max nr. of objects */
+	spin_lock(&sbi->s_es_lock);
+	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
+		inode_cnt++;
+		if (max && max->i_es_all_nr < ei->i_es_all_nr)
+			max = ei;
+		else if (!max)
+			max = ei;
+	}
+	spin_unlock(&sbi->s_es_lock);
+
+	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
+		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
+	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
+		   es_stats->es_stats_cache_hits,
+		   es_stats->es_stats_cache_misses);
+	if (inode_cnt)
+		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
+
+	seq_printf(seq, "average:\n  %llu us scan time\n",
+	    div_u64(es_stats->es_stats_scan_time, 1000));
+	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
+	if (inode_cnt)
+		seq_printf(seq,
+		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
+		    "  %llu us max scan time\n",
+		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
+		    div_u64(es_stats->es_stats_max_scan_time, 1000));
 
-	spin_lock(&sbi->s_es_lru_lock);
-	if (list_empty(&ei->i_es_lru))
-		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	spin_unlock(&sbi->s_es_lru_lock);
+	return 0;
 }
 
-void ext4_es_lru_del(struct inode *inode)
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+	.start = ext4_es_seq_shrinker_info_start,
+	.next  = ext4_es_seq_shrinker_info_next,
+	.stop  = ext4_es_seq_shrinker_info_stop,
+	.show  = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = PDE_DATA(inode);
+	}
+
+	return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_es_seq_shrinker_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+	int err;
+
+	INIT_LIST_HEAD(&sbi->s_es_list);
+	sbi->s_es_nr_inode = 0;
+	spin_lock_init(&sbi->s_es_lock);
+	sbi->s_es_stats.es_stats_shrunk = 0;
+	sbi->s_es_stats.es_stats_cache_hits = 0;
+	sbi->s_es_stats.es_stats_cache_misses = 0;
+	sbi->s_es_stats.es_stats_scan_time = 0;
+	sbi->s_es_stats.es_stats_max_scan_time = 0;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
+	if (err)
+		return err;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
+	if (err)
+		goto err1;
+
+	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+	sbi->s_es_shrinker.count_objects = ext4_es_count;
+	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+	err = register_shrinker(&sbi->s_es_shrinker);
+	if (err)
+		goto err2;
+
+	if (sbi->s_proc)
+		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+				 &ext4_es_seq_shrinker_info_fops, sbi);
+
+	return 0;
 
-	spin_lock(&sbi->s_es_lru_lock);
-	if (!list_empty(&ei->i_es_lru))
-		list_del_init(&ei->i_es_lru);
-	spin_unlock(&sbi->s_es_lru_lock);
+err2:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+err1:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	return err;
+}
+
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+{
+	if (sbi->s_proc)
+		remove_proc_entry("es_shrinker_info", sbi->s_proc);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
@@ -1092,7 +1243,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	if (ei->i_es_lru_nr == 0)
+	if (ei->i_es_shk_nr == 0)
 		return 0;
 
 	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -64,6 +64,16 @@ struct ext4_es_tree {
 	struct extent_status *cache_es;	/* recently accessed extent */
 };
 
+struct ext4_es_stats {
+	unsigned long es_stats_shrunk;
+	unsigned long es_stats_cache_hits;
+	unsigned long es_stats_cache_misses;
+	u64 es_stats_scan_time;
+	u64 es_stats_max_scan_time;
+	struct percpu_counter es_stats_all_cnt;
+	struct percpu_counter es_stats_shk_cnt;
+};
+
 extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -138,9 +148,9 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
 		       (pb & ~ES_MASK));
 }
 
-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
-extern void ext4_es_lru_add(struct inode *inode);
-extern void ext4_es_lru_del(struct inode *inode);
+extern void ext4_es_list_add(struct inode *inode);
+extern void ext4_es_list_del(struct inode *inode);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -45,12 +45,14 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1) &&
-		        !EXT4_I(inode)->i_reserved_data_blocks)
-	{
-		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
-		up_write(&EXT4_I(inode)->i_data_sem);
+	    (atomic_read(&inode->i_writecount) == 1)) {
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+			ext4_commit_data_csum(inode);
+		if (!EXT4_I(inode)->i_reserved_data_blocks) {
+			down_write(&EXT4_I(inode)->i_data_sem);
+			ext4_discard_preallocations(inode);
+			up_write(&EXT4_I(inode)->i_data_sem);
+		}
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
@@ -307,7 +309,6 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.pmd_fault	= ext4_dax_pmd_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
@@ -316,11 +317,26 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
 static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= ext4_filemap_fault,
 	.page_mkwrite   = ext4_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct inode *inode = file->f_inode;
+
+	/*
+	 * f_op->mmap must be called with vma=NULL before taking mmap_sem;
+	 * workaround for wrong i_mutex vs mmap_sem lock ordering in pfcache
+	 * (PSBM-23133) - vdavydov@
+	 */
+	if (!vma) {
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			mutex_lock(&inode->i_mutex);
+			ext4_truncate_data_csum(inode, -1);
+			mutex_unlock(&inode->i_mutex);
+		}
+		return 0;
+	}
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
@@ -381,6 +397,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		if (ret < 0)
 			return ret;
 	}
+
+	if ((filp->f_mode & FMODE_WRITE) && inode->i_mapping->i_peer_file) {
+		mutex_lock(&inode->i_mutex);
+		ext4_close_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	return dquot_file_open(inode, filp);
 }
 
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -150,3 +150,111 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
+
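+/*
+ * Batched fsync: write back the data of several files belonging to the same
+ * filesystem, wait for it, and then wait once for the newest journal commit
+ * any of them depends on.  flags[i] selects fdatasync semantics for files[i].
+ */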
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files)
+{
+	struct super_block *sb;
+	journal_t *journal;
+	int err = 0, err2 = 0, i = 0, j = 0;
+	int force_commit = 0, datawriteback = 0;
+	tid_t commit_tid = 0;
+	int need_barrier = 0;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+	if (!nr_files)
+		return 0;
+
+	sb = files[0]->f_mapping->host->i_sb;
+	journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read the updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
+		return 0;
+	}
+	for (i = 0; i < nr_files; i++) {
+		struct address_space * mapping = files[i]->f_mapping;
+		struct inode *inode = mapping->host;
+
+		BUG_ON(sb != inode->i_sb);
+		if (!mapping->nrpages)
+			continue;
+
+		err = filemap_fdatawrite(mapping);
+		if (err)
+			break;
+
+	}
+	/*
+	 * Even if the above returned an error, the pages may have been
+	 * written partially (e.g. on -ENOSPC), so we wait for them.
+	 * But -EIO is a special case: it may indicate that the worst
+	 * thing (e.g. a bug) has happened, so we avoid waiting for it.
+	 */
+	if (err == -EIO)
+		goto out;
+
+	for (j = 0; j < i; j++) {
+		struct address_space * mapping = files[j]->f_mapping;
+		struct inode *inode = mapping->host;
+		struct ext4_inode_info *ei = EXT4_I(inode);
+		unsigned int datasync = flags[j];
+		tid_t tid;
+
+		if (mapping->nrpages) {
+			err2 = filemap_fdatawait(mapping);
+			if (!err || err2 == -EIO)
+				err = err2;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		force_commit  |= ext4_should_journal_data(inode);
+		datawriteback |= ext4_should_writeback_data(inode);
+		tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+		mutex_unlock(&inode->i_mutex);
+		trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync);
+		if (j == 0 || !tid_geq(commit_tid, tid))
+			commit_tid = tid;
+	}
+
+	/* Ext4 specific stuff starts here */
+	if (!journal) {
+		 return -ENOTSUPP;
+	} else if (force_commit) {
+		/* data=journal:
+		 *  filemap_fdatawrite won't do anything (the buffers are clean).
+		 *  ext4_force_commit will write the file data into the journal and
+		 *  will wait on that.
+		 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+		 *  (they were dirtied by commit).  But that's OK - the blocks are
+		 *  safe in-journal, which is all fsync() needs to ensure.
+		 */
+		err2 = ext4_force_commit(sb);
+	} else {
+		/*
+		 * data=writeback,ordered:
+		 * The caller's filemap_fdatawrite()/wait will sync the data.
+		 * Metadata is in the journal, we wait for proper transaction to
+		 * commit here.
+		 */
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+			need_barrier = true;
+
+		err2 = jbd2_complete_transaction(journal, commit_tid);
+		/* Even if we had to wait for commit completion, it does not
+		 * mean a flush has been issued after the data demanded by this
+		 * fsync was written back.  The commit could be in a state
+		 * where it has already completed, but not yet in a state
+		 * where we no longer need to wait.
+		 */
+		if (need_barrier)
+			err2 = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+	}
+out:
+	trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier);
+	if (!err || err2 == -EIO)
+		err = err2;
+	return err;
+}
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1039,6 +1039,11 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	if (err)
 		goto fail_drop;
 
+	if (check_bd_full(inode, 1)) {
+		err = -ENOSPC;
+		goto fail_free_drop;
+	}
+
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -49,6 +49,8 @@
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
+DEFINE_PER_CPU(unsigned long, ext4_bd_full_ratelimits) = 0;
+
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
 			      struct ext4_inode_info *ei)
 {
@@ -237,6 +239,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
+	if (inode->i_blocks && ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
 				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
@@ -496,7 +500,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
-		ext4_es_lru_add(inode);
+		ext4_es_list_add(inode);
 		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
 			map->m_pblk = ext4_es_pblock(&es) +
 					map->m_lblk - es.es_lblk;
@@ -1024,6 +1028,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 
 retry_journal:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		page_cache_release(page);
@@ -1134,6 +1142,10 @@ static int ext4_write_end(struct file *file,
 	 * page writeout could otherwise come in and zero beyond i_size.
 	 */
 	i_size_changed = ext4_update_inode_size(inode, pos + copied);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1207,6 +1219,9 @@ static int ext4_journalled_write_end(struct file *file,
 	size_changed = ext4_update_inode_size(inode, pos + copied);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1264,6 +1279,11 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
+	if (check_bd_full(inode, 1)) {
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+		return -ENOSPC;
+	}
+
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1499,7 +1519,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, &es)) {
-		ext4_es_lru_add(inode);
+		ext4_es_list_add(inode);
 		if (ext4_es_is_hole(&es)) {
 			retval = 0;
 			down_read(&EXT4_I(inode)->i_data_sem);
@@ -2616,8 +2636,15 @@ static int ext4_nonda_switch(struct super_block *sb)
 	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
 		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 
-	if (2 * free_clusters < 3 * dirty_clusters ||
-	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+	/*
+	 * NOTE: Delalloc makes data=writeback mode safer, similar to ordered
+	 * mode, so stale blocks after a power failure are no longer an issue.
+	 * Do not disable delalloc to guarantee data security in data=writeback
+	 * mode.
+	 *								-dmon
+	 */
+	if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_WRITEBACK_DATA &&
+	    (2 * free_clusters < 3 * dirty_clusters ||
+	     free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))) {
 		/*
 		 * free block count is less than 150% of dirty blocks
 		 * or free blocks is less than watermark
@@ -2691,6 +2718,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	 * of file which has an already mapped buffer.
 	 */
 retry_journal:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 				ext4_da_write_credits(inode, pos, len));
 	if (IS_ERR(handle)) {
@@ -2803,6 +2834,9 @@ static int ext4_da_write_end(struct file *file,
 		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	copied = ret2;
 	if (ret2 < 0)
 		ret = ret2;
@@ -3328,6 +3362,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	if (ext4_has_inline_data(inode))
 		return 0;
 
+	if ((rw == WRITE) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, -1);
+
 	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3858,6 +3896,9 @@ void ext4_truncate(struct inode *inode)
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
+
 	if (ext4_has_inline_data(inode)) {
 		int has_inline = 1;
 
@@ -4360,10 +4401,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
+		if (test_opt2(sb, PFCACHE_CSUM) && !ext4_load_data_csum(inode))
+			ext4_open_pfcache(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations.ops;
 		inode->i_fop = &ext4_dir_operations;
 		inode->i_flags |= S_IOPS_WRAPPER;
+		if (test_opt2(sb, PFCACHE_CSUM))
+			ext4_load_dir_csum(inode);
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext4_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -4391,6 +4436,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
+	if (test_opt2(sb, PFCACHE_CSUM))
+		ext4_load_data_csum(inode);
 	unlock_new_inode(inode);
 	return inode;
 
@@ -4446,6 +4493,63 @@ static int ext4_inode_blocks_set(handle_t *handle,
 	return 0;
 }
 
+struct other_inode {
+	unsigned long		orig_ino;
+	struct ext4_inode	*raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+			     void *data)
+{
+	struct other_inode *oi = (struct other_inode *) data;
+
+	if ((inode->i_ino != ino) ||
+	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+			       I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+	    ((inode->i_state & I_DIRTY_TIME) == 0))
+		return 0;
+	spin_lock(&inode->i_lock);
+	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+				I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+	    (inode->i_state & I_DIRTY_TIME)) {
+		struct ext4_inode_info	*ei = EXT4_I(inode);
+
+		inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+
+		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		spin_unlock(&inode->i_lock);
+		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+		return -1;
+	}
+	spin_unlock(&inode->i_lock);
+	return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+					  unsigned long orig_ino, char *buf)
+{
+	struct other_inode oi;
+	unsigned long ino;
+	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int inode_size = EXT4_INODE_SIZE(sb);
+
+	oi.orig_ino = orig_ino;
+	ino = (orig_ino & ~(inodes_per_block - 1)) + 1;
+	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+		if (ino == orig_ino)
+			continue;
+		oi.raw_inode = (struct ext4_inode *) buf;
+		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+	}
+}
+
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
@@ -4557,6 +4661,9 @@ static int ext4_do_update_inode(handle_t *handle,
 	}
 
 	ext4_inode_csum_set(inode, raw_inode, ei);
+	if (inode->i_sb->s_flags & MS_LAZYTIME)
+		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+					      bh->b_data);
 
 	spin_unlock(&ei->i_raw_lock);
 
@@ -4791,6 +4898,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 		}
 		if (attr->ia_size != inode->i_size) {
+			if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+				ext4_truncate_data_csum(inode, attr->ia_size);
+
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
@@ -5150,11 +5260,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * If the inode is marked synchronous, we don't honour that here - doing
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
+ * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp files.
  */
 void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
+	if (flags == I_DIRTY_TIME)
+		return;
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -79,8 +79,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
 	memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
 	ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
 	ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
-	ext4_es_lru_del(inode1);
-	ext4_es_lru_del(inode2);
+	ext4_es_list_del(inode1);
+	ext4_es_list_del(inode2);
 
 	isize = i_size_read(inode1);
 	i_size_write(inode1, i_size_read(inode2));
@@ -199,6 +199,60 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	return err;
 }
 
+static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct inode *balloon_ino;
+	int err, fd;
+	struct file *filp;
+	struct dentry *de;
+	struct path path;
+	fmode_t mode;
+
+	balloon_ino = EXT4_SB(sb)->s_balloon_ino;
+	err = -ENOENT;
+	if (balloon_ino == NULL)
+		goto err;
+
+	err = fd = get_unused_fd();
+	if (err < 0)
+		goto err_fd;
+
+	__iget(balloon_ino);
+	de = d_obtain_alias(balloon_ino);
+	err = PTR_ERR(de);
+	if (IS_ERR(de))
+		goto err_de;
+
+	path.dentry = de;
+	path.mnt = mntget(mnt);
+	err = mnt_want_write(path.mnt);
+	if (err)
+		mode = FMODE_READ;
+	else
+		mode = FMODE_READ | FMODE_WRITE;
+	filp = alloc_file(&path, mode,
+			&ext4_file_operations);
+	if (mode & FMODE_WRITE)
+		mnt_drop_write(path.mnt);
+	if (IS_ERR(filp)) {
+		err = PTR_ERR(filp);
+		goto err_filp;
+	}
+
+	filp->f_flags |= O_LARGEFILE;
+	fd_install(fd, filp);
+	return fd;
+
+err_filp:
+	path_put(&path);
+err_de:
+	put_unused_fd(fd);
+err_fd:
+	/* nothing */
+err:
+	return err;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -250,7 +304,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!ve_capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
 		}
 
@@ -259,7 +313,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
@@ -592,6 +646,66 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		ext4_resize_end(sb);
 		return err;
 	}
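+	/*
+	 * Persistently set the reserved blocks count to the __u64 passed
+	 * from user space: the superblock update is journalled and the
+	 * journal is flushed so the new value reaches the disk.
+	 */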
+	case EXT4_IOC_SET_RSV_BLOCKS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		handle_t *handle;
+		int err = 0, err2 = 0;
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		if (n_blocks_count > ext4_blocks_count(EXT4_SB(sb)->s_es))
+			return -EINVAL;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resize_out;
+
+		handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto mnt_out;
+		}
+		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+		if (err) {
+			goto journal_out;
+		}
+		ext4_r_blocks_count_set(EXT4_SB(sb)->s_es, n_blocks_count);
+		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+journal_out:
+		err2 = ext4_journal_stop(handle);
+		if (err == 0)
+			err = err2;
+
+		if (!err && EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+mnt_out:
+		mnt_drop_write(filp->f_path.mnt);
+resize_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
 
 	case FITRIM:
 	{
@@ -624,6 +738,100 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_PRECACHE_EXTENTS:
 		return ext4_ext_precache(inode);
 
+	case EXT4_IOC_OPEN_BALLOON:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return ext4_open_balloon(inode->i_sb, filp->f_path.mnt);
+
+	case FS_IOC_PFCACHE_OPEN:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_open_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_CLOSE:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_close_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_DUMP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return ext4_dump_pfcache(inode->i_sb,
+				(struct pfcache_dump_request __user *) arg);
+	case EXT4_IOC_MFSYNC:
+	{
+		struct ext4_ioc_mfsync_info mfsync;
+		struct file **filpp;
+		unsigned int *flags;
+		__u32 __user *usr_fd;
+		int i, err;
+
+		if (!ve_is_super(get_exec_env()))
+			return -ENOTSUPP;
+		if (copy_from_user(&mfsync, (struct ext4_ioc_mfsync_info *)arg,
+				   sizeof(mfsync)))
+			return -EFAULT;
+
+		if (mfsync.size == 0)
+			return 0;
+		if (mfsync.size > NR_FILE)
+			return -ENFILE;
+
+		usr_fd = (__u32 __user *) (arg + sizeof(__u32));
+
+		filpp = kzalloc(mfsync.size * sizeof(*filpp), GFP_KERNEL);
+		if (!filpp)
+			return -ENOMEM;
+		flags = kzalloc(mfsync.size * sizeof(*flags), GFP_KERNEL);
+		if (!flags) {
+			kfree(filpp);
+			return -ENOMEM;
+		}
+		for (i = 0; i < mfsync.size; i++) {
+			int fd;
+			int ret;
+
+			err = -EFAULT;
+			ret = get_user(fd, usr_fd + i);
+			if (ret)
+				goto mfsync_fput;
+
+			/* negative fd means fdata_sync */
+			flags[i] = (fd & (1 << 31)) != 0;
+			fd &= ~(1 << 31);
+
+			err = -EBADF;
+			filpp[i] = fget(fd);
+			if (!filpp[i])
+				goto mfsync_fput;
+		}
+		err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+		for (i = 0; i < mfsync.size; i++)
+			if (filpp[i])
+				fput(filpp[i]);
+		kfree(filpp);
+		kfree(flags);
+		return err;
+	}
 	default:
 		return -ENOTTY;
 	}
@@ -689,6 +897,10 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_RESIZE_FS:
 	case EXT4_IOC_PRECACHE_EXTENTS:
 		break;
+	case FS_IOC_PFCACHE_OPEN:
+	case FS_IOC_PFCACHE_CLOSE:
+	case FS_IOC_PFCACHE_DUMP:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
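
For illustration, a minimal user-space caller of the EXT4_IOC_MFSYNC
handler above might look like the sketch below.  It assumes the
EXT4_IOC_MFSYNC request code and struct ext4_ioc_mfsync_info are defined
as elsewhere in this series, with a __u32 count followed directly by the
descriptor array (the layout the copy_from_user() and get_user() offsets
above imply); bit 31 of a descriptor selects fdata_sync behaviour, and
the ioctl is only honoured in the host environment (ve_is_super()).

	#include <stdint.h>
	#include <sys/ioctl.h>

	/* Sync fd_a fully and fd_b with fdata_sync semantics. */
	static int mfsync_two(int ext4_fd, int fd_a, int fd_b)
	{
		uint32_t req[3];

		req[0] = 2;				/* number of descriptors */
		req[1] = (uint32_t)fd_a;		/* full fsync */
		req[2] = (uint32_t)fd_b | (1u << 31);	/* fdata_sync */

		return ioctl(ext4_fd, EXT4_IOC_MFSYNC, req);
	}
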
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4458,6 +4458,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			*errp = -EDQUOT;
 			goto out;
 		}
+
+		if (check_bd_full(ar->inode, inquota)) {
+			ar->len = 0;
+			*errp = -ENOSPC;
+			goto out;
+		}
 	}
 
 	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -27,120 +27,26 @@
  * @lblock:	logical block number to find an extent path
  * @path:	pointer to an extent path pointer (for output)
  *
- * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * ext4_find_extent wrapper. Return 0 on success, or a negative error value
  * on failure.
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **orig_path)
+		struct ext4_ext_path **ppath)
 {
-	int ret = 0;
 	struct ext4_ext_path *path;
 
-	path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+	path = ext4_ext_find_extent(inode, lblock, *ppath, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
-		ret = PTR_ERR(path);
-	else if (path[ext_depth(inode)].p_ext == NULL)
-		ret = -ENODATA;
-	else
-		*orig_path = path;
-
-	return ret;
-}
-
-/**
- * copy_extent_status - Copy the extent's initialization status
- *
- * @src:	an extent for getting initialize status
- * @dest:	an extent to be set the status
- */
-static void
-copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
-{
-	if (ext4_ext_is_unwritten(src))
-		ext4_ext_mark_unwritten(dest);
-	else
-		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
-}
-
-/**
- * mext_next_extent - Search for the next extent and set it to "extent"
- *
- * @inode:	inode which is searched
- * @path:	this will obtain data for the next extent
- * @extent:	pointer to the next extent we have just gotten
- *
- * Search the next extent in the array of ext4_ext_path structure (@path)
- * and set it to ext4_extent structure (@extent). In addition, the member of
- * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
- * ext4_ext_path structure refers to the last extent, or a negative error
- * value on failure.
- */
-int
-mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-		      struct ext4_extent **extent)
-{
-	struct ext4_extent_header *eh;
-	int ppos, leaf_ppos = path->p_depth;
-
-	ppos = leaf_ppos;
-	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
-		/* leaf block */
-		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
-		return 0;
-	}
-
-	while (--ppos >= 0) {
-		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
-		    path[ppos].p_idx) {
-			int cur_ppos = ppos;
-
-			/* index block */
-			path[ppos].p_idx++;
-			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
-			if (path[ppos+1].p_bh)
-				brelse(path[ppos+1].p_bh);
-			path[ppos+1].p_bh =
-				sb_bread(inode->i_sb, path[ppos].p_block);
-			if (!path[ppos+1].p_bh)
-				return -EIO;
-			path[ppos+1].p_hdr =
-				ext_block_hdr(path[ppos+1].p_bh);
-
-			/* Halfway index block */
-			while (++cur_ppos < leaf_ppos) {
-				path[cur_ppos].p_idx =
-					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
-				path[cur_ppos].p_block =
-					ext4_idx_pblock(path[cur_ppos].p_idx);
-				if (path[cur_ppos+1].p_bh)
-					brelse(path[cur_ppos+1].p_bh);
-				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
-					path[cur_ppos].p_block);
-				if (!path[cur_ppos+1].p_bh)
-					return -EIO;
-				path[cur_ppos+1].p_hdr =
-					ext_block_hdr(path[cur_ppos+1].p_bh);
-			}
-
-			path[leaf_ppos].p_ext = *extent = NULL;
-
-			eh = path[leaf_ppos].p_hdr;
-			if (le16_to_cpu(eh->eh_entries) == 0)
-				/* empty leaf is found */
-				return -ENODATA;
-
-			/* leaf block */
-			path[leaf_ppos].p_ext = *extent =
-				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
-			path[leaf_ppos].p_block =
-					ext4_ext_pblock(path[leaf_ppos].p_ext);
-			return 0;
-		}
+		return PTR_ERR(path);
+	if (path[ext_depth(inode)].p_ext == NULL) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		*ppath = NULL;
+		return -ENODATA;
 	}
-	/* We found the last extent */
-	return 1;
+	*ppath = path;
+	return 0;
 }
 
 /**
@@ -177,417 +83,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
 }
 
-/**
- * mext_insert_across_blocks - Insert extents across leaf block
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @o_start:		first original extent to be changed
- * @o_end:		last original extent to be changed
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- *
- * Allocate a new leaf block and insert extents into it. Return 0 on success,
- * or a negative error value on failure.
- */
-static int
-mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
-		struct ext4_extent *o_start, struct ext4_extent *o_end,
-		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
-		struct ext4_extent *end_ext)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	ext4_lblk_t eblock = 0;
-	int new_flag = 0;
-	int end_flag = 0;
-	int err = 0;
-
-	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
-		if (o_start == o_end) {
-
-			/*       start_ext   new_ext    end_ext
-			 * donor |---------|-----------|--------|
-			 * orig  |------------------------------|
-			 */
-			end_flag = 1;
-		} else {
-
-			/*       start_ext   new_ext   end_ext
-			 * donor |---------|----------|---------|
-			 * orig  |---------------|--------------|
-			 */
-			o_end->ee_block = end_ext->ee_block;
-			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-		}
-
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (start_ext->ee_len && new_ext->ee_len &&
-		   !end_ext->ee_len && o_start == o_end) {
-
-		/*	 start_ext	new_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (!start_ext->ee_len && new_ext->ee_len &&
-		   end_ext->ee_len && o_start == o_end) {
-
-		/*	  new_ext	end_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_end->ee_block = end_ext->ee_block;
-		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-
-		/*
-		 * Set 0 to the extent block if new_ext was
-		 * the first block.
-		 */
-		if (new_ext->ee_block)
-			eblock = le32_to_cpu(new_ext->ee_block);
-
-		new_flag = 1;
-	} else {
-		ext4_debug("ext4 move extent: Unexpected insert case\n");
-		return -EIO;
-	}
-
-	if (new_flag) {
-		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					orig_path, new_ext, 0))
-			goto out;
-	}
-
-	if (end_flag) {
-		err = get_ext_path(orig_inode,
-				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					   orig_path, end_ext, 0))
-			goto out;
-	}
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-
-	return err;
-
-}
-
-/**
- * mext_insert_inside_block - Insert new extent to the extent block
- *
- * @o_start:		first original extent to be moved
- * @o_end:		last original extent to be moved
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- * @eh:			extent header of target leaf block
- * @range_to_move:	used to decide how to insert extent
- *
- * Insert extents into the leaf block. The extent (@o_start) is overwritten
- * by inserted extents.
- */
-static void
-mext_insert_inside_block(struct ext4_extent *o_start,
-			      struct ext4_extent *o_end,
-			      struct ext4_extent *start_ext,
-			      struct ext4_extent *new_ext,
-			      struct ext4_extent *end_ext,
-			      struct ext4_extent_header *eh,
-			      int range_to_move)
-{
-	int i = 0;
-	unsigned long len;
-
-	/* Move the existing extents */
-	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
-		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
-			(unsigned long)(o_end + 1);
-		memmove(o_end + 1 + range_to_move, o_end + 1, len);
-	}
-
-	/* Insert start entry */
-	if (start_ext->ee_len)
-		o_start[i++].ee_len = start_ext->ee_len;
-
-	/* Insert new entry */
-	if (new_ext->ee_len) {
-		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
-	}
-
-	/* Insert end entry */
-	if (end_ext->ee_len)
-		o_start[i] = *end_ext;
-
-	/* Increment the total entries counter on the extent block */
-	le16_add_cpu(&eh->eh_entries, range_to_move);
-}
-
-/**
- * mext_insert_extents - Insert new extent
- *
- * @handle:	journal handle
- * @orig_inode:	original inode
- * @orig_path:	path indicates first extent to be changed
- * @o_start:	first original extent to be changed
- * @o_end:	last original extent to be changed
- * @start_ext:	first new extent to be inserted
- * @new_ext:	middle of new extent to be inserted
- * @end_ext:	last new extent to be inserted
- *
- * Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
- * on success, or a negative error value on failure.
- */
-static int
-mext_insert_extents(handle_t *handle, struct inode *orig_inode,
-			 struct ext4_ext_path *orig_path,
-			 struct ext4_extent *o_start,
-			 struct ext4_extent *o_end,
-			 struct ext4_extent *start_ext,
-			 struct ext4_extent *new_ext,
-			 struct ext4_extent *end_ext)
-{
-	struct  ext4_extent_header *eh;
-	unsigned long need_slots, slots_range;
-	int	range_to_move, depth, ret;
-
-	/*
-	 * The extents need to be inserted
-	 * start_extent + new_extent + end_extent.
-	 */
-	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
-		(new_ext->ee_len ? 1 : 0);
-
-	/* The number of slots between start and end */
-	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
-		/ sizeof(struct ext4_extent);
-
-	/* Range to move the end of extent */
-	range_to_move = need_slots - slots_range;
-	depth = orig_path->p_depth;
-	orig_path += depth;
-	eh = orig_path->p_hdr;
-
-	if (depth) {
-		/* Register to journal */
-		BUFFER_TRACE(orig_path->p_bh, "get_write_access");
-		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
-		if (ret)
-			return ret;
-	}
-
-	/* Expansion */
-	if (range_to_move > 0 &&
-		(range_to_move > le16_to_cpu(eh->eh_max)
-			- le16_to_cpu(eh->eh_entries))) {
-
-		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
-					o_end, start_ext, new_ext, end_ext);
-		if (ret < 0)
-			return ret;
-	} else
-		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
-						end_ext, eh, range_to_move);
-
-	return ext4_ext_dirty(handle, orig_inode, orig_path);
-}
-
-/**
- * mext_leaf_block - Move one leaf extent block into the inode.
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @orig_path:		path indicates first extent to be changed
- * @dext:		donor extent
- * @from:		start offset on the target file
- *
- * In order to insert extents into the leaf block, we must divide the extent
- * in the leaf block into three extents. The one is located to be inserted
- * extents, and the others are located around it.
- *
- * Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling mext_insert_extents() with
- * created extents. Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_leaf_block(handle_t *handle, struct inode *orig_inode,
-		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
-		     ext4_lblk_t *from)
-{
-	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
-	struct ext4_extent new_ext, start_ext, end_ext;
-	ext4_lblk_t new_ext_end;
-	int oext_alen, new_ext_alen, end_ext_alen;
-	int depth = ext_depth(orig_inode);
-	int ret;
-
-	start_ext.ee_block = end_ext.ee_block = 0;
-	o_start = o_end = oext = orig_path[depth].p_ext;
-	oext_alen = ext4_ext_get_actual_len(oext);
-	start_ext.ee_len = end_ext.ee_len = 0;
-
-	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
-	new_ext.ee_len = dext->ee_len;
-	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
-	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-
-	/*
-	 * Case: original extent is first
-	 * oext      |--------|
-	 * new_ext      |--|
-	 * start_ext |--|
-	 */
-	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
-		le32_to_cpu(new_ext.ee_block) <
-		le32_to_cpu(oext->ee_block) + oext_alen) {
-		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
-					       le32_to_cpu(oext->ee_block));
-		start_ext.ee_block = oext->ee_block;
-		copy_extent_status(oext, &start_ext);
-	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
-		prev_ext = oext - 1;
-		/*
-		 * We can merge new_ext into previous extent,
-		 * if these are contiguous and same extent type.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
-					       &new_ext)) {
-			o_start = prev_ext;
-			start_ext.ee_len = cpu_to_le16(
-				ext4_ext_get_actual_len(prev_ext) +
-				new_ext_alen);
-			start_ext.ee_block = oext->ee_block;
-			copy_extent_status(prev_ext, &start_ext);
-			new_ext.ee_len = 0;
-		}
-	}
-
-	/*
-	 * Case: new_ext_end must be less than oext
-	 * oext      |-----------|
-	 * new_ext       |-------|
-	 */
-	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		EXT4_ERROR_INODE(orig_inode,
-			"new_ext_end(%u) should be less than or equal to "
-			"oext->ee_block(%u) + oext_alen(%d) - 1",
-			new_ext_end, le32_to_cpu(oext->ee_block),
-			oext_alen);
-		ret = -EIO;
-		goto out;
-	}
-
-	/*
-	 * Case: new_ext is smaller than original extent
-	 * oext    |---------------|
-	 * new_ext |-----------|
-	 * end_ext             |---|
-	 */
-	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
-		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
-		end_ext.ee_len =
-			cpu_to_le16(le32_to_cpu(oext->ee_block) +
-			oext_alen - 1 - new_ext_end);
-		copy_extent_status(oext, &end_ext);
-		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
-		ext4_ext_store_pblock(&end_ext,
-			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
-		end_ext.ee_block =
-			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
-			oext_alen - end_ext_alen);
-	}
-
-	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
-				o_end, &start_ext, &new_ext, &end_ext);
-out:
-	return ret;
-}
-
-/**
- * mext_calc_swap_extents - Calculate extents for extent swapping.
- *
- * @tmp_dext:		the extent that will belong to the original inode
- * @tmp_oext:		the extent that will belong to the donor inode
- * @orig_off:		block offset of original inode
- * @donor_off:		block offset of donor inode
- * @max_count:		the maximum length of extents
- *
- * Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_calc_swap_extents(struct ext4_extent *tmp_dext,
-			      struct ext4_extent *tmp_oext,
-			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
-			      ext4_lblk_t max_count)
-{
-	ext4_lblk_t diff, orig_diff;
-	struct ext4_extent dext_old, oext_old;
-
-	BUG_ON(orig_off != donor_off);
-
-	/* original and donor extents have to cover the same block offset */
-	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
-	    le32_to_cpu(tmp_oext->ee_block) +
-			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
-		return -ENODATA;
-
-	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
-	    le32_to_cpu(tmp_dext->ee_block) +
-			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
-		return -ENODATA;
-
-	dext_old = *tmp_dext;
-	oext_old = *tmp_oext;
-
-	/* When tmp_dext is too large, pick up the target range. */
-	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-
-	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
-	le32_add_cpu(&tmp_dext->ee_block, diff);
-	le16_add_cpu(&tmp_dext->ee_len, -diff);
-
-	if (max_count < ext4_ext_get_actual_len(tmp_dext))
-		tmp_dext->ee_len = cpu_to_le16(max_count);
-
-	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
-
-	/* Adjust extent length if donor extent is larger than orig */
-	if (ext4_ext_get_actual_len(tmp_dext) >
-	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
-		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
-						orig_diff);
-
-	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
-
-	copy_extent_status(&oext_old, tmp_dext);
-	copy_extent_status(&dext_old, tmp_oext);
-
-	return 0;
-}
-
 /**
  * mext_check_coverage - Check that all extents in range has the same type
  *
@@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 	}
 	ret = 1;
 out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	return ret;
 }
 
-/**
- * mext_replace_branches - Replace original extents with new extents
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @donor_inode:	donor inode
- * @from:		block offset of orig_inode
- * @count:		block count to be replaced
- * @err:		pointer to save return value
- *
- * Replace original inode extents and donor inode extents page by page.
- * We implement this replacement in the following three steps:
- * 1. Save the block information of original and donor inodes into
- *    dummy extents.
- * 2. Change the block information of original inode to point at the
- *    donor inode blocks.
- * 3. Change the block information of donor inode to point at the saved
- *    original inode blocks in the dummy extents.
- *
- * Return replaced block count.
- */
-static int
-mext_replace_branches(handle_t *handle, struct inode *orig_inode,
-			   struct inode *donor_inode, ext4_lblk_t from,
-			   ext4_lblk_t count, int *err)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	struct ext4_ext_path *donor_path = NULL;
-	struct ext4_extent *oext, *dext;
-	struct ext4_extent tmp_dext, tmp_oext;
-	ext4_lblk_t orig_off = from, donor_off = from;
-	int depth;
-	int replaced_count = 0;
-	int dext_alen;
-
-	*err = ext4_es_remove_extent(orig_inode, from, count);
-	if (*err)
-		goto out;
-
-	*err = ext4_es_remove_extent(donor_inode, from, count);
-	if (*err)
-		goto out;
-
-	/* Get the original extent for the block "orig_off" */
-	*err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (*err)
-		goto out;
-
-	/* Get the donor extent for the head */
-	*err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (*err)
-		goto out;
-	depth = ext_depth(orig_inode);
-	oext = orig_path[depth].p_ext;
-	tmp_oext = *oext;
-
-	depth = ext_depth(donor_inode);
-	dext = donor_path[depth].p_ext;
-	if (unlikely(!dext))
-		goto missing_donor_extent;
-	tmp_dext = *dext;
-
-	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-				      donor_off, count);
-	if (*err)
-		goto out;
-
-	/* Loop for the donor extents */
-	while (1) {
-		/* The extent for donor must be found. */
-		if (unlikely(!dext)) {
-		missing_donor_extent:
-			EXT4_ERROR_INODE(donor_inode,
-				   "The extent for donor must be found");
-			*err = -EIO;
-			goto out;
-		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			EXT4_ERROR_INODE(donor_inode,
-				"Donor offset(%u) and the first block of donor "
-				"extent(%u) should be equal",
-				donor_off,
-				le32_to_cpu(tmp_dext.ee_block));
-			*err = -EIO;
-			goto out;
-		}
-
-		/* Set donor extent to orig extent */
-		*err = mext_leaf_block(handle, orig_inode,
-					   orig_path, &tmp_dext, &orig_off);
-		if (*err)
-			goto out;
-
-		/* Set orig extent to donor extent */
-		*err = mext_leaf_block(handle, donor_inode,
-					   donor_path, &tmp_oext, &donor_off);
-		if (*err)
-			goto out;
-
-		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
-		replaced_count += dext_alen;
-		donor_off += dext_alen;
-		orig_off += dext_alen;
-
-		BUG_ON(replaced_count > count);
-		/* Already moved the expected blocks */
-		if (replaced_count >= count)
-			break;
-
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		*err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(orig_inode);
-		oext = orig_path[depth].p_ext;
-		tmp_oext = *oext;
-
-		if (donor_path)
-			ext4_ext_drop_refs(donor_path);
-		*err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(donor_inode);
-		dext = donor_path[depth].p_ext;
-		tmp_dext = *dext;
-
-		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					   donor_off, count - replaced_count);
-		if (*err)
-			goto out;
-	}
-
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (donor_path) {
-		ext4_ext_drop_refs(donor_path);
-		kfree(donor_path);
-	}
-
-	return replaced_count;
-}
-
 /**
  * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
  *
  * @inode1:	the inode structure
  * @inode2:	the inode structure
- * @index:	page index
+ * @index1:	page index for @inode1
+ * @index2:	page index for @inode2
  * @page:	result page vector
  *
  * Grab two locked pages for inode's by inode order
  */
 static int
 mext_page_double_lock(struct inode *inode1, struct inode *inode2,
-		      pgoff_t index, struct page *page[2])
+		      pgoff_t index1, pgoff_t index2, struct page *page[2])
 {
 	struct address_space *mapping[2];
 	unsigned fl = AOP_FLAG_NOFS;
@@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 		mapping[0] = inode1->i_mapping;
 		mapping[1] = inode2->i_mapping;
 	} else {
+		pgoff_t tmp = index1;
+		index1 = index2;
+		index2 = tmp;
 		mapping[0] = inode2->i_mapping;
 		mapping[1] = inode1->i_mapping;
 	}
 
-	page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
 	if (!page[0])
 		return -ENOMEM;
 
-	page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
 	if (!page[1]) {
 		unlock_page(page[0]);
 		page_cache_release(page[0]);
@@ -893,25 +245,27 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
  * @o_filp:			file structure of original file
  * @donor_inode:		donor inode
  * @orig_page_offset:		page index on original file
+ * @donor_page_offset:		page index on donor file
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
  * @unwritten:			orig extent is unwritten or not
  * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling mext_replace_branches().
+ * with donor inode extents by calling ext4_swap_extents().
  * Finally, write out the saved data in new original inode blocks. Return
  * replaced block count.
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
-		  pgoff_t orig_page_offset, int data_offset_in_page,
-		  int block_len_in_page, int unwritten, int *err)
+		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+		     int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
-	ext4_lblk_t orig_blk_offset;
+	ext4_lblk_t orig_blk_offset, donor_blk_offset;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int tmp_data_size, data_size, replaced_size;
 	int i, err2, jblocks, retries = 0;
@@ -937,6 +291,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
+	donor_blk_offset = donor_page_offset * blocks_per_page +
+		data_offset_in_page;
+
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -957,7 +314,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	replaced_size = data_size;
 
 	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
-				     pagep);
+				     donor_page_offset, pagep);
 	if (unlikely(*err < 0))
 		goto stop_journal;
 	/*
@@ -976,7 +333,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		if (*err)
 			goto drop_data_sem;
 
-		unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
 						 block_len_in_page, 1, err);
 		if (*err)
 			goto drop_data_sem;
@@ -992,9 +349,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 			*err = -EBUSY;
 			goto drop_data_sem;
 		}
-		replaced_count = mext_replace_branches(handle, orig_inode,
-						donor_inode, orig_blk_offset,
-						block_len_in_page, err);
+		replaced_count = ext4_swap_extents(handle, orig_inode,
+						   donor_inode, orig_blk_offset,
+						   donor_blk_offset,
+						   block_len_in_page, 1, err);
 	drop_data_sem:
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 		goto unlock_pages;
@@ -1012,9 +370,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		goto unlock_pages;
 	}
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-					       orig_blk_offset,
-					       block_len_in_page, err);
+	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+					   orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 1, err);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (*err) {
 		if (replaced_count) {
@@ -1071,9 +429,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	 * Try to swap extents to it's original places
 	 */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
-					       orig_blk_offset,
-					       block_len_in_page, &err2);
+	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 0, &err2);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (replaced_count != block_len_in_page) {
 		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
@@ -1103,10 +461,14 @@ mext_check_arguments(struct inode *orig_inode,
 		     struct inode *donor_inode, __u64 orig_start,
 		     __u64 donor_start, __u64 *len)
 {
-	ext4_lblk_t orig_blocks, donor_blocks;
+	__u64 orig_eof, donor_eof;
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
+	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
+
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1122,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode,
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be swapfile [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
+		return -EBUSY;
 	}
 
 	if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
@@ -1149,67 +511,28 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Start offset should be same */
-	if (orig_start != donor_start) {
+	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
 		ext4_debug("ext4 move extent: orig and donor's start "
-			"offset are not same [ino:orig %lu, donor %lu]\n",
+			"offset are not alligned [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
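+	/*
+	 * For illustration: with 1k blocks on 4k pages (four blocks per
+	 * page), ~(PAGE_MASK >> i_blkbits) keeps only a block number's
+	 * offset within its page, so orig_start = 5 and donor_start = 9
+	 * (both at offset 1) pass the check above, while donor_start = 10
+	 * (offset 2) would be rejected with -EINVAL.
+	 */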
 
 	if ((orig_start >= EXT_MAX_BLOCKS) ||
+	    (donor_start >= EXT_MAX_BLOCKS) ||
 	    (*len > EXT_MAX_BLOCKS) ||
+	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
 	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
 		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
 			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-
-	if (orig_inode->i_size > donor_inode->i_size) {
-		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start >= donor_blocks) {
-			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file blocks "
-			"[%u] [ino:orig %lu, donor %lu]\n",
-			orig_start, donor_blocks,
-			orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start + *len > donor_blocks) {
-			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file blocks [%u]."
-				"So adjust length from %llu to %llu "
-				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_blocks,
-				*len, donor_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_blocks - orig_start;
-		}
-	} else {
-		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
-		if (orig_start >= orig_blocks) {
-			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file blocks "
-				"[%u] [ino:orig %lu, donor %lu]\n",
-				 orig_start, orig_blocks,
-				orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		if (orig_start + *len > orig_blocks) {
-			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %llu. Because it should be "
-				"less than original file blocks "
-				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_blocks - orig_start;
-		}
-	}
-
+	if (orig_eof < orig_start + *len - 1)
+		*len = orig_eof - orig_start;
+	if (donor_eof < donor_start + *len - 1)
+		*len = donor_eof - donor_start;
 	if (!*len) {
 		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
@@ -1225,60 +548,27 @@ mext_check_arguments(struct inode *orig_inode,
  *
  * @o_filp:		file structure of the original file
  * @d_filp:		file structure of the donor file
- * @orig_start:		start offset in block for orig
- * @donor_start:	start offset in block for donor
+ * @orig_blk:		start offset in block for orig
+ * @donor_blk:		start offset in block for donor
  * @len:		the number of blocks to be moved
  * @moved_len:		moved block length
  *
  * This function returns 0 and moved block length is set in moved_len
  * if succeed, otherwise returns error value.
  *
- * Note: ext4_move_extents() proceeds the following order.
- * 1:ext4_move_extents() calculates the last block number of moving extent
- *   function by the start block number (orig_start) and the number of blocks
- *   to be moved (len) specified as arguments.
- *   If the {orig, donor}_start points a hole, the extent's start offset
- *   pointed by ext_cur (current extent), holecheck_path, orig_path are set
- *   after hole behind.
- * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
- *   or the ext_cur exceeds the block_end which is last logical block number.
- * 3:To get the length of continues area, call mext_next_extent()
- *   specified with the ext_cur (initial value is holecheck_path) re-cursive,
- *   until find un-continuous extent, the start logical block number exceeds
- *   the block_end or the extent points to the last extent.
- * 4:Exchange the original inode data with donor inode data
- *   from orig_page_offset to seq_end_page.
- *   The start indexes of data are specified as arguments.
- *   That of the original inode is orig_page_offset,
- *   and the donor inode is also orig_page_offset
- *   (To easily handle blocksize != pagesize case, the offset for the
- *   donor inode is block unit).
- * 5:Update holecheck_path and orig_path to points a next proceeding extent,
- *   then returns to step 2.
- * 6:Release holecheck_path, orig_path and set the len to moved_len
- *   which shows the number of moved blocks.
- *   The moved_len is useful for the command to calculate the file offset
- *   for starting next move extent ioctl.
- * 7:Return 0 on success, or a negative error value on failure.
  */
 int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
-		 __u64 orig_start, __u64 donor_start, __u64 len,
-		 __u64 *moved_len)
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		  __u64 donor_blk, __u64 len, __u64 *moved_len)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct inode *donor_inode = file_inode(d_filp);
-	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
-	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
-	ext4_lblk_t block_start = orig_start;
-	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
-	ext4_lblk_t rest_blocks;
-	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	struct ext4_ext_path *path = NULL;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
-	int data_offset_in_page;
-	int block_len_in_page;
-	int unwritten;
+	ext4_lblk_t o_end, o_start = orig_blk;
+	ext4_lblk_t d_start = donor_blk;
+	int ret;
+	__u64 m_len = *moved_len;
 
 	if (orig_inode->i_sb != donor_inode->i_sb) {
 		ext4_debug("ext4 move extent: The argument files "
@@ -1320,121 +610,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	/* Protect extent tree against block allocations via delalloc */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
-				    donor_start, &len);
-	if (ret)
-		goto out;
-
-	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
-	block_end = block_start + len - 1;
-	if (file_end < block_end)
-		len -= block_end - file_end;
-
-	ret = get_ext_path(orig_inode, block_start, &orig_path);
-	if (ret)
-		goto out;
-
-	/* Get path structure to check the hole */
-	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
 	if (ret)
 		goto out;
+	o_end = o_start + len;
 
-	depth = ext_depth(orig_inode);
-	ext_cur = holecheck_path[depth].p_ext;
-
-	/*
-	 * Get proper starting location of block replacement if block_start was
-	 * within the hole.
-	 */
-	if (le32_to_cpu(ext_cur->ee_block) +
-		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
-		/*
-		 * The hole exists between extents or the tail of
-		 * original file.
-		 */
-		last_extent = mext_next_extent(orig_inode,
-					holecheck_path, &ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
-			goto out;
-		}
-		last_extent = mext_next_extent(orig_inode, orig_path,
-							&ext_dummy);
-		if (last_extent < 0) {
-			ret = last_extent;
-			goto out;
-		}
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
-		/* The hole exists at the beginning of original file. */
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	else
-		seq_start = block_start;
-
-	/* No blocks within the specified range. */
-	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-		ext4_debug("ext4 move extent: The specified range of file "
-							"may be the hole\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* Adjust start blocks */
-	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
-			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
-		     max(le32_to_cpu(ext_cur->ee_block), block_start);
-
-	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
-		seq_blocks += add_blocks;
+	while (o_start < o_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk, next_blk;
+		pgoff_t orig_page_index, donor_page_index;
+		int offset_in_page;
+		int unwritten, cur_len;
 
-		/* Adjust tail blocks */
-		if (seq_start + seq_blocks - 1 > block_end)
-			seq_blocks = block_end - seq_start + 1;
-
-		ext_prev = ext_cur;
-		last_extent = mext_next_extent(orig_inode, holecheck_path,
-						&ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
+		ret = get_ext_path(orig_inode, o_start, &path);
+		if (ret)
 			break;
-		}
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-
-		/*
-		 * Extend the length of contiguous block (seq_blocks)
-		 * if extents are contiguous.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode,
-					       ext_prev, ext_cur) &&
-		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
-		    !last_extent)
+		ex = path[path->p_depth].p_ext;
+		next_blk = ext4_ext_next_allocated_block(path);
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* Check hole before the start pos */
+		if (cur_blk + cur_len - 1 < o_start) {
+			if (next_blk == EXT_MAX_BLOCKS) {
+				o_start = o_end;
+				ret = -ENODATA;
+				break;
+			}
+			d_start += next_blk - o_start;
+			o_start = next_blk;
 			continue;
-
-		/* Is original extent is unwritten */
-		unwritten = ext4_ext_is_unwritten(ext_prev);
-
-		data_offset_in_page = seq_start % blocks_per_page;
-
-		/*
-		 * Calculate data blocks count that should be swapped
-		 * at the first page.
-		 */
-		if (data_offset_in_page + seq_blocks > blocks_per_page) {
-			/* Swapped blocks are across pages */
-			block_len_in_page =
-					blocks_per_page - data_offset_in_page;
-		} else {
-			/* Swapped blocks are in a page */
-			block_len_in_page = seq_blocks;
+		/* Check hole after the start pos */
+		} else if (cur_blk > o_start) {
+			/* Skip hole */
+			d_start += cur_blk - o_start;
+			o_start = cur_blk;
+			/* Extent inside requested range? */
+			if (cur_blk >= o_end)
+				break;
+		} else { /* in_range(o_start, o_blk, o_len) */
+			cur_len += cur_blk - o_start;
 		}
-
-		orig_page_offset = seq_start >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_end_page = (seq_start + seq_blocks - 1) >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-		rest_blocks = seq_blocks;
-
+		unwritten = ext4_ext_is_unwritten(ex);
+		if (o_end - o_start < cur_len)
+			cur_len = o_end - o_start;
+
+		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+					       orig_inode->i_blkbits);
+		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+					       donor_inode->i_blkbits);
+		offset_in_page = o_start % blocks_per_page;
+		if (cur_len > blocks_per_page - offset_in_page)
+			cur_len = blocks_per_page - offset_in_page;
 		/*
 		 * Up semaphore to avoid following problems:
 		 * a. transaction deadlock among ext4_journal_start,
@@ -1443,77 +670,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 *    in move_extent_per_page
 		 */
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
-
-		while (orig_page_offset <= seq_end_page) {
-
-			/* Swap original branches with new branches */
-			block_len_in_page = move_extent_per_page(
-						o_filp, donor_inode,
-						orig_page_offset,
-						data_offset_in_page,
-						block_len_in_page,
-						unwritten, &ret);
-
-			/* Count how many blocks we have exchanged */
-			*moved_len += block_len_in_page;
-			if (ret < 0)
-				break;
-			if (*moved_len > len) {
-				EXT4_ERROR_INODE(orig_inode,
-					"We replaced blocks too much! "
-					"sum of replaced: %llu requested: %llu",
-					*moved_len, len);
-				ret = -EIO;
-				break;
-			}
-
-			orig_page_offset++;
-			data_offset_in_page = 0;
-			rest_blocks -= block_len_in_page;
-			if (rest_blocks > blocks_per_page)
-				block_len_in_page = blocks_per_page;
-			else
-				block_len_in_page = rest_blocks;
-		}
-
+		/* Swap original branches with new branches */
+		move_extent_per_page(o_filp, donor_inode,
+				     orig_page_index, donor_page_index,
+				     offset_in_page, cur_len,
+				     unwritten, &ret);
 		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret < 0)
 			break;
-
-		/* Decrease buffer counter */
-		if (holecheck_path)
-			ext4_ext_drop_refs(holecheck_path);
-		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (ret)
-			break;
-		depth = holecheck_path->p_depth;
-
-		/* Decrease buffer counter */
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		ret = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (ret)
-			break;
-
-		ext_cur = holecheck_path[depth].p_ext;
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-		seq_blocks = 0;
-
+		o_start += cur_len;
+		d_start += cur_len;
+		m_len += cur_len;
 	}
 out:
-	if (*moved_len) {
-		ext4_discard_preallocations(orig_inode);
-		ext4_discard_preallocations(donor_inode);
-	}
-
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (holecheck_path) {
-		ext4_ext_drop_refs(holecheck_path);
-		kfree(holecheck_path);
-	}
+	WARN_ON(m_len > len);
+	if (ret == 0)
+		*moved_len = m_len;
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -34,6 +34,7 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
+#include <linux/virtinfo.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -96,6 +97,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int err = 0, is_dx_block = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	bh = ext4_bread(NULL, inode, block, 0, &err);
 	if (!bh) {
 		if (err == 0) {
@@ -1441,6 +1444,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 					 ino);
 			return ERR_PTR(-EIO);
 		}
+		if (!IS_ERR(inode) &&
+		    inode == EXT4_SB(inode->i_sb)->s_balloon_ino) {
+			iput(inode);
+			return ERR_PTR(-EPERM);
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -2269,6 +2277,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
+	if (!err && S_ISREG(mode) && ext4_want_data_csum(dir))
+		ext4_start_data_csum(inode);
 	return err;
 }
 
@@ -2421,6 +2431,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	err = ext4_mark_inode_dirty(handle, dir);
 	if (err)
 		goto out_clear_inode;
+	if (ext4_test_inode_state(dir, EXT4_STATE_PFCACHE_CSUM))
+		ext4_save_dir_csum(inode);
 	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 	if (IS_DIRSYNC(dir))
@@ -2784,6 +2796,10 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
+	retval = -EPERM;
+	if (inode == EXT4_SB(dir->i_sb)->s_balloon_ino)
+		goto end_unlink;
+
 	if (!inode->i_nlink) {
 		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
@@ -3218,7 +3234,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	retval = -ENOENT;
 	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
-		goto end_rename;
+		goto out_release;
 
 	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
 				 &new.de, &new.inlined);
@@ -3352,6 +3368,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
+	if (handle)
+		ext4_journal_stop(handle);
+out_release:
 	brelse(old.dir_bh);
 	brelse(old.bh);
 	brelse(new.bh);
@@ -3361,8 +3380,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		unlock_new_inode(whiteout);
 		iput(whiteout);
 	}
-	if (handle)
-		ext4_journal_stop(handle);
 	return retval;
 }
 
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -397,6 +397,23 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
 	return 0;
 }
 
+
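+/*
+ * Throttle buffered writeback: wait (uninterruptibly) on the backing
+ * device's congestion waitqueue until it is no longer write-congested.
+ * ext4_bio_write_page() below calls this before submitting buffers,
+ * except for writeback issued from reclaim.
+ */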
+static void bdi_congestion_wait(struct backing_dev_info *bdi)
+{
+	DEFINE_WAIT(_wait);
+
+	for (;;) {
+		prepare_to_wait(&bdi->cong_waitq, &_wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!bdi_write_congested2(bdi))
+			break;
+
+		io_schedule();
+	}
+
+	finish_wait(&bdi->cong_waitq, &_wait);
+}
+
 int ext4_bio_write_page(struct ext4_io_submit *io,
 			struct page *page,
 			int len,
@@ -462,6 +479,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		set_buffer_async_write(bh);
 	} while ((bh = bh->b_this_page) != head);
 
+	if (!wbc->for_reclaim &&
+	    bdi_write_congested2(page->mapping->backing_dev_info))
+		bdi_congestion_wait(page->mapping->backing_dev_info);
+
 	/* Now submit buffers to write */
 	bh = head = page_buffers(page);
 	do {
--- /dev/null
+++ b/fs/ext4/pfcache.c
@@ -0,0 +1,774 @@
+/*
+ *  fs/ext4/pfcache.c
+ *
+ *  Automatic SHA-1 (FIPS 180-1) data checksumming
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *  Author: Konstantin Khlebnikov
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/cryptohash.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/init_task.h>	/* for init_cred */
+#include <linux/memcontrol.h>
+#include "ext4.h"
+#include "xattr.h"
+#include "../internal.h"
+
+#define PFCACHE_MAX_PATH	(EXT4_DATA_CSUM_SIZE * 2 + 2)
+static void pfcache_path(struct inode *inode, char *path)
+{
+	char *p;
+	int i;
+
+	/* like .git/objects hex[0]/hex[1..] */
+	p = pack_hex_byte(path, EXT4_I(inode)->i_data_csum[0]);
+	*p++ = '/';
+	for (i = 1; i < EXT4_DATA_CSUM_SIZE; i++)
+		p = pack_hex_byte(p, EXT4_I(inode)->i_data_csum[i]);
+	*p = 0;
+}
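+
+/*
+ * For illustration, assuming EXT4_DATA_CSUM_SIZE is the 20-byte SHA-1
+ * digest size: a checksum starting with bytes da 39 a3 ... maps to the
+ * relative path "da/39a3..." -- a two-character directory for the
+ * first byte plus the remaining 38 hex digits as the file name -- so
+ * PFCACHE_MAX_PATH (2 * 20 + 2) covers the 40 hex digits, the '/'
+ * separator and the trailing NUL.
+ */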
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_open_pfcache(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	const struct cred *cur_cred;
+	char name[PFCACHE_MAX_PATH];
+	struct path root, path;
+	int ret;
+
+	if (inode->i_mapping->i_peer_file)
+		return -EBUSY;
+
+	if (!(ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	      EXT4_I(inode)->i_data_csum_end < 0))
+		return -ENODATA;
+
+	if (!EXT4_SB(sb)->s_pfcache_root.mnt)
+		return -ENODEV;
+
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	root = EXT4_SB(sb)->s_pfcache_root;
+	path_get(&root);
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+
+	if (!root.mnt)
+		return -ENODEV;
+
+	pfcache_path(inode, name);
+
+	/*
+	 * Lookups over shared area shouldn't be accounted to any particular
+	 * memory cgroup, otherwise a cgroup can be pinned indefinitely
+	 * after destruction, because a file or directory located in this
+	 * area is likely to be in use by other containers or by the host.
+	 */
+	memcg_stop_kmem_account();
+
+	cur_cred = override_creds(&init_cred);
+	/*
+	 * Files in the cache area must not have csum attributes, or
+	 * pfcache must be disabled for the underlying filesystem;
+	 * otherwise real lock recursion can happen on i_mutex.
+	 * Here we disable lockdep to avoid false-positive reports.
+	 */
+	lockdep_off();
+	ret = vfs_path_lookup(root.dentry, root.mnt, name, 0, &path);
+	lockdep_on();
+	revert_creds(cur_cred);
+	path_put(&root);
+	if (ret)
+		goto out;
+
+	ret = open_mapping_peer(inode->i_mapping, &path, &init_cred);
+	if (!ret)
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	path_put(&path);
+out:
+	memcg_resume_kmem_account();
+	return ret;
+}
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_close_pfcache(struct inode *inode)
+{
+	if (!inode->i_mapping->i_peer_file)
+		return -ENOENT;
+	close_mapping_peer(inode->i_mapping);
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	return 0;
+}
+
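+/*
+ * Switch the pfcache root of a mounted filesystem to new_root (or
+ * detach it when new_root is NULL): update s_pfcache_root, then walk
+ * sb->s_inodes, closing peer files that no longer match and opening
+ * peers from the new cache area for inodes that carry a data checksum.
+ */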
+/* under sb->s_umount write lock */
+int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb)
+{
+	int old_root = !!EXT4_SB(sb)->s_pfcache_root.mnt;
+	struct inode *inode, *old_inode = NULL;
+	struct file *file;
+	long nr_opened = 0, nr_closed = 0, nr_total;
+	bool reload_csum = false;
+	struct path root, path;
+
+	if (new_root) {
+		int err;
+
+		err = kern_path(new_root, LOOKUP_DIRECTORY, &root);
+		if (err) {
+			printk(KERN_ERR"PFCache: lookup \"%s\" failed %d\n",
+					new_root, err);
+			return new_sb ? 0 : err;
+		}
+		if (!test_opt2(sb, PFCACHE_CSUM)) {
+			set_opt2(sb, PFCACHE_CSUM);
+			reload_csum = true;
+		}
+	} else {
+		root.mnt = NULL;
+		root.dentry = NULL;
+	}
+
+	if (new_sb) {
+		path_put(&EXT4_SB(sb)->s_pfcache_root);
+		EXT4_SB(sb)->s_pfcache_root = root;
+		return 0;
+	}
+
+	path_get(&root);
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	path = EXT4_SB(sb)->s_pfcache_root;
+	EXT4_SB(sb)->s_pfcache_root = root;
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+	path_put(&path);
+
+	spin_lock(&inode_sb_list_lock);
+
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			continue;
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			if (!reload_csum)
+				continue;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0))
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_sb_list_lock);
+		iput(old_inode);
+		old_inode = inode;
+
+		path.mnt = NULL;
+		path.dentry = NULL;
+
+		mutex_lock(&inode->i_mutex);
+
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			if (!reload_csum)
+				goto next;
+			if (S_ISDIR(inode->i_mode)) {
+				ext4_load_dir_csum(inode);
+				goto next;
+			}
+			if (ext4_load_data_csum(inode))
+				goto next;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0) ||
+				S_ISDIR(inode->i_mode))
+			goto next;
+
+		if (new_root) {
+			char name[PFCACHE_MAX_PATH];
+			const struct cred *cur_cred;
+			int err;
+
+			pfcache_path(inode, name);
+			cur_cred = override_creds(&init_cred);
+			err = vfs_path_lookup(root.dentry, root.mnt,
+					name, 0, &path);
+			revert_creds(cur_cred);
+			if (err) {
+				path.mnt = NULL;
+				path.dentry = NULL;
+			}
+		}
+
+		file = inode->i_mapping->i_peer_file;
+		if ((!path.mnt && !file) || (path.mnt && file &&
+		     file->f_mapping == path.dentry->d_inode->i_mapping))
+			goto next;
+
+		if (file) {
+			close_mapping_peer(inode->i_mapping);
+			nr_closed++;
+		}
+
+		if (path.mnt) {
+			if (!open_mapping_peer(inode->i_mapping,
+						&path, &init_cred))
+				nr_opened++;
+		}
+next:
+		mutex_unlock(&inode->i_mutex);
+		path_put(&path);
+		cond_resched();
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+
+	percpu_counter_add(&EXT4_SB(sb)->s_pfcache_peers,
+			   nr_opened - nr_closed);
+	nr_total = percpu_counter_sum(&EXT4_SB(sb)->s_pfcache_peers);
+
+	if (new_root && (old_root || nr_total))
+		printk(KERN_INFO"PFCache: relink %u:%u to \"%s\""
+				" +%ld -%ld =%ld peers\n",
+				MAJOR(sb->s_dev), MINOR(sb->s_dev), new_root,
+				nr_opened, nr_closed, nr_total);
+	if (!new_root && nr_total)
+		printk(KERN_ERR"PFCache: %ld peers lost\n", nr_total);
+
+	path_put(&root);
+
+	return 0;
+}
+
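+/* Drop inode_sb_list_lock after this many inodes to bound lock hold time. */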
+#define MAX_LOCK_BATCH	256
+
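+/*
+ * Walk all regular inodes on the superblock and, for those matching the
+ * requested filter, copy the selected payload (csum, file handle, state,
+ * size, pages) to the user buffer.
+ */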
+long ext4_dump_pfcache(struct super_block *sb,
+		      struct pfcache_dump_request __user *user_req)
+{
+	struct inode *inode, *old_inode = NULL;
+	struct pfcache_dump_request req;
+	u8 __user *user_buffer;
+	u64 state, *x;
+	void *buffer, *p;
+	long ret, size;
+	int lock_batch = 0;
+
+	if (copy_from_user(&req, user_req, sizeof(req)))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_WRITE, user_req,
+		       req.header_size + req.buffer_size))
+		return -EFAULT;
+
+	/* check for unknown flags */
+	if ((req.filter & ~PFCACHE_FILTER_MASK) ||
+	    (req.payload & ~PFCACHE_PAYLOAD_MASK))
+		return -EINVAL;
+
+	buffer = kzalloc(PFCACHE_PAYLOAD_MAX_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	ret = 0;
+	/* skip all new fields in the user request header */
+	user_buffer = (void *)user_req + req.header_size;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) ||
+		    inode == EXT4_SB(sb)->s_balloon_ino)
+			goto next;
+
+		/* evaluate the inode state */
+		state = 0;
+
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+		    EXT4_I(inode)->i_data_csum_end < 0)
+			state |= PFCACHE_FILTER_WITH_CSUM;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_CSUM;
+
+		if (inode->i_mapping->i_peer_file)
+			state |= PFCACHE_FILTER_WITH_PEER;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_PEER;
+
+		/* check state-filter */
+		if (req.filter & state)
+			goto next;
+
+		/* check csum-filter */
+		if ((req.filter & PFCACHE_FILTER_COMPARE_CSUM) &&
+		    memcmp(EXT4_I(inode)->i_data_csum,
+			    req.csum_filter, EXT4_DATA_CSUM_SIZE))
+			goto next;
+
+		/* -- add new filters above this line -- */
+
+		/* check the offset filter last */
+		if (req.offset > 0) {
+			req.offset--;
+			goto next;
+		}
+
+		/* construct the payload */
+		p = buffer;
+
+		if (req.payload & PFCACHE_PAYLOAD_CSUM) {
+			BUILD_BUG_ON(PFCACHE_CSUM_SIZE != EXT4_DATA_CSUM_SIZE);
+			if (state & PFCACHE_FILTER_WITH_CSUM)
+				memcpy(p, EXT4_I(inode)->i_data_csum,
+						EXT4_DATA_CSUM_SIZE);
+			else
+				memset(p, 0, EXT4_DATA_CSUM_SIZE);
+			p += ALIGN(PFCACHE_CSUM_SIZE, sizeof(u64));
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FHANDLE) {
+			unsigned *x = p;
+
+			*x++ = 8;
+			*x++ = FILEID_INO32_GEN;
+			*x++ = inode->i_ino;
+			*x++ = inode->i_generation;
+			p += 16;
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_STATE) {
+			x = p;
+			*x = state;
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FSIZE) {
+			x = p;
+			*x = i_size_read(inode);
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_PAGES) {
+			x = p;
+			*x = inode->i_mapping->nrpages;
+			p += sizeof(u64);
+		}
+
+		/* -- add new payloads above this line -- */
+
+		size = p - buffer;
+		BUG_ON(!IS_ALIGNED(size, sizeof(u64)));
+		BUG_ON(size > PFCACHE_PAYLOAD_MAX_SIZE);
+
+		if (size > req.buffer_size)
+			goto out;
+
+		pagefault_disable();
+		if (!__copy_to_user_inatomic(user_buffer, buffer, size)) {
+			pagefault_enable();
+		} else {
+			pagefault_enable();
+			__iget(inode);
+			spin_unlock(&inode_sb_list_lock);
+			iput(old_inode);
+			old_inode = inode;
+			if (copy_to_user(user_buffer, buffer, size)) {
+				ret = -EFAULT;
+				goto out_nolock;
+			}
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_sb_list_lock);
+		}
+
+		ret++;
+		user_buffer += size;
+		req.buffer_size -= size;
+next:
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -EINTR;
+			goto out;
+		}
+		if (++lock_batch > MAX_LOCK_BATCH || need_resched() ||
+				spin_needbreak(&inode_sb_list_lock)) {
+			__iget(inode);
+			spin_unlock(&inode_sb_list_lock);
+			iput(old_inode);
+			old_inode = inode;
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_sb_list_lock);
+		}
+	}
+out:
+	spin_unlock(&inode_sb_list_lock);
+out_nolock:
+	iput(old_inode);
+
+	kfree(buffer);
+
+	return ret;
+}
+
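+/* Start a fresh running SHA-1 over file data, accounted as a partial csum. */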
+static void ext4_init_data_csum(struct inode *inode)
+{
+	EXT4_I(inode)->i_data_csum_end = 0;
+	sha_init((__u32 *)EXT4_I(inode)->i_data_csum);
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
+void ext4_clear_data_csum(struct inode *inode)
+{
+	ext4_clear_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (EXT4_I(inode)->i_data_csum_end < 0)
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	else
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
+void ext4_start_data_csum(struct inode *inode)
+{
+	if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		spin_lock(&inode->i_lock);
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+			ext4_init_data_csum(inode);
+		spin_unlock(&inode->i_lock);
+	}
+}
+
+int ext4_load_data_csum(struct inode *inode)
+{
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE);
+	if (ret < 0)
+		return ret;
+	if (ret != EXT4_DATA_CSUM_SIZE)
+		return -EIO;
+
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	return 0;
+}
+
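+/*
+ * Record a completed data checksum: store it in the trusted xattr, mark
+ * the inode as fully checksummed and (re)open its pfcache peer.
+ */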
+static int ext4_save_data_csum(struct inode *inode, u8 *csum)
+{
+	int ret;
+
+	WARN_ON(journal_current_handle());
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0 &&
+	    memcmp(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE))
+		ext4_close_pfcache(inode);
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_clear_data_csum(inode);
+	memcpy(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE);
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	spin_unlock(&inode->i_lock);
+
+	ext4_open_pfcache(inode);
+
+	/* In order to guarantee csum consistency, force block allocation first */
+	ret = ext4_alloc_da_blocks(inode);
+	if (ret)
+		return ret;
+
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE, 0);
+}
+
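+/* Directories store a fixed marker value instead of a data checksum. */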
+void ext4_load_dir_csum(struct inode *inode)
+{
+	char value[EXT4_DIR_CSUM_VALUE_LEN];
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			     EXT4_DATA_CSUM_NAME, value, sizeof(value));
+	if (ret == EXT4_DIR_CSUM_VALUE_LEN &&
+	    !strncmp(value, EXT4_DIR_CSUM_VALUE, sizeof(value)))
+		ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+}
+
+void ext4_save_dir_csum(struct inode *inode)
+{
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME,
+			EXT4_DIR_CSUM_VALUE,
+			EXT4_DIR_CSUM_VALUE_LEN, 0);
+}
+
+void ext4_truncate_data_csum(struct inode *inode, loff_t pos)
+{
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	if (EXT4_I(inode)->i_data_csum_end < 0) {
+		WARN_ON(journal_current_handle());
+		ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				EXT4_DATA_CSUM_NAME, NULL, 0, 0);
+		ext4_close_pfcache(inode);
+	}
+	spin_lock(&inode->i_lock);
+	ext4_clear_data_csum(inode);
+	if (!pos && test_opt2(inode->i_sb, PFCACHE_CSUM))
+		ext4_init_data_csum(inode);
+	spin_unlock(&inode->i_lock);
+}
+
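+/* Drop the running checksum if a write does not continue at its end. */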
+void ext4_check_pos_data_csum(struct inode *inode, loff_t pos)
+{
+	if ((pos & ~(loff_t)(SHA_MESSAGE_BYTES-1)) !=
+	    EXT4_I(inode)->i_data_csum_end)
+		ext4_truncate_data_csum(inode, pos);
+}
+
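+/* Feed consecutive 64-byte blocks through the raw SHA-1 block transform. */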
+static void sha_batch_transform(__u32 *digest, const char *data, unsigned rounds)
+{
+	__u32 temp[SHA_WORKSPACE_WORDS];
+
+	while (rounds--) {
+		sha_transform(digest, data, temp);
+		data += SHA_MESSAGE_BYTES;
+	}
+}
+
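+/*
+ * Extend the running checksum with whole SHA-1 blocks from the page just
+ * written; any partial tail is handled by ext4_finish_data_csum().
+ */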
+void ext4_update_data_csum(struct inode *inode, loff_t pos,
+			   unsigned len, struct page *page)
+{
+	__u32 *digest = (__u32 *)EXT4_I(inode)->i_data_csum;
+	u8 *kaddr, *data;
+
+	if (!len)
+		return;
+
+	len += pos & (SHA_MESSAGE_BYTES-1);
+	len &= ~(SHA_MESSAGE_BYTES-1);
+	pos &= ~(loff_t)(SHA_MESSAGE_BYTES-1);
+
+	BUG_ON(pos != EXT4_I(inode)->i_data_csum_end);
+	EXT4_I(inode)->i_data_csum_end += len;
+
+	kaddr = kmap_atomic(page);
+	data = kaddr + (pos & (PAGE_CACHE_SIZE - 1));
+	sha_batch_transform(digest, data, len / SHA_MESSAGE_BYTES);
+	kunmap_atomic(kaddr);
+}
+
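+/*
+ * Apply standard SHA-1 padding (0x80, zeroes, 64-bit bit length) to the
+ * file tail and produce the final big-endian digest.
+ */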
+static int ext4_finish_data_csum(struct inode *inode, u8 *csum)
+{
+	__u32 *digest = (__u32 *)csum;
+	__u8 data[SHA_MESSAGE_BYTES * 2];
+	loff_t end;
+	unsigned tail;
+	__be64 bits;
+
+	BUILD_BUG_ON(EXT4_DATA_CSUM_SIZE != SHA_DIGEST_WORDS * 4);
+
+	memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+
+	end = EXT4_I(inode)->i_data_csum_end;
+	if (end < 0)
+		return 0;
+
+	if (!inode->i_size)
+		return -ENODATA;
+
+	tail = inode->i_size - end;
+	if (tail >= SHA_MESSAGE_BYTES)
+		return -EIO;
+
+	if (tail) {
+		struct page *page;
+		u8 *kaddr;
+
+		page = read_cache_page_gfp(inode->i_mapping,
+					   end >> PAGE_CACHE_SHIFT,
+					   GFP_NOFS);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		kaddr = kmap_atomic(page);
+		memcpy(data, kaddr + (end & (PAGE_CACHE_SIZE-1)), tail);
+		kunmap_atomic(kaddr);
+		page_cache_release(page);
+	}
+
+	memset(data + tail, 0, sizeof(data) - tail);
+	data[tail] = 0x80;
+
+	bits = cpu_to_be64((end + tail) << 3);
+	if (tail >= SHA_MESSAGE_BYTES - sizeof(bits)) {
+		memcpy(data + SHA_MESSAGE_BYTES * 2 - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 2);
+	} else {
+		memcpy(data + SHA_MESSAGE_BYTES - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 1);
+	}
+
+	for (tail = 0; tail < SHA_DIGEST_WORDS; tail++)
+		digest[tail] = cpu_to_be32(digest[tail]);
+
+	return 0;
+}
+
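+/*
+ * Finalize the running checksum and persist it, or drop it if it cannot
+ * be completed.
+ */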
+void ext4_commit_data_csum(struct inode *inode)
+{
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+
+	if (!S_ISREG(inode->i_mode) || EXT4_I(inode)->i_data_csum_end < 0)
+		return;
+
+	mutex_lock(&inode->i_mutex);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    !ext4_finish_data_csum(inode, csum))
+		ext4_save_data_csum(inode, csum);
+	else
+		ext4_truncate_data_csum(inode, 0);
+	mutex_unlock(&inode->i_mutex);
+}
+
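+/*
+ * Handlers for the trusted checksum xattr: get returns the complete data
+ * checksum (binary or hex, depending on the buffer size), set installs one.
+ */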
+static int ext4_xattr_trusted_csum_get(struct dentry *dentry, const char *name,
+				       void *buffer, size_t size, int handler_flags)
+{
+	struct inode *inode = dentry->d_inode;
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, PFCACHE_CSUM))
+		return -EOPNOTSUPP;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode))
+		return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, buffer, size);
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (!buffer)
+		return EXT4_DATA_CSUM_SIZE * 2;
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0) {
+		memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+	} else {
+		spin_unlock(&inode->i_lock);
+		return -ENODATA;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(buffer, csum, EXT4_DATA_CSUM_SIZE);
+		return EXT4_DATA_CSUM_SIZE;
+	}
+
+	if (size >= EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++)
+			buffer = pack_hex_byte(buffer, csum[i]);
+		return EXT4_DATA_CSUM_SIZE * 2;
+	}
+
+	return -ERANGE;
+}
+
+static int ext4_xattr_trusted_csum_set(struct dentry *dentry, const char *name,
+				const void *value, size_t size, int flags, int handler_flags)
+{
+	struct inode *inode = dentry->d_inode;
+	const char *text = value;
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, PFCACHE_CSUM))
+		return -EOPNOTSUPP;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (!value)
+			ext4_clear_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+		else if (size == EXT4_DIR_CSUM_VALUE_LEN &&
+			 !strncmp(value, EXT4_DIR_CSUM_VALUE, size))
+			ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+		else
+			return -EINVAL;
+
+		return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, value, size, flags);
+	}
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		if (flags & XATTR_CREATE)
+			return -EEXIST;
+	} else {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+	}
+
+	if (!value) {
+		ext4_truncate_data_csum(inode, 1);
+		return 0;
+	}
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(csum, value, EXT4_DATA_CSUM_SIZE);
+	} else if (size == EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++) {
+			int hi = hex_to_bin(text[i*2]);
+			int lo = hex_to_bin(text[i*2+1]);
+			if ((hi < 0) || (lo < 0))
+				return -EINVAL;
+			csum[i] = (hi << 4) | lo;
+		}
+	} else
+		return -EINVAL;
+
+	if (mapping_writably_mapped(inode->i_mapping))
+		return -EBUSY;
+
+	return ext4_save_data_csum(inode, csum);
+}
+
+#define XATTR_TRUSTED_CSUM_PREFIX XATTR_TRUSTED_PREFIX EXT4_DATA_CSUM_NAME
+#define XATTR_TRUSTED_CSUM_PREFIX_LEN (sizeof (XATTR_TRUSTED_CSUM_PREFIX) - 1)
+
+static size_t
+ext4_xattr_trusted_csum_list(struct dentry *dentry, char *list, size_t list_size,
+			     const char *name, size_t name_len, int handler_flags)
+{
+	return 0;
+}
+
+struct xattr_handler ext4_xattr_trusted_csum_handler = {
+	.prefix = XATTR_TRUSTED_CSUM_PREFIX,
+	.list   = ext4_xattr_trusted_csum_list,
+	.get    = ext4_xattr_trusted_csum_get,
+	.set    = ext4_xattr_trusted_csum_set,
+};
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -314,6 +314,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
 
 		if (start_blk + itb > last_blk)
 			goto next_group;
+
 		group_data[it_index].inode_table = start_blk;
 		group = ext4_get_group_number(sb, start_blk);
 		next_group_start = ext4_group_first_block_no(sb, group + 1);
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,7 @@
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <linux/cleancache.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 
 #include <linux/kthread.h>
@@ -192,6 +193,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_block_bitmap);
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -200,6 +202,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_bitmap);
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 			      struct ext4_group_desc *bg)
@@ -208,6 +211,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_table);
 
 __u32 ext4_free_group_clusters(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -297,6 +301,117 @@ void ext4_itable_unused_set(struct super_block *sb,
 		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 }
 
+static int ext4_uuid_valid(const u8 *uuid)
+{
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		if (uuid[i])
+			return 1;
+	}
+	return 0;
+}
+
+struct ext4_uevent {
+	struct super_block *sb;
+	enum ext4_event_type action;
+	struct work_struct work;
+};
+
+/**
+ * ext4_send_uevent_work - prepare and send uevent
+ *
+ * @w:		work item embedded in struct ext4_uevent
+ *
+ */
+static void ext4_send_uevent_work(struct work_struct *w)
+{
+	struct ext4_uevent *e = container_of(w, struct ext4_uevent, work);
+	struct super_block *sb = e->sb;
+	struct kobj_uevent_env *env;
+	const u8 *uuid = sb->s_uuid;
+	enum kobject_action kaction = KOBJ_CHANGE;
+	int ret;
+
+	env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+	if (!env) {
+		kfree(e);
+		return;
+	}
+	ret = add_uevent_var(env, "FS_TYPE=%s", sb->s_type->name);
+	if (ret)
+		goto out;
+	ret = add_uevent_var(env, "FS_NAME=%s", sb->s_id);
+	if (ret)
+		goto out;
+
+	if (ext4_uuid_valid(uuid)) {
+		ret = add_uevent_var(env, "UUID=%pUB", uuid);
+		if (ret)
+			goto out;
+	}
+
+	switch (e->action) {
+	case EXT4_UA_MOUNT:
+		kaction = KOBJ_ONLINE;
+		ret = add_uevent_var(env, "FS_ACTION=%s", "MOUNT");
+		break;
+	case EXT4_UA_UMOUNT:
+		kaction = KOBJ_OFFLINE;
+		ret = add_uevent_var(env, "FS_ACTION=%s", "UMOUNT");
+		break;
+	case EXT4_UA_REMOUNT:
+		ret = add_uevent_var(env, "FS_ACTION=%s", "REMOUNT");
+		break;
+	case EXT4_UA_ERROR:
+		ret = add_uevent_var(env, "FS_ACTION=%s", "ERROR");
+		break;
+	case EXT4_UA_ABORT:
+		ret = add_uevent_var(env, "FS_ACTION=%s", "ABORT");
+		break;
+	case EXT4_UA_FREEZE:
+		ret = add_uevent_var(env, "FS_ACTION=%s", "FREEZE");
+		break;
+	case EXT4_UA_UNFREEZE:
+		ret = add_uevent_var(env, "FS_ACTION=%s", "UNFREEZE");
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	if (ret)
+		goto out;
+	ret = kobject_uevent_env(&(EXT4_SB(sb)->s_kobj), kaction, env->envp);
+out:
+	kfree(env);
+	kfree(e);
+}
+
+/**
+ * ext4_send_uevent - prepare and schedule event submission
+ *
+ * @sb:		super_block
+ * @action:		action type
+ *
+ */
+int ext4_send_uevent(struct super_block *sb, enum ext4_event_type action)
+{
+	struct ext4_uevent *e;
+
+	smp_rmb();
+	if (!EXT4_SB(sb)->rsv_conversion_wq)
+		return -EPROTO;
+
+	e = kzalloc(sizeof(*e), GFP_NOIO);
+	if (!e)
+		return -ENOMEM;
+
+	e->sb = sb;
+	e->action = action;
+	INIT_WORK(&e->work, ext4_send_uevent_work);
+	queue_work(EXT4_SB(sb)->rsv_conversion_wq, &e->work);
+	return 0;
+}
 
 static void __save_error_info(struct super_block *sb, const char *func,
 			    unsigned int line)
@@ -385,12 +500,18 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 
 static void ext4_handle_error(struct super_block *sb)
 {
+	if (!xchg(&EXT4_SB(sb)->s_err_event_sent, 1))
+		ext4_send_uevent(sb, EXT4_UA_ERROR);
+
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
 	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
+		if (!xchg(&EXT4_SB(sb)->s_abrt_event_sent, 1))
+			ext4_send_uevent(sb, EXT4_UA_ABORT);
+
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
@@ -588,6 +709,10 @@ void __ext4_abort(struct super_block *sb, const char *function,
 
 	if ((sb->s_flags & MS_RDONLY) == 0) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+
+		if (!xchg(&EXT4_SB(sb)->s_abrt_event_sent, 1))
+			ext4_send_uevent(sb, EXT4_UA_ABORT);
+
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		/*
 		 * Make sure updated value of ->s_mount_flags will be visible
@@ -783,13 +908,17 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int aborted = 0;
+	struct workqueue_struct *rsv_conversion_wq = sbi->rsv_conversion_wq;
 	int i, err;
 
+	ext4_send_uevent(sb, EXT4_UA_UMOUNT);
 	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	flush_workqueue(sbi->rsv_conversion_wq);
-	destroy_workqueue(sbi->rsv_conversion_wq);
+	sbi->rsv_conversion_wq = NULL;
+	smp_wmb();
+	flush_workqueue(rsv_conversion_wq);
+	destroy_workqueue(rsv_conversion_wq);
 
 	if (sbi->s_journal) {
 		aborted = is_journal_aborted(sbi->s_journal);
@@ -827,7 +956,9 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -888,9 +1019,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	spin_lock_init(&ei->i_prealloc_lock);
 	ext4_es_init_tree(&ei->i_es_tree);
 	rwlock_init(&ei->i_es_lock);
-	INIT_LIST_HEAD(&ei->i_es_lru);
-	ei->i_es_lru_nr = 0;
-	ei->i_touch_when = 0;
+	INIT_LIST_HEAD(&ei->i_es_list);
+	ei->i_es_all_nr = 0;
+	ei->i_es_shk_nr = 0;
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
@@ -956,7 +1087,7 @@ static int __init init_inodecache(void)
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
@@ -980,13 +1111,17 @@ void ext4_clear_inode(struct inode *inode)
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
-	ext4_es_lru_del(inode);
+	ext4_es_list_del(inode);
 	if (EXT4_I(inode)->jinode) {
 		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
 					       EXT4_I(inode)->jinode);
 		jbd2_free_inode(EXT4_I(inode)->jinode);
 		EXT4_I(inode)->jinode = NULL;
 	}
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		ext4_close_pfcache(inode);
+		ext4_clear_data_csum(inode);
+	}
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1169,11 +1304,14 @@ enum {
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+	Opt_lazytime, Opt_nolazytime,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
-	Opt_max_dir_size_kb, Opt_nojournal_checksum,
+	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_balloon_ino,
+	Opt_pfcache_csum, Opt_nopfcache_csum,
+	Opt_pfcache, Opt_nopfcache,
 };
 
 static const match_table_t tokens = {
@@ -1233,6 +1371,8 @@ static const match_table_t tokens = {
 	{Opt_dax, "dax"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
+	{Opt_lazytime, "lazytime"},
+	{Opt_nolazytime, "nolazytime"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_removed, "mblk_io_submit"},
 	{Opt_removed, "nomblk_io_submit"},
@@ -1251,6 +1391,11 @@ static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+	{Opt_balloon_ino, "balloon_ino=%u"},
+	{Opt_pfcache_csum, "pfcache_csum"},
+	{Opt_nopfcache_csum, "nopfcache_csum"},
+	{Opt_pfcache, "pfcache=%s"},
+	{Opt_nopfcache, "nopfcache"},
 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
@@ -1368,6 +1513,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_NO_EXT3	0x0200
 #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 #define MOPT_STRING	0x0400
+#define MOPT_WANT_SYS_ADMIN	0x0800
 
 static const struct mount_opts {
 	int	token;
@@ -1398,7 +1544,7 @@ static const struct mount_opts {
 				    EXT4_MOUNT_JOURNAL_CHECKSUM),
 	 MOPT_EXT4_ONLY | MOPT_SET},
 	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
-	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC,
+	 MOPT_SET | MOPT_CLEAR_ERR | MOPT_WANT_SYS_ADMIN},
 	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
@@ -1452,12 +1598,16 @@ static const struct mount_opts {
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_balloon_ino, 0, 0},
+	{Opt_pfcache_csum, 0, 0},
+	{Opt_nopfcache_csum, 0, 0},
 	{Opt_err, 0, 0}
 };
 
 static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			    substring_t *args, unsigned long *journal_devnum,
-			    unsigned int *journal_ioprio, int is_remount)
+			    unsigned int *journal_ioprio,
+			    unsigned long *balloon_ino, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	const struct mount_opts *m;
@@ -1491,6 +1641,27 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	case Opt_i_version:
 		sb->s_flags |= MS_I_VERSION;
 		return 1;
+	case Opt_pfcache:
+		if (capable(CAP_SYS_ADMIN)) {
+			char *path;
+			int err;
+
+			path = match_strdup(&args[0]);
+			err = ext4_relink_pfcache(sb, path, !is_remount);
+			kfree(path);
+			return err ? -1 : 1;
+		}
+		return 1;
+	case Opt_nopfcache:
+		if (capable(CAP_SYS_ADMIN))
+			ext4_relink_pfcache(sb, NULL, !is_remount);
+		return 1;
+	case Opt_lazytime:
+		sb->s_flags |= MS_LAZYTIME;
+		return 1;
+	case Opt_nolazytime:
+		sb->s_flags &= ~MS_LAZYTIME;
+		return 1;
 	}
 
 	for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1522,6 +1693,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		set_opt2(sb, EXPLICIT_DELALLOC);
 	if (m->flags & MOPT_CLEAR_ERR)
 		clear_opt(sb, ERRORS_MASK);
+	if (m->flags & MOPT_WANT_SYS_ADMIN && !capable(CAP_SYS_ADMIN))
+		return 1;
+
 	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
 		ext4_msg(sb, KERN_ERR, "Cannot change quota "
 			 "options when quota turned on");
@@ -1622,6 +1796,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		}
 		*journal_ioprio =
 			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+	} else if (token == Opt_balloon_ino) {
+		*balloon_ino = arg;
+	} else if (token == Opt_pfcache_csum) {
+		if (capable(CAP_SYS_ADMIN))
+			set_opt2(sb, PFCACHE_CSUM);
+	} else if (token == Opt_nopfcache_csum) {
+		if (capable(CAP_SYS_ADMIN))
+			clear_opt2(sb, PFCACHE_CSUM);
 	} else if (m->flags & MOPT_DATAJ) {
 		if (is_remount) {
 			if (!sbi->s_journal)
@@ -1687,6 +1869,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
 			 unsigned int *journal_ioprio,
+			 unsigned long *balloon_ino,
 			 int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1707,7 +1890,8 @@ static int parse_options(char *options, struct super_block *sb,
 		args[0].to = args[0].from = NULL;
 		token = match_token(p, tokens, args);
 		if (handle_mount_opt(sb, p, token, args, journal_devnum,
-				     journal_ioprio, is_remount) < 0)
+				     journal_ioprio, balloon_ino,
+				     is_remount) < 0)
 			return 0;
 	}
 #ifdef CONFIG_QUOTA
@@ -1877,6 +2061,24 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	if (test_opt(sb, DATA_ERR_ABORT))
 		SEQ_OPTS_PUTS("data_err=abort");
 
+	if (sbi->s_balloon_ino)
+		SEQ_OPTS_PRINT("balloon_ino=%ld", sbi->s_balloon_ino->i_ino);
+
+	if (ve_is_super(get_exec_env())) {
+		if (test_opt2(sb, PFCACHE_CSUM))
+			SEQ_OPTS_PUTS("pfcache_csum");
+		else if (nodefs)
+			SEQ_OPTS_PUTS("nopfcache_csum");
+		if (sbi->s_pfcache_root.mnt) {
+			spin_lock(&sbi->s_pfcache_lock);
+			if (sbi->s_pfcache_root.mnt) {
+				SEQ_OPTS_PUTS("pfcache=");
+				seq_path(seq, &sbi->s_pfcache_root, "\\ \t\n");
+			}
+			spin_unlock(&sbi->s_pfcache_lock);
+		}
+	}
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -2655,6 +2857,30 @@ static ssize_t sbi_deprecated_show(struct ext4_attr *a,
 	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
 }
 
+static ssize_t csum_partial_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64) percpu_counter_sum(&sbi->s_csum_partial));
+}
+
+static ssize_t csum_complete_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64) percpu_counter_sum(&sbi->s_csum_complete));
+}
+
+static ssize_t pfcache_peers_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64) percpu_counter_sum(&sbi->s_pfcache_peers));
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
@@ -2701,6 +2927,9 @@ EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_RW_ATTR(reserved_clusters);
+EXT4_RO_ATTR(csum_partial);
+EXT4_RO_ATTR(csum_complete);
+EXT4_RO_ATTR(pfcache_peers);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2722,6 +2951,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+EXT4_RW_ATTR_SBI_UI(bd_full_ratelimit, s_bd_full_ratelimit);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2748,6 +2978,10 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(bd_full_ratelimit),
+	ATTR_LIST(csum_partial),
+	ATTR_LIST(csum_complete),
+	ATTR_LIST(pfcache_peers),
 	NULL,
 };
 
@@ -2802,6 +3036,54 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+static void ext4_load_balloon(struct super_block *sb, unsigned long ino)
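+/*
+ * Attach the balloon inode: a regular, extent-based file whose blocks are
+ * hidden from the total block count reported by statfs().
+ */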
+{
+	struct inode *inode;
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+
+	if (!ino) {
+		/* FIXME locking */
+		if (sbi->s_balloon_ino) {
+			iput(sbi->s_balloon_ino);
+			sbi->s_balloon_ino = NULL;
+		}
+
+		return;
+	}
+
+	if (ino < EXT4_FIRST_INO(sb)) {
+		ext4_msg(sb, KERN_WARNING, "bad balloon inode specified");
+		return;
+	}
+
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode)) {
+		ext4_msg(sb, KERN_WARNING, "can't load balloon inode (%ld)", PTR_ERR(inode));
+		return;
+	}
+
+	if (!S_ISREG(inode->i_mode)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon should be regular");
+		return;
+	}
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon should support extents");
+		return;
+	}
+
+	/* FIXME - locking */
+	if (sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+	sbi->s_balloon_ino = inode;
+	ext4_msg(sb, KERN_INFO, "loaded balloon from %lu (%llu blocks)",
+			inode->i_ino, (unsigned long long)inode->i_blocks);
+}
+
 static void ext4_feat_release(struct kobject *kobj)
 {
 	complete(&ext4_feat->f_kobj_unregister);
@@ -2949,7 +3231,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 	sb = elr->lr_super;
 	ngroups = EXT4_SB(sb)->s_groups_count;
 
-	sb_start_write(sb);
 	for (group = elr->lr_next_group; group < ngroups; group++) {
 		gdp = ext4_get_group_desc(sb, group, NULL);
 		if (!gdp) {
@@ -2976,8 +3257,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 		elr->lr_next_sched = jiffies + elr->lr_timeout;
 		elr->lr_next_group = group + 1;
 	}
-	sb_end_write(sb);
-
 	return ret;
 }
 
@@ -3027,9 +3306,9 @@ static struct task_struct *ext4_lazyinit_task;
 static int ext4_lazyinit_thread(void *arg)
 {
 	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
-	struct list_head *pos, *n;
 	struct ext4_li_request *elr;
 	unsigned long next_wakeup, cur;
+	LIST_HEAD(request_list);
 
 	BUG_ON(NULL == eli);
 
@@ -3042,21 +3321,43 @@ static int ext4_lazyinit_thread(void *arg)
 			mutex_unlock(&eli->li_list_mtx);
 			goto exit_thread;
 		}
-
-		list_for_each_safe(pos, n, &eli->li_request_list) {
-			elr = list_entry(pos, struct ext4_li_request,
-					 lr_request);
-
-			if (time_after_eq(jiffies, elr->lr_next_sched)) {
-				if (ext4_run_li_request(elr) != 0) {
-					/* error, remove the lazy_init job */
-					ext4_remove_li_request(elr);
-					continue;
+		list_splice_init(&eli->li_request_list, &request_list);
+		while (!list_empty(&request_list)) {
+			int err = 0;
+			int progress = 0;
+
+			elr = list_entry(request_list.next,
+					 struct ext4_li_request, lr_request);
+			list_move(request_list.next, &eli->li_request_list);
+			if (time_before(jiffies, elr->lr_next_sched)) {
+				if (time_before(elr->lr_next_sched, next_wakeup))
+					next_wakeup = elr->lr_next_sched;
+				continue;
+			}
+			if (down_read_trylock(&elr->lr_super->s_umount)) {
+				if (sb_start_write_trylock(elr->lr_super)) {
+					progress = 1;
+					/* We hold sb->s_umount, so sb cannot
+					 * be removed from the list; it is
+					 * now safe to drop li_list_mtx.
+					 */
+					mutex_unlock(&eli->li_list_mtx);
+					err = ext4_run_li_request(elr);
+					sb_end_write(elr->lr_super);
+					mutex_lock(&eli->li_list_mtx);
 				}
+				up_read(&elr->lr_super->s_umount);
+			}
+			/* error, remove the lazy_init job */
+			if (err) {
+				ext4_remove_li_request(elr);
+				continue;
+			}
+			if (!progress) {
+				elr->lr_next_sched = jiffies +
+					(prandom_u32()
+					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
 			}
-
-			if (time_before(elr->lr_next_sched, next_wakeup))
-				next_wakeup = elr->lr_next_sched;
 		}
 		mutex_unlock(&eli->li_list_mtx);
 
@@ -3498,6 +3799,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__u64 blocks_count;
 	int err = 0;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	unsigned long balloon_ino = 0;
 	ext4_group_t first_not_zeroed;
 
 	if ((data && !orig_data) || !sbi)
@@ -3619,8 +3921,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
 		set_opt(sb, WRITEBACK_DATA);
 
-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-		set_opt(sb, ERRORS_PANIC);
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) {
+		if (capable(CAP_SYS_ADMIN))
+			set_opt(sb, ERRORS_PANIC);
+		else
+			set_opt(sb, ERRORS_RO);
+	}
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
 		set_opt(sb, ERRORS_CONT);
 	else
@@ -3660,7 +3966,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if (!s_mount_opts)
 			goto failed_mount;
 		if (!parse_options(s_mount_opts, sb, &journal_devnum,
-				   &journal_ioprio, 0)) {
+				   &journal_ioprio, &balloon_ino, 0)) {
 			ext4_msg(sb, KERN_WARNING,
 				 "failed to parse options in superblock: %s",
 				 s_mount_opts);
@@ -3669,7 +3975,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sbi->s_def_mount_opt = sbi->s_mount_opt;
 	if (!parse_options((char *) data, sb, &journal_devnum,
-			   &journal_ioprio, 0))
+			   &journal_ioprio, &balloon_ino, 0))
 		goto failed_mount;
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -4042,22 +4348,29 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+	spin_lock_init(&sbi->s_pfcache_lock);
 
 	init_timer(&sbi->s_err_report);
 	sbi->s_err_report.function = print_daily_error_info;
 	sbi->s_err_report.data = (unsigned long) sb;
 
 	/* Register extent status tree shrinker */
-	ext4_es_register_shrinker(sbi);
-
-	err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
-	if (err) {
+	err = percpu_counter_init(&sbi->s_csum_partial, 0, GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_csum_complete, 0, GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_pfcache_peers, 0, GFP_KERNEL);
+	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount3;
+	}
+
+	if (ext4_es_register_shrinker(sbi))
 		goto failed_mount3;
-	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_extent_max_zeroout_kb = 32;
+	sbi->s_bd_full_ratelimit = 1024;
 
 	/*
 	 * set up enough so that it can read an inode
@@ -4331,6 +4644,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "the device does not support discard");
 	}
 
+	ext4_load_balloon(sb, balloon_ino);
+
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
 		 "Opts: %.*s%s%s", descr,
 		 (int) sizeof(sbi->s_es->s_mount_opts),
@@ -4345,6 +4660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
 	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
 
+	ext4_send_uevent(sb, EXT4_UA_MOUNT);
 	kfree(orig_data);
 	return 0;
 
@@ -4382,10 +4698,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
-failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
+failed_mount3:
 	del_timer_sync(&sbi->s_err_report);
-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -4399,6 +4717,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		remove_proc_entry("options", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
+	if (sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, true);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -4813,8 +5133,12 @@ int ext4_force_commit(struct super_block *sb)
 {
 	journal_t *journal;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
 		return 0;
+	}
 
 	journal = EXT4_SB(sb)->s_journal;
 	return ext4_journal_force_commit(journal);
@@ -4884,8 +5208,10 @@ static int ext4_freeze(struct super_block *sb)
 	int error = 0;
 	journal_t *journal;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		ext4_send_uevent(sb, EXT4_UA_FREEZE);
 		return 0;
+	}
 
 	journal = EXT4_SB(sb)->s_journal;
 
@@ -4906,6 +5232,9 @@ static int ext4_freeze(struct super_block *sb)
 out:
 	/* we rely on upper layer to stop further updates */
 	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	if (!error)
+		ext4_send_uevent(sb, EXT4_UA_FREEZE);
+
 	return error;
 }
 
@@ -4915,6 +5244,8 @@ static int ext4_freeze(struct super_block *sb)
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
+	ext4_send_uevent(sb, EXT4_UA_UNFREEZE);
+
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
@@ -4954,6 +5285,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i, j;
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
+	unsigned long balloon_ino = -1;
 
 	/* Store the original options */
 	old_sb_flags = sb->s_flags;
@@ -4982,7 +5314,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
-	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
+	if (!parse_options(data, sb, NULL, &journal_ioprio, &balloon_ino, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -5138,6 +5470,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
 		ext4_commit_super(sb, 1);
 
+	if (balloon_ino != -1)
+		ext4_load_balloon(sb, balloon_ino);
+
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -5154,7 +5489,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 #endif
 
+	*flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
 	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	ext4_send_uevent(sb, EXT4_UA_REMOUNT);
 	kfree(orig_data);
 	return 0;
 
@@ -5210,6 +5547,20 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 
+	if (sbi->s_balloon_ino) {
+		struct ext4_inode_info *ei;
+		blkcnt_t balloon_blocks;
+
+		balloon_blocks = sbi->s_balloon_ino->i_blocks;
+		ei = EXT4_I(sbi->s_balloon_ino);
+		spin_lock(&ei->i_block_reservation_lock);
+		balloon_blocks += ei->i_reserved_data_blocks;
+		spin_unlock(&ei->i_block_reservation_lock);
+
+		BUG_ON(sbi->s_balloon_ino->i_blkbits < 9);
+		buf->f_blocks -= balloon_blocks >> (sbi->s_balloon_ino->i_blkbits - 9);
+	}
+
 	return 0;
 }
 
@@ -5596,6 +5947,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data)
 {
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
 	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 
@@ -5661,13 +6014,29 @@ static inline void unregister_as_ext3(void) { }
 static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
+static void ext4_kill_sb(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	if (sbi && sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+
+	if (sbi && sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, false);
+
+	kill_block_super(sb);
+}
+
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
 	.mount		= ext4_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= ext4_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_INVALIDATE_RANGE |
-			  FS_HAS_DIO_IODONE2,
+			  FS_HAS_DIO_IODONE2 | FS_VIRTUALIZED |
+			  FS_HAS_MMAP_PREP | FS_USERNS_MOUNT |
+			  FS_USERNS_DEV_MOUNT,
 };
 MODULE_ALIAS_FS("ext4");
 
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -12,6 +12,8 @@ static inline void ext4_truncate_failed_write(struct inode *inode)
 {
 	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	ext4_truncate(inode);
 	up_write(&EXT4_I(inode)->i_mmap_sem);
 }
@@ -41,5 +43,6 @@ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
 		needed = EXT4_MAX_TRANS_DATA;
 
 	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+
 }
 
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -106,6 +106,7 @@ static const struct xattr_handler *ext4_xattr_handler_map[] = {
 
 const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
+	&ext4_xattr_trusted_csum_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
@@ -871,6 +872,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						EXT4_C2B(EXT4_SB(sb), 1));
 				if (error)
 					goto cleanup;
+				if (check_bd_full(inode, 1)) {
+					error = -ENOSPC;
+					goto cleanup_dquot;
+				}
 				BUFFER_TRACE(new_bh, "get_write_access");
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -19,7 +19,12 @@ ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!strcmp(name, EXT4_DATA_CSUM_NAME) &&
+	    (!capable(CAP_SYS_ADMIN) ||
+	     !test_opt2(dentry->d_inode->i_sb, PFCACHE_CSUM)))
+		return 0;
+
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return 0;
 
 	if (list && total_len <= list_size) {
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -223,12 +223,10 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			error = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (error < 0)
+			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (error)
 				return error;
 			set_acl_inode(fi, inode->i_mode);
-			if (error == 0)
-				acl = NULL;
 		}
 		break;
 
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -100,7 +100,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -732,8 +732,9 @@ MODULE_ALIAS_FS("f2fs");
 
 static int __init init_inodecache(void)
 {
-	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
-			sizeof(struct f2fs_inode_info), NULL);
+	f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
 	if (f2fs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -612,7 +612,7 @@ static int __init fat_init_inodecache(void)
 	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -22,6 +22,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/shmem_fs.h>
+#include <linux/ve.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -29,11 +30,50 @@
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
+void generic_set_file_flags_unlocked(struct file *filp, unsigned int arg)
+{
+	filp->f_flags = (arg & SETFL_MASK) |
+		(filp->f_flags & ~SETFL_MASK);
+}
+EXPORT_SYMBOL(generic_set_file_flags_unlocked);
+
+int generic_set_file_flags(struct file *filp, unsigned int arg)
+{
+	spin_lock(&filp->f_lock);
+	generic_set_file_flags_unlocked(filp, arg);
+	spin_unlock(&filp->f_lock);
+	return 0;
+}
+EXPORT_SYMBOL(generic_set_file_flags);
+
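+/*
+ * O_DIRECT is always allowed on the host; inside a container it requires
+ * CAP_SYS_RAWIO or the per-VE odirect_enable knob (2 means "follow VE0").
+ */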
+int may_use_odirect(void)
+{
+	int may;
+
+	if (ve_is_super(get_exec_env()))
+		return 1;
+
+	may = capable(CAP_SYS_RAWIO);
+	if (!may) {
+		may = get_exec_env()->odirect_enable;
+		if (may == 2)
+			may = get_ve0()->odirect_enable;
+	}
+
+	return may;
+}
+
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = file_inode(filp);
 	int error = 0;
 
+	if (!may_use_odirect())
+		arg &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
@@ -57,10 +97,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 				return -EINVAL;
 	}
 
-	if (filp->f_op && filp->f_op->check_flags)
-		error = filp->f_op->check_flags(arg);
-	if (error)
-		return error;
 
 	/*
 	 * ->fasync() is responsible for setting the FASYNC bit.
@@ -73,10 +109,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		if (error > 0)
 			error = 0;
 	}
-	spin_lock(&filp->f_lock);
-	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
-	spin_unlock(&filp->f_lock);
 
+	if (filp->f_op && filp->f_op->set_flags)
+		error = filp->f_op->set_flags(filp, arg);
+	else
+		error = generic_set_file_flags(filp, arg);
  out:
 	return error;
 }
@@ -746,7 +783,7 @@ static int __init fcntl_init(void)
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
-		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+		sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	return 0;
 }
 
--- a/fs/file.c
+++ b/fs/file.c
@@ -37,11 +37,11 @@ static void *alloc_fdmem(size_t size)
 	 * vmalloc() if the allocation size will be considered "large" by the VM.
 	 */
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+		void *data = kmalloc(size, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
 		if (data != NULL)
 			return data;
 	}
-	return vmalloc(size);
+	return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 
 static void free_fdmem(void *ptr)
@@ -110,7 +110,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -27,9 +27,13 @@
 #include <linux/task_work.h>
 #include <linux/ima.h>
 #include <linux/swap.h>
+#include <linux/ve.h>
 
 #include <linux/atomic.h>
 
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
 #include "internal.h"
 
 /* sysctl tunables... */
@@ -52,8 +56,10 @@ static void file_free_rcu(struct rcu_head *head)
 
 static inline void file_free(struct file *f)
 {
-	percpu_counter_dec(&nr_files);
 	file_check_state(f);
+	if (f->f_ub == get_ub0())
+		percpu_counter_dec(&nr_files);
+	ub_file_uncharge(f);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -108,11 +114,14 @@ struct file *get_empty_filp(void)
 	static long old_max;
 	struct file *f;
 	int error;
+	int acct;
 
+	acct = (get_exec_ub() == get_ub0());
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+	if (acct && get_nr_files() >= files_stat.max_files &&
+			!capable(CAP_SYS_ADMIN)) {
 		/*
 		 * percpu_counters are inaccurate.  Do an expensive check before
 		 * we go and fail.
@@ -125,7 +134,13 @@ struct file *get_empty_filp(void)
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
-	percpu_counter_inc(&nr_files);
+	if (ub_file_charge(f)) {
+		kmem_cache_free(filp_cachep, f);
+		return ERR_PTR(-ENOMEM);
+	}
+	if (acct)
+		percpu_counter_inc(&nr_files);
+
 	f->f_cred = get_cred(cred);
 	error = security_file_alloc(f);
 	if (unlikely(error)) {
@@ -227,6 +242,8 @@ static void __fput(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = file->f_inode;
+	struct dentry *original_dentry = file->f_original_path.dentry;
+	struct vfsmount *original_mnt = file->f_original_path.mnt;
 
 	might_sleep();
 
@@ -258,10 +275,14 @@ static void __fput(struct file *file)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
 	file->f_path.mnt = NULL;
+	file->f_original_path.dentry = NULL;
+	file->f_original_path.mnt = NULL;
 	file->f_inode = NULL;
 	file_free(file);
 	dput(dentry);
 	mntput(mnt);
+	dput(original_dentry);
+	mntput(original_mnt);
 }
 
 static DEFINE_SPINLOCK(delayed_fput_lock);
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -217,6 +217,11 @@ int __init get_filesystem_list(char *buf)
 	return len;
 }
 
+static inline bool filesystem_permitted(const struct file_system_type *fs)
+{
+	return ve_is_super(get_exec_env()) || (fs->fs_flags & FS_VIRTUALIZED);
+}
+
 #ifdef CONFIG_PROC_FS
 static int filesystems_proc_show(struct seq_file *m, void *v)
 {
@@ -225,9 +230,11 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
 	read_lock(&file_systems_lock);
 	tmp = file_systems;
 	while (tmp) {
-		seq_printf(m, "%s\t%s\n",
-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
-			tmp->name);
+		if (filesystem_permitted(tmp)) {
+			seq_printf(m, "%s\t%s\n",
+				(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+				tmp->name);
+		}
 		tmp = tmp->next;
 	}
 	read_unlock(&file_systems_lock);
@@ -236,7 +243,7 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
 
 static int filesystems_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, filesystems_proc_show, NULL);
+	return single_open(file, filesystems_proc_show, inode->i_sb);
 }
 
 static const struct file_operations filesystems_proc_fops = {
@@ -248,7 +255,7 @@ static const struct file_operations filesystems_proc_fops = {
 
 static int __init proc_filesystems_init(void)
 {
-	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	proc_create("filesystems", S_ISVTX, NULL, &filesystems_proc_fops);
 	return 0;
 }
 module_init(proc_filesystems_init);
@@ -276,7 +283,8 @@ struct file_system_type *get_fs_type(const char *name)
 	if (!fs && (request_module("fs-%.*s", len, name) == 0))
 		fs = __get_fs_type(name, len);
 
-	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
+	if (fs && (!filesystem_permitted(fs) ||
+		   (dot && !(fs->fs_flags & FS_HAS_SUBTYPE)))) {
 		put_filesystem(fs);
 		fs = NULL;
 	}
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -28,6 +28,7 @@
 #include <linux/tracepoint.h>
 #include <linux/device.h>
 #include "internal.h"
+#include <bc/io_acct.h>
 
 /*
  * 4MB minimal write chunk size
@@ -42,6 +43,7 @@ struct wb_writeback_work {
 	struct super_block *sb;
 	unsigned long *older_than_this;
 	enum writeback_sync_modes sync_mode;
+	unsigned int filter_ub:1;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
@@ -51,8 +53,21 @@ struct wb_writeback_work {
 
 	struct list_head list;		/* pending work list */
 	struct completion *done;	/* set if the caller waits */
+	struct user_beancounter *ub;
 };
 
+/*
+ * If an inode is constantly having its pages dirtied, but then the
+ * updates stop dirtytime_expire_interval seconds in the past, it's
+ * possible for the worst case time between when an inode has its
+ * timestamps updated and when they finally get written out to be two
+ * dirtytime_expire_intervals.  We set the default to 12 hours (in
+ * seconds), which means most of the time inodes will have their
+ * timestamps written to disk after 12 hours, but in the worst case a
+ * few inodes might not have their timestamps updated for 24 hours.
+ */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -108,7 +123,8 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+			struct user_beancounter *ub, bool range_cyclic,
+			enum wb_reason reason)
 {
 	struct wb_writeback_work *work;
 
@@ -127,6 +143,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
+	work->ub	= ub;
 
 	bdi_queue_work(bdi, work);
 }
@@ -146,7 +163,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	__bdi_start_writeback(bdi, nr_pages, NULL, true, reason);
 }
 
 /**
@@ -237,14 +254,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	return ret;
 }
 
+#define EXPIRE_DIRTY_ATIME 0x0001
+
 /*
  * Move expired (dirtied before work->older_than_this) dirty inodes from
  * @delaying_queue to @dispatch_queue.
  */
 static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
+			       int flags,
 			       struct wb_writeback_work *work)
 {
+	unsigned long *older_than_this = NULL;
+	unsigned long expire_time;
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
 	struct super_block *sb = NULL;
@@ -252,13 +274,24 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+		older_than_this = work->older_than_this;
+	else if (!work->for_sync) {
+		expire_time = jiffies - (dirtytime_expire_interval * HZ);
+		older_than_this = &expire_time;
+	}
+
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
 		if (work->older_than_this &&
 		    inode_dirtied_after(inode, *work->older_than_this))
 			break;
+
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
+		if (flags & EXPIRE_DIRTY_ATIME)
+			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+
 		if (sb_is_blkdev_sb(inode->i_sb))
 			continue;
 		if (sb && sb != inode->i_sb)
@@ -299,9 +332,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 {
 	int moved;
+
 	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
-	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+				     EXPIRE_DIRTY_ATIME, work);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -425,6 +461,9 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 * updates after data IO completion.
 		 */
 		redirty_tail(inode, wb);
+	} else if (inode->i_state & I_DIRTY_TIME) {
+		inode->dirtied_when = jiffies;
+		list_move(&inode->i_wb_list, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
 		list_del_init(&inode->i_wb_list);
@@ -437,7 +476,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
  * setting I_SYNC flag and calling inode_sync_complete() to clear it.
  */
 static int
-__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+__do_writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	long nr_to_write = wbc->nr_to_write;
@@ -472,11 +511,23 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		inode->i_state &= ~I_DIRTY_PAGES;
-	dirty = inode->i_state & I_DIRTY;
-	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+
+	dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
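+	/*
+	 * Also write the timestamps if the inode is dirty for other
+	 * reasons, if the lazytime expiry was forced (I_DIRTY_TIME_EXPIRED),
+	 * or if the timestamps have been dirty for longer than
+	 * dirtytime_expire_interval.
+	 */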
+	if (inode->i_state & I_DIRTY_TIME) {
+		if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
+		    unlikely(time_after(jiffies,
+					(inode->dirtied_time_when +
+					 dirtytime_expire_interval * HZ)))) {
+			dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+			trace_writeback_lazytime(inode);
+		}
+	} else
+		inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
+	inode->i_state &= ~dirty;
 	spin_unlock(&inode->i_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (dirty & ~I_DIRTY_PAGES) {
 		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
@@ -485,6 +536,25 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
+static int
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+{
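+	/*
+	 * Run the writeback with the exec beancounter switched to the UB
+	 * that dirtied this mapping (falling back to ub0), so the work is
+	 * charged to it.
+	 */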
+	struct user_beancounter *ub;
+	int ret;
+
+	rcu_read_lock();
+	ub = rcu_dereference(inode->i_mapping->dirtied_ub);
+	if (!ub || !get_beancounter_rcu(ub))
+		ub = get_beancounter(get_ub0());
+	rcu_read_unlock();
+
+	ub = set_exec_ub(ub);
+	ret = __do_writeback_single_inode(inode, wbc);
+	put_beancounter(set_exec_ub(ub));
+
+	return ret;
+}
+
 /*
  * Write out an inode's dirty pages. Either the caller has an active reference
  * on the inode or the inode has I_WILL_FREE set.
@@ -524,7 +594,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * make sure inode is on some writeback list and leave it there unless
 	 * we have completely cleaned the inode.
 	 */
-	if (!(inode->i_state & I_DIRTY) &&
+	if (!(inode->i_state & I_DIRTY_ALL) &&
 	    (wbc->sync_mode != WB_SYNC_ALL ||
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
@@ -539,7 +609,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
 	 * touch it. See comment above for explanation.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		list_del_init(&inode->i_wb_list);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
@@ -635,6 +705,14 @@ static long writeback_sb_inodes(struct super_block *sb,
 			redirty_tail(inode, wb);
 			continue;
 		}
+		if ((work->ub || work->filter_ub) &&
+		    ((inode->i_state & I_DIRTY) == I_DIRTY_PAGES) &&
+		     ub_should_skip_writeback(work->ub, inode)) {
+			spin_unlock(&inode->i_lock);
+			redirty_tail(inode, wb);
+			continue;
+		}
+
 		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
 			/*
 			 * If this inode is locked for writeback and we are not
@@ -650,6 +728,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			trace_writeback_sb_inodes_requeue(inode);
 			continue;
 		}
+
 		spin_unlock(&wb->list_lock);
 
 		/*
@@ -681,7 +760,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_DIRTY))
+		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
 		requeue_inode(inode, wb, &wbc);
 		inode_sync_complete(inode);
@@ -697,6 +776,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 			if (work->nr_pages <= 0)
 				break;
 		}
+
+		WARN_ON(wbc.pages_skipped > write_chunk - wbc.nr_to_write);
+		wrote -= wbc.pages_skipped;
 	}
 	return wrote;
 }
@@ -711,9 +793,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
-		if (!grab_super_passive(sb)) {
+		if (!trylock_super(sb)) {
 			/*
-			 * grab_super_passive() may fail consistently due to
+			 * trylock_super() may fail consistently due to
 			 * s_umount being grabbed by someone else. Don't use
 			 * requeue_io() to avoid busy retrying the inode/sb.
 			 */
@@ -721,7 +803,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 			continue;
 		}
 		wrote += writeback_sb_inodes(sb, wb, work);
-		drop_super(sb);
+		up_read(&sb->s_umount);
 
 		/* refer to the same tests at the end of writeback_sb_inodes */
 		if (wrote) {
@@ -735,14 +817,15 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 	return wrote;
 }
 
-static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
-				enum wb_reason reason)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+			enum wb_reason reason, struct user_beancounter *ub)
 {
 	struct wb_writeback_work work = {
 		.nr_pages	= nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.ub		= ub,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -757,6 +840,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 static bool over_bground_thresh(struct backing_dev_info *bdi)
 {
 	unsigned long background_thresh, dirty_thresh;
+	unsigned long bdi_thresh, bdi_bg_thresh;
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
@@ -764,8 +848,11 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
 		return true;
 
-	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
-				bdi_dirty_limit(bdi, background_thresh))
+	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+	bdi_bg_thresh = div_u64((u64)bdi_thresh * background_thresh,
+				dirty_thresh + 1);
+
+	if (bdi_stat(bdi, BDI_RECLAIMABLE) > bdi_bg_thresh)
 		return true;
 
 	return false;
@@ -830,8 +917,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb->bdi))
-			break;
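+		/*
+		 * If the global threshold is no longer exceeded but some
+		 * beancounter is still over its own background threshold,
+		 * keep writing with filter_ub set instead of stopping.
+		 */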
+		if (work->for_background) {
+			if (over_bground_thresh(wb->bdi))
+				work->filter_ub = 0;
+			else if (ub_over_bground_thresh())
+				work->filter_ub = 1;
+			else
+				break;
+		}
 
 		/*
 		 * Kupdate and background works are special and we want to
@@ -922,7 +1015,8 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb->bdi)) {
+	if (over_bground_thresh(wb->bdi) ||
+		ub_over_bground_thresh()) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
@@ -1041,7 +1135,7 @@ void bdi_writeback_workfn(struct work_struct *work)
 		 * enough for efficient IO.
 		 */
 		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
-						    WB_REASON_FORKER_THREAD);
+						WB_REASON_FORKER_THREAD, NULL);
 		trace_writeback_pages_written(pages_written);
 	}
 
@@ -1057,7 +1151,8 @@ void bdi_writeback_workfn(struct work_struct *work)
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
-void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
@@ -1068,9 +1163,64 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+		__bdi_start_writeback(bdi, nr_pages, ub, false, reason);
+	}
+	rcu_read_unlock();
+}
+
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+{
+	wakeup_flusher_threads_ub(nr_pages, NULL, reason);
+}
+
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes get
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirty_time list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.  (By default, only
+ * once every 12 hours.)
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (list_empty(&bdi->wb.b_dirty_time))
+			continue;
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 	}
 	rcu_read_unlock();
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+	return 0;
+}
+__initcall(start_dirtytime_writeback);
+
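+/*
+ * Handler for the dirtytime_expire_interval sysctl: when a new value is
+ * written, kick dirtytime_work immediately so the new interval takes
+ * effect right away.
+ */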
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
 }
 
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1119,16 +1269,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
+	int dirtytime;
+
+	trace_writeback_mark_inode_dirty(inode, flags);
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 	 * dirty the inode itself
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
 		trace_writeback_dirty_inode_start(inode, flags);
 
 		if (sb->s_op->dirty_inode)
@@ -1136,6 +1290,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 		trace_writeback_dirty_inode(inode, flags);
 	}
+	if (flags & I_DIRTY_INODE)
+		flags &= ~I_DIRTY_TIME;
+	dirtytime = flags & I_DIRTY_TIME;
 
 	/*
 	 * make sure that changes are seen by all cpus before we test i_state
@@ -1144,16 +1301,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	smp_mb();
 
 	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
+	if (((inode->i_state & flags) == flags) ||
+	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
 		return;
 
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode->i_lock);
+	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+		goto out_unlock_inode;
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		if (flags & I_DIRTY_INODE)
+			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
 
 		/*
@@ -1200,8 +1362,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
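+			/*
+			 * Inodes that are dirty only for lazytime timestamp
+			 * updates go on the b_dirty_time list; everything
+			 * else goes on b_dirty.
+			 */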
+			if (dirtytime)
+				inode->dirtied_time_when = jiffies;
+			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			else
+				list_move(&inode->i_wb_list,
+					  &bdi->wb.b_dirty_time);
 			spin_unlock(&bdi->wb.list_lock);
+			trace_writeback_dirty_inode_enqueue(inode);
 
 			if (wakeup_bdi)
 				bdi_wakeup_thread_delayed(bdi);
@@ -1214,7 +1383,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-static void wait_sb_inodes(struct super_block *sb)
+static void wait_sb_inodes(struct super_block *sb, struct user_beancounter *ub)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1242,6 +1411,12 @@ static void wait_sb_inodes(struct super_block *sb)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
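+		/*
+		 * When syncing on behalf of a beancounter, skip inodes that
+		 * only have dirty pages and were dirtied under a different
+		 * UB.
+		 */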
+		if (ub && (mapping->dirtied_ub != ub) &&
+		    ((inode->i_state & I_DIRTY) == I_DIRTY_PAGES)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_sb_list_lock);
@@ -1282,7 +1457,8 @@ static void wait_sb_inodes(struct super_block *sb)
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb_nr(struct super_block *sb,
+static void writeback_inodes_sb_ub_nr(struct super_block *sb,
+			    struct user_beancounter *ub,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
@@ -1294,6 +1470,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 		.done			= &done,
 		.nr_pages		= nr,
 		.reason			= reason,
+		.ub			= ub,
 	};
 
 	if (sb->s_bdi == &noop_backing_dev_info)
@@ -1302,8 +1479,22 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
+{
+	writeback_inodes_sb_ub_nr(sb, NULL, nr, reason);
+}
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
+void writeback_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub,
+			enum wb_reason reason)
+{
+	writeback_inodes_sb_ub_nr(sb, ub, get_nr_dirty_pages(), reason);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1365,7 +1556,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
@@ -1376,6 +1567,7 @@ void sync_inodes_sb(struct super_block *sb)
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
+		.ub		= ub,
 	};
 
 	/* Nothing to do? */
@@ -1386,7 +1578,12 @@ void sync_inodes_sb(struct super_block *sb)
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 
-	wait_sb_inodes(sb);
+	wait_sb_inodes(sb, ub);
+}
+
+void sync_inodes_sb(struct super_block *sb)
+{
+	sync_inodes_sb_ub(sb, NULL);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/seq_file.h>
 
 #define FUSE_CTL_SUPER_MAGIC 0x65735543
 
@@ -196,6 +197,262 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.llseek = no_llseek,
 };
 
+struct fuse_conn_priv {
+	struct fuse_conn *conn;
+	struct list_head *req_list;
+};
+
+enum {
+	FUSE_PENDING_REQ = 1,
+	FUSE_PROCESSING_REQ,
+	FUSE_IO_REQ,
+};
+
+static void *fuse_req_start(struct seq_file *m, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+
+	spin_lock(&fcp->conn->lock);
+	return seq_list_start(fcp->req_list, *p);
+}
+
+static void *fuse_req_next(struct seq_file *m, void *v, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	return seq_list_next(v, fcp->req_list, p);
+}
+
+static void fuse_req_stop(struct seq_file *m, void *v)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	spin_unlock(&fcp->conn->lock);
+}
+
+static int fuse_req_show(struct seq_file *f, void *v)
+{
+	struct fuse_req *req;
+
+	req = list_entry((struct list_head *)v, struct fuse_req, list);
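+	/*
+	 * Flags legend: r=ISREPLY f=FORCE a=ABORTED b=BACKGROUND
+	 * i=INTERRUPTED l=LOCKED w=WAITING p=PENDING s=SENT f=FINISHED
+	 */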
+	seq_printf(f, "flags: %c%c%c%c%c%c%c%c%c%c "
+			"in: op %-4d uniq 0x%016Lx node 0x%016Lx "
+			"out: err %-6d uniq 0x%016Lx\n",
+			test_bit(FR_ISREPLY, &req->flags) ? 'r' : '-',
+			test_bit(FR_FORCE, &req->flags) ? 'f' : '-',
+			test_bit(FR_ABORTED, &req->flags) ? 'a' : '-',
+			test_bit(FR_BACKGROUND, &req->flags) ? 'b' : '-',
+			test_bit(FR_INTERRUPTED, &req->flags) ? 'i' : '-',
+			test_bit(FR_LOCKED, &req->flags) ? 'l' : '-',
+			test_bit(FR_WAITING, &req->flags) ? 'w' : '-',
+			test_bit(FR_PENDING, &req->flags) ? 'p' : '-',
+			test_bit(FR_SENT, &req->flags) ? 's' : '-',
+			test_bit(FR_FINISHED, &req->flags) ? 'f' : '-',
+			req->in.h.opcode,
+			req->in.h.unique,
+			req->in.h.nodeid,
+			req->out.h.error,
+			req->out.h.unique);
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_req_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_req_show,
+};
+
+static int fuse_conn_seq_open(struct file *filp, int list_id)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_req_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	switch (list_id) {
+	case FUSE_PENDING_REQ:
+		fcp->req_list = &conn->main_iq.pending;
+		break;
+#if 0
+	case FUSE_PROCESSING_REQ:
+		fcp->req_list = &conn->pq.processing;
+		break;
+	case FUSE_IO_REQ:
+		fcp->req_list = &conn->pq.io;
+		break;
+#endif
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static int fuse_conn_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn_priv *fcp = ((struct seq_file *)filp->private_data)->private;
+
+	if (fcp)
+		fuse_conn_put(fcp->conn);
+
+	return seq_release_private(inode, filp);
+}
+
+static int fuse_conn_pending_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PENDING_REQ);
+}
+
+static const struct file_operations fuse_conn_pending_req = {
+	.open = fuse_conn_pending_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+#if 0
+static int fuse_conn_processing_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PROCESSING_REQ);
+}
+
+static const struct file_operations fuse_conn_processing_req = {
+	.open = fuse_conn_processing_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_io_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_IO_REQ);
+}
+
+static const struct file_operations fuse_conn_io_req = {
+	.open = fuse_conn_io_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+#endif
+
+static int fuse_files_show(struct seq_file *f, void *v)
+{
+	struct fuse_file *ff;
+
+	ff = list_entry(v, struct fuse_file, fl);
+	seq_printf(f, "kh 0x%016Lx fh 0x%016Lx node 0x%016Lx flags 0x%08x name ",
+			ff->kh, ff->fh, ff->nodeid, ff->open_flags);
+	if (ff->ff_dentry)
+		seq_dentry(f, ff->ff_dentry, "");
+	else
+		seq_putc(f, '-');
+	seq_putc(f, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_files_seq_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_files_show,
+};
+
+static int fuse_conn_files_open(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_files_seq_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	fcp->req_list = &conn->conn_files;
+	return 0;
+}
+
+static const struct file_operations fuse_conn_files_ops = {
+	.open = fuse_conn_files_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_show(struct seq_file *sf, void *v)
+{
+	struct fuse_conn *fc = sf->private;
+	struct fuse_dev *fud;
+	int n_total = 0;
+	int n_active = 0;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(fud, &fc->devices, entry) {
+		struct fuse_iqueue *fiq = fud->fiq;
+		if (waitqueue_active(&fiq->waitq))
+			n_active++;
+		n_total++;
+	}
+	spin_unlock(&fc->lock);
+
+	seq_printf(sf, "Connected: %d\n", fc->connected);
+	seq_printf(sf, "Initialized: %d\n", fc->initialized);
+	seq_printf(sf, "Blocked: %d\n", fc->blocked);
+	seq_printf(sf, "WQ active: %d of %d\n", n_active, n_total);
+	seq_printf(sf, "Blocked_wq active: %d\n", waitqueue_active(&fc->blocked_waitq));
+	seq_printf(sf, "num_background: %d\n", fc->num_background);
+	seq_printf(sf, "num_waiting: %d\n", atomic_read(&fc->num_waiting));
+	return 0;
+}
+
+static int fuse_conn_info_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct fuse_conn *conn;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	ret = single_open(filp, fuse_conn_show, conn);
+	if (ret)
+		fuse_conn_put(conn);
+
+	return ret;
+}
+
+static int fuse_conn_info_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn = ((struct seq_file *)filp->private_data)->private;
+	fuse_conn_put(conn);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations fuse_conn_info_ops = {
+	.open = fuse_conn_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_info_release,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name,
@@ -260,7 +517,25 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 1, NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, 1, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "pending_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_pending_req) ||
+#if 0
+	    !fuse_ctl_add_dentry(parent, fc, "processing_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_processing_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "io_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_io_req) ||
+#endif
+	    !fuse_ctl_add_dentry(parent, fc, "open_files",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_files_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "conn_info",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_info_ops)
+	    )
 		goto err;
 
 	return 0;
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
@@ -410,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 err_region:
 	unregister_chrdev_region(devt, 1);
 err:
-	fuse_conn_kill(fc);
+	fuse_abort_conn(fc);
 	goto out;
 }
 
@@ -493,6 +494,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
  */
 static int cuse_channel_open(struct inode *inode, struct file *file)
 {
+	struct fuse_dev *fud;
 	struct cuse_conn *cc;
 	int rc;
 
@@ -501,19 +503,28 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
 	if (!cc)
 		return -ENOMEM;
 
-	fuse_conn_init(&cc->fc);
+	rc = fuse_conn_init(&cc->fc);
+	if (rc) {
+		kfree(cc);
+		return rc;
+	}
+
+	fud = fuse_dev_alloc(&cc->fc);
+	if (!fud) {
+		kfree(cc);
+		return -ENOMEM;
+	}
 
 	INIT_LIST_HEAD(&cc->list);
 	cc->fc.release = cuse_fc_release;
 
-	cc->fc.connected = 1;
 	cc->fc.initialized = 1;
 	rc = cuse_send_init(cc);
 	if (rc) {
-		fuse_conn_put(&cc->fc);
+		fuse_dev_free(fud);
 		return rc;
 	}
-	file->private_data = &cc->fc;	/* channel owns base reference to cc */
+	file->private_data = fud;
 
 	return 0;
 }
@@ -531,7 +542,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
  */
 static int cuse_channel_release(struct inode *inode, struct file *file)
 {
-	struct cuse_conn *cc = fc_to_cc(file->private_data);
+	struct fuse_dev *fud = file->private_data;
+	struct cuse_conn *cc = fc_to_cc(fud->fc);
 	int rc;
 
 	/* remove from the conntbl, no more access from this point on */
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,17 +25,19 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
+extern struct workqueue_struct *fuse_fput_wq;
 
-static struct fuse_conn *fuse_get_conn(struct file *file)
+static struct fuse_dev *fuse_get_dev(struct file *file)
 {
 	/*
 	 * Lockless access is OK, because file->private data is set
 	 * once during mount and is valid until the file is released.
 	 */
-	return file->private_data;
+	return ACCESS_ONCE(file->private_data);
 }
 
-static void fuse_request_init(struct fuse_req *req, struct page **pages,
+static void fuse_request_init(struct fuse_conn *fc,
+			      struct fuse_req *req, struct page **pages,
 			      struct fuse_page_desc *page_descs,
 			      unsigned npages)
 {
@@ -49,9 +51,12 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
 	req->pages = pages;
 	req->page_descs = page_descs;
 	req->max_pages = npages;
+	req->fiq = &fc->main_iq;
+	__set_bit(FR_PENDING, &req->flags);
 }
 
-static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+static struct fuse_req *__fuse_request_alloc(struct fuse_conn *fc,
+					     unsigned npages, gfp_t flags)
 {
 	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
 	if (req) {
@@ -74,20 +79,20 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
 			return NULL;
 		}
 
-		fuse_request_init(req, pages, page_descs, npages);
+		fuse_request_init(fc, req, pages, page_descs, npages);
 	}
 	return req;
 }
 
-struct fuse_req *fuse_request_alloc(unsigned npages)
+struct fuse_req *fuse_request_alloc(struct fuse_conn *fc, unsigned npages)
 {
-	return __fuse_request_alloc(npages, GFP_KERNEL);
+	return __fuse_request_alloc(fc, npages, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
+struct fuse_req *fuse_request_alloc_nofs(struct fuse_conn *fc, unsigned npages)
 {
-	return __fuse_request_alloc(npages, GFP_NOFS);
+	return __fuse_request_alloc(fc, npages, GFP_NOFS);
 }
 
 void fuse_request_free(struct fuse_req *req)
@@ -99,19 +104,6 @@ void fuse_request_free(struct fuse_req *req)
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
-static void block_sigs(sigset_t *oldset)
-{
-	sigset_t mask;
-
-	siginitsetinv(&mask, sigmask(SIGKILL));
-	sigprocmask(SIG_BLOCK, &mask, oldset);
-}
-
-static void restore_sigs(sigset_t *oldset)
-{
-	sigprocmask(SIG_SETMASK, oldset, NULL);
-}
-
 void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
@@ -128,7 +120,14 @@ static void fuse_req_init_context(struct fuse_req *req)
 {
 	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
 	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
-	req->in.h.pid = current->pid;
+	req->in.h.pid = task_pid_vnr(current);
+}
+
+void fuse_set_initialized(struct fuse_conn *fc)
+{
+	/* Make sure stores before this are seen on another CPU */
+	smp_wmb();
+	fc->initialized = 1;
 }
 
 static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
@@ -144,23 +143,23 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	atomic_inc(&fc->num_waiting);
 
 	if (fuse_block_alloc(fc, for_background)) {
-		sigset_t oldset;
-		int intr;
-
-		block_sigs(&oldset);
-		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
-				!fuse_block_alloc(fc, for_background));
-		restore_sigs(&oldset);
 		err = -EINTR;
-		if (intr)
+		if (wait_event_killable_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background)))
 			goto out;
 	}
+	/* Matches smp_wmb() in fuse_set_initialized() */
+	smp_rmb();
 
 	err = -ENOTCONN;
 	if (!fc->connected)
 		goto out;
 
-	req = fuse_request_alloc(npages);
+	err = -ECONNREFUSED;
+	if (fc->conn_error)
+		goto out;
+
+	req = fuse_request_alloc(fc, npages);
 	err = -ENOMEM;
 	if (!req) {
 		if (for_background)
@@ -169,8 +168,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	}
 
 	fuse_req_init_context(req);
-	req->waiting = 1;
-	req->background = for_background;
+	__set_bit(FR_WAITING, &req->flags);
+	if (for_background)
+		__set_bit(FR_BACKGROUND, &req->flags);
+
 	return req;
 
  out:
@@ -225,7 +226,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
 	struct fuse_file *ff = file->private_data;
 
 	spin_lock(&fc->lock);
-	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
+	fuse_request_init(fc, req, req->pages, req->page_descs, req->max_pages);
 	BUG_ON(ff->reserved_req);
 	ff->reserved_req = req;
 	wake_up_all(&fc->reserved_req_waitq);
@@ -253,20 +254,22 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
 
 	atomic_inc(&fc->num_waiting);
 	wait_event(fc->blocked_waitq, fc->initialized);
-	req = fuse_request_alloc(0);
+	/* Matches smp_wmb() in fuse_set_initialized() */
+	smp_rmb();
+	req = fuse_request_alloc(fc, 0);
 	if (!req)
 		req = get_reserved_req(fc, file);
 
 	fuse_req_init_context(req);
-	req->waiting = 1;
-	req->background = 0;
+	__set_bit(FR_WAITING, &req->flags);
+	__clear_bit(FR_BACKGROUND, &req->flags);
 	return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
-		if (unlikely(req->background)) {
+		if (test_bit(FR_BACKGROUND, &req->flags)) {
 			/*
 			 * We get here in the unlikely case that a background
 			 * request was allocated but not sent
@@ -277,8 +280,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 			spin_unlock(&fc->lock);
 		}
 
-		if (req->waiting)
+		if (test_bit(FR_WAITING, &req->flags)) {
+			__clear_bit(FR_WAITING, &req->flags);
 			atomic_dec(&fc->num_waiting);
+		}
 
 		if (req->stolen_file)
 			put_reserved_req(fc, req);
@@ -299,49 +304,41 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 	return nbytes;
 }
 
-static u64 fuse_get_unique(struct fuse_conn *fc)
+static u64 fuse_get_unique(struct fuse_iqueue *fiq)
 {
-	fc->reqctr++;
-	/* zero is special */
-	if (fc->reqctr == 0)
-		fc->reqctr = 1;
-
-	return fc->reqctr;
+	return ++fiq->reqctr;
 }
 
-static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
-	list_add_tail(&req->list, &fc->pending);
-	req->state = FUSE_REQ_PENDING;
-	if (!req->waiting) {
-		req->waiting = 1;
-		atomic_inc(&fc->num_waiting);
-	}
-	wake_up(&fc->waitq);
-	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	list_add_tail(&req->list, &fiq->pending);
+	wake_up_locked(&fiq->waitq);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 		       u64 nodeid, u64 nlookup)
 {
+	struct fuse_iqueue *fiq = &fc->main_iq;
+
 	forget->forget_one.nodeid = nodeid;
 	forget->forget_one.nlookup = nlookup;
 
-	spin_lock(&fc->lock);
-	if (fc->connected) {
-		fc->forget_list_tail->next = forget;
-		fc->forget_list_tail = forget;
-		wake_up(&fc->waitq);
-		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	spin_lock(&fiq->waitq.lock);
+	if (fiq->connected) {
+		fiq->forget_list_tail->next = forget;
+		fiq->forget_list_tail = forget;
+		wake_up_locked(&fiq->waitq);
+		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 	} else {
 		kfree(forget);
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 }
 
-static void flush_bg_queue(struct fuse_conn *fc)
+static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
 {
 	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
@@ -350,8 +347,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
 		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
-		req->in.h.unique = fuse_get_unique(fc);
-		queue_request(fc, req);
+		spin_lock(&fiq->waitq.lock);
+		req->in.h.unique = fuse_get_unique(fiq);
+		queue_request(fiq, req);
+		spin_unlock(&fiq->waitq.lock);
 	}
 }
 
@@ -362,20 +361,22 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * was closed.  The requester thread is woken up (if still waiting),
  * the 'end' callback is called if given, else the reference to the
  * request is released
- *
- * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
 {
-	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-	req->end = NULL;
-	list_del(&req->list);
-	list_del(&req->intr_entry);
-	req->state = FUSE_REQ_FINISHED;
-	if (req->background) {
-		req->background = 0;
+	struct fuse_iqueue *fiq = req->fiq;
 
+	if (test_and_set_bit(FR_FINISHED, &req->flags))
+		return;
+
+	spin_lock(&fiq->waitq.lock);
+	list_del_init(&req->intr_entry);
+	spin_unlock(&fiq->waitq.lock);
+	WARN_ON(test_bit(FR_PENDING, &req->flags));
+	WARN_ON(test_bit(FR_SENT, &req->flags));
+	if (test_bit(FR_BACKGROUND, &req->flags)) {
+		spin_lock(&fc->lock);
+		clear_bit(FR_BACKGROUND, &req->flags);
 		if (fc->num_background == fc->max_background)
 			fc->blocked = 0;
 
@@ -390,131 +391,136 @@ __releases(fc->lock)
 		}
 		fc->num_background--;
 		fc->active_background--;
-		flush_bg_queue(fc);
+		flush_bg_queue(fc, fiq);
+		spin_unlock(&fc->lock);
 	}
-	spin_unlock(&fc->lock);
 	wake_up(&req->waitq);
-	if (end)
-		end(fc, req);
+	if (req->end)
+		req->end(fc, req);
 	fuse_put_request(fc, req);
 }
 
-static void wait_answer_interruptible(struct fuse_conn *fc,
-				      struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
+static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
-	if (signal_pending(current))
+	spin_lock(&fiq->waitq.lock);
+	if (test_bit(FR_FINISHED, &req->flags)) {
+		spin_unlock(&fiq->waitq.lock);
 		return;
-
-	spin_unlock(&fc->lock);
-	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-	spin_lock(&fc->lock);
-}
-
-static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
-{
-	list_add_tail(&req->intr_entry, &fc->interrupts);
-	wake_up(&fc->waitq);
-	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	}
+	if (list_empty(&req->intr_entry)) {
+		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		wake_up_locked(&fiq->waitq);
+	}
+	spin_unlock(&fiq->waitq.lock);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
+	struct fuse_iqueue *fiq = req->fiq;
+	int err;
+
 	if (!fc->no_interrupt) {
 		/* Any signal may interrupt this */
-		wait_answer_interruptible(fc, req);
-
-		if (req->aborted)
-			goto aborted;
-		if (req->state == FUSE_REQ_FINISHED)
+		err = wait_event_interruptible(req->waitq,
+					test_bit(FR_FINISHED, &req->flags));
+		if (!err)
 			return;
 
-		req->interrupted = 1;
-		if (req->state == FUSE_REQ_SENT)
-			queue_interrupt(fc, req);
+		set_bit(FR_INTERRUPTED, &req->flags);
+		/* matches barrier in fuse_dev_do_read() */
+		smp_mb__after_atomic();
+		if (test_bit(FR_SENT, &req->flags))
+			queue_interrupt(fiq, req);
 	}
 
-	if (!req->force) {
-		sigset_t oldset;
-
+	if (!test_bit(FR_FORCE, &req->flags)) {
 		/* Only fatal signals may interrupt this */
-		block_sigs(&oldset);
-		wait_answer_interruptible(fc, req);
-		restore_sigs(&oldset);
+		err = wait_event_killable(req->waitq,
+					test_bit(FR_FINISHED, &req->flags));
 
-		if (req->aborted)
-			goto aborted;
-		if (req->state == FUSE_REQ_FINISHED)
+		if (!err)
 			return;
 
+		spin_lock(&fiq->waitq.lock);
 		/* Request is not yet in userspace, bail out */
-		if (req->state == FUSE_REQ_PENDING) {
+		if (test_bit(FR_PENDING, &req->flags)) {
 			list_del(&req->list);
+			spin_unlock(&fiq->waitq.lock);
 			__fuse_put_request(req);
 			req->out.h.error = -EINTR;
 			return;
 		}
+		spin_unlock(&fiq->waitq.lock);
 	}
 
 	/*
 	 * Either request is already in userspace, or it was forced.
 	 * Wait it out.
 	 */
-	spin_unlock(&fc->lock);
-	wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
-	spin_lock(&fc->lock);
-
-	if (!req->aborted)
-		return;
-
- aborted:
-	BUG_ON(req->state != FUSE_REQ_FINISHED);
-	if (req->locked) {
-		/* This is uninterruptible sleep, because data is
-		   being copied to/from the buffers of req.  During
-		   locked state, there mustn't be any filesystem
-		   operation (e.g. page fault), since that could lead
-		   to deadlock */
-		spin_unlock(&fc->lock);
-		wait_event(req->waitq, !req->locked);
-		spin_lock(&fc->lock);
-	}
+	wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
 }
 
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req,
+				struct fuse_file *ff)
 {
-	BUG_ON(req->background);
-	spin_lock(&fc->lock);
-	if (!fc->connected)
+	struct fuse_iqueue *fiq = req->fiq;
+
+	BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
+	spin_lock(&fiq->waitq.lock);
+	if (!fiq->connected) {
+		spin_unlock(&fiq->waitq.lock);
 		req->out.h.error = -ENOTCONN;
-	else if (fc->conn_error)
-		req->out.h.error = -ECONNREFUSED;
-	else {
-		req->in.h.unique = fuse_get_unique(fc);
-		queue_request(fc, req);
+	} else if (ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+		spin_unlock(&fiq->waitq.lock);
+		req->out.h.error = -EIO;
+	} else {
+		req->in.h.unique = fuse_get_unique(fiq);
+		queue_request(fiq, req);
 		/* acquire extra reference, since request is still needed
 		   after request_end() */
 		__fuse_get_request(req);
+		spin_unlock(&fiq->waitq.lock);
 
 		request_wait_answer(fc, req);
+		/* Pairs with smp_wmb() in request_end() */
+		smp_rmb();
 	}
-	spin_unlock(&fc->lock);
+}
+
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff)
+{
+	__set_bit(FR_ISREPLY, &req->flags);
+	if (!test_bit(FR_WAITING, &req->flags)) {
+		__set_bit(FR_WAITING, &req->flags);
+		atomic_inc(&fc->num_waiting);
+	}
+	__fuse_request_send(fc, req, ff);
 }
 
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->isreply = 1;
-	__fuse_request_send(fc, req);
+	fuse_request_check_and_send(fc, req, NULL);
 }
 EXPORT_SYMBOL_GPL(fuse_request_send);
 
-static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
-					    struct fuse_req *req)
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req)
 {
-	BUG_ON(!req->background);
+	struct fuse_iqueue *fiq = req->fiq;
+
+	BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+	if (!test_bit(FR_WAITING, &req->flags)) {
+		__set_bit(FR_WAITING, &req->flags);
+		atomic_inc(&fc->num_waiting);
+	}
+	__set_bit(FR_ISREPLY, &req->flags);
 	fc->num_background++;
 	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
@@ -524,57 +530,52 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	list_add_tail(&req->list, &fc->bg_queue);
-	flush_bg_queue(fc);
+	flush_bg_queue(fc, fiq);
 }
 
-static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
+	BUG_ON(!req->end);
 	spin_lock(&fc->lock);
-	if (fc->connected) {
-		fuse_request_send_nowait_locked(fc, req);
+	if (req->page_cache && req->ff &&
+	    test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->ff->ff_state)) {
+		BUG_ON(req->in.h.opcode != FUSE_READ);
+		req->out.h.error = -EIO;
+		__clear_bit(FR_BACKGROUND, &req->flags);
+		__clear_bit(FR_PENDING, &req->flags);
+		list_del_init(&req->list);
+		spin_unlock(&fc->lock);
+		request_end(fc, req);
+	} else if (fc->connected) {
+		fuse_request_send_background_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
+		spin_unlock(&fc->lock);
 		req->out.h.error = -ENOTCONN;
-		request_end(fc, req);
+		req->end(fc, req);
+		fuse_put_request(fc, req);
 	}
 }
-
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
-{
-	req->isreply = 1;
-	fuse_request_send_nowait(fc, req);
-}
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
 static int fuse_request_send_notify_reply(struct fuse_conn *fc,
 					  struct fuse_req *req, u64 unique)
 {
 	int err = -ENODEV;
+	struct fuse_iqueue *fiq = req->fiq;
 
-	req->isreply = 0;
+	__clear_bit(FR_ISREPLY, &req->flags);
 	req->in.h.unique = unique;
-	spin_lock(&fc->lock);
-	if (fc->connected) {
-		queue_request(fc, req);
+	spin_lock(&fiq->waitq.lock);
+	if (fiq->connected) {
+		queue_request(fiq, req);
 		err = 0;
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 
 	return err;
 }
 
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
-					 struct fuse_req *req)
-{
-	req->isreply = 1;
-	fuse_request_send_nowait_locked(fc, req);
-}
-
 void fuse_force_forget(struct file *file, u64 nodeid)
 {
 	struct inode *inode = file_inode(file);
@@ -590,8 +591,8 @@ void fuse_force_forget(struct file *file, u64 nodeid)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	req->isreply = 0;
-	__fuse_request_send(fc, req);
+	__clear_bit(FR_ISREPLY, &req->flags);
+	__fuse_request_send(fc, req, NULL);
 	/* ignore errors */
 	fuse_put_request(fc, req);
 }
@@ -601,38 +602,39 @@ void fuse_force_forget(struct file *file, u64 nodeid)
  * anything that could cause a page-fault.  If the request was already
  * aborted bail out.
  */
-static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int lock_request(struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
-		spin_lock(&fc->lock);
-		if (req->aborted)
+		spin_lock(&req->waitq.lock);
+		if (test_bit(FR_ABORTED, &req->flags))
 			err = -ENOENT;
 		else
-			req->locked = 1;
-		spin_unlock(&fc->lock);
+			set_bit(FR_LOCKED, &req->flags);
+		spin_unlock(&req->waitq.lock);
 	}
 	return err;
 }
 
 /*
- * Unlock request.  If it was aborted during being locked, the
- * requester thread is currently waiting for it to be unlocked, so
- * wake it up.
+ * Unlock request.  If it was aborted while locked, caller is responsible
+ * for unlocking and ending the request.
  */
-static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int unlock_request(struct fuse_req *req)
 {
+	int err = 0;
 	if (req) {
-		spin_lock(&fc->lock);
-		req->locked = 0;
-		if (req->aborted)
-			wake_up(&req->waitq);
-		spin_unlock(&fc->lock);
+		spin_lock(&req->waitq.lock);
+		if (test_bit(FR_ABORTED, &req->flags))
+			err = -ENOENT;
+		else
+			clear_bit(FR_LOCKED, &req->flags);
+		spin_unlock(&req->waitq.lock);
 	}
+	return err;
 }
 
 struct fuse_copy_state {
-	struct fuse_conn *fc;
 	int write;
 	struct fuse_req *req;
 	const struct iovec *iov;
@@ -649,12 +651,10 @@ struct fuse_copy_state {
 	unsigned move_pages:1;
 };
 
-static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
-			   int write,
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
 			   const struct iovec *iov, unsigned long nr_segs)
 {
 	memset(cs, 0, sizeof(*cs));
-	cs->fc = fc;
 	cs->write = write;
 	cs->iov = iov;
 	cs->nr_segs = nr_segs;
@@ -694,7 +694,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	unsigned long offset;
 	int err;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 	if (cs->pipebufs) {
 		struct pipe_buffer *buf = cs->pipebufs;
@@ -752,7 +755,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		cs->addr += cs->len;
 	}
 
-	return lock_request(cs->fc, cs->req);
+	return lock_request(cs->req);
 }
 
 /* Do as much copy to/from userspace buffer as we can */
@@ -798,7 +801,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	struct page *newpage;
 	struct pipe_buffer *buf = cs->pipebufs;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 
 	err = buf->ops->confirm(cs->pipe, buf);
@@ -852,12 +858,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 		lru_cache_add_file(newpage);
 
 	err = 0;
-	spin_lock(&cs->fc->lock);
-	if (cs->req->aborted)
+	spin_lock(&cs->req->waitq.lock);
+	if (test_bit(FR_ABORTED, &cs->req->flags))
 		err = -ENOENT;
 	else
 		*pagep = newpage;
-	spin_unlock(&cs->fc->lock);
+	spin_unlock(&cs->req->waitq.lock);
 
 	if (err) {
 		unlock_page(newpage);
@@ -877,7 +883,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
 	cs->buf = cs->mapaddr + buf->offset;
 
-	err = lock_request(cs->fc, cs->req);
+	err = lock_request(cs->req);
 	if (err)
 		return err;
 
@@ -888,11 +894,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 			 unsigned offset, unsigned count)
 {
 	struct pipe_buffer *buf;
+	int err;
 
 	if (cs->nr_segs == cs->pipe->buffers)
 		return -EIO;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 
 	buf = cs->pipebufs;
@@ -913,7 +923,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
  * done atomically
  */
 static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
-			  unsigned offset, unsigned count, int zeroing)
+			  unsigned offset, unsigned count, int zeroing, int moving)
 {
 	int err;
 	struct page *page = *pagep;
@@ -925,7 +935,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 		if (cs->write && cs->pipebufs && page) {
 			return fuse_ref_page(cs, page, offset, count);
 		} else if (!cs->len) {
-			if (cs->move_pages && page &&
+			if (cs->move_pages && page && moving &&
 			    offset == 0 && count == PAGE_SIZE) {
 				err = fuse_try_move_page(cs, pagep);
 				if (err <= 0)
@@ -962,7 +972,7 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 		unsigned count = min(nbytes, req->page_descs[i].length);
 
 		err = fuse_copy_page(cs, &req->pages[i], offset, count,
-				     zeroing);
+				     zeroing, 1);
 		if (err)
 			return err;
 
@@ -971,6 +981,24 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 	return 0;
 }
 
+static int fuse_copy_bvec(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
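+	/* copy a bio_vec based last argument to/from the userspace buffer */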
+	unsigned i;
+	struct fuse_req *req = cs->req;
+
+	for (i = 0; i < req->num_bvecs && (nbytes || zeroing); i++) {
+		struct bio_vec *bvec = &req->bvec[i];
+
+		int err = fuse_copy_page(cs, &bvec->bv_page,
+					 bvec->bv_offset, bvec->bv_len, zeroing, 0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Copy a single argument in the request to/from userspace buffer */
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
@@ -987,7 +1015,7 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 
 /* Copy request arguments to/from userspace buffer */
 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
-			  unsigned argpages, struct fuse_arg *args,
+			  unsigned argpages, unsigned argbvec, struct fuse_arg *args,
 			  int zeroing)
 {
 	int err = 0;
@@ -997,42 +1025,23 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 		struct fuse_arg *arg = &args[i];
 		if (i == numargs - 1 && argpages)
 			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else if (i == numargs - 1 && argbvec)
+			err = fuse_copy_bvec(cs, arg->size, zeroing);
 		else
 			err = fuse_copy_one(cs, arg->value, arg->size);
 	}
 	return err;
 }
 
-static int forget_pending(struct fuse_conn *fc)
+static int forget_pending(struct fuse_iqueue *fiq)
 {
-	return fc->forget_list_head.next != NULL;
+	return fiq->forget_list_head.next != NULL;
 }
 
-static int request_pending(struct fuse_conn *fc)
+static int request_pending(struct fuse_iqueue *fiq)
 {
-	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
-		forget_pending(fc);
-}
-
-/* Wait until a request is available on the pending list */
-static void request_wait(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->connected && !request_pending(fc)) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (signal_pending(current))
-			break;
-
-		spin_unlock(&fc->lock);
-		schedule();
-		spin_lock(&fc->lock);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&fc->waitq, &wait);
+	return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) ||
+		forget_pending(fiq);
 }
 
 /*
@@ -1041,11 +1050,12 @@ __acquires(fc->lock)
  * Unlike other requests this is assembled on demand, without a need
  * to allocate a separate fuse_req structure.
  *
- * Called with fc->lock held, releases it
+ * Called with fiq->waitq.lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_interrupt(struct fuse_iqueue *fiq,
+			       struct fuse_copy_state *cs,
 			       size_t nbytes, struct fuse_req *req)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
@@ -1053,7 +1063,7 @@ __releases(fc->lock)
 	int err;
 
 	list_del_init(&req->intr_entry);
-	req->intr_unique = fuse_get_unique(fc);
+	req->intr_unique = fuse_get_unique(fiq);
 	memset(&ih, 0, sizeof(ih));
 	memset(&arg, 0, sizeof(arg));
 	ih.len = reqsize;
@@ -1061,7 +1071,7 @@ __releases(fc->lock)
 	ih.unique = req->intr_unique;
 	arg.unique = req->in.h.unique;
 
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	if (nbytes < reqsize)
 		return -EINVAL;
 
@@ -1073,21 +1083,21 @@ __releases(fc->lock)
 	return err ? err : reqsize;
 }
 
-static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
 					       unsigned max,
 					       unsigned *countp)
 {
-	struct fuse_forget_link *head = fc->forget_list_head.next;
+	struct fuse_forget_link *head = fiq->forget_list_head.next;
 	struct fuse_forget_link **newhead = &head;
 	unsigned count;
 
 	for (count = 0; *newhead != NULL && count < max; count++)
 		newhead = &(*newhead)->next;
 
-	fc->forget_list_head.next = *newhead;
+	fiq->forget_list_head.next = *newhead;
 	*newhead = NULL;
-	if (fc->forget_list_head.next == NULL)
-		fc->forget_list_tail = &fc->forget_list_head;
+	if (fiq->forget_list_head.next == NULL)
+		fiq->forget_list_tail = &fiq->forget_list_head;
 
 	if (countp != NULL)
 		*countp = count;
@@ -1095,24 +1105,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
 	return head;
 }
 
-static int fuse_read_single_forget(struct fuse_conn *fc,
+static int fuse_read_single_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs,
 				   size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	int err;
-	struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+	struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
 	struct fuse_forget_in arg = {
 		.nlookup = forget->forget_one.nlookup,
 	};
 	struct fuse_in_header ih = {
 		.opcode = FUSE_FORGET,
 		.nodeid = forget->forget_one.nodeid,
-		.unique = fuse_get_unique(fc),
+		.unique = fuse_get_unique(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	kfree(forget);
 	if (nbytes < ih.len)
 		return -EINVAL;
@@ -1128,9 +1138,9 @@ __releases(fc->lock)
 	return ih.len;
 }
 
-static int fuse_read_batch_forget(struct fuse_conn *fc,
+static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs, size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	int err;
 	unsigned max_forgets;
@@ -1139,18 +1149,18 @@ __releases(fc->lock)
 	struct fuse_batch_forget_in arg = { .count = 0 };
 	struct fuse_in_header ih = {
 		.opcode = FUSE_BATCH_FORGET,
-		.unique = fuse_get_unique(fc),
+		.unique = fuse_get_unique(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
 	if (nbytes < ih.len) {
-		spin_unlock(&fc->lock);
+		spin_unlock(&fiq->waitq.lock);
 		return -EINVAL;
 	}
 
 	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
-	head = dequeue_forget(fc, max_forgets, &count);
-	spin_unlock(&fc->lock);
+	head = dequeue_forget(fiq, max_forgets, &count);
+	spin_unlock(&fiq->waitq.lock);
 
 	arg.count = count;
 	ih.len += count * sizeof(struct fuse_forget_one);
@@ -1177,14 +1187,15 @@ __releases(fc->lock)
 	return ih.len;
 }
 
-static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
+			    struct fuse_copy_state *cs,
 			    size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
-	if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
-		return fuse_read_single_forget(fc, cs, nbytes);
+	if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
+		return fuse_read_single_forget(fiq, cs, nbytes);
 	else
-		return fuse_read_batch_forget(fc, cs, nbytes);
+		return fuse_read_batch_forget(fiq, cs, nbytes);
 }
 
 /*
@@ -1196,46 +1207,51 @@ __releases(fc->lock)
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
 				struct fuse_copy_state *cs, size_t nbytes)
 {
-	int err;
+	ssize_t err;
+	struct fuse_conn *fc = fud->fc;
+	struct fuse_iqueue *fiq = fud->fiq;
+	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	unsigned reqsize;
 
  restart:
-	spin_lock(&fc->lock);
+	spin_lock(&fiq->waitq.lock);
 	err = -EAGAIN;
-	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-	    !request_pending(fc))
+	if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
+	    !request_pending(fiq))
 		goto err_unlock;
 
-	request_wait(fc);
-	err = -ENODEV;
-	if (!fc->connected)
+	err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+				!fiq->connected || request_pending(fiq));
+	if (err)
 		goto err_unlock;
-	err = -ERESTARTSYS;
-	if (!request_pending(fc))
+
+	err = -ENODEV;
+	if (!fiq->connected)
 		goto err_unlock;
 
-	if (!list_empty(&fc->interrupts)) {
-		req = list_entry(fc->interrupts.next, struct fuse_req,
+	if (!list_empty(&fiq->interrupts)) {
+		req = list_entry(fiq->interrupts.next, struct fuse_req,
 				 intr_entry);
-		return fuse_read_interrupt(fc, cs, nbytes, req);
+		return fuse_read_interrupt(fiq, cs, nbytes, req);
 	}
 
-	if (forget_pending(fc)) {
-		if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
-			return fuse_read_forget(fc, cs, nbytes);
+	if (forget_pending(fiq)) {
+		if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
+			return fuse_read_forget(fc, fiq, cs, nbytes);
 
-		if (fc->forget_batch <= -8)
-			fc->forget_batch = 16;
+		if (fiq->forget_batch <= -8)
+			fiq->forget_batch = 16;
 	}
 
-	req = list_entry(fc->pending.next, struct fuse_req, list);
-	req->state = FUSE_REQ_READING;
-	list_move(&req->list, &fc->io);
+	req = list_entry(fiq->pending.next, struct fuse_req, list);
+	clear_bit(FR_PENDING, &req->flags);
+	list_del_init(&req->list);
+	spin_unlock(&fiq->waitq.lock);
 
 	in = &req->in;
 	reqsize = in->h.len;
@@ -1248,37 +1264,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
 		request_end(fc, req);
 		goto restart;
 	}
-	spin_unlock(&fc->lock);
+	spin_lock(&fpq->lock);
+	list_add(&req->list, &fpq->io);
+	spin_unlock(&fpq->lock);
 	cs->req = req;
 	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
 	if (!err)
-		err = fuse_copy_args(cs, in->numargs, in->argpages,
+		err = fuse_copy_args(cs, in->numargs, in->argpages, in->argbvec,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(cs);
-	spin_lock(&fc->lock);
-	req->locked = 0;
-	if (req->aborted) {
-		request_end(fc, req);
-		return -ENODEV;
+	spin_lock(&fpq->lock);
+	clear_bit(FR_LOCKED, &req->flags);
+	if (!fpq->connected) {
+		err = -ENODEV;
+		goto out_end;
 	}
 	if (err) {
 		req->out.h.error = -EIO;
-		request_end(fc, req);
-		return err;
+		goto out_end;
 	}
-	if (!req->isreply)
-		request_end(fc, req);
-	else {
-		req->state = FUSE_REQ_SENT;
-		list_move_tail(&req->list, &fc->processing);
-		if (req->interrupted)
-			queue_interrupt(fc, req);
-		spin_unlock(&fc->lock);
+	if (!test_bit(FR_ISREPLY, &req->flags)) {
+		err = reqsize;
+		goto out_end;
 	}
+	list_move_tail(&req->list, &fpq->processing);
+	spin_unlock(&fpq->lock);
+	set_bit(FR_SENT, &req->flags);
+	/* matches barrier in request_wait_answer() */
+	smp_mb__after_atomic();
+	if (test_bit(FR_INTERRUPTED, &req->flags))
+		queue_interrupt(fiq, req);
+
 	return reqsize;
 
+out_end:
+	if (!test_bit(FR_PRIVATE, &req->flags))
+		list_del_init(&req->list);
+	spin_unlock(&fpq->lock);
+	request_end(fc, req);
+	return err;
+
  err_unlock:
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	return err;
 }
 
@@ -1287,13 +1314,14 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct fuse_copy_state cs;
 	struct file *file = iocb->ki_filp;
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return -EPERM;
 
-	fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+	fuse_copy_init(&cs, 1, iov, nr_segs);
 
-	return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+	return fuse_dev_do_read(fud, file, &cs, iov_length(iov, nr_segs));
 }
 
 static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -1321,18 +1349,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	int do_wakeup = 0;
 	struct pipe_buffer *bufs;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(in);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(in);
+
+	if (!fud)
 		return -EPERM;
 
-	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kvmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (!bufs)
 		return -ENOMEM;
 
-	fuse_copy_init(&cs, fc, 1, NULL, 0);
+	fuse_copy_init(&cs, 1, NULL, 0);
 	cs.pipebufs = bufs;
 	cs.pipe = pipe;
-	ret = fuse_dev_do_read(fc, in, &cs, len);
+	ret = fuse_dev_do_read(fud, in, &cs, len);
 	if (ret < 0)
 		goto out;
 
@@ -1382,7 +1411,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	for (; page_nr < cs.nr_segs; page_nr++)
 		page_cache_release(bufs[page_nr].page);
 
-	kfree(bufs);
+	kvfree(bufs);
 	return ret;
 }
 
@@ -1598,7 +1627,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 			goto out_iput;
 
 		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
-		err = fuse_copy_page(cs, &page, offset, this_num, 0);
+		err = fuse_copy_page(cs, &page, offset, this_num, 0, 1);
 		if (!err && offset == 0 && (num != 0 || file_size == end))
 			SetPageUptodate(page);
 		unlock_page(page);
@@ -1734,9 +1763,43 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
 	return err;
 }
 
+static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_files_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_invalidate_files(fc, outarg.ino);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
+	/* ASSUMPTION: pstorage fused doesn't use FUSE_NOTIFY_STORE */
+	if (fc->compat_inval_files && code == 4)
+		code = FUSE_NOTIFY_INVAL_FILES;
+
 	switch (code) {
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
@@ -1756,6 +1819,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_DELETE:
 		return fuse_notify_delete(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_FILES:
+		return fuse_notify_inval_files(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -1763,13 +1829,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 }
 
 /* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
 {
-	struct list_head *entry;
+	struct fuse_req *req;
 
-	list_for_each(entry, &fc->processing) {
-		struct fuse_req *req;
-		req = list_entry(entry, struct fuse_req, list);
+	list_for_each_entry(req, &fpq->processing, list) {
 		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
@@ -1795,8 +1859,8 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 			return -EINVAL;
 		lastarg->size -= diffsize;
 	}
-	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
-			      out->page_zeroing);
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->argbvec,
+			      out->args, out->page_zeroing);
 }
 
 /*
@@ -1806,10 +1870,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 				 struct fuse_copy_state *cs, size_t nbytes)
 {
 	int err;
+	struct fuse_conn *fc = fud->fc;
+	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 
@@ -1837,63 +1903,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
 	if (oh.error <= -1000 || oh.error > 0)
 		goto err_finish;
 
-	spin_lock(&fc->lock);
+	spin_lock(&fpq->lock);
 	err = -ENOENT;
-	if (!fc->connected)
-		goto err_unlock;
+	if (!fpq->connected)
+		goto err_unlock_pq;
 
-	req = request_find(fc, oh.unique);
+	req = request_find(fpq, oh.unique);
 	if (!req)
-		goto err_unlock;
+		goto err_unlock_pq;
 
-	if (req->aborted) {
-		spin_unlock(&fc->lock);
-		fuse_copy_finish(cs);
-		spin_lock(&fc->lock);
-		request_end(fc, req);
-		return -ENOENT;
-	}
 	/* Is it an interrupt reply? */
 	if (req->intr_unique == oh.unique) {
+		spin_unlock(&fpq->lock);
+
 		err = -EINVAL;
 		if (nbytes != sizeof(struct fuse_out_header))
-			goto err_unlock;
+			goto err_finish;
 
 		if (oh.error == -ENOSYS)
 			fc->no_interrupt = 1;
 		else if (oh.error == -EAGAIN)
-			queue_interrupt(fc, req);
+			queue_interrupt(req->fiq, req);
 
-		spin_unlock(&fc->lock);
 		fuse_copy_finish(cs);
 		return nbytes;
 	}
 
-	req->state = FUSE_REQ_WRITING;
-	list_move(&req->list, &fc->io);
+	clear_bit(FR_SENT, &req->flags);
+	list_move(&req->list, &fpq->io);
 	req->out.h = oh;
-	req->locked = 1;
+	set_bit(FR_LOCKED, &req->flags);
+	spin_unlock(&fpq->lock);
 	cs->req = req;
 	if (!req->out.page_replace)
 		cs->move_pages = 0;
-	spin_unlock(&fc->lock);
 
 	err = copy_out_args(cs, &req->out, nbytes);
 	fuse_copy_finish(cs);
 
-	spin_lock(&fc->lock);
-	req->locked = 0;
-	if (!err) {
-		if (req->aborted)
-			err = -ENOENT;
-	} else if (!req->aborted)
+	spin_lock(&fpq->lock);
+	clear_bit(FR_LOCKED, &req->flags);
+	if (!fpq->connected)
+		err = -ENOENT;
+	else if (err)
 		req->out.h.error = -EIO;
+	if (!test_bit(FR_PRIVATE, &req->flags))
+		list_del_init(&req->list);
+	spin_unlock(&fpq->lock);
+
 	request_end(fc, req);
 
 	return err ? err : nbytes;
 
- err_unlock:
-	spin_unlock(&fc->lock);
+ err_unlock_pq:
+	spin_unlock(&fpq->lock);
  err_finish:
 	fuse_copy_finish(cs);
 	return err;
@@ -1903,13 +1966,14 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+
+	if (!fud)
 		return -EPERM;
 
-	fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+	fuse_copy_init(&cs, 0, iov, nr_segs);
 
-	return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+	return fuse_dev_do_write(fud, &cs, iov_length(iov, nr_segs));
 }
 
 static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
@@ -1920,15 +1984,15 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	unsigned idx;
 	struct pipe_buffer *bufs;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc;
+	struct fuse_dev *fud;
 	size_t rem;
 	ssize_t ret;
 
-	fc = fuse_get_conn(out);
-	if (!fc)
+	fud = fuse_get_dev(out);
+	if (!fud)
 		return -EPERM;
 
-	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kvmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (!bufs)
 		return -ENOMEM;
 
@@ -1972,39 +2036,42 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	}
 	pipe_unlock(pipe);
 
-	fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+	fuse_copy_init(&cs, 0, NULL, nbuf);
 	cs.pipebufs = bufs;
 	cs.pipe = pipe;
 
 	if (flags & SPLICE_F_MOVE)
 		cs.move_pages = 1;
 
-	ret = fuse_dev_do_write(fc, &cs, len);
+	ret = fuse_dev_do_write(fud, &cs, len);
 
 	for (idx = 0; idx < nbuf; idx++) {
 		struct pipe_buffer *buf = &bufs[idx];
 		buf->ops->release(pipe, buf);
 	}
 out:
-	kfree(bufs);
+	kvfree(bufs);
 	return ret;
 }
 
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
 	unsigned mask = POLLOUT | POLLWRNORM;
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_iqueue *fiq;
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return POLLERR;
 
-	poll_wait(file, &fc->waitq, wait);
+	fiq = fud->fiq;
+	poll_wait(file, &fiq->waitq, wait);
 
-	spin_lock(&fc->lock);
-	if (!fc->connected)
+	spin_lock(&fiq->waitq.lock);
+	if (!fiq->connected)
 		mask = POLLERR;
-	else if (request_pending(fc))
+	else if (request_pending(fiq))
 		mask |= POLLIN | POLLRDNORM;
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 
 	return mask;
 }
@@ -2015,67 +2082,17 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
 	while (!list_empty(head)) {
 		struct fuse_req *req;
 		req = list_entry(head->next, struct fuse_req, list);
 		req->out.h.error = -ECONNABORTED;
-		request_end(fc, req);
-		spin_lock(&fc->lock);
-	}
-}
-
-/*
- * Abort requests under I/O
- *
- * The requests are set to aborted and finished, and the request
- * waiter is woken up.  This will make request_wait_answer() wait
- * until the request is unlocked and then return.
- *
- * If the request is asynchronous, then the end function needs to be
- * called after waiting for the request to be unlocked (if it was
- * locked).
- */
-static void end_io_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	while (!list_empty(&fc->io)) {
-		struct fuse_req *req =
-			list_entry(fc->io.next, struct fuse_req, list);
-		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-
-		req->aborted = 1;
-		req->out.h.error = -ECONNABORTED;
-		req->state = FUSE_REQ_FINISHED;
+		clear_bit(FR_SENT, &req->flags);
 		list_del_init(&req->list);
-		wake_up(&req->waitq);
-		if (end) {
-			req->end = NULL;
-			__fuse_get_request(req);
-			spin_unlock(&fc->lock);
-			wait_event(req->waitq, !req->locked);
-			end(fc, req);
-			fuse_put_request(fc, req);
-			spin_lock(&fc->lock);
-		}
+		request_end(fc, req);
 	}
 }
 
-static void end_queued_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	fc->max_background = UINT_MAX;
-	flush_bg_queue(fc);
-	end_requests(fc, &fc->pending);
-	end_requests(fc, &fc->processing);
-	while (forget_pending(fc))
-		kfree(dequeue_forget(fc, 1, NULL));
-}
-
 static void end_polls(struct fuse_conn *fc)
 {
 	struct rb_node *p;
@@ -2091,70 +2108,197 @@ static void end_polls(struct fuse_conn *fc)
 	}
 }
 
+void fuse_abort_iqueue(struct fuse_iqueue *fiq, struct list_head *to_end)
+{
+	struct fuse_req *req;
+
+	spin_lock(&fiq->waitq.lock);
+	fiq->connected = 0;
+	list_splice_init(&fiq->pending, to_end);
+	list_for_each_entry(req, to_end, list)
+		clear_bit(FR_PENDING, &req->flags);
+	while (forget_pending(fiq))
+		kfree(dequeue_forget(fiq, 1, NULL));
+	wake_up_all_locked(&fiq->waitq);
+	spin_unlock(&fiq->waitq.lock);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+}
+
 /*
  * Abort all requests.
  *
- * Emergency exit in case of a malicious or accidental deadlock, or
- * just a hung filesystem.
- *
- * The same effect is usually achievable through killing the
- * filesystem daemon and all users of the filesystem.  The exception
- * is the combination of an asynchronous request and the tricky
- * deadlock (see Documentation/filesystems/fuse.txt).
+ * Emergency exit in case of a malicious or accidental deadlock, or just a hung
+ * filesystem.
  *
- * During the aborting, progression of requests from the pending and
- * processing lists onto the io list, and progression of new requests
- * onto the pending list is prevented by req->connected being false.
+ * The same effect is usually achievable through killing the filesystem daemon
+ * and all users of the filesystem.  The exception is the combination of an
+ * asynchronous request and the tricky deadlock (see
+ * Documentation/filesystems/fuse.txt).
  *
- * Progression of requests under I/O to the processing list is
- * prevented by the req->aborted flag being true for these requests.
- * For this reason requests on the io list must be aborted first.
+ * Aborting requests under I/O goes as follows: 1: Separate out unlocked
+ * requests, they should be finished off immediately.  Locked requests will be
+ * finished after unlock; see unlock_request(). 2: Finish off the unlocked
+ * requests.  It is possible that some request will finish before we can.  This
+ * is OK, the request will in that case be removed from the list before we touch
+ * it.
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
+	int cpu;
+
 	spin_lock(&fc->lock);
 	if (fc->connected) {
+		struct fuse_dev *fud;
+		struct fuse_req *req, *next;
+		LIST_HEAD(to_end1);
+		LIST_HEAD(to_end2);
+
 		fc->connected = 0;
 		fc->blocked = 0;
-		fc->initialized = 1;
-		end_io_requests(fc);
-		end_queued_requests(fc);
+		fuse_set_initialized(fc);
+		list_for_each_entry(fud, &fc->devices, entry) {
+			struct fuse_pqueue *fpq = &fud->pq;
+
+			spin_lock(&fpq->lock);
+			fpq->connected = 0;
+			list_for_each_entry_safe(req, next, &fpq->io, list) {
+				req->out.h.error = -ECONNABORTED;
+				spin_lock(&req->waitq.lock);
+				set_bit(FR_ABORTED, &req->flags);
+				if (!test_bit(FR_LOCKED, &req->flags)) {
+					set_bit(FR_PRIVATE, &req->flags);
+					list_move(&req->list, &to_end1);
+				}
+				spin_unlock(&req->waitq.lock);
+			}
+			list_splice_init(&fpq->processing, &to_end2);
+			spin_unlock(&fpq->lock);
+		}
+		fc->max_background = UINT_MAX;
+		for_each_online_cpu(cpu)
+			flush_bg_queue(fc, per_cpu_ptr(fc->iqs, cpu));
+		flush_bg_queue(fc, &fc->main_iq);
+
+		for_each_online_cpu(cpu)
+			fuse_abort_iqueue(per_cpu_ptr(fc->iqs, cpu), &to_end2);
+		fuse_abort_iqueue(&fc->main_iq, &to_end2);
+
 		end_polls(fc);
-		wake_up_all(&fc->waitq);
 		wake_up_all(&fc->blocked_waitq);
-		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+		spin_unlock(&fc->lock);
+
+		while (!list_empty(&to_end1)) {
+			req = list_first_entry(&to_end1, struct fuse_req, list);
+			__fuse_get_request(req);
+			list_del_init(&req->list);
+			request_end(fc, req);
+		}
+		end_requests(fc, &to_end2);
+	} else {
+		spin_unlock(&fc->lock);
 	}
-	spin_unlock(&fc->lock);
 }
 EXPORT_SYMBOL_GPL(fuse_abort_conn);
 
 int fuse_dev_release(struct inode *inode, struct file *file)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (fc) {
-		spin_lock(&fc->lock);
-		fc->connected = 0;
-		fc->blocked = 0;
-		fc->initialized = 1;
-		end_queued_requests(fc);
-		end_polls(fc);
-		wake_up_all(&fc->blocked_waitq);
-		spin_unlock(&fc->lock);
-		fuse_conn_put(fc);
-	}
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (fud) {
+		struct fuse_conn *fc = fud->fc;
+		struct fuse_pqueue *fpq = &fud->pq;
 
+		WARN_ON(!list_empty(&fpq->io));
+		end_requests(fc, &fpq->processing);
+		/* Are we the last open device? */
+		if (atomic_dec_and_test(&fc->dev_count)) {
+			WARN_ON(fud->fiq->fasync != NULL);
+			fuse_abort_conn(fc);
+		}
+		fuse_dev_free(fud);
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(fuse_dev_release);
 
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return -EPERM;
 
 	/* No locking - fasync_helper does its own locking */
-	return fasync_helper(fd, file, on, &fc->fasync);
+	return fasync_helper(fd, file, on, &fud->fiq->fasync);
+}
+
+static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
+{
+	struct fuse_dev *fud;
+
+	if (new->private_data)
+		return -EINVAL;
+
+	fud = fuse_dev_alloc(fc);
+	if (!fud)
+		return -ENOMEM;
+
+	new->private_data = fud;
+	atomic_inc(&fc->dev_count);
+
+	return 0;
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	int err = -ENOTTY;
+
+	if (cmd == FUSE_DEV_IOC_CLONE) {
+		int oldfd;
+
+		err = -EFAULT;
+		if (!get_user(oldfd, (__u32 __user *) arg)) {
+			struct file *old = fget(oldfd);
+
+			err = -EINVAL;
+			if (old) {
+				struct fuse_dev *fud = NULL;
+
+				/*
+				 * Check against file->f_op because CUSE
+				 * uses the same ioctl handler.
+				 */
+				if (old->f_op == file->f_op &&
+				    old->f_cred->user_ns == file->f_cred->user_ns)
+					fud = fuse_get_dev(old);
+
+				if (fud) {
+					mutex_lock(&fuse_mutex);
+					err = fuse_device_clone(fud->fc, file);
+					mutex_unlock(&fuse_mutex);
+				}
+				fput(old);
+			}
+		}
+	} else if (cmd == FUSE_DEV_IOC_SETAFF) {
+		if (arg >= NR_CPUS || !cpu_possible(arg)) {
+			err = -EINVAL;
+		} else {
+			struct fuse_dev *fud = fuse_get_dev(file);
+			spin_lock(&fud->fc->lock);
+
+			fud->fiq->handled_by_fud--;
+			BUG_ON(fud->fiq->handled_by_fud < 0);
+
+			fud->fiq = per_cpu_ptr(fud->fc->iqs, arg);
+
+			fud->fiq->handled_by_fud++;
+			spin_unlock(&fud->fc->lock);
+			err = 0;
+		}
+	}
+	return err;
 }
 
 const struct file_operations fuse_dev_operations = {
@@ -2169,6 +2313,8 @@ const struct file_operations fuse_dev_operations = {
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
+	.unlocked_ioctl = fuse_dev_ioctl,
+	.compat_ioctl   = fuse_dev_ioctl,
 };
 EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
@@ -2181,11 +2327,16 @@ static struct miscdevice fuse_miscdevice = {
 int __init fuse_dev_init(void)
 {
 	int err = -ENOMEM;
+
+	fuse_fput_wq = create_workqueue("fuse_fput");
+	if (!fuse_fput_wq)
+		goto out;
+
 	fuse_req_cachep = kmem_cache_create("fuse_request",
 					    sizeof(struct fuse_req),
 					    0, 0, NULL);
 	if (!fuse_req_cachep)
-		goto out;
+		goto out_destroq_wq;
 
 	err = misc_register(&fuse_miscdevice);
 	if (err)
@@ -2195,6 +2346,8 @@ int __init fuse_dev_init(void)
 
  out_cache_clean:
 	kmem_cache_destroy(fuse_req_cachep);
+ out_destroq_wq:
+	destroy_workqueue(fuse_fput_wq);
  out:
 	return err;
 }
@@ -2203,4 +2356,5 @@ void fuse_dev_cleanup(void)
 {
 	misc_deregister(&fuse_miscdevice);
 	kmem_cache_destroy(fuse_req_cachep);
+	destroy_workqueue(fuse_fput_wq);
 }
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -322,7 +322,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 
 	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
 			   &outarg->attr, entry_attr_timeout(outarg),
-			   attr_version);
+			   attr_version, 0);
 	err = -ENOMEM;
 	if (!*inode) {
 		fuse_queue_forget(fc, forget, outarg->nodeid, 1);
@@ -419,6 +419,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	/* Userspace expects S_IFREG in create mode */
 	BUG_ON((mode & S_IFMT) != S_IFREG);
 
+	if ((flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	forget = fuse_alloc_forget();
 	err = -ENOMEM;
 	if (!forget)
@@ -473,7 +476,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopen.open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
-			  &outentry.attr, entry_attr_timeout(&outentry), 0);
+			  &outentry.attr, entry_attr_timeout(&outentry), 0, 1);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		fuse_sync_release(ff, flags);
@@ -487,6 +490,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	fuse_invalidate_attr(dir);
 	err = finish_open(file, entry, generic_file_open, opened);
 	if (err) {
+		if (fc->writeback_cache) {
+			struct fuse_inode *fi = get_fuse_inode(inode);
+			atomic_dec(&fi->num_openers);
+		}
 		fuse_sync_release(ff, flags);
 	} else {
 		file->private_data = fuse_file_get(ff);
@@ -588,7 +595,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		goto out_put_forget_req;
 
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-			  &outarg.attr, entry_attr_timeout(&outarg), 0);
+			  &outarg.attr, entry_attr_timeout(&outarg), 0, 0);
 	if (!inode) {
 		fuse_queue_forget(fc, forget, outarg.nodeid, 1);
 		return -ENOMEM;
@@ -898,6 +905,14 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 			  struct kstat *stat)
 {
 	unsigned int blkbits;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/* see the comment in fuse_change_attributes() */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+		attr->size = i_size_read(inode);
+		attr->mtime = inode->i_mtime.tv_sec;
+		attr->mtimensec = inode->i_mtime.tv_nsec;
+	}
 
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = attr->ino;
@@ -924,7 +939,7 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 }
 
 static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
-			   struct file *file)
+			   struct file *file, int get_size_from_attr)
 {
 	int err;
 	struct fuse_getattr_in inarg;
@@ -970,13 +985,32 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 			fuse_change_attributes(inode, &outarg.attr,
 					       attr_timeout(&outarg),
 					       attr_version);
-			if (stat)
+			if (get_size_from_attr)
+				stat->size = outarg.attr.size;
+			else if (stat) {
+				struct fuse_inode *fi = get_fuse_inode(inode);
 				fuse_fillattr(inode, &outarg.attr, stat);
+				if (!atomic_read(&fi->num_openers))
+					stat->size = outarg.attr.size;
+			}
 		}
 	}
 	return err;
 }
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size)
+{
+	struct kstat stat;
+	int err;
+
+	err = fuse_do_getattr(inode, &stat, file, 1);
+	if (err)
+		return err;
+
+	*size = stat.size;
+	return 0;
+}
+
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed)
 {
@@ -986,7 +1020,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 
 	if (time_before64(fi->i_time, get_jiffies_64())) {
 		r = true;
-		err = fuse_do_getattr(inode, stat, file);
+		err = fuse_do_getattr(inode, stat, file, 0);
 	} else {
 		r = false;
 		err = 0;
@@ -1136,7 +1170,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
-	return fuse_do_getattr(inode, NULL, NULL);
+	return fuse_do_getattr(inode, NULL, NULL, 0);
 }
 
 /*
@@ -1322,7 +1356,7 @@ static int fuse_direntplus_link(struct file *file,
 		goto out;
 
 	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
-			  &o->attr, entry_attr_timeout(o), attr_version);
+			  &o->attr, entry_attr_timeout(o), attr_version, 0);
 	if (!inode)
 		goto out;
 
@@ -1624,6 +1658,89 @@ void fuse_release_nowrite(struct inode *inode)
 	spin_unlock(&fc->lock);
 }
 
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+			      struct inode *inode,
+			      struct fuse_setattr_in *inarg_p,
+			      struct fuse_attr_out *outarg_p)
+{
+	req->in.h.opcode = FUSE_SETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg_p);
+	req->in.args[0].value = inarg_p;
+	req->out.numargs = 1;
+	if (fc->minor < 9)
+		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+	else
+		req->out.args[0].size = sizeof(*outarg_p);
+	req->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_mtime(struct file *file, bool nofail)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = NULL;
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	int err;
+
+	if (nofail) {
+		req = fuse_get_req_nofail_nopages(fc, file);
+	} else {
+		req = fuse_get_req_nopages(fc);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+
+	inarg.valid |= FATTR_MTIME;
+	inarg.mtime = inode->i_mtime.tv_sec;
+	inarg.mtimensec = inode->i_mtime.tv_nsec;
+
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+
+	if (!err)
+		clear_bit(FUSE_I_MTIME_UPDATED, &fi->state);
+
+	return err;
+}
+
+static inline void set_mtime_helper(struct inode *inode, struct timespec mtime)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	inode->i_mtime = mtime;
+	clear_bit(FUSE_I_MTIME_UPDATED, &fi->state);
+}
+
+/*
+ * S_NOCMTIME is clear, so we need to update inode->i_mtime manually. But
+ * we can also clear FUSE_I_MTIME_UPDATED if FUSE_SETATTR has just changed
+ * the mtime on the server.
+ */
+static void fuse_set_mtime_local(struct iattr *iattr, struct inode *inode)
+{
+	unsigned ivalid = iattr->ia_valid;
+
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) {
+		if (ivalid & ATTR_MTIME_SET)
+			set_mtime_helper(inode, iattr->ia_mtime);
+		else
+			set_mtime_helper(inode, current_fs_time(inode->i_sb));
+	} else if (ivalid & ATTR_SIZE)
+		set_mtime_helper(inode, current_fs_time(inode->i_sb));
+}
+
 /*
  * Set attributes, and at the same time refresh them.
  *
@@ -1641,6 +1758,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
 	bool is_truncate = false;
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	int err;
 
@@ -1682,17 +1800,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		inarg.valid |= FATTR_LOCKOWNER;
 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
 	}
-	req->in.h.opcode = FUSE_SETATTR;
-	req->in.h.nodeid = get_node_id(inode);
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	req->out.numargs = 1;
-	if (fc->minor < 9)
-		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
-	else
-		req->out.args[0].size = sizeof(outarg);
-	req->out.args[0].value = &outarg;
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1709,10 +1817,16 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	}
 
 	spin_lock(&fc->lock);
+	/* the kernel maintains i_mtime locally */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode))
+		fuse_set_mtime_local(attr, inode);
+
 	fuse_change_attributes_common(inode, &outarg.attr,
 				      attr_timeout(&outarg));
 	oldsize = inode->i_size;
-	i_size_write(inode, outarg.attr.size);
+	/* see the comment in fuse_change_attributes() */
+	if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+		i_size_write(inode, outarg.attr.size);
 
 	if (is_truncate) {
 		/* NOTE: this may release/reacquire fc->lock */
@@ -1724,9 +1838,11 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * Only call invalidate_inode_pages2() after removing
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
-	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+	if ((is_truncate || !is_wb) &&
+			S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
-		invalidate_inode_pages2(inode->i_mapping);
+		if (!is_wb)
+			invalidate_inode_pages2(inode->i_mapping);
 	}
 
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1936,6 +2052,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	return err;
 }
 
+static int fuse_update_time(struct inode *inode, struct timespec *now,
+			    int flags)
+{
+	if (flags & S_MTIME) {
+		inode->i_mtime = *now;
+		set_bit(FUSE_I_MTIME_UPDATED, &get_fuse_inode(inode)->state);
+		BUG_ON(!S_ISREG(inode->i_mode));
+	}
+	return 0;
+}
+
 static const struct inode_operations_wrapper fuse_dir_inode_operations = {
 	.ops = {
 	.lookup		= fuse_lookup,
@@ -1978,6 +2105,8 @@ static const struct inode_operations fuse_common_inode_operations = {
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+	.update_time	= fuse_update_time,
+	.fiemap		= fuse_fiemap,
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -17,8 +17,27 @@
 #include <linux/swap.h>
 #include <linux/aio.h>
 #include <linux/falloc.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/virtinfo.h>
+#include <linux/file.h>
+
+struct workqueue_struct *fuse_fput_wq;
+static DEFINE_SPINLOCK(fuse_fput_lock);
+static LIST_HEAD(fuse_fput_head);
+static void fuse_fput_routine(struct work_struct *);
+static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
 
 static const struct file_operations fuse_direct_io_file_operations;
+static void fuse_sync_writes(struct inode *inode);
+
+static void fuse_account_request(struct fuse_conn *fc, size_t count)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	ub_percpu_inc(ub, fuse_requests);
+	ub_percpu_add(ub, fuse_bytes, count);
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_FUSE_REQ, NULL);
+}
 
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
@@ -58,27 +77,40 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	if (unlikely(!ff))
 		return NULL;
 
+	ff->ff_state = 0;
+
 	ff->fc = fc;
-	ff->reserved_req = fuse_request_alloc(0);
+	ff->reserved_req = fuse_request_alloc(fc, 0);
 	if (unlikely(!ff->reserved_req)) {
 		kfree(ff);
 		return NULL;
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
+	INIT_LIST_HEAD(&ff->rw_entry);
 	atomic_set(&ff->count, 0);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
 	spin_lock(&fc->lock);
 	ff->kh = ++fc->khctr;
+	ff->ff_dentry = NULL;
+	list_add_tail(&ff->fl, &fc->conn_files);
 	spin_unlock(&fc->lock);
 
 	return ff;
 }
 
+static void fuse_file_list_del(struct fuse_file *ff)
+{
+	spin_lock(&ff->fc->lock);
+	list_del_init(&ff->fl);
+	spin_unlock(&ff->fc->lock);
+}
+
 void fuse_file_free(struct fuse_file *ff)
 {
+	fuse_file_list_del(ff);
 	fuse_request_free(ff->reserved_req);
 	kfree(ff);
 }
@@ -128,19 +160,33 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 		struct fuse_req *req = ff->reserved_req;
 
 		if (sync) {
-			req->background = 0;
+			__clear_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send(ff->fc, req);
+			if (req->out.h.error == -EINTR) {
+				__set_bit(FR_PENDING, &req->flags);
+				req->out.h.error = 0;
+				goto async_fallback;
+			}
+			fuse_file_list_del(ff);
 			path_put(&req->misc.release.path);
 			fuse_put_request(ff->fc, req);
 		} else {
+async_fallback:
+			fuse_file_list_del(ff);
 			req->end = fuse_release_end;
-			req->background = 1;
+			__set_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send_background(ff->fc, req);
 		}
 		kfree(ff);
 	}
 }
 
+static void __fuse_file_put(struct fuse_file *ff)
+{
+	if (atomic_dec_and_test(&ff->count))
+		BUG();
+}
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir)
 {
@@ -171,11 +217,43 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 }
 EXPORT_SYMBOL_GPL(fuse_do_open);
 
+static void fuse_link_file(struct file *file, bool write)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff = file->private_data;
+
+	struct list_head *entry = write ? &ff->write_entry : &ff->rw_entry;
+	struct list_head *list  = write ? &fi->write_files : &fi->rw_files;
+
+	/*
+	 * The file may be written through mmap, so chain it onto the
+	 * inode's write_files (or rw_files) list.
+	 */
+	spin_lock(&fc->lock);
+	if (list_empty(entry))
+		list_add(entry, list);
+	spin_unlock(&fc->lock);
+}
+
+static void fuse_link_write_file(struct file *file)
+{
+	fuse_link_file(file, true);
+}
+
+static void fuse_link_rw_file(struct file *file)
+{
+	fuse_link_file(file, false);
+}
+
 void fuse_finish_open(struct inode *inode, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
+	ff->ff_dentry = file->f_dentry;
+
 	if (ff->open_flags & FOPEN_DIRECT_IO)
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
@@ -191,6 +269,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
 	}
+	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+		fuse_link_write_file(file);
+
+	fuse_link_rw_file(file);
 }
 
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -198,6 +280,9 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
+	if ((file->f_flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -206,6 +291,40 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	if (err)
 		return err;
 
+	if (fc->writeback_cache && !isdir) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		u64 size;
+
+		mutex_lock(&inode->i_mutex);
+
+		spin_lock(&fc->lock);
+		atomic_inc(&fi->num_openers);
+
+		if (atomic_read(&fi->num_openers) == 1) {
+			fi->i_size_unstable = 1;
+			spin_unlock(&fc->lock);
+			err = fuse_getattr_size(inode, file, &size);
+			if (err) {
+				spin_lock(&fc->lock);
+				atomic_dec(&fi->num_openers);
+				fi->i_size_unstable = 0;
+				spin_unlock(&fc->lock);
+
+				mutex_unlock(&inode->i_mutex);
+				fuse_release_common(file, FUSE_RELEASE);
+				return err;
+			}
+
+			spin_lock(&fc->lock);
+			i_size_write(inode, size);
+			fi->i_size_unstable = 0;
+			spin_unlock(&fc->lock);
+		} else
+			spin_unlock(&fc->lock);
+
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	fuse_finish_open(inode, file);
 
 	return 0;
@@ -219,6 +338,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
 
 	spin_lock(&fc->lock);
 	list_del(&ff->write_entry);
+	list_del(&ff->rw_entry);
 	if (!RB_EMPTY_NODE(&ff->polled_node))
 		rb_erase(&ff->polled_node, &fc->polled_files);
 	spin_unlock(&fc->lock);
@@ -256,6 +376,14 @@ void fuse_release_common(struct file *file, int opcode)
 	path_get(&file->f_path);
 	req->misc.release.path = file->f_path;
 
+	/*
+	 * No more in-flight asynchronous READ or WRITE requests if
+	 * fuse file release is synchronous
+	 */
+	if (ff->fc->close_wait) {
+		BUG_ON(atomic_read(&ff->count) != 1);
+	}
+
 	/*
 	 * Normally this will send the RELEASE request, however if
 	 * some asynchronous READ or WRITE requests are outstanding,
@@ -265,7 +393,8 @@ void fuse_release_common(struct file *file, int opcode)
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL ||
+			  ff->fc->close_wait);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -275,6 +404,57 @@ static int fuse_open(struct inode *inode, struct file *file)
 
 static int fuse_release(struct inode *inode, struct file *file)
 {
+	struct fuse_file *ff = file->private_data;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	if (ff->fc->writeback_cache) {
+		if (file->f_mode & FMODE_WRITE) {
+			filemap_write_and_wait(file->f_mapping);
+
+			/* Must remove the file from the write list. Otherwise this file
+			 * may get more writeback from other files rerouted via write_files.
+			 */
+			spin_lock(&ff->fc->lock);
+			list_del_init(&ff->write_entry);
+			spin_unlock(&ff->fc->lock);
+
+			/* A writeback from another fuse file might come after
+			 * filemap_write_and_wait() above
+			 */
+			if (!ff->fc->close_wait)
+				filemap_write_and_wait(file->f_mapping);
+		} else
+			BUG_ON(!list_empty(&ff->write_entry));
+
+		/* This can livelock: the inode can be open via another file,
+		 * and that file can generate continuous writeback.
+		 * i_mutex could probably be taken around this to avoid that.
+		 *
+		 * For now we replace this with waiting on ff->count. This is
+		 * safe because we essentially wait only for writeback (and readahead)
+		 * already enqueued on this file, and it will not get new ones: it is closing.
+		 */
+		if (!ff->fc->close_wait)
+			wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages));
+		else
+			wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
+		/* Wait for threads that have just released ff to leave their
+		 * critical sections. Taking the spinlock is the first thing
+		 * fuse_release_common does, so this is strictly unnecessary,
+		 * but it is still good to emphasize right here that we need it.
+		 */
+		spin_unlock_wait(&ff->fc->lock);
+
+		/* from now on we can trust the userspace attr.size */
+		atomic_dec(&fi->num_openers);
+	} else if (ff->fc->close_wait)
+		wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
+	if (test_bit(FUSE_I_MTIME_UPDATED,
+		     &get_fuse_inode(inode)->state))
+		fuse_flush_mtime(file, true);
+
 	fuse_release_common(file, FUSE_RELEASE);
 
 	/* return value is ignored by VFS */
@@ -284,9 +464,10 @@ static int fuse_release(struct inode *inode, struct file *file)
 void fuse_sync_release(struct fuse_file *ff, int flags)
 {
 	WARN_ON(atomic_read(&ff->count) > 1);
+	fuse_file_list_del(ff);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
-	ff->reserved_req->force = 1;
-	ff->reserved_req->background = 0;
+	__set_bit(FR_FORCE, &ff->reserved_req->flags);
+	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
 	fuse_request_send(ff->fc, ff->reserved_req);
 	fuse_put_request(ff->fc, ff->reserved_req);
 	kfree(ff);
@@ -315,6 +496,40 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+				    pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool found = false;
+	struct rb_node *n;
+
+	spin_lock(&fc->lock);
+
+	n = fi->writepages.rb_node;
+
+	while (n) {
+		struct fuse_req *req;
+		pgoff_t curr_index;
+
+		req = rb_entry(n, struct fuse_req, writepages_entry);
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+
+		if (idx_from >= curr_index + req->num_pages)
+			n = n->rb_right;
+		else if (idx_to < curr_index)
+			n = n->rb_left;
+		else {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
+
 /*
  * Check if page is under writeback
  *
@@ -325,16 +540,26 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_req *req;
 	bool found = false;
+	struct rb_node *n;
 
 	spin_lock(&fc->lock);
-	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+
+	n = fi->writepages.rb_node;
+
+	while (n) {
+		struct fuse_req *req;
 		pgoff_t curr_index;
 
+		req = rb_entry(n, struct fuse_req, writepages_entry);
 		BUG_ON(req->inode != inode);
 		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-		if (curr_index == index) {
+
+		if (index >= curr_index + req->num_pages)
+			n = n->rb_right;
+		else if (index < curr_index)
+			n = n->rb_left;
+		else {
 			found = true;
 			break;
 		}
@@ -350,12 +575,45 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
  * Since fuse doesn't rely on the VM writeback tracking, this has to
  * use some other means.
  */
-static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-	return 0;
+}
+
+/*
+ * Can be woken up by FUSE_NOTIFY_INVAL_FILES
+ */
+static void __fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
+							struct fuse_file *ff,
+							pgoff_t index)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index) ||
+		   test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
+}
+
+static void fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
+						      struct file *file,
+						      pgoff_t index)
+{
+	__fuse_wait_on_page_writeback_or_invalidate(inode,
+				file->private_data, index);
+}
+
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+				   size_t bytes)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	pgoff_t idx_from, idx_to;
+
+	idx_from = start >> PAGE_CACHE_SHIFT;
+	idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+	wait_event(fi->page_waitq,
+		   !fuse_range_is_writeback(inode, idx_from, idx_to));
 }
 
 static int fuse_flush(struct file *file, fl_owner_t id)
@@ -370,9 +628,24 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	if (fc->no_flush)
+	if (fc->no_flush || !(file->f_mode & FMODE_WRITE))
 		return 0;
 
+	err = filemap_write_and_wait(file->f_mapping);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	fuse_sync_writes(inode);
+	mutex_unlock(&inode->i_mutex);
+
+	if (test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
+		err = -EIO;
+	if (err)
+		return err;
+
 	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -382,7 +655,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	req->force = 1;
+	__set_bit(FR_FORCE, &req->flags);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -441,12 +714,31 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 
 	fuse_sync_writes(inode);
 
+	/* Due to the implementation of fuse writeback, filemap_write_and_wait_range()
+	 * does not catch errors. We have to check them directly after fuse_sync_writes().
+	 */
+	if (test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
+		err = -EIO;
+	if (err)
+		goto out;
+
+	if (!datasync && test_bit(FUSE_I_MTIME_UPDATED,
+				  &get_fuse_inode(inode)->state)) {
+		err = fuse_flush_mtime(file, false);
+		if (err)
+			goto out;
+	}
+
 	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out;
 	}
 
+	mutex_unlock(&inode->i_mutex);
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.fsync_flags = datasync ? 1 : 0;
@@ -465,6 +757,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 			fc->no_fsync = 1;
 		err = 0;
 	}
+	return err;
 out:
 	mutex_unlock(&inode->i_mutex);
 	return err;
@@ -494,6 +787,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
+
+	if (opcode == FUSE_READ) {
+		struct fuse_iqueue *fiq = __this_cpu_ptr(ff->fc->iqs);
+		if (fiq->handled_by_fud)
+			req->fiq = fiq;
+		req->inode = file->f_dentry->d_inode;
+	}
 }
 
 static void fuse_release_user_pages(struct fuse_req *req, int write)
@@ -508,6 +808,29 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 	}
 }
 
+static void fuse_fput_routine(struct work_struct *data)
+{
+	spin_lock(&fuse_fput_lock);
+	while (likely(!list_empty(&fuse_fput_head))) {
+		struct fuse_io_priv *io = list_entry(fuse_fput_head.next,
+						     struct fuse_io_priv,
+						     list);
+		struct file *file = io->file;
+
+		list_del(&io->list);
+		spin_unlock(&fuse_fput_lock);
+
+		/* hack: __fput() is not visible outside fs/file_table.c */
+		BUG_ON(atomic_long_read(&file->f_count));
+		atomic_long_inc(&file->f_count);
+		fput(file);
+
+		kfree(io);
+		spin_lock(&fuse_fput_lock);
+	}
+	spin_unlock(&fuse_fput_lock);
+}
+
 /**
  * In case of short read, the caller sets 'pos' to the position of
  * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -539,6 +862,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 
 	if (!left) {
 		long res;
+		struct file *file = io->iocb->ki_filp;
 
 		if (io->err)
 			res = io->err;
@@ -548,7 +872,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			res = io->bytes < 0 ? io->size : io->bytes;
 
 			if (!is_sync_kiocb(io->iocb)) {
-				struct path *path = &io->iocb->ki_filp->f_path;
+				struct path *path = &file->f_path;
 				struct inode *inode = path->dentry->d_inode;
 				struct fuse_conn *fc = get_fuse_conn(inode);
 				struct fuse_inode *fi = get_fuse_inode(inode);
@@ -559,8 +883,32 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			}
 		}
 
+		if (res < 0 && printk_ratelimit())
+			printk("fuse_aio_complete(io=%p, err=%d, pos=%ld"
+			       "): io->err=%d io->bytes=%ld io->size=%ld "
+			       "is_sync=%d res=%ld ki_opcode=%d ki_pos=%llu\n",
+			       io, err, pos, io->err, io->bytes,
+			       io->size, is_sync_kiocb(io->iocb), res,
+			       io->iocb->ki_opcode, io->iocb->ki_pos);
+
+		/* We have to bump f_count here to avoid a deadlock with a
+		 * single-threaded fuse daemon: if the process that generated the
+		 * AIO has already close(2)d the file, the fput() called from
+		 * aio_complete() will be the last fput(); hence, it will send a
+		 * flush_mtime (or release) request to userspace, which is busy
+		 * writing the ACK for this very AIO to in-kernel fuse */
+		get_file(file);
+		BUG_ON(io->file != io->iocb->ki_filp);
 		aio_complete(io->iocb, res, 0);
-		kfree(io);
+
+		if (unlikely(atomic_long_dec_and_test(&file->f_count))) {
+			spin_lock(&fuse_fput_lock);
+			list_add(&io->list, &fuse_fput_head);
+			spin_unlock(&fuse_fput_lock);
+			queue_work(fuse_fput_wq, &fuse_fput_work);
+		} else {
+			kfree(io);
+		}
 	}
 }
 
@@ -569,7 +917,8 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
 	struct fuse_io_priv *io = req->io;
 	ssize_t pos = -1;
 
-	fuse_release_user_pages(req, !io->write);
+	if (!req->bvec)
+		fuse_release_user_pages(req, !io->write);
 
 	if (io->write) {
 		if (req->misc.write.in.size != req->misc.write.out.size)
@@ -581,6 +930,15 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
 				req->out.args[0].size;
 	}
 
+	if (req->out.h.error)
+		printk("fuse_aio_complete_req: request (rw=%s fh=0x%llx "
+		       "pos=%lld size=%d) completed with err=%d\n",
+		       !io->write ? "READ"                   : "WRITE",
+		       !io->write ? req->misc.read.in.fh     : req->misc.write.in.fh,
+		       !io->write ? req->misc.read.in.offset : req->misc.write.in.offset,
+		       !io->write ? req->misc.read.in.size   : req->misc.write.in.size,
+		       req->out.h.error);
+
 	fuse_aio_complete(io, req->out.h.error, pos);
 }
 
@@ -609,6 +967,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 	struct fuse_conn *fc = ff->fc;
 
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	if (owner != NULL) {
 		struct fuse_read_in *inarg = &req->misc.read.in;
 
@@ -619,7 +978,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 	if (io->async)
 		return fuse_async_req_send(fc, req, count, io);
 
-	fuse_request_send(fc, req);
+	fuse_request_check_and_send(fc, req, ff);
 	return req->out.args[0].size;
 }
 
@@ -638,86 +997,156 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
 	spin_unlock(&fc->lock);
 }
 
-static int fuse_readpage(struct file *file, struct page *page)
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+			    u64 attr_ver)
+{
+	size_t num_read = req->out.args[0].size;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (fc->writeback_cache) {
+		/*
+		 * A hole in a file. Some data after the hole are in page cache,
+		 * but have not reached the client fs yet. So, the hole is not
+		 * present there.
+		 */
+		int i;
+		int start_idx = num_read >> PAGE_CACHE_SHIFT;
+		size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+		for (i = start_idx; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			void *mapaddr = kmap_atomic(page);
+
+			memset(mapaddr + off, 0, PAGE_CACHE_SIZE - off);
+
+			kunmap_atomic(mapaddr);
+			off = 0;
+		}
+	} else {
+		loff_t pos = page_offset(req->pages[0]) + num_read;
+		fuse_read_update_size(inode, pos, attr_ver);
+	}
+}
+
+static int __fuse_readpage(struct file *file, struct page *page, size_t count,
+			   int *err, struct fuse_req **req_pp, u64 *attr_ver_p,
+			   bool page_needs_release, bool *killed_p)
 {
 	struct fuse_io_priv io = { .async = 0, .file = file };
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
-	size_t num_read;
-	loff_t pos = page_offset(page);
-	size_t count = PAGE_CACHE_SIZE;
-	u64 attr_ver;
-	int err;
-
-	err = -EIO;
-	if (is_bad_inode(inode))
-		goto out;
+	size_t num_read = 0;
+	bool killed = false;
 
 	/*
 	 * Page writeback can extend beyond the lifetime of the
 	 * page-cache page, so make sure we read a properly synced
 	 * page.
+	 *
+	 * But we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress.
 	 */
-	fuse_wait_on_page_writeback(inode, page->index);
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	req = fuse_get_req(fc, 1);
-	err = PTR_ERR(req);
+	*err = PTR_ERR(req);
 	if (IS_ERR(req))
-		goto out;
+		goto read_done;
 
-	attr_ver = fuse_get_attr_version(fc);
+	if (attr_ver_p)
+		*attr_ver_p = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_descs[0].length = count;
-	num_read = fuse_send_read(req, &io, pos, count, NULL);
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
+	req->page_cache = 1;
+	req->page_needs_release = page_needs_release;
+
+	num_read = fuse_send_read(req, &io, page_offset(page), count, NULL);
+	killed = req->killed;
+	*err = killed ? -EIO : req->out.h.error;
+
+	if (*err)
+		fuse_put_request(fc, req);
+	else
+		*req_pp = req;
+read_done:
+	if (killed_p)
+		*killed_p = killed;
+	return num_read;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = NULL;
+	size_t num_read;
+	size_t count = PAGE_CACHE_SIZE;
+	u64 attr_ver;
+	int err;
+	bool killed = false;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
 
+	num_read = __fuse_readpage(file, page, count, &err, &req, &attr_ver,
+				   false, &killed);
 	if (!err) {
 		/*
 		 * Short read means EOF.  If file size is larger, truncate it
 		 */
 		if (num_read < count)
-			fuse_read_update_size(inode, pos + num_read, attr_ver);
+			fuse_short_read(req, inode, attr_ver);
 
 		SetPageUptodate(page);
 	}
-
-	fuse_invalidate_attr(inode); /* atime changed */
- out:
-	unlock_page(page);
+	if (req) {
+		fuse_put_request(fc, req);
+		fuse_invalidate_attr(inode); /* atime changed */
+	}
+out:
+	if (!killed)
+		unlock_page(page);
 	return err;
 }
 
+void fuse_release_ff(struct inode *inode, struct fuse_file *ff)
+{
+	if (ff) {
+		if (ff->fc->close_wait) {
+			spin_lock(&ff->fc->lock);
+			__fuse_file_put(ff);
+			wake_up(&get_fuse_inode(inode)->page_waitq);
+			spin_unlock(&ff->fc->lock);
+		} else {
+			fuse_file_put(ff, false);
+		}
+	}
+}
+
 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int i;
 	size_t count = req->misc.read.in.size;
 	size_t num_read = req->out.args[0].size;
-	struct address_space *mapping = NULL;
-
-	for (i = 0; mapping == NULL && i < req->num_pages; i++)
-		mapping = req->pages[i]->mapping;
+	struct inode *inode = req->inode;
 
-	if (mapping) {
-		struct inode *inode = mapping->host;
+	/* fused might have processed this request before the lease was lost */
+	if (req->killed && !req->out.h.error)
+		req->out.h.error = -EIO;
 
-		/*
-		 * Short read means EOF. If file size is larger, truncate it
-		 */
-		if (!req->out.h.error && num_read < count) {
-			loff_t pos;
+	if (req->killed)
+		goto killed;
 
-			pos = page_offset(req->pages[0]) + num_read;
-			fuse_read_update_size(inode, pos,
-					      req->misc.read.attr_ver);
-		}
-		fuse_invalidate_attr(inode); /* atime changed */
-	}
+	/*
+	 * Short read means EOF. If file size is larger, truncate it
+	 */
+	if (!req->out.h.error && num_read < count)
+		fuse_short_read(req, inode, req->misc.read.attr_ver);
 
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -728,8 +1157,12 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		unlock_page(page);
 		page_cache_release(page);
 	}
+
+killed:
+	fuse_invalidate_attr(inode); /* atime changed */
+
 	if (req->ff)
-		fuse_file_put(req->ff, false);
+		fuse_release_ff(inode, req->ff);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -742,7 +1175,10 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	req->out.page_replace = 1;
+	req->page_cache = 1;
+	req->page_needs_release = false;
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
 		req->ff = fuse_file_get(ff);
@@ -760,6 +1196,7 @@ struct fuse_fill_data {
 	struct file *file;
 	struct inode *inode;
 	unsigned nr_pages;
+	struct page **orig_pages;
 };
 
 static int fuse_readpages_fill(void *_data, struct page *page)
@@ -767,9 +1204,11 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	struct fuse_fill_data *data = _data;
 	struct fuse_req *req = data->req;
 	struct inode *inode = data->inode;
+	struct file *file = data->file;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	fuse_wait_on_page_writeback(inode, page->index);
+	/* we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress */
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	if (req->num_pages &&
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
@@ -892,6 +1331,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 	struct fuse_write_in *inarg = &req->misc.write.in;
 
 	fuse_write_fill(req, ff, pos, count);
+	fuse_account_request(fc, count);
 	inarg->flags = file->f_flags;
 	if (owner != NULL) {
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
@@ -905,16 +1345,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 	return req->misc.write.out.size;
 }
 
-void fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool ret = false;
 
 	spin_lock(&fc->lock);
 	fi->attr_version = ++fc->attr_version;
-	if (pos > inode->i_size)
+	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
+		ret = true;
+	}
 	spin_unlock(&fc->lock);
+
+	return ret;
 }
 
 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -993,6 +1438,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 
 		mark_page_accessed(page);
 
+		iov_iter_advance(ii, tmp);
 		if (!tmp) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1005,7 +1451,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		req->page_descs[req->num_pages].length = tmp;
 		req->num_pages++;
 
-		iov_iter_advance(ii, tmp);
 		count += tmp;
 		pos += tmp;
 		offset += tmp;
@@ -1099,6 +1544,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 	loff_t endbyte = 0;
 
+	if (get_fuse_conn(inode)->writeback_cache)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
 	WARN_ON(iocb->ki_pos != pos);
 
 	ocount = 0;
@@ -1181,7 +1629,12 @@ static inline void fuse_page_descs_length_init(struct fuse_req *req,
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
 {
-	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+	struct iovec *iov;
+
+	BUG_ON(!iov_iter_has_iovec(ii));
+	iov = (struct iovec *)ii->data;
+
+	return (unsigned long)iov->iov_base + ii->iov_offset;
 }
 
 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
@@ -1272,8 +1725,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
 
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write)
+		       int flags)
 {
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -1283,6 +1738,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 	struct fuse_req *req;
 	struct iov_iter ii;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	iov_iter_init(&ii, iov, nr_segs, count, 0);
 
 	if (io->async)
@@ -1302,10 +1759,17 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 			break;
 		}
 
-		if (write)
+		if (!cuse)
+			fuse_wait_on_writeback(file->f_mapping->host, pos,
+					       nbytes);
+
+		if (write) {
 			nres = fuse_send_write(req, io, pos, nbytes, owner);
-		else
+			task_io_account_write(nbytes);
+		} else {
 			nres = fuse_send_read(req, io, pos, nbytes, owner);
+			task_io_account_read(nbytes);
+		}
 
 		if (!io->async)
 			fuse_release_user_pages(req, !write);
@@ -1380,7 +1844,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
 
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res)
-		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos,
+				     FUSE_DIO_WRITE);
 
 	fuse_invalidate_attr(inode);
 
@@ -1410,8 +1875,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
-	__free_page(req->pages[0]);
-	fuse_file_put(req->ff, false);
+	int i;
+
+	for (i = 0; i < req->num_pages; i++)
+		__free_page(req->pages[i]);
+
+	if (!fc->writeback_cache && !fc->close_wait)
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1419,11 +1889,16 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 	struct inode *inode = req->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	int i;
 
-	list_del(&req->writepages_entry);
-	dec_bdi_stat(bdi, BDI_WRITEBACK);
-	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
-	bdi_writeout_inc(bdi);
+	rb_erase(&req->writepages_entry, &fi->writepages);
+	if (fc->writeback_cache || fc->close_wait)
+		__fuse_file_put(req->ff);
+	for (i = 0; i < req->num_pages; i++) {
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
+	}
 	wake_up(&fi->page_waitq);
 }
 
@@ -1435,14 +1910,16 @@ __acquires(fc->lock)
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
+	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
-	if (!fc->connected)
+	if (!fc->connected ||
+	    test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->ff->ff_state))
 		goto out_free;
 
-	if (inarg->offset + PAGE_CACHE_SIZE <= size) {
-		inarg->size = PAGE_CACHE_SIZE;
+	if (inarg->offset + data_size <= size) {
+		inarg->size = data_size;
 	} else if (inarg->offset < size) {
-		inarg->size = size & (PAGE_CACHE_SIZE - 1);
+		inarg->size = size - inarg->offset;
 	} else {
 		/* Got truncated off completely */
 		goto out_free;
@@ -1495,34 +1972,106 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_free(fc, req);
 }
 
-static int fuse_writepage_locked(struct page *page)
+static struct fuse_file *fuse_write_file(struct fuse_conn *fc,
+					 struct fuse_inode *fi)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_req *req;
-	struct fuse_file *ff;
+	struct fuse_file *ff = NULL;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+		fuse_file_get(ff);
+	}
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
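+/*
+ * Link a writeback request into the inode's rb-tree of in-flight
+ * writepage requests, keyed by the range of page indexes it covers.
+ * Ranges never overlap here: conflicting writeback is waited for by the
+ * callers, so an overlapping insertion is treated as a bug.
+ */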
+static int tree_insert(struct rb_root *root, struct fuse_req *ins_req)
+{
+	pgoff_t idx_from = ins_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+	pgoff_t idx_to   = idx_from + ins_req->num_pages - 1;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node  *parent = NULL;
+
+	while (*p) {
+		struct fuse_req *req;
+		pgoff_t curr_index;
+
+		parent = *p;
+		req = rb_entry(parent, struct fuse_req, writepages_entry);
+		BUG_ON(req->inode != ins_req->inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+
+		if (idx_from >= curr_index + req->num_pages)
+			p = &(*p)->rb_right;
+		else if (idx_to < curr_index)
+			p = &(*p)->rb_left;
+		else
+			BUG();
+	}
+
+	rb_link_node(&ins_req->writepages_entry, parent, p);
+	rb_insert_color(&ins_req->writepages_entry, root);
+	return 0;
+}
+
+static int fuse_writepage_locked(struct page *page,
+				 struct writeback_control *wbc,
+				 struct fuse_file **ff_pp)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
 	struct page *tmp_page;
+	struct fuse_file *ff;
+	int err = 0;
 
-	set_page_writeback(page);
+	if (fuse_page_is_writeback(inode, page->index)) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			return 0;
+		}
+
+	/* we can acquire ff here because we hold locked pages */
+		ff = fuse_write_file(fc, get_fuse_inode(inode));
+		if (!ff)
+			goto dummy_end_page_wb_err;
+
+		/* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */
+		__fuse_wait_on_page_writeback_or_invalidate(inode, ff, page->index);
+
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+			if (ff_pp)
+				*ff_pp = ff;
+			goto dummy_end_page_wb;
+		}
+
+		fuse_release_ff(inode, ff);
+	}
 
-	req = fuse_request_alloc_nofs(1);
+	if (test_set_page_writeback(page))
+		BUG();
+
+	req = fuse_request_alloc_nofs(fc, 1);
 	if (!req)
 		goto err;
 
-	req->background = 1; /* writeback always goes to bg_queue */
+	/* writeback always goes to bg_queue */
+	__set_bit(FR_BACKGROUND, &req->flags);
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
 
-	spin_lock(&fc->lock);
-	BUG_ON(list_empty(&fi->write_files));
-	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-	req->ff = fuse_file_get(ff);
-	spin_unlock(&fc->lock);
-
-	fuse_write_fill(req, ff, page_offset(page), 0);
+	req->ff = fuse_write_file(fc, fi);
+	if (!req->ff)
+		goto err_nofile;
+	if (ff_pp)
+		*ff_pp = fuse_file_get(req->ff);
+	fuse_write_fill(req, req->ff, page_offset(page), 0);
+	fuse_account_request(fc, PAGE_CACHE_SIZE);
 
 	copy_highpage(tmp_page, page);
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
@@ -1538,7 +2087,7 @@ static int fuse_writepage_locked(struct page *page)
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fc->lock);
-	list_add(&req->writepages_entry, &fi->writepages);
+	tree_insert(&fi->writepages, req);
 	list_add_tail(&req->list, &fi->queued_writes);
 	fuse_flush_writepages(inode);
 	spin_unlock(&fc->lock);
@@ -1547,31 +2096,446 @@ static int fuse_writepage_locked(struct page *page)
 
 	return 0;
 
+err_nofile:
+	printk(KERN_ERR "FUSE: page dirtied on dead file\n");
+	__free_page(tmp_page);
 err_free:
 	fuse_request_free(req);
 err:
 	end_page_writeback(page);
 	return -ENOMEM;
+
+dummy_end_page_wb_err:
+	printk(KERN_ERR "FUSE: page under fwb dirtied on dead file\n");
+	err = -EIO;
+	/* fall through ... */
+dummy_end_page_wb:
+	if (test_set_page_writeback(page))
+		BUG();
+	end_page_writeback(page);
+	return err;
 }
 
 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int err;
 
-	err = fuse_writepage_locked(page);
+	err = fuse_writepage_locked(page, wbc, NULL);
+	unlock_page(page);
+
+	return err;
+}
+
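+/*
+ * Submit the batch of dirty pages collected by fuse_writepages_fill().
+ * Every original page is copied into a freshly allocated temporary page
+ * so that writeback on the originals can be ended as soon as the request
+ * is queued; the request itself is linked into fi->writepages and
+ * fi->queued_writes and kicked off via fuse_flush_writepages().
+ */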
+static int fuse_send_writepages(struct fuse_fill_data *data)
+{
+	int i, all_ok = 1;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff;
+	loff_t off = -1;
+	int num_pages = req->num_pages;
+
+	/* we can acquire ff here because we hold locked pages */
+	ff = fuse_write_file(fc, fi);
+
+	if (!ff) {
+		printk(KERN_ERR "FUSE: pages dirtied on dead file\n");
+		for (i = 0; i < req->num_pages; i++)
+			end_page_writeback(req->pages[i]);
+		return -EIO;
+	}
+
+	if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+		for (i = 0; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			req->pages[i] = NULL;
+			SetPageError(page);
+			end_page_writeback(page);
+		}
+		fuse_release_ff(inode, ff);
+		fuse_put_request(fc, req);
+		return 0;
+	}
+
+	req->inode = inode;
+	req->misc.write.in.offset = page_offset(req->pages[0]);
+
+	spin_lock(&fc->lock);
+	tree_insert(&fi->writepages, req);
+	spin_unlock(&fc->lock);
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		struct page *tmp_page;
+
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (tmp_page) {
+			copy_highpage(tmp_page, page);
+			inc_bdi_stat(bdi, BDI_WRITEBACK);
+			inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+		} else
+			all_ok = 0;
+		req->pages[i] = tmp_page;
+		if (i == 0)
+			off = page_offset(page);
+
+		data->orig_pages[i] = page;
+	}
+
+	if (!all_ok) {
+		for (i = 0; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			if (page) {
+				dec_bdi_stat(bdi, BDI_WRITEBACK);
+				dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+				__free_page(page);
+				req->pages[i] = NULL;
+			}
+			end_page_writeback(data->orig_pages[i]);
+		}
+
+		spin_lock(&fc->lock);
+		rb_erase(&req->writepages_entry, &fi->writepages);
+		wake_up(&fi->page_waitq);
+		spin_unlock(&fc->lock);
+
+		fuse_release_ff(inode, ff);
+		return -ENOMEM;
+	}
+
+	req->ff = fuse_file_get(ff);
+	fuse_write_fill(req, ff, off, 0);
+	fuse_account_request(fc, req->num_pages << PAGE_CACHE_SHIFT);
+
+	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+	req->in.argpages = 1;
+	__set_bit(FR_BACKGROUND, &req->flags);
+	fuse_page_descs_length_init(req, 0, req->num_pages);
+	req->end = fuse_writepage_end;
+
+	spin_lock(&fc->lock);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(data->inode);
+	spin_unlock(&fc->lock);
+
+	for (i = 0; i < num_pages; i++)
+		end_page_writeback(data->orig_pages[i]);
+
+	fuse_release_ff(inode, ff);
+	return 0;
+}
+
+/*
+ * Returns true if and only if fuse connection is blocked and there is
+ * no file invalidation in progress.
+ */
+static inline bool fuse_blocked_for_wb(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool blocked = true;
+
+	if (!fc->blocked)
+		return false;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->rw_files)) {
+		struct fuse_file *ff = list_entry(fi->rw_files.next,
+						  struct fuse_file, rw_entry);
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+			blocked = false;
+	}
+	spin_unlock(&fc->lock);
+
+	return blocked;
+}
+
+static int fuse_writepages_fill(struct page *page,
+		struct writeback_control *wbc, void *_data)
+{
+	struct fuse_fill_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int check_for_blocked = 0;
+
+	if (fuse_page_is_writeback(inode, page->index)) {
+		struct fuse_file *ff;
+
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+
+		/* we can acquire ff here because we hold locked pages */
+		ff = fuse_write_file(fc, get_fuse_inode(inode));
+		if (!ff) {
+			printk(KERN_ERR "FUSE: dirty page on dead file\n");
+			unlock_page(page);
+			return -EIO;
+		}
+
+		/* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */
+		__fuse_wait_on_page_writeback_or_invalidate(inode, ff, page->index);
+
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+			unlock_page(page);
+			fuse_release_ff(inode, ff);
+			return 0;
+		}
+
+		fuse_release_ff(inode, ff);
+	}
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int err;
+
+		err = fuse_send_writepages(data);
+		if (err) {
+			unlock_page(page);
+			return err;
+		}
+
+		data->req = req =
+			fuse_request_alloc_nofs(fc, FUSE_MAX_PAGES_PER_REQ);
+		if (!req) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
+
+		check_for_blocked = 1;
+	}
+
+	req->pages[req->num_pages] = page;
+	req->num_pages++;
+
+	if (test_set_page_writeback(page))
+		BUG();
+
 	unlock_page(page);
 
+	if (wbc->sync_mode != WB_SYNC_NONE && check_for_blocked)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	return 0;
+}
+
+static int fuse_dummy_writepage(struct page *page,
+				struct writeback_control *wbc,
+				void *data)
+{
+	unlock_page(page);
+	return 0;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_fill_data data;
+	struct fuse_file *ff;
+	int err;
+
+	if (!fc->writeback_cache)
+		return generic_writepages(mapping, wbc);
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	/*
+	 * We use fuse_blocked_for_wb() instead of just fc->blocked to avoid
+	 * deadlock when we are called from fuse_invalidate_files() in case
+	 * of single-threaded fused.
+	 */
+	if (wbc->sync_mode != WB_SYNC_NONE)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	/* More than an optimization: write back pages to /dev/null; fused would
+	 * drop our FUSE_WRITE requests anyway, but it will be blocked while
+	 * sending NOTIFY_INVAL_FILES until we return!
+	 *
+	 * NB: We can't wait till fuse_send_writepages() because
+	 * fuse_writepages_fill() would possibly deadlock on
+	 * fuse_page_is_writeback().
+	 */
+	ff = fuse_write_file(fc, get_fuse_inode(inode));
+	if (ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+		err = write_cache_pages(mapping, wbc, fuse_dummy_writepage,
+					mapping);
+		fuse_release_ff(inode, ff);
+		goto out;
+	}
+	if (ff)
+		fuse_release_ff(inode, ff);
+
+	data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+				  sizeof(struct page *),
+				  GFP_NOFS);
+	if (!data.orig_pages)
+		goto out;
+
+	data.inode = inode;
+	data.req = fuse_request_alloc_nofs(fc, FUSE_MAX_PAGES_PER_REQ);
+	err = -ENOMEM;
+	if (!data.req)
+		goto out_and_free;
+
+	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+	if (data.req) {
+		if (!err && data.req->num_pages) {
+			err = fuse_send_writepages(&data);
+			if (err)
+				fuse_put_request(fc, data.req);
+		} else
+			fuse_put_request(fc, data.req);
+	}
+out_and_free:
+	kfree(data.orig_pages);
+out:
 	return err;
 }
 
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline unsigned fuse_page_length(struct page *page)
+{
+	loff_t i_size = i_size_read(page_file_mapping(page)->host);
+
+	if (i_size > 0) {
+		pgoff_t page_index = page_file_index(page);
+		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+		if (page_index < end_index)
+			return PAGE_CACHE_SIZE;
+		if (page_index == end_index)
+			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+	}
+	return 0;
+}
+
+static inline bool fuse_file_fail_immediately(struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+
+	return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+}
+
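+/*
+ * Bring the page into a state where a partial write cannot lose data:
+ * already-uptodate pages and full-page writes need nothing, a page lying
+ * entirely beyond EOF is just zeroed, otherwise the existing contents
+ * are read in and the tail beyond what was read is zeroed.
+ */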
+static int fuse_prepare_write(struct fuse_conn *fc, struct file *file,
+		struct page *page, loff_t pos, unsigned len)
+{
+	struct fuse_req *req = NULL;
+	unsigned num_read;
+	unsigned page_len;
+	int err;
+	bool killed = false;
+
+	if (fuse_file_fail_immediately(file)) {
+		unlock_page(page);
+		page_cache_release(page);
+		return -EIO;
+	}
+
+	if (PageUptodate(page) || (len == PAGE_CACHE_SIZE))
+		return 0;
+
+	page_len = fuse_page_length(page);
+	if (!page_len) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	num_read = __fuse_readpage(file, page, page_len, &err, &req, NULL,
+				   true, &killed);
+	if (req)
+		fuse_put_request(fc, req);
+	if (err) {
+		if (!killed) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	} else if (num_read != PAGE_CACHE_SIZE) {
+		zero_user_segment(page, num_read, PAGE_CACHE_SIZE);
+	}
+	return err;
+}
+
+/*
+ * It would be worthwhile to make sure that space is reserved on disk for
+ * the write, but how to implement that without killing performance needs
+ * more thought.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+
+	BUG_ON(!fc->writeback_cache);
+
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
+	if (!*pagep)
+		return -ENOMEM;
+
+	return fuse_prepare_write(fc, file, *pagep, pos, len);
+}
+
+static int fuse_commit_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	fuse_write_update_size(inode, pos);
+	set_page_dirty(page);
+	return 0;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	fuse_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
 	if (clear_page_dirty_for_io(page)) {
 		struct inode *inode = page->mapping->host;
-		err = fuse_writepage_locked(page);
-		if (!err)
-			fuse_wait_on_page_writeback(inode, page->index);
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+		};
+		struct fuse_file *ff = NULL;
+		err = fuse_writepage_locked(page, &wbc, &ff);
+		if (!err) {
+			/*
+			 * We need to check FAIL_IMMEDIATELY because otherwise
+			 * fuse_do_setattr may get stuck in invalidate_inode_pages2
+			 * forever (if fuse_invalidate_files is in progress).
+			 */
+			__fuse_wait_on_page_writeback_or_invalidate(inode,
+								    ff, page->index);
+			fuse_release_ff(inode, ff);
+		}
 	}
 	return err;
 }
@@ -1582,7 +2546,11 @@ static int fuse_launder_page(struct page *page)
  */
 static void fuse_vma_close(struct vm_area_struct *vma)
 {
-	filemap_write_and_wait(vma->vm_file->f_mapping);
+	struct file *file = vma->vm_file;
+	struct fuse_file *ff = file->private_data;
+
+	if (!ff->fc->writeback_cache)
+		filemap_write_and_wait(file->f_mapping);
 }
 
 /*
@@ -1609,6 +2577,9 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	struct inode *inode = vma->vm_file->f_mapping->host;
 
+	if (fuse_file_fail_immediately(vma->vm_file))
+		return -EIO;
+
 	fuse_wait_on_page_writeback(inode, page->index);
 	return 0;
 }
@@ -1617,25 +2588,13 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
 	.close		= fuse_vma_close,
 	.fault		= filemap_fault,
 	.page_mkwrite	= fuse_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file_inode(file);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_file *ff = file->private_data;
-		/*
-		 * file may be written through mmap, so chain it onto the
-		 * inodes's write_file list
-		 */
-		spin_lock(&fc->lock);
-		if (list_empty(&ff->write_entry))
-			list_add(&ff->write_entry, &fi->write_files);
-		spin_unlock(&fc->lock);
-	}
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		fuse_link_write_file(file);
+
 	file_accessed(file);
 	vma->vm_ops = &fuse_file_vm_ops;
 	return 0;
@@ -1939,8 +2898,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 		kaddr = kmap(page);
 
 		while (todo) {
-			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
-			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			struct iovec *iiov = (struct iovec *)ii.data;
+			char __user *uaddr = iiov->iov_base + ii.iov_offset;
+			size_t iov_len = iiov->iov_len - ii.iov_offset;
 			size_t copy = min(todo, iov_len);
 			size_t left;
 
@@ -2431,6 +3391,104 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
 	return 0;
 }
 
+static struct fuse_io_priv *fuse_io_priv_create(struct kiocb *iocb,
+		loff_t off, int rw, bool async)
+{
+	struct fuse_io_priv *io;
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return NULL;
+
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = off;
+	io->write = (rw == WRITE);
+	io->err = 0;
+	io->file = iocb->ki_filp;
+	io->async = async;
+	io->iocb = iocb;
+
+	return io;
+}
+
+static ssize_t fuse_direct_IO_bvec(int rw, struct kiocb *iocb,
+		struct bio_vec *bvec, loff_t offset, unsigned long bvec_len)
+{
+	struct fuse_io_priv *io;
+	struct fuse_req *req;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	size_t nmax = (rw == WRITE ? fc->max_write : fc->max_read);
+	size_t filled, nres;
+	loff_t pos = iocb->ki_pos;
+	int i;
+
+	if (nmax > FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT)
+		nmax = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	io = fuse_io_priv_create(iocb, pos, rw, true);
+	if (!io)
+		return -ENOMEM;
+
+	req = NULL;
+	filled = 0;
+	i = 0;
+
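+	/*
+	 * Pack consecutive bvecs into one background request until adding
+	 * the next bvec would exceed nmax, send that batch, then start a
+	 * new request for the remaining bvecs.
+	 */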
+	while (1) {
+		if (!req) {
+			req = fuse_get_req_for_background(fc, 0);
+			if (IS_ERR(req))
+				break;
+
+			if (rw == WRITE)
+				req->in.argbvec = 1;
+			else
+				req->out.argbvec = 1;
+
+			filled = 0;
+			req->bvec = bvec;
+		}
+
+		if (filled + bvec->bv_len <= nmax) {
+			filled += bvec->bv_len;
+			req->num_bvecs++;
+			bvec++;
+			i++;
+
+			if (i < bvec_len)
+				continue;
+		}
+
+		BUG_ON(!filled);
+
+		if (rw == WRITE)
+			nres = fuse_send_write(req, io, pos,
+					filled, NULL);
+		else
+			nres = fuse_send_read(req, io, pos,
+					filled, NULL);
+
+		BUG_ON(nres != filled);
+		fuse_put_request(fc, req);
+
+		if (i == bvec_len)
+			break;
+
+		pos += filled;
+		req = NULL;
+		filled = 0;
+	}
+
+	fuse_aio_complete(io, !IS_ERR(req) ? 0 : PTR_ERR(req), -1);
+	return -EIOCBQUEUED;
+}
+
 static void fuse_do_truncate(struct file *file)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -2457,7 +3515,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret = 0;
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
-	bool async_dio = ff->fc->async_dio;
+	bool async_dio = ff->fc->async_dio | ff->fc->writeback_cache;
 	loff_t pos = 0;
 	struct inode *inode;
 	loff_t i_size;
@@ -2468,37 +3526,40 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	inode = file->f_mapping->host;
 	i_size = i_size_read(inode);
 
+	if ((rw == READ) && (offset > i_size))
+		return 0;
+
 	/* optimization for short read */
 	if (async_dio && rw != WRITE && offset + count > i_size) {
+		loff_t new_count;
+
 		if (offset >= i_size)
 			return 0;
-		count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+
+		new_count = i_size - offset;
+		if (!ff->fc->writeback_cache)
+			new_count = fuse_round_up(new_count);
+
+		count = min_t(loff_t, count, new_count);
 	}
 
-	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
-	if (!io)
-		return -ENOMEM;
-	spin_lock_init(&io->lock);
-	io->reqs = 1;
-	io->bytes = -1;
-	io->size = 0;
-	io->offset = offset;
-	io->write = (rw == WRITE);
-	io->err = 0;
-	io->file = file;
 	/*
 	 * By default, we want to optimize all I/Os with async request
 	 * submission to the client filesystem if supported.
 	 */
-	io->async = async_dio;
-	io->iocb = iocb;
+	io = fuse_io_priv_create(iocb, offset, rw, async_dio);
+	if (!io)
+		return -ENOMEM;
 
 	/*
 	 * We cannot asynchronously extend the size of a file. We have no method
 	 * to wait on real async I/O requests, so we must submit this request
 	 * synchronously.
+	 * And it's useless to process small sync READs asynchronously.
 	 */
-	if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+	if ((!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) ||
+	    (rw != WRITE && is_sync_kiocb(iocb) &&
+	     count <= (FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT)))
 		io->async = false;
 
 	if (rw == WRITE)
@@ -2507,6 +3568,14 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
 
 	if (io->async) {
+		if (ret != count && printk_ratelimit()) {
+			printk(KERN_ERR "fuse_direct_IO: failed to %s %ld bytes "
+			       "(offset=%llu ret=%ld i_size=%llu ino=%lu "
+			       "fh=%llu)\n", rw == WRITE ? "write" : "read",
+			       count, offset, ret, i_size, inode->i_ino,
+			       ff->fh);
+		}
 		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
 
 		/* we have a non-extending, async request, so return */
@@ -2528,6 +3597,32 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	return ret;
 }
 
+static ssize_t fuse_direct_IO_page(int rw, struct kiocb *iocb,
+	struct page *page, loff_t offset)
+{
+	struct iovec iov;
+	mm_segment_t oldfs;
+	ssize_t ret;
+
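+	/*
+	 * Feed a single kernel page to fuse_direct_IO() by wrapping it in a
+	 * one-element iovec under KERNEL_DS.
+	 */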
+	iov.iov_base = kmap(page);
+	iov.iov_len = PAGE_SIZE;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	ret = fuse_direct_IO(rw, iocb, &iov, offset, 1);
+	if (ret != -EIOCBQUEUED && ret != PAGE_SIZE)
+		printk(KERN_ERR "fuse_direct_IO_page: io failed with err=%ld "
+		       "(rw=%s fh=0x%llx pos=%lld)\n",
+		       ret, rw == WRITE ? "WRITE" : "READ",
+		       ((struct fuse_file *)iocb->ki_filp->private_data)->fh,
+		       offset);
+
+	set_fs(oldfs);
+	kunmap(page);
+	return ret;
+}
+
 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t length)
 {
@@ -2544,14 +3639,19 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	};
 	int err;
 	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
-			   (mode & FALLOC_FL_PUNCH_HOLE);
+			   (mode & (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_ZERO_RANGE));
+
+	/* Return error if mode is not supported */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
+		return -EOPNOTSUPP;
 
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
 	if (lock_inode) {
 		mutex_lock(&inode->i_mutex);
-		if (mode & FALLOC_FL_PUNCH_HOLE) {
+		if (mode & (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_ZERO_RANGE)) {
 			loff_t endbyte = offset + length - 1;
 			err = filemap_write_and_wait_range(inode->i_mapping,
 							   offset, endbyte);
@@ -2591,7 +3691,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		fuse_write_update_size(inode, offset + length);
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
+	if (mode & (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_ZERO_RANGE))
 		truncate_pagecache_range(inode, offset, offset + length - 1);
 
 	fuse_invalidate_attr(inode);
@@ -2606,6 +3706,232 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	return err;
 }
 
+static void copy_fiemap_extent(struct fiemap_extent *le, struct page **pages,
+			       int index)
+{
+	struct page *page;
+	void *addr;
+	unsigned int linear_off = index * sizeof(struct fiemap_extent);
+
+	page = pages[linear_off / PAGE_SIZE];
+	addr = kmap_atomic(page);
+	if (((linear_off + sizeof(struct fiemap_extent) - 1) / PAGE_SIZE) == (linear_off / PAGE_SIZE)) {
+		memcpy(le, addr + (linear_off % PAGE_SIZE), sizeof(struct fiemap_extent));
+	} else {
+		int split = PAGE_SIZE - (linear_off % PAGE_SIZE);
+		memcpy(le, addr + (linear_off % PAGE_SIZE), split);
+		kunmap_atomic(addr);
+		page = pages[(linear_off / PAGE_SIZE) + 1];
+		addr = kmap_atomic(page);
+		memcpy((void *)le + split, addr, sizeof(struct fiemap_extent) - split);
+	}
+	kunmap_atomic(addr);
+}
+
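+/*
+ * One FS_IOC_FIEMAP round-trip to the daemon: ask for at most cur_max
+ * extents starting at *start_p, copy the returned extents into @dest,
+ * advance *start_p and *len_p past the last returned extent and set
+ * *last_p when the daemon reports FIEMAP_EXTENT_LAST or @dest is full.
+ * With cur_max == 0 only the number of mapped extents is collected.
+ */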
+static int fuse_request_fiemap(struct inode *inode, u32 cur_max,
+			       __u64 *start_p, __u64 *len_p, int *last_p,
+			       struct fiemap_extent_info *dest)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	struct fuse_ioctl_in inarg;
+	struct fuse_ioctl_out outarg;
+	struct fiemap ifiemap;
+	struct fiemap ofiemap;
+	int err;
+	int npages = 0;
+	int allocated = 0;
+
+	err = 0;
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		struct fuse_file *ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+		inarg.fh = ff->fh;
+	} else if (!list_empty(&fi->rw_files)) {
+		struct fuse_file *ff = list_entry(fi->rw_files.next, struct fuse_file, rw_entry);
+		inarg.fh = ff->fh;
+	} else {
+		err = -EINVAL;
+	}
+	spin_unlock(&fc->lock);
+	if (err)
+		return err;
+
+	inarg.cmd = FS_IOC_FIEMAP;
+	inarg.arg = 0;
+	inarg.flags = 0;
+
+	ifiemap.fm_start = *start_p;
+	ifiemap.fm_length = *len_p;
+	ifiemap.fm_flags = dest->fi_flags;
+	ifiemap.fm_mapped_extents = 0;
+	ifiemap.fm_extent_count = cur_max;
+	ifiemap.fm_reserved = 0;
+
+	if (cur_max)
+		npages = (cur_max*sizeof(struct fiemap_extent) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	req = fuse_get_req(fc, npages);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = get_node_id(inode);
+
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = sizeof(ifiemap);
+	req->in.args[1].value = &ifiemap;
+
+	req->out.numargs = npages ? 3 : 2;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	req->out.args[1].size = sizeof(ofiemap);
+	req->out.args[1].value = &ofiemap;
+	if (npages) {
+		req->out.args[2].size = npages*PAGE_SIZE;
+		req->out.argvar = 1;
+		req->out.argpages = 1;
+		req->num_pages = npages;
+
+		err = -ENOMEM;
+		for (allocated = 0; allocated < npages; allocated++) {
+			req->pages[allocated] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+			if (!req->pages[allocated])
+				goto out;
+			req->page_descs[allocated].length = PAGE_SIZE;
+		}
+	}
+
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	if (err)
+		goto out;
+
+	if (cur_max == 0) {
+		dest->fi_extents_mapped += ofiemap.fm_mapped_extents;
+		goto out;
+	}
+	if (ofiemap.fm_mapped_extents == 0) {
+		/* No extents means all the range is a hole */
+		*start_p += *len_p;
+		*len_p = 0;
+	} else {
+		struct fiemap_extent fe;
+		u64 next_start;
+		int i;
+
+		if (ofiemap.fm_mapped_extents > cur_max) {
+			err = -EIO;
+			goto out;
+		}
+
+		for (i = 0; i < ofiemap.fm_mapped_extents; i++) {
+			copy_fiemap_extent(&fe, req->pages, i);
+			err = fiemap_fill_next_extent(dest, fe.fe_logical,
+						      fe.fe_physical, fe.fe_length, fe.fe_flags);
+			if (err == 1) {
+				*last_p = 1;
+				err = 0;
+				goto out;
+			}
+			if (err)
+				goto out;
+		}
+
+		if (fe.fe_flags & FIEMAP_EXTENT_LAST)
+			*last_p = 1;
+		next_start = fe.fe_logical + fe.fe_length;
+		if (next_start >= *start_p + *len_p)
+			*len_p = 0;
+		else
+			*len_p = *start_p + *len_p - next_start;
+		*start_p = next_start;
+	}
+
+out:
+	while (--allocated >= 0) {
+		__free_page(req->pages[allocated]);
+		req->pages[allocated] = NULL;
+	}
+	fuse_put_request(fc, req);
+	return err;
+}
+
+int fuse_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err = 0;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	if (fc->no_fiemap)
+		return -EOPNOTSUPP;
+
+	/* It is possible to implement, but the implementation is going to be
+	 * very cumbersome. We have to get the fiemap from the userspace
+	 * daemon, and then for each hole we have to scan, page by page, the
+	 * page cache for dirty and writeback pages and the fuse queue for
+	 * "hidden" writeback pages, merging all the results. It is doable
+	 * and would give some satisfaction from completed work :-), but it
+	 * does not make any practical sense. Current coreutils use
+	 * FIEMAP_FLAG_SYNC and apparently are not going to fix this,
+	 * switching to SEEK_DATA instead. So, until the first user appears...
+	 *
+	 * Also, we could force FIEMAP_FLAG_SYNC... But for now I think it is
+	 * better to fail, in order to catch possible users.
+	 */
+	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) {
+		if (printk_ratelimit())
+			printk(KERN_DEBUG "fuse fiemap w/o sync %s[%u]\n",
+			       current->comm, current->pid);
+		return -EOPNOTSUPP;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	fuse_sync_writes(inode);
+
+	if (fieinfo->fi_extents_max == 0) {
+		err = fuse_request_fiemap(inode, 0, &start, &len, NULL, fieinfo);
+		goto out;
+	}
+
+	for (;;) {
+		int last = 0;
+		unsigned int npages;
+		u32 cur_max = fieinfo->fi_extents_max - fieinfo->fi_extents_mapped;
+
+		if (cur_max == 0)
+			break;
+
+		npages = (cur_max*sizeof(struct fiemap_extent) + PAGE_SIZE - 1) / PAGE_SIZE;
+		if (npages > FUSE_MAX_PAGES_PER_REQ) {
+			npages = FUSE_MAX_PAGES_PER_REQ;
+			cur_max = (npages * PAGE_SIZE) / sizeof(struct fiemap_extent);
+		}
+
+		err = fuse_request_fiemap(inode, cur_max, &start, &len, &last, fieinfo);
+		if (err < 0)
+			goto out;
+
+		if (len == 0 || last)
+			break;
+	}
+
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	if (err == -ENOSYS || err == -ENOIOCTLCMD || err == -ENOTTY) {
+		fc->no_fiemap = 1;
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -2624,6 +3950,8 @@ static const struct file_operations fuse_file_operations = {
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
 	.fallocate	= fuse_file_fallocate,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -2647,11 +3975,16 @@ static const struct file_operations fuse_direct_io_file_operations = {
 static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.writepage	= fuse_writepage,
+	.writepages	= fuse_writepages,
 	.launder_page	= fuse_launder_page,
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
+	.direct_IO_bvec	= fuse_direct_IO_bvec,
+	.direct_IO_page	= fuse_direct_IO_page,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
 };
 
 void fuse_init_file_inode(struct inode *inode)
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -33,7 +33,7 @@
 #define FUSE_NAME_MAX 1024
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 10
 
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
@@ -44,6 +44,15 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/* Enable direct access */
+#define FUSE_ODIRECT             (1 << 2)
+
+/* Enable synchronous umount */
+#define FUSE_UMOUNT_WAIT	(1 << 3)
+
+/* Disable synchronous close */
+#define FUSE_DISABLE_CLOSE_WAIT	(1 << 4)
+
 /** Number of page pointers embedded in fuse_req */
 #define FUSE_REQ_INLINE_PAGES 1
 
@@ -94,6 +103,9 @@ struct fuse_inode {
 	/** Files usable in writepage.  Protected by fc->lock */
 	struct list_head write_files;
 
+	/** List of all opened files.  Protected by fc->lock */
+	struct list_head rw_files;
+
 	/** Writepages pending on truncate or fsync */
 	struct list_head queued_writes;
 
@@ -105,10 +117,16 @@ struct fuse_inode {
 	wait_queue_head_t page_waitq;
 
 	/** List of writepage requestst (pending or sent) */
-	struct list_head writepages;
+	struct rb_root writepages;
 
 	/** Miscellaneous bits describing inode state */
 	unsigned long state;
+
+	/** Mostly to detect the very first open */
+	atomic_t num_openers;
+
+	/** Even though num_openers>0, trust server i_size */
+	int i_size_unstable;
 };
 
 /** FUSE inode state bits */
@@ -117,6 +135,8 @@ enum {
 	FUSE_I_ADVISE_RDPLUS,
 	/** An operation changing file size is in progress  */
 	FUSE_I_SIZE_UNSTABLE,
+	/** i_mtime has been updated locally; a flush to userspace is needed */
+	FUSE_I_MTIME_UPDATED,
 };
 
 struct fuse_conn;
@@ -147,14 +167,28 @@ struct fuse_file {
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
 
+	/** Entry on inode's rw_files list */
+	struct list_head rw_entry;
+
 	/** RB node to be linked on fuse_conn->polled_files */
 	struct rb_node polled_node;
 
 	/** Wait queue head for poll */
 	wait_queue_head_t poll_wait;
 
+	struct list_head fl;
+	struct dentry *ff_dentry;
+
 	/** Has flock been performed on this file? */
 	bool flock:1;
+
+	unsigned long ff_state;
+};
+
+/** FUSE file states (ff_state) */
+enum {
+	/** Any fops on given ff should fail immediately */
+	FUSE_S_FAIL_IMMEDIATELY,
 };
 
 /** One input argument of a request */
@@ -170,6 +204,8 @@ struct fuse_in {
 
 	/** True if the data for the last argument is in req->pages */
 	unsigned argpages:1;
+	/** True if the data for the last argument is in req->bvec */
+	unsigned argbvec:1;
 
 	/** Number of arguments */
 	unsigned numargs;
@@ -200,6 +236,8 @@ struct fuse_out {
 
 	/** Last argument is a list of pages to copy data to */
 	unsigned argpages:1;
+	/** Last argument is a list of bvecs to copy data to */
+	unsigned argbvec:1;
 
 	/** Zero partially or not copied pages */
 	unsigned page_zeroing:1;
@@ -220,16 +258,6 @@ struct fuse_page_desc {
 	unsigned int offset;
 };
 
-/** The request state */
-enum fuse_req_state {
-	FUSE_REQ_INIT = 0,
-	FUSE_REQ_PENDING,
-	FUSE_REQ_READING,
-	FUSE_REQ_SENT,
-	FUSE_REQ_WRITING,
-	FUSE_REQ_FINISHED
-};
-
 /** The request IO state (for asynchronous processing) */
 struct fuse_io_priv {
 	int async;
@@ -242,10 +270,44 @@ struct fuse_io_priv {
 	int err;
 	struct kiocb *iocb;
 	struct file *file;
+	struct list_head list;
+};
+
+/**
+ * Request flags
+ *
+ * FR_ISREPLY:		set if the request has reply
+ * FR_FORCE:		force sending of the request even if interrupted
+ * FR_BACKGROUND:	request is sent in the background
+ * FR_WAITING:		request is counted as "waiting"
+ * FR_ABORTED:		the request was aborted
+ * FR_INTERRUPTED:	the request has been interrupted
+ * FR_LOCKED:		data is being copied to/from the request
+ * FR_PENDING:		request is not yet in userspace
+ * FR_SENT:		request is in userspace, waiting for an answer
+ * FR_FINISHED:		request is finished
+ * FR_PRIVATE:		request is on private list
+ */
+enum fuse_req_flag {
+	FR_ISREPLY,
+	FR_FORCE,
+	FR_BACKGROUND,
+	FR_WAITING,
+	FR_ABORTED,
+	FR_INTERRUPTED,
+	FR_LOCKED,
+	FR_PENDING,
+	FR_SENT,
+	FR_FINISHED,
+	FR_PRIVATE,
 };
 
 /**
  * A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ *   - FR_ABORTED
+ *   - FR_LOCKED (may also be modified under fc->lock, tested under both)
  */
 struct fuse_req {
 	/** This can be on either pending processing or io lists in
@@ -261,35 +323,17 @@ struct fuse_req {
 	/** Unique ID for the interrupt request */
 	u64 intr_unique;
 
-	/*
-	 * The following bitfields are either set once before the
-	 * request is queued or setting/clearing them is protected by
-	 * fuse_conn->lock
-	 */
-
-	/** True if the request has reply */
-	unsigned isreply:1;
+	/* Request flags, updated with test/set/clear_bit() */
+	unsigned long flags;
 
-	/** Force sending of the request even if interrupted */
-	unsigned force:1;
+	/** Request contains pages from page-cache */
+	unsigned page_cache:1;
 
-	/** The request was aborted */
-	unsigned aborted:1;
+	/** Request pages need page_cache_release() */
+	unsigned page_needs_release:1;
 
-	/** Request is sent in the background */
-	unsigned background:1;
-
-	/** The request has been interrupted */
-	unsigned interrupted:1;
-
-	/** Data is being copied to/from the request */
-	unsigned locked:1;
-
-	/** Request is counted as "waiting" */
-	unsigned waiting:1;
-
-	/** State of the request */
-	enum fuse_req_state state;
+	/** Request was killed -- pages were released */
+	unsigned killed:1;
 
 	/** The request input */
 	struct fuse_in in;
@@ -326,6 +370,7 @@ struct fuse_req {
 
 	/** page vector */
 	struct page **pages;
+	struct bio_vec *bvec;
 
 	/** page-descriptor vector */
 	struct fuse_page_desc *page_descs;
@@ -339,8 +384,11 @@ struct fuse_req {
 	/** inline page-descriptor vector */
 	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
 
-	/** number of pages in vector */
-	unsigned num_pages;
+	/** number of pages/bvecs in vector */
+	union {
+		unsigned num_pages;
+		unsigned num_bvecs;
+	};
 
 	/** File used in the request (or NULL) */
 	struct fuse_file *ff;
@@ -352,13 +400,77 @@ struct fuse_req {
 	struct fuse_io_priv *io;
 
 	/** Link on fi->writepages */
-	struct list_head writepages_entry;
+	struct rb_node writepages_entry;
 
 	/** Request completion callback */
 	void (*end)(struct fuse_conn *, struct fuse_req *);
 
 	/** Request is stolen from fuse_file->reserved_req */
 	struct file *stolen_file;
+
+	/** Request will be handled by a fud pointing to this fiq */
+	struct fuse_iqueue *fiq;
+};
+
+struct fuse_iqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** # of fuds pointing to this fiq */
+	int handled_by_fud;
+
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** Pending interrupts */
+	struct list_head interrupts;
+
+	/** Queue of pending forgets */
+	struct fuse_forget_link forget_list_head;
+	struct fuse_forget_link *forget_list_tail;
+
+	/** Batching of FORGET requests (positive indicates FORGET batch) */
+	int forget_batch;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
+};
+
+struct fuse_pqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** The list of requests under I/O */
+	struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+	/** Fuse connection for this device */
+	struct fuse_conn *fc;
+
+	/** Input queue */
+	struct fuse_iqueue *fiq;
+
+	/** Processing queue */
+	struct fuse_pqueue pq;
+
+	/** list entry on fc->devices */
+	struct list_head entry;
 };
 
 /**
@@ -380,6 +492,9 @@ struct fuse_conn {
 
 	struct rcu_head rcu;
 
+	/** Number of fuse_dev's */
+	atomic_t dev_count;
+
 	/** The user id for this mount */
 	kuid_t user_id;
 
@@ -395,17 +510,11 @@ struct fuse_conn {
 	/** Maximum write size */
 	unsigned max_write;
 
-	/** Readers of the connection are waiting on this */
-	wait_queue_head_t waitq;
-
-	/** The list of pending requests */
-	struct list_head pending;
-
-	/** The list of requests being processed */
-	struct list_head processing;
+	/** Main input queue */
+	struct fuse_iqueue main_iq;
 
-	/** The list of requests under I/O */
-	struct list_head io;
+	/** Per-cpu input queues */
+	struct fuse_iqueue __percpu *iqs;
 
 	/** The next unique kernel file handle */
 	u64 khctr;
@@ -428,16 +537,6 @@ struct fuse_conn {
 	/** The list of background requests set aside for later queuing */
 	struct list_head bg_queue;
 
-	/** Pending interrupts */
-	struct list_head interrupts;
-
-	/** Queue of pending forgets */
-	struct fuse_forget_link forget_list_head;
-	struct fuse_forget_link *forget_list_tail;
-
-	/** Batching of FORGET requests (positive indicates FORGET batch) */
-	int forget_batch;
-
 	/** Flag indicating that INIT reply has been received. Allocating
 	 * any fuse request will be suspended until the flag is set */
 	int initialized;
@@ -453,9 +552,6 @@ struct fuse_conn {
 	/** waitq for reserved requests */
 	wait_queue_head_t reserved_req_waitq;
 
-	/** The next unique request id */
-	u64 reqctr;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -480,6 +576,9 @@ struct fuse_conn {
 	/** Set if bdi is valid */
 	unsigned bdi_initialized:1;
 
+	/** write-back cache policy (default is write-through) */
+	unsigned writeback_cache:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -542,6 +641,9 @@ struct fuse_conn {
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
+	/** Wait for response from daemon on close */
+	unsigned close_wait:1;
+
 	/** Does the filesystem support readdirplus? */
 	unsigned do_readdirplus:1;
 
@@ -554,6 +656,12 @@ struct fuse_conn {
 	/** Is lseek not implemented by fs? */
 	unsigned no_lseek:1;
 
+	/** Handle wrong FUSE_NOTIFY_INVAL_FILES from old fused */
+	unsigned compat_inval_files:1;
+
+	/** No ioctl(FIEMAP) */
+	unsigned no_fiemap:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -575,9 +683,6 @@ struct fuse_conn {
 	/** number of dentries used in the above array */
 	int ctl_ndents;
 
-	/** O_ASYNC requests */
-	struct fasync_struct *fasync;
-
 	/** Key for lock owner ID scrambling */
 	u32 scramble_key[4];
 
@@ -595,6 +700,11 @@ struct fuse_conn {
 
 	/** Read/write semaphore to hold when accessing sb. */
 	struct rw_semaphore killsb;
+
+	struct list_head conn_files;
+
+	/** List of device instances belonging to this connection */
+	struct list_head devices;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -632,7 +742,7 @@ int fuse_inode_eq(struct inode *inode, void *_nodeidp);
  */
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version);
+			u64 attr_valid, u64 attr_version, int creat);
 
 int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 		     struct fuse_entry_out *outarg, struct inode **inode);
@@ -728,9 +838,9 @@ void fuse_ctl_cleanup(void);
 /**
  * Allocate a request
  */
-struct fuse_req *fuse_request_alloc(unsigned npages);
+struct fuse_req *fuse_request_alloc(struct fuse_conn *fc, unsigned npages);
 
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+struct fuse_req *fuse_request_alloc_nofs(struct fuse_conn *fc, unsigned npages);
 
 /**
  * Free a request
@@ -776,6 +886,12 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
  */
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 
+/**
+ * Send a request (synchronous) if not FUSE_S_FAIL_IMMEDIATELY
+ */
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff);
+
 /**
  * Send a request in the background
  */
@@ -799,18 +915,19 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
  */
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
 
-void fuse_conn_kill(struct fuse_conn *fc);
-
 /**
  * Initialize fuse_conn
  */
-void fuse_conn_init(struct fuse_conn *fc);
+int fuse_conn_init(struct fuse_conn *fc);
 
 /**
  * Release reference to fuse_conn
  */
 void fuse_conn_put(struct fuse_conn *fc);
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+void fuse_dev_free(struct fuse_dev *fud);
+
 /**
  * Add connection to control filesystem
  */
@@ -836,6 +953,8 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed);
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size);
+
 void fuse_flush_writepages(struct inode *inode);
 
 void fuse_set_nowrite(struct inode *inode);
@@ -862,11 +981,28 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 			     u64 child_nodeid, struct qstr *name);
 
+/**
+ * File-system tells the kernel to invalidate all fuse-files (and cache)
+ * for the given node id.
+ */
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -874,9 +1010,16 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_mtime(struct file *file, bool nofail);
 
 int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		    struct file *file);
 
+void fuse_set_initialized(struct fuse_conn *fc);
+
+int fuse_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len);
+
 #endif /* _FS_FUSE_I_H */
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/ve.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -29,6 +30,8 @@ static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
+static int fuse_ve_odirect;
+
 static int set_global_limit(const char *val, struct kernel_param *kp);
 
 unsigned max_user_bgreq;
@@ -66,6 +69,7 @@ struct fuse_mount_data {
 	unsigned rootmode_present:1;
 	unsigned user_id_present:1;
 	unsigned group_id_present:1;
+	unsigned writeback_cache:1;
 	unsigned flags;
 	unsigned max_read;
 	unsigned blksize;
@@ -93,9 +97,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->writectr = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
+	fi->i_size_unstable = 0;
 	INIT_LIST_HEAD(&fi->write_files);
+	INIT_LIST_HEAD(&fi->rw_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
-	INIT_LIST_HEAD(&fi->writepages);
+	fi->writepages = RB_ROOT;
 	init_waitqueue_head(&fi->page_waitq);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget) {
@@ -116,6 +122,7 @@ static void fuse_destroy_inode(struct inode *inode)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	BUG_ON(!list_empty(&fi->write_files));
+	BUG_ON(!list_empty(&fi->rw_files));
 	BUG_ON(!list_empty(&fi->queued_writes));
 	kfree(fi->forget);
 	call_rcu(&inode->i_rcu, fuse_i_callback);
@@ -170,8 +177,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
-	inode->i_mtime.tv_sec   = attr->mtime;
-	inode->i_mtime.tv_nsec  = attr->mtimensec;
+	/* mtime from server may be stale due to local buffered write */
+	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+		inode->i_mtime.tv_sec   = attr->mtime;
+		inode->i_mtime.tv_nsec  = attr->mtimensec;
+	}
 	inode->i_ctime.tv_sec   = attr->ctime;
 	inode->i_ctime.tv_nsec  = attr->ctimensec;
 
@@ -197,6 +207,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	struct timespec old_mtime;
 
@@ -211,10 +222,17 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
-	i_size_write(inode, attr->size);
+	/*
+	 * When writeback_cache is enabled, cached writes beyond EOF extend
+	 * the local i_size without keeping the userspace server in sync, so
+	 * attr->size coming from the server can be stale. We cannot trust it.
+	 */
+	if (!is_wb || !S_ISREG(inode->i_mode) ||
+	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
+		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode)) {
+	if (!is_wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
@@ -239,10 +257,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	}
 }
 
-static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
+			    int num_openers)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	atomic_set(&fi->num_openers, num_openers);
+
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
+	inode->i_mtime.tv_sec  = attr->mtime;
+	inode->i_mtime.tv_nsec = attr->mtimensec;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -277,7 +301,7 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp)
 
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version)
+			u64 attr_valid, u64 attr_version, int creat)
 {
 	struct inode *inode;
 	struct fuse_inode *fi;
@@ -289,10 +313,13 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 		return NULL;
 
 	if ((inode->i_state & I_NEW)) {
-		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_flags |= S_NOATIME;
+		if (!fc->writeback_cache)
+			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
-		fuse_init_inode(inode, attr);
+		fuse_init_inode(inode, attr,
+				fc->writeback_cache ? creat : 0);
 		unlock_new_inode(inode);
 	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
 		/* Inode has changed type, any I/O on the old should fail */
@@ -335,6 +362,90 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 	return 0;
 }
 
+static void fuse_kill_requests(struct fuse_conn *fc, struct inode *inode,
+			       struct list_head *req_list)
+{
+	struct fuse_req *req;
+
+	list_for_each_entry(req, req_list, list)
+		if (req->inode == inode && req->page_cache && !req->killed) {
+			int i;
+
+			BUG_ON(req->in.h.opcode != FUSE_READ);
+			req->killed = 1;
+
+			for (i = 0; i < req->num_pages; i++) {
+				struct page *page = req->pages[i];
+				SetPageError(page);
+				unlock_page(page);
+				if (req->page_needs_release)
+					page_cache_release(page);
+				req->pages[i] = NULL;
+			}
+
+			req->num_pages = 0;
+		}
+}
+
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
+{
+	struct super_block *sb = fc->sb;
+	struct inode *inode;
+	struct fuse_inode *fi;
+	struct fuse_file *ff;
+	int err;
+
+	if (!fc->async_read) {
+		printk(KERN_ERR "Turn async_read ON to use "
+				"FUSE_NOTIFY_INVAL_FILES!\n");
+		return -EOPNOTSUPP;
+	}
+
+	inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		return -ENOENT;
+
+	fi = get_fuse_inode(inode);
+	spin_lock(&fc->lock);
+	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
+		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+	}
+	spin_unlock(&fc->lock);
+
+	/* let them see FUSE_S_FAIL_IMMEDIATELY */
+	wake_up_all(&fc->blocked_waitq);
+
+	/* see how fuse_writepages_fill() waits for fuse writeback */
+	wake_up(&fi->page_waitq);
+
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
+		struct fuse_dev *fud;
+		spin_lock(&fc->lock);
+		list_for_each_entry(fud, &fc->devices, entry) {
+			struct fuse_pqueue *fpq = &fud->pq;
+			struct fuse_iqueue *fiq = fud->fiq;
+			spin_lock(&fpq->lock);
+			fuse_kill_requests(fc, inode, &fpq->processing);
+			fuse_kill_requests(fc, inode, &fiq->pending);
+			fuse_kill_requests(fc, inode, &fpq->io);
+			spin_unlock(&fpq->lock);
+		}
+		fuse_kill_requests(fc, inode, &fc->main_iq.pending);
+		fuse_kill_requests(fc, inode, &fc->bg_queue);
+		wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
+		spin_unlock(&fc->lock);
+
+		err = invalidate_inode_pages2(inode->i_mapping);
+	}
+
+	if (!err)
+		fuse_invalidate_attr(inode);
+
+	iput(inode);
+	return err;
+}
+
 static void fuse_umount_begin(struct super_block *sb)
 {
 	fuse_abort_conn(get_fuse_conn_super(sb));
@@ -346,8 +457,8 @@ static void fuse_send_destroy(struct fuse_conn *fc)
 	if (req && fc->conn_init) {
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
-		req->force = 1;
-		req->background = 0;
+		__set_bit(FR_FORCE, &req->flags);
+		__clear_bit(FR_BACKGROUND, &req->flags);
 		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
@@ -359,28 +470,13 @@ static void fuse_bdi_destroy(struct fuse_conn *fc)
 		bdi_destroy(&fc->bdi);
 }
 
-void fuse_conn_kill(struct fuse_conn *fc)
-{
-	spin_lock(&fc->lock);
-	fc->connected = 0;
-	fc->blocked = 0;
-	fc->initialized = 1;
-	spin_unlock(&fc->lock);
-	/* Flush all readers on this fs */
-	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
-	wake_up_all(&fc->waitq);
-	wake_up_all(&fc->blocked_waitq);
-	wake_up_all(&fc->reserved_req_waitq);
-}
-EXPORT_SYMBOL_GPL(fuse_conn_kill);
-
 static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	fuse_send_destroy(fc);
 
-	fuse_conn_kill(fc);
+	fuse_abort_conn(fc);
 	mutex_lock(&fuse_mutex);
 	list_del(&fc->entry);
 	fuse_ctl_remove_conn(fc);
@@ -446,6 +542,10 @@ enum {
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+	OPT_WBCACHE,
+	OPT_ODIRECT,
+	OPT_UMOUNT_WAIT,
+	OPT_DISABLE_CLOSE_WAIT,
 	OPT_ERR
 };
 
@@ -458,6 +558,10 @@ static const match_table_t tokens = {
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_BLKSIZE,			"blksize=%u"},
+	{OPT_WBCACHE,			"writeback_enable"},
+	{OPT_ODIRECT,			"direct_enable"},
+	{OPT_UMOUNT_WAIT,		"umount_wait"},
+	{OPT_DISABLE_CLOSE_WAIT,	"disable_close_wait"},
 	{OPT_ERR,			NULL}
 };
 
@@ -531,6 +635,28 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 			d->blksize = value;
 			break;
 
+		case OPT_WBCACHE:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->writeback_cache = 1;
+			break;
+
+		case OPT_ODIRECT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_ODIRECT;
+			break;
+
+		case OPT_UMOUNT_WAIT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_UMOUNT_WAIT;
+			break;
+
+		case OPT_DISABLE_CLOSE_WAIT:
+			d->flags |= FUSE_DISABLE_CLOSE_WAIT;
+			break;
+
 		default:
 			return 0;
 		}
@@ -554,40 +680,72 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		seq_puts(m, ",allow_other");
+	if (fc->flags & FUSE_ODIRECT)
+		seq_puts(m, ",direct_enable");
+	if (fc->flags & FUSE_UMOUNT_WAIT)
+		seq_puts(m, ",umount_wait");
+	if (fc->flags & FUSE_DISABLE_CLOSE_WAIT)
+		seq_puts(m, ",disable_close_wait");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
 		seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+	if (fc->writeback_cache)
+		seq_puts(m, ",writeback_enable");
 	return 0;
 }
 
-void fuse_conn_init(struct fuse_conn *fc)
+static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+{
+	memset(fiq, 0, sizeof(struct fuse_iqueue));
+	init_waitqueue_head(&fiq->waitq);
+	INIT_LIST_HEAD(&fiq->pending);
+	INIT_LIST_HEAD(&fiq->interrupts);
+	fiq->forget_list_tail = &fiq->forget_list_head;
+	fiq->connected = 1;
+}
+
+static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+{
+	memset(fpq, 0, sizeof(struct fuse_pqueue));
+	spin_lock_init(&fpq->lock);
+	INIT_LIST_HEAD(&fpq->processing);
+	INIT_LIST_HEAD(&fpq->io);
+	fpq->connected = 1;
+}
+
+int fuse_conn_init(struct fuse_conn *fc)
 {
+	int cpu;
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
 	mutex_init(&fc->inst_mutex);
 	init_rwsem(&fc->killsb);
 	atomic_set(&fc->count, 1);
-	init_waitqueue_head(&fc->waitq);
+	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
 	init_waitqueue_head(&fc->reserved_req_waitq);
-	INIT_LIST_HEAD(&fc->pending);
-	INIT_LIST_HEAD(&fc->processing);
-	INIT_LIST_HEAD(&fc->io);
-	INIT_LIST_HEAD(&fc->interrupts);
+	fuse_iqueue_init(&fc->main_iq);
+	fc->iqs = alloc_percpu(struct fuse_iqueue);
+	if (!fc->iqs)
+		return -ENOMEM;
+	for_each_online_cpu(cpu)
+		fuse_iqueue_init(per_cpu_ptr(fc->iqs, cpu));
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
-	fc->forget_list_tail = &fc->forget_list_head;
+	INIT_LIST_HEAD(&fc->conn_files);
+	INIT_LIST_HEAD(&fc->devices);
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
-	fc->reqctr = 0;
 	fc->blocked = 0;
 	fc->initialized = 0;
+	fc->connected = 1;
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
@@ -617,7 +775,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 	attr.mode = mode;
 	attr.ino = FUSE_ROOT_ID;
 	attr.nlink = 1;
-	return fuse_iget(sb, 1, 0, &attr, 0, 0);
+	return fuse_iget(sb, 1, 0, &attr, 0, 0, 0);
 }
 
 struct fuse_inode_handle {
@@ -887,7 +1045,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fc->initialized = 1;
+	fuse_set_initialized(fc);
 	wake_up_all(&fc->blocked_waitq);
 }
 
@@ -920,6 +1078,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
 static void fuse_free_conn(struct fuse_conn *fc)
 {
+	WARN_ON(!list_empty(&fc->devices));
+	free_percpu(fc->iqs);
 	kfree_rcu(fc, rcu);
 }
 
@@ -949,10 +1109,10 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 		return err;
 
 	/*
-	 * For a single fuse filesystem use max 1% of dirty +
+	 * For a single fuse filesystem use max 20% of dirty +
 	 * writeback threshold.
 	 *
-	 * This gives about 1M of write buffer for memory maps on a
+	 * This gives about 20M of write buffer for memory maps on a
 	 * machine with 1G and 10% dirty_ratio, which should be more
 	 * than enough.
 	 *
@@ -960,13 +1120,57 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	 *
 	 *    /sys/class/bdi/<bdi>/max_ratio
 	 */
-	bdi_set_max_ratio(&fc->bdi, 1);
+	bdi_set_max_ratio(&fc->bdi, 20);
+
+	/*
+	 * These values take precedence over max_ratio
+	 */
+	bdi_set_max_dirty(&fc->bdi, (512 * 1024 * 1024) / PAGE_SIZE);
+	bdi_set_min_dirty(&fc->bdi, (256 * 1024 * 1024) / PAGE_SIZE);
 
 	return 0;
 }
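A worked version of the arithmetic above (illustrative only, assuming 4K pages; not part of the patch): 1G of RAM with a 10% dirty_ratio gives roughly a 100M global dirty+writeback threshold, and 20% of that is the ~20M per-filesystem write buffer the comment mentions.  The bdi_set_max_dirty()/bdi_set_min_dirty() calls pass absolute page counts corresponding to 512M and 256M (131072 and 65536 pages with 4K pages), which, per the comment, take precedence over the percentage where the ratio would give a different figure.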
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+
+	fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
+	if (fud) {
+		fud->fc = fuse_conn_get(fc);
+		fud->fiq = &fc->main_iq;
+		fuse_pqueue_init(&fud->pq);
+
+		spin_lock(&fc->lock);
+		fud->fiq->handled_by_fud++;
+		list_add_tail(&fud->entry, &fc->devices);
+		spin_unlock(&fc->lock);
+	}
+
+	return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_free(struct fuse_dev *fud)
+{
+	struct fuse_conn *fc = fud->fc;
+
+	if (fc) {
+		spin_lock(&fc->lock);
+		fud->fiq->handled_by_fud--;
+		BUG_ON(fud->fiq->handled_by_fud < 0);
+		list_del(&fud->entry);
+		spin_unlock(&fc->lock);
+
+		fuse_conn_put(fc);
+	}
+	kfree(fud);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_free);
+
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
+	struct fuse_dev *fud;
 	struct fuse_conn *fc;
 	struct inode *root;
 	struct fuse_mount_data d;
@@ -1007,7 +1211,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err;
 
 	if ((file->f_op != &fuse_dev_operations) ||
-	    (file->f_cred->user_ns != &init_user_ns))
+	    ((file->f_cred->user_ns != &init_user_ns) &&
+	     (file->f_cred->user_ns != ve_init_user_ns())))
 		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
@@ -1015,13 +1220,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!fc)
 		goto err_fput;
 
-	fuse_conn_init(fc);
+	err = fuse_conn_init(fc);
+	if (err) {
+		kfree(fc);
+		goto err_fput;
+	}
+	fc->release = fuse_free_conn;
+
+	fud = fuse_dev_alloc(fc);
+	if (!fud)
+		goto err_put_conn;
 
 	fc->dev = sb->s_dev;
 	fc->sb = sb;
 	err = fuse_bdi_init(fc, sb);
 	if (err)
-		goto err_put_conn;
+		goto err_dev_free;
 
 	sb->s_bdi = &fc->bdi;
 
@@ -1030,11 +1244,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		fc->dont_mask = 1;
 	sb->s_flags |= MS_POSIXACL;
 
-	fc->release = fuse_free_conn;
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
 	fc->max_read = max_t(unsigned, 4096, d.max_read);
+	fc->writeback_cache = d.writeback_cache;
 
 	/* Used by get_root_inode() */
 	sb->s_fs_info = fc;
@@ -1043,17 +1257,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	root = fuse_get_root_inode(sb, d.rootmode);
 	root_dentry = d_make_root(root);
 	if (!root_dentry)
-		goto err_put_conn;
+		goto err_dev_free;
 	/* only now - we want root dentry with NULL ->d_op */
 	sb->s_d_op = &fuse_dentry_operations;
 
-	init_req = fuse_request_alloc(0);
+	init_req = fuse_request_alloc(fc, 0);
 	if (!init_req)
 		goto err_put_root;
-	init_req->background = 1;
+	__set_bit(FR_BACKGROUND, &init_req->flags);
 
-	if (is_bdev) {
-		fc->destroy_req = fuse_request_alloc(0);
+	if (is_bdev || (fc->flags & FUSE_UMOUNT_WAIT)) {
+		fc->destroy_req = fuse_request_alloc(fc, 0);
 		if (!fc->destroy_req)
 			goto err_free_init_req;
 	}
@@ -1069,8 +1283,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
-	fc->connected = 1;
-	file->private_data = fuse_conn_get(fc);
+	file->private_data = fud;
 	mutex_unlock(&fuse_mutex);
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
@@ -1089,6 +1302,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
+ err_dev_free:
+	fuse_dev_free(fud);
  err_put_conn:
 	fuse_bdi_destroy(fc);
 	fuse_conn_put(fc);
@@ -1102,7 +1317,25 @@ static struct dentry *fuse_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
 		       void *raw_data)
 {
-	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	struct dentry *dentry;
+
+	dentry = mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+
+	/* Hack to distinguish the pcs fuse service and to force synchronous close
+	 * for it.  This seems to be the only place where we have a variable
+	 * (dev_name) that is not confined by the fuse API and is already defined.
+	 */
+	if (!IS_ERR(dentry) && dev_name &&
+			(strncmp(dev_name, "pstorage://", 11) == 0 ||
+				strncmp(dev_name, "vstorage://", 11) == 0) ) {
+		struct fuse_conn *fc = dentry->d_sb->s_fs_info;
+
+		if (!(fc->flags & FUSE_DISABLE_CLOSE_WAIT))
+			fc->close_wait = 1;
+
+		fc->compat_inval_files = 1;
+	}
+	return dentry;
 }
 
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1121,7 +1354,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
-	.fs_flags	= FS_HAS_SUBTYPE,
+	.fs_flags	= FS_HAS_SUBTYPE | FS_VIRTUALIZED,
 	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
@@ -1189,8 +1422,8 @@ static int __init fuse_fs_init(void)
 	int err;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
-					      sizeof(struct fuse_inode),
-					      0, SLAB_HWCACHE_ALIGN,
+					      sizeof(struct fuse_inode), 0,
+					      SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					      fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
@@ -1260,6 +1493,24 @@ static void fuse_sysfs_cleanup(void)
 	kobject_put(fuse_kobj);
 }
 
+static ctl_table fuse_table[] = {
+	{
+		.procname	= "fuse-ve-odirect",
+		.data		= &fuse_ve_odirect,
+		.maxlen		= sizeof(fuse_ve_odirect),
+		.mode		= 0600,
+		.proc_handler	= &proc_dointvec,
+	},
+	{}
+};
+
+static struct ctl_path fuse_path[] = {
+	{ .procname = "fs", },
+	{},
+};
+
+static struct ctl_table_header * fuse_sysctl_header;
+
 static int __init fuse_init(void)
 {
 	int res;
@@ -1287,6 +1538,8 @@ static int __init fuse_init(void)
 	sanitize_global_limit(&max_user_bgreq);
 	sanitize_global_limit(&max_user_congthresh);
 
+	fuse_sysctl_header = register_sysctl_paths(fuse_path, fuse_table);
+
 	return 0;
 
  err_sysfs_cleanup:
@@ -1307,6 +1560,7 @@ static void __exit fuse_exit(void)
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
 	fuse_dev_cleanup();
+	unregister_sysctl_table(fuse_sysctl_header);
 }
 
 module_init(fuse_init);
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct gfs2_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = gfs2_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 /**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	int sync_state = inode->i_state & I_DIRTY;
+	int sync_state = inode->i_state & I_DIRTY_ALL;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret = 0, ret1 = 0;
 
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	if (!gfs2_is_jdata(ip))
 		sync_state &= ~I_DIRTY_PAGES;
 	if (datasync)
-		sync_state &= ~I_DIRTY_SYNC;
+		sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
 
 	if (sync_state) {
 		ret = sync_inode_metadata(inode, 1);
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1382,21 +1382,22 @@ __acquires(&lru_lock)
  * gfs2_dispose_glock_lru() above.
  */
 
-static void gfs2_scan_glock_lru(int nr)
+static long gfs2_scan_glock_lru(int nr)
 {
 	struct gfs2_glock *gl;
 	LIST_HEAD(skipped);
 	LIST_HEAD(dispose);
+	long freed = 0;
 
 	spin_lock(&lru_lock);
-	while(nr && !list_empty(&lru_list)) {
+	while ((nr-- >= 0) && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 
 		/* Test for being demotable */
 		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
-			nr--;
+			freed++;
 			continue;
 		}
 
@@ -1406,23 +1407,28 @@ static void gfs2_scan_glock_lru(int nr)
 	if (!list_empty(&dispose))
 		gfs2_dispose_glock_lru(&dispose);
 	spin_unlock(&lru_lock);
+
+	return freed;
 }
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
-				    struct shrink_control *sc)
+static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
+					    struct shrink_control *sc)
 {
-	if (sc->nr_to_scan) {
-		if (!(sc->gfp_mask & __GFP_FS))
-			return -1;
-		gfs2_scan_glock_lru(sc->nr_to_scan);
-	}
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+	return gfs2_scan_glock_lru(sc->nr_to_scan);
+}
 
-	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
+static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
+					     struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&lru_count));
 }
 
 static struct shrinker glock_shrinker = {
-	.shrink = gfs2_shrink_glock_memory,
 	.seeks = DEFAULT_SEEKS,
+	.count_objects = gfs2_glock_shrink_count,
+	.scan_objects = gfs2_glock_shrink_scan,
 };
 
 /**
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -112,7 +112,8 @@ static int __init init_gfs2_fs(void)
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
-					          SLAB_MEM_SPREAD,
+						  SLAB_MEM_SPREAD|
+						  SLAB_ACCOUNT,
 					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
 		goto fail;
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -142,7 +142,8 @@ static void gfs2_qd_dispose(struct list_head *list)
 	}
 }
 
-static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -152,35 +153,41 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
 
 	if (qd->qd_lockref.count == 0) {
 		lockref_mark_dead(&qd->qd_lockref);
-		list_move(&qd->qd_lru, dispose);
+		list_lru_isolate_move(lru, &qd->qd_lru, dispose);
 	}
 
 	spin_unlock(&qd->qd_lockref.lock);
 	return LRU_REMOVED;
 }
 
-static int gfs2_shrink_qd_memory(struct shrinker *shrink,
-				 struct shrink_control *sc)
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
-
-	if (sc->nr_to_scan == 0)
-		goto out;
+	unsigned long freed;
 
 	if (!(sc->gfp_mask & __GFP_FS))
-		return -1;
+		return SHRINK_STOP;
 
-	list_lru_walk(&gfs2_qd_lru, gfs2_qd_isolate, &dispose, sc->nr_to_scan);
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);
 
 	gfs2_qd_dispose(&dispose);
 
-out:
-	return (list_lru_count(&gfs2_qd_lru) * sysctl_vfs_cache_pressure) / 100;
+	return freed;
+}
+
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }
 
 struct shrinker gfs2_qd_shrinker = {
-	.shrink = gfs2_shrink_qd_memory,
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
 };
 
 static u64 qd2index(struct gfs2_quota_data *qd)
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -481,8 +481,8 @@ static int __init init_hfs_fs(void)
 	int err;
 
 	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
-		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
-		hfs_init_once);
+		sizeof(struct hfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
 	if (!hfs_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfs_fs_type);
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -656,7 +656,7 @@ static int __init init_hfsplus_fs(void)
 	int err;
 
 	hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
-		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 		hfsplus_init_once);
 	if (!hfsplus_inode_cachep)
 		return -ENOMEM;
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -228,7 +228,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 {
 	struct hostfs_inode_info *hi;
 
-	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
 	if (hi == NULL)
 		return NULL;
 	hi->fd = -1;
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -207,7 +207,7 @@ static int init_inodecache(void)
 	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
 					     sizeof(struct hpfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (hpfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1377,7 +1377,7 @@ static int __init init_hugetlbfs_fs(void)
 	error = -ENOMEM;
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
-					0, 0, init_once);
+					0, SLAB_ACCOUNT, init_once);
 	if (hugetlbfs_inode_cachep == NULL)
 		goto out2;
 
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,9 @@
 #include <linux/prefetch.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <linux/vzstat.h>
+#include <trace/events/writeback.h>
 #include "internal.h"
 
 /*
@@ -24,12 +27,12 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode->i_sb->s_inode_lru_lock protects:
+ * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
- *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
  * inode_hash_lock protects:
  *   inode_hashtable, inode->i_hash
  *
@@ -37,7 +40,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode->i_sb->s_inode_lru_lock
+ *     Inode LRU list locks
  *
  * bdi->wb.list_lock
  *   inode->i_lock
@@ -56,6 +59,7 @@ static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+EXPORT_SYMBOL_GPL(inode_sb_list_lock);
 
 /*
  * Empty aops. Can be used for the cases where the user does not
@@ -65,38 +69,42 @@ const struct address_space_operations empty_aops = {
 };
 EXPORT_SYMBOL(empty_aops);
 
+const struct inode_operations empty_iops = {
+};
+EXPORT_SYMBOL(empty_iops);
+
 /*
  * Statistics gathering..
  */
 struct inodes_stat_t inodes_stat;
 
-static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static DEFINE_PER_CPU(unsigned int, nr_unused);
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static int get_nr_inodes(void)
+static long get_nr_inodes(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_inodes, i);
 	return sum < 0 ? 0 : sum;
 }
 
-static inline int get_nr_inodes_unused(void)
+static inline long get_nr_inodes_unused(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_unused, i);
 	return sum < 0 ? 0 : sum;
 }
 
-int get_nr_dirty_inodes(void)
+long get_nr_dirty_inodes(void)
 {
 	/* not actually dirty inodes, but a wild approximation */
-	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
 }
 
@@ -109,7 +117,7 @@ int proc_nr_inodes(ctl_table *table, int write,
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
 	inodes_stat.nr_unused = get_nr_inodes_unused();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -174,6 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->private_data = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
+	mapping->dirtied_ub = NULL;
 
 	/*
 	 * If the block_device provides a backing_dev_info for client
@@ -237,6 +246,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	BUG_ON(inode->i_data.dirtied_ub);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	if (!inode->i_nlink) {
@@ -357,7 +367,7 @@ void address_space_init_once(struct address_space *mapping)
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	INIT_LIST_HEAD(&mapping->i_peer_list);
 }
 EXPORT_SYMBOL(address_space_init_once);
 
@@ -395,6 +405,7 @@ void __iget(struct inode *inode)
 {
 	atomic_inc(&inode->i_count);
 }
+EXPORT_SYMBOL(__iget);
 
 /*
  * get additional reference to inode; caller must already hold one.
@@ -407,13 +418,8 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (list_empty(&inode->i_lru)) {
-		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
-		inode->i_sb->s_nr_inodes_unused++;
+	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_inc(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /*
@@ -423,7 +429,8 @@ static void inode_lru_list_add(struct inode *inode)
  */
 void inode_add_lru(struct inode *inode)
 {
-	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+				I_FREEING | I_WILL_FREE)) &&
 	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
 		inode_lru_list_add(inode);
 }
@@ -431,13 +438,9 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (!list_empty(&inode->i_lru)) {
-		list_del_init(&inode->i_lru);
-		inode->i_sb->s_nr_inodes_unused--;
+
+	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -658,7 +661,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
-		if (inode->i_state & I_DIRTY && !kill_dirty) {
+		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
 			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
@@ -681,24 +684,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	return busy;
 }
 
-static int can_unuse(struct inode *inode)
-{
-	if (inode->i_state & ~I_REFERENCED)
-		return 0;
-	if (inode_has_buffers(inode))
-		return 0;
-	if (atomic_read(&inode->i_count))
-		return 0;
-	if (inode->i_data.nrpages)
-		return 0;
-	return 1;
-}
-
 /*
- * Walk the superblock inode LRU for freeable inodes and attempt to free them.
- * This is called from the superblock shrinker function with a number of inodes
- * to trim from the LRU. Inodes to be freed are moved to a temporary list and
- * then are freed outside inode_lock by dispose_list().
+ * Isolate the inode from the LRU in preparation for freeing it.
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -712,89 +699,83 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-void prune_icache_sb(struct super_block *sb, int nr_to_scan)
+static enum lru_status inode_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
-	LIST_HEAD(freeable);
-	int nr_scanned;
-	unsigned long reap = 0;
+	struct list_head *freeable = arg;
+	struct inode	*inode = container_of(item, struct inode, i_lru);
 
-	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
-		struct inode *inode;
+	/*
+	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&inode->i_lock))
+		return LRU_SKIP;
 
-		if (list_empty(&sb->s_inode_lru))
-			break;
+	/*
+	 * Referenced or dirty inodes are still in use. Give them another pass
+	 * through the LRU as we cannot reclaim them now.
+	 */
+	if (atomic_read(&inode->i_count) ||
+	    (inode->i_state & ~I_REFERENCED)) {
+		list_lru_isolate(lru, &inode->i_lru);
+		spin_unlock(&inode->i_lock);
+		this_cpu_dec(nr_unused);
+		return LRU_REMOVED;
+	}
 
-		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
+		inode->i_state &= ~I_REFERENCED;
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
 
-		/*
-		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
-		 * so use a trylock. If we fail to get the lock, just move the
-		 * inode to the back of the list so we don't spin on it.
-		 */
-		if (!spin_trylock(&inode->i_lock)) {
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			continue;
+	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(lru_lock);
+		if (remove_inode_buffers(inode)) {
+			unsigned long reap;
+			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
+			if (current_is_kswapd())
+				__count_vm_events(KSWAPD_INODESTEAL, reap);
+			else
+				__count_vm_events(PGINODESTEAL, reap);
+			if (current->reclaim_state)
+				current->reclaim_state->reclaimed_slab += reap;
 		}
+		iput(inode);
+		spin_lock(lru_lock);
+		return LRU_RETRY;
+	}
 
-		/*
-		 * Referenced or dirty inodes are still in use. Give them
-		 * another pass through the LRU as we canot reclaim them now.
-		 */
-		if (atomic_read(&inode->i_count) ||
-		    (inode->i_state & ~I_REFERENCED)) {
-			list_del_init(&inode->i_lru);
-			spin_unlock(&inode->i_lock);
-			sb->s_nr_inodes_unused--;
-			this_cpu_dec(nr_unused);
-			continue;
-		}
+	WARN_ON(inode->i_state & I_NEW);
+	inode->i_state |= I_FREEING;
+	list_lru_isolate_move(lru, &inode->i_lru, freeable);
+	spin_unlock(&inode->i_lock);
 
-		/* recently referenced inodes get one more pass */
-		if (inode->i_state & I_REFERENCED) {
-			inode->i_state &= ~I_REFERENCED;
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
-			__iget(inode);
-			spin_unlock(&inode->i_lock);
-			spin_unlock(&sb->s_inode_lru_lock);
-			if (remove_inode_buffers(inode))
-				reap += invalidate_mapping_pages(&inode->i_data,
-								0, -1);
-			iput(inode);
-			spin_lock(&sb->s_inode_lru_lock);
-
-			if (inode != list_entry(sb->s_inode_lru.next,
-						struct inode, i_lru))
-				continue;	/* wrong inode or list_empty */
-			/* avoid lock inversions with trylock */
-			if (!spin_trylock(&inode->i_lock))
-				continue;
-			if (!can_unuse(inode)) {
-				spin_unlock(&inode->i_lock);
-				continue;
-			}
-		}
-		WARN_ON(inode->i_state & I_NEW);
-		inode->i_state |= I_FREEING;
-		spin_unlock(&inode->i_lock);
+	this_cpu_dec(nr_unused);
+	return LRU_REMOVED;
+}
 
-		list_move(&inode->i_lru, &freeable);
-		sb->s_nr_inodes_unused--;
-		this_cpu_dec(nr_unused);
-	}
-	if (current_is_kswapd())
-		__count_vm_events(KSWAPD_INODESTEAL, reap);
-	else
-		__count_vm_events(PGINODESTEAL, reap);
-	spin_unlock(&sb->s_inode_lru_lock);
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += reap;
+/*
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
+ */
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
+{
+	LIST_HEAD(freeable);
+	long freed;
 
+	KSTAT_PERF_ENTER(shrink_icache);
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
 	dispose_list(&freeable);
+	KSTAT_PERF_LEAVE(shrink_icache);
+	return freed;
 }
 
 static void __wait_on_freeing_inode(struct inode *inode);
@@ -1328,6 +1309,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 }
 EXPORT_SYMBOL(ilookup);
 
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @match:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped.  The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1.  It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is an even more generalized version of ilookup5() when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller cannot increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction.  The tradeoff is that the @match function must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+				unsigned long hashval,
+				int (*match)(struct inode *, unsigned long,
+					     void *),
+				void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode, *ret_inode = NULL;
+	int mval;
+
+	spin_lock(&inode_hash_lock);
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_sb != sb)
+			continue;
+		mval = match(inode, hashval, data);
+		if (mval == 0)
+			continue;
+		if (mval == 1)
+			ret_inode = inode;
+		goto out;
+	}
+out:
+	spin_unlock(&inode_hash_lock);
+	return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
+
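As an illustration of the @match contract documented above, a minimal sketch of such a callback might look like the following (example_match and the inode-number lookup are hypothetical, not part of this patch): it takes i_lock, skips inodes that are being freed or initialized, and grabs a reference with __iget() before returning 1, all without sleeping.

static int example_match(struct inode *inode, unsigned long hashval,
			 void *data)
{
	unsigned long ino = *(unsigned long *)data;
	int ret = 0;

	spin_lock(&inode->i_lock);
	/* Skip inodes that are being freed or not yet initialized. */
	if (inode->i_ino == ino &&
	    !(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))) {
		__iget(inode);	/* take a reference before returning 1 */
		ret = 1;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}

A caller using such a callback, e.g. inode = find_inode_nowait(sb, ino, example_match, &ino), must drop the reference with iput() when done; a caller that cannot afford a possible eviction in iput() would instead copy what it needs out of the inode under i_lock and return -1 to stop the search without taking a reference.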
 int insert_inode_locked(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -1478,11 +1509,20 @@ static void iput_final(struct inode *inode)
  */
 void iput(struct inode *inode)
 {
-	if (inode) {
-		BUG_ON(inode->i_state & I_CLEAR);
-
-		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
-			iput_final(inode);
+	if (!inode)
+		return;
+	BUG_ON(inode->i_state & I_CLEAR);
+retry:
+	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+			atomic_inc(&inode->i_count);
+			inode->i_state &= ~I_DIRTY_TIME;
+			spin_unlock(&inode->i_lock);
+			trace_writeback_lazytime_iput(inode);
+			mark_inode_dirty_sync(inode);
+			goto retry;
+		}
+		iput_final(inode);
 	}
 }
 EXPORT_SYMBOL(iput);
@@ -1507,6 +1547,8 @@ sector_t bmap(struct inode *inode, sector_t block)
 }
 EXPORT_SYMBOL(bmap);
 
+unsigned __read_mostly relatime_interval = 24*60*60; /* one day */
+
 /*
  * Update times in overlayed inode from underlying real inode
  */
@@ -1547,10 +1589,10 @@ static int relatime_need_update(const struct path *path, struct inode *inode,
 		return 1;
 
 	/*
-	 * Is the previous atime value older than a day? If yes,
-	 * update atime:
+	 * Is the previous atime value older than the update interval?
+	 * If yes, update atime:
 	 */
-	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= relatime_interval)
 		return 1;
 	/*
 	 * Good, we can skip the atime update:
@@ -1558,14 +1600,9 @@ static int relatime_need_update(const struct path *path, struct inode *inode,
 	return 0;
 }
 
-/*
- * This does the actual work of updating an inodes time or version.  Must have
- * had called mnt_want_write() before calling this.
- */
-static int update_time(struct inode *inode, struct timespec *time, int flags)
+int generic_update_time(struct inode *inode, struct timespec *time, int flags)
 {
-	if (inode->i_op->update_time)
-		return inode->i_op->update_time(inode, time, flags);
+	int iflags = I_DIRTY_TIME;
 
 	if (flags & S_ATIME)
 		inode->i_atime = *time;
@@ -1575,9 +1612,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 		inode->i_ctime = *time;
 	if (flags & S_MTIME)
 		inode->i_mtime = *time;
-	mark_inode_dirty_sync(inode);
+
+	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+		iflags |= I_DIRTY_SYNC;
+	__mark_inode_dirty(inode, iflags);
 	return 0;
 }
+EXPORT_SYMBOL(generic_update_time);
+
+/*
+ * This does the actual work of updating an inodes time or version.  Must have
+ * had called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	int (*update_time)(struct inode *, struct timespec *, int);
+
+	update_time = inode->i_op->update_time ? inode->i_op->update_time :
+		generic_update_time;
+
+	return update_time(inode, time, flags);
+}
 
 /**
  *	touch_atime	-	update the access time
@@ -1861,7 +1916,7 @@ void __init inode_init(void)
 					 sizeof(struct inode),
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
+					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					 init_once);
 
 	/* Hash may have been set up in inode_init_early */
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;
+struct shrink_control;
 
 /*
  * block_dev.c
@@ -50,8 +51,6 @@ extern void __init chrdev_init(void);
  * namei.c
  */
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
-extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
-			   const char *, unsigned int, struct path *);
 
 /*
  * namespace.c
@@ -66,6 +65,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
+extern struct lglock vfsmount_lock;
+
 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
 extern void __mnt_drop_write(struct vfsmount *);
@@ -85,7 +86,7 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
-extern bool grab_super_passive(struct super_block *sb);
+extern bool trylock_super(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
 			       int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
@@ -112,6 +113,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);
 
 extern bool __atime_needs_update(const struct path *, struct inode *, bool);
@@ -128,7 +130,7 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *);
  */
 extern void inode_wb_list_del(struct inode *inode);
 
-extern int get_nr_dirty_inodes(void);
+extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
 
@@ -137,6 +139,7 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 
 /*
  * read_write.c
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -203,8 +203,12 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
 		return -EFAULT;
 
-	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
-		filemap_write_and_wait(inode->i_mapping);
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) {
+		error = filemap_write_and_wait_range(inode->i_mapping,
+				fiemap.fm_start, fiemap.fm_start + len - 1);
+		if (error)
+			return error;
+	}
 
 	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
 	fiemap.fm_flags = fieinfo.fi_flags;
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -28,6 +28,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
+#include <bc/beancounter.h>
 
 int set_task_ioprio(struct task_struct *task, int ioprio)
 {
@@ -68,6 +69,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 	kuid_t uid;
 	int ret;
 
+	if (!ve_is_super(get_exec_env())) {
+		if (which == IOPRIO_WHO_UBC)
+			return -EPERM;
+
+		switch (class) {
+			case IOPRIO_CLASS_RT:
+				if (!ve_capable(CAP_SYS_ADMIN))
+					return -EPERM;
+				class = IOPRIO_CLASS_BE;
+				data = 0;
+				break;
+			case IOPRIO_CLASS_IDLE:
+				class = IOPRIO_CLASS_BE;
+				data = IOPRIO_BE_NR - 1;
+				break;
+		}
+		ioprio = IOPRIO_PRIO_VALUE(class, data);
+	}
+
 	switch (class) {
 		case IOPRIO_CLASS_RT:
 			if (!capable(CAP_SYS_ADMIN))
@@ -88,6 +108,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 			return -EINVAL;
 	}
 
+	if (which == IOPRIO_WHO_UBC) {
+		if (class != IOPRIO_CLASS_BE)
+			return -ERANGE;
+		return ub_set_ioprio(who, data);
+	}
+
 	ret = -ESRCH;
 	rcu_read_lock();
 	switch (which) {
@@ -123,6 +149,10 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
+#ifdef CONFIG_VE
+				if (p->task_ve != get_exec_env())
+					continue;
+#endif
 				if (!uid_eq(task_uid(p), uid))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
@@ -149,8 +179,10 @@ static int get_task_ioprio(struct task_struct *p)
 	if (ret)
 		goto out;
 	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+	task_lock(p);
 	if (p->io_context)
 		ret = p->io_context->ioprio;
+	task_unlock(p);
 out:
 	return ret;
 }
@@ -220,6 +252,10 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
+#ifdef CONFIG_VE
+				if (p->task_ve != get_exec_env())
+					continue;
+#endif
 				if (!uid_eq(task_uid(p), user->uid))
 					continue;
 				tmpio = get_task_ioprio(p);
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -98,7 +98,7 @@ static int init_inodecache(void)
 	isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
 					sizeof(struct iso_inode_info),
 					0, (SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					init_once);
 	if (isofs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
 #include <linux/backing-dev.h>
+#include <linux/virtinfo.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -98,6 +99,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 		goto out;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -716,10 +716,9 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 				!tid_gt(tid, journal->j_commit_sequence));
 		read_lock(&journal->j_state_lock);
 	}
-	read_unlock(&journal->j_state_lock);
-
 	if (unlikely(is_journal_aborted(journal)))
 		err = -EIO;
+	read_unlock(&journal->j_state_lock);
 	return err;
 }
 
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -36,6 +36,9 @@ struct recovery_info
 	int		nr_replays;
 	int		nr_revokes;
 	int		nr_revoke_hits;
+
+	unsigned int		last_log_block;
+	struct buffer_head	*last_commit_bh;
 };
 
 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -233,6 +236,71 @@ do {									\
 		var -= ((journal)->j_last - (journal)->j_first);	\
 } while (0)
 
+/*
+ * The 'Raid amnesia' effect protection: https://jira.sw.ru/browse/PSBM-15484
+ *
+ * Some block devices can return different data on read requests from the same
+ * block after a power failure (for example, a mirrored raid is out of sync
+ * and resync is in progress).  In that case the following situation is
+ * possible:
+ *
+ * A power failure happens after the commit block for transaction 'D' was
+ * issued; on the next boot the first disk will have the commit block, but
+ * the second one will not.
+ * mirror1: journal={Ac-Bc-Cc-Dc }
+ * mirror2: journal={Ac-Bc-Cc-D  }
+ * Now let's assume that we read from mirror1 and find that 'D' has a valid
+ * commit block, so journal_replay will replay that transaction, but a second
+ * power failure may happen before journal_reset(), so the next
+ * journal_replay() may read from mirror2 and find that 'C' is the last valid
+ * transaction.  This results in corruption because we have already replayed
+ * transaction 'D'.
+ * In order to avoid such ambiguity we should perform a 'stabilize write':
+ * 1) Read and rewrite the latest commit block.
+ * 2) Invalidate the next block in order to guarantee that the journal head
+ *    becomes stable.
+ * Yes, I know the 'stabilize write' approach is ugly, but this is the only
+ * way to run a filesystem on block devices with the 'raid amnesia' effect.
+ */
+static int stabilize_journal_head(journal_t *journal, struct recovery_info *info)
+{
+	struct buffer_head *bh[2] = {NULL, NULL};
+	int err, err2, i;
+
+	if (!info->last_commit_bh)
+		return 0;
+
+	bh[0] = info->last_commit_bh;
+	info->last_commit_bh = NULL;
+
+	err = jread(&bh[1], journal, info->last_log_block);
+	if (err)
+		goto out;
+
+	for (i = 0; i < 2; i++) {
+		lock_buffer(bh[i]);
+		/* Explicitly invalidate block beyond last commit block */
+		if (i == 1)
+			memset(bh[i]->b_data, 0, journal->j_blocksize);
+
+		BUFFER_TRACE(bh[i], "marking dirty");
+		set_buffer_uptodate(bh[i]);
+		mark_buffer_dirty(bh[i]);
+		BUFFER_TRACE(bh[i], "marking uptodate");
+		unlock_buffer(bh[i]);
+	}
+	err = sync_blockdev(journal->j_dev);
+	/* Make sure data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+		if (!err)
+			err = err2;
+	}
+out:
+	brelse(bh[0]);
+	brelse(bh[1]);
+	return err;
+}
+
 /**
  * jbd2_journal_recover - recovers a on-disk journal
  * @journal: the journal to recover
@@ -269,6 +337,8 @@ int jbd2_journal_recover(journal_t *journal)
 	}
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
+	if (!err)
+		err = stabilize_journal_head(journal, &info);
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REVOKE);
 	if (!err)
@@ -319,6 +389,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 	memset (&info, 0, sizeof(info));
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
+	brelse(info.last_commit_bh);
 
 	if (err) {
 		printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
@@ -422,6 +493,7 @@ static int do_one_pass(journal_t *journal,
 {
 	unsigned int		first_commit_ID, next_commit_ID;
 	unsigned long		next_log_block;
+	unsigned long		last_commit_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -442,6 +514,7 @@ static int do_one_pass(journal_t *journal,
 	sb = journal->j_superblock;
 	next_commit_ID = be32_to_cpu(sb->s_sequence);
 	next_log_block = be32_to_cpu(sb->s_start);
+	last_commit_block = 0;
 
 	first_commit_ID = next_commit_ID;
 	if (pass == PASS_SCAN)
@@ -758,7 +831,9 @@ static int do_one_pass(journal_t *journal,
 					break;
 				}
 			}
-			brelse(bh);
+			brelse(info->last_commit_bh);
+			info->last_commit_bh = bh;
+			info->last_log_block = next_log_block;
 			next_commit_ID++;
 			continue;
 
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/virtinfo.h>
 
 #include <trace/events/jbd2.h>
 
@@ -294,6 +295,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 		return -ENOSPC;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kmem_cache_zalloc(transaction_cache,
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -243,9 +243,10 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	case ACL_TYPE_ACCESS:
 		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			rc = posix_acl_equiv_mode(acl, &mode);
-			if (rc < 0)
+			umode_t mode;
+
+			rc = posix_acl_update_mode(inode, &mode, &acl);
+			if (rc)
 				return rc;
 			if (inode->i_mode != mode) {
 				struct iattr attr;
@@ -257,8 +258,6 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 				if (rc < 0)
 					return rc;
 			}
-			if (rc == 0)
-				acl = NULL;
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -386,7 +386,7 @@ static int __init init_jffs2_fs(void)
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     jffs2_i_init_once);
 	if (!jffs2_inode_cachep) {
 		pr_err("error: Failed to initialise inode cache\n");
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -38,7 +38,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		return rc;
 
 	mutex_lock(&inode->i_mutex);
-	if (!(inode->i_state & I_DIRTY) ||
+	if (!(inode->i_state & I_DIRTY_ALL) ||
 	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -878,7 +878,7 @@ static int __init init_jfs_fs(void)
 
 	jfs_inode_cachep =
 	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
-			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			    init_once);
 	if (jfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -962,7 +962,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
 
 	mutex_lock(&inode->i_mutex);
 	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		goto out;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,6 +57,9 @@ static struct task_struct	*nlmsvc_task;
 static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
+atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
+
 int lockd_net_id;
 
 /*
@@ -289,7 +292,8 @@ static int lockd_inetaddr_event(struct notifier_block *this,
 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
 	struct sockaddr_in sin;
 
-	if (event != NETDEV_DOWN)
+	if ((event != NETDEV_DOWN) ||
+	    !atomic_inc_not_zero(&nlm_ntf_refcnt))
 		goto out;
 
 	if (nlmsvc_rqst) {
@@ -300,6 +304,8 @@ static int lockd_inetaddr_event(struct notifier_block *this,
 		svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
 			(struct sockaddr *)&sin);
 	}
+	atomic_dec(&nlm_ntf_refcnt);
+	wake_up(&nlm_ntf_wq);
 
 out:
 	return NOTIFY_DONE;
@@ -316,7 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
 	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
 	struct sockaddr_in6 sin6;
 
-	if (event != NETDEV_DOWN)
+	if ((event != NETDEV_DOWN) ||
+	    !atomic_inc_not_zero(&nlm_ntf_refcnt))
 		goto out;
 
 	if (nlmsvc_rqst) {
@@ -326,6 +333,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
 		svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
 			(struct sockaddr *)&sin6);
 	}
+	atomic_dec(&nlm_ntf_refcnt);
+	wake_up(&nlm_ntf_wq);
 
 out:
 	return NOTIFY_DONE;
@@ -342,10 +351,12 @@ static void lockd_unregister_notifiers(void)
 #if IS_ENABLED(CONFIG_IPV6)
 	unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
 #endif
+	wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0);
 }
 
 static void lockd_svc_exit_thread(void)
 {
+	atomic_dec(&nlm_ntf_refcnt);
 	lockd_unregister_notifiers();
 	svc_exit_thread(nlmsvc_rqst);
 }
@@ -369,6 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 		goto out_rqst;
 	}
 
+	atomic_inc(&nlm_ntf_refcnt);
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,9 @@
 
 #include <asm/uaccess.h>
 
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
@@ -216,10 +219,26 @@ static void locks_init_lock_heads(struct file_lock *fl)
 }
 
 /* Allocate an empty lock structure. */
-struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(int charge)
 {
-	struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
+	struct file_lock *fl;
+
+	fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
+#ifdef CONFIG_BEANCOUNTERS
+	if (fl == NULL)
+		goto out;
+	fl->fl_ub = get_beancounter(get_exec_ub());
+	fl->fl_charged = 0;
+	if (!charge)
+		goto out;
+	if (!ub_flock_charge(fl, 1))
+		goto out;
 
+	put_beancounter(fl->fl_ub);
+	kmem_cache_free(filelock_cache, fl);
+	fl = NULL;
+out:
+#endif
 	if (fl)
 		locks_init_lock_heads(fl);
 
@@ -253,7 +272,11 @@ void locks_free_lock(struct file_lock *fl)
 	BUG_ON(!list_empty(&fl->fl_block));
 	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
+	ub_flock_uncharge(fl);
 	locks_release_private(fl);
+#ifdef CONFIG_BEANCOUNTERS
+	put_beancounter(fl->fl_ub);
+#endif
 	kmem_cache_free(filelock_cache, fl);
 }
 EXPORT_SYMBOL(locks_free_lock);
@@ -357,7 +380,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
 	if (type < 0)
 		return type;
 	
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(type != F_UNLCK);
 	if (fl == NULL)
 		return -ENOMEM;
 
@@ -518,7 +541,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
 /* Allocate a file_lock initialised to this type of lease */
 static struct file_lock *lease_alloc(struct file *filp, long type)
 {
-	struct file_lock *fl = locks_alloc_lock();
+	struct file_lock *fl = locks_alloc_lock(1);
 	int error = -ENOMEM;
 
 	if (fl == NULL)
@@ -878,7 +901,12 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 	LIST_HEAD(dispose);
 
 	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
-		new_fl = locks_alloc_lock();
+		/*
+		 * A non-F_UNLCK request must already be charged in
+		 * flock_make_lock().  Strictly speaking it is new_fl that must
+		 * be charged, not the request, but we try to fail earlier.
+		 */
+		new_fl = locks_alloc_lock(0);
 		if (!new_fl)
 			return -ENOMEM;
 	}
@@ -908,16 +936,6 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 		goto out;
 	}
 
-	/*
-	 * If a higher-priority process was blocked on the old file lock,
-	 * give it the opportunity to lock the file.
-	 */
-	if (found) {
-		spin_unlock(&inode->i_lock);
-		cond_resched();
-		spin_lock(&inode->i_lock);
-	}
-
 find_conflict:
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
@@ -936,6 +954,10 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 	}
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
+
+	set_flock_charged(new_fl);
+	unset_flock_charged(request);
+
 	locks_copy_lock(new_fl, request);
 	locks_insert_lock(before, new_fl);
 	new_fl = NULL;
@@ -970,8 +992,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	if (!(request->fl_flags & FL_ACCESS) &&
 	    (request->fl_type != F_UNLCK ||
 	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
-		new_fl = locks_alloc_lock();
-		new_fl2 = locks_alloc_lock();
+		if (request->fl_type != F_UNLCK)
+			new_fl = locks_alloc_lock(1);
+		else
+			new_fl = NULL;
+		new_fl2 = locks_alloc_lock(0);
 	}
 
 	spin_lock(&inode->i_lock);
@@ -1124,7 +1149,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
-	if (right && left == right && !new_fl2)
+	if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2))
 		goto out;
 
 	error = 0;
@@ -1135,23 +1160,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			goto out;
 		}
 
-		if (!new_fl) {
-			error = -ENOLCK;
+		error = -ENOLCK;
+		if (!new_fl)
+			goto out;
+		if (right && (left == right) && ub_flock_charge(new_fl, 1))
 			goto out;
-		}
 		locks_copy_lock(new_fl, request);
 		locks_insert_lock(before, new_fl);
 		new_fl = NULL;
+		error = 0;
 	}
 	if (right) {
 		if (left == right) {
 			/* The new lock breaks the old one in two pieces,
 			 * so we have to use the second new lock.
 			 */
+			error = -ENOLCK;
+			if (added && ub_flock_charge(new_fl2,
+						request->fl_type != F_UNLCK))
+				goto out;
+			/* FIXME move all fl_charged manipulations in ub code */
+			set_flock_charged(new_fl2);
 			left = new_fl2;
 			new_fl2 = NULL;
 			locks_copy_lock(left, right);
 			locks_insert_lock(before, left);
+			error = 0;
 		}
 		right->fl_start = request->fl_end + 1;
 		locks_wake_up_blocks(right);
@@ -1588,8 +1622,9 @@ static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct dentry *dentry = filp->f_original_path.mnt ?
+		filp->f_original_path.dentry : filp->f_path.dentry;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
 
@@ -1719,7 +1754,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 	struct inode *inode = locks_inode(filp);
 	int error;
 
-	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
+	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !ve_capable(CAP_LEASE))
 		return -EACCES;
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
@@ -2119,7 +2154,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock flock;
 	struct inode *inode;
 	struct file *f;
@@ -2238,7 +2273,7 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock64 __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock64 flock;
 	struct inode *inode;
 	struct file *f;
@@ -2557,6 +2592,7 @@ void show_fd_locks(struct seq_file *f,
 		 * matches ->fl_file.
 		 */
 		if (fl->fl_owner != files &&
+		    fl->fl_owner != (fl_owner_t)filp &&
 		    fl->fl_owner != NULL)
 			continue;
 
@@ -2613,7 +2649,7 @@ static const struct file_operations proc_locks_operations = {
 
 static int __init proc_locks_init(void)
 {
-	proc_create("locks", 0, NULL, &proc_locks_operations);
+	proc_create("locks", S_ISVTX, NULL, &proc_locks_operations);
 	return 0;
 }
 module_init(proc_locks_init);
@@ -2704,7 +2740,7 @@ static int __init filelock_init(void)
 	int i;
 
 	filelock_cache = kmem_cache_create("file_lock_cache",
-			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+			sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
 
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -408,7 +408,8 @@ const struct super_operations logfs_super_operations = {
 int logfs_init_inode_cache(void)
 {
 	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
-			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			sizeof(struct logfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			logfs_init_once);
 	if (!logfs_inode_cache)
 		return -ENOMEM;
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
-/*
- * What the mbcache registers as to get shrunk dynamically.
- */
-
-static int mb_cache_shrink_fn(struct shrinker *shrink,
-			      struct shrink_control *sc);
-
-static struct shrinker mb_cache_shrinker = {
-	.shrink = mb_cache_shrink_fn,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 {
@@ -151,7 +139,7 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
 
 
 /*
- * mb_cache_shrink_fn()  memory pressure callback
+ * mb_cache_shrink_scan()  memory pressure callback
  *
  * This function is called by the kernel memory management when memory
  * gets low.
@@ -159,17 +147,16 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
  * @shrink: (ignored)
  * @sc: shrink_control passed from reclaim
  *
- * Returns the number of objects which are present in the cache.
+ * Returns the number of objects freed.
  */
-static int
-mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	LIST_HEAD(free_list);
-	struct mb_cache *cache;
 	struct mb_cache_entry *entry, *tmp;
-	int count = 0;
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
+	unsigned long freed = 0;
 
 	mb_debug("trying to free %d entries", nr_to_scan);
 	spin_lock(&mb_cache_spinlock);
@@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 				   struct mb_cache_entry, e_lru_list);
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
+		freed++;
+	}
+	spin_unlock(&mb_cache_spinlock);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
+	return freed;
+}
+
+static unsigned long
+mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct mb_cache *cache;
+	unsigned long count = 0;
+
+	spin_lock(&mb_cache_spinlock);
 	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
 		mb_debug("cache %s (%d)", cache->c_name,
 			  atomic_read(&cache->c_entry_count));
 		count += atomic_read(&cache->c_entry_count);
 	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
-		__mb_cache_entry_forget(entry, gfp_mask);
-	}
-	return (count / 100) * sysctl_vfs_cache_pressure;
+
+	return vfs_pressure_ratio(count);
 }
 
+static struct shrinker mb_cache_shrinker = {
+	.count_objects = mb_cache_shrink_count,
+	.scan_objects = mb_cache_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+};
 
 /*
  * mb_cache_create()  create a new cache
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int init_inodecache(void)
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,8 +8,8 @@ struct mnt_namespace {
 	unsigned int		proc_inum;
 	struct mount *	root;
 	struct list_head	list;
-	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
+	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
@@ -20,13 +20,6 @@ struct mnt_pcp {
 	int mnt_writers;
 };
 
-struct mountpoint {
-	struct hlist_node m_hash;
-	struct dentry *m_dentry;
-	struct hlist_head m_list;
-	int m_count;
-};
-
 struct mount {
 	struct hlist_node mnt_hash;
 	struct mount *mnt_parent;
@@ -55,6 +48,9 @@ struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+#ifdef CONFIG_VE
+	struct ve_struct *ve_owner;	/* VE in which this mount was created */
+#endif /* CONFIG_VE */
 #ifdef CONFIG_FSNOTIFY
 	struct hlist_head mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
@@ -85,6 +81,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
 }
 
+extern struct rw_semaphore namespace_sem;
+
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/fs_struct.h>
 #include <linux/posix_acl.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -859,7 +860,7 @@ static int may_linkat(struct path *link)
 	 * otherwise, it must be a safe source.
 	 */
 	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
-	    capable(CAP_FOWNER))
+	    ve_capable(CAP_FOWNER))
 		return 0;
 
 	audit_log_link_denied("linkat", link);
@@ -2543,7 +2544,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 		return -EPERM;
 
 	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+	    IS_IMMUTABLE(inode) ||
+	    (IS_SWAPFILE(inode) && inode->i_nlink == 1))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -3461,7 +3463,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !ve_capable(CAP_MKNOD))
 		return -EPERM;
 
 	if (!dir->i_op->mknod)
@@ -4028,7 +4030,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	 * handlink using the passed filedescriptor.
 	 */
 	if (flags & AT_EMPTY_PATH) {
-		if (!capable(CAP_DAC_READ_SEARCH))
+		if (!ve_capable(CAP_DAC_READ_SEARCH))
 			return -ENOENT;
 		how = LOOKUP_EMPTY;
 	}
@@ -4290,7 +4292,7 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	    (flags & RENAME_EXCHANGE))
 		return -EINVAL;
 
-	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+	if ((flags & RENAME_WHITEOUT) && !ve_capable(CAP_MKNOD))
 		return -EPERM;
 
 retry:
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -26,6 +26,7 @@
 #include <linux/kernfs.h>
 #include <linux/bootmem.h>
 #include <linux/task_work.h>
+#include <linux/ve.h>
 #include "pnode.h"
 #include "internal.h"
 
@@ -64,7 +65,7 @@ static int mnt_group_start = 1;
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static DECLARE_RWSEM(namespace_sem);
+DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -196,9 +197,22 @@ static void drop_mountpoint(struct fs_pin *p)
 	mntput(&m->mnt);
 }
 
+static inline int ve_mount_allowed(void);
+static inline void ve_mount_nr_inc(struct mount *mnt);
+static inline void ve_mount_nr_dec(struct mount *mnt);
+
 static struct mount *alloc_vfsmnt(const char *name)
 {
-	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+	struct mount *mnt;
+
+	if (!ve_mount_allowed()) {
+		pr_warn_ratelimited(
+			"CT#%s reached the limit on mounts.\n",
+			ve_name(get_exec_env()));
+		return NULL;
+	}
+
+	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		int err;
 
@@ -207,7 +221,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 			goto out_free_cache;
 
 		if (name) {
-			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+			mnt->mnt_devname = kstrdup(name, GFP_KERNEL_ACCOUNT);
 			if (!mnt->mnt_devname)
 				goto out_free_id;
 		}
@@ -237,6 +251,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 #endif
 		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
 	}
+	ve_mount_nr_inc(mnt);
 	return mnt;
 
 #ifdef CONFIG_SMP
@@ -577,6 +592,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+	ve_mount_nr_dec(mnt);
 	kfree(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
@@ -1118,7 +1134,8 @@ static void mntput_no_expire(struct mount *mnt)
 	}
 	unlock_mount_hash();
 
-	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))
+	    && !(mnt->mnt.mnt_sb->s_iflags & SB_I_UMOUNT_SYNC)) {
 		struct task_struct *task = current;
 		if (likely(!(task->flags & PF_KTHREAD))) {
 			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
@@ -1538,7 +1555,7 @@ static int do_umount(struct mount *mnt, int flags)
 		 * Special case for "unmounting" root ...
 		 * we just try to remount it readonly.
 		 */
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		down_write(&sb->s_umount);
 		if (!(sb->s_flags & MS_RDONLY))
@@ -2180,6 +2197,194 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	return error;
 }
 
+#ifdef CONFIG_VE
+/*
+ * Returns the first occurrence of needle in haystack, where entries are
+ * separated by sep, or NULL if not found.
+ */
+static char *strstr_separated(char *haystack, char *needle, char sep)
+{
+	int needle_len = strlen(needle);
+
+	while (haystack) {
+		if (!strncmp(haystack, needle, needle_len) &&
+		    (haystack[needle_len] == 0 || /* end of string or */
+		     haystack[needle_len] == sep)) /* separator */
+			return haystack;
+
+		haystack = strchr(haystack, sep);
+		if (haystack)
+			haystack++;
+	}
+
+	return NULL;
+}
+
+static int ve_devmnt_check(char *options, char *allowed)
+{
+	char *p;
+	char *tmp_options;
+
+	if (!options || !*options)
+		return 0;
+
+	if (!allowed)
+		return -EPERM;
+
+	/* strsep() modifies the provided string: replaces separators with '\0' */
+	tmp_options = kstrdup(options, GFP_KERNEL);
+	if (!tmp_options)
+		return -ENOMEM;
+
+	while ((p = strsep(&tmp_options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		if (!strstr_separated(allowed, p, ',')) {
+			kfree(tmp_options);
+			return -EPERM;
+		}
+	}
+
+	kfree(tmp_options);
+	return 0;
+}
+
+static int ve_devmnt_insert(char *options, char *hidden)
+{
+	int options_len;
+	int hidden_len;
+
+	if (!hidden)
+		return 0;
+
+	if (!options)
+		return -EAGAIN;
+
+	options_len = strlen(options);
+	hidden_len = strlen(hidden);
+
+	if (hidden_len + options_len + 2 > PAGE_SIZE)
+		return -EPERM;
+
+	memmove(options + hidden_len + 1, options, options_len);
+	memcpy(options, hidden, hidden_len);
+
+	options[hidden_len] = ',';
+	options[hidden_len + options_len + 1] = 0;
+
+	return 0;
+}
+
+int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remount)
+{
+	void *data = *data_pp;
+	struct ve_devmnt *devmnt;
+	int err;
+again:
+	err = 1;
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		if (devmnt->dev == dev) {
+			err = ve_devmnt_check(data, devmnt->allowed_options);
+
+			/*
+			 * If @is_pseudosuper is set (i.e. this is the restore
+			 * procedure), don't enforce the allowed-options
+			 * filtering, since restore mode is special.
+			 */
+			if ((ve->is_pseudosuper || !err) && !remount)
+				err = ve_devmnt_insert(data, devmnt->hidden_options);
+
+			break;
+		}
+	}
+	mutex_unlock(&ve->devmnt_mutex);
+
+	switch (err) {
+	case -EAGAIN:
+		if (!(data = (void *)__get_free_page(GFP_KERNEL)))
+			return -ENOMEM;
+		*(char *)data = 0; /* the string must be zero-terminated */
+		goto again;
+	case 1:
+		if (*data_pp) {
+			/*
+			 * Same as in the chunk above, but for the case where
+			 * ve->devmnt_list is empty. Depending on how the
+			 * userspace tool restores the container, the list
+			 * might be non-empty as well.
+			 */
+			if (ve->is_pseudosuper) {
+				err = 0;
+			} else {
+				ve_pr_warn_ratelimited(VE_LOG_BOTH, "VE%s: no allowed "
+					  "mount options found for device %u:%u\n",
+					  ve->ve_name, MAJOR(dev), MINOR(dev));
+				err = -EPERM;
+			}
+		} else
+			err = 0;
+		break;
+	case 0:
+		*data_pp = data;
+		break;
+	}
+
+	if (data && data != *data_pp)
+		free_page((unsigned long)data);
+
+	return err;
+}
+
+static inline int ve_mount_allowed(void)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	return ve_is_super(ve) ||
+		atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
+}
+
+static inline void ve_mount_nr_inc(struct mount *mnt)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	mnt->ve_owner = get_ve(ve);
+	atomic_inc(&ve->mnt_nr);
+}
+
+static inline void ve_mount_nr_dec(struct mount *mnt)
+{
+	struct ve_struct *ve = mnt->ve_owner;
+
+	atomic_dec(&ve->mnt_nr);
+	put_ve(ve);
+	mnt->ve_owner = NULL;
+}
+
+#else /* CONFIG_VE */
+
+static inline int ve_mount_allowed(void) { return 1; }
+static inline void ve_mount_nr_inc(struct mount *mnt) { }
+static inline void ve_mount_nr_dec(struct mount *mnt) { }
+#endif /* CONFIG_VE */
+
+static int do_check_and_remount_sb(struct super_block *sb, int flags, void *data)
+{
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (sb->s_bdev && data && !ve_is_super(ve)) {
+		int err;
+
+		err = ve_devmnt_process(ve, sb->s_bdev->bd_dev, &data, 1);
+		if (err)
+			return err;
+	}
+#endif
+	return do_remount_sb(sb, flags, data, 0);
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
@@ -2205,13 +2410,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else if (!capable(CAP_SYS_ADMIN))
+	else if (!ve_capable(CAP_SYS_ADMIN))
 		err = -EPERM;
 	else
-		err = do_remount_sb(sb, flags, data, 0);
+		err = do_check_and_remount_sb(sb, flags, data);
 	if (!err) {
 		lock_mount_hash();
-		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
 		touch_mnt_namespace(mnt->mnt_ns);
 		unlock_mount_hash();
@@ -2386,7 +2591,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (!type)
 		return -ENODEV;
 
-	if (user_ns != &init_user_ns) {
+	if (user_ns != ve_init_user_ns()) {
 		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
 			put_filesystem(type);
 			return -EPERM;
@@ -2791,7 +2996,11 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 }
 
 /* namespace.unpriv_enable = 1 */
-static bool enable_unpriv_mnt_ns_creation;
+/* Unprivileged user namespaces remain a tech preview in RHEL7 and are
+ * disabled by default there. Virtuozzo Containers run in user namespaces and
+ * we want to run Docker Containers inside Virtuozzo Containers, so enable them
+ * by default. */
+static bool enable_unpriv_mnt_ns_creation = true;
 module_param_named(unpriv_enable, enable_unpriv_mnt_ns_creation, bool, 0444);
 MODULE_PARM_DESC(unpriv_enable, "Enable unprivileged creation of mount namespaces");
 
@@ -2814,13 +3023,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 
 	/* Unprivileged creation currently tech preview in RHEL7  */
 	if (user_ns != &init_user_ns) {
-		static int __read_mostly called_mark_tech_preview = 0;
 		if (!enable_unpriv_mnt_ns_creation) {
 			return ERR_PTR(-EPERM);
 		}
-		if (!called_mark_tech_preview &&
-		    !xchg(&called_mark_tech_preview, 1))
-			mark_tech_preview("unpriv mount namespace", NULL);
 	}
 
 	old = ns->root;
@@ -3152,7 +3357,7 @@ void __init mnt_init(void)
 	int err;
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	mount_hashtable = alloc_large_system_hash("Mount-cache",
 				sizeof(struct hlist_head),
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -80,7 +80,7 @@ static int init_inodecache(void)
 	ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
 					     sizeof(struct ncp_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ncp_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static DEFINE_MUTEX(nfs41_callback_mutex);
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
 		if (try_to_freeze())
 			continue;
 
+		mutex_lock(&nfs41_callback_mutex);
+		if (kthread_should_stop()) {
+			mutex_unlock(&nfs41_callback_mutex);
+			return 0;
+		}
+
 		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
 		spin_lock_bh(&serv->sv_cb_lock);
 		if (!list_empty(&serv->sv_cb_list)) {
@@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
 			error = bc_svc_process(serv, req, rqstp);
 			dprintk("bc_svc_process() returned w/ error code= %d\n",
 				error);
+			mutex_unlock(&nfs41_callback_mutex);
 		} else {
 			spin_unlock_bh(&serv->sv_cb_lock);
+			mutex_unlock(&nfs41_callback_mutex);
 			schedule();
 			finish_wait(&serv->sv_cb_waitq, &wq);
 		}
@@ -139,6 +149,13 @@ nfs41_callback_svc(void *vrqstp)
 	return 0;
 }
 
+static void nfs41_callback_down_net(struct svc_serv *serv, struct net *net)
+{
+	mutex_lock(&nfs41_callback_mutex);
+	bc_svc_flush_queue_net(serv, net);
+	mutex_unlock(&nfs41_callback_mutex);
+}
+
 /*
  * Bring up the NFSv4.1 callback service
  */
@@ -150,6 +167,7 @@ nfs41_callback_up(struct svc_serv *serv)
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
+	serv->svc_cb_down_net = nfs41_callback_down_net;
 	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
@@ -242,6 +260,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
 		return;
 
 	dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+	if (serv->svc_cb_down_net)
+		serv->svc_cb_down_net(serv, net);
 	svc_shutdown_net(serv, net);
 }
 
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -189,15 +189,31 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 	nfs_inode_set_delegation(inode, cred, res);
 }
 
+static bool ve_abort_delegation(struct inode *inode)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct rpc_xprt *xprt;
+	bool abort;
+
+	rcu_read_lock();
+	xprt = rcu_dereference(clp->cl_rpcclient->cl_xprt);
+	abort = xprt->xprt_net->owner_ve->ve_netns == NULL;
+	rcu_read_unlock();
+
+	return abort;
+}
+
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
 {
 	int res = 0;
 
-	if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+	if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+	    !ve_abort_delegation(inode)) {
 		res = nfs4_proc_delegreturn(inode,
 				delegation->cred,
 				&delegation->stateid,
 				issync);
+	}
 	nfs_free_delegation(delegation);
 	return res;
 }
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1495,7 +1495,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
 			dir->i_sb->s_id, dir->i_ino, dentry);
 
-	err = nfs_check_flags(open_flags);
+	err = nfs_set_flags(file, open_flags);
 	if (err)
 		return err;
 
@@ -2119,11 +2119,13 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-int nfs_do_access_cache_shrinker(int nr_to_scan)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
+	long freed = 0;
 
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
@@ -2139,6 +2141,7 @@ int nfs_do_access_cache_shrinker(int nr_to_scan)
 				struct nfs_access_entry, lru);
 		list_move(&cache->lru, &head);
 		rb_erase(&cache->rb_node, &nfsi->access_cache);
+		freed++;
 		if (!list_empty(&nfsi->access_cache_entry_lru))
 			list_move_tail(&nfsi->access_cache_inode_lru,
 					&nfs_access_lru_list);
@@ -2153,18 +2156,24 @@ int nfs_do_access_cache_shrinker(int nr_to_scan)
 	}
 	spin_unlock(&nfs_access_lru_lock);
 	nfs_access_free_list(&head);
-	return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
+	return freed;
 }
 
-int nfs_access_cache_shrinker(struct shrinker *shrink,
-			      struct shrink_control *sc)
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
+}
+
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return (nr_to_scan == 0) ? 0 : -1;
-	return nfs_do_access_cache_shrinker(nr_to_scan);
+		return SHRINK_STOP;
+	return nfs_do_access_cache_scan(nr_to_scan);
 }
 
 static void
@@ -2180,7 +2189,7 @@ nfs_access_cache_enforce_limit(void)
 	diff = nr_entries - nfs_access_max_cachesize;
 	if (diff < nr_to_scan)
 		nr_to_scan = diff;
-	nfs_do_access_cache_shrinker(nr_to_scan);
+	nfs_do_access_cache_scan(nr_to_scan);
 }
 
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -51,6 +51,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/virtinfo.h>
 
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
@@ -642,6 +643,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result = -EINVAL;
 	size_t count;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	count = iov_length(iov, nr_segs);
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
@@ -1126,6 +1129,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	loff_t end;
 	size_t count;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	count = iov_length(iov, nr_segs);
 	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 
@@ -1151,12 +1156,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (result)
 		goto out_unlock;
 
-	if (mapping->nrpages) {
-		result = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
-		if (result)
-			goto out_unlock;
-	}
+	result = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
+	if (result)
+		goto out_unlock;
 
 	task_io_account_write(count);
 
@@ -1180,10 +1183,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
-	}
+	invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
 
 	mutex_unlock(&inode->i_mutex);
 
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -49,14 +49,14 @@ static const struct vm_operations_struct nfs_file_vm_ops;
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
-int nfs_check_flags(int flags)
+int nfs_set_flags(struct file *filp, int flags)
 {
 	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
 		return -EINVAL;
 
-	return 0;
+	return generic_set_file_flags(filp, flags);
 }
-EXPORT_SYMBOL_GPL(nfs_check_flags);
+EXPORT_SYMBOL_GPL(nfs_set_flags);
 
 /*
  * Open file
@@ -69,7 +69,7 @@ nfs_file_open(struct inode *inode, struct file *filp)
 	dprintk("NFS: open file(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
-	res = nfs_check_flags(filp->f_flags);
+	res = nfs_set_flags(filp, filp->f_flags);
 	if (res)
 		return res;
 
@@ -470,31 +470,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
  */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
-	struct address_space *mapping = page->mapping;
-
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-	/* Always try to initiate a 'commit' if relevant, but only
-	 * wait for it if __GFP_WAIT is set.  Even then, only wait 1
-	 * second and only if the 'bdi' is not congested.
-	 * Waiting indefinitely can cause deadlocks when the NFS
-	 * server is on this machine, when a new TCP connection is
-	 * needed and in other rare cases.  There is no particular
-	 * need to wait extensively here.  A short wait has the
-	 * benefit that someone else can worry about the freezer.
-	 */
-	if (mapping) {
-		struct nfs_server *nfss = NFS_SERVER(mapping->host);
-		nfs_commit_inode(mapping->host, 0);
-		if ((gfp & __GFP_WAIT) &&
-		    !bdi_write_congested(&nfss->backing_dev_info)) {
-			wait_on_page_bit_killable_timeout(page, PG_private,
-							  HZ);
-			if (PagePrivate(page))
-				set_bdi_congested(&nfss->backing_dev_info,
-						  BLK_RW_ASYNC);
-		}
-	}
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
 		return 0;
@@ -636,7 +613,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct nfs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = nfs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int nfs_need_check_write(struct file *filp, struct inode *inode)
@@ -956,7 +932,7 @@ const struct file_operations nfs_file_operations = {
 	.flock		= nfs_flock,
 	.splice_read	= nfs_file_splice_read,
 	.splice_write	= nfs_file_splice_write,
-	.check_flags	= nfs_check_flags,
+	.set_flags	= nfs_set_flags,
 	.setlease	= nfs_setlease,
 };
 EXPORT_SYMBOL_GPL(nfs_file_operations);
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,7 +37,6 @@
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
-#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
@@ -73,7 +72,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 
 static int nfs_wait_killable(int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
@@ -1089,10 +1088,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 			if (ret < 0)
 				return ret;
 		}
-		ret = invalidate_inode_pages2(mapping);
-		if (ret < 0)
-			return ret;
 	}
+	ret = invalidate_inode_pages2(mapping);
+	if (ret < 0)
+		return ret;
+
 	if (S_ISDIR(inode->i_mode)) {
 		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
@@ -1995,7 +1995,7 @@ static int __init nfs_init_inodecache(void)
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (nfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -348,8 +348,10 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const struct nfs_client_initdata *);
 
 /* dir.c */
-extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-					struct shrink_control *sc);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 extern void nfs_advise_use_readdirplus(struct inode *dir);
 extern void nfs_force_use_readdirplus(struct inode *dir);
 struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
@@ -375,7 +377,7 @@ int nfs_lock(struct file *, int, struct file_lock *);
 int nfs_flock(struct file *, int, struct file_lock *);
 ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
 			      size_t, unsigned int);
-int nfs_check_flags(int);
+int nfs_set_flags(struct file *filp, int flags);
 int nfs_setlease(struct file *, long, struct file_lock **, void **priv);
 
 /* inode.c */
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -17,7 +17,6 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfs_mount.h>
-#include <linux/freezer.h>
 
 #include "iostat.h"
 #include "internal.h"
@@ -34,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -44,7 +44,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	dprintk("NFS: open file(%pd2)\n", dentry);
 
-	err = nfs_check_flags(openflags);
+	err = nfs_set_flags(filp, openflags);
 	if (err)
 		return err;
 
@@ -284,7 +284,7 @@ const struct file_operations_extend nfs4_file_operations = {
 		.flock		= nfs_flock,
 		.splice_read	= nfs_file_splice_read,
 		.splice_write	= nfs_file_splice_write,
-		.check_flags	= nfs_check_flags,
+		.set_flags	= nfs_set_flags,
 		.setlease	= nfs_setlease,
 #ifdef CONFIG_NFS_V4_2
 		.llseek		= nfs4_file_llseek,
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -53,7 +53,6 @@
 #include <linux/module.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
-#include <linux/freezer.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -371,7 +370,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 
 	might_sleep();
 
-	freezable_schedule_timeout_killable_unsafe(
+	schedule_timeout_killable(
 		nfs4_update_delay(timeout));
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
@@ -6305,7 +6304,7 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
 		status = nfs4_proc_setlk(state, cmd, request);
 		if ((status != -EAGAIN) || IS_SETLK(cmd))
 			break;
-		freezable_schedule_timeout_interruptible(timeout);
+		schedule_timeout_interruptible(timeout);
 		timeout *= 2;
 		timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
 		status = -ERESTARTSYS;
@@ -6390,7 +6389,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_irqrestore(&q->lock, flags);
 
-		freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
+		schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
 	}
 
 	finish_wait(q, &wait);
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -41,7 +41,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
-#include <linux/freezer.h>
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -55,6 +55,9 @@
 #include <linux/nsproxy.h>
 #include <linux/rcupdate.h>
 
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
+
 #include <asm/uaccess.h>
 
 #include "nfs4_fs.h"
@@ -292,7 +295,8 @@ struct file_system_type nfs_fs_type = {
 	.name		= "nfs",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|FS_HAS_INVALIDATE_RANGE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|
+			  FS_HAS_INVALIDATE_RANGE|FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfs");
 EXPORT_SYMBOL_GPL(nfs_fs_type);
@@ -332,7 +336,9 @@ struct file_system_type nfs4_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|FS_HAS_INVALIDATE_RANGE|FS_HAS_FO_EXTEND,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|
+			  FS_HAS_INVALIDATE_RANGE|FS_HAS_FO_EXTEND|
+			  FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfs4");
 MODULE_ALIAS("nfs4");
@@ -359,7 +365,8 @@ static void unregister_nfs4_fs(void)
 #endif
 
 static struct shrinker acl_shrinker = {
-	.shrink		= nfs_access_cache_shrinker,
+	.count_objects	= nfs_access_cache_count,
+	.scan_objects	= nfs_access_cache_scan,
 	.seeks		= DEFAULT_SEEKS,
 };
 
@@ -2407,6 +2414,7 @@ static int nfs_set_super(struct super_block *s, void *data)
 	int ret;
 
 	s->s_flags = sb_mntdata->mntflags;
+	s->s_iflags |= SB_I_UMOUNT_SYNC;
 	s->s_fs_info = server;
 	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
 	ret = set_anon_super(s, server);
@@ -2654,6 +2662,11 @@ struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	struct nfs_subversion *nfs_mod;
 	int error;
 
+	if (!(get_exec_env()->features & VE_FEATURE_NFS))
+		return ERR_PTR(-ENODEV);
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
+
 	mount_info.parsed = nfs_alloc_parsed_mount_data();
 	mount_info.mntfh = nfs_alloc_fhandle();
 	if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -119,6 +119,9 @@ struct nfsd_net {
 	u32 clverifier_counter;
 
 	struct svc_serv *nfsd_serv;
+
+	wait_queue_head_t ntf_wq;
+	atomic_t ntf_refcnt;
 };
 
 /* Simple check to find out if a given net was properly initialized */
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -559,6 +559,9 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 			resp->f_properties = NFS3_FSF_BILLYBOY;
 		}
 		resp->f_maxfilesize = sb->s_maxbytes;
+		resp->f_time_gran = 0;
+		if (!strcmp(sb->s_type->name, "ext4"))
+			resp->f_time_gran = sb->s_time_gran;
 	}
 
 	fh_put(&argp->fh);
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1057,8 +1057,13 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
 		*p++ = htonl(resp->f_wtmult);
 		*p++ = htonl(resp->f_dtpref);
 		p = xdr_encode_hyper(p, resp->f_maxfilesize);
-		*p++ = xdr_one;
-		*p++ = xdr_zero;
+		if (resp->f_time_gran) {
+			*p++ = xdr_zero;
+			*p++ = htonl(resp->f_time_gran);
+		} else {
+			*p++ = xdr_one;
+			*p++ = xdr_zero;
+		}
 		*p++ = htonl(resp->f_properties);
 	}
 
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -179,7 +179,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
 	struct file_lock *fl;
 	int status;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(1);
 	if (!fl)
 		return -ENOMEM;
 	locks_init_lock(fl);
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -36,7 +36,6 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
-#include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <net/net_namespace.h>
@@ -44,6 +43,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfsd/cld.h>
 
+#include <linux/ve.h>
+
 #include "nfsd.h"
 #include "state.h"
 #include "vfs.h"
@@ -701,7 +702,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	struct cld_upcall *tmp, *cup;
 	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
 	uint32_t xid;
-	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+	struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
 						nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
@@ -1205,7 +1206,7 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
 	argv[2] = arg;
 	argv[3] = NULL;
 
-	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	ret = call_usermodehelper_ve(get_exec_env(), argv[0], argv, envp, UMH_WAIT_PROC);
 	/*
 	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
 	 * error. The admin can re-enable it on the fly by using sysfs
@@ -1248,13 +1249,6 @@ nfsd4_umh_cltrack_init(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
 
-	/* XXX: The usermode helper s not working in container yet. */
-	if (net != &init_net) {
-		pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
-		kfree(grace_start);
-		return -EINVAL;
-	}
-
 	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
 	kfree(grace_start);
 	return ret;
@@ -1415,24 +1409,29 @@ nfsd4_client_tracking_init(struct net *net)
 	if (!status)
 		return status;
 
-	/*
-	 * See if the recoverydir exists and is a directory. If it is,
-	 * then use the legacy ops.
-	 */
-	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
-	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
-	if (!status) {
-		status = S_ISDIR(path.dentry->d_inode->i_mode);
-		path_put(&path);
-		if (status)
-			goto do_init;
+	if (net_eq(net, &init_net)) {
+		/*
+		 * See if the recoverydir exists and is a directory. If it is,
+		 * then use the legacy ops.
+		 */
+		nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+		status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+		if (!status) {
+			status = S_ISDIR(path.dentry->d_inode->i_mode);
+			path_put(&path);
+			if (status)
+				goto do_init;
+		}
 	}
 
 	/* Finally, try to use nfsdcld */
 	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
-	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
-			"removed in 3.10. Please transition to using "
-			"nfsdcltrack.\n");
+
+	if (net_eq(net, &init_net)) {
+		printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+				"removed in 3.10. Please transition to using "
+				"nfsdcltrack.\n");
+	}
 do_init:
 	status = nn->client_tracking_ops->init(net);
 	if (status) {
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4119,7 +4119,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 {
 	struct file_lock *fl;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(1);
 	if (!fl)
 		return NULL;
 	fl->fl_lmops = &nfsd_lease_mng_ops;
@@ -5845,7 +5845,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	conflock = locks_alloc_lock();
+	conflock = locks_alloc_lock(1);
 	if (!conflock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
@@ -5965,7 +5965,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		goto out;
 
-	file_lock = locks_alloc_lock();
+	file_lock = locks_alloc_lock(1);
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
@@ -6042,7 +6042,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_lock_range;
 		goto put_stateid;
 	}
-	file_lock = locks_alloc_lock();
+	file_lock = locks_alloc_lock(1);
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -64,11 +64,14 @@ static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 static void	cache_cleaner_func(struct work_struct *unused);
-static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
-					struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 
 static struct shrinker nfsd_reply_cache_shrinker = {
-	.shrink	= nfsd_reply_cache_shrink,
+	.scan_objects = nfsd_reply_cache_scan,
+	.count_objects = nfsd_reply_cache_count,
 	.seeks	= 1,
 };
 
@@ -243,6 +246,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
 		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
 			break;
 		nfsd_reply_cache_free_locked(rp);
+		freed++;
 	}
 	return freed;
 }
@@ -285,12 +289,17 @@ cache_cleaner_func(struct work_struct *unused)
 	prune_cache_entries();
 }
 
-static int
-nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	return atomic_read(&num_drc_entries);
 }
 
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return prune_cache_entries();
+}
 /*
  * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
  */
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -15,6 +15,7 @@
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/module.h>
+#include <uapi/linux/vzcalluser.h>
 
 #include "idmap.h"
 #include "nfsd.h"
@@ -1167,6 +1168,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 static struct dentry *nfsd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
+	if (!(get_exec_env()->features & VE_FEATURE_NFSD))
+		return ERR_PTR(-ENODEV);
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
 	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
 }
 
@@ -1183,6 +1188,7 @@ static struct file_system_type nfsd_fs_type = {
 	.name		= "nfsd",
 	.mount		= nfsd_mount,
 	.kill_sb	= nfsd_umount,
+	.fs_flags	= FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfsd");
 
@@ -1226,6 +1232,9 @@ static __net_init int nfsd_init_net(struct net *net)
 	nn->nfsd4_grace = 90;
 	nn->clverifier_counter = prandom_u32();
 	nn->clientid_counter = prandom_u32();
+
+	atomic_set(&nn->ntf_refcnt, 0);
+	init_waitqueue_head(&nn->ntf_wq);
 	return 0;
 
 out_idmap_error:
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -319,7 +319,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct sockaddr_in sin;
 
-	if (event != NETDEV_DOWN)
+	if ((event != NETDEV_DOWN) ||
+	    !atomic_inc_not_zero(&nn->ntf_refcnt))
 		goto out;
 
 	if (nn->nfsd_serv) {
@@ -328,6 +329,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
 		sin.sin_addr.s_addr = ifa->ifa_local;
 		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
 	}
+	atomic_dec(&nn->ntf_refcnt);
+	wake_up(&nn->ntf_wq);
 
 out:
 	return NOTIFY_DONE;
@@ -347,7 +350,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct sockaddr_in6 sin6;
 
-	if (event != NETDEV_DOWN)
+	if ((event != NETDEV_DOWN) ||
+	    !atomic_inc_not_zero(&nn->ntf_refcnt))
 		goto out;
 
 	if (nn->nfsd_serv) {
@@ -356,7 +360,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
 		sin6.sin6_addr = ifa->addr;
 		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
 	}
-
+	atomic_dec(&nn->ntf_refcnt);
+	wake_up(&nn->ntf_wq);
 out:
 	return NOTIFY_DONE;
 }
@@ -373,6 +378,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	atomic_dec(&nn->ntf_refcnt);
 	/* check if the notifier still has clients */
 	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
 		unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
@@ -380,6 +386,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
 		unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
 	}
+	wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0);
 
 	/*
 	 * write_ports can create the server without actually starting
@@ -502,6 +509,7 @@ int nfsd_create_serv(struct net *net)
 		register_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
 	}
+	atomic_inc(&nn->ntf_refcnt);
 	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
 	return 0;
 }
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2240,7 +2240,7 @@ nfsd_racache_init(int cache_size)
 
 		raparm = &raparm_hash[i].pb_head;
 		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL_ACCOUNT);
 			if (!*raparm)
 				goto out_nomem;
 			raparm = &(*raparm)->p_next;
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -206,6 +206,7 @@ struct nfsd3_fsinfores {
 	__u32			f_dtpref;
 	__u64			f_maxfilesize;
 	__u32			f_properties;
+	__u32			f_time_gran;
 };
 
 struct nfsd3_pathconfres {
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -135,7 +135,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct nilfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= nilfs_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1403,7 +1403,8 @@ static int __init nilfs_init_cachep(void)
 {
 	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
 			sizeof(struct nilfs_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+			nilfs_inode_init_once);
 	if (!nilfs_inode_cachep)
 		goto fail;
 
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -714,7 +714,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		__func__, flags, event_f_flags);
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (flags & ~FAN_ALL_INIT_FLAGS)
@@ -785,7 +785,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	if (flags & FAN_UNLIMITED_QUEUE) {
 		fd = -EPERM;
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			goto out_destroy_group;
 		group->max_events = UINT_MAX;
 	} else {
@@ -794,7 +794,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	if (flags & FAN_UNLIMITED_MARKS) {
 		fd = -EPERM;
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			goto out_destroy_group;
 		group->fanotify_data.max_marks = UINT_MAX;
 	} else {
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -25,16 +25,15 @@ static int show_fdinfo(struct seq_file *m, struct file *f,
 {
 	struct fsnotify_group *group = f->private_data;
 	struct fsnotify_mark *mark;
-	int ret = 0;
 
 	mutex_lock(&group->mark_mutex);
 	list_for_each_entry(mark, &group->marks_list, g_list) {
-		ret = show(m, mark);
-		if (ret)
+		show(m, mark);
+		if (m->count == m->size)
 			break;
 	}
 	mutex_unlock(&group->mark_mutex);
-	return ret;
+	return 0;
 }
 
 #if defined(CONFIG_EXPORTFS)
@@ -58,13 +57,13 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	f.handle.handle_type = ret;
 	f.handle.handle_bytes = size * sizeof(u32);
 
-	ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
-			 f.handle.handle_bytes, f.handle.handle_type);
+	seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
+		   f.handle.handle_bytes, f.handle.handle_type);
 
 	for (i = 0; i < f.handle.handle_bytes; i++)
-		ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+		seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
 
-	return ret;
+	return 0;
 }
 #else
 static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
@@ -79,7 +78,6 @@ static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 {
 	struct inotify_inode_mark *inode_mark;
 	struct inode *inode;
-	int ret = 0;
 
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
 	    !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
@@ -95,17 +93,17 @@ static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		 * used only internally to the kernel.
 		 */
 		u32 mask = mark->mask & IN_ALL_EVENTS;
-		ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
+		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
 				 "mask:%x ignored_mask:%x ",
 				 inode_mark->wd, inode->i_ino,
 				 inode->i_sb->s_dev,
 				 mask, mark->ignored_mask);
-		ret |= show_mark_fhandle(m, inode);
-		ret |= seq_putc(m, '\n');
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
 		iput(inode);
 	}
 
-	return ret;
+	return 0;
 }
 
 int inotify_show_fdinfo(struct seq_file *m, struct file *f)
@@ -121,7 +119,6 @@ static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 {
 	unsigned int mflags = 0;
 	struct inode *inode;
-	int ret = 0;
 
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
 		return 0;
@@ -132,23 +129,20 @@ static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
 		inode = igrab(mark->inode);
 		if (!inode)
-			goto out;
-		ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
-				 "mflags:%x mask:%x ignored_mask:%x ",
-				 inode->i_ino, inode->i_sb->s_dev,
-				 mflags, mark->mask, mark->ignored_mask);
-		ret |= show_mark_fhandle(m, inode);
-		ret |= seq_putc(m, '\n');
+			return 0;
+		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+			   inode->i_ino, inode->i_sb->s_dev,
+			   mflags, mark->mask, mark->ignored_mask);
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
 		iput(inode);
 	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
 		struct mount *mnt = real_mount(mark->mnt);
 
-		ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
-				 "ignored_mask:%x\n", mnt->mnt_id, mflags,
-				 mark->mask, mark->ignored_mask);
+		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
+			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
 	}
-out:
-	return ret;
+	return 0;
 }
 
 int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	inotify_max_user_instances = INT_MAX;
+	inotify_max_user_watches = INT_MAX;
 
 	return 0;
 }
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3146,8 +3146,8 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
 			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			ntfs_big_inode_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+			SLAB_ACCOUNT, ntfs_big_inode_init_once);
 	if (!ntfs_big_inode_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_big_inode_cache_name);
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -274,8 +274,9 @@ static int ocfs2_set_acl(handle_t *handle,
 	case ACL_TYPE_ACCESS:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			ret = posix_acl_equiv_mode(acl, &mode);
+			umode_t mode;
+
+			ret = posix_acl_update_mode(inode, &mode, &acl);
 			if (ret < 0)
 				return ret;
 			else {
@@ -286,7 +287,6 @@ static int ocfs2_set_acl(handle_t *handle,
 							 handle, mode);
 				if (ret)
 					return ret;
-
 			}
 		}
 		break;
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -656,7 +656,7 @@ static int __init init_dlmfs_fs(void)
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				dlmfs_init_once);
 	if (!dlmfs_inode_cache) {
 		status = -ENOMEM;
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
 	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1790,7 +1790,7 @@ static int ocfs2_initialize_mem_caches(void)
 				       sizeof(struct ocfs2_inode_info),
 				       0,
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				       ocfs2_inode_init_once);
 	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
 					sizeof(struct ocfs2_dquot),
@@ -2100,6 +2100,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
 	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+	memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+	       sizeof(di->id2.i_super.s_uuid));
 
 	osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
 
@@ -2363,7 +2365,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		mlog_errno(status);
 		goto bail;
 	}
-	cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
+	cleancache_init_shared_fs(sb);
 
 bail:
 	return status;
--- a/fs/open.c
+++ b/fs/open.c
@@ -245,7 +245,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_CONVERT_UNWRITTEN))
 		return -EOPNOTSUPP;
 
 	/* Punch hole and zero range are mutually exclusive */
@@ -263,6 +264,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
 		return -EINVAL;
 
+	/* Convert-and-extend must not be combined with any other mode. */
+	if ((mode & FALLOC_FL_CONVERT_UNWRITTEN) &&
+	    (mode & ~FALLOC_FL_CONVERT_UNWRITTEN))
+		return -EINVAL;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
@@ -716,6 +722,11 @@ static int do_dentry_open(struct file *f,
 	static const struct file_operations empty_fops = {};
 	int error;
 
+	if (!may_use_odirect())
+		f->f_flags &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
@@ -893,9 +904,17 @@ int vfs_open(const struct path *path, struct file *filp,
 {
 	struct inode *inode = path->dentry->d_inode;
 	iop_dentry_open_t dentry_open = get_dentry_open_iop(inode);
+	int do_cleanup = 0;
+	int ret;
+
+	if (!filp->f_original_path.mnt && dentry_open) {
+		filp->f_original_path = *path;
+		path_get(&filp->f_original_path);
+		do_cleanup = 1;
+	}
 
 	if (dentry_open)
-		return dentry_open(path->dentry, filp, cred);
+		ret = dentry_open(path->dentry, filp, cred);
 	else {
 		struct dentry *dentry = d_real(path->dentry, NULL, filp->f_flags);
 
@@ -903,8 +922,15 @@ int vfs_open(const struct path *path, struct file *filp,
 			return PTR_ERR(dentry);
 
 		filp->f_path = *path;
-		return do_dentry_open(filp, dentry->d_inode, NULL, cred);
+		ret = do_dentry_open(filp, dentry->d_inode, NULL, cred);
+	}
+
+	if (ret && do_cleanup) {
+		path_put(&filp->f_original_path);
+		filp->f_original_path.mnt = NULL;
+		filp->f_original_path.dentry = NULL;
 	}
+	return ret;
 }
 EXPORT_SYMBOL(vfs_open);
 
@@ -1131,7 +1157,7 @@ EXPORT_SYMBOL(sys_close);
  */
 SYSCALL_DEFINE0(vhangup)
 {
-	if (capable(CAP_SYS_TTY_CONFIG)) {
+	if (ve_capable(CAP_SYS_TTY_CONFIG)) {
 		tty_vhangup_self();
 		return 0;
 	}
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -449,7 +449,7 @@ static int __init init_openprom_fs(void)
 					    sizeof(struct op_inode_info),
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
-					     SLAB_MEM_SPREAD),
+					     SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1415,13 +1415,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
 				const char *dev_name, void *raw_data)
 {
-	static bool seen = false;
-
-	if (!seen) {
-		mark_tech_preview("Overlay filesystem", THIS_MODULE);
-		seen = true;
-	}
-
 	return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
 }
 
@@ -1430,7 +1423,7 @@ static struct file_system_type ovl_fs_type = {
 	.name		= "overlay",
 	.mount		= ovl_mount,
 	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_HAS_DOPS_WRAPPER,
+	.fs_flags	= FS_HAS_DOPS_WRAPPER | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("overlay");
 
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -22,6 +22,7 @@
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <linux/aio.h>
+#include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -229,12 +230,24 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 	 * temporary page, let's keep track of it as a one-deep
 	 * allocation cache. (Otherwise just release our reference to it)
 	 */
-	if (page_count(page) == 1 && !pipe->tmp_page)
+	if (page_count(page) == 1 && !pipe->tmp_page) {
 		pipe->tmp_page = page;
-	else
+	} else
 		page_cache_release(page);
 }
 
+static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	if (page_count(page) == 1) {
+		memcg_kmem_uncharge_pages(page, 0);
+		lock_page(page);
+		return 0;
+	}
+	return 1;
+}
+
 /**
  * generic_pipe_buf_map - virtually map a pipe buffer
  * @pipe:	the pipe that the buffer belongs to
@@ -365,7 +378,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
@@ -375,7 +388,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
@@ -594,7 +607,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 			size_t remaining;
 
 			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
+				page = alloc_pages(GFP_HIGHUSER | __GFP_ACCOUNT, 0);
 				if (unlikely(!page)) {
 					ret = ret ? : -ENOMEM;
 					break;
@@ -824,7 +837,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
 
-	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe) {
 		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 		struct user_struct *user = get_current_user();
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -83,6 +83,9 @@
 #include <linux/tracehook.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -174,6 +177,18 @@ static inline int get_task_umask(struct task_struct *tsk)
 	return umask;
 }
 
+static int task_virtual_pid(struct task_struct *t)
+{
+	struct pid *pid;
+
+	pid = task_pid(t);
+	/*
+	 * This will give a wrong result for tasks that
+	 * failed to enter a VE, but that's OK.
+	 */
+	return pid ? pid->numbers[pid->level].nr : 0;
+}
+
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
@@ -182,17 +197,17 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	int g, umask;
 	struct fdtable *fdt = NULL;
 	const struct cred *cred;
-	pid_t ppid, tpid;
+	pid_t ppid, tpid, vpid;
 
 	rcu_read_lock();
-	ppid = pid_alive(p) ?
-		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+	ppid = pid_alive(p) ? ve_task_ppid_nr_ns(p, ns) : 0;
 	tpid = 0;
 	if (pid_alive(p)) {
 		struct task_struct *tracer = ptrace_parent(p);
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
+	vpid = task_virtual_pid(p);
 	cred = get_task_cred(p);
 
 	umask = get_task_umask(p);
@@ -239,7 +254,32 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 			   from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
 	put_cred(cred);
 
+#ifdef CONFIG_PID_NS
+	seq_puts(m, "\nNStgid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_tgid_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSpid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_pid_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSpgid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_pgrp_nr_ns(p, pid->numbers[g].ns));
+	seq_puts(m, "\nNSsid:");
+	for (g = ns->level; g <= pid->level; g++)
+		seq_printf(m, "\t%d",
+			task_session_nr_ns(p, pid->numbers[g].ns));
+#endif
 	seq_putc(m, '\n');
+
+#ifdef CONFIG_VE
+	rcu_read_lock();
+	seq_printf(m, "envID:\t%s\nVPid:\t%d\n",
+			task_ve_name(p), vpid);
+	rcu_read_unlock();
+#endif
 }
 
 void render_sigset_t(struct seq_file *m, const char *header,
@@ -279,10 +319,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
 	}
 }
 
-static inline void task_sig(struct seq_file *m, struct task_struct *p)
+void task_sig(struct seq_file *m, struct task_struct *p)
 {
 	unsigned long flags;
-	sigset_t pending, shpending, blocked, ignored, caught;
+	sigset_t pending, shpending, blocked, ignored, caught, saved;
 	int num_threads = 0;
 	unsigned long qsize = 0;
 	unsigned long qlim = 0;
@@ -292,11 +332,13 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	sigemptyset(&blocked);
 	sigemptyset(&ignored);
 	sigemptyset(&caught);
+	sigemptyset(&saved);
 
 	if (lock_task_sighand(p, &flags)) {
 		pending = p->pending.signal;
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
+		saved = p->saved_sigmask;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
@@ -315,6 +357,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	render_sigset_t(m, "SigBlk:\t", &blocked);
 	render_sigset_t(m, "SigIgn:\t", &ignored);
 	render_sigset_t(m, "SigCgt:\t", &caught);
+	render_sigset_t(m, "SigSvd:\t", &saved);
 }
 
 static void render_cap_t(struct seq_file *m, const char *header,
@@ -359,6 +402,20 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_BEANCOUNTERS
+static inline void ub_dump_task_info(struct task_struct *tsk,
+		char *stsk, int ltsk, char *smm, int lmm)
+{
+	snprintf(stsk, ltsk, "%s", tsk->task_bc.task_ub->ub_name);
+	task_lock(tsk);
+	if (tsk->mm)
+		snprintf(smm, lmm, "%s", tsk->mm->mm_ub->ub_name);
+	else
+		strncpy(smm, "N/A", lmm);
+	task_unlock(tsk);
+}
+#endif
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -382,6 +439,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
+#ifdef CONFIG_BEANCOUNTERS
+	char tsk_ub_info[64], mm_ub_info[64];
+#endif
 
 	task_name(m, task);
 	task_state(m, ns, pid, task);
@@ -396,6 +456,14 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task,
+			tsk_ub_info, sizeof(tsk_ub_info),
+			mm_ub_info, sizeof(mm_ub_info));
+
+	seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info);
+	seq_printf(m, "MMUB:\t%s\n", mm_ub_info);
+#endif
 	return 0;
 }
 
@@ -419,6 +487,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
+	int is_super = ve_is_super(get_exec_env());
+#ifdef CONFIG_BEANCOUNTERS
+	char ub_task_info[64];
+	char ub_mm_info[64];
+#endif
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -476,7 +549,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		}
 
 		sid = task_session_nr_ns(task, ns);
-		ppid = task_tgid_nr_ns(task->real_parent, ns);
+		ppid = ve_task_ppid_nr_ns(task, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
@@ -501,9 +574,28 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time =
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
+#ifdef CONFIG_VE
+	if (!is_super) {
+		struct timespec *ve_start_ts =
+				&get_exec_env()->real_start_timespec;
+		start_time -=
+			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
+				+ ve_start_ts->tv_nsec;
+	}
+	/* Tasks inside a CT can have a negative start time, e.g. if the CT
+	 * was migrated from another hw node; report 0 in that case so as
+	 * not to confuse userspace. */
+	if ((s64)start_time < 0)
+		start_time = 0;
+#endif
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info),
+			ub_mm_info, sizeof(ub_mm_info));
+#endif
+
 	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
 	seq_put_decimal_ll(m, ' ', ppid);
 	seq_put_decimal_ll(m, ' ', pgid);
@@ -544,7 +636,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ll(m, ' ', task->exit_signal);
-	seq_put_decimal_ll(m, ' ', task_cpu(task));
+	seq_put_decimal_ll(m, ' ', is_super ? task_cpu(task) : task_vcpu_id(task));
 	seq_put_decimal_ull(m, ' ', task->rt_priority);
 	seq_put_decimal_ull(m, ' ', task->policy);
 	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
@@ -567,6 +659,18 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	else
 		seq_put_decimal_ll(m, ' ', 0);
 
+#ifdef CONFIG_VE
+	seq_printf(m, " %s", " 0 0 0 0 0");
+	seq_put_decimal_ll(m, ' ', task_pid_nr_ns(task, task_active_pid_ns(task)));
+	rcu_read_lock();
+	seq_printf(m, " %s", task_ve_name(task));
+	rcu_read_unlock();
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+	seq_printf(m, " %s", ub_task_info);
+	seq_printf(m, " %s", ub_mm_info);
+#endif
+
 	seq_putc(m, '\n');
 	if (mm)
 		mmput(mm);
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -87,6 +87,7 @@
 #include <linux/slab.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
+#include <linux/aio.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -608,10 +609,14 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
+	struct user_beancounter *ub = get_exec_ub();
+
+	if (ub != get_ub0())
+		totalpages = min(totalpages, ub_total_pages(ub, true));
 
 	tasklist_read_lock();
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
+		points = oom_badness(task, NULL, NULL, totalpages, NULL) *
 						1000 / totalpages;
 	qread_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
@@ -718,17 +723,36 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 static int proc_fd_access_allowed(struct inode *inode)
 {
 	struct task_struct *task;
-	int allowed = 0;
+	int err;
+
 	/* Allow access to a task's file descriptors if it is us or we
 	 * may use ptrace attach to the process and find out that
 	 * information.
 	 */
+	err = -ENOENT;
 	task = get_proc_task(inode);
 	if (task) {
-		allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+		if (task->flags & PF_KTHREAD)
+			/*
+			 * Always allow access to kernel threads' /proc entries.
+			 */
+			err = 0;
+		else if (ptrace_may_access(task, PTRACE_MODE_READ))
+			err = 0;
+		else
+			/*
+			 * ptrace_may_access() can play a trick on us here:
+			 * if the task is a zombie, it is considered not
+			 * dumpable at all and any ptracing in a VE is
+			 * denied. That is not a big deal for ptrace()
+			 * itself, but following the link would then fail
+			 * with -EACCES, which some software cannot cope
+			 * with and refuses to work :(
+			 */
+			err = (task->mm ? -EACCES : -ENOENT);
 		put_task_struct(task);
 	}
-	return allowed;
+	return err;
 }
 
 int proc_setattr(struct dentry *dentry, struct iattr *attr)
@@ -763,6 +787,14 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 	return ptrace_may_access(task, PTRACE_MODE_READ);
 }
 
+static bool is_visible_task(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	if (ns->hide_pidns == 1 && task_active_pid_ns(tsk) != ns)
+		return false;
+	if (!has_pid_permissions(ns, tsk, 2))
+		return false;
+	return true;
+}
 
 static int proc_pid_permission(struct inode *inode, int mask)
 {
@@ -1199,7 +1231,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 	if (!task)
 		return -ESRCH;
 	if (lock_task_sighand(task, &flags)) {
-		oom_score_adj = task->signal->oom_score_adj;
+		oom_score_adj = get_task_oom_score_adj(task);
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
@@ -1251,7 +1283,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	}
 
 	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
-			!capable(CAP_SYS_RESOURCE)) {
+			!ve_capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}
@@ -1640,10 +1672,11 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct path path;
-	int error = -EACCES;
+	int error;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
@@ -1682,12 +1715,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 
 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
 {
-	int error = -EACCES;
+	int error;
 	struct inode *inode = dentry->d_inode;
 	struct path path;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
@@ -2327,8 +2361,33 @@ static const struct file_operations proc_timers_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release_private,
 };
+
+
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+#ifdef CONFIG_VE
+static long proc_aio_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *task;
+	int ret;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	ret = ve_aio_ioctl(task, cmd, arg);
+
+	put_task_struct(task);
+
+	return ret;
+}
+
+static const struct file_operations proc_aio_operations = {
+	.unlocked_ioctl		= proc_aio_ioctl,
+};
+#endif /* CONFIG_VE */
+
 static struct dentry *proc_pident_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2950,6 +3009,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	REG("timers",	  S_IRUGO, proc_timers_operations),
+	REG("aio",	  S_IRUGO|S_IWUSR, proc_aio_operations),
 #endif
 };
 
@@ -3200,7 +3260,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		if (has_pid_permissions(ns, iter.task, 2))
+		if (is_visible_task(ns, iter.task))
 			__filldir = filldir;
 		else
 			__filldir = fake_filldir;
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -2,10 +2,12 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int cmdline_proc_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "%s\n", saved_command_line);
+	seq_printf(m, "%s\n",
+		ve_is_super(get_exec_env()) ? saved_command_line : "quiet");
 	return 0;
 }
 
@@ -23,7 +25,7 @@ static const struct file_operations cmdline_proc_fops = {
 
 static int __init proc_cmdline_init(void)
 {
-	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	proc_create("cmdline", S_ISVTX, NULL, &cmdline_proc_fops);
 	return 0;
 }
 module_init(proc_cmdline_init);
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -18,7 +18,7 @@ static const struct file_operations proc_cpuinfo_operations = {
 
 static int __init proc_cpuinfo_init(void)
 {
-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	proc_create("cpuinfo", S_ISVTX, NULL, &proc_cpuinfo_operations);
 	return 0;
 }
 module_init(proc_cpuinfo_init);
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -2,6 +2,7 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int devinfo_show(struct seq_file *f, void *v)
 {
@@ -64,7 +65,7 @@ static const struct file_operations proc_devinfo_operations = {
 
 static int __init proc_devices_init(void)
 {
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	proc_create("devices", S_ISVTX, NULL, &proc_devinfo_operations);
 	return 0;
 }
 module_init(proc_devices_init);
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -160,6 +160,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 
 		spin_lock(&files->file_lock);
 		fd_file = fcheck_files(files, fd);
+		ret = -EACCES;
 		if (fd_file) {
 			*path = fd_file->f_path;
 			path_get(&fd_file->f_path);
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -100,12 +100,21 @@ bool pde_subdir_insert(struct proc_dir_entry *dir,
 	return true;
 }
 
+bool proc_in_container(struct super_block *sb)
+{
+	return !ve_is_super(get_exec_env());
+}
+
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct proc_dir_entry *de = PDE(inode);
 	int error;
 
+	if (proc_in_container(dentry->d_sb) &&
+	    (iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)))
+		return -EPERM;
+
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		return error;
@@ -113,9 +122,14 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
 
-	de->uid = inode->i_uid;
-	de->gid = inode->i_gid;
-	de->mode = inode->i_mode;
+	if (iattr->ia_valid & ATTR_UID)
+		de->uid = inode->i_uid;
+	if (iattr->ia_valid & ATTR_GID)
+		de->gid = inode->i_gid;
+	if (iattr->ia_valid & ATTR_MODE)
+		de->mode = (de->mode & ~S_IRWXUGO) |
+			   (inode->i_mode & S_IRWXUGO);
+
 	return 0;
 }
 
@@ -259,10 +273,15 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		struct dentry *dentry)
 {
 	struct inode *inode;
+	bool in_container = proc_in_container(dentry->d_sb);
 
 	spin_lock(&proc_subdir_lock);
 	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
 	if (de) {
+		if (in_container && !(de->mode & S_ISVTX)) {
+			spin_unlock(&proc_subdir_lock);
+			return ERR_PTR(-ENOENT);
+		}
 		pde_get(de);
 		spin_unlock(&proc_subdir_lock);
 		inode = proc_get_inode(dir->i_sb, de);
@@ -298,6 +317,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	int i;
 	struct inode *inode = file_inode(filp);
 	int ret = 0;
+	bool in_container = proc_in_container(filp->f_path.dentry->d_sb);
 
 	ino = inode->i_ino;
 	i = filp->f_pos;
@@ -326,15 +346,22 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 					spin_unlock(&proc_subdir_lock);
 					goto out;
 				}
-				if (!i)
-					break;
+				if (!in_container || (de->mode & S_ISVTX)) {
+					if (!i)
+						break;
+					i--;
+				}
 				de = pde_subdir_next(de);
-				i--;
 			}
 
 			do {
 				struct proc_dir_entry *next;
 
+				if (in_container && !(de->mode & S_ISVTX)) {
+					de = pde_subdir_next(de);
+					continue;
+				}
+
 				/* filldir passes info to user space */
 				pde_get(de);
 				spin_unlock(&proc_subdir_lock);
@@ -353,7 +380,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	}
 	ret = 1;
 out:
-	return ret;	
+	return ret;
 }
 
 int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -461,13 +488,12 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	return ent;
 }
 
-struct proc_dir_entry *proc_symlink(const char *name,
+struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, const char *dest)
 {
 	struct proc_dir_entry *ent;
 
-	ent = __proc_create(&parent, name,
-			  (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
+	ent = __proc_create(&parent, name, S_IFLNK | mode, 1);
 
 	if (ent) {
 		ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
@@ -485,7 +511,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 	}
 	return ent;
 }
-EXPORT_SYMBOL(proc_symlink);
+EXPORT_SYMBOL(proc_symlink_mode);
 
 struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, void *data)
@@ -535,7 +561,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 		return NULL;
 	}
 
-	if ((mode & S_IALLUGO) == 0)
+	if ((mode & S_IRWXUGO) == 0)
 		mode |= S_IRUGO;
 	pde = __proc_create(&parent, name, mode, 1);
 	if (!pde)
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -103,7 +103,8 @@ void __init proc_init_inodecache(void)
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_PANIC),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+						SLAB_PANIC),
 					     init_once);
 }
 
@@ -116,6 +117,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
 	if (pid->hide_pid != 0)
 		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+	if (pid->hide_pidns)
+		seq_printf(seq, ",hidepidns=%u", pid->hide_pidns);
 
 	return 0;
 }
@@ -382,8 +385,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->pde = de;
 
-		if (de->mode) {
-			inode->i_mode = de->mode;
+		if (de->mode & (S_IFMT | S_IRWXUGO)) {
+			inode->i_mode = de->mode & (S_IFMT | S_IRWXUGO);
 			inode->i_uid = de->uid;
 			inode->i_gid = de->gid;
 		}
@@ -421,7 +424,7 @@ int proc_fill_super(struct super_block *s)
 	s->s_magic = PROC_SUPER_MAGIC;
 	s->s_op = &proc_sops;
 	s->s_time_gran = 1;
-	
+
 	pde_get(&proc_root);
 	root_inode = proc_get_inode(s, &proc_root);
 	if (!root_inode) {
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -35,33 +35,6 @@ struct mempolicy;
  * only, so, changes in this structure should be bypassed by kABI checker, and
  * such changes should not impact of procfs users.
  */
-struct proc_dir_entry {
-	unsigned int low_ino;
-	umode_t mode;
-	nlink_t nlink;
-	kuid_t uid;
-	kgid_t gid;
-	loff_t size;
-	const struct inode_operations *proc_iops;
-	const struct file_operations *proc_fops;
-#ifdef __GENKSYMS__
-	struct proc_dir_entry *next, *parent, *subdir;
-#else
-	struct proc_dir_entry *parent;
-	struct rb_root subdir;
-	struct rb_node subdir_node;
-#endif
-	void *data;
-	atomic_t count;		/* use count */
-	atomic_t in_use;	/* number of callers into module in progress; */
-			/* negative -> it's going away RSN */
-	struct completion *pde_unload_completion;
-	struct list_head pde_openers;	/* who did ->open, but not ->release */
-	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
-	u8 namelen;
-	char name[];
-};
-
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -13,11 +13,13 @@
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
 #include <linux/syslog.h>
+#include <linux/module.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
-extern wait_queue_head_t log_wait;
+extern void log_poll_wait(struct file *filp, poll_table *p);
 
 static int kmsg_open(struct inode * inode, struct file * file)
 {
@@ -41,7 +43,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &log_wait, wait);
+	log_poll_wait(file, wait);
 	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
 		return POLLIN | POLLRDNORM;
 	return 0;
@@ -58,7 +60,7 @@ static const struct file_operations proc_kmsg_operations = {
 
 static int __init proc_kmsg_init(void)
 {
-	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	proc_create("kmsg", S_IRUSR|S_ISVTX, NULL, &proc_kmsg_operations);
 	return 0;
 }
 module_init(proc_kmsg_init);
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -6,6 +6,7 @@
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
+#include <linux/ve.h>
 
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
@@ -13,6 +14,15 @@
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_loadavg(ve, m);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	get_avenrun(avnrun, FIXED_1/200, 0);
 
@@ -39,7 +49,7 @@ static const struct file_operations loadavg_proc_fops = {
 
 static int __init proc_loadavg_init(void)
 {
-	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	proc_create("loadavg", S_ISVTX, NULL, &loadavg_proc_fops);
 	return 0;
 }
 module_init(proc_loadavg_init);
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -10,19 +10,84 @@
 #include <linux/seq_file.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
 #include <linux/atomic.h>
 #include <linux/vmalloc.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <bc/beancounter.h>
 #include "internal.h"
 
 void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
 {
 }
 
-static int meminfo_proc_show(struct seq_file *m, void *v)
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
+static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
+{
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Shmem:          %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		,
+		K(mi->si->totalram),
+		K(mi->si->freeram),
+		K(mi->cached),
+		K(0L),
+		K(mi->pages[LRU_ACTIVE_ANON]   + mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_ACTIVE_ANON]),
+		K(mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_UNEVICTABLE]),
+		K(mi->locked),
+		K(mi->si->totalswap),
+		K(mi->si->freeswap),
+		K(mi->dirty_pages),
+		K(mi->writeback_pages),
+		K(mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->shmem),
+		K(mi->slab_reclaimable + mi->slab_unreclaimable),
+		K(mi->slab_reclaimable),
+		K(mi->slab_unreclaimable));
+
+	return 0;
+}
+
+#ifdef CONFIG_TCACHE
+extern unsigned long get_nr_tcache_pages(void);
+#endif
+#ifdef CONFIG_TSWAP
+extern unsigned long get_nr_tswap_pages(void);
+#endif
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val)
 {
+	int ret;
 	struct sysinfo i;
+	struct meminfo mi;
 	unsigned long committed;
 	struct vmalloc_info vmi;
 	long cached;
@@ -30,12 +95,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long pages[NR_LRU_LISTS];
 	int lru;
 
+	si_meminfo(&i);
+	si_swapinfo(&i);
+
+	memset(&mi, 0, sizeof(mi));
+	mi.si = &i;
+	mi.ub = ub;
+	mi.meminfo_val = meminfo_val;
+
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi);
+	if (ret & NOTIFY_FAIL)
+		return 0;
+	if (ret & NOTIFY_OK)
+		return meminfo_proc_show_mi(m, &mi);
+
 /*
  * display in kilobytes.
  */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
 
 	cached = global_page_state(NR_FILE_PAGES) -
@@ -105,6 +181,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		"AnonHugePages:  %8lu kB\n"
+#endif
+#ifdef CONFIG_TCACHE
+		"Tcache:         %8lu kB\n"
+#endif
+#ifdef CONFIG_TSWAP
+		"Tswap:          %8lu kB\n"
 #endif
 		,
 		K(i.totalram),
@@ -166,6 +248,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
 		   HPAGE_PMD_NR)
+#endif
+#ifdef CONFIG_TCACHE
+		,K(get_nr_tcache_pages())
+#endif
+#ifdef CONFIG_TSWAP
+		,K(get_nr_tswap_pages())
 #endif
 		);
 
@@ -177,6 +265,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #undef K
 }
 
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	return meminfo_proc_show_ub(m, v, mm_ub(current->mm),
+			get_exec_env()->meminfo_val);
+}
+
 static int meminfo_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, meminfo_proc_show, NULL);
@@ -191,7 +285,7 @@ static const struct file_operations meminfo_proc_fops = {
 
 static int __init proc_meminfo_init(void)
 {
-	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	proc_create("meminfo", S_ISVTX, NULL, &meminfo_proc_fops);
 	return 0;
 }
 module_init(proc_meminfo_init);
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -27,6 +27,7 @@ static const struct proc_ns_operations *ns_entries[] = {
 #endif
 #ifdef CONFIG_PID_NS
 	&pidns_operations,
+	&pidns_for_children_operations,
 #endif
 #ifdef CONFIG_USER_NS
 	&userns_operations,
@@ -150,6 +151,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	struct proc_inode *ei = PROC_I(inode);
 	const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
 	struct task_struct *task;
+	const char *link_name;
 	void *ns;
 	char name[50];
 	int len = -EACCES;
@@ -166,7 +168,8 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (!ns)
 		goto out_put_task;
 
-	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
+	link_name = ns_ops->real_ns_name ? : ns_ops->name;
+	snprintf(name, sizeof(name), "%s:[%u]", link_name, ns_ops->inum(ns));
 	len = strlen(name);
 
 	if (len > buflen)
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -215,10 +216,62 @@ static const struct file_operations proc_kpageflags_operations = {
 	.read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 ino;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		else
+			ppage = NULL;
+
+		if (ppage)
+			ino = page_cgroup_ino(ppage);
+		else
+			ino = 0;
+
+		if (put_user(ino, out)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn++;
+		out++;
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+	proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
 	return 0;
 }
 module_init(proc_page_init);
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -183,6 +183,15 @@ const struct file_operations proc_net_operations = {
 	.readdir	= proc_tgid_net_readdir,
 };
 
+struct proc_dir_entry *proc_net_create_data(const char *name, umode_t mode,
+					    struct proc_dir_entry *parent,
+					    const struct file_operations *fops,
+					    void *data)
+{
+	return proc_create_data(name, S_ISVTX | mode, parent, fops, data);
+}
+EXPORT_SYMBOL_GPL(proc_net_create_data);
+
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *netd, *net_statd;
@@ -228,7 +237,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
 
 int __init proc_net_init(void)
 {
-	proc_symlink("net", NULL, "self/net");
+	proc_symlink_mode("net", S_ISVTX | S_IRWXUGO, NULL, "self/net");
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/nsproxy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include "internal.h"
@@ -35,6 +36,16 @@ static struct ctl_table root_table[] = {
 	},
 	{ }
 };
+
+static int sysctl_root_permissions(struct ctl_table_header *head,
+		struct ctl_table *table)
+{
+	if (ve_is_super(get_exec_env()) || (table->mode & S_ISVTX))
+		return table->mode;
+
+	return table->mode & ~S_IWUGO;
+}
+
 static struct ctl_table_root sysctl_table_root = {
 	.default_set.dir.header = {
 		{{.count = 1,
@@ -44,6 +55,7 @@ static struct ctl_table_root sysctl_table_root = {
 		.root = &sysctl_table_root,
 		.set = &sysctl_table_root.default_set,
 	},
+	.permissions = sysctl_root_permissions,
 };
 
 static DEFINE_SPINLOCK(sysctl_lock);
@@ -410,7 +422,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	ei->sysctl_entry = table;
 
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_mode = table->mode;
+	inode->i_mode = table->mode & S_IRWXUGO;
 	if (!S_ISDIR(table->mode)) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
@@ -747,13 +759,23 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 	struct inode *inode = dentry->d_inode;
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table_root *root;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
+	root = head->root;
+
 	generic_fillattr(inode, stat);
-	if (table)
-		stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+	if (table) {
+		umode_t mode = table->mode;
+
+		if (root->permissions)
+			mode = root->permissions(head, table);
+
+		stat->mode = (stat->mode & S_IFMT) | (mode & S_IRWXUGO);
+	}
 
 	sysctl_head_finish(head);
 	return 0;
@@ -1023,11 +1045,13 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 				err = sysctl_err(path, table, "No data");
 			if (!table->maxlen)
 				err = sysctl_err(path, table, "No maxlen");
+			if (table->mode & S_ISVTX)
+				err = sysctl_err(path, table, "Unsafe v12n");
 		}
 		if (!table->proc_handler)
 			err = sysctl_err(path, table, "No proc_handler");
 
-		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+		if ((table->mode & (S_IRUGO|S_IWUGO|S_ISVTX)) != table->mode)
 			err = sysctl_err(path, table, "bogus .mode 0%o",
 				table->mode);
 	}
@@ -1590,7 +1614,7 @@ int __init proc_sys_init(void)
 {
 	struct proc_dir_entry *proc_sys_root;
 
-	proc_sys_root = proc_mkdir("sys", NULL);
+	proc_sys_root = proc_mkdir_mode("sys", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 	proc_sys_root->proc_iops = &proc_sys_dir_operations;
 	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
 	proc_sys_root->nlink = 0;
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -39,11 +39,12 @@ static int proc_set_super(struct super_block *sb, void *data)
 }
 
 enum {
-	Opt_gid, Opt_hidepid, Opt_err,
+	Opt_gid, Opt_hidepid, Opt_hidepidns, Opt_err,
 };
 
 static const match_table_t tokens = {
 	{Opt_hidepid, "hidepid=%u"},
+	{Opt_hidepidns, "hidepidns=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_err, NULL},
 };
@@ -79,6 +80,15 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
 			}
 			pid->hide_pid = option;
 			break;
+		case Opt_hidepidns:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 1) {
+				pr_err("proc: hidepidns value must be between 0 and 1.\n");
+				return 0;
+			}
+			pid->hide_pidns = option;
+			break;
 		default:
 			pr_err("proc: unrecognized mount option \"%s\" "
 			       "or missing value\n", p);
@@ -125,6 +135,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	}
 
 	if (!sb->s_root) {
+		sb->s_flags &= ~MS_RDONLY;
+
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
@@ -152,7 +164,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 void __init proc_root_init(void)
@@ -165,16 +177,17 @@ void __init proc_root_init(void)
 		return;
 
 	proc_self_init();
-	proc_symlink("mounts", NULL, "self/mounts");
+	proc_symlink_mode("mounts", S_ISVTX | S_IRWXUGO, NULL, "self/mounts");
 
 	proc_net_init();
 
 #ifdef CONFIG_SYSVIPC
-	proc_mkdir("sysvipc", NULL);
+	proc_mkdir_mode("sysvipc", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 #endif
-	proc_mkdir("fs", NULL);
+	proc_mkdir_mode("fs", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 	proc_mkdir("driver", NULL);
-	proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
+	/* somewhere for the nfsd filesystem to be mounted */
+	proc_mkdir_mode("fs/nfsd", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
 	/* just give it a mountpoint */
 	proc_mkdir("openprom", NULL);
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -57,7 +57,7 @@ int proc_setup_self(struct super_block *s)
 		if (inode) {
 			inode->i_ino = self_inum;
 			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_mode = S_IFLNK | S_IRWXUGO | S_ISVTX;
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_self_inode_operations;
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -4,13 +4,15 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/irqnr.h>
 #include <linux/cputime.h>
 #include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/ve.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -87,12 +89,22 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
+	struct ve_struct *ve;
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_cpu_stat(ve, p);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = 0;
 	guest = guest_nice = 0;
-	getboottime(&boottime);
-	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
 		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -200,7 +212,7 @@ static const struct file_operations proc_stat_operations = {
 
 static int __init proc_stat_init(void)
 {
-	proc_create("stat", 0, NULL, &proc_stat_operations);
+	proc_create("stat", S_ISVTX, NULL, &proc_stat_operations);
 	return 0;
 }
 module_init(proc_stat_init);
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -443,7 +444,6 @@ struct mem_size_stats {
 	unsigned long anonymous;
 	unsigned long anonymous_thp;
 	unsigned long swap;
-	unsigned long nonlinear;
 	u64 pss;
 	bool check_shmem_swap;
 };
@@ -466,7 +466,6 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = mss->vma;
-	pgoff_t pgoff = linear_page_index(vma, addr);
 	struct page *page = NULL;
 	int mapcount;
 
@@ -481,9 +480,6 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 			page = migration_entry_to_page(swpent);
 		else if (is_hmm_entry(swpent))
 			page = hmm_entry_to_page(swpent);
-	} else if (pte_file(ptent)) {
-		if (pte_to_pgoff(ptent) != pgoff)
-			mss->nonlinear += ptent_size;
 	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 			    && pte_none(ptent))) {
 		/* We shouldn't encounter huge pages here */
@@ -507,12 +503,9 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 	if (PageAnon(page))
 		mss->anonymous += ptent_size;
 
-	if (page->index != pgoff)
-		mss->nonlinear += ptent_size;
-
 	mss->resident += ptent_size;
 	/* Accumulate the size in pages that have been accessed. */
-	if (pte_young(ptent) || PageReferenced(page))
+	if (pte_young(ptent) || page_is_young(page) || PageReferenced(page))
 		mss->referenced += ptent_size;
 	mapcount = page_mapcount(page);
 	if (mapcount >= 2) {
@@ -594,7 +587,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
-		[ilog2(VM_NONLINEAR)]	= "nl",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_DONTDUMP)]	= "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
@@ -693,10 +685,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   (vma->vm_flags & VM_LOCKED) ?
 			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
-	if (vma->vm_flags & VM_NONLINEAR)
-		seq_printf(m, "Nonlinear:      %8lu kB\n",
-				mss.nonlinear >> 10);
-
 	show_smap_vma_flags(m, vma);
 	m_cache_vma(m, vma);
 	return 0;
@@ -797,8 +785,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
-	} else if (pte_file(ptent)) {
-		ptent = pte_file_clear_soft_dirty(ptent);
 	}
 
 	set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -836,6 +822,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
+		test_and_clear_page_young(page);
 		ClearPageReferenced(page);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
@@ -921,6 +908,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				continue;
 			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
 				continue;
+			if (type == CLEAR_REFS_SOFT_DIRTY) {
+				if (vma->vm_flags & VM_SOFTDIRTY)
+					vma->vm_flags &= ~VM_SOFTDIRTY;
+			}
 			walk_page_range(vma->vm_start, vma->vm_end,
 					&clear_refs_walk);
 		}
@@ -1306,6 +1297,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 static int pagemap_open(struct inode *inode, struct file *file)
 {
+	/* do not disclose physical addresses: attack vector */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
 			"to stop being page-shift some time soon. See the "
 			"linux/Documentation/vm/pagemap.txt for details.\n");
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,16 +1,15 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/kernel_stat.h>
 #include <linux/cputime.h>
+#include <linux/ve.h>
+#include <linux/cgroup.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static inline void get_ve0_idle(struct timespec *idle)
 {
-	struct timespec uptime;
-	struct timespec idle;
 	u64 idletime;
 	u64 nsec;
 	u32 rem;
@@ -20,10 +19,38 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	for_each_possible_cpu(i)
 		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
-	get_monotonic_boottime(&uptime);
 	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
-	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
-	idle.tv_nsec = rem;
+	idle->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+	idle->tv_nsec = rem;
+}
+
+static inline void get_veX_idle(struct ve_struct *ve, struct timespec *idle)
+{
+	struct kernel_cpustat kstat;
+
+	ve_get_cpu_stat(ve, &kstat);
+	cputime_to_timespec(kstat.cpustat[CPUTIME_IDLE], idle);
+}
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+	struct ve_struct *ve = get_exec_env();
+
+	if (ve_is_super(ve))
+		get_ve0_idle(&idle);
+	else
+		get_veX_idle(ve, &idle);
+
+	get_monotonic_boottime(&uptime);
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env())) {
+		set_normalized_timespec(&uptime,
+			uptime.tv_sec - get_exec_env()->real_start_timespec.tv_sec,
+			uptime.tv_nsec - get_exec_env()->real_start_timespec.tv_nsec);
+	}
+#endif
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
@@ -46,7 +73,7 @@ static const struct file_operations uptime_proc_fops = {
 
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	proc_create("uptime", S_ISVTX, NULL, &uptime_proc_fops);
 	return 0;
 }
 module_init(proc_uptime_init);
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -28,7 +28,7 @@ static const struct file_operations version_proc_fops = {
 
 static int __init proc_version_init(void)
 {
-	proc_create("version", 0, NULL, &version_proc_fops);
+	proc_create("version", S_ISVTX, NULL, &version_proc_fops);
 	return 0;
 }
 module_init(proc_version_init);
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -382,7 +382,7 @@ static int init_inodecache(void)
 	qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
 					     sizeof(struct qnx4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (qnx4_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -642,7 +642,7 @@ static int init_inodecache(void)
 	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
 					     sizeof(struct qnx6_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (!qnx6_inode_cachep)
 		return -ENOMEM;
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -27,7 +27,7 @@ config QUOTA_NETLINK_INTERFACE
 config PRINT_QUOTA_WARNING
 	bool "Print quota warnings to console (OBSOLETE)"
 	depends on QUOTA
-	default y
+	default n
 	help
 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
 	  hardlimit, etc.) will be printed to the process' controlling terminal.
@@ -42,6 +42,15 @@ config QUOTA_DEBUG
 	  If you say Y here, quota subsystem will perform some additional
 	  sanity checks of quota internal structures. If unsure, say N.
 
+config QUOTA_COMPAT
+	bool "Compatibility with older quotactl interface"
+	depends on QUOTA
+	help
+	  This option enables a compatibility layer for the older version
+	  of the quotactl interface with byte granularity (QUOTAON at 0x0100,
+	  GETQUOTA at 0x0D00).  Interface versions older than that one,
+	  which use block granularity, are still not supported.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	 tristate
--- a/fs/quota/compat.c
+++ b/fs/quota/compat.c
@@ -51,6 +51,11 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 	compat_uint_t data;
 	u16 xdata;
 	long ret;
+#ifdef CONFIG_QUOTA_COMPAT
+	struct compat_dqblk __user *cdq;
+	struct compat_compat_dqblk __user *compat_cdq;
+	compat_time_t time;
+#endif
 
 	cmds = cmd >> SUBCMDSHIFT;
 
@@ -111,6 +116,43 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 			break;
 		ret = 0;
 		break;
+#ifdef CONFIG_QUOTA_COMPAT
+	case QC_GETQUOTA:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		if (ret)
+			break;
+		ret = -EFAULT;
+		if (copy_in_user(compat_cdq, cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &cdq->dqb_btime) ||
+			put_user(time, &compat_cdq->dqb_btime) ||
+			get_user(time, &cdq->dqb_itime) ||
+			put_user(time, &compat_cdq->dqb_itime))
+			break;
+		ret = 0;
+		break;
+	case QC_SETQUOTA:
+	case QC_SETUSE:
+	case QC_SETQLIM:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = -EFAULT;
+		if (copy_in_user(cdq, compat_cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &compat_cdq->dqb_btime) ||
+			put_user(time, &cdq->dqb_btime) ||
+			get_user(time, &compat_cdq->dqb_itime) ||
+			put_user(time, &cdq->dqb_itime))
+			break;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		break;
+#endif
 	default:
 		ret = sys_quotactl(cmd, special, id, addr);
 	}
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -246,7 +246,6 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
 
-static qsize_t inode_get_rsv_space(struct inode *inode);
 static void __dquot_initialize(struct inode *inode, int type);
 
 static inline unsigned int
@@ -684,45 +683,39 @@ int dquot_quota_sync(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_quota_sync);
 
-/* Free unused dquots from cache */
-static void prune_dqcache(int count)
+static unsigned long
+dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct list_head *head;
 	struct dquot *dquot;
+	unsigned long freed = 0;
 
+	spin_lock(&dq_list_lock);
 	head = free_dquots.prev;
-	while (head != &free_dquots && count) {
+	while (head != &free_dquots && sc->nr_to_scan) {
 		dquot = list_entry(head, struct dquot, dq_free);
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
 		do_destroy_dquot(dquot);
-		count--;
+		sc->nr_to_scan--;
+		freed++;
 		head = free_dquots.prev;
 	}
+	spin_unlock(&dq_list_lock);
+	return freed;
 }
 
-/*
- * This is called from kswapd when we think we need some
- * more memory
- */
-static int shrink_dqcache_memory(struct shrinker *shrink,
-				 struct shrink_control *sc)
+static unsigned long
+dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-
-	if (nr) {
-		spin_lock(&dq_list_lock);
-		prune_dqcache(nr);
-		spin_unlock(&dq_list_lock);
-	}
-	return ((unsigned)
-		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
-		/100) * sysctl_vfs_cache_pressure;
+	return vfs_pressure_ratio(
+	percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
 }
 
 static struct shrinker dqcache_shrinker = {
-	.shrink = shrink_dqcache_memory,
+	.count_objects = dqcache_shrink_count,
+	.scan_objects = dqcache_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -1505,13 +1498,14 @@ EXPORT_SYMBOL(dquot_drop);
  * inode_reserved_space is managed internally by quota, and protected by
  * i_lock similar to i_blocks+i_bytes.
  */
-static qsize_t *inode_reserved_space(struct inode * inode)
+qsize_t *inode_reserved_space(struct inode * inode)
 {
 	/* Filesystem must explicitly define it's own method in order to use
 	 * quota reservation interface */
 	BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
 	return inode->i_sb->dq_op->get_reserved_space(inode);
 }
+EXPORT_SYMBOL(inode_reserved_space);
 
 void inode_add_rsv_space(struct inode *inode, qsize_t number)
 {
@@ -1547,7 +1541,7 @@ void inode_sub_rsv_space(struct inode *inode, qsize_t number)
 }
 EXPORT_SYMBOL(inode_sub_rsv_space);
 
-static qsize_t inode_get_rsv_space(struct inode *inode)
+qsize_t inode_get_rsv_space(struct inode *inode)
 {
 	qsize_t ret;
 
@@ -1558,8 +1552,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 	return ret;
 }
+EXPORT_SYMBOL(inode_get_rsv_space);
 
-static void inode_incr_space(struct inode *inode, qsize_t number,
+void inode_incr_space(struct inode *inode, qsize_t number,
 				int reserve)
 {
 	if (reserve)
@@ -1567,14 +1562,16 @@ static void inode_incr_space(struct inode *inode, qsize_t number,
 	else
 		inode_add_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_incr_space);
 
-static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 {
 	if (reserve)
 		inode_sub_rsv_space(inode, number);
 	else
 		inode_sub_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_decr_space);
 
 /*
  * This functions updates i_blocks+i_bytes fields and quota information
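
The hunk above converts the dquot cache shrinker from the old single ->shrink callback to the split ->count_objects/->scan_objects interface: the count callback only reports how much is reclaimable, while the scan callback does the actual freeing and returns the number of objects destroyed (or SHRINK_STOP). A minimal sketch of the same pattern for a hypothetical object cache follows; the cache, lock and helper names are illustrative and not part of the patch.

	#include <linux/list.h>
	#include <linux/shrinker.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	/* Hypothetical cache of reclaimable objects kept on a free list. */
	struct example_obj {
		struct list_head free_node;
	};

	static LIST_HEAD(example_free_list);
	static DEFINE_SPINLOCK(example_lock);
	static unsigned long example_nr_free;

	/* count_objects() only reports how many objects could be freed. */
	static unsigned long example_shrink_count(struct shrinker *shrink,
						  struct shrink_control *sc)
	{
		return example_nr_free;
	}

	/* scan_objects() frees up to sc->nr_to_scan objects and returns the number freed. */
	static unsigned long example_shrink_scan(struct shrinker *shrink,
						 struct shrink_control *sc)
	{
		struct example_obj *obj;
		unsigned long freed = 0;

		spin_lock(&example_lock);
		while (!list_empty(&example_free_list) && sc->nr_to_scan) {
			obj = list_entry(example_free_list.prev,
					 struct example_obj, free_node);
			list_del(&obj->free_node);
			example_nr_free--;
			kfree(obj);
			sc->nr_to_scan--;
			freed++;
		}
		spin_unlock(&example_lock);
		return freed;
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_shrink_count,
		.scan_objects	= example_shrink_scan,
		.seeks		= DEFAULT_SEEKS,
	};

The shrinker would then be registered with register_shrinker(&example_shrinker), just as dqcache_shrinker is registered from dquot_init().
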
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/types.h>
 #include <linux/writeback.h>
+#include <linux/compat.h>
 
 static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
 				     qid_t id)
@@ -38,7 +39,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
 			break;
 		/*FALLTHROUGH*/
 	default:
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			return -EPERM;
 	}
 
@@ -459,6 +460,181 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
 #endif
 }
 
+#ifdef CONFIG_QUOTA_COMPAT
+
+asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
+
+static long compat_quotactl(unsigned int cmds, unsigned int type,
+		const char __user *special, qid_t id,
+		void __user *addr)
+{
+	struct super_block *sb;
+	long ret;
+
+	sb = NULL;
+	switch (cmds) {
+		case QC_QUOTAON:
+			return sys_quotactl(QCMD(Q_QUOTAON, type),
+					special, id, addr);
+
+		case QC_QUOTAOFF:
+			return sys_quotactl(QCMD(Q_QUOTAOFF, type),
+					special, id, addr);
+
+		case QC_SYNC:
+			return sys_quotactl(QCMD(Q_SYNC, type),
+					special, id, addr);
+
+		case QC_GETQUOTA: {
+			struct if_dqblk idq;
+			struct fs_disk_quota fdq;
+			struct compat_dqblk cdq;
+			struct kqid qid;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			qid = make_kqid(current_user_ns(), type, id);
+			ret = -EINVAL;
+			if (!qid_valid(qid))
+				break;
+			ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
+			copy_to_if_dqblk(&idq, &fdq);
+			if (ret)
+				break;
+			memset(&cdq, 0, sizeof(cdq));
+			cdq.dqb_ihardlimit = fdq.d_ino_hardlimit;
+			cdq.dqb_isoftlimit = fdq.d_ino_softlimit;
+			cdq.dqb_curinodes = fdq.d_icount;
+			cdq.dqb_bhardlimit = fdq.d_blk_hardlimit;
+			cdq.dqb_bsoftlimit = fdq.d_blk_softlimit;
+			cdq.dqb_curspace = fdq.d_bcount;
+			cdq.dqb_btime = fdq.d_btimer;
+			cdq.dqb_itime = fdq.d_itimer;
+			ret = 0;
+			if (copy_to_user(addr, &cdq, sizeof(cdq)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETQUOTA:
+		case QC_SETUSE:
+		case QC_SETQLIM: {
+			struct if_dqblk idq;
+			struct fs_disk_quota fdq;
+			struct compat_dqblk cdq;
+			struct kqid qid;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cdq, addr, sizeof(cdq)))
+				break;
+			qid = make_kqid(current_user_ns(), type, id);
+			ret = -EINVAL;
+			if (!qid_valid(qid))
+				break;
+			idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
+			idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
+			idq.dqb_curinodes = cdq.dqb_curinodes;
+			idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
+			idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
+			idq.dqb_curspace = cdq.dqb_curspace;
+			idq.dqb_valid = 0;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
+				idq.dqb_valid |= QIF_LIMITS;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
+				idq.dqb_valid |= QIF_USAGE;
+			copy_from_if_dqblk(&fdq, &idq);
+			ret = sb->s_qcop->set_dqblk(sb, qid, &fdq);
+			break;
+		}
+
+		case QC_GETINFO: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			ret = sb->s_qcop->get_info(sb, type, &iinf);
+			if (ret)
+				break;
+
+			memset(&cinf, 0, sizeof(cinf));
+			cinf.dqi_bgrace = iinf.dqi_bgrace;
+			cinf.dqi_igrace = iinf.dqi_igrace;
+			if (iinf.dqi_flags & DQF_INFO_DIRTY)
+				cinf.dqi_flags |= 0x0010;
+			ret = 0;
+			if (copy_to_user(addr, &cinf, sizeof(cinf)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETINFO:
+		case QC_SETGRACE:
+		case QC_SETFLAGS: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cinf, addr, sizeof(cinf)))
+				break;
+			iinf.dqi_bgrace = cinf.dqi_bgrace;
+			iinf.dqi_igrace = cinf.dqi_igrace;
+			iinf.dqi_flags = cinf.dqi_flags;
+			iinf.dqi_valid = 0;
+			if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
+				iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
+			if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
+				iinf.dqi_valid |= IIF_FLAGS;
+			ret = sb->s_qcop->set_info(sb, type, &iinf);
+			break;
+		}
+
+		case QC_GETSTATS: {
+			struct compat_dqstats stat;
+
+			memset(&stat, 0, sizeof(stat));
+			stat.version = 6*10000+5*100+0;
+			ret = 0;
+			if (copy_to_user(addr, &stat, sizeof(stat)))
+				ret = -EFAULT;
+			break;
+		}
+
+		default:
+			ret = -ENOSYS;
+			break;
+	}
+	if (sb && !IS_ERR(sb))
+		drop_super(sb);
+	return ret;
+}
+
+#endif
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
@@ -476,6 +652,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
+#ifdef CONFIG_QUOTA_COMPAT
+	if (cmds >= 0x0100 && cmds < 0x3000)
+		return compat_quotactl(cmds, type, special, id, addr);
+#endif
+
 	/*
 	 * As a special case Q_SYNC can be called without a specific device.
 	 * It will iterate all superblocks that have quota enabled and call
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -26,6 +26,8 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+#include <bc/beancounter.h>
+
 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
 typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t);
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -600,7 +600,8 @@ static int init_inodecache(void)
 						  sizeof(struct
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
-							SLAB_MEM_SPREAD),
+						      SLAB_MEM_SPREAD|
+						      SLAB_ACCOUNT),
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
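
This and several later hunks add SLAB_ACCOUNT to filesystem inode caches so that objects allocated from them are charged to the memory cgroup of the allocating task, in addition to the reclaim accounting already requested by SLAB_RECLAIM_ACCOUNT. A minimal sketch of the pattern for a hypothetical filesystem (all names are illustrative):

	#include <linux/slab.h>

	struct examplefs_inode_info {
		unsigned long	i_flags;
		/* ... filesystem specific fields ... */
	};

	static struct kmem_cache *examplefs_inode_cachep;

	static void examplefs_init_once(void *p)
	{
		struct examplefs_inode_info *ei = p;

		ei->i_flags = 0;
	}

	static int __init examplefs_init_inodecache(void)
	{
		/* SLAB_ACCOUNT charges objects from this cache to the caller's memcg. */
		examplefs_inode_cachep = kmem_cache_create("examplefs_inode_cache",
					sizeof(struct examplefs_inode_info), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
					SLAB_ACCOUNT, examplefs_init_once);
		if (!examplefs_inode_cachep)
			return -ENOMEM;
		return 0;
	}
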
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -286,13 +286,9 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			error = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (error < 0)
+			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (error)
 				return error;
-			else {
-				if (error == 0)
-					acl = NULL;
-			}
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -623,8 +623,8 @@ static int __init init_romfs_fs(void)
 	romfs_inode_cachep =
 		kmem_cache_create("romfs_i",
 				  sizeof(struct romfs_inode_info), 0,
-				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				  romfs_i_init_once);
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				  SLAB_ACCOUNT, romfs_i_init_once);
 
 	if (!romfs_inode_cachep) {
 		printk(KERN_ERR
--- a/fs/select.c
+++ b/fs/select.c
@@ -32,7 +32,6 @@
 
 #include <asm/uaccess.h>
 
-
 /*
  * Estimate expected accuracy in ns from a timeval.
  *
@@ -578,7 +577,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	if (size > sizeof(stack_fds) / 6) {
 		/* Not enough space in on-stack array; must use kmalloc */
 		ret = -ENOMEM;
-		bits = kmalloc(6 * size, GFP_KERNEL);
+		bits = kmalloc(6 * size, GFP_KERNEL_ACCOUNT);
 		if (!bits)
 			goto out_nofds;
 	}
@@ -901,7 +900,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
 		len = min(todo, POLLFD_PER_PAGE);
 		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
-		walk = walk->next = kmalloc(size, GFP_KERNEL);
+		walk = walk->next = kmalloc(size, GFP_KERNEL_ACCOUNT);
 		if (!walk) {
 			err = -ENOMEM;
 			goto out_fds;
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -28,9 +28,9 @@ static void *seq_buf_alloc(unsigned long size)
 {
 	void *buf;
 
-	buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	buf = kmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!buf && size > PAGE_SIZE)
-		buf = vmalloc(size);
+		buf = vmalloc_account(size);
 	return buf;
 }
 
@@ -61,7 +61,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	struct seq_file *p = file->private_data;
 
 	if (!p) {
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		p = kmalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
 		if (!p)
 			return -ENOMEM;
 		file->private_data = p;
@@ -604,7 +604,7 @@ static void single_stop(struct seq_file *p, void *v)
 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		void *data)
 {
-	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
 	int res = -ENOMEM;
 
 	if (op) {
@@ -666,7 +666,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
 	void *private;
 	struct seq_file *seq;
 
-	private = kzalloc(psize, GFP_KERNEL);
+	private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
 	if (private == NULL)
 		goto out;
 
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,6 +33,7 @@
 #include <linux/socket.h>
 #include <linux/compat.h>
 #include "internal.h"
+#include <linux/virtinfo.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -103,6 +104,7 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 	int err;
 
 	if (!PageUptodate(page)) {
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 		lock_page(page);
 
 		/*
@@ -340,6 +342,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
+	while (spd.nr_pages < nr_pages && mapping->i_peer_file) {
+		page = pick_peer_page(mapping, index, &in->f_ra,
+				      req_pages - spd.nr_pages);
+		if (!page)
+			break;
+		spd.pages[spd.nr_pages++] = page;
+		index++;
+	}
+
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
 	 * readahead/allocate the rest and fill in the holes.
@@ -409,6 +420,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
+			virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 			lock_page(page);
 
 			/*
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -417,7 +417,8 @@ static int __init init_inodecache(void)
 {
 	squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
 		sizeof(struct squashfs_inode_info), 0,
-		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+		init_once);
 
 	return squashfs_inode_cachep ? 0 : -ENOMEM;
 }
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
+#include <linux/device_cgroup.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
@@ -46,28 +47,37 @@ static int calculate_f_flags(struct vfsmount *mnt)
 		flags_by_sb(mnt->mnt_sb->s_flags);
 }
 
-static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+static int statfs_by_sb(struct super_block *sb, struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval;
 
-	if (!dentry->d_sb->s_op->statfs)
+	if (!sb->s_op->statfs)
 		return -ENOSYS;
 
 	memset(buf, 0, sizeof(*buf));
-	retval = security_sb_statfs(dentry);
-	if (retval)
-		return retval;
-	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	retval = sb->s_op->statfs(dentry, buf);
 	if (retval == 0 && buf->f_frsize == 0)
 		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	retval = security_sb_statfs(dentry);
+	if (!retval)
+		retval = statfs_by_sb(dentry->d_sb, dentry, buf);
+	return retval;
+}
+
 int vfs_statfs(struct path *path, struct kstatfs *buf)
 {
 	int error;
 
-	error = statfs_by_dentry(path->dentry, buf);
+	error = security_sb_statfs(path->dentry);
+	if (!error)
+		error = statfs_by_sb(path->mnt->mnt_sb, path->dentry, buf);
 	if (!error)
 		buf->f_flags = calculate_f_flags(path->mnt);
 	return error;
@@ -227,9 +237,16 @@ int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
 
 SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(kdev, &sbuf);
 	if (err)
 		return err;
 
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 static int thaw_super_locked(struct super_block *sb);
@@ -48,6 +49,25 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_internal",
 };
 
+static bool dcache_is_low(struct mem_cgroup *memcg)
+{
+	unsigned long anon, file, dcache;
+	int vfs_cache_min_ratio = READ_ONCE(sysctl_vfs_cache_min_ratio);
+
+	if (vfs_cache_min_ratio <= 0)
+		return false;
+
+	if (memcg)
+		return mem_cgroup_dcache_is_low(memcg, vfs_cache_min_ratio);
+
+	anon = global_page_state(NR_ANON_PAGES);
+	file = global_page_state(NR_FILE_PAGES);
+	dcache = global_page_state(NR_SLAB_RECLAIMABLE);
+
+	return dcache / vfs_cache_min_ratio <
+			(anon + file + dcache) / 100;
+}
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -55,11 +75,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long super_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
 {
 	struct super_block *sb;
-	int	fs_objects = 0;
-	int	total_objects;
+	long	fs_objects = 0;
+	long	total_objects;
+	long	freed = 0;
+	long	dentries;
+	long	inodes;
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -67,43 +91,68 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
 	 * to recurse into the FS that called us in clear_inode() and friends..
 	 */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	if (!trylock_super(sb))
+		return SHRINK_STOP;
 
 	if (sb->s_op && sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb);
-
-	total_objects = sb->s_nr_dentry_unused +
-			sb->s_nr_inodes_unused + fs_objects + 1;
-
-	if (sc->nr_to_scan) {
-		int	dentries;
-		int	inodes;
-
-		/* proportion the scan between the caches */
-		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
-							total_objects;
-		inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
-							total_objects;
-		if (fs_objects)
-			fs_objects = (sc->nr_to_scan * fs_objects) /
-							total_objects;
-		/*
-		 * prune the dcache first as the icache is pinned by it, then
-		 * prune the icache, followed by the filesystem specific caches
-		 */
-		prune_dcache_sb(sb, dentries);
-		prune_icache_sb(sb, inodes);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-		if (fs_objects && sb->s_op->free_cached_objects) {
-			sb->s_op->free_cached_objects(sb, fs_objects);
-			fs_objects = sb->s_op->nr_cached_objects(sb);
-		}
-		total_objects = sb->s_nr_dentry_unused +
-				sb->s_nr_inodes_unused + fs_objects;
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects = dentries + inodes + fs_objects + 1;
+
+	/* proportion the scan between the caches */
+	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
+
+	/*
+	 * prune the dcache first as the icache is pinned by it, then
+	 * prune the icache, followed by the filesystem specific caches
+	 */
+	sc->nr_to_scan = dentries;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes;
+	freed += prune_icache_sb(sb, sc);
+
+	if (fs_objects) {
+		sc->nr_to_scan = fs_objects;
+		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
+	up_read(&sb->s_umount);
+	return freed;
+}
+
+static unsigned long super_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct super_block *sb;
+	long	total_objects = 0;
+
+	if (!sc->for_drop_caches && dcache_is_low(sc->memcg))
+		return 0;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Don't call trylock_super as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Calling super_cache_count with shrinker_rwsem held
+	 * ensures that list_lru_shrink_count() and
+	 * s_op->nr_cached_objects() are safe to call.
+	 */
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+
+	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
 }
 
@@ -116,6 +165,10 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 static void destroy_super(struct super_block *s)
 {
 	int i;
+
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
+
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
@@ -142,6 +195,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	if (!s)
 		return NULL;
 
+	INIT_LIST_HEAD(&s->s_mounts);
+
 	if (security_sb_alloc(s))
 		goto fail;
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
@@ -158,10 +213,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
-	INIT_LIST_HEAD(&s->s_dentry_lru);
-	INIT_LIST_HEAD(&s->s_inode_lru);
-	spin_lock_init(&s->s_inode_lru_lock);
-	INIT_LIST_HEAD(&s->s_mounts);
+
+	if (list_lru_init_memcg(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init_memcg(&s->s_inode_lru))
+		goto err_out_dentry_lru;
+
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
@@ -190,12 +247,17 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	s->s_maxbytes = MAX_NON_LFS;
 	s->s_op = &default_op;
 	s->s_time_gran = 1000000000;
-	s->cleancache_poolid = -1;
+	s->cleancache_poolid = CLEANCACHE_NO_POOL;
 
 	s->s_shrink.seeks = DEFAULT_SEEKS;
-	s->s_shrink.shrink = prune_super;
+	s->s_shrink.scan_objects = super_cache_scan;
+	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	return s;
+
+err_out_dentry_lru:
+	list_lru_destroy(&s->s_dentry_lru);
 fail:
 	destroy_super(s);
 	return NULL;
@@ -221,7 +283,7 @@ static void __put_super(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
@@ -248,6 +310,14 @@ void deactivate_locked_super(struct super_block *s)
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -303,35 +373,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 }
 
 /*
- *	grab_super_passive - acquire a passive reference
+ *	trylock_super - try to grab ->s_umount shared
  *	@sb: reference we are trying to grab
  *
- *	Tries to acquire a passive reference. This is used in places where we
+ *	Try to prevent fs shutdown.  This is used in places where we
  *	cannot take an active reference but we need to ensure that the
- *	superblock does not go away while we are working on it. It returns
- *	false if a reference was not gained, and returns true with the s_umount
- *	lock held in read mode if a reference is gained. On successful return,
- *	the caller must drop the s_umount lock and the passive reference when
- *	done.
+ *	filesystem is not shut down while we are working on it. It returns
+ *	false if we cannot acquire s_umount or if we lose the race and
+ *	filesystem already got into shutdown, and returns true with the s_umount
+ *	lock held in read mode in case of success. On successful return,
+ *	the caller must drop the s_umount lock when done.
+ *
+ *	Note that unlike get_super() et al. this one does *not* bump ->s_count.
+ *	The reason why it's safe is that we are OK with doing trylock instead
+ *	of down_read().  There are a couple of places that are OK with that, but
+ *	it's very much not a general-purpose interface.
  */
-bool grab_super_passive(struct super_block *sb)
+bool trylock_super(struct super_block *sb)
 {
-	spin_lock(&sb_lock);
-	if (hlist_unhashed(&sb->s_instances)) {
-		spin_unlock(&sb_lock);
-		return false;
-	}
-
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root && (sb->s_flags & MS_BORN))
+		if (!hlist_unhashed(&sb->s_instances) &&
+		    sb->s_root && (sb->s_flags & MS_BORN))
 			return true;
 		up_read(&sb->s_umount);
 	}
 
-	put_super(sb);
 	return false;
 }
 
@@ -963,7 +1029,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode | FMODE_MOUNT, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -1003,11 +1069,26 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
-
+#ifdef CONFIG_VE
+		void *data_orig = data;
+		struct ve_struct *ve = get_exec_env();
+
+		if (!ve_is_super(ve)) {
+			error = ve_devmnt_process(ve, bdev->bd_dev, &data, 0);
+			if (error) {
+				deactivate_locked_super(s);
+				goto error;
+			}
+		}
+#endif
 		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+#ifdef CONFIG_VE
+		if (data_orig != data)
+			free_page((unsigned long)data);
+#endif
 		if (error) {
 			deactivate_locked_super(s);
 			goto error;
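
In super_cache_scan() above, the scan budget is now divided between the dentry, inode and filesystem-private caches in proportion to their current sizes, using mult_frac() so the intermediate product cannot overflow. A small sketch of just that proportioning step, with illustrative names and assuming the per-cache counts have already been taken:

	#include <linux/kernel.h>	/* mult_frac() */

	/*
	 * Split a scan budget between two caches in proportion to how many
	 * objects each currently holds.  mult_frac(a, b, c) computes a*b/c
	 * without letting the intermediate product a*b overflow the type.
	 */
	static void split_scan_budget(unsigned long nr_to_scan,
				      unsigned long nr_dentries,
				      unsigned long nr_inodes,
				      unsigned long *scan_dentries,
				      unsigned long *scan_inodes)
	{
		/* The +1 keeps the divisor non-zero when both caches are empty. */
		unsigned long total = nr_dentries + nr_inodes + 1;

		*scan_dentries = mult_frac(nr_to_scan, nr_dentries, total);
		*scan_inodes   = mult_frac(nr_to_scan, nr_inodes, total);
	}
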
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,15 +7,21 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
+#include <linux/pid_namespace.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
+#include <linux/ve.h>
 #include "internal.h"
+#include "mount.h"
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -27,12 +33,13 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb,
+			     struct user_beancounter *ub, int wait)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb_ub(sb, ub);
 	else
-		writeback_inodes_sb(sb, WB_REASON_SYNC);
+		writeback_inodes_sb_ub(sb, ub, WB_REASON_SYNC);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -44,7 +51,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int sync_filesystem(struct super_block *sb)
+static int sync_filesystem_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	int ret;
 
@@ -60,10 +67,15 @@ int sync_filesystem(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, ub, 0);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, ub, 1);
+}
+
+int sync_filesystem(struct super_block *sb)
+{
+	return sync_filesystem_ub(sb, NULL);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
@@ -94,6 +106,111 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+struct sync_sb {
+	struct list_head list;
+	struct super_block *sb;
+};
+
+static void sync_release_filesystems(struct list_head *sync_list)
+{
+	struct sync_sb *ss, *tmp;
+
+	list_for_each_entry_safe(ss, tmp, sync_list, list) {
+		list_del(&ss->list);
+		put_super(ss->sb);
+		kfree(ss);
+	}
+}
+
+static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
+{
+	struct sync_sb *ss;
+
+	list_for_each_entry(ss, sync_list, list)
+		if (ss->sb == sb)
+			return 1;
+	return 0;
+}
+
+static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	struct sync_sb *ss;
+	int ret = 0;
+
+	BUG_ON(!list_empty(sync_list));
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
+			continue;
+
+		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+		if (ss == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+		ss->sb = mnt->mnt.mnt_sb;
+		/*
+		 * We hold the mount point and thus can be sure that the
+		 * superblock is alive, which means we can safely increase
+		 * its usage counter.
+		 */
+		spin_lock(&sb_lock);
+		ss->sb->s_count++;
+		spin_unlock(&sb_lock);
+		list_add_tail(&ss->list, sync_list);
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static void sync_filesystems_ve(struct ve_struct *ve, struct user_beancounter *ub, int wait)
+{
+	struct super_block *sb;
+	LIST_HEAD(sync_list);
+	struct sync_sb *ss;
+
+	/*
+	 * An allocation failure here is not fatal: there is no need to
+	 * skip the sync entirely on such an error, so just sync whatever
+	 * was collected.
+	 */
+	sync_collect_filesystems(ve, &sync_list);
+
+	list_for_each_entry(ss, &sync_list, list) {
+		sb = ss->sb;
+		down_read(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
+			__sync_filesystem(sb, ub, wait);
+		up_read(&sb->s_umount);
+	}
+
+	sync_release_filesystems(&sync_list);
+}
+
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	if (ve->fsync_enable == 2)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
+int ve_fsync_behavior(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))
+		return FSYNC_ALWAYS;
+	else
+		return __ve_fsync_behavior(ve);
+}
+
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -106,8 +223,40 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
+	struct ve_struct *ve = get_exec_env();
+	struct user_beancounter *ub, *sync_ub = NULL;
 	int nowait = 0, wait = 1;
 
+	ub = get_exec_ub();
+	ub_percpu_inc(ub, sync);
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS mounted with -o hard would block forever, as the
+		 *    network is already down
+		 *  - no useful work is done anyway, as VE0 will call
+		 *    umount/sync on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto skip;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto skip;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+
+		if (sync_ub && (sync_ub != get_ub0())) {
+			wakeup_flusher_threads_ub(0, sync_ub, WB_REASON_SYNC);
+			sync_filesystems_ve(get_exec_env(), sync_ub, nowait);
+			sync_filesystems_ve(get_exec_env(), sync_ub, wait);
+			goto skip;
+		}
+	}
+
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
@@ -116,6 +265,8 @@ SYSCALL_DEFINE0(sync)
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return 0;
 }
 
@@ -155,17 +306,49 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct fd f = fdget(fd);
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
+	struct user_beancounter *ub, *sync_ub = NULL;
+	struct ve_struct *ve;
+
+	ub = get_exec_ub();
+	ve = get_exec_env();
+	ub_percpu_inc(ub, sync);
+
+	if (!f.file) {
+		ret = -EBADF;
+		goto skip;
+	}
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS mounted with -o hard would block forever, as the
+		 *    network is already down
+		 *  - no useful work is done anyway, as VE0 will call
+		 *    umount/sync on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+	}
 
-	if (!f.file)
-		return -EBADF;
 	sb = f.file->f_dentry->d_sb;
 
 	down_read(&sb->s_umount);
-	ret = sync_filesystem(sb);
+	if (sb->s_root)
+		ret = sync_filesystem_ub(sb, sync_ub);
 	up_read(&sb->s_umount);
-
+fdput:
 	fdput(f);
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return ret;
 }
 
@@ -182,9 +365,34 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct user_beancounter *ub;
+	int ret;
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op || !file->f_op->fsync)
 		return -EINVAL;
-	return file->f_op->fsync(file, start, end, datasync);
+
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
+
+	ub = get_exec_ub();
+	if (datasync)
+		ub_percpu_inc(ub, fdsync);
+	else
+		ub_percpu_inc(ub, fsync);
+
+	ret = file->f_op->fsync(file, start, end, datasync);
+
+	if (datasync)
+		ub_percpu_inc(ub, fdsync_done);
+	else
+		ub_percpu_inc(ub, fsync_done);
+
+	return ret;
 }
 EXPORT_SYMBOL(vfs_fsync_range);
 
@@ -204,9 +412,13 @@ EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
 {
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = -EBADF;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return 0;
+
+	f = fdget(fd);
 	if (f.file) {
 		ret = vfs_fsync(f.file, datasync);
 		fdput(f);
@@ -291,6 +503,7 @@ EXPORT_SYMBOL(generic_write_sync);
 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 				unsigned int, flags)
 {
+	struct user_beancounter *ub;
 	int ret;
 	struct fd f;
 	struct address_space *mapping;
@@ -349,22 +562,27 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 		goto out_put;
 	}
 
+	ub = get_exec_ub();
+	ub_percpu_inc(ub, frsync);
+
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out_acct;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
 		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out_acct;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 
+out_acct:
+	ub_percpu_inc(ub, frsync_done);
 out_put:
 	fdput(f);
 out:
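
The sync paths above consult ve_fsync_behavior() so that containers can have fsync/sync filtered (limited to their own beancounter's I/O) or disabled entirely. A minimal sketch of the same gate as used by do_fsync(); ve_fsync_behavior() and FSYNC_NEVER come from this patch, while the wrapper function and its name are purely illustrative:

	#include <linux/file.h>
	#include <linux/fs.h>

	/* Provided by the patched fs/sync.c; the exact header is an assumption. */
	extern int ve_fsync_behavior(void);

	static int example_flush_fd(unsigned int fd)
	{
		struct fd f;
		int ret = -EBADF;

		/*
		 * A container whose fsync behaviour is FSYNC_NEVER gets a
		 * successful no-op, so it cannot stall the host with fsync
		 * storms, e.g. during container stop.
		 */
		if (ve_fsync_behavior() == FSYNC_NEVER)
			return 0;

		f = fdget(fd);
		if (f.file) {
			ret = vfs_fsync(f.file, 0);
			fdput(f);
		}
		return ret;
	}
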
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -4,3 +4,4 @@
 
 obj-y		:= inode.o file.o dir.o symlink.o mount.o bin.o \
 		   group.o
+obj-$(CONFIG_VE) += ve.o
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -23,12 +23,13 @@
 #include <linux/slab.h>
 #include <linux/security.h>
 #include <linux/hash.h>
+#include <linux/ve.h>
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
+#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
 static DEFINE_IDA(sysfs_ino_ida);
@@ -73,6 +74,32 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
 				  right);
 }
 
+static bool sysfs_sd_visible(struct sysfs_dirent *sd, struct super_block *sb)
+{
+	struct ve_struct *ve = sysfs_info(sb)->ve;
+	struct sysfs_dirent *tmp_sd = sd;
+
+	/* The host sees everything */
+	if (ve_is_super(ve))
+		return true;
+
+	/* Entries with a namespace tag and their sub-entries are always visible */
+	while (tmp_sd) {
+		if (tmp_sd->s_ns)
+			return true;
+		tmp_sd = tmp_sd->s_parent;
+	}
+
+	/* Symlinks are visible if target sd is visible */
+	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
+		sd = sd->s_symlink.target_sd;
+
+	if (kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms))
+		return true;
+
+	return false;
+}
+
 /**
  *	sysfs_link_subling - link sysfs_dirent into sibling rbtree
  *	@sd: sysfs_dirent of interest
@@ -279,6 +306,8 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 	if (sd->s_iattr && sd->s_iattr->ia_secdata)
 		security_release_secctx(sd->s_iattr->ia_secdata,
 					sd->s_iattr->ia_secdata_len);
+	if (sd->s_ve_perms)
+		kmapset_put(sd->s_ve_perms);
 	kfree(sd->s_iattr);
 	sysfs_free_ino(sd->s_ino);
 	kmem_cache_free(sysfs_dir_cachep, sd);
@@ -326,6 +355,9 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 			goto out_bad;
 	}
 
+	if (!sysfs_sd_visible(sd, dentry->d_sb))
+		goto out_bad;
+
 	mutex_unlock(&sysfs_mutex);
 	return 1;
 out_bad:
@@ -436,17 +468,19 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
-
+#ifndef CONFIG_VE
 	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
 			sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
 			acxt->parent_sd->s_name, sd->s_name);
 		return -EINVAL;
 	}
-
+#endif
 	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
+	sd->s_ve_perms = kmapset_commit(kmapset_new(&ve_sysfs_perms));
+
 	ret = sysfs_link_sibling(sd);
 	if (ret)
 		return ret;
@@ -609,14 +643,14 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 {
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
-
+#ifndef CONFIG_VE
 	if (!!sysfs_ns_type(parent_sd) != !!ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
 			sysfs_ns_type(parent_sd)? "required": "invalid",
 			parent_sd->s_name, name);
 		return NULL;
 	}
-
+#endif
 	hash = sysfs_name_hash(ns, name);
 	while (node) {
 		struct sysfs_dirent *sd;
@@ -776,7 +810,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
-	if (!sd) {
+	if (!sd || !sysfs_sd_visible(sd, dentry->d_sb)) {
 		ret = ERR_PTR(-ENOENT);
 		goto out_unlock;
 	}
@@ -942,8 +976,8 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
+					  loff_t hash, struct sysfs_dirent *pos)
 {
 	if (pos) {
 		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
@@ -966,29 +1000,32 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
 				break;
 		}
 	}
-	/* Skip over entries in the wrong namespace */
-	while (pos && pos->s_ns != ns) {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	}
 	return pos;
 }
 
-static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_next_entry(struct sysfs_dirent *cur)
 {
-	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos) do {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	} while (pos && pos->s_ns != ns);
-	return pos;
+	struct rb_node *node = rb_next(&cur->s_rb);
+
+	return node ? to_sysfs_dirent(node) : NULL;
+}
+
+struct sysfs_dirent *sysfs_next_recursive(struct sysfs_dirent *sd)
+{
+	struct rb_node *node;
+
+	if (sysfs_type(sd) == SYSFS_DIR &&
+	    !RB_EMPTY_ROOT(&sd->s_dir.children))
+		return to_sysfs_dirent(rb_first(&sd->s_dir.children));
+
+	do {
+		node = rb_next(&sd->s_rb);
+		if (node)
+			return to_sysfs_dirent(node);
+		sd = sd->s_parent;
+	} while (sd);
+
+	return NULL;
 }
 
 static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
@@ -1023,13 +1060,16 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 	mutex_lock(&sysfs_mutex);
 	off = filp->f_pos;
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
-	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+	pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
+	for (; pos; pos = sysfs_next_entry(pos)) {
 		const char * name;
 		unsigned int type;
 		int len, ret;
 
+		/* Skip invisible entries and entries from the wrong namespace */
+		if (pos->s_ns != ns || !sysfs_sd_visible(pos, dentry->d_sb))
+			continue;
+
 		name = pos->s_name;
 		len = strlen(name);
 		ino = pos->s_ino;
@@ -1042,6 +1082,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		mutex_lock(&sysfs_mutex);
 		if (ret < 0)
 			break;
+
+		/* Revalidate position pointer after reacquiring sysfs_mutex */
+		pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
 	}
 	mutex_unlock(&sysfs_mutex);
 
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -29,7 +29,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
-			sysfs_remove_bin_file(kobj, *bin_attr);
+			sysfs_hash_and_remove(dir_sd, NULL,
+					      (*bin_attr)->attr.name);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -71,8 +72,10 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 	if (grp->bin_attrs) {
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
 			if (update)
-				sysfs_remove_bin_file(kobj, *bin_attr);
-			error = sysfs_create_bin_file(kobj, *bin_attr);
+				sysfs_hash_and_remove(dir_sd, NULL,
+						      (*bin_attr)->attr.name);
+			error = sysfs_add_file(dir_sd, &(*bin_attr)->attr,
+					       SYSFS_KOBJ_BIN_ATTR);
 			if (error)
 				break;
 		}
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -17,11 +17,11 @@
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
+#include <linux/ve.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
@@ -113,6 +113,9 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (!sd)
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	mutex_lock(&sysfs_mutex);
 	error = inode_change_ok(inode, iattr);
 	if (error)
@@ -339,9 +342,35 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 		return -ENOENT;
 }
 
+static int sysfs_sd_permission(struct sysfs_dirent *sd, int mask)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct sysfs_dirent *tmp_sd = sd;
+	int perm;
+
+	if (ve_is_super(ve))
+		return 0;
+
+	while (tmp_sd) {
+		if (tmp_sd->s_ns)
+			return 0;
+		tmp_sd = tmp_sd->s_parent;
+	}
+
+	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
+		sd = sd->s_symlink.target_sd;
+
+	perm = kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms);
+	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		return 0;
+
+	return -EACCES;
+}
+
 int sysfs_permission(struct inode *inode, int mask)
 {
 	struct sysfs_dirent *sd;
+	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
@@ -349,8 +378,12 @@ int sysfs_permission(struct inode *inode, int mask)
 	sd = inode->i_private;
 
 	mutex_lock(&sysfs_mutex);
+	ret = sysfs_sd_permission(sd, mask);
 	sysfs_refresh_inode(sd, inode);
 	mutex_unlock(&sysfs_mutex);
 
+	if (ret)
+		return ret;
+
 	return generic_permission(inode, mask);
 }
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -20,6 +20,9 @@
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/seq_file.h>
+#include <linux/xattr.h>
+#include <linux/ve.h>
 
 #include "sysfs.h"
 
@@ -27,10 +30,21 @@
 static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 
+static int sysfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct ve_struct *ve = sysfs_info(root->d_sb)->ve;
+
+	if (!ve_is_super(ve))
+		seq_printf(m, ",ve=%s", ve_name(ve));
+
+	return 0;
+}
+
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= sysfs_evict_inode,
+	.show_options	= sysfs_show_options,
 };
 
 struct sysfs_dirent sysfs_root = {
@@ -84,6 +98,8 @@ static int sysfs_test_super(struct super_block *sb, void *data)
 		if (sb_info->ns[type] != info->ns[type])
 			found = 0;
 	}
+	if (sb_info->ve != info->ve)
+		found = 0;
 	return found;
 }
 
@@ -101,6 +117,7 @@ static void free_sysfs_super_info(struct sysfs_super_info *info)
 	int type;
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		kobj_ns_drop(type, info->ns[type]);
+	put_ve(info->ve);
 	kfree(info);
 }
 
@@ -121,6 +138,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		info->ns[type] = kobj_ns_grab_current(type);
+	info->ve = get_ve(get_exec_env());
 
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
@@ -153,13 +171,21 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int __init sysfs_init(void)
 {
 	int err = -ENOMEM;
 
+	kmapset_init_set(&ve_sysfs_perms);
+
+	sysfs_root.s_ve_perms = kmapset_new(&ve_sysfs_perms);
+	if (!sysfs_root.s_ve_perms)
+		goto out;
+
+	kmapset_commit(sysfs_root.s_ve_perms);
+
 	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
 					      sizeof(struct sysfs_dirent),
 					      0, 0, NULL);
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -59,6 +59,10 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	sysfs_addrm_start(&acxt, parent_sd);
 	/* Symlinks must be between directories with the same ns_type */
 	if (!ns_type ||
+#ifdef CONFIG_VE
+	    /* or if target doesn't have ns_type */
+	    !sysfs_ns_type(sd->s_symlink.target_sd->s_parent) ||
+#endif
 	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
 		if (warn)
 			error = sysfs_add_one(&acxt, sd);
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -81,6 +81,8 @@ struct sysfs_dirent {
 	umode_t 		s_mode;
 	unsigned int		s_ino;
 	struct sysfs_inode_attrs *s_iattr;
+
+	struct kmapset_map	*s_ve_perms;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
@@ -147,6 +149,7 @@ struct sysfs_addrm_cxt {
  */
 struct sysfs_super_info {
 	void *ns[KOBJ_NS_TYPES];
+	struct ve_struct *ve;
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
@@ -172,6 +175,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
+struct sysfs_dirent *sysfs_next_recursive(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 				       const void *ns,
 				       const unsigned char *name);
--- /dev/null
+++ b/fs/sysfs/ve.c
@@ -0,0 +1,313 @@
+/*
+ *  fs/sysfs/ve.c - sysfs permissions for containers
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/seq_file.h>
+#include <linux/kmapset.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/ve.h>
+#include <net/sock.h>
+#include "sysfs.h"
+
+static void *ve_grab_current_ns(void)
+{
+	return get_ve(get_exec_env());
+}
+
+static const void *ve_initial_ns(void)
+{
+	return get_ve0();
+}
+
+static void ve_drop_ns(void *p)
+{
+	put_ve(p);
+}
+
+const void *ve_netlink_ns(struct sock *sk)
+{
+	return sock_net(sk)->owner_ve;
+}
+
+struct kobj_ns_type_operations ve_ns_type_operations = {
+	.type = KOBJ_NS_TYPE_VE,
+	.grab_current_ns = ve_grab_current_ns,
+	.netlink_ns = ve_netlink_ns,
+	.initial_ns = ve_initial_ns,
+	.drop_ns = ve_drop_ns,
+};
+
+static bool sysfs_perms_shown(struct ve_struct *ve, struct sysfs_dirent *sd)
+{
+	if (!ve) /* default_sysfs_permissions */
+		return sd->s_ve_perms->default_value != 0;
+	return kmapset_lookup(sd->s_ve_perms, &ve->ve_sysfs_perms) != NULL;
+}
+
+static void * sysfs_perms_start(struct seq_file *m, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd = &sysfs_root;
+	loff_t pos = *ppos;
+
+	mutex_lock(&sysfs_mutex);
+	for (sd = &sysfs_root; sd; sd = sysfs_next_recursive(sd)) {
+		if (sysfs_perms_shown(ve, sd) && !pos--)
+			break;
+	}
+	return sd;
+}
+
+static void * sysfs_perms_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd = v;
+
+	(*ppos)++;
+	while ((sd = sysfs_next_recursive(sd))) {
+		if (sysfs_perms_shown(ve, sd))
+			break;
+	}
+	return sd;
+}
+
+static void sysfs_perms_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&sysfs_mutex);
+}
+
+static int sysfs_perms_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd = v;
+	char *buf;
+	size_t size, len, off;
+	int mask;
+
+	if (!ve)
+		mask = sd->s_ve_perms->default_value;
+	else
+		mask = kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms);
+
+	size = seq_get_buf(m, &buf);
+	if (size) {
+		off = size;
+		do {
+			len = strlen(sd->s_name);
+			if (len >= off) {
+				seq_commit(m, -1);
+				return 0;
+			}
+			if (sysfs_type(sd) == SYSFS_DIR)
+				buf[--off] = '/';
+			off -= len;
+			memcpy(buf + off, sd->s_name, len);
+			sd = sd->s_parent;
+		} while (sd && sd != &sysfs_root);
+		memmove(buf, buf + off, size - off);
+		seq_commit(m, size - off);
+	}
+
+	seq_putc(m, ' ');
+
+	if (!mask)
+		seq_putc(m, '-');
+	if (mask & MAY_READ)
+		seq_putc(m, 'r');
+	if (mask & MAY_WRITE)
+		seq_putc(m, 'w');
+	if (mask & MAY_EXEC)
+		seq_putc(m, 'x');
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+int sysfs_perms_set(char *path, struct ve_struct *ve, int mask)
+{
+	struct sysfs_dirent *sd = &sysfs_root;
+	struct kmapset_map *map = NULL;
+	char *name = path, *sep;
+	int ret;
+
+	mutex_lock(&sysfs_mutex);
+	do {
+		sep = strchr(name, '/');
+		if (sep)
+			*sep = 0;
+		if (*name)
+			sd = sysfs_find_dirent(sd, NULL, name);
+		if (sep)
+			*sep = '/';
+		name = sep + 1;
+	} while (sd && sep);
+
+	ret = -ENOENT;
+	if (!sd)
+		goto out;
+
+	ret = -ENOMEM;
+	map = kmapset_dup(sd->s_ve_perms);
+	if (!map)
+		goto out;
+
+	ret = 0;
+	if (!ve) {
+		kmapset_set_default(map, mask > 0 ? mask : 0);
+	} else if (mask < 0) {
+		kmapset_del_value(map, &ve->ve_sysfs_perms);
+	} else {
+		ret = kmapset_set_value(map, &ve->ve_sysfs_perms, mask);
+	}
+
+	if (!ret) {
+		map = kmapset_commit(map);
+		swap(map, sd->s_ve_perms);
+	}
+out:
+	mutex_unlock(&sysfs_mutex);
+	kmapset_put(map);
+	return ret;
+}
+
+static int sysfs_perms_line(struct ve_struct *ve, char *line)
+{
+	int mask = 0;
+	char *p;
+
+	p = strpbrk(line, " \t");
+	if (!p)
+		return -EINVAL;
+	*p++ = 0;
+	p = skip_spaces(p);
+	while (1) {
+		switch (*p++) {
+			case 'r':
+				mask |= MAY_READ;
+				break;
+			case 'w':
+				mask |= MAY_WRITE;
+				break;
+			case 'x':
+				mask |= MAY_EXEC;
+				break;
+			case '-':
+				mask = -1;
+				break;
+			case 0:
+				return sysfs_perms_set(line, ve, mask);
+			default:
+				return -EINVAL;
+		}
+	}
+}
+
+static ssize_t sysfs_perms_write(struct cgroup *cgrp,
+		struct cftype *cftype, struct file * file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	char *line, *next, *page;
+	int ret, len;
+
+	ve = ve_is_super(ve) ? NULL : ve;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	len = min(count, PAGE_SIZE - 1);
+	ret = copy_from_user(page, buf, len);
+	if (ret)
+		goto err;
+
+	page[len] = '\0';
+
+	next = page;
+	while (1) {
+		line = skip_spaces(next);
+		next = strchr(line, '\n');
+		if (next) {
+			*next++ = '\0';
+		} else if (len < count) {
+			ret = line != page ? line - page : -EINVAL;
+			break;
+		}
+		if (*line && *line != '#') {
+			ret = sysfs_perms_line(ve, line);
+			if (ret)
+				break;
+		}
+		if (!next) {
+			ret = len;
+			break;
+		}
+	}
+err:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+struct seq_operations sysfs_perms_sops = {
+	.start = sysfs_perms_start,
+	.stop = sysfs_perms_stop,
+	.next = sysfs_perms_next,
+	.show = sysfs_perms_show,
+};
+
+static int sysfs_perms_open(struct inode *inode, struct file *file)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &sysfs_perms_sops);
+	if (!ret) {
+		m = file->private_data;
+		m->private = ve_is_super(ve) ? NULL : ve;
+	}
+	return ret;
+}
+
+static ssize_t sysfs_perms_read(struct cgroup *cgrp, struct cftype *cft,
+	struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	return seq_read(file, buf, nbytes, ppos);
+}
+
+static int sysfs_perms_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+static struct cftype sysfs_ve_cftypes[] = {
+	{
+		.name = "default_sysfs_permissions",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.open = sysfs_perms_open,
+		.read = sysfs_perms_read,
+		.write = sysfs_perms_write,
+		.release = sysfs_perms_release,
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	{
+		.name = "sysfs_permissions",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.open = sysfs_perms_open,
+		.read = sysfs_perms_read,
+		.write = sysfs_perms_write,
+		.release = sysfs_perms_release,
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	{ },
+};
+
+static int init_sysfs_ve_perms(void)
+{
+	return cgroup_add_cftypes(&ve_subsys, sysfs_ve_cftypes);
+}
+module_init(init_sysfs_ve_perms);
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -351,7 +351,7 @@ int __init sysv_init_icache(void)
 {
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -24,6 +24,7 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/rcupdate.h>
+#include <linux/ve.h>
 
 struct timerfd_ctx {
 	struct hrtimer tmr;
@@ -36,6 +37,7 @@ struct timerfd_ctx {
 	short unsigned settime_flags;	/* to show in fdinfo */
 	struct rcu_head rcu;
 	struct list_head clist;
+	spinlock_t cancel_lock;
 	bool might_cancel;
 };
 
@@ -88,7 +90,7 @@ void timerfd_clock_was_set(void)
 	rcu_read_unlock();
 }
 
-static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+static void __timerfd_remove_cancel(struct timerfd_ctx *ctx)
 {
 	if (ctx->might_cancel) {
 		ctx->might_cancel = false;
@@ -98,6 +100,13 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
 	}
 }
 
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	spin_lock(&ctx->cancel_lock);
+	__timerfd_remove_cancel(ctx);
+	spin_unlock(&ctx->cancel_lock);
+}
+
 static bool timerfd_canceled(struct timerfd_ctx *ctx)
 {
 	if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
@@ -108,6 +117,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 
 static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 {
+	spin_lock(&ctx->cancel_lock);
 	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
 	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
 		if (!ctx->might_cancel) {
@@ -116,9 +126,10 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 			list_add_rcu(&ctx->clist, &cancel_list);
 			spin_unlock(&cancel_lock);
 		}
-	} else if (ctx->might_cancel) {
-		timerfd_remove_cancel(ctx);
+	} else {
+		__timerfd_remove_cancel(ctx);
 	}
+	spin_unlock(&ctx->cancel_lock);
 }
 
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
@@ -336,6 +347,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 		return -ENOMEM;
 
 	init_waitqueue_head(&ctx->wqh);
+	spin_lock_init(&ctx->cancel_lock);
 	ctx->clockid = clockid;
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 	ctx->moffs = ktime_get_monotonic_offset();
@@ -349,7 +361,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 }
 
 static int do_timerfd_settime(int ufd, int flags, 
-		const struct itimerspec *new,
+		struct itimerspec *new,
 		struct itimerspec *old)
 {
 	struct fd f;
@@ -395,6 +407,9 @@ static int do_timerfd_settime(int ufd, int flags,
 	/*
 	 * Re-program the timer to the new value ...
 	 */
+	if ((flags & TFD_TIMER_ABSTIME) &&
+	    (new->it_value.tv_sec || new->it_value.tv_nsec))
+		monotonic_ve_to_abs(ctx->clockid, &new->it_value);
 	ret = timerfd_setup(ctx, flags, new);
 
 	spin_unlock_irq(&ctx->wqh.lock);
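
The timerfd change above adds a per-context cancel_lock so that might_cancel and membership on the global cancel_list are updated atomically from both the settime and the release paths. A small sketch of this pattern of pairing a per-object spinlock with a globally locked list (all names are illustrative):

	#include <linux/rculist.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	static LIST_HEAD(global_cancel_list);
	static DEFINE_SPINLOCK(global_cancel_lock);

	struct example_ctx {
		spinlock_t	cancel_lock;	/* serializes might_cancel updates */
		bool		might_cancel;
		struct list_head clist;		/* on global_cancel_list when armed */
	};

	/* Caller must hold ctx->cancel_lock. */
	static void __example_remove_cancel(struct example_ctx *ctx)
	{
		if (ctx->might_cancel) {
			ctx->might_cancel = false;
			spin_lock(&global_cancel_lock);
			list_del_rcu(&ctx->clist);
			spin_unlock(&global_cancel_lock);
		}
	}

	static void example_set_cancel(struct example_ctx *ctx, bool enable)
	{
		spin_lock(&ctx->cancel_lock);
		if (enable) {
			if (!ctx->might_cancel) {
				ctx->might_cancel = true;
				spin_lock(&global_cancel_lock);
				list_add_rcu(&ctx->clist, &global_cancel_list);
				spin_unlock(&global_cancel_lock);
			}
		} else {
			__example_remove_cancel(ctx);
		}
		spin_unlock(&ctx->cancel_lock);
	}
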
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,7 +1538,6 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
 static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,18 +277,25 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
-	if (nr == 0)
-		/*
-		 * Due to the way UBIFS updates the clean znode counter it may
-		 * temporarily be negative.
-		 */
-		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+	/*
+	 * Due to the way UBIFS updates the clean znode counter it may
+	 * temporarily be negative.
+	 */
+	return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+}
+
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	int contention = 0;
+	unsigned long freed;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
 	if (!clean_zn_cnt) {
 		/*
@@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 
 	if (!freed && contention) {
 		dbg_tnc("freed nothing, but contention");
-		return -1;
+		return SHRINK_STOP;
 	}
 
 out:
-	dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+	dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
 	return freed;
 }
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab;
 
 /* UBIFS TNC shrinker description */
 static struct shrinker ubifs_shrinker_info = {
-	.shrink = ubifs_shrinker,
+	.scan_objects = ubifs_shrink_scan,
+	.count_objects = ubifs_shrink_count,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -2248,8 +2249,8 @@ static int __init ubifs_init(void)
 
 	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
 				sizeof(struct ubifs_inode), 0,
-				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
-				&inode_slab_ctor);
+				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+				SLAB_ACCOUNT, &inode_slab_ctor);
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc);
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -165,7 +165,8 @@ static int init_inodecache(void)
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT |
-						 SLAB_MEM_SPREAD),
+						 SLAB_MEM_SPREAD |
+						 SLAB_ACCOUNT),
 					     init_once);
 	if (!udf_inode_cachep)
 		return -ENOMEM;
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1458,7 +1458,7 @@ static int init_inodecache(void)
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ufs_inode_cachep == NULL)
 		return -ENOMEM;
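
The SLAB_ACCOUNT additions in the ubifs, udf and ufs hunks above (and the xfs KM_ZONE_ACCOUNT hunks below) all follow one pattern: with the flag set at cache creation, every object allocated from the cache is charged to the kernel-memory counter of the allocating task's memory cgroup, so per-filesystem inode caches become containable. A hedged sketch with a hypothetical cache:

	demo_inode_cachep = kmem_cache_create("demo_inode_cache",
					      sizeof(struct demo_inode_info), 0,
					      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
					      SLAB_ACCOUNT, demo_init_once);
	if (!demo_inode_cachep)
		return -ENOMEM;
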
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -52,7 +52,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	 * The trusted.* namespace can only be accessed by privileged users.
 	 */
 	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		return 0;
 	}
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
 #define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
 #define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
 #define KM_ZONE_SPREAD	SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT	SLAB_ACCOUNT
 
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
+#include <bc/io_acct.h>
 
 /*
  * structure owned by writepages passed to individual writepage calls
@@ -1853,6 +1854,11 @@ xfs_vm_set_page_dirty(
 			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					!radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_dirty(mapping);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -79,54 +79,6 @@ xfs_buf_vmap_len(
 	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
 }
 
-/*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (list_empty(&bp->b_lru)) {
-		atomic_inc(&bp->b_hold);
-		list_add_tail(&bp->b_lru, &btp->bt_lru);
-		btp->bt_lru_nr++;
-		bp->b_state &= ~XFS_BSTATE_DISPOSE;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	if (list_empty(&bp->b_lru))
-		return;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (!list_empty(&bp->b_lru)) {
-		list_del_init(&bp->b_lru);
-		btp->bt_lru_nr--;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
 /*
  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
  * this buffer. The count is incremented once per buffer (per hold cycle)
@@ -214,20 +166,12 @@ xfs_buf_stale(
 	__xfs_buf_ioacct_dec(bp);
 
 	atomic_set(&bp->b_lru_ref, 0);
-	if (!list_empty(&bp->b_lru)) {
-		struct xfs_buftarg *btp = bp->b_target;
+	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+		atomic_dec(&bp->b_hold);
 
-		spin_lock(&btp->bt_lru_lock);
-		if (!list_empty(&bp->b_lru) &&
-		    !(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			list_del_init(&bp->b_lru);
-			btp->bt_lru_nr--;
-			atomic_dec(&bp->b_hold);
-		}
-		spin_unlock(&btp->bt_lru_lock);
-	}
-	spin_unlock(&bp->b_lock);
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	spin_unlock(&bp->b_lock);
 }
 
 static int
@@ -1034,7 +978,10 @@ xfs_buf_rele(
 		 * reference to the buffer for the LRU and clear the
 		 * (now stale) dispose list state flag
 		 */
-		xfs_buf_lru_add(bp);
+		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+			bp->b_state &= ~XFS_BSTATE_DISPOSE;
+			atomic_inc(&bp->b_hold);
+		}
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		/*
@@ -1044,11 +991,10 @@ xfs_buf_rele(
 		 * buffer was on was the disposal list
 		 */
 		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			xfs_buf_lru_del(bp);
+			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
 		} else {
 			ASSERT(list_empty(&bp->b_lru));
 		}
-
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 		spin_unlock(&pag->pag_buf_lock);
@@ -1626,129 +1572,128 @@ xfs_buf_iomove(
  * returned. These buffers will have an elevated hold count, so wait on those
  * while freeing all the buffers only held by the LRU.
  */
+static enum lru_status
+xfs_buftarg_wait_rele(
+	struct list_head	*item,
+	struct list_lru_one     *lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head        *dispose = arg;
+
+	if (atomic_read(&bp->b_hold) > 1) {
+		/* need to wait */
+		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+		return LRU_SKIP;
+	}
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * clear the LRU reference count so the buffer doesn't get
+	 * ignored in xfs_buf_rele().
+	 */
+	atomic_set(&bp->b_lru_ref, 0);
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
 void
 xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	struct xfs_buf		*bp;
 	LIST_HEAD(dispose);
+	int loop = 0;
 
-	/*
-	 * First wait on the buftarg I/O count for all in-flight buffers to be
-	 * released. This is critical as new buffers do not make the LRU until
-	 * they are released.
-	 *
-	 * Next, flush the buffer workqueue to ensure all completion processing
-	 * has finished. Just waiting on buffer locks is not sufficient for
-	 * async IO as the reference count held over IO is not released until
-	 * after the buffer lock is dropped. Hence we need to ensure here that
-	 * all reference counts have been dropped before we start walking the
-	 * LRU list.
-	 */
-	while (percpu_counter_sum(&btp->bt_io_count))
-		delay(100);
-	flush_workqueue(btp->bt_mount->m_buf_workqueue);
-
-restart:
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-		if (atomic_read(&bp->b_hold) > 1) {
-			/* need to wait, so skip it this pass */
-			trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
-skip:
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			spin_unlock(&btp->bt_lru_lock);
-			delay(100);
-			goto restart;
-		}
-		if (!spin_trylock(&bp->b_lock))
-			goto skip;
+	/* loop until there is nothing left on the lru list. */
+	while (list_lru_count(&btp->bt_lru)) {
+		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+			      &dispose, LONG_MAX);
 
-		/*
-		 * clear the LRU reference count so the buffer doesn't get
-		 * ignored in xfs_buf_rele().
-		 */
-		atomic_set(&bp->b_lru_ref, 0);
-		if (bp->b_flags & XBF_WRITE_FAIL) {
-			xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
-				(long long)bp->b_bn);
-			xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
+		while (!list_empty(&dispose)) {
+			struct xfs_buf *bp;
+			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
+			xfs_buf_rele(bp);
 		}
-		bp->b_state |= XFS_BSTATE_DISPOSE;
-		list_move_tail(&bp->b_lru, &dispose);
-		spin_unlock(&bp->b_lock);
+		if (loop++ != 0)
+			delay(100);
 	}
-	spin_unlock(&btp->bt_lru_lock);
+}
 
-	while (!list_empty(&dispose)) {
-		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
-		list_del_init(&bp->b_lru);
-		xfs_buf_rele(bp);
+static enum lru_status
+xfs_buftarg_isolate(
+	struct list_head	*item,
+	struct list_lru_one     *lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	/*
+	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * Decrement the b_lru_ref count unless the value is already
+	 * zero. If the value is already zero, we need to reclaim the
+	 * buffer, otherwise it gets another trip through the LRU.
+	 */
+	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+		spin_unlock(&bp->b_lock);
+		return LRU_ROTATE;
 	}
+
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
 }
 
-int
-xfs_buftarg_shrink(
+static unsigned long
+xfs_buftarg_shrink_scan(
 	struct shrinker		*shrink,
 	struct shrink_control	*sc)
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	struct xfs_buf		*bp;
-	int nr_to_scan = sc->nr_to_scan;
 	LIST_HEAD(dispose);
+	unsigned long		freed;
+	unsigned long		nr_to_scan = sc->nr_to_scan;
 
-	if (!nr_to_scan)
-		return btp->bt_lru_nr;
-
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		if (nr_to_scan-- <= 0)
-			break;
-
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
-		/*
-		 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
-		 * If we fail to get the lock, just skip it.
-		 */
-		if (!spin_trylock(&bp->b_lock)) {
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			continue;
-		}
-
-		/*
-		 * Decrement the b_lru_ref count unless the value is already
-		 * zero. If the value is already zero, we need to reclaim the
-		 * buffer, otherwise it gets another trip through the LRU.
-		 */
-		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
-			spin_unlock(&bp->b_lock);
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			continue;
-		}
-
-		/*
-		 * remove the buffer from the LRU now to avoid needing another
-		 * lock round trip inside xfs_buf_rele().
-		 */
-		list_move(&bp->b_lru, &dispose);
-		btp->bt_lru_nr--;
-		bp->b_state |= XFS_BSTATE_DISPOSE;
-		spin_unlock(&bp->b_lock);
-	}
-	spin_unlock(&btp->bt_lru_lock);
+	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+				       &dispose, &nr_to_scan);
 
 	while (!list_empty(&dispose)) {
+		struct xfs_buf *bp;
 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 		list_del_init(&bp->b_lru);
 		xfs_buf_rele(bp);
 	}
 
-	return btp->bt_lru_nr;
+	return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	return list_lru_count_node(&btp->bt_lru, sc->nid);
 }
 
 void
@@ -1759,6 +1704,7 @@ xfs_free_buftarg(
 	unregister_shrinker(&btp->bt_shrinker);
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
+	list_lru_destroy(&btp->bt_lru);
 
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
@@ -1820,16 +1766,18 @@ xfs_alloc_buftarg(
 	btp->bt_bdev = bdev;
 	btp->bt_bdi = blk_get_backing_dev_info(bdev);
 
-	INIT_LIST_HEAD(&btp->bt_lru);
-	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
 
-	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+	if (list_lru_init(&btp->bt_lru))
 		goto error;
 
-	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+		goto error;
+	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&btp->bt_shrinker);
 	return btp;
 
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -26,6 +26,7 @@
 #include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/uio.h>
+#include <linux/list_lru.h>
 
 /*
  *	Base types
@@ -86,7 +87,7 @@ typedef unsigned int xfs_buf_flags_t;
 /*
  * Internal state flags.
  */
-#define XFS_BSTATE_DISPOSE	(1 << 0)	/* buffer being discarded */
+#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
 #define XFS_BSTATE_IN_FLIGHT	(1 << 1)	/* I/O in flight */
 
 /*
@@ -114,9 +115,7 @@ typedef struct xfs_buftarg {
 
 	/* LRU control structures */
 	struct shrinker		bt_shrinker;
-	struct list_head	bt_lru;
-	spinlock_t		bt_lru_lock;
-	unsigned int		bt_lru_nr;
+	struct list_lru		bt_lru;
 
 	struct percpu_counter	bt_io_count;
 } xfs_buftarg_t;
@@ -162,6 +161,7 @@ typedef struct xfs_buf {
 	 * bt_lru_lock and not by b_sema
 	 */
 	struct list_head	b_lru;		/* lru list */
+	xfs_buf_flags_t		b_lru_flags;	/* internal lru status flags */
 	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_state;	/* internal state flags */
 	int			b_io_error;	/* internal IO error state */
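
The xfs_buf.c/xfs_buf.h hunks above replace the hand-rolled bt_lru list, lock and counter with the generic struct list_lru, whose walkers call an isolate callback that decides per item whether to skip, rotate or remove it. A minimal sketch of that pattern with hypothetical demo_* names, using the same callback signature as the xfs code above:

	static enum lru_status demo_isolate(struct list_head *item,
					    struct list_lru_one *lru,
					    spinlock_t *lru_lock, void *arg)
	{
		struct list_head *dispose = arg;

		/* Detach the item from the LRU and collect it for later freeing. */
		list_lru_isolate_move(lru, item, dispose);
		return LRU_REMOVED;
	}

	static void demo_shrink_lru(struct list_lru *lru)
	{
		LIST_HEAD(dispose);

		list_lru_walk(lru, demo_isolate, &dispose, 32);
		/* ... free everything collected on the dispose list ... */
	}
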
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -949,13 +949,8 @@ xfs_qm_dqput(
 		struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
 		trace_xfs_dqput_free(dqp);
 
-		mutex_lock(&qi->qi_lru_lock);
-		if (list_empty(&dqp->q_lru)) {
-			list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
-			qi->qi_lru_count++;
+		if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
 			XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
-		}
-		mutex_unlock(&qi->qi_lru_lock);
 
 	}
 	xfs_dqunlock(dqp);
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -330,7 +330,7 @@ xfs_file_dio_aio_read(
 	 * serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if (mapping->nrpages) {
+	{
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -345,22 +345,20 @@ xfs_file_dio_aio_read(
 		 * flush and reduce the chances of repeated iolock cycles going
 		 * forward.
 		 */
-		if (mapping->nrpages) {
-			ret = filemap_write_and_wait(mapping);
-			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-				return ret;
-			}
-
-			/*
-			 * Invalidate whole pages. This can return an error if
-			 * we fail to invalidate a page, but this should never
-			 * happen on XFS. Warn if it does fail.
-			 */
-			ret = invalidate_inode_pages2(mapping);
-			WARN_ON_ONCE(ret);
-			ret = 0;
+		ret = filemap_write_and_wait(mapping);
+		if (ret) {
+			xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+			return ret;
 		}
+
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 	ret = __blockdev_direct_IO(READ, iocb, inode, target->bt_bdev,
@@ -1830,7 +1828,6 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.pmd_fault	= xfs_filemap_pmd_fault,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
 
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1209,7 +1209,7 @@ xfs_reclaim_inodes(
  * them to be cleaned, which we hope will not be very long due to the
  * background walker having already kicked the IO off on those dirty inodes.
  */
-void
+long
 xfs_reclaim_inodes_nr(
 	struct xfs_mount	*mp,
 	int			nr_to_scan)
@@ -1218,7 +1218,7 @@ xfs_reclaim_inodes_nr(
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
 }
 
 /*
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -58,7 +58,7 @@ void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -46,8 +46,9 @@
  */
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
+
+STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 /*
  * We use the batch lookup interface to iterate over the dquots as it
  * currently is the only interface into the radix tree code that allows
@@ -181,12 +182,9 @@ xfs_qm_dqpurge(
 	 * We move dquots to the freelist as soon as their reference count
 	 * hits zero, so it really should be on the freelist here.
 	 */
-	mutex_lock(&qi->qi_lru_lock);
 	ASSERT(!list_empty(&dqp->q_lru));
-	list_del_init(&dqp->q_lru);
-	qi->qi_lru_count--;
+	list_lru_del(&qi->qi_lru, &dqp->q_lru);
 	XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-	mutex_unlock(&qi->qi_lru_lock);
 
 	xfs_qm_dqdestroy(dqp);
 	return 0;
@@ -455,6 +453,143 @@ xfs_qm_set_defquota(
 	}
 }
 
+struct xfs_qm_isolate {
+	struct list_head	buffers;
+	struct list_head	dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_dquot	*dqp = container_of(item,
+						struct xfs_dquot, q_lru);
+	struct xfs_qm_isolate	*isol = arg;
+
+	if (!xfs_dqlock_nowait(dqp))
+		goto out_miss_busy;
+
+	/*
+	 * This dquot has acquired a reference in the meantime; remove it from
+	 * the freelist and try again.
+	 */
+	if (dqp->q_nrefs) {
+		xfs_dqunlock(dqp);
+		XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
+
+		trace_xfs_dqreclaim_want(dqp);
+		list_lru_isolate(lru, &dqp->q_lru);
+		XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+		return LRU_REMOVED;
+	}
+
+	/*
+	 * If the dquot is dirty, flush it. If it's already being flushed, just
+	 * skip it so there is time for the IO to complete before we try to
+	 * reclaim it again on the next LRU pass.
+	 */
+	if (!xfs_dqflock_nowait(dqp)) {
+		xfs_dqunlock(dqp);
+		goto out_miss_busy;
+	}
+
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		struct xfs_buf	*bp = NULL;
+		int		error;
+
+		trace_xfs_dqreclaim_dirty(dqp);
+
+		/* we have to drop the LRU lock to flush the dquot */
+		spin_unlock(lru_lock);
+
+		error = xfs_qm_dqflush(dqp, &bp);
+		if (error) {
+			xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+				 __func__, dqp);
+			goto out_unlock_dirty;
+		}
+
+		xfs_buf_delwri_queue(bp, &isol->buffers);
+		xfs_buf_relse(bp);
+		goto out_unlock_dirty;
+	}
+	xfs_dqfunlock(dqp);
+
+	/*
+	 * Prevent lookups now that we are past the point of no return.
+	 */
+	dqp->dq_flags |= XFS_DQ_FREEING;
+	xfs_dqunlock(dqp);
+
+	ASSERT(dqp->q_nrefs == 0);
+	list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
+	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+	trace_xfs_dqreclaim_done(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
+	return LRU_REMOVED;
+
+out_miss_busy:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+	return LRU_SKIP;
+
+out_unlock_dirty:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+	xfs_dqunlock(dqp);
+	spin_lock(lru_lock);
+	return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+	struct xfs_qm_isolate	isol;
+	unsigned long		freed;
+	int			error;
+
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+		return 0;
+
+	INIT_LIST_HEAD(&isol.buffers);
+	INIT_LIST_HEAD(&isol.dispose);
+
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);
+
+	error = xfs_buf_delwri_submit(&isol.buffers);
+	if (error)
+		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+	while (!list_empty(&isol.dispose)) {
+		struct xfs_dquot	*dqp;
+
+		dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+		list_del_init(&dqp->q_lru);
+		xfs_qm_dqfree_one(dqp);
+	}
+
+	return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+
+	return list_lru_shrink_count(&qi->qi_lru, sc);
+}
+
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -471,11 +606,18 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
+	if ((error = list_lru_init(&qinf->qi_lru))) {
+		kmem_free(qinf);
+		mp->m_quotainfo = NULL;
+		return error;
+	}
+
 	/*
 	 * See if quotainodes are setup, and if not, allocate them,
 	 * and change the superblock accordingly.
 	 */
 	if ((error = xfs_qm_init_quotainos(mp))) {
+		list_lru_destroy(&qinf->qi_lru);
 		kmem_free(qinf);
 		mp->m_quotainfo = NULL;
 		return error;
@@ -486,10 +628,6 @@ xfs_qm_init_quotainfo(
 	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
 	mutex_init(&qinf->qi_tree_lock);
 
-	INIT_LIST_HEAD(&qinf->qi_lru_list);
-	qinf->qi_lru_count = 0;
-	mutex_init(&qinf->qi_lru_lock);
-
 	/* mutex used to serialize quotaoffs */
 	mutex_init(&qinf->qi_quotaofflock);
 
@@ -554,8 +692,10 @@ xfs_qm_init_quotainfo(
 	if (XFS_IS_PQUOTA_RUNNING(mp))
 		xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
 
-	qinf->qi_shrinker.shrink = xfs_qm_shake;
+	qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&qinf->qi_shrinker);
 	return 0;
 }
@@ -576,6 +716,7 @@ xfs_qm_destroy_quotainfo(
 	ASSERT(qi != NULL);
 
 	unregister_shrinker(&qi->qi_shrinker);
+	list_lru_destroy(&qi->qi_lru);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1468,132 +1609,6 @@ xfs_qm_dqfree_one(
 	xfs_qm_dqdestroy(dqp);
 }
 
-STATIC void
-xfs_qm_dqreclaim_one(
-	struct xfs_dquot	*dqp,
-	struct list_head	*buffer_list,
-	struct list_head	*dispose_list)
-{
-	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	int			error;
-
-	if (!xfs_dqlock_nowait(dqp))
-		goto out_move_tail;
-
-	/*
-	 * This dquot has acquired a reference in the meantime remove it from
-	 * the freelist and try again.
-	 */
-	if (dqp->q_nrefs) {
-		xfs_dqunlock(dqp);
-
-		trace_xfs_dqreclaim_want(dqp);
-		XFS_STATS_INC(mp, xs_qm_dqwants);
-
-		list_del_init(&dqp->q_lru);
-		qi->qi_lru_count--;
-		XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-		return;
-	}
-
-	/*
-	 * Try to grab the flush lock. If this dquot is in the process of
-	 * getting flushed to disk, we don't want to reclaim it.
-	 */
-	if (!xfs_dqflock_nowait(dqp))
-		goto out_unlock_move_tail;
-
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		struct xfs_buf	*bp = NULL;
-
-		trace_xfs_dqreclaim_dirty(dqp);
-
-		error = xfs_qm_dqflush(dqp, &bp);
-		if (error) {
-			xfs_warn(mp, "%s: dquot %p flush failed",
-				 __func__, dqp);
-			goto out_unlock_move_tail;
-		}
-
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
-		/*
-		 * Give the dquot another try on the freelist, as the
-		 * flushing will take some time.
-		 */
-		goto out_unlock_move_tail;
-	}
-	xfs_dqfunlock(dqp);
-
-	/*
-	 * Prevent lookups now that we are past the point of no return.
-	 */
-	dqp->dq_flags |= XFS_DQ_FREEING;
-	xfs_dqunlock(dqp);
-
-	ASSERT(dqp->q_nrefs == 0);
-	list_move_tail(&dqp->q_lru, dispose_list);
-	qi->qi_lru_count--;
-	XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-
-	trace_xfs_dqreclaim_done(dqp);
-	XFS_STATS_INC(mp, xs_qm_dqreclaims);
-	return;
-
-	/*
-	 * Move the dquot to the tail of the list so that we don't spin on it.
-	 */
-out_unlock_move_tail:
-	xfs_dqunlock(dqp);
-out_move_tail:
-	list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(mp, xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
-	struct shrinker		*shrink,
-	struct shrink_control	*sc)
-{
-	struct xfs_quotainfo	*qi =
-		container_of(shrink, struct xfs_quotainfo, qi_shrinker);
-	int			nr_to_scan = sc->nr_to_scan;
-	LIST_HEAD		(buffer_list);
-	LIST_HEAD		(dispose_list);
-	struct xfs_dquot	*dqp;
-	int			error;
-
-	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
-		return 0;
-	if (!nr_to_scan)
-		goto out;
-
-	mutex_lock(&qi->qi_lru_lock);
-	while (!list_empty(&qi->qi_lru_list)) {
-		if (nr_to_scan-- <= 0)
-			break;
-		dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
-				       q_lru);
-		xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
-	}
-	mutex_unlock(&qi->qi_lru_lock);
-
-	error = xfs_buf_delwri_submit(&buffer_list);
-	if (error)
-		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
-	while (!list_empty(&dispose_list)) {
-		dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
-		list_del_init(&dqp->q_lru);
-		xfs_qm_dqfree_one(dqp);
-	}
-
-out:
-	return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
 /* --------------- utility functions for vnodeops ---------------- */
 
 
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -74,9 +74,7 @@ typedef struct xfs_quotainfo {
 	struct xfs_inode	*qi_uquotaip;	/* user quota inode */
 	struct xfs_inode	*qi_gquotaip;	/* group quota inode */
 	struct xfs_inode	*qi_pquotaip;	/* project quota inode */
-	struct list_head qi_lru_list;
-	struct mutex	 qi_lru_lock;
-	int		 qi_lru_count;
+	struct list_lru	 qi_lru;
 	int		 qi_dquots;
 	time_t		 qi_btimelimit;	 /* limit for blks timer */
 	time_t		 qi_itimelimit;	 /* limit for inodes timer */
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1695,19 +1695,20 @@ xfs_fs_mount(
 	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
-static int
+static long
 xfs_fs_nr_cached_objects(
-	struct super_block	*sb)
+	struct super_block	*sb,
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
 
-static void
+static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	int			nr_to_scan)
+	struct shrink_control	*sc)
 {
-	xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
@@ -1732,7 +1733,7 @@ static struct file_system_type xfs_fs_type = {
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV | FS_HAS_RM_XQUOTA |
 				  FS_HAS_INVALIDATE_RANGE | FS_HAS_DIO_IODONE2 |
-				  FS_HAS_NEXTDQBLK,
+				  FS_HAS_NEXTDQBLK | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("xfs");
 
@@ -1802,8 +1803,8 @@ xfs_init_zones(void)
 
 	xfs_inode_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
-			xfs_fs_inode_init_once);
+			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+			KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
 	if (!xfs_inode_zone)
 		goto out_destroy_efi_zone;
 
--- a/include/asm-generic/kexec.h
+++ b/include/asm-generic/kexec.h
@@ -4,7 +4,11 @@
 #ifdef CONFIG_KEXEC_AUTO_RESERVE
 
 #ifndef KEXEC_AUTO_RESERVED_SIZE
+#ifndef CONFIG_KASAN
 #define KEXEC_AUTO_RESERVED_SIZE ((1ULL<<27) + (1ULL<<25)) /* 160M */
+#else
+#define KEXEC_AUTO_RESERVED_SIZE ((1ULL<<28) + (1ULL<<26)) /* 320M */
+#endif
 #endif
 #ifndef KEXEC_AUTO_THRESHOLD
 #define KEXEC_AUTO_THRESHOLD (1ULL<<31) /* 2G */
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -449,21 +449,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
 	return pte;
 }
-
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-       return pte;
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-       return pte;
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-       return 0;
-}
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -454,7 +454,7 @@
 		*(.entry.text)						\
 		VMLINUX_SYMBOL(__entry_text_end) = .;
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
 		VMLINUX_SYMBOL(__irqentry_text_start) = .;		\
@@ -464,6 +464,16 @@
 #define IRQENTRY_TEXT
 #endif
 
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+#define SOFTIRQENTRY_TEXT						\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__softirqentry_text_start) = .;		\
+		*(.softirqentry.text)					\
+		VMLINUX_SYMBOL(__softirqentry_text_end) = .;
+#else
+#define SOFTIRQENTRY_TEXT
+#endif
+
 /* Section used for early init (in .S files) */
 #define HEAD_TEXT  *(.head.text)
 
@@ -496,6 +506,7 @@
 #define KERNEL_CTORS()	. = ALIGN(8);			   \
 			VMLINUX_SYMBOL(__ctors_start) = .; \
 			*(.ctors)			   \
+			*(SORT(.init_array.*))		   \
 			*(.init_array)			   \
 			VMLINUX_SYMBOL(__ctors_end) = .;
 #else
--- /dev/null
+++ b/include/bc/beancounter.h
@@ -0,0 +1,493 @@
+/*
+ *  include/bc/beancounter.h
+ *
+ *  Copyright (c) 1999-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Andrey Savochkin	saw@sw-soft.com
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
+#include <linux/cgroup.h>
+#include <bc/decl.h>
+#include <asm/atomic.h>
+
+#include <uapi/linux/beancounter.h>
+
+/*
+ * This magic is used to distinguish the user beancounter and the pages
+ * beancounter in struct page. page_ub and page_bc are placed in a union and
+ * the MAGIC ensures that we don't use a pbc as a ubc in ub_page_uncharge().
+ */
+#define UB_MAGIC		0x62756275
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ *	Resource management structures
+ * Serialization issues:
+ *   beancounter list management is protected via ub_hash_lock
+ *   task pointers are set only for current task and only once
+ *   refcount is managed atomically
+ *   value and limit comparison and change are protected by per-ub spinlock
+ */
+
+struct task_beancounter;
+
+struct ub_percpu_struct {
+	int dirty_pages;
+	int writeback_pages;
+	int wb_requests;
+	int wb_sectors;
+
+	unsigned long fuse_requests;
+	unsigned long fuse_bytes;
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	unsigned long async_write_complete;
+	unsigned long async_write_canceled;
+	unsigned long long sync_write_bytes;
+	unsigned long long sync_read_bytes;
+#endif
+	unsigned long	sync;
+	unsigned long	sync_done;
+
+	unsigned long	fsync;
+	unsigned long	fsync_done;
+
+	unsigned long	fdsync;
+	unsigned long	fdsync_done;
+
+	unsigned long	frsync;
+	unsigned long	frsync_done;
+
+	/* percpu resource precharge */
+	int	precharge[UB_RESOURCES];
+};
+
+enum {
+	UB_MEM_CGROUP,
+	UB_BLKIO_CGROUP,
+	NR_UB_BOUND_CGROUPS,
+};
+
+struct user_beancounter {
+	struct cgroup_subsys_state css;
+
+	struct cgroup_subsys_state *ub_bound_css[NR_UB_BOUND_CGROUPS];
+
+	unsigned long		ub_magic;
+	struct list_head	ub_list;
+
+	spinlock_t		ub_lock;
+	const char		*ub_name;
+
+	struct ratelimit_state	ub_ratelimit;
+
+	atomic_long_t		dirty_pages;
+	atomic_long_t		writeback_pages;
+	atomic_long_t		wb_requests;
+	atomic_long_t		wb_sectors;
+
+	unsigned long		swapin;
+	unsigned long		swapout;
+
+	void			*iolimit;
+
+	/* resources statistic and settings */
+	struct ubparm		ub_parms[UB_RESOURCES];
+	/* resources statistic for last interval */
+	struct ubparm		*ub_store;
+
+	struct ub_percpu_struct	*ub_percpu;
+};
+
+extern int ub_count;
+
+enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };
+
+#define UB_TEST	0x100
+#define UB_SEV_FLAGS	UB_TEST
+
+extern struct cgroup_subsys ub_subsys;
+static inline struct user_beancounter *cgroup_ub(struct cgroup *cg)
+{
+	return container_of(cgroup_subsys_state(cg, ub_subsys_id),
+			    struct user_beancounter, css);
+}
+
+extern struct cgroup_subsys_state *
+__ub_get_css(struct user_beancounter *ub, int idx);
+
+static inline struct cgroup_subsys_state *
+ub_get_mem_css(struct user_beancounter *ub)
+{
+	return __ub_get_css(ub, UB_MEM_CGROUP);
+}
+
+static inline struct cgroup_subsys_state *
+ub_get_blkio_css(struct user_beancounter *ub)
+{
+	return __ub_get_css(ub, UB_BLKIO_CGROUP);
+}
+
+static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
+{
+	return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
+}
+
+static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
+{
+	return (ub->ub_parms[resource].held > 
+		((ub->ub_parms[resource].barrier) >> 1));
+}
+
+static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3);
+}
+
+static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024;
+}
+
+static inline unsigned long ub_resource_bound(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	switch (strict) {
+		case UB_HARD:
+			return ub->ub_parms[resource].barrier;
+		case UB_SOFT:
+			return ub->ub_parms[resource].limit;
+		case UB_FORCE:
+			return UB_MAXVALUE;
+		default:
+			{
+				extern int no_such_severity(void);
+				return no_such_severity();
+			}
+	}
+}
+
+static inline unsigned long ub_resource_excess(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	unsigned long held, bound;
+
+	held = ub->ub_parms[resource].held;
+	bound = ub_resource_bound(ub, resource, strict);
+	if (likely(held < bound))
+		return bound - held;
+	return 0;
+}
+
+#ifndef CONFIG_BEANCOUNTERS
+
+#define ub_percpu(ub, cpu)		(NULL)
+#define __ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_add(ub, f, v)	do { } while (0)
+#define ub_percpu_sub(ub, f, v)	do { } while (0)
+#define ub_percpu_inc(ub, f)	do { } while (0)
+#define ub_percpu_dec(ub, f)	do { } while (0)
+
+#define mm_ub(mm)	(NULL)
+
+#define for_each_beancounter(__ubp)	while (0)
+
+extern inline struct user_beancounter *get_beancounter_by_name
+		(const char *name, int create) { return NULL; }
+extern inline struct user_beancounter *get_beancounter_byuid
+		(uid_t uid, int create) { return NULL; }
+extern inline struct user_beancounter *get_beancounter
+		(struct user_beancounter *ub) { return NULL; }
+extern inline void put_beancounter(struct user_beancounter *ub) { }
+
+static inline uid_t ub_legacy_id(struct user_beancounter *ub) { return -1; }
+
+static inline void ub_init_late(void) { };
+static inline void ub_init_early(void) { };
+
+static inline int charge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val,
+			enum ub_severity strict) { return 0; }
+#define charge_beancounter_fast charge_beancounter
+static inline void uncharge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val) { }
+#define uncharge_beancounter_fast uncharge_beancounter
+
+#else /* CONFIG_BEANCOUNTERS */
+
+extern struct list_head ub_list_head;
+
+#define for_each_beancounter(__ubp) \
+	list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list)
+
+#define ub_percpu(ub, cpu) (per_cpu_ptr((ub)->ub_percpu, (cpu)))
+
+#define __ub_percpu_sum(ub, field)	({			\
+		struct user_beancounter *__ub = (ub);		\
+		typeof(ub_percpu(__ub, 0)->field) __sum = 0;	\
+		int __cpu;					\
+		for_each_possible_cpu(__cpu)			\
+			__sum += ub_percpu(__ub, __cpu)->field;	\
+		__sum;						\
+	})
+
+#define ub_percpu_sum(ub, field)	({			\
+		long __sum = __ub_percpu_sum(ub, field);	\
+		(__sum < 0) ? 0 : __sum;			\
+	})
+
+#define ub_percpu_add(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1)
+
+#define ub_percpu_sub(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1)
+
+#define mm_ub(mm)	((mm)->mm_ub)
+/*
+ *  Charge/uncharge operations
+ */
+
+extern int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict);
+
+extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val);
+
+extern void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held);
+
+extern int ub_update_memcg(struct user_beancounter *ub);
+extern void ub_sync_memcg(struct user_beancounter *ub);
+extern unsigned long ub_total_pages(struct user_beancounter *ub, bool swap);
+
+extern const char *ub_rnames[];
+/*
+ *	Put a beancounter reference
+ */
+
+static inline void put_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return;
+
+	css_put(&ub->css);
+}
+
+/*
+ *	Create a new beancounter reference
+ */
+extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
+extern struct user_beancounter *get_beancounter_by_name(const char *name,
+							int create);
+
+static inline 
+struct user_beancounter *get_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return NULL;
+
+	css_get(&ub->css);
+	return ub;
+}
+
+static inline 
+struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub)
+{
+	return css_tryget(&ub->css) ? ub : NULL;
+}
+
+extern uid_t ub_legacy_id(struct user_beancounter *ub);
+
+extern void ub_init_late(void);
+extern void ub_init_early(void);
+
+#define UB_STAT_BATCH	64
+
+static inline void __ub_stat_add(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu + val <= UB_STAT_BATCH)
+		*pcpu += val;
+	else {
+		atomic_long_add(*pcpu + val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_sub(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu - val >= -UB_STAT_BATCH)
+		*pcpu -= val;
+	else {
+		atomic_long_add(*pcpu - val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_flush_pcpu(atomic_long_t *stat, int *pcpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	atomic_long_add(*pcpu, stat);
+	*pcpu = 0;
+	local_irq_restore(flags);
+}
+
+#define ub_stat_add(ub, name, val)	__ub_stat_add(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_sub(ub, name, val)	__ub_stat_sub(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_inc(ub, name)		ub_stat_add(ub, name, 1)
+#define ub_stat_dec(ub, name)		ub_stat_sub(ub, name, 1)
+#define ub_stat_mod(ub, name, val)	atomic_long_add(val, &(ub)->name)
+#define __ub_stat_get(ub, name)		atomic_long_read(&(ub)->name)
+#define ub_stat_get(ub, name)		max(0l, atomic_long_read(&(ub)->name))
+#define ub_stat_get_exact(ub, name)	max(0l, __ub_stat_get(ub, name) + __ub_percpu_sum(ub, name))
+#define ub_stat_flush_pcpu(ub, name)	__ub_stat_flush_pcpu(&(ub)->name, &(ub)->ub_percpu->name)
+
+int ubstat_alloc_store(struct user_beancounter *ub);
+
+/*
+ *	Resource charging
+ * Change user's account and compare against limits
+ */
+
+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
+{
+	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+int charge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val, enum ub_severity strict);
+void uncharge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val);
+
+extern int ub_resource_precharge[UB_RESOURCES];
+void init_beancounter_precharge(struct user_beancounter *ub, int resource);
+
+static inline int __try_charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val)) {
+		ub_pcpu->precharge[resource] -= val;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+static inline int __try_uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] + val <=
+				ub->ub_parms[resource].max_precharge)) {
+		ub_pcpu->precharge[resource] += val;
+		return 0;
+	}
+
+	return -E2BIG;
+}
+
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict);
+
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val);
+
+static inline int charge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+	int retval = 0;
+
+	if (val > UB_MAXVALUE)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_charge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		retval = __charge_beancounter_percpu(ub, ub_pcpu, resource,
+							val, strict);
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+static inline void uncharge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_uncharge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		__uncharge_beancounter_percpu(ub, ub_pcpu, resource, val);
+	local_irq_restore(flags);
+}
+
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource);
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val);
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge);
+
+#define UB_IOPRIO_MIN 0
+#define UB_IOPRIO_MAX 8
+
+#endif /* CONFIG_BEANCOUNTERS */
+
+#ifdef CONFIG_BC_IO_PRIORITY
+extern int ub_set_ioprio(int id, int ioprio);
+#else
+static inline int ub_set_ioprio(int veid, int ioprio) { return -EINVAL; }
+#endif
+
+#endif /* _LINUX_BEANCOUNTER_H */
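
charge_beancounter_fast()/uncharge_beancounter_fast() above try to satisfy a charge from the per-cpu precharge first and only fall back to the locked slow path when the precharge is exhausted (or would overflow on uncharge). A hedged usage sketch; UB_NUMFILE is assumed to be one of the UB_RESOURCES indices from the uapi header, which is not shown here:

	static int demo_charge_one(struct user_beancounter *ub)
	{
		int err;

		err = charge_beancounter_fast(ub, UB_NUMFILE, 1, UB_HARD);
		if (err)
			return err;	/* barrier hit, charge refused */

		/* ... create the object being accounted ... */

		uncharge_beancounter_fast(ub, UB_NUMFILE, 1);
		return 0;
	}
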
--- /dev/null
+++ b/include/bc/decl.h
@@ -0,0 +1,39 @@
+/*
+ *  include/bc/decl.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_DECL_H_
+#define __BC_DECL_H_
+
+#ifdef __KERNEL__
+
+/*
+ * Naming convention:
+ * ub_<section|object>_<operation>
+ */
+
+#ifdef CONFIG_BEANCOUNTERS
+
+#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
+#define UB_DECLARE_VOID_FUNC(decl)	extern void decl;
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define UB_DECLARE_FUNC(ret_type, decl)		\
+	static inline ret_type decl		\
+	{					\
+		return (ret_type)0;		\
+	}
+#define UB_DECLARE_VOID_FUNC(decl)		\
+	static inline void decl			\
+	{					\
+	}
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif
+
+#endif
--- /dev/null
+++ b/include/bc/io_acct.h
@@ -0,0 +1,124 @@
+/*
+ *  include/bc/io_acct.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifndef __UB_IO_ACCT_H_
+#define __UB_IO_ACCT_H_
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+#include <bc/beancounter.h>
+#include <linux/virtinfo.h>
+
+extern int ub_dirty_ratio;
+extern int ub_dirty_background_ratio;
+
+/*
+ * The IO ub is required in task context only, so if exec_ub is set
+ * to NULL it means that the user doesn't need to charge any
+ * resources. Nevertheless, IO activity must be accounted, so we
+ * account it to the current task's beancounter.
+ */
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+	struct user_beancounter *ub;
+
+	ub = get_exec_ub();
+	if (unlikely(ub == NULL))
+		ub = get_task_ub(current);
+
+	return ub;
+}
+
+static inline void ub_io_account_read(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_read_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_write_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+extern void ub_io_account_dirty(struct address_space *mapping);
+extern void ub_io_account_clean(struct address_space *mapping);
+extern void ub_io_account_cancel(struct address_space *mapping);
+extern void ub_io_writeback_inc(struct address_space *mapping);
+extern void ub_io_writeback_dec(struct address_space *mapping);
+
+extern int ub_dirty_limits(unsigned long *pbackground,
+			   long *pdirty, struct user_beancounter *ub);
+extern bool ub_over_bground_thresh(void);
+extern bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode);
+
+static inline void ub_writeback_io(unsigned long requests, unsigned long sectors)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	ub_stat_add(ub, wb_requests, requests);
+	ub_stat_add(ub, wb_sectors, sectors);
+}
+
+#else /* UBC_IO_ACCT */
+
+static inline void ub_io_account_read(size_t bytes)
+{
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+}
+
+static inline void ub_io_account_dirty(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_clean(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_cancel(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_inc(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_dec(struct address_space *mapping)
+{
+}
+
+static inline int ub_dirty_limits(unsigned long *pbackground,
+				  long *pdirty, struct user_beancounter *ub)
+{
+	return 0;
+}
+
+static inline bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode)
+{
+	return false;
+}
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+	return NULL;
+}
+
+static inline bool ub_over_bground_thresh(void)
+{
+	return false;
+}
+
+#endif /* UBC_IO_ACCT */
+
+#endif
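
The read/write accounting helpers above are meant to be called from synchronous IO paths with the number of bytes actually transferred. A sketch of how a caller might use them; demo_do_read() is a hypothetical stand-in for the real read path:

	static ssize_t demo_read(struct file *file, char __user *buf,
				 size_t len, loff_t *pos)
	{
		ssize_t ret = demo_do_read(file, buf, len, pos);

		if (ret > 0)
			ub_io_account_read(ret);	/* charge bytes actually read */
		return ret;
	}
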
--- /dev/null
+++ b/include/bc/misc.h
@@ -0,0 +1,44 @@
+/*
+ *  include/bc/misc.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_MISC_H_
+#define __BC_MISC_H_
+
+#include <bc/decl.h>
+
+struct tty_struct;
+struct file;
+struct file_lock;
+struct sigqueue;
+
+UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
+UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
+UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
+UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
+UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q,
+			struct user_beancounter *ub, gfp_t gfp_mask))
+UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q))
+UB_DECLARE_FUNC(int, ub_task_charge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_get(struct user_beancounter *ub,
+			struct task_struct *task))
+UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task))
+UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
+UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
+
+#ifdef CONFIG_BEANCOUNTERS
+#define set_flock_charged(fl)	do { (fl)->fl_charged = 1; } while (0)
+#define unset_flock_charged(fl)	do {		\
+		WARN_ON((fl)->fl_charged == 0);	\
+		(fl)->fl_charged = 0;		\
+	} while (0)
+#else
+#define set_flock_charged(fl)	do { } while (0)
+#define unset_flock_charged(fl)	do { } while (0)
+#endif
+#endif
--- /dev/null
+++ b/include/bc/proc.h
@@ -0,0 +1,38 @@
+/*
+ *  include/bc/proc.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __UB_PROC_H_
+#define __UB_PROC_H_
+
+#include <linux/seq_file.h>
+
+struct bc_proc_entry {
+	char *name;
+	union {
+		int (*show)(struct seq_file *, void *);
+		struct file_operations *fops;
+	} u;
+	struct bc_proc_entry *next;
+	int cookie;
+};
+
+struct user_beancounter;
+
+void bc_register_proc_entry(struct bc_proc_entry *);
+void bc_register_proc_root_entry(struct bc_proc_entry *);
+
+static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
+{
+	return (struct user_beancounter *)(f->private);
+}
+
+extern const char *bc_proc_lu_fmt;
+extern const char *bc_proc_lu_lfmt;
+extern const char *bc_proc_llu_fmt;
+extern const char *bc_proc_lu_lu_fmt;
+#endif
--- /dev/null
+++ b/include/bc/task.h
@@ -0,0 +1,49 @@
+/*
+ *  include/bc/task.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_TASK_H_
+#define __BC_TASK_H_
+
+struct user_beancounter;
+struct callback_head;
+
+#ifdef CONFIG_BEANCOUNTERS
+struct task_beancounter {
+	struct user_beancounter	*exec_ub;
+	struct user_beancounter	*task_ub;
+	struct callback_head cgroup_attach_work;
+};
+
+extern int ub_attach_task(struct user_beancounter *, struct task_struct *);
+
+#define get_task_ub(__task)	((__task)->task_bc.task_ub)
+
+extern struct user_beancounter ub0;
+#define get_ub0()	(&ub0)
+
+#define get_exec_ub()		(current->task_bc.exec_ub)
+#define set_exec_ub(__newub)		\
+({					\
+	struct user_beancounter *old;	\
+	struct task_beancounter *tbc;	\
+ 					\
+	tbc = &current->task_bc;	\
+	old = tbc->exec_ub;		\
+	tbc->exec_ub = __newub;		\
+	old;				\
+})
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define get_ub0()		(NULL)
+#define get_exec_ub()		(NULL)
+#define get_task_ub(task)	(NULL)
+#define set_exec_ub(__ub)	(NULL)
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __BC_TASK_H_ */
--- /dev/null
+++ b/include/bc/vmpages.h
@@ -0,0 +1,52 @@
+/*
+ *  include/bc/vmpages.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __UB_PAGES_H_
+#define __UB_PAGES_H_
+
+#include <linux/linkage.h>
+#include <linux/sched.h>	/* for get_exec_ub() */
+#include <linux/mm.h>
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+extern int ub_overcommit_memory;
+
+/*
+ * Check whether vma has private or copy-on-write mapping.
+ */
+#define VM_UB_PRIVATE(__flags, __file)					\
+		( ((__flags) & VM_WRITE) ?				\
+			(__file) == NULL || !((__flags) & VM_SHARED) :	\
+			0						\
+		)
+
+UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file,
+			int strict))
+UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file))
+
+struct shmem_inode_info;
+
+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
+			unsigned long size))
+
+UB_DECLARE_FUNC(int, ub_enough_memory(struct mm_struct *mm, long pages))
+
+#endif /* __UB_PAGES_H_ */
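
A hedged sketch of how the charge/uncharge pair above is typically used around address-space growth; demo_grow_mapping() and demo_do_grow() are hypothetical, and the size argument is assumed to be in bytes:

	static int demo_grow_mapping(struct vm_area_struct *vma, unsigned long grow)
	{
		if (ub_memory_charge(vma->vm_mm, grow, vma->vm_flags,
				     vma->vm_file, UB_SOFT))
			return -ENOMEM;

		if (demo_do_grow(vma, grow)) {	/* hypothetical expansion step */
			ub_memory_uncharge(vma->vm_mm, grow, vma->vm_flags,
					   vma->vm_file);
			return -ENOMEM;
		}
		return 0;
	}
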
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -31,6 +31,7 @@ struct alg_sock {
 	struct sock *parent;
 
 	unsigned int refcnt;
+	unsigned int nokey_refcnt;
 
 	const struct af_alg_type *type;
 	void *private;
--- a/include/drm/drm_backport.h
+++ b/include/drm/drm_backport.h
@@ -13,6 +13,8 @@
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/console.h>
+#include <linux/rwsem.h>
+#include <linux/mm.h>
 
 /**
  * ktime_mono_to_real - Convert monotonic time to clock realtime
@@ -48,58 +50,6 @@ static inline void get_monotonic_boottime64(struct timespec64 *ts)
 #define module_param_unsafe(name, type, perm)			\
 	module_param(name, type, perm)
 
-/*
- *
- */
-
-#include <linux/mm.h>
-
-#define SHRINK_STOP (~0UL)
-/*
- * A callback you can register to apply pressure to ageable caches.
- *
- * @count_objects should return the number of freeable items in the cache. If
- * there are no objects to free or the number of freeable items cannot be
- * determined, it should return 0. No deadlock checks should be done during the
- * count callback - the shrinker relies on aggregating scan counts that couldn't
- * be executed due to potential deadlocks to be run at a later call when the
- * deadlock condition is no longer pending.
- *
- * @scan_objects will only be called if @count_objects returned a non-zero
- * value for the number of freeable objects. The callout should scan the cache
- * and attempt to free items from the cache. It should then return the number
- * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
- * due to potential deadlocks. If SHRINK_STOP is returned, then no further
- * attempts to call the @scan_objects will be made from the current reclaim
- * context.
- *
- * @flags determine the shrinker abilities, like numa awareness
- */
-struct shrinker2 {
-	unsigned long (*count_objects)(struct shrinker2 *,
-				       struct shrink_control *sc);
-	unsigned long (*scan_objects)(struct shrinker2 *,
-				      struct shrink_control *sc);
-
-	int seeks;	/* seeks to recreate an obj */
-	long batch;	/* reclaim batch size, 0 = default */
-	unsigned long flags;
-
-	/* These are for internal use */
-	struct list_head list;
-	/* objs pending delete, per node */
-	atomic_long_t *nr_deferred;
-
-	/* compat: */
-	struct shrinker compat;
-};
-int register_shrinker2(struct shrinker2 *shrinker);
-void unregister_shrinker2(struct shrinker2 *shrinker);
-
-#define shrinker            shrinker2
-#define register_shrinker   register_shrinker2
-#define unregister_shrinker unregister_shrinker2
-
 /*
  *
  */
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -46,5 +46,9 @@ extern void user_describe(const struct key *user, struct seq_file *m);
 extern long user_read(const struct key *key,
 		      char __user *buffer, size_t buflen);
 
+static inline const struct user_key_payload *user_key_payload(const struct key *key)
+{
+	return (struct user_key_payload *)rcu_dereference_key(key);
+}
 
 #endif /* _KEYS_USER_TYPE_H */
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -14,6 +14,16 @@ struct kiocb;
 
 #define KIOCB_KEY		0
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
+struct ve_ioc_arg
+{
+	aio_context_t	ctx_id;
+	unsigned	val;
+};
+
+#define VE_AIO_IOC_WAIT_ACTIVE	_IOW('a',  1, struct ve_ioc_arg)
+
 /*
  * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
  * cancelled or completed (this makes a certain amount of sense because
@@ -40,6 +50,7 @@ struct kiocb {
 	union {
 		void __user		*user;
 		struct task_struct	*tsk;
+		void			(*complete)(u64 user_data, long res);
 	} ki_obj;
 
 	__u64			ki_user_data;	/* user's data for completion */
@@ -64,6 +75,7 @@ struct kiocb {
 	 * this is the underlying eventfd context to deliver events to.
 	 */
 	struct eventfd_ctx	*ki_eventfd;
+	struct iov_iter		*ki_iter;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -71,6 +83,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
 	return kiocb->ki_ctx == NULL;
 }
 
+static inline bool is_kernel_kiocb(struct kiocb *kiocb)
+{
+	return kiocb->ki_ctx == (void *)-1;
+}
+
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
 	*kiocb = (struct kiocb) {
@@ -91,6 +108,18 @@ extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+struct kiocb *aio_kernel_alloc(gfp_t gfp);
+void aio_kernel_free(struct kiocb *iocb);
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off);
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data);
+int aio_kernel_submit(struct kiocb *iocb);
+#ifdef CONFIG_VE
+int ve_aio_ioctl(struct task_struct *, unsigned int, unsigned long);
+#endif
+
 #else
 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
 static inline void aio_put_req(struct kiocb *iocb) { }
@@ -102,6 +131,8 @@ static inline long do_io_submit(aio_context_t ctx_id, long nr,
 				bool compat) { return 0; }
 static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
+static inline int ve_aio_ioctl(struct task_struct *task, unsigned int cmd,
+			unsigned long arg) { return 0; }
 #endif /* CONFIG_AIO */
 
 static inline struct kiocb *list_kiocb(struct list_head *h)
@@ -109,8 +140,4 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
 	return list_entry(h, struct kiocb, ki_list);
 }
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
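/*
 * Illustrative sketch, not part of this patch: intended flow of the new
 * in-kernel AIO helpers declared above.  The opcode, completion callback and
 * ownership of the iocb on the error path are assumptions for illustration.
 */
#include <linux/aio.h>
#include <linux/aio_abi.h>

static void example_aio_done(u64 user_data, long res)
{
	pr_debug("kernel aio %llu completed with %ld\n",
		 (unsigned long long)user_data, res);
}

static int example_submit_kernel_read(struct file *filp, struct iov_iter *iter,
				      loff_t pos)
{
	struct kiocb *iocb;
	int ret;

	iocb = aio_kernel_alloc(GFP_KERNEL);
	if (!iocb)
		return -ENOMEM;

	aio_kernel_init_iter(iocb, filp, IOCB_CMD_PREAD, iter, pos);
	aio_kernel_init_callback(iocb, example_aio_done, 0);

	ret = aio_kernel_submit(iocb);
	if (ret)
		aio_kernel_free(iocb);	/* assumption: caller frees on submit failure */
	return ret;
}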
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -58,6 +58,7 @@ struct bdi_writeback {
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
 	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
@@ -67,7 +68,10 @@ struct backing_dev_info {
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	congested_fn *congested_fn2; /* use per-bdi waitq */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	int (*bd_full_fn) (struct backing_dev_info *, long long, int);
+	int bd_full; /* backing dev is full */
 
 	char *name;
 
@@ -94,6 +98,9 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
+	unsigned int min_dirty_pages;
+	unsigned int max_dirty_pages;
+
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list & wb.dwork scheduling */
 
@@ -103,6 +110,8 @@ struct backing_dev_info {
 
 	struct timer_list laptop_mode_wb_timer;
 
+	wait_queue_head_t cong_waitq; /* to wait on congestion */
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
@@ -121,6 +130,8 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+			enum wb_reason reason, struct user_beancounter *ub);
 void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
@@ -216,6 +227,8 @@ static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned int min_dirty);
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned int max_dirty);
 
 /*
  * Flags in backing_dev_info::capability
@@ -308,6 +321,30 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << BDI_async_congested));
 }
 
+/* congestion helpers for block-devices supporting per-bdi waitq */
+static inline int bdi_congested2(struct backing_dev_info *bdi, int bdi_bits)
+{
+	if (bdi->congested_fn2)
+		return bdi->congested_fn2(bdi->congested_data, bdi_bits);
+	return 0;
+}
+
+static inline int bdi_read_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_sync_congested);
+}
+
+static inline int bdi_write_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_async_congested);
+}
+
+static inline int bdi_rw_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, (1 << BDI_sync_congested) |
+				  (1 << BDI_async_congested));
+}
+
 enum {
 	BLK_RW_ASYNC	= 0,
 	BLK_RW_SYNC	= 1,
@@ -357,6 +394,11 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_account_writeback(struct address_space *mapping)
+{
+	return bdi_cap_account_writeback(mapping->backing_dev_info);
+}
+
 static inline bool mapping_cap_swap_backed(struct address_space *mapping)
 {
 	return bdi_cap_swap_backed(mapping->backing_dev_info);
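/*
 * Illustrative sketch, not part of this patch: waiting on the new per-bdi
 * congestion machinery (congested_fn2 + cong_waitq).  The timeout and the
 * assumption that the driver wakes cong_waitq when it uncongests are
 * illustrative only.
 */
#include <linux/backing-dev.h>
#include <linux/wait.h>

static long example_wait_for_writeback_room(struct backing_dev_info *bdi)
{
	if (!bdi->congested_fn2)
		return 1;	/* device does not use the per-bdi waitq scheme */

	/* Sleep until the device reports it is no longer write-congested. */
	return wait_event_timeout(bdi->cong_waitq,
				  !bdi_write_congested2(bdi),
				  HZ / 10);	/* 100ms poll is an assumption */
}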
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -28,6 +28,14 @@ struct bio_vec {
 	unsigned int	bv_offset;
 };
 
+static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr)
+{
+	ssize_t bytes = 0;
+	while (nr--)
+		bytes += (bvec++)->bv_len;
+	return bytes;
+}
+
 /*
  * RHEL7 auxillary shadow structure used to extend 'struct bio' without
  * breaking RHEL kABI -- bio_init_aux() must be used to set bio->bio_aux
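/*
 * Illustrative sketch, not part of this patch: bvec_length() just sums bv_len
 * over an array of segments.  The two-segment setup below is made up.
 */
#include <linux/blk_types.h>
#include <linux/mm.h>

static ssize_t example_bvec_payload(struct page *p0, struct page *p1)
{
	struct bio_vec vec[2] = {
		{ .bv_page = p0, .bv_len = PAGE_SIZE, .bv_offset = 0 },
		{ .bv_page = p1, .bv_len = 512,       .bv_offset = 0 },
	};

	return bvec_length(vec, ARRAY_SIZE(vec));	/* PAGE_SIZE + 512 */
}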
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -519,6 +519,9 @@ struct request_queue {
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	/* Throttle data */
 	struct throtl_data *td;
+#endif
+#ifdef CONFIG_BLK_DEV_CBT
+	struct cbt_info	*cbt;
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
@@ -1790,6 +1793,34 @@ struct blk_dax_ctl {
 	pfn_t pfn;
 };
 
+#if defined (CONFIG_BLK_DEV_CBT)
+extern void blk_cbt_update_size(struct block_device *bdev);
+extern void blk_cbt_release(struct request_queue *q);
+extern void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio);
+extern int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg);
+extern int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
+				 struct page ***map_ptr, blkcnt_t *block_max,
+				 blkcnt_t *block_bits);
+extern int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
+			     struct page **map, blkcnt_t block_max,
+			     blkcnt_t block_bits);
+#else /* CONFIG_BLK_DEV_CBT */
+static inline void blk_cbt_update_size(struct block_device *bdev)
+{
+}
+static inline void blk_cbt_release(struct request_queue *q)
+{
+}
+static inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+}
+static inline int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd,
+				 char __user *arg)
+{
+	return 0;
+}
+#endif /* CONFIG_BLK_DEV_CBT */
+
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -34,7 +34,6 @@ struct cpu_vfs_cap_data {
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
-
 struct file;
 struct inode;
 struct dentry;
@@ -45,6 +44,10 @@ struct user_namespace *current_user_ns(void);
 extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+#include <linux/spinlock_types.h>
+
+extern spinlock_t task_capability_lock;
+
 /*
  * Internal kernel functions only
  */
@@ -213,6 +216,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
+extern bool ve_capable(int cap);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUP_PIDS
 void cgroup_pids_release(struct task_struct *task);
@@ -36,6 +37,33 @@ struct cgroup;
 struct css_id;
 struct eventfd_ctx;
 
+struct cgroup_sb_opts {
+	unsigned long subsys_mask;
+	unsigned long flags;
+	char *release_agent;
+	bool cpuset_clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+
+	struct cgroupfs_root *new_root;
+
+};
+
+enum cgroup_open_flags {
+	CGRP_CREAT	= 0x0001,	/* create if not found */
+	CGRP_EXCL	= 0x0002,	/* fail if already exist */
+};
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts);
+struct cgroup *cgroup_get_root(struct vfsmount *mnt);
+struct cgroup *cgroup_kernel_lookup(struct vfsmount *mnt,
+				    const char *pathname);
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, const char *name);
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk);
+void cgroup_kernel_close(struct cgroup *cgrp);
+
 /*
  * Define the enumeration of all cgroup subsystems.
  *
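/*
 * Illustrative sketch, not part of this patch: how the cgroup_kernel_*
 * helpers declared above fit together.  The hierarchy options, cgroup name
 * and unmount handling are assumptions for illustration only.
 */
#include <linux/err.h>
#include <linux/mount.h>

static int example_attach_to_kernel_cgroup(struct task_struct *tsk)
{
	struct cgroup_sb_opts opts = {
		.name = "example",
		.none = true,		/* assumption: named hierarchy, no controllers */
	};
	struct vfsmount *mnt;
	struct cgroup *root, *cgrp;
	int err;

	mnt = cgroup_kernel_mount(&opts);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	root = cgroup_get_root(mnt);
	cgrp = cgroup_kernel_open(root, CGRP_CREAT, "example-group");
	if (IS_ERR(cgrp)) {
		err = PTR_ERR(cgrp);
		goto out;
	}

	err = cgroup_kernel_attach(cgrp, tsk);
	cgroup_kernel_close(cgrp);
out:
	kern_unmount(mnt);	/* assumption: mount reference dropped by caller */
	return err;
}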
@@ -98,13 +126,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -120,12 +143,6 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
 /*
  * Call css_get() to hold a reference on the css; it can be used
  * for a reference obtained via:
@@ -137,7 +154,7 @@ static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		percpu_ref_get(&css->refcnt);
 }
 
 /*
@@ -146,12 +163,11 @@ static inline void css_get(struct cgroup_subsys_state *css)
  * the css has been destroyed.
  */
 
-extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
 /*
@@ -159,11 +175,10 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  * css_get() or css_tryget()
  */
 
-extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
@@ -185,6 +200,9 @@ enum {
 	CGRP_CPUSET_CLONE_CHILDREN,
 	/* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
 	CGRP_SANE_BEHAVIOR,
+
+	/* The cgroup is root in a VE */
+	CGRP_VE_ROOT,
 };
 
 struct cgroup_name {
@@ -255,9 +273,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
@@ -265,6 +284,7 @@ struct cgroup {
 
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
+	u64 subgroups_limit;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -425,6 +445,7 @@ struct cgroup_map_cb {
 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
 #define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
 #define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+#define CFTYPE_VE_WRITABLE	(1U << 15)	/* allow write from CT */
 
 #define MAX_CFTYPE_NAME		64
 
@@ -570,6 +591,7 @@ int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
@@ -924,7 +946,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
@@ -943,8 +964,6 @@ static inline void cgroup_post_fork(struct task_struct *p,
 				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
 {
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -89,7 +89,7 @@ SUBSYS(hugetlb)
 
 /* */
 
-#ifdef CONFIG_CGROUP_BCACHE
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BCACHE)
 SUBSYS(bcache)
 #endif
 
@@ -104,6 +104,17 @@ SUBSYS_TAG(CANFORK_END)
 #endif
 /* */
 
+#if IS_SUBSYS_ENABLED(CONFIG_VE)
+SUBSYS(ve)
+#endif
+
+/* */
+
+#if IS_SUBSYS_ENABLED(CONFIG_BEANCOUNTERS)
+SUBSYS(ub)
+#endif
+
+/* */
 #ifdef __TMP_SUBSYS_TAG
 #undef __TMP_SUBSYS_TAG
 #undef SUBSYS_TAG
--- a/include/linux/cleancache.h
+++ b/include/linux/cleancache.h
@@ -5,6 +5,10 @@
 #include <linux/exportfs.h>
 #include <linux/mm.h>
 
+#define CLEANCACHE_NO_POOL		-1
+#define CLEANCACHE_NO_BACKEND		-2
+#define CLEANCACHE_NO_BACKEND_SHARED	-3
+
 #define CLEANCACHE_KEY_MAX 6
 
 /*
@@ -26,17 +30,16 @@ struct cleancache_ops {
 	int (*init_shared_fs)(char *uuid, size_t);
 	int (*get_page)(int, struct cleancache_filekey,
 			pgoff_t, struct page *);
-	void (*put_page)(int, struct cleancache_filekey,
+	int (*put_page)(int, struct cleancache_filekey,
 			pgoff_t, struct page *);
 	void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
 	void (*invalidate_inode)(int, struct cleancache_filekey);
 	void (*invalidate_fs)(int);
 };
 
-extern struct cleancache_ops *
-	cleancache_register_ops(struct cleancache_ops *ops);
+extern int cleancache_register_ops(struct cleancache_ops *ops);
 extern void __cleancache_init_fs(struct super_block *);
-extern void __cleancache_init_shared_fs(char *, struct super_block *);
+extern void __cleancache_init_shared_fs(struct super_block *);
 extern int  __cleancache_get_page(struct page *);
 extern void __cleancache_put_page(struct page *);
 extern void __cleancache_invalidate_page(struct address_space *, struct page *);
@@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb)
 		__cleancache_init_fs(sb);
 }
 
-static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+static inline void cleancache_init_shared_fs(struct super_block *sb)
 {
 	if (cleancache_enabled)
-		__cleancache_init_shared_fs(uuid, sb);
+		__cleancache_init_shared_fs(sb);
 }
 
 static inline int cleancache_get_page(struct page *page)
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -17,9 +17,11 @@
 #include <linux/fs.h>
 #include <linux/aio_abi.h>	/* for aio_context_t */
 
+#ifdef __KERNEL__
 #include <asm/compat.h>
 #include <asm/siginfo.h>
 #include <asm/signal.h>
+#endif
 
 #ifndef COMPAT_USE_64BIT_TIME
 #define COMPAT_USE_64BIT_TIME 0
@@ -141,6 +143,7 @@ struct compat_sigaction {
 	compat_sigset_t			sa_mask __packed;
 };
 
+#ifdef __KERNEL__
 /*
  * These functions operate strictly on struct compat_time*
  */
@@ -161,6 +164,7 @@ extern int compat_get_timespec(struct timespec *, const void __user *);
 extern int compat_put_timespec(const struct timespec *, void __user *);
 extern int compat_get_timeval(struct timeval *, const void __user *);
 extern int compat_put_timeval(const struct timeval *, void __user *);
+#endif
 
 struct compat_iovec {
 	compat_uptr_t	iov_base;
@@ -191,14 +195,18 @@ struct compat_rusage {
 	compat_long_t	ru_nivcsw;
 };
 
+#ifdef __KERNEL__
 extern int put_compat_rusage(const struct rusage *,
 			     struct compat_rusage __user *);
+#endif
 
 struct compat_siginfo;
 
+#ifdef __KERNEL__
 extern asmlinkage long compat_sys_waitid(int, compat_pid_t,
 		struct compat_siginfo __user *, int,
 		struct compat_rusage __user *);
+#endif
 
 struct compat_dirent {
 	u32		d_ino;
@@ -304,6 +312,7 @@ struct compat_kexec_segment;
 struct compat_mq_attr;
 struct compat_msgbuf;
 
+#ifdef __KERNEL__
 extern void compat_exit_robust_list(struct task_struct *curr);
 
 asmlinkage long
@@ -407,6 +416,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int compat_printk(const char *fmt, ...);
+extern int ve_compat_printk(int dst, const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
 extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
 
@@ -661,6 +671,7 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,
 		const struct compat_iovec __user *lvec,
 		unsigned long liovcnt, const struct compat_iovec __user *rvec,
 		unsigned long riovcnt, unsigned long flags);
+#endif
 
 asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
 				    compat_off_t __user *offset, compat_size_t count);
@@ -695,6 +706,72 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
 static inline bool in_compat_syscall(void) { return is_compat_task(); }
 #endif
 
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define QC_QUOTAON  0x0100	/* enable quotas */
+#define QC_QUOTAOFF 0x0200	/* disable quotas */
+/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now take other parameters */
+#define QC_SYNC     0x0600	/* sync disk copy of a filesystems quotas */
+#define QC_SETQLIM  0x0700	/* set limits */
+/* GETSTATS at 0x0800 is now longer... */
+#define QC_GETINFO  0x0900	/* get info about quotas - graces, flags... */
+#define QC_SETINFO  0x0A00	/* set info about quotas */
+#define QC_SETGRACE 0x0B00	/* set inode and block grace */
+#define QC_SETFLAGS 0x0C00	/* set flags for quota */
+#define QC_GETQUOTA 0x0D00	/* get limits and usage */
+#define QC_SETQUOTA 0x0E00	/* set limits and usage */
+#define QC_SETUSE   0x0F00	/* set usage */
+/* 0x1000 used by old RSQUASH */
+#define QC_GETSTATS 0x1100	/* get collected stats */
+
+struct compat_dqblk {
+	unsigned int dqb_ihardlimit;
+	unsigned int dqb_isoftlimit;
+	unsigned int dqb_curinodes;
+	unsigned int dqb_bhardlimit;
+	unsigned int dqb_bsoftlimit;
+	qsize_t dqb_curspace;
+	__kernel_time_t dqb_btime;
+	__kernel_time_t dqb_itime;
+};
+
+#ifdef CONFIG_COMPAT
+
+struct compat_compat_dqblk {
+	compat_uint_t	dqb_ihardlimit;
+	compat_uint_t	dqb_isoftlimit;
+	compat_uint_t	dqb_curinodes;
+	compat_uint_t	dqb_bhardlimit;
+	compat_uint_t	dqb_bsoftlimit;
+	compat_u64	dqb_curspace;
+	compat_time_t	dqb_btime;
+	compat_time_t	dqb_itime;
+};
+
+#endif
+
+struct compat_dqinfo {
+	unsigned int dqi_bgrace;
+	unsigned int dqi_igrace;
+	unsigned int dqi_flags;
+	unsigned int dqi_blocks;
+	unsigned int dqi_free_blk;
+	unsigned int dqi_free_entry;
+};
+
+struct compat_dqstats {
+	__u32 lookups;
+	__u32 drops;
+	__u32 reads;
+	__u32 writes;
+	__u32 cache_hits;
+	__u32 allocated_dquots;
+	__u32 free_dquots;
+	__u32 syncs;
+	__u32 version;
+};
+#endif /* CONFIG_QUOTA_COMPAT */
+
 #else
 
 #define is_compat_task() (0)
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -231,12 +231,33 @@
 #endif
 #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
 
+#if GCC_VERSION >= 70000
+#define KASAN_ABI_VERSION 5
+#elif GCC_VERSION >= 50000
+#define KASAN_ABI_VERSION 4
+#elif GCC_VERSION >= 40902
+#define KASAN_ABI_VERSION 3
+#endif
+
+#if GCC_VERSION >= 40902
+/*
+ * Tell the compiler that address safety instrumentation (KASAN)
+ * should not be applied to that function.
+ * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ */
+#define __no_sanitize_address __attribute__((no_sanitize_address))
+#endif
+
 #endif	/* gcc version >= 40000 specific checks */
 
 #if !defined(__noclone)
 #define __noclone	/* not needed */
 #endif
 
+#if !defined(__no_sanitize_address)
+#define __no_sanitize_address
+#endif
+
 /*
  * A trick to suppress uninitialized variable warning without generating any
  * code
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -185,20 +185,46 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 
 #include <uapi/linux/types.h>
 
-static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+#define __READ_ONCE_SIZE						\
+({									\
+	switch (size) {							\
+	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;		\
+	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;		\
+	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;		\
+	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;		\
+	default:							\
+		barrier();						\
+		__builtin_memcpy((void *)res, (const void *)p, size);	\
+		barrier();						\
+	}								\
+})
+
+static __always_inline
+void __read_once_size(const volatile void *p, void *res, int size)
 {
-	switch (size) {
-	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
-	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
-	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
-	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
-	default:
-		barrier();
-		__builtin_memcpy((void *)res, (const void *)p, size);
-		barrier();
-	}
+	__READ_ONCE_SIZE;
 }
 
+#ifdef CONFIG_KASAN
+/*
+ * This function is not 'inline' because __no_sanitize_address conflicts
+ * with inlining. Attempt to inline it may cause a build failure.
+ * 	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
+ */
+static __no_sanitize_address __maybe_unused
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#else
+static __always_inline
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#endif
+
 static __always_inline void __write_once_size(volatile void *p, void *res, int size)
 {
 	switch (size) {
@@ -235,8 +261,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * required ordering.
  */
 
-#define READ_ONCE(x) \
-	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+#define __READ_ONCE(x, check)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	if (check)							\
+		__read_once_size(&(x), __u.__c, sizeof(x));		\
+	else								\
+		__read_once_size_nocheck(&(x), __u.__c, sizeof(x));	\
+	__u.__val;							\
+})
+#define READ_ONCE(x) __READ_ONCE(x, 1)
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
+ * to hide memory access from KASAN.
+ */
+#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
 
 #define WRITE_ONCE(x, val) \
 	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
@@ -426,6 +466,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  */
 #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
 
+/**
+ * lockless_dereference() - safely load a pointer for later dereference
+ * @p: The pointer to load
+ *
+ * Similar to rcu_dereference(), but for situations where the pointed-to
+ * object's lifetime is managed by something other than RCU.  That
+ * "something other" might be reference counting or simple immortality.
+ */
+#define lockless_dereference(p) \
+({ \
+	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+	(_________p1); \
+})
+
 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
 # define __kprobes	__attribute__((__section__(".kprobes.text")))
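/*
 * Illustrative sketch, not part of this patch: READ_ONCE_NOCHECK() compiles
 * its load without KASAN instrumentation, while lockless_dereference() pairs
 * a one-time load with smp_read_barrier_depends().  The data structures here
 * are made up for illustration.
 */
struct example_node {
	int value;
	struct example_node *next;
};

static int example_peek(struct example_node **head, unsigned long *other_stack_slot)
{
	struct example_node *n;
	unsigned long raw;

	/* Read a word KASAN may consider poisoned, e.g. another task's stack. */
	raw = READ_ONCE_NOCHECK(*other_stack_slot);

	/* Dependency-ordered load of a pointer not protected by RCU. */
	n = lockless_dereference(*head);

	return n ? n->value : (int)raw;
}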
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -63,11 +63,29 @@ struct cn_dev {
 
 	u32 seq, groups;
 	struct sock *nls;
-	void (*input) (struct sk_buff *skb);
 
 	struct cn_queue_dev *cbdev;
 };
 
+struct cn_private {
+	struct cn_dev   cdev;
+	int             cn_already_initialized;
+
+	atomic_t        proc_event_num_listeners;
+	u32 __percpu    *proc_event_counts;
+
+};
+
+int cn_proc_init_ve(struct ve_struct *ve);
+void cn_proc_fini_ve(struct ve_struct *ve);
+
+int cn_add_callback_ve(struct ve_struct *ve,
+		       struct cb_id *id, const char *name,
+		       void (*callback)(struct cn_msg *,
+					struct netlink_skb_parms *));
+void cn_del_callback_ve(struct ve_struct *ve, struct cb_id *id);
+int cn_netlink_send_ve(struct ve_struct *ve, struct cn_msg *, u32, gfp_t);
+
 int cn_add_callback(struct cb_id *id, const char *name,
 		    void (*callback)(struct cn_msg *, struct netlink_skb_parms *));
 void cn_del_callback(struct cb_id *);
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -169,9 +169,6 @@ extern int braille_register_console(struct console *, int index,
 extern int braille_unregister_console(struct console *);
 #ifdef CONFIG_TTY
 extern void console_sysfs_notify(void);
-#else
-static inline void console_sysfs_notify(void)
-{ }
 #endif
 extern bool console_suspend_enabled;
 
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -362,6 +362,7 @@ struct ablkcipher_tfm {
 
 	unsigned int ivsize;
 	unsigned int reqsize;
+	bool has_setkey;
 };
 
 struct aead_tfm {
@@ -672,6 +673,13 @@ static inline int crypto_ablkcipher_setkey(struct crypto_ablkcipher *tfm,
 	return crt->setkey(crt->base, key, keylen);
 }
 
+static inline bool crypto_ablkcipher_has_setkey(struct crypto_ablkcipher *tfm)
+{
+	struct ablkcipher_tfm *crt = crypto_ablkcipher_crt(tfm);
+
+	return crt->has_setkey;
+}
+
 static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm(
 	struct ablkcipher_request *req)
 {
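/*
 * Illustrative sketch, not part of this patch: using the new has_setkey flag
 * to refuse ciphers that still require a key.  The algorithm name and error
 * handling are assumptions for illustration only.
 */
#include <linux/crypto.h>
#include <linux/err.h>

static struct crypto_ablkcipher *example_alloc_keyed_cipher(const u8 *key,
							     unsigned int keylen)
{
	struct crypto_ablkcipher *tfm;

	tfm = crypto_alloc_ablkcipher("cbc(aes)", 0, 0);
	if (IS_ERR(tfm))
		return tfm;

	if (!crypto_ablkcipher_has_setkey(tfm) ||
	    crypto_ablkcipher_setkey(tfm, key, keylen)) {
		crypto_free_ablkcipher(tfm);
		return ERR_PTR(-EINVAL);
	}
	return tfm;
}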
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,11 +55,11 @@ struct qstr {
 #define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
 
 struct dentry_stat_t {
-	int nr_dentry;
-	int nr_unused;
-	int age_limit;          /* age in seconds */
-	int want_pages;         /* pages requested by system */
-	int dummy[2];
+	long nr_dentry;
+	long nr_unused;
+	long age_limit;          /* age in seconds */
+	long want_pages;         /* pages requested by system */
+	long dummy[2];
 };
 extern struct dentry_stat_t dentry_stat;
 
@@ -230,6 +230,8 @@ struct dentry_operations_wrapper {
 #define DCACHE_FILE_TYPE		0x04000000 /* Other file type */
 #define DCACHE_OP_REAL			0x08000000
 
+#define DCACHE_MAY_FREE			0x00800000
+
 extern seqlock_t rename_lock;
 
 static inline int dname_external(struct dentry *dentry)
@@ -510,7 +512,7 @@ static inline bool d_really_is_positive(const struct dentry *dentry)
 }
 
 extern int sysctl_vfs_cache_pressure;
-
+extern int sysctl_vfs_cache_min_ratio;
 
 /**
  * d_inode - Get the actual inode of this dentry
@@ -575,4 +577,8 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+static inline unsigned long vfs_pressure_ratio(unsigned long val)
+{
+	return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+}
 #endif	/* __LINUX_DCACHE_H */
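/*
 * Illustrative sketch, not part of this patch: vfs_pressure_ratio() scales an
 * object count by sysctl_vfs_cache_pressure/100 via mult_frac(), so a
 * shrinker's count callback can report a pressure-adjusted total.  The count
 * source below is made up.
 */
static unsigned long example_dcache_shrink_count(unsigned long nr_unused_dentries)
{
	/* With vfs_cache_pressure = 150, 1000 unused dentries report 1500. */
	return vfs_pressure_ratio(nr_unused_dentries);
}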
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -102,6 +102,25 @@ static inline int delayacct_add_tsk(struct taskstats *d,
 	return __delayacct_add_tsk(d, tsk);
 }
 
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{
+	if (!delayacct_on)
+		return;
+
+	d->cpu_count			+= s->cpu_count;
+	d->cpu_delay_total		+= s->cpu_delay_total;
+	d->cpu_run_real_total		+= s->cpu_run_real_total;
+	d->cpu_run_virtual_total	+= s->cpu_run_virtual_total;
+	d->cpu_scaled_run_real_total	+= s->cpu_scaled_run_real_total;
+	d->blkio_count			+= s->blkio_count;
+	d->blkio_delay_total		+= s->blkio_delay_total;
+	d->swapin_count			+= s->swapin_count;
+	d->swapin_delay_total		+= s->swapin_delay_total;
+	d->freepages_count		+= s->freepages_count;
+	d->freepages_delay_total	+= s->freepages_delay_total;
+}
+
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	if (tsk->delays)
@@ -139,6 +158,9 @@ static inline void delayacct_blkio_end(void)
 static inline int delayacct_add_tsk(struct taskstats *d,
 					struct task_struct *tsk)
 { return 0; }
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{}
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 { return 0; }
 static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -121,6 +121,14 @@ typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
 typedef void (*dm_io_hints_fn) (struct dm_target *ti,
 				struct queue_limits *limits);
 
+typedef void (*dm_ploop_modify_fn) (struct dm_target *ti, int action);
+
+/* "action" arg of dm_ploop_modify_fn */
+enum {
+	DM_PLOOP_ATTACH,
+	DM_PLOOP_DETACH,
+};
+
 /*
  * Returns:
  *    0: The target can handle the next I/O immediately.
@@ -184,6 +192,7 @@ struct target_type {
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
 	dm_direct_access_fn direct_access;
+	dm_ploop_modify_fn ploop_modify;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -11,9 +11,23 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 		return 0;
 	return __devcgroup_inode_permission(inode, mask);
 }
+
+extern int devcgroup_device_permission(umode_t mode, dev_t dev, int mask);
+extern int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors);
+
+struct ve_struct;
+int devcgroup_set_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned);
+int devcgroup_seq_show_ve(struct ve_struct *, struct seq_file *);
+
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
 static inline int devcgroup_inode_mknod(int mode, dev_t dev)
 { return 0; }
+static inline int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{ return 0; }
+static inline int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors)
+{ return 0; }
 #endif
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -115,7 +115,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 	return ret;
 }
 
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
@@ -148,7 +148,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 }
 
 static inline
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order)
 {
 	return NULL;
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,6 +22,10 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
+#endif
+
 /* Used to initialize the epoll bits inside the "struct file" */
 static inline void eventpoll_init_file(struct file *file)
 {
--- /dev/null
+++ b/include/linux/fence-watchdog.h
@@ -0,0 +1,14 @@
+/*
+ *  include/linux/fence-watchdog.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_FENCE_WATCHDOG_H_
+#define _LINUX_FENCE_WATCHDOG_H_
+
+int fence_wdog_check_timer(void);
+bool fence_wdog_tmo_match(void);
+
+#endif
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -172,14 +172,6 @@ static inline void freezable_schedule(void)
 	freezer_count();
 }
 
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezable_schedule_unsafe(void)
-{
-	freezer_do_not_count();
-	schedule();
-	freezer_count_unsafe();
-}
-
 /*
  * Like freezable_schedule_timeout(), but should not block the freezer.  Do not
  * call this with locks held.
@@ -216,16 +208,6 @@ static inline long freezable_schedule_timeout_killable(long timeout)
 	return __retval;
 }
 
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout_killable(timeout);
-	freezer_count_unsafe();
-	return __retval;
-}
-
 /*
  * Like schedule_hrtimeout_range(), but should not block the freezer.  Do not
  * call this with locks held.
@@ -315,8 +297,6 @@ static inline void set_freezable(void) {}
 
 #define freezable_schedule()  schedule()
 
-#define freezable_schedule_unsafe()  schedule()
-
 #define freezable_schedule_timeout(timeout)  schedule_timeout(timeout)
 
 #define freezable_schedule_timeout_interruptible(timeout)		\
@@ -325,9 +305,6 @@ static inline void set_freezable(void) {}
 #define freezable_schedule_timeout_killable(timeout)			\
 	schedule_timeout_killable(timeout)
 
-#define freezable_schedule_timeout_killable_unsafe(timeout)		\
-	schedule_timeout_killable(timeout)
-
 #define freezable_schedule_hrtimeout_range(expires, delta, mode)	\
 	schedule_hrtimeout_range(expires, delta, mode)
 
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/list.h>
+#include <linux/list_lru.h>
 #include <linux/radix-tree.h>
 #include <linux/rbtree.h>
 #include <linux/init.h>
@@ -77,6 +78,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define MAY_CHDIR		0x00000040
 /* called from RCU mode, don't block */
 #define MAY_NOT_BLOCK		0x00000080
+/* for devgroup-vs-openvz only */
+#define MAY_QUOTACTL		0x00010000	/* deprecated */
+#define MAY_MOUNT		0x00020000
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
@@ -127,6 +131,12 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File needs atomic accesses to f_pos */
 #define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
 
+/* Can do sys_quotactl (for devperms) */
+#define FMODE_QUOTACTL		((__force fmode_t)0x8000)
+
+/* File is a block device opened by mount(2)  */
+#define FMODE_MOUNT		((__force fmode_t)0x10000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
@@ -196,6 +206,8 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 #define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 
+extern int may_use_odirect(void);
+
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
@@ -309,37 +321,140 @@ struct address_space;
 struct writeback_control;
 
 struct iov_iter {
-	const struct iovec *iov;
+	struct iov_iter_ops *ops;
+	unsigned long data;
 	unsigned long nr_segs;
 	size_t iov_offset;
 	size_t count;
 };
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i);
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-			struct iov_iter *i);
+struct iov_iter_ops {
+	size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
+					 unsigned long, size_t);
+	size_t (*ii_copy_to_user)(struct page *, struct iov_iter *,
+				  unsigned long, size_t);
+	size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *,
+					   unsigned long, size_t);
+	size_t (*ii_copy_from_user)(struct page *, struct iov_iter *,
+					  unsigned long, size_t);
+	void (*ii_advance)(struct iov_iter *, size_t);
+	int (*ii_fault_in_readable)(struct iov_iter *, size_t);
+	size_t (*ii_single_seg_count)(const struct iov_iter *);
+	int (*ii_shorten)(struct iov_iter *, size_t);
+};
+
+static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user(page, i, offset, bytes);
+}
+static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_advance(i, bytes);
+}
+static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_fault_in_readable(i, bytes);
+}
+static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+	return i->ops->ii_single_seg_count(i);
+}
+static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
+{
+	return i->ops->ii_shorten(i, count);
+}
+
+extern struct iov_iter_ops ii_bvec_ops;
+
+struct bio_vec;
+static inline void iov_iter_init_bvec(struct iov_iter *i,
+				      struct bio_vec *bvec,
+				      unsigned long nr_segs,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_bvec_ops;
+	i->data = (unsigned long)bvec;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_bvec(struct iov_iter *i)
+{
+	return i->ops == &ii_bvec_ops;
+}
+static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_bvec(i));
+	return (struct bio_vec *)i->data;
+}
+
+extern struct iov_iter_ops ii_page_ops;
+
+static inline void iov_iter_init_page(struct iov_iter *i,
+				      struct page *page,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_page_ops;
+	i->data = (unsigned long)page;
+	i->nr_segs = 1;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_page(struct iov_iter *i)
+{
+	return i->ops == &ii_page_ops;
+}
+static inline struct page *iov_iter_page(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_page(i));
+	return (struct page *)i->data;
+}
+
+extern struct iov_iter_ops ii_iovec_ops;
 
 static inline void iov_iter_init(struct iov_iter *i,
 			const struct iovec *iov, unsigned long nr_segs,
 			size_t count, size_t written)
 {
-	i->iov = iov;
+	i->ops = &ii_iovec_ops;
+	i->data = (unsigned long)iov;
 	i->nr_segs = nr_segs;
 	i->iov_offset = 0;
 	i->count = count + written;
 
 	iov_iter_advance(i, written);
 }
+static inline int iov_iter_has_iovec(const struct iov_iter *i)
+{
+	return i->ops == &ii_iovec_ops;
+}
+static inline struct iovec *iov_iter_iovec(const struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_iovec(i));
+	return (struct iovec *)i->data;
+}
 
-static inline size_t iov_iter_count(struct iov_iter *i)
+static inline size_t iov_iter_count(const struct iov_iter *i)
 {
 	return i->count;
 }
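/*
 * Illustrative sketch, not part of this patch: with the ops-based iov_iter,
 * i->data may point at an iovec array, a bio_vec array or a single page, and
 * every iov_iter_* accessor dispatches through i->ops.  The bvec setup and
 * copy size below are made up for illustration.
 */
static size_t example_copy_page_to_bvec_iter(struct page *src,
					     struct bio_vec *bvec,
					     unsigned long nr_segs,
					     size_t len)
{
	struct iov_iter iter;
	size_t copied;

	iov_iter_init_bvec(&iter, bvec, nr_segs, len, 0);

	/* Dispatches to ii_bvec_ops.ii_copy_to_user(); for a bvec-backed
	 * iterator the destination is the kernel pages in the array. */
	copied = iov_iter_copy_to_user(src, &iter, 0,
				       min_t(size_t, len, PAGE_SIZE));
	iov_iter_advance(&iter, copied);

	return copied;
}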
@@ -399,6 +514,12 @@ struct address_space_operations {
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+	ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec,
+			loff_t offset, unsigned long bvec_len);
+	ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page,
+			loff_t offset);
+	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
+						void **, unsigned long *);
 	RH_KABI_DEPRECATE_FN(int, get_xip_mem, struct address_space *, pgoff_t,
 			int, void **, unsigned long *)
 	/*
@@ -444,7 +565,6 @@ struct address_space {
 	RH_KABI_REPLACE(unsigned int i_mmap_writable,
 			 atomic_t i_mmap_writable) /* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
-	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
@@ -458,6 +578,9 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	void			*private_data;	/* ditto */
+	struct list_head	i_peer_list;
+	struct file		*i_peer_file;
+	struct user_beancounter *dirtied_ub;
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -518,8 +641,7 @@ int mapping_tagged(struct address_space *mapping, int tag);
  */
 static inline int mapping_mapped(struct address_space *mapping)
 {
-	return	!RB_EMPTY_ROOT(&mapping->i_mmap) ||
-		!list_empty(&mapping->i_mmap_nonlinear);
+	return	!RB_EMPTY_ROOT(&mapping->i_mmap);
 }
 
 /*
@@ -633,6 +755,7 @@ struct inode {
 	struct mutex		i_mutex;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
+	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
@@ -867,6 +990,7 @@ struct file {
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
+	struct path		f_original_path;
 #define f_dentry	f_path.dentry
 	struct inode		*f_inode;	/* cached value */
 	const struct file_operations	*f_op;
@@ -892,6 +1016,7 @@ struct file {
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
+	struct user_beancounter	*f_ub;
 
 	u64			f_version;
 #ifdef CONFIG_SECURITY
@@ -1080,6 +1205,10 @@ struct file_lock {
 	fl_owner_t fl_owner;
 	unsigned int fl_flags;
 	unsigned char fl_type;
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned char fl_charged;
+	struct user_beancounter *fl_ub;
+#endif
 	unsigned int fl_pid;
 	int fl_link_cpu;		/* what cpu's list is this on? */
 	struct pid *fl_nspid;
@@ -1124,6 +1253,9 @@ get_lm_ops_extend(struct file_lock *fl)
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
+extern void generic_set_file_flags_unlocked(struct file*, unsigned int arg);
+extern int generic_set_file_flags(struct file*, unsigned int arg);
+
 /*
  * Return the inode to use for locking
  *
@@ -1153,7 +1285,7 @@ extern int fcntl_getlease(struct file *filp);
 /* fs/locks.c */
 void locks_free_lock(struct file_lock *fl);
 extern void locks_init_lock(struct file_lock *);
-extern struct file_lock * locks_alloc_lock(void);
+extern struct file_lock * locks_alloc_lock(int charge);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void locks_copy_conflock(struct file_lock *, struct file_lock *);
 extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
@@ -1394,6 +1526,9 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
+/* sb->s_iflags */
+#define SB_I_UMOUNT_SYNC		0x10000000 /* don't use delayed unmount */
+
 extern struct list_head super_blocks;
 extern spinlock_t sb_lock;
 
@@ -1434,6 +1569,7 @@ struct super_block {
 	const struct quotactl_ops	*s_qcop;
 	const struct export_operations *s_export_op;
 	unsigned long		s_flags;
+	unsigned long           s_iflags;       /* internal SB_I_* flags */
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
@@ -1460,15 +1596,6 @@ struct super_block {
 #endif
 #endif
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
-	/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
-	struct list_head	s_dentry_lru;	/* unused dentry lru */
-	int			s_nr_dentry_unused;	/* # of dentry on lru */
-
-	/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
-	spinlock_t		s_inode_lru_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inode_lru;		/* unused inode lru */
-	int			s_nr_inodes_unused;	/* # of inodes on lru */
-
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
@@ -1524,6 +1651,13 @@ struct super_block {
 	RH_KABI_EXTEND(struct workqueue_struct *s_dio_done_wq)
 	RH_KABI_EXTEND(struct rcu_head rcu)
 	RH_KABI_EXTEND(struct hlist_head s_pins)
+
+	/*
+	 * Keep the lru lists last in the structure so they always sit on their
+	 * own individual cachelines.
+	 */
+	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
+	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 };
 
 extern const unsigned super_block_wrapper_version;
@@ -1556,10 +1690,6 @@ static inline int *get_s_stack_depth(struct super_block *sb)
 	return wrapper ? &wrapper->s_stack_depth : NULL;
 }
 
-/* superblock cache pruning functions */
-extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
-extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
-
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*
@@ -1691,6 +1821,8 @@ extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_whiteout(struct inode *, struct dentry *);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+			   const char *, unsigned int, struct path *);
 
 /*
  * VFS dentry helper functions.
@@ -1753,7 +1885,9 @@ struct file_operations {
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
@@ -1768,10 +1902,11 @@ struct file_operations {
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-	int (*check_flags)(int);
+	int (*set_flags)(struct file *, int);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*fadvise)(struct file* file, loff_t offset, loff_t len, int advice);
 	RH_KABI_REPLACE(int (*setlease)(struct file *, long, struct file_lock **), int (*setlease)(struct file *, long, struct file_lock **, void **))
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
@@ -1884,8 +2019,10 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
-	int (*nr_cached_objects)(struct super_block *);
-	void (*free_cached_objects)(struct super_block *, int);
+	long (*nr_cached_objects)(struct super_block *,
+				  struct shrink_control *);
+	long (*free_cached_objects)(struct super_block *,
+				    struct shrink_control *);
 };
 
 /*
@@ -2021,8 +2158,12 @@ struct super_operations {
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
+#define I_DIRTY_TIME		(1 << 11)
+#define __I_DIRTY_TIME_EXPIRED	12
+#define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
 
 extern void __mark_inode_dirty(struct inode *, int);
 static inline void mark_inode_dirty(struct inode *inode)
@@ -2078,7 +2219,8 @@ extern void touch_atime(struct path *);
 static inline void file_accessed(struct file *file)
 {
 	if (!(file->f_flags & O_NOATIME))
-		touch_atime(&file->f_path);
+		touch_atime(file->f_original_path.mnt ?
+			    &file->f_original_path : &file->f_path);
 }
 
 int sync_inode(struct inode *inode, struct writeback_control *wbc);
@@ -2092,6 +2234,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_VIRTUALIZED		64	/* Can mount this fstype inside ve */
 #define FS_HAS_RM_XQUOTA	256	/* KABI: fs has the rm_xquota quota op */
 #define FS_HAS_INVALIDATE_RANGE	512	/* FS has new ->invalidatepage with length arg */
 #define FS_HAS_DIO_IODONE2	1024	/* KABI: fs supports new iodone */
@@ -2100,6 +2243,13 @@ struct file_system_type {
 dentry_operations_wrapper */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 #define FS_HAS_FO_EXTEND	65536 	/* fs is using the file_operations_extend struture */
+/*
+ * f_op->mmap must be called with vma=NULL before taking mmap_sem;
+ * workaround for wrong i_mutex vs mmap_sem lock ordering in pfcache
+ * (PSBM-23133) - vdavydov@
+ */
+#define FS_HAS_MMAP_PREP	(1<<18)
+
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
@@ -2224,6 +2374,7 @@ void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
+void put_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
@@ -2275,8 +2426,11 @@ extern bool our_mnt(struct vfsmount *mnt);
 
 extern int current_umask(void);
 
+extern int ve_devmnt_process(struct ve_struct *, dev_t, void **, int);
+
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
+extern int generic_update_time(struct inode *, struct timespec *, int);
 
 /* /sys/fs */
 extern struct kobject *fs_kobj;
@@ -2502,6 +2656,7 @@ extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
+extern void bd_write_size(struct block_device *, loff_t size);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
@@ -2797,6 +2952,12 @@ extern int is_subdir(struct dentry *, struct dentry *);
 extern int path_is_under(struct path *, struct path *);
 extern ino_t find_inode_number(struct dentry *, struct qstr *);
 
+int ve_fsync_behavior(void);
+
+#define FSYNC_NEVER	0	/* ve syncs are ignored    */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
@@ -2825,6 +2986,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
 
 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
 extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern struct inode *find_inode_nowait(struct super_block *,
+				       unsigned long,
+				       int (*match)(struct inode *,
+						    unsigned long, void *),
+				       void *data);
 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
 extern int insert_inode_locked(struct inode *);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -2875,18 +3041,23 @@ extern int sb_min_blocksize(struct super_block *, int);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
-extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
-		unsigned long size, pgoff_t pgoff);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
 		loff_t *);
+extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
+extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, loff_t *, size_t, ssize_t);
+extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -701,16 +701,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
  */
 #define __notrace_funcgraph		notrace
 
-/*
- * We want to which function is an entrypoint of a hardirq.
- * That will help us to put a signal on output.
- */
-#define __irq_entry		 __attribute__((__section__(".irqentry.text")))
-
-/* Limits of hardirq entrypoints */
-extern char __irqentry_text_start[];
-extern char __irqentry_text_end[];
-
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
@@ -746,7 +736,6 @@ static inline void unpause_graph_tracing(void)
 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
 
 #define __notrace_funcgraph
-#define __irq_entry
 #define INIT_FTRACE_GRAPH
 
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -22,6 +22,7 @@
 #define part_to_dev(part)	(&((part)->__dev))
 
 extern struct device_type part_type;
+extern struct device_type disk_type;
 extern struct kobject *block_depr;
 extern struct class block_class;
 
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,7 +36,7 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_KMEMCG		0x100000u
+#define ___GFP_ACCOUNT		0x100000u
 #define ___GFP_NOTRACK		0x200000u
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
@@ -92,11 +92,11 @@ struct vm_area_struct;
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
+#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)	/* Account to kmemcg */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_KMEMCG	((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
 /*
@@ -105,7 +105,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 26	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -115,6 +115,7 @@ struct vm_area_struct;
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
 #define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
 			 __GFP_RECLAIMABLE)
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
@@ -393,9 +394,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc,
 			       unsigned int fragsz, gfp_t gfp_mask);
 extern void __free_page_frag(void *addr);
 
-extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
-extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
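
The __GFP_KMEMCG scheme is replaced by opt-in accounting: a caller that wants its
allocation charged to the current task's memory cgroup passes __GFP_ACCOUNT (or the
GFP_KERNEL_ACCOUNT shorthand), and the page allocator checks the flag in
memcg_kmem_newpage_charge() (see the memcontrol.h hunk below).  A minimal sketch of
the intended usage; my_ctx and the helpers are illustrative, and whether slab
allocations honor the flag depends on the rest of the series:

	struct my_ctx {
		unsigned long data[32];
	};

	static struct my_ctx *my_ctx_alloc(void)
	{
		/* Meant to be charged against the caller's memcg kmem counter. */
		return kmalloc(sizeof(struct my_ctx), GFP_KERNEL_ACCOUNT);
	}

	static unsigned long my_page_alloc(void)
	{
		/* Page-allocator path, gated by the __GFP_ACCOUNT check. */
		return __get_free_page(GFP_KERNEL | __GFP_ACCOUNT);
	}
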
 
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -7,6 +7,7 @@
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
+#include <bc/task.h>
 
 #if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
 extern void synchronize_irq(unsigned int irq);
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -145,6 +145,12 @@ extern struct task_group root_task_group;
 # define INIT_CGROUP_SCHED(tsk)
 #endif
 
+#ifdef CONFIG_VE
+#define	INIT_TASK_VE(tsk) .task_ve = &ve0,
+#else
+#define	INIT_TASK_VE(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)						\
 	.perf_event_mutex = 						\
@@ -173,6 +179,13 @@ extern struct task_group root_task_group;
 # define INIT_RT_MUTEXES(tsk)
 #endif
 
+#ifdef CONFIG_KASAN
+# define INIT_KASAN(tsk)						\
+	.kasan_depth = 1,
+#else
+# define INIT_KASAN(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -201,6 +214,7 @@ extern struct task_group root_task_group;
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
 	INIT_CGROUP_SCHED(tsk)						\
+	INIT_TASK_VE(tsk)						\
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
 	.real_parent	= &tsk,						\
@@ -244,6 +258,7 @@ extern struct task_group root_task_group;
 	INIT_RT_MUTEXES(tsk)					\
 	INIT_PREV_CPUTIME(tsk)						\
 	INIT_VTIME(tsk)							\
+	INIT_KASAN(tsk)							\
 }
 
 
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -703,4 +703,24 @@ extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+/*
+ * We want to know which function is an entrypoint of a hardirq or a softirq.
+ */
+#define __irq_entry		 __attribute__((__section__(".irqentry.text")))
+#define __softirq_entry  \
+	__attribute__((__section__(".softirqentry.text")))
+
+/* Limits of hardirq entrypoints */
+extern char __irqentry_text_start[];
+extern char __irqentry_text_end[];
+/* Limits of softirq entrypoints */
+extern char __softirqentry_text_start[];
+extern char __softirqentry_text_end[];
+
+#else
+#define __irq_entry
+#define __softirq_entry
+#endif
+
 #endif
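
__irq_entry and __softirq_entry only move the annotated function into a dedicated
text section (.irqentry.text / .softirqentry.text); the function-graph tracer and
KASAN then recognize interrupt entry points by comparing an address against the
__irqentry_text_start/_end and __softirqentry_text_start/_end bounds.  A hedged
sketch of how an entry function would be marked (the function names are illustrative):

	/* Hardirq entry point: placed in .irqentry.text. */
	unsigned int __irq_entry do_my_irq(struct pt_regs *regs)
	{
		/* ... dispatch to the registered handler ... */
		return 1;
	}

	/* Softirq entry point: placed in .softirqentry.text. */
	static void __softirq_entry my_softirq_action(struct softirq_action *a)
	{
		/* ... run the queued work ... */
	}
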
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -115,6 +115,9 @@ struct io_context {
 	struct hlist_head	icq_list;
 
 	struct work_struct release_work;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ioc_ub;
+#endif
 };
 
 /**
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -39,6 +39,7 @@ enum {
 	IOPRIO_WHO_PROCESS = 1,
 	IOPRIO_WHO_PGRP,
 	IOPRIO_WHO_USER,
+	IOPRIO_WHO_UBC = 1000,
 };
 
 /*
--- /dev/null
+++ b/include/linux/kasan-checks.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_KASAN_CHECKS_H
+#define _LINUX_KASAN_CHECKS_H
+
+#ifdef CONFIG_KASAN
+void kasan_check_read(const void *p, unsigned int size);
+void kasan_check_write(const void *p, unsigned int size);
+#else
+static inline void kasan_check_read(const void *p, unsigned int size) { }
+static inline void kasan_check_write(const void *p, unsigned int size) { }
+#endif
+
+#endif
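
kasan_check_read()/kasan_check_write() let code that is not compiler-instrumented
(or that wants an explicit check before a bulk access) ask KASAN to validate a
memory range and report if it is poisoned or out of bounds.  A minimal sketch;
copy_chunk() is a made-up helper:

	#include <linux/kasan-checks.h>
	#include <linux/string.h>

	static void copy_chunk(void *dst, const void *src, unsigned int len)
	{
		/* Explicitly validate both ranges before touching them. */
		kasan_check_read(src, len);
		kasan_check_write(dst, len);
		memcpy(dst, src, len);
	}
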
--- /dev/null
+++ b/include/linux/kasan.h
@@ -0,0 +1,120 @@
+#ifndef _LINUX_KASAN_H
+#define _LINUX_KASAN_H
+
+#include <linux/types.h>
+
+struct kmem_cache;
+struct page;
+struct vm_struct;
+
+#ifdef CONFIG_KASAN
+
+#define KASAN_SHADOW_SCALE_SHIFT 3
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
+#include <asm/kasan.h>
+#include <linux/sched.h>
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
+
+/* Enable reporting bugs after kasan_disable_current() */
+static inline void kasan_enable_current(void)
+{
+	current->kasan_depth++;
+}
+
+/* Disable reporting bugs for current task */
+static inline void kasan_disable_current(void)
+{
+	current->kasan_depth--;
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size);
+
+void kasan_alloc_pages(struct page *page, unsigned int order);
+void kasan_free_pages(struct page *page, unsigned int order);
+
+void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+			unsigned long *flags);
+void kasan_cache_shrink(struct kmem_cache *cache);
+void kasan_cache_shutdown(struct kmem_cache *cache);
+
+void kasan_poison_slab(struct page *page);
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
+void kasan_poison_object_data(struct kmem_cache *cache, void *object);
+void kasan_init_slab_obj(struct kmem_cache *cache, const void *object);
+
+void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
+void kasan_kfree_large(const void *ptr);
+void kasan_poison_kfree(void *ptr);
+void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
+		  gfp_t flags);
+void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
+
+void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
+bool kasan_slab_free(struct kmem_cache *s, void *object);
+
+struct kasan_cache {
+	int alloc_meta_offset;
+	int free_meta_offset;
+};
+
+int kasan_module_alloc(void *addr, size_t size);
+void kasan_free_shadow(const struct vm_struct *vm);
+
+size_t ksize(const void *);
+static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t kasan_metadata_size(struct kmem_cache *cache);
+
+#else /* CONFIG_KASAN */
+
+static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
+
+static inline void kasan_enable_current(void) {}
+static inline void kasan_disable_current(void) {}
+
+static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
+static inline void kasan_free_pages(struct page *page, unsigned int order) {}
+
+static inline void kasan_cache_create(struct kmem_cache *cache,
+				      size_t *size,
+				      unsigned long *flags) {}
+static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
+static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
+
+static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+					void *object) {}
+static inline void kasan_poison_object_data(struct kmem_cache *cache,
+					void *object) {}
+static inline void kasan_init_slab_obj(struct kmem_cache *cache,
+				const void *object) {}
+
+static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {}
+static inline void kasan_kfree_large(const void *ptr) {}
+static inline void kasan_poison_kfree(void *ptr) {}
+static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
+				size_t size, gfp_t flags) {}
+static inline void kasan_krealloc(const void *object, size_t new_size,
+				 gfp_t flags) {}
+
+static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
+				   gfp_t flags) {}
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+
+static inline void kasan_unpoison_slab(const void *ptr) { }
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
+
+#endif /* CONFIG_KASAN */
+
+#endif /* LINUX_KASAN_H */
--- a/include/linux/kcmp.h
+++ b/include/linux/kcmp.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_KCMP_H
 #define _LINUX_KCMP_H
 
+#include <linux/types.h>
+
 /* Comparison type */
 enum kcmp_type {
 	KCMP_FILE,
@@ -10,8 +12,16 @@ enum kcmp_type {
 	KCMP_SIGHAND,
 	KCMP_IO,
 	KCMP_SYSVSEM,
+	KCMP_EPOLL_TFD,
 
 	KCMP_TYPES,
 };
 
+/* Slot for KCMP_EPOLL_TFD */
+struct kcmp_epoll_slot {
+	__u32 efd;		/* epoll file descriptor */
+	__u32 tfd;		/* target file number */
+	__u32 toff;		/* target offset within same numbered sequence */
+};
+
 #endif /* _LINUX_KCMP_H */
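
KCMP_EPOLL_TFD extends kcmp(2) for checkpoint/restore: idx1 names a file descriptor
of the first process, and idx2 points to a struct kcmp_epoll_slot describing where
that file is expected to appear inside an epoll instance of the second process.
A userspace sketch, assuming the usual kcmp(2) calling convention (a return of 0
means the files match):

	#include <sys/types.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/kcmp.h>

	static int kcmp(pid_t pid1, pid_t pid2, int type,
			unsigned long idx1, unsigned long idx2)
	{
		return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
	}

	/* Does @fd of @pid1 back the (@efd, @tfd, @toff) slot in @pid2's epoll? */
	static int epoll_target_matches(pid_t pid1, int fd, pid_t pid2,
					int efd, int tfd, int toff)
	{
		struct kcmp_epoll_slot slot = {
			.efd  = efd,	/* epoll file descriptor in pid2 */
			.tfd  = tfd,	/* target fd number registered in it */
			.toff = toff,	/* ordinal among duplicates of tfd */
		};

		return kcmp(pid1, pid2, KCMP_EPOLL_TFD,
			    fd, (unsigned long)&slot) == 0;
	}
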
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+	/* Coverage collection is not enabled yet. */
+	KCOV_MODE_DISABLED = 0,
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 */
+	KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ enum cpu_usage_stat {
 	CPUTIME_IRQ,
 	CPUTIME_IDLE,
 	CPUTIME_IOWAIT,
+	CPUTIME_USED,
 	CPUTIME_STEAL,
 	CPUTIME_GUEST,
 	CPUTIME_GUEST_NICE,
@@ -35,6 +36,42 @@ struct kernel_cpustat {
 	u64 cpustat[NR_STATS];
 };
 
+static inline u64 kernel_cpustat_total_usage(const struct kernel_cpustat *p)
+{
+	return p->cpustat[CPUTIME_USER] + p->cpustat[CPUTIME_NICE] +
+		p->cpustat[CPUTIME_SYSTEM];
+}
+
+static inline u64 kernel_cpustat_total_idle(const struct kernel_cpustat *p)
+{
+	return p->cpustat[CPUTIME_IDLE] + p->cpustat[CPUTIME_IOWAIT];
+}
+
+static inline void kernel_cpustat_zero(struct kernel_cpustat *p)
+{
+	memset(p, 0, sizeof(*p));
+}
+
+static inline void kernel_cpustat_add(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] + rhs->cpustat[i];
+}
+
+static inline void kernel_cpustat_sub(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] - rhs->cpustat[i];
+}
+
 struct kernel_stat {
 #ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
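
The new kernel_cpustat helpers make it straightforward to snapshot cputime counters
and work with deltas, which is what the per-container CPU statistics elsewhere in
this series rely on.  A small sketch built only from the helpers declared above:

	static u64 busy_delta(const struct kernel_cpustat *before,
			      const struct kernel_cpustat *after)
	{
		struct kernel_cpustat delta;

		/* delta = after - before, field by field */
		kernel_cpustat_sub(after, before, &delta);

		/* user + nice + system time accumulated over the interval */
		return kernel_cpustat_total_usage(&delta);
	}
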
--- /dev/null
+++ b/include/linux/kmapset.h
@@ -0,0 +1,104 @@
+/*
+ *  include/linux/kmapset.h
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_KMAPSET_H
+#define _LINUX_KMAPSET_H
+
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/rculist.h>
+#include <linux/kref.h>
+
+struct kmapset_map;
+
+struct kmapset_set {
+	struct mutex		mutex;
+	struct rb_root		tree;
+	unsigned long		default_value;
+};
+
+struct kmapset_map {
+	struct kref		kref;
+	unsigned		size;
+	struct kmapset_set	*set;
+	unsigned long		default_value;
+	unsigned long		hash;
+	struct hlist_head	links;
+	union {
+		struct rb_node		node;
+		struct rcu_head		rcu_head;
+	};
+};
+
+struct kmapset_key {
+	struct hlist_head	links;
+};
+
+struct kmapset_link {
+	struct kmapset_map	*map;
+	struct kmapset_key	*key;
+	unsigned long		value;
+	struct hlist_node	map_link;
+	union {
+		struct hlist_node	key_link;
+		struct rcu_head		rcu_head;
+	};
+};
+
+static inline void kmapset_lock(struct kmapset_set *set)
+{
+	mutex_lock(&set->mutex);
+}
+
+static inline void kmapset_unlock(struct kmapset_set *set)
+{
+	mutex_unlock(&set->mutex);
+}
+
+struct kmapset_map *kmapset_new(struct kmapset_set *set);
+
+static inline void kmapset_init_set(struct kmapset_set *set)
+{
+	mutex_init(&set->mutex);
+	set->tree = RB_ROOT;
+	set->default_value = 0;
+}
+
+static inline void kmapset_init_map(struct kmapset_map *map,
+		struct kmapset_set *set)
+{
+	kref_init(&map->kref);
+	map->size = 0;
+	map->set = set;
+	map->default_value = set->default_value;
+	INIT_HLIST_HEAD(&map->links);
+	RB_CLEAR_NODE(&map->node);
+}
+
+static inline void kmapset_init_key(struct kmapset_key *key)
+{
+	 INIT_HLIST_HEAD(&key->links);
+}
+
+struct kmapset_map *kmapset_get(struct kmapset_map *map);
+void kmapset_put(struct kmapset_map *map);
+
+struct kmapset_map *kmapset_dup(struct kmapset_map *old);
+struct kmapset_map *kmapset_commit(struct kmapset_map *map);
+
+struct kmapset_link *kmapset_lookup(struct kmapset_map *map,
+		struct kmapset_key *key);
+unsigned long kmapset_get_value(struct kmapset_map *map,
+		struct kmapset_key *key);
+int kmapset_set_value(struct kmapset_map *map,
+		struct kmapset_key *key, unsigned long value);
+bool kmapset_del_value(struct kmapset_map *map, struct kmapset_key *key);
+void kmapset_set_default(struct kmapset_map *map, unsigned long value);
+
+void kmapset_unlink(struct kmapset_key *key, struct kmapset_set *set);
+
+#endif /* _LINUX_KMAPSET_H */
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -21,6 +21,8 @@
 #ifndef __KMEMLEAK_H
 #define __KMEMLEAK_H
 
+#include <linux/slab.h>
+
 #ifdef CONFIG_DEBUG_KMEMLEAK
 
 extern void kmemleak_init(void) __ref;
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -23,7 +23,7 @@
 #include <linux/stddef.h>
 #include <linux/errno.h>
 #include <linux/compiler.h>
-#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/sysctl.h>
 
 #define KMOD_PATH_LEN 256
@@ -44,6 +44,11 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS;
 #define try_then_request_module(x, mod...) (x)
 #endif
 
+#ifdef CONFIG_VE_IPTABLES
+extern bool module_payload_allowed(const char *module);
+#else
+static inline bool module_payload_allowed(const char *module) { return true; }
+#endif
 
 struct cred;
 struct file;
@@ -54,7 +59,7 @@ struct file;
 #define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
 
 struct subprocess_info {
-	struct work_struct work;
+	struct kthread_work work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -66,6 +71,11 @@ struct subprocess_info {
 	void *data;
 };
 
+extern int
+call_usermodehelper_by(struct kthread_worker *worker,
+			char *path, char **argv, char **envp, int wait,
+			int (*init)(struct subprocess_info *info, struct cred *new),
+			void (*cleanup)(struct subprocess_info *), void *data);
 extern int
 call_usermodehelper(char *path, char **argv, char **envp, int wait);
 
@@ -85,6 +95,8 @@ enum umh_disable_depth {
 	UMH_DISABLED,
 };
 
+extern void usermodehelper_init(void);
+
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -206,6 +206,8 @@ extern struct kobject *firmware_kobj;
 int kobject_uevent(struct kobject *kobj, enum kobject_action action);
 int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 			char *envp[]);
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
+			char *envp[]);
 
 __printf(2, 3)
 int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);
--- a/include/linux/kobject_ns.h
+++ b/include/linux/kobject_ns.h
@@ -27,6 +27,7 @@ struct kobject;
 enum kobj_ns_type {
 	KOBJ_NS_TYPE_NONE = 0,
 	KOBJ_NS_TYPE_NET,
+	KOBJ_NS_TYPE_VE,
 	KOBJ_NS_TYPES
 };
 
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -76,8 +76,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 int page_referenced_ksm(struct page *page,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
-		  struct vm_area_struct *, unsigned long, void *), void *arg);
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
 #else  /* !CONFIG_KSM */
@@ -120,8 +119,8 @@ static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	return 0;
 }
 
-static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static inline int rmap_walk_ksm(struct page *page,
+			struct rmap_walk_control *rwc)
 {
 	return 0;
 }
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -5,15 +5,34 @@
 #include <linux/sched.h>
 
 __printf(4, 5)
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data,
-					   int node,
-					   const char namefmt[], ...);
+struct kthread_create_info
+{
+	/* Information passed to kthread() from kthreadd. */
+	int (*threadfn)(void *data);
+	void *data;
+	int node;
+
+	/* Result passed back to kthread_create() from kthreadd. */
+	struct task_struct *result;
+	struct completion done;
+
+	struct list_head list;
+};
+
+struct task_struct *__kthread_create_on_node(
+		void (*addfn)(void *data, struct kthread_create_info *create),
+		void *add_data,
+		int (*threadfn)(void *data),
+		void *data, int node,
+		const char namefmt[],
+		...);
+
+#define kthread_create_on_node(threadfn, data, node, namefmt, arg...)	\
+	__kthread_create_on_node(NULL, NULL, threadfn, data, node, namefmt, ##arg)
 
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
 
-
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data,
 					  unsigned int cpu,
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -148,6 +148,9 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_SMI               26
 #define KVM_REQ_HV_CRASH          27
 #define KVM_REQ_IOAPIC_EOI_EXIT   28
+#define KVM_REQ_HV_RESET          29
+#define KVM_REQ_HV_EXIT           30
+#define KVM_REQ_HV_STIMER         31
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -316,6 +319,11 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
 	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 }
 
+struct kvm_hv_sint {
+	u32 vcpu;
+	u32 sint;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -328,6 +336,7 @@ struct kvm_kernel_irq_routing_entry {
 			unsigned pin;
 		} irqchip;
 		struct msi_msg msi;
+		struct kvm_hv_sint hv_sint;
 	};
 	struct hlist_node link;
 };
@@ -335,6 +344,7 @@ struct kvm_kernel_irq_routing_entry {
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 struct kvm_irq_routing_table {
 	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+	struct kvm_kernel_irq_routing_entry *rt_entries;
 	u32 nr_rt_entries;
 	/*
 	 * Array indexed by gsi. Each entry contains list of irq chips
@@ -448,6 +458,8 @@ struct kvm {
 #define vcpu_debug_ratelimited(vcpu, fmt, ...)				\
 	kvm_debug_ratelimited("vcpu%i " fmt, (vcpu)->vcpu_id,           \
 			      ## __VA_ARGS__)
+#define vcpu_err(vcpu, fmt, ...)					\
+	kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 {
@@ -465,6 +477,28 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 	     (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
 	     idx++)
 
+static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		if (vcpu->vcpu_id == id)
+			return vcpu;
+	return NULL;
+}
+
+static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu *tmp;
+	int idx;
+
+	kvm_for_each_vcpu(idx, tmp, vcpu->kvm)
+		if (tmp == vcpu)
+			return idx;
+	BUG();
+}
+
 #define kvm_for_each_memslot(memslot, slots)	\
 	for (memslot = &slots->memslots[0];	\
 	      memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
@@ -1070,6 +1104,8 @@ static inline void kvm_irq_routing_update(struct kvm *kvm)
 {
 }
 #endif
+void kvm_arch_irq_routing_update(struct kvm *kvm);
+void kvm_arch_post_irq_routing_update(struct kvm *kvm);
 
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -9,6 +9,9 @@
 
 #include <linux/list.h>
 #include <linux/nodemask.h>
+#include <linux/shrinker.h>
+
+struct mem_cgroup;
 
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
@@ -21,24 +24,46 @@ enum lru_status {
 				   internally, but has to return locked. */
 };
 
-struct list_lru_node {
-	spinlock_t		lock;
+struct list_lru_one {
 	struct list_head	list;
-	/* kept as signed so we can catch imbalance bugs */
+	/* may become negative during memcg reparenting */
 	long			nr_items;
+};
+
+struct list_lru_memcg {
+	struct rcu_head		rcu;
+	/* array of per cgroup lists, indexed by memcg_cache_id */
+	struct list_lru_one	*lru[0];
+};
+
+struct list_lru_node {
+	/* protects all lists on the node, including per cgroup */
+	spinlock_t		lock;
+	/* global list, used for the root cgroup in cgroup aware lrus */
+	struct list_lru_one	lru;
+#ifdef CONFIG_MEMCG_KMEM
+	/* for cgroup-aware lrus, points to per-cgroup lists; otherwise NULL */
+	struct list_lru_memcg	__rcu *memcg_lrus;
+#endif
 } ____cacheline_aligned_in_smp;
 
 struct list_lru {
 	struct list_lru_node	*node;
-	nodemask_t		active_nodes;
+#ifdef CONFIG_MEMCG_KMEM
+	struct list_head	list;
+#endif
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
-static inline int list_lru_init(struct list_lru *lru)
-{
-	return list_lru_init_key(lru, NULL);
-}
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key);
+
+#define list_lru_init(lru)		__list_lru_init((lru), false, NULL)
+#define list_lru_init_key(lru, key)	__list_lru_init((lru), false, (key))
+#define list_lru_init_memcg(lru)	__list_lru_init((lru), true, NULL)
+
+int memcg_update_all_list_lrus(int num_memcgs);
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
 
 /**
  * list_lru_add: add an element to the lru list's tail
@@ -72,32 +97,48 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
 /**
- * list_lru_count_node: return the number of objects currently held by @lru
+ * list_lru_count_one: return the number of objects currently held by @lru
  * @lru: the lru pointer.
  * @nid: the node id to count from.
+ * @memcg: the cgroup to count from.
  *
  * Always return a non-negative number, 0 for empty lists. There is no
  * guarantee that the list is not updated while the count is being computed.
  * Callers that want such a guarantee need to provide an outer lock.
  */
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg);
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+						  struct shrink_control *sc)
+{
+	return list_lru_count_one(lru, sc->nid, sc->memcg);
+}
+
 static inline unsigned long list_lru_count(struct list_lru *lru)
 {
 	long count = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes)
+	for_each_node_state(nid, N_NORMAL_MEMORY)
 		count += list_lru_count_node(lru, nid);
 
 	return count;
 }
 
-typedef enum lru_status
-(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head);
+
+typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
+		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+
 /**
- * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
  *  the item currently being scanned
  * @cb_arg: opaque type that will be passed to @isolate
@@ -115,10 +156,22 @@ typedef enum lru_status
  *
  * Return value: the number of objects effectively removed from the LRU.
  */
+unsigned long list_lru_walk_one(struct list_lru *lru,
+				int nid, struct mem_cgroup *memcg,
+				list_lru_walk_cb isolate, void *cb_arg,
+				unsigned long *nr_to_walk);
 unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk);
 
+static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
+				 &sc->nr_to_scan);
+}
+
 static inline unsigned long
 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	      void *cb_arg, unsigned long nr_to_walk)
@@ -126,7 +179,7 @@ list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	long isolated = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes) {
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		isolated += list_lru_walk_node(lru, nid, isolate,
 					       cb_arg, &nr_to_walk);
 		if (nr_to_walk <= 0)
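
With the per-memcg rework, a cgroup-aware shrinker initializes its LRU with
list_lru_init_memcg() and then simply forwards the shrink_control it is given,
which now carries both the node id and the target memcg.  A condensed sketch,
assuming the SHRINKER_MEMCG_AWARE flag introduced alongside these changes; names
are illustrative and error handling is omitted:

	static struct list_lru my_lru;	/* list_lru_init_memcg(&my_lru) at init */

	static enum lru_status my_isolate(struct list_head *item,
					  struct list_lru_one *list,
					  spinlock_t *lock, void *cb_arg)
	{
		/* Detach from the per-node/per-memcg list under @lock ... */
		list_lru_isolate(list, item);
		/* ... queue the object for freeing and report it removed. */
		return LRU_REMOVED;
	}

	static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
	{
		return list_lru_shrink_count(&my_lru, sc);
	}

	static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
	{
		return list_lru_shrink_walk(&my_lru, sc, my_isolate, NULL);
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};
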
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,12 +23,16 @@
 #include <linux/vm_event_item.h>
 #include <linux/hardirq.h>
 #include <linux/jump_label.h>
+#include <linux/page-flags.h>
 
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_context;
+
+extern struct oom_context global_oom_ctx;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -41,47 +45,38 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
-#ifdef CONFIG_MEMCG
 /*
- * All "charge" functions with gfp_mask should use GFP_KERNEL or
- * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
- * alloc memory but reclaims memory from all available zones. So, "where I want
- * memory from" bits of gfp_mask has no meaning. So any bits of that field is
- * available but adding a rule is better. charge functions' gfp_mask should
- * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
- * codes.
- * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
  */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
+#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
+#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
 
-extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask);
-/* for swap handling */
-extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-		struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
-extern void mem_cgroup_commit_charge_swapin(struct page *page,
-					struct mem_cgroup *memcg);
-extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
+#ifdef CONFIG_MEMCG
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp);
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+			      bool lrucare);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+void mem_cgroup_uncharge(struct page *page);
+void mem_cgroup_uncharge_list(struct list_head *page_list);
 
-extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-					gfp_t gfp_mask);
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+			bool lrucare);
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
-/* For coalescing uncharge for reducing memcg' overhead*/
-extern void mem_cgroup_uncharge_start(void);
-extern void mem_cgroup_uncharge_end(void);
-
-extern void mem_cgroup_uncharge_page(struct page *page);
-extern void mem_cgroup_uncharge_cache_page(struct page *page);
-
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
+extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
@@ -101,11 +96,7 @@ bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 
-extern void
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-			     struct mem_cgroup **memcgp);
-extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-	struct page *oldpage, struct page *newpage, bool migration_ok);
+unsigned long page_cgroup_ino(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
@@ -115,14 +106,18 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
+bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg, int vfs_cache_min_ratio);
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+bool mem_cgroup_cleancache_disabled(struct page *page);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
+extern unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg);
+extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
+				     struct task_struct *task);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
-extern void mem_cgroup_replace_page_cache(struct page *oldpage,
-					struct page *newpage);
 
 static inline void mem_cgroup_oom_enable(void)
 {
@@ -154,6 +149,16 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+	css_get(mem_cgroup_css(memcg));
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+	css_put(mem_cgroup_css(memcg));
+}
+
 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
 					 unsigned long *flags);
 
@@ -221,46 +226,36 @@ void mem_cgroup_print_bad_page(struct page *page);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
-static inline int mem_cgroup_newpage_charge(struct page *page,
-					struct mm_struct *mm, gfp_t gfp_mask)
+static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+					gfp_t gfp_mask,
+					struct mem_cgroup **memcgp)
 {
+	*memcgp = NULL;
 	return 0;
 }
 
-static inline int mem_cgroup_cache_charge(struct page *page,
-					struct mm_struct *mm, gfp_t gfp_mask)
+static inline void mem_cgroup_commit_charge(struct page *page,
+					    struct mem_cgroup *memcg,
+					    bool lrucare)
 {
-	return 0;
 }
 
-static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-		struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
+static inline void mem_cgroup_cancel_charge(struct page *page,
+					    struct mem_cgroup *memcg)
 {
-	return 0;
 }
 
-static inline void mem_cgroup_commit_charge_swapin(struct page *page,
-					  struct mem_cgroup *memcg)
+static inline void mem_cgroup_uncharge(struct page *page)
 {
 }
 
-static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_uncharge_start(void)
-{
-}
-
-static inline void mem_cgroup_uncharge_end(void)
-{
-}
-
-static inline void mem_cgroup_uncharge_page(struct page *page)
-{
-}
-
-static inline void mem_cgroup_uncharge_cache_page(struct page *page)
+static inline void mem_cgroup_migrate(struct page *oldpage,
+				      struct page *newpage,
+				      bool lrucare)
 {
 }
 
@@ -281,17 +276,17 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return NULL;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
-{
-	return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
 		struct mem_cgroup *memcg)
 {
 	return true;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+	return NULL;
+}
+
 static inline int task_in_mem_cgroup(struct task_struct *task,
 				     const struct mem_cgroup *memcg)
 {
@@ -304,17 +299,6 @@ static inline struct cgroup_subsys_state
 	return NULL;
 }
 
-static inline void
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-			     struct mem_cgroup **memcgp)
-{
-}
-
-static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-		struct page *oldpage, struct page *newpage, bool migration_ok)
-{
-}
-
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -333,10 +317,29 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
-static inline int
-mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
 {
-	return 1;
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
+static inline bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg,
+	int vfs_cache_min_ratio)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_low(struct mem_cgroup *root,
+				  struct mem_cgroup *memcg)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_cleancache_disabled(struct page *page)
+{
+	return false;
 }
 
 static inline unsigned long
@@ -351,6 +354,22 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline struct oom_context *
+mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	return &global_oom_ctx;
+}
+
+static inline unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static inline void
+mem_cgroup_note_oom_kill(struct mem_cgroup *memcg, struct task_struct *task)
+{
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
@@ -410,10 +429,6 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
-static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
-				struct page *newpage)
-{
-}
 #endif /* CONFIG_MEMCG */
 
 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
@@ -451,7 +466,19 @@ static inline void sock_release_memcg(struct sock *sk)
 #ifdef CONFIG_MEMCG_KMEM
 extern struct static_key memcg_kmem_enabled_key;
 
-extern int memcg_limited_groups_array_size;
+extern int memcg_nr_cache_ids;
+extern void memcg_get_cache_ids(void);
+extern void memcg_put_cache_ids(void);
+
+static inline void memcg_stop_kmem_account(void)
+{
+	current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+	current->memcg_kmem_skip_account--;
+}
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -459,13 +486,15 @@ extern int memcg_limited_groups_array_size;
  * the slab_mutex must be held when looping through those caches
  */
 #define for_each_memcg_cache_index(_idx)	\
-	for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
+	for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
 
 static inline bool memcg_kmem_enabled(void)
 {
 	return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+
 /*
  * In general, we'll do everything in our power to not incur in any overhead
  * for non-memcg users for the kmem functions. Not even a function call, if we
@@ -477,31 +506,25 @@ static inline bool memcg_kmem_enabled(void)
  * conditions, but because they are pretty simple, they are expected to be
  * fast.
  */
-bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
-					int order);
-void __memcg_kmem_commit_charge(struct page *page,
-				       struct mem_cgroup *memcg, int order);
+bool __memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
-void memcg_update_array_size(int num_groups);
 
 struct kmem_cache *
 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+void __memcg_kmem_put_cache(struct kmem_cache *cachep);
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
 
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages);
+void memcg_charge_kmem_nofail(struct mem_cgroup *memcg, unsigned long nr_pages);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
 
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
+ * @page: page to charge.
  * @gfp: the gfp allocation flags.
- * @memcg: a pointer to the memcg this was charged against.
  * @order: allocation order.
  *
  * returns true if the memcg where the current task belongs can hold this
@@ -511,10 +534,12 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
  * any memcg.
  */
 static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
 	if (!memcg_kmem_enabled())
 		return true;
+	if (!(gfp & __GFP_ACCOUNT))
+		return true;
 
 	/*
 	 * __GFP_NOFAIL allocations will move on even if charging is not
@@ -522,7 +547,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 	 * unaccounted. We could in theory charge it forcibly, but we hope
 	 * those allocations are rare, and won't be worth the trouble.
 	 */
-	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+	if (gfp & __GFP_NOFAIL)
 		return true;
 	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
 		return true;
@@ -531,7 +556,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 	if (unlikely(fatal_signal_pending(current)))
 		return true;
 
-	return __memcg_kmem_newpage_charge(gfp, memcg, order);
+	return __memcg_kmem_newpage_charge(page, gfp, order);
 }
 
 /**
@@ -544,44 +569,16 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 static inline void
 memcg_kmem_uncharge_pages(struct page *page, int order)
 {
-	if (memcg_kmem_enabled())
+	if (memcg_kmem_enabled() && PageKmemcg(page))
 		__memcg_kmem_uncharge_pages(page, order);
 }
 
-/**
- * memcg_kmem_commit_charge: embeds correct memcg in a page
- * @page: pointer to struct page recently allocated
- * @memcg: the memcg structure we charged against
- * @order: allocation order.
- *
- * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
- * failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit the memcg given by @memcg to the
- * corresponding page_cgroup.
- */
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
-{
-	if (memcg_kmem_enabled() && memcg)
-		__memcg_kmem_commit_charge(page, memcg, order);
-}
-
 /**
  * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
  * @cachep: the original global kmem cache
  * @gfp: allocation flags.
  *
- * This function assumes that the task allocating, which determines the memcg
- * in the page allocator, belongs to the same cgroup throughout the whole
- * process.  Misacounting can happen if the task calls memcg_kmem_get_cache()
- * while belonging to a cgroup, and later on changes. This is considered
- * acceptable, and should only happen upon task migration.
- *
- * Before the cache is created by the memcg core, there is also a possible
- * imbalance: the task belongs to a memcg, but the cache being allocated from
- * is the global cache, since the child cache is not yet guaranteed to be
- * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
- * passed and the page allocator will not attempt any cgroup accounting.
+ * All memory allocated from a per-memcg cache is charged to the owner memcg.
  */
 static __always_inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
@@ -597,6 +594,19 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 
 	return __memcg_kmem_get_cache(cachep, gfp);
 }
+
+static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_put_cache(cachep);
+}
+
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+	if (!memcg_kmem_enabled())
+		return NULL;
+	return __mem_cgroup_from_kmem(ptr);
+}
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
@@ -606,8 +616,13 @@ static inline bool memcg_kmem_enabled(void)
 	return false;
 }
 
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+	return false;
+}
+
 static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
 	return true;
 }
@@ -616,29 +631,24 @@ static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
 {
 }
 
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
 {
+	return -1;
 }
 
-static inline int memcg_cache_id(struct mem_cgroup *memcg)
+static inline void memcg_get_cache_ids(void)
 {
-	return -1;
 }
 
-static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-		     struct kmem_cache *root_cache)
+static inline void memcg_put_cache_ids(void)
 {
-	return 0;
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_stop_kmem_account(void)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-					struct kmem_cache *s)
+static inline void memcg_resume_kmem_account(void)
 {
 }
 
@@ -648,8 +658,13 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 	return cachep;
 }
 
-static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
 {
+	return NULL;
 }
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
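
The per-page charging API collapses into a single try/commit/cancel protocol:
reserve the charge with mem_cgroup_try_charge(), make the page visible (map it or
insert it into the page cache), then either commit or cancel.  A hedged sketch of
the typical sequence; install_page() is hypothetical and locking/rmap details are
omitted:

	static int charge_new_page(struct page *page, struct mm_struct *mm,
				   gfp_t gfp)
	{
		struct mem_cgroup *memcg;

		if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
			return -ENOMEM;			/* reservation failed */

		if (install_page(page)) {		/* hypothetical: map/insert */
			/* Nobody saw the page; give the reservation back. */
			mem_cgroup_cancel_charge(page, memcg);
			return -ENOMEM;
		}

		/* The page is now visible: tie the charge to it. */
		mem_cgroup_commit_charge(page, memcg, false);
		return 0;
	}
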
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -196,6 +196,7 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 }
 
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern void mpol_put_task_policy(struct task_struct *);
 
 #else
 
@@ -320,5 +321,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
 	return -1; /* no node preference */
 }
 
+static inline void mpol_put_task_policy(struct task_struct *task)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -93,7 +93,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-extern struct kmem_cache *vm_area_cachep;
+extern struct kmem_cache *__vm_area_cachep;
+#define allocate_vma(mm, gfp_flags)	kmem_cache_alloc(__vm_area_cachep, gfp_flags)
+#define free_vma(mm, vma)		kmem_cache_free(__vm_area_cachep, vma)
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -138,7 +140,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
-#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_ARCH_2	0x02000000
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
@@ -218,21 +219,19 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY	0x08	/* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED	0x40	/* second try */
-#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED	0x20	/* Second try */
+#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
@@ -297,10 +296,6 @@ struct vm_operations_struct {
 	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
 		const nodemask_t *to, unsigned long flags);
 #endif
-	/* called by sys_remap_file_pages() to populate non-linear mapping */
-	int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
-			   unsigned long size, pgoff_t pgoff);
-
 	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
 	RH_KABI_EXTEND(int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf))
 	RH_KABI_EXTEND(int (*pmd_fault)(struct vm_area_struct *,
@@ -406,6 +401,20 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+	return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
 extern void kvfree(const void *addr);
 
 static inline void compound_lock(struct page *page)
@@ -504,53 +513,6 @@ static inline struct page *virt_to_head_page(const void *x)
 	return compound_head(page);
 }
 
-/*
- * PageBuddy() indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
- * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
- * -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
- * efficiently by most CPU architectures.
- */
-#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
-
-static inline int PageBuddy(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBuddy(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
-#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
-
-static inline int PageBalloon(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBalloon(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
@@ -1162,7 +1124,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
@@ -1265,6 +1226,7 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
@@ -1308,6 +1270,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 void account_page_dirtied(struct page *page, struct address_space *mapping);
 void account_page_writeback(struct page *page);
 int set_page_dirty(struct page *page);
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
@@ -1844,12 +1807,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-					struct list_head *list)
-{
-	list_add_tail(&vma->shared.nonlinear, list);
-}
-
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root *root);
 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
@@ -1914,6 +1871,8 @@ extern struct file *get_mm_exe_file(struct mm_struct *mm);
 extern struct file *get_task_exe_file(struct task_struct *task);
 
 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
+extern bool vma_is_vdso_or_vvar(const struct vm_area_struct *vma,
+				   const struct mm_struct *mm);
 extern int install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
 				   unsigned long flags, struct page **pages);
@@ -1991,6 +1950,14 @@ extern void truncate_inode_pages_final(struct address_space *);
 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+struct path;
+struct cred;
+int open_mapping_peer(struct address_space *mapping,
+		struct path *path, const struct cred *cred);
+void close_mapping_peer(struct address_space *mapping);
+struct page *pick_peer_page(struct address_space *mapping, pgoff_t index,
+		struct file_ra_state *ra, unsigned ra_size);
+
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
 void task_dirty_inc(struct task_struct *tsk);
@@ -2020,7 +1987,6 @@ unsigned long ra_submit(struct file_ra_state *ra,
 			struct file *filp);
 
 extern unsigned long stack_guard_gap;
-
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
 
@@ -2189,18 +2155,24 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 #endif
 
-unsigned long shrink_slab(struct shrink_control *shrink,
-			  unsigned long nr_pages_scanned,
-			  unsigned long lru_pages);
+void drop_slab(void);
+void drop_slab_node(int nid);
 
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
 #else
-extern int randomize_va_space;
+extern int _randomize_va_space;
+#ifndef CONFIG_VE
+#define randomize_va_space _randomize_va_space
+#else
+#define randomize_va_space (get_exec_env()->_randomize_va_space)
+#endif
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
-void print_vma_addr(char *prefix, unsigned long rip);
+void ve_print_vma_addr(int dst, char *prefix, unsigned long rip);
+#define print_vma_addr(prefix, rip) \
+	ve_print_vma_addr(VE0_LOG, (prefix), (rip))
 
 void sparse_mem_maps_populate_node(struct page **map_map,
 				   unsigned long pnum_begin,
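
The kvmalloc_node()/kvzalloc() helpers added to mm.h above provide one call that
tries kmalloc() first and falls back to vmalloc() for large or fragmented
allocations, with the existing kvfree() freeing either kind.  A short sketch of
the intended replacement for open-coded fallback logic (alloc_table() is
illustrative):

	/* Before: kmalloc() for small tables, vmalloc() otherwise. */
	static void *alloc_table(size_t nr_entries)
	{
		return kvzalloc(nr_entries * sizeof(unsigned long), GFP_KERNEL);
	}

	static void free_table(void *table)
	{
		kvfree(table);	/* handles both kmalloc'ed and vmalloc'ed memory */
	}
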
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -25,6 +25,7 @@
 
 struct address_space;
 struct hmm;
+struct gang;
 
 #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
@@ -306,15 +307,11 @@ struct vm_area_struct {
 
 	/*
 	 * For areas with an address space and backing store,
-	 * linkage into the address_space->i_mmap interval tree, or
-	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 * linkage into the address_space->i_mmap interval tree.
 	 */
-	union {
-		struct {
-			struct rb_node rb;
-			unsigned long rb_subtree_last;
-		} linear;
-		struct list_head nonlinear;
+	struct {
+		struct rb_node rb;
+		unsigned long rb_subtree_last;
 	} shared;
 
 	/*
@@ -335,6 +332,15 @@ struct vm_area_struct {
 					   units, *not* PAGE_CACHE_SIZE */
 	struct file * vm_file;		/* File we map to (can be NULL). */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
+	/*
+	 * Special for pfcache - we can't reuse vm_private_data to save
+	 * memory: the field is inherited on fork, and to distinguish
+	 * whether it is ours (and needs clearing) we would still need
+	 * some kind of flag on the address_space of these vmas.  There
+	 * are too few free flags left, so that would imply adding yet
+	 * another variable anyway.
+	 */
+	void * vm_private_data2;
 
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
@@ -407,6 +413,11 @@ struct mm_struct {
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
 	unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	/* Base addresses for compatible mmap() */
+	unsigned long mmap_compat_base;
+	unsigned long mmap_compat_legacy_base;
+#endif
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
 	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
@@ -457,6 +468,11 @@ struct mm_struct {
 
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
+	unsigned int vps_dumpable:2;
+
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *mm_ub;
+#endif
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
@@ -543,6 +559,12 @@ struct mm_struct {
 	RH_KABI_RESERVE(8)
 };
 
+#define VD_VE_ENTER_TASK	0 /* tasks that entered the VE from the host;
+				   * no ptrace, coredump or licdata access allowed */
+#define VD_PTRACE_COREDUMP	1 /* tasks with ptrace and coredump allowed */
+#define VD_LICDATA_ACCESS	2 /* tasks that accessed the container's license
+				   * data; no ptrace and no coredump allowed */
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -64,6 +64,15 @@ struct mmu_notifier_ops {
 				 struct mm_struct *mm,
 				 unsigned long address);
 
+	/*
+	 * clear_young is a lightweight version of clear_flush_young. Like the
+	 * latter, it is supposed to test-and-clear the young/accessed bitflag
+	 * in the secondary pte, but it may omit flushing the secondary tlb.
+	 */
+	int (*clear_young)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long address);
+
 	/*
 	 * test_young is called to check the young/accessed bitflag in
 	 * the secondary pte. This is used to know if the page is
@@ -240,6 +249,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+				      unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -268,6 +279,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_young(mm, address);
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -382,6 +401,26 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pmd;								\
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
 /*
  * set_pte_at_notify() sets the pte _after_ running the notifier.
  * This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -462,6 +501,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_clear_flush_notify pmdp_clear_flush
 #define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
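The new clear_young callback and the ptep/pmdp_clear_young_notify() wrappers are meant for callers that only need the referenced information and can tolerate a stale secondary TLB. A minimal sketch of such a caller, assuming the usual rmap plumbing supplies the vma, address and pte, and using the page_idle helpers introduced later in this series:

	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>
	#include <linux/page_idle.h>

	/* Illustrative only: clear the accessed bit in the primary pte and in
	 * any secondary MMU without flushing the secondary TLB, and remember
	 * the reference in the software "young" flag for a later pass. */
	static void example_clear_pte_refs(struct vm_area_struct *vma,
					   unsigned long addr, pte_t *ptep,
					   struct page *page)
	{
		if (ptep_clear_young_notify(vma, addr, ptep))
			set_page_young(page);
	}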
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -16,6 +16,9 @@
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
 #include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <generated/bounds.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -373,7 +376,6 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
-	int                     all_unreclaimable; /* All pages pinned */
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
@@ -428,6 +430,13 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+	/* Refaults at the time of last reclaim cycle */
+	unsigned long			refaults;
+
+#ifdef CONFIG_MEMCG
+	bool force_scan;
+#endif
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/tracepoint.h>
 #include <linux/export.h>
+#include <linux/rbtree_latch.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -84,7 +85,7 @@ void trim_init_extable(struct module *m);
 
 #ifdef MODULE
 #define MODULE_GENERIC_TABLE(gtype,name)			\
-extern const struct gtype##_id __mod_##gtype##_table		\
+extern const typeof(name) __mod_##gtype##_table			\
   __attribute__ ((unused, alias(__stringify(name))))
 
 #else  /* !MODULE */
@@ -236,8 +237,14 @@ struct module_ext {
 #endif
 };
 
-struct module
-{
+struct module;
+
+struct mod_tree_node {
+	struct module *mod;
+	struct latch_tree_node node;
+};
+
+struct module {
 	enum module_state state;
 
 	/* Member of list of modules */
@@ -296,8 +303,15 @@ struct module
 	/* Startup function. */
 	int (*init)(void);
 
-	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	/*
+	 * If this is non-NULL, vfree() after init() returns.
+	 *
+	 * Cacheline align here, such that:
+	 *   module_init, module_core, init_size, core_size,
+	 *   init_text_size, core_text_size and ltn_core.node[0]
+	 * are on the same cacheline.
+	 */
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -308,6 +322,14 @@ struct module
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+	/*
+	 * We want mtn_core::{mod,node[0]} to be in the same cacheline as the
+	 * above entries such that a regular lookup will only touch one
+	 * cacheline.
+	 */
+	struct mod_tree_node	mtn_core;
+	struct mod_tree_node	mtn_init;
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
@@ -381,9 +403,6 @@ struct module
 	/* What modules do I depend on? */
 	struct list_head target_list;
 
-	/* Who is waiting for us to be unloaded */
-	struct task_struct *waiter;
-
 	/* Destruction function. */
 	void (*exit)(void);
 
@@ -395,7 +414,7 @@ struct module
 	ctor_fn_t *ctors;
 	unsigned int num_ctors;
 #endif
-};
+} ____cacheline_aligned;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
 #endif
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -80,4 +80,11 @@ int module_finalize(const Elf_Ehdr *hdr,
 /* Any cleanup needed when module leaves. */
 void module_arch_cleanup(struct module *mod);
 
+#ifdef CONFIG_KASAN
+#include <linux/kasan.h>
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#define MODULE_ALIGN PAGE_SIZE
+#endif
+
 #endif
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -42,7 +42,9 @@ struct mnt_namespace;
  * flag, consider how it interacts with shared mounts.
  */
 #define MNT_SHARED_MASK	(MNT_UNBINDABLE)
-#define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
+#define MNT_USER_SETTABLE_MASK  (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
+				 | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
+				 | MNT_READONLY)
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
 			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
@@ -51,6 +53,7 @@ struct mnt_namespace;
 
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
+
 #define MNT_DOOMED		0x1000000
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
@@ -62,6 +65,13 @@ struct vfsmount {
 	int mnt_flags;
 };
 
+struct mountpoint {
+	struct hlist_node m_hash;
+	struct dentry *m_dentry;
+	struct hlist_head m_list;
+	int m_count;
+};
+
 struct file; /* forward dec */
 struct path;
 
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -248,6 +248,11 @@ do {								\
 	net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
 #define net_dbg_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
+#define net_velog_ratelimited(fmt, ...)				\
+	net_ratelimited_function(ve_printk, VE_LOG, fmt, ##__VA_ARGS__)
+#define net_veboth_ratelimited(fmt, ...)				\
+	net_ratelimited_function(ve_printk, VE_LOG_BOTH, fmt, ##__VA_ARGS__)
+
 
 bool __net_get_random_once(void *buf, int nbytes, bool *done,
 			   struct static_key *done_key);
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -91,6 +91,9 @@ enum {
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 
 	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
+	NETIF_F_VENET_BIT,		/* device is venet device */
+	NETIF_F_VIRTUAL_BIT,		/* can be registered inside VE */
+	NETIF_F_FIXED_ADDR_BIT,
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -153,6 +156,9 @@ enum {
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
 #define NETIF_F_HW_TC		__NETIF_F(HW_TC)
+#define NETIF_F_VENET		__NETIF_F(VENET)
+#define NETIF_F_VIRTUAL		__NETIF_F(VIRTUAL)
+#define NETIF_F_FIXED_ADDR	__NETIF_F(FIXED_ADDR)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -298,7 +298,6 @@ enum netdev_state_t {
 	__LINK_STATE_DORMANT,
 };
 
-
 /*
  * This structure holds at boot time configured netdevice settings. They
  * are then used in the device probing.
@@ -764,6 +763,11 @@ struct netdev_tc_txq {
 	u16 offset;
 };
 
+struct cpt_context;
+struct cpt_ops;
+struct rst_ops;
+struct cpt_netdev_image;
+
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 /*
  * This structure is to hold information about the device
@@ -1694,6 +1698,7 @@ struct net_device {
 						   because most packets are
 						   unicast) */
 
+	unsigned char		is_leaked;
 
 #ifdef CONFIG_RPS
 	struct netdev_rx_queue	*_rx;
@@ -1915,6 +1920,20 @@ struct net_device_extended {
 
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+#define NETDEV_HASHBITS	8
+#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
 #define	NETDEV_ALIGN		32
 
 static inline
@@ -4049,6 +4068,18 @@ netdev_features_t passthru_features_check(struct sk_buff *skb,
 					  netdev_features_t features);
 netdev_features_t netif_skb_features(struct sk_buff *skb);
 
+#if defined(CONFIG_VE) && defined(CONFIG_NET)
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL));
+}
+#else
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return 0;
+}
+#endif
+
 static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 {
 	netdev_features_t feature = gso_type & SKB_GSO1_MASK;
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -407,4 +407,35 @@ static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
  */
 DECLARE_PER_CPU(bool, nf_skb_duplicated);
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/vziptable_defs.h>
+
+#define ve_ipt_permitted(netns, ipt)					\
+	(mask_ipt_allow(get_exec_env()->ipt_mask, ipt))
+
+#define net_ipt_permitted(netns, ipt)					\
+	(mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt))
+
+#define net_ipt_module_set(netns, ipt)					\
+	({								\
+		(netns)->_iptables_modules |= ipt##_MOD;	\
+	})
+
+#define net_ipt_module_clear(netns, ipt)				\
+	({								\
+		(netns)->_iptables_modules &= ~ipt##_MOD;	\
+	})
+
+#define net_is_ipt_module_set(netns, ipt)				\
+	((netns)->_iptables_modules & (ipt##_MOD))
+
+#else /* CONFIG_VE_IPTABLES */
+
+#define net_ipt_permitted(netns, ipt)		(1)
+#define net_is_ipt_module_set(netns, ipt)	(1)
+#define net_ipt_module_set(netns, ipt)
+#define net_ipt_module_clear(netns, ipt)
+
+#endif /* CONFIG_VE_IPTABLES */
+
 #endif /*__LINUX_NETFILTER_H*/
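The CONFIG_VE_IPTABLES block above keeps a per-netns bitmask of loaded iptables modules and checks a per-VE permission mask. A hedged sketch of how a table-registration path might use these macros; VE_IP_FILTER is a hypothetical module identifier standing in for whatever <linux/vziptable_defs.h> actually defines (the token-pasting macros expect a matching VE_IP_FILTER_MOD):

	#include <linux/errno.h>
	#include <linux/netfilter.h>

	static int example_register_filter_table(struct net *net)
	{
		if (!net_ipt_permitted(net, VE_IP_FILTER))
			return -EPERM;	/* this VE may not use the filter table */

		net_ipt_module_set(net, VE_IP_FILTER);	/* mark the module as loaded */
		return 0;
	}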
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -147,7 +147,8 @@ struct ip_set_type {
 	u8 revision_min, revision_max;
 
 	/* Create set */
-	int (*create)(struct ip_set *set, struct nlattr *tb[], u32 flags);
+	int (*create)(struct net *net, struct ip_set *set,
+		      struct nlattr *tb[], u32 flags);
 
 	/* Attribute policies */
 	const struct nla_policy create_policy[IPSET_ATTR_CREATE_MAX + 1];
@@ -250,11 +251,12 @@ ip_set_init_counter(struct ip_set_counter *counter,
 }
 
 /* register and unregister set references */
-extern ip_set_id_t ip_set_get_byname(const char *name, struct ip_set **set);
-extern void ip_set_put_byindex(ip_set_id_t index);
-extern const char *ip_set_name_byindex(ip_set_id_t index);
-extern ip_set_id_t ip_set_nfnl_get_byindex(ip_set_id_t index);
-extern void ip_set_nfnl_put(ip_set_id_t index);
+extern ip_set_id_t ip_set_get_byname(struct net *net,
+				     const char *name, struct ip_set **set);
+extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
+extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
+extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
+extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
 
 /* API for iptables set match, and SET target */
 
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -224,6 +224,10 @@ struct xt_table_info {
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+#endif
+
 	/*
 	 * Number of user chains. Since tables cannot have loops, at most
 	 * @stacksize jumps (number of user chains) can possibly be made.
@@ -504,4 +508,21 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int next_offset);
 
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_VE
+static inline bool ve_xt_table_forbidden(struct xt_table *xt)
+{
+	/*
+	 * The only purpose to have this check as a separate
+	 * The only purpose of having this check as a separate
+	 * helper is "grep"-ability.
+	 *
+	 * If this helper triggers, it means that a VE has been
+	 * configured without support for the particular xt_table.
+	return xt == NULL;
+}
+#else
+static inline bool ve_xt_table_forbidden(struct xt_table *xt) { return true; }
+#endif
+
 #endif /* _X_TABLES_H */
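As the comment notes, ve_xt_table_forbidden() exists mainly so that callers which may legitimately see a NULL table inside a container are easy to grep for. A sketch of the intended call pattern; the hook-level details are assumed, not taken from this patch:

	#include <linux/netfilter.h>
	#include <linux/netfilter/x_tables.h>

	static unsigned int example_verdict(struct xt_table *table)
	{
		/* A VE configured without this xt_table hands us NULL here;
		 * let traffic through instead of dereferencing the table. */
		if (ve_xt_table_forbidden(table))
			return NF_ACCEPT;

		/* ... otherwise do the usual ipt_do_table()-style walk ... */
		return NF_ACCEPT;
	}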
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -179,6 +179,7 @@ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
 			  struct user_namespace *ns, int cap);
 bool netlink_ns_capable(const struct sk_buff *skb,
 			struct user_namespace *ns, int cap);
+bool netlink_ve_capable(const struct sk_buff *skb, int cap);
 bool netlink_capable(const struct sk_buff *skb, int cap);
 bool netlink_net_capable(const struct sk_buff *skb, int cap);
 
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -156,8 +156,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
+#define NOTIFY_FAIL		0x0002		/* Reject */
 #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
+#define NOTIFY_BAD		(NOTIFY_STOP_MASK|NOTIFY_FAIL)
 						/* Bad/Veto action */
 /*
  * Clean way to return from the notifier and stop further calls.
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -73,9 +73,10 @@ static inline void put_nsproxy(struct nsproxy *ns)
 	}
 }
 
-static inline void get_nsproxy(struct nsproxy *ns)
+static inline struct nsproxy *get_nsproxy(struct nsproxy *ns)
 {
 	atomic_inc(&ns->count);
+	return ns;
 }
 
 #endif
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
 
 struct zonelist;
 struct notifier_block;
@@ -25,10 +27,23 @@ enum oom_constraint {
 enum oom_scan_t {
 	OOM_SCAN_OK,		/* scan thread and find its badness */
 	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
-	OOM_SCAN_ABORT,		/* abort the iteration and return */
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
+struct oom_context {
+	struct task_struct *owner;
+	struct task_struct *victim;
+	bool marked;
+	unsigned long oom_start;
+	unsigned long oom_end;
+	unsigned long overdraft;
+	int rage;
+	wait_queue_head_t waitq;
+};
+
+extern void init_oom_context(struct oom_context *ctx);
+extern void release_oom_context(struct oom_context *ctx);
+
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
@@ -47,26 +62,50 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
+/* linux/mm/oom_group.c */
+extern int get_task_oom_score_adj(struct task_struct *t);
+
+extern void mark_oom_victim(struct task_struct *tsk);
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
-		unsigned long totalpages);
+		unsigned long totalpages, unsigned long *overdraft);
+
+static inline bool oom_worse(unsigned long points, unsigned long overdraft,
+		unsigned long *chosen_points, unsigned long *max_overdraft)
+{
+	if (overdraft > *max_overdraft) {
+		*max_overdraft = overdraft;
+		*chosen_points = points;
+		return true;
+	}
+	if (overdraft == *max_overdraft && points > *chosen_points) {
+		*chosen_points = points;
+		return true;
+	}
+	return false;
+}
+
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			     unsigned int points, unsigned long totalpages,
+			     unsigned long points, unsigned long overdraft,
+			     unsigned long totalpages,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
+extern bool oom_trylock(struct mem_cgroup *memcg);
+extern void oom_unlock(struct mem_cgroup *memcg);
 
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask);
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill);
+					       const nodemask_t *nodemask);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *mask, bool force_kill);
+			  int order, nodemask_t *mask);
+
+extern void exit_oom_victim(void);
+
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
@@ -88,4 +127,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
 #endif /* _INCLUDE_LINUX_OOM_H */
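oom_badness() now reports a beancounter overdraft alongside the badness points, and oom_worse() encodes the comparison rule: a larger overdraft always wins, and points only break ties within the same overdraft. A minimal sketch of the selection loop these helpers are written for, with the task iteration itself assumed:

	#include <linux/oom.h>
	#include <linux/sched.h>

	static struct task_struct *example_pick_victim(struct task_struct **candidates,
						       int nr, unsigned long totalpages)
	{
		struct task_struct *chosen = NULL;
		unsigned long chosen_points = 0, max_overdraft = 0;
		int i;

		for (i = 0; i < nr; i++) {
			unsigned long overdraft;
			unsigned long points = oom_badness(candidates[i], NULL, NULL,
							   totalpages, &overdraft);

			if (oom_worse(points, overdraft, &chosen_points, &max_overdraft))
				chosen = candidates[i];	/* worst task seen so far */
		}
		return chosen;
	}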
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -108,6 +108,10 @@ enum pageflags {
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
+#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+	PG_young,
+	PG_idle,
 #endif
 	__NR_PAGEFLAGS,
 
@@ -276,6 +280,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
@@ -466,6 +477,72 @@ static inline int PageTransTail(struct page *page)
 }
 #endif
 
+/*
+ * PageBuddy() indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ *
+ * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
+ * efficiently by most CPU architectures.
+ */
+#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
+
+static inline int PageBuddy(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBuddy(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
+
+static inline int PageBalloon(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBalloon(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512)
+
+static inline int PageKmemcg(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_KMEMCG_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageKmemcg(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_KMEMCG_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageKmemcg(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageKmemcg(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
 /*
  * If network-based swap is enabled, sl*b must keep track of whether pages
  * were allocated from pfmemalloc reserves.
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -3,9 +3,9 @@
 
 enum {
 	/* flags for mem_cgroup */
-	PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
-	PCG_USED, /* this object is in use. */
-	PCG_MIGRATION, /* under page migration */
+	PCG_USED = 0x01,	/* This page is charged to a memcg */
+	PCG_MEM = 0x02,		/* This page holds a memory charge */
+	PCG_MEMSW = 0x04,	/* This page holds a memory+swap charge */
 	__NR_PCG_FLAGS,
 };
 
@@ -44,42 +44,9 @@ static inline void __init page_cgroup_init(void)
 struct page_cgroup *lookup_page_cgroup(struct page *page);
 struct page *lookup_cgroup_page(struct page_cgroup *pc);
 
-#define TESTPCGFLAG(uname, lname)			\
-static inline int PageCgroup##uname(struct page_cgroup *pc)	\
-	{ return test_bit(PCG_##lname, &pc->flags); }
-
-#define SETPCGFLAG(uname, lname)			\
-static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
-	{ set_bit(PCG_##lname, &pc->flags);  }
-
-#define CLEARPCGFLAG(uname, lname)			\
-static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
-	{ clear_bit(PCG_##lname, &pc->flags);  }
-
-#define TESTCLEARPCGFLAG(uname, lname)			\
-static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
-	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
-
-TESTPCGFLAG(Used, USED)
-CLEARPCGFLAG(Used, USED)
-SETPCGFLAG(Used, USED)
-
-SETPCGFLAG(Migration, MIGRATION)
-CLEARPCGFLAG(Migration, MIGRATION)
-TESTPCGFLAG(Migration, MIGRATION)
-
-static inline void lock_page_cgroup(struct page_cgroup *pc)
-{
-	/*
-	 * Don't take this lock in IRQ context.
-	 * This lock is for pc->mem_cgroup, USED, MIGRATION
-	 */
-	bit_spin_lock(PCG_LOCK, &pc->flags);
-}
-
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline int PageCgroupUsed(struct page_cgroup *pc)
 {
-	bit_spin_unlock(PCG_LOCK, &pc->flags);
+	return !!(pc->flags & PCG_USED);
 }
 
 #else /* CONFIG_MEMCG */
--- /dev/null
+++ b/include/linux/page_idle.h
@@ -0,0 +1,69 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/page-flags.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+static inline bool page_is_young(struct page *page)
+{
+	return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+	SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+	SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+	ClearPageIdle(page);
+}
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
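Together with the new PG_young/PG_idle flags, these wrappers support the usual two-pass idle estimation: a page is marked idle, and it counts as idle on the next pass only if neither the software young flag nor the page-table accessed bits (cleared via ptep_clear_young_notify() above) have been seen since. A simplified sketch of that cycle, assuming the caller already holds a reference on the page:

	#include <linux/page_idle.h>

	static bool example_sample_and_rearm(struct page *page)
	{
		/* idle only if it was marked idle earlier and nothing touched it */
		bool idle = page_is_idle(page) && !test_and_clear_page_young(page);

		set_page_idle(page);	/* re-arm for the next sampling interval */
		return idle;
	}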
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -338,7 +338,7 @@ do {									\
 #endif
 
 #ifndef this_cpu_sub
-# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(val))
+# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
 
 #ifndef this_cpu_inc
@@ -424,7 +424,7 @@ do {									\
 # define this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
 #endif
 
-#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(val))
+#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define this_cpu_inc_return(pcp)	this_cpu_add_return(pcp, 1)
 #define this_cpu_dec_return(pcp)	this_cpu_add_return(pcp, -1)
 
@@ -592,7 +592,7 @@ do {									\
 #endif
 
 #ifndef __this_cpu_sub
-# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(val))
+# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
 
 #ifndef __this_cpu_inc
@@ -674,7 +674,7 @@ do {									\
 	__pcpu_size_call_return2(__this_cpu_add_return_, pcp, val)
 #endif
 
-#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(val))
+#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define __this_cpu_inc_return(pcp)	__this_cpu_add_return(pcp, 1)
 #define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
 
--- /dev/null
+++ b/include/linux/pfcache.h
@@ -0,0 +1,70 @@
+/*
+ *  include/linux/pfcache.h
+ *
+ *  Parallels File Cache
+ *
+ *  Copyright (c) 2012-2015 Parallels IP Holdings GmbH
+ *
+ *  Author: Konstantin Khlebnikov
+ *
+ */
+
+#ifndef LINUX_PFCACHE_H
+#define LINUX_PFCACHE_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define FS_IOC_PFCACHE_OPEN	_IO('f', 50)
+#define FS_IOC_PFCACHE_CLOSE	_IO('f', 51)
+#define FS_IOC_PFCACHE_DUMP	_IO('f', 52)
+
+#define PFCACHE_CSUM_SIZE	20	/* SHA-1 (FIPS 180-1) */
+
+#define PFCACHE_XATTR_NAME	"trusted.pfcache"
+
+/* extendable FS_IOC_PFCACHE_DUMP argument, must be 32/64-bit compatible */
+struct pfcache_dump_request {
+	__u32	header_size;		/* this struct size */
+	__u32	buffer_size;		/* tail buffer size */
+	__u64	filter;			/* filter flags */
+	__u64	payload;		/* payload flags */
+	__u32	offset;			/* skip inodes, after filtering */
+	__u8	csum_filter[PFCACHE_CSUM_SIZE];
+	/* -- add fields above this line -- */
+	__u8	buffer[0];
+};
+
+/* to check new fields presence */
+#define PFCACHE_DUMP_HAS(req, field)	((req)->header_size >= \
+		offsetof(typeof(*(req)), field) + sizeof((req)->field))
+
+/* filter bits, what to skip */
+#define PFCACHE_FILTER_WITH_CSUM	0x0001ll
+#define PFCACHE_FILTER_WITHOUT_CSUM	0x0002ll
+#define PFCACHE_FILTER_WITH_PEER	0x0004ll
+#define PFCACHE_FILTER_WITHOUT_PEER	0x0008ll
+#define PFCACHE_FILTER_COMPARE_CSUM	0x0010ll /* check csum_filter */
+#define PFCACHE_FILTER_MASK		0x001Fll /* all known filters */
+
+/* payload bits, what to dump */
+#define PFCACHE_PAYLOAD_CSUM		0x0001ll /* u8[EXT4_DATA_CSUM_SIZE] */
+#define PFCACHE_PAYLOAD_FHANDLE		0x0002ll /* struct file_handle */
+#define PFCACHE_PAYLOAD_STATE		0x0004ll /* u64 filter-state */
+#define PFCACHE_PAYLOAD_FSIZE		0x0008ll /* u64 file size */
+#define PFCACHE_PAYLOAD_PAGES		0x0010ll /* u64 page-cache size */
+#define PFCACHE_PAYLOAD_MASK		0x001Fll /* all known payloads */
+
+/* MAX_HANDLE_SZ */
+#define PFCACHE_FHANDLE_MAX		256
+
+/* see fs/fhandle.c */
+#define PFCACHE_FHANDLE_SIZE(ptr)	(*(__u32*)(ptr) + sizeof(__u32) * 2)
+
+/* all payload fields aligned to 8 bytes boundary */
+#define PFCACHE_PAYLOAD_MAX_SIZE			\
+	(ALIGN(PFCACHE_CSUM_SIZE, sizeof(__u64)) +	\
+	 PFCACHE_FHANDLE_MAX +				\
+	 sizeof(__u64) * 3)
+
+#endif /* LINUX_PFCACHE_H */
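Because struct pfcache_dump_request is extended by appending fields and header_size records how much of it userspace actually filled in, PFCACHE_DUMP_HAS() lets the kernel treat missing fields as absent instead of reading past the header. A hedged sketch of a validation path using it:

	#include <linux/errno.h>
	#include <linux/pfcache.h>

	static int example_check_dump_request(const struct pfcache_dump_request *req)
	{
		__u64 filter = 0;

		if (PFCACHE_DUMP_HAS(req, filter))
			filter = req->filter;	/* field covered by header_size */

		if (filter & ~PFCACHE_FILTER_MASK)
			return -EINVAL;		/* reject unknown filter bits */
		return 0;
	}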
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -61,7 +61,7 @@ struct pid
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct rcu_head rcu;
-	struct upid numbers[1];
+	struct upid numbers[2];
 };
 
 extern struct pid init_struct_pid;
@@ -171,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
 pid_t pid_vnr(struct pid *pid);
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
 
 #define do_each_pid_task(pid, type, task)				\
 	do {								\
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -24,6 +24,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	int last_pid;
+	int pid_max;
 	unsigned int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
@@ -41,6 +42,7 @@ struct pid_namespace {
 	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
+	int hide_pidns;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	unsigned int proc_inum;
 	RH_KABI_EXTEND(struct rcu_head rcu)
@@ -63,6 +65,7 @@ extern struct pid_namespace *copy_pid_ns(unsigned long flags,
 	struct user_namespace *user_ns, struct pid_namespace *ns);
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
+extern int change_active_pid_ns(struct task_struct *, struct pid_namespace *);
 extern void put_pid_ns(struct pid_namespace *ns);
 
 #else /* !CONFIG_PID_NS */
--- /dev/null
+++ b/include/linux/ploop/compat.h
@@ -0,0 +1,33 @@
+/*
+ *  include/linux/ploop/compat.h
+ *
+ *  This file contains macros providing a compatibility layer for 2.6.18,
+ *  where the bio layer was different.
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_COMPAT_H_
+#define _LINUX_PLOOP_COMPAT_H_
+
+#include <linux/version.h>
+
+#define DEFINE_BIO_CB(func) \
+static void func(struct bio *bio, int err) {
+
+#define END_BIO_CB(func)  }
+
+#define BIO_ENDIO(_queue, _bio, _err)					\
+	do {								\
+		trace_block_bio_complete((_queue), (_bio), (_err));	\
+		bio_endio((_bio), (_err));				\
+	} while (0);
+
+#define F_DENTRY(file)	(file)->f_path.dentry
+#define F_MNT(file)	(file)->f_path.mnt
+
+#define KOBJECT_INIT(kobj, ktype) kobject_init(kobj, ktype)
+#define KOBJECT_ADD(kobj, parent, fmt, arg...) kobject_add(kobj, parent, fmt, arg)
+
+#endif
--- /dev/null
+++ b/include/linux/ploop/ploop.h
@@ -0,0 +1,914 @@
+/*
+ *  include/linux/ploop/ploop.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_H_
+#define _LINUX_PLOOP_H_
+
+#include <linux/rbtree.h>
+#include <linux/timer.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include "ploop_if.h"
+#include "compat.h"
+
+#define PLOOP_NAME_SIZE		64
+#define PLOOP_MAX_FORMATS	32
+#define PLOOP_DEVICE_MAJOR	182
+#define PLOOP_DEVICE_RANGE	(1UL << MINORBITS)
+#define PLOOP_PART_SHIFT	4
+#define PLOOP_PART_MAX		(1UL << PLOOP_PART_SHIFT)
+
+/* 1. fastpath_reqs is subtracted because they don't consume preq-s
+ * 2. typically, entry_qlen and bio_qlen are close to zero */
+#define PLOOP_CONGESTED(plo)    (plo->entry_qlen + plo->active_reqs - \
+				 plo->fastpath_reqs + plo->bio_qlen)
+/* 32 bits for virtual block. Enough. */
+typedef u32	cluster_t;
+typedef u32	iblock_t;
+
+struct ploop_request;
+struct ploop_delta;
+
+enum {
+	PLOOP_S_RUNNING,	/* Device is active */
+	PLOOP_S_ATTENTION,	/* Device is processing a barrier, everything
+				 * is queued to be totally serialized */
+	PLOOP_S_WAIT_PROCESS,	/* Main thread is waiting for requests */
+	PLOOP_S_EXITING,	/* Exiting */
+	PLOOP_S_ABORT,		/* Device is aborted due to unrecoverable
+				 * error. Reads are still allowed. */
+	PLOOP_S_SYNC,		/* Unplug was requested */
+	PLOOP_S_CHANGED,	/* Media changed */
+	PLOOP_S_WRITE_CONG,	/* Write direction was congested */
+	PLOOP_S_READ_CONG,	/* Read direction was congested */
+	PLOOP_S_TRACK,		/* Write tracker is ON */
+	PLOOP_S_TRACK_ABORT,	/* Write tracker is aborted */
+	PLOOP_S_ENOSPC_EVENT,	/* ENOSPC event happened but was not
+				 * consumed by userspace yet */
+	PLOOP_S_CONGESTED,	/* Too many bios submitted to us */
+	PLOOP_S_DISCARD,	/* ploop is ready to handle discard request */
+	PLOOP_S_DISCARD_LOADED,	/* A discard request was handled and
+				   free blocks loaded */
+	PLOOP_S_LOCKED,	        /* ploop is locked by userspace
+				   (for minor mgmt only) */
+	PLOOP_S_ONCE,	        /* An event (e.g. printk once) happened */
+	PLOOP_S_PUSH_BACKUP,	/* Push_backup is in progress */
+	PLOOP_S_NULLIFY,	/* Nullifying BAT is in progress */
+};
+
+enum {
+	PLOOP_F_NORMAL,		/* Default: not frozen yet, or already unfrozen */
+	PLOOP_F_FROZEN,		/* Frozen PLOOP_IOC_FREEZE */
+	PLOOP_F_THAWING,	/* thaw_bdev is in progress */
+};
+
+struct ploop_snapdata
+{
+	/* top_delta file reopened read-only. */
+	struct file		*file;
+};
+
+
+
+struct ploop_file
+{
+	struct list_head	list;
+
+	loff_t		vpos;	/* Position of this chunk in virtual map */
+	loff_t		start;	/* Start of data in this file, usually 0 */
+	loff_t		length;	/* Length of data in this file */
+	loff_t		limit;	/* Maximal size of this file. If it is
+				 * exceeded we must switch to the next chunk
+				 */
+	struct file		*file;	/* File */
+	struct address_space	*mapping;
+	struct inode		*inode;
+	struct extent_map_tree	*em_tree;
+	struct block_device	*bdev;
+	int flags; /* file flags */
+};
+
+/* Real functions are hidden deeply. :-)
+ *
+ * This struct describes how we do real IO on particular backing file.
+ */
+
+enum {
+	PLOOP_IO_FSYNC_DELAYED,  /* Must f_op->fsync before FLUSH|FUA */
+};
+
+struct ploop_io
+{
+	struct ploop_device	*plo;
+
+	loff_t		       *size_ptr; /* NULL or points to ploop_mapping */
+	loff_t			prealloced_size;
+	struct ploop_request   *prealloc_preq;  /* preq who does prealloc */
+	loff_t			max_size;	/* Infinity */
+	int			n_chunks;	/* 1. */
+	struct ploop_file	files;		/* Only 1 file is supported */
+
+	iblock_t		alloc_head;
+
+	struct list_head	fsync_queue;
+	struct task_struct	*fsync_thread;
+	int			fsync_qlen;
+	wait_queue_head_t	fsync_waitq;
+	struct timer_list	fsync_timer;
+
+	struct ploop_io_ops	*ops;
+	unsigned long		io_state;
+	u64                     io_count;
+};
+
+struct ploop_io_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	void		(*unplug)(struct ploop_io *);
+	int		(*congested)(struct ploop_io *, int bits);
+
+	/* Allocate new block, return its index in image.
+	 * Data must be initialized to zeros and committed to disk.
+	 *
+	 * This function is slow and it is used only to allocate
+	 * index tables.
+	 */
+	int	(*alloc)(struct ploop_io *, loff_t pos, loff_t len);
+
+	/* These functions must schedule IO from/to disk.
+	 * If it returns 1, this means write is not complete and
+	 * preq is added to some internal queue.
+	 *
+	 * submit() makes IO to already allocated space (preq->iblock)
+	 * and must fail when writing to unallocated area.
+	 *
+	 * submit_alloc() assumes that storage is not allocated and allocates
+	 * new area in image.
+	 */
+	void	(*submit)(struct ploop_io *, struct ploop_request *,
+			  unsigned long rw,
+			  struct bio_list *sbl, iblock_t iblk, unsigned int size);
+	void	(*submit_alloc)(struct ploop_io *, struct ploop_request *,
+				struct bio_list *sbl, unsigned int size);
+	void	(*post_submit)(struct ploop_io *, struct ploop_request *);
+
+	int	(*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len);
+	int	(*fastmap)(struct ploop_io * io, struct bio *orig_bio,
+			   struct bio * bio, sector_t isec);
+
+	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
+			     struct page * page, sector_t sec);
+	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
+			      struct page * page, sector_t sec, unsigned long rw);
+
+
+	int	(*sync_read)(struct ploop_io * io, struct page * page,
+			     unsigned int len, unsigned int off, sector_t sec);
+	int	(*sync_write)(struct ploop_io * io, struct page * page,
+			      unsigned int len, unsigned int off, sector_t sec);
+
+
+	int	(*sync_readvec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+	int	(*sync_writevec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+
+	int	(*init)(struct ploop_io * io);
+	void	(*destroy)(struct ploop_io * io);
+	int	(*open)(struct ploop_io * io);
+	int	(*sync)(struct ploop_io * io);
+	int	(*stop)(struct ploop_io * io);
+	int	(*prepare_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*complete_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*prepare_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*start_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*truncate)(struct ploop_io *, struct file *, __u32 alloc_head);
+	void	(*queue_settings)(struct ploop_io *, struct request_queue *q);
+
+	void	(*issue_flush)(struct ploop_io*, struct ploop_request * preq);
+
+	int	(*dump)(struct ploop_io*);
+
+	loff_t  (*i_size_read)(struct ploop_io*);
+	fmode_t (*f_mode)(struct ploop_io*);
+
+	int     (*autodetect)(struct ploop_io * io);
+};
+
+static inline loff_t generic_i_size_read(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+	BUG_ON(!io->files.inode);
+
+	return i_size_read(io->files.inode);
+}
+static inline fmode_t generic_f_mode(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+
+	return io->files.file->f_mode;
+}
+
+enum {
+	PLOOP_MAP_IDENTICAL,
+	PLOOP_MAP_DEAD,
+};
+
+#define PLOOP_LRU_BUFFER	8
+
+struct ploop_map
+{
+	struct ploop_device	*plo;
+	struct list_head	delta_list;
+
+	struct rb_root		rb_root;
+	unsigned long		flags;
+	unsigned long		last_activity;
+
+	unsigned int		pages;
+	unsigned int		max_index;
+
+	struct map_node		*lru_buffer[PLOOP_LRU_BUFFER];
+	unsigned int		lru_buffer_ptr;
+
+	wait_queue_head_t	destroy_waitq;
+};
+
+#define PLOOP_FMT_CAP_DELTA	1
+#define PLOOP_FMT_CAP_WRITABLE	2
+#define PLOOP_FMT_CAP_IDENTICAL	4
+
+struct ploop_delta_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	unsigned int		capability;
+
+	/* Return location of index page */
+	int		(*map_index)(struct ploop_delta *, unsigned long index,
+				     sector_t *sec);
+	void		(*read_index)(struct ploop_delta *, struct ploop_request * preq,
+				      struct page * page, sector_t sec);
+
+	/* Allocate new block in delta and write request there.
+	 * If request does not cover whole block, this function
+	 * must pad with zeros
+	 */
+	void		(*allocate)(struct ploop_delta *, struct ploop_request *,
+				    struct bio_list *sbl, unsigned int size);
+	void		(*allocate_complete)(struct ploop_delta *, struct ploop_request *);
+
+	int		(*compose)(struct ploop_delta *, int, struct ploop_ctl_chunk *);
+	int		(*open)(struct ploop_delta *);
+	void		(*destroy)(struct ploop_delta *);
+	int		(*start)(struct ploop_delta *);
+	int		(*stop)(struct ploop_delta *);
+	int		(*refresh)(struct ploop_delta *);
+	int		(*sync)(struct ploop_delta *);
+	int		(*prepare_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*complete_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*prepare_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*start_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*truncate)(struct ploop_delta *, struct file *, __u32 alloc_head);
+	int		(*prepare_grow)(struct ploop_delta *, u64 *new_size, int *reloc);
+	int		(*complete_grow)(struct ploop_delta *, u64 new_size);
+};
+
+/* Virtual image. */
+struct ploop_delta
+{
+	struct list_head	list;
+
+	int			level;		/* Level of delta. 0 is base image */
+	unsigned int		cluster_log;	/* In 512=1<<9 byte sectors */
+	unsigned int		flags;
+
+	struct ploop_device	*plo;
+
+	struct ploop_io		io;
+
+	void			*priv;
+
+	struct ploop_delta_ops	*ops;
+
+	struct kobject		kobj;
+
+	u64			max_delta_size; /* in sectors */
+};
+
+struct ploop_tunable
+{
+	int	max_requests;
+	int	batch_entry_qlen;
+	int	batch_entry_delay;
+	int	fsync_max;
+	int	fsync_delay;
+	int	min_map_pages;
+	int	max_map_inactivity;
+	int	congestion_high_watermark;
+	int	congestion_low_watermark;
+	int	max_active_requests;
+	int	push_backup_timeout; /* in seconds */
+	unsigned int pass_flushes : 1, pass_fuas : 1,
+		     congestion_detection : 1,
+		     check_zeros : 1,
+		     disable_root_threshold : 1,
+		     disable_user_threshold : 1;
+};
+
+#define DEFAULT_PLOOP_MAXRQ 256
+#define DEFAULT_PLOOP_BATCH_ENTRY_QLEN 32
+
+#define DEFAULT_PLOOP_TUNE \
+(struct ploop_tunable) { \
+.max_requests = DEFAULT_PLOOP_MAXRQ, \
+.batch_entry_qlen = 32, \
+.batch_entry_delay = HZ/20, \
+.fsync_max = DEFAULT_PLOOP_BATCH_ENTRY_QLEN, \
+.fsync_delay = HZ/10, \
+.min_map_pages = 32, \
+.max_map_inactivity = 10*HZ, \
+.congestion_high_watermark = 3*DEFAULT_PLOOP_MAXRQ/4, \
+.congestion_low_watermark = DEFAULT_PLOOP_MAXRQ/2, \
+.pass_flushes = 1, \
+.pass_fuas = 1, \
+.check_zeros = 1, \
+.max_active_requests = DEFAULT_PLOOP_BATCH_ENTRY_QLEN / 2, \
+.push_backup_timeout = 42, }
+
+struct ploop_stats
+{
+#define __DO(_at)	__u32	_at;
+#include "ploop_stat.h"
+#undef __DO
+};
+
+struct ploop_freeblks_desc;
+struct ploop_pushbackup_desc;
+
+struct ploop_device
+{
+	unsigned long		state;
+	spinlock_t		lock;
+
+	struct list_head	free_list;
+	struct list_head	entry_queue;
+	int			entry_qlen;
+	int			read_sync_reqs;
+	int			free_qlen; /* len of free_list */
+	int			free_qmax; /* max len of free_list */
+	int			blockable_reqs; /* depends on userspace tool */
+	int			blocked_bios; /* depends on userspace tool */
+
+	struct bio		*bio_head;
+	struct bio		*bio_tail;
+	struct bio		*bio_sync;
+	struct bio_list		bio_discard_list;
+	int			bio_discard_qlen;
+	int			bio_qlen;
+	int			bio_total;
+
+	struct rb_root		entry_tree[2];
+
+	struct list_head	ready_queue;
+
+	struct rb_root		lockout_tree;
+	struct rb_root		lockout_pb_tree;
+
+	int			cluster_log;
+	int			fmt_version;
+
+	int			active_reqs;
+	int			fastpath_reqs;
+	int			barrier_reqs;
+
+	struct bio		*cached_bio;
+
+	struct timer_list	mitigation_timer;
+	struct timer_list	freeze_timer;
+
+	wait_queue_head_t	waitq;
+	wait_queue_head_t	req_waitq;
+	wait_queue_head_t	freeze_waitq;
+	wait_queue_head_t	event_waitq;
+
+	struct ploop_map	map;
+	struct ploop_map	*trans_map;
+
+	struct ploop_tunable	tune;
+
+	int			index;
+	struct mutex		ctl_mutex;
+	atomic_t		open_count;
+	u64			bd_size;
+	struct gendisk		*disk;
+	struct block_device	*bdev;
+	struct request_queue	*queue;
+	struct task_struct	*thread;
+	struct block_device	*frozen_bdev;
+	int			freeze_state;
+	struct rb_node		link;
+
+	/* someone who wants to quiesce the state-machine waits
+	 * here for a signal from the state-machine saying that
+	 * processing has come to a PLOOP_REQ_BARRIER request */
+	struct completion	*quiesce_comp;
+
+	/* state-machine in 'quiesce' state waits here till
+	 * someone calls ploop_relax() */
+	struct completion	relax_comp;
+
+	/* someone who calls ploop_relax() waits here to know
+	 * that 'relax' really happened and state-machine is
+	 * ready for next ploop_quiesce(). This is important
+	 * because someone might call ploop_quiesce() immediately
+	 * after ploop_relax() succeeded */
+	struct completion	relaxed_comp;
+
+	spinlock_t		track_lock;
+	struct rb_root		track_tree;
+	sector_t		track_end;
+	u32			track_cluster;
+	u32			track_ptr;
+
+	u32			merge_ptr;
+
+	atomic_t		maintenance_cnt;
+	struct completion	maintenance_comp;
+	int			maintenance_type;
+
+	u32			grow_start;
+	u32			grow_end;
+	u32			grow_relocated;
+	u64			grow_new_size;
+
+	spinlock_t		dummy_lock;
+	struct mutex		sysfs_mutex;
+	struct kobject		kobj;
+	struct kobject		*pstat_dir;
+	struct kobject		*pstate_dir;
+	struct kobject		*ptune_dir;
+
+	struct ploop_stats	st;
+	char                    cookie[PLOOP_COOKIE_SIZE];
+
+	struct ploop_freeblks_desc *fbd;
+	struct ploop_pushbackup_desc *pbd;
+	struct block_device *dm_crypt_bdev;
+
+	unsigned long		locking_state; /* plo locked by userspace */
+};
+
+enum
+{
+	PLOOP_REQ_LOCKOUT,	/* This preq is locking overlapping requests */
+	PLOOP_REQ_PB_LOCKOUT,	/* This preq is locking overlapping WRITEs */
+	PLOOP_REQ_SYNC,
+	PLOOP_REQ_BARRIER,
+	PLOOP_REQ_UNSTABLE,
+	PLOOP_REQ_TRACK,
+	PLOOP_REQ_SORTED,
+	PLOOP_REQ_TRANS,
+	PLOOP_REQ_MERGE,
+	PLOOP_REQ_RELOC_A,	/* 'A' stands for allocate() */
+	PLOOP_REQ_RELOC_S,	/* 'S' stands for submit() */
+	PLOOP_REQ_RELOC_N,	/* 'N' stands for "nullify" */
+	PLOOP_REQ_ZERO,
+	PLOOP_REQ_DISCARD,
+	PLOOP_REQ_RSYNC,
+	PLOOP_REQ_KAIO_FSYNC,	/* force image fsync by KAIO module */
+	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
+	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
+	PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
+	PLOOP_REQ_ISSUE_FLUSH, /* preq needs ->issue_flush before completing */
+	PLOOP_REQ_BLOCKABLE,  /* preq was accounted in plo->blockable_reqs */
+};
+
+#define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
+#define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
+#define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_RELOC_N_FL (1 << PLOOP_REQ_RELOC_N)
+#define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
+#define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
+
+enum
+{
+	PLOOP_E_ENTRY,		/* Not yet processed */
+	PLOOP_E_COMPLETE,	/* Complete. Maybe, with an error */
+	PLOOP_E_RELOC_COMPLETE,	/* Reloc complete. Maybe, with an error */
+	PLOOP_E_INDEX_READ,	/* Reading an index page */
+	PLOOP_E_TRANS_INDEX_READ,/* Reading a trans index page */
+	PLOOP_E_DELTA_READ,	/* Write request reads data from previous delta */
+	PLOOP_E_DELTA_COPIED,	/* Data from previous delta was bcopy'ed */
+	PLOOP_E_TRANS_DELTA_READ,/* Write request reads data from trans delta */
+	PLOOP_E_RELOC_DATA_READ,/* Read user data to relocate */
+	PLOOP_E_RELOC_NULLIFY,  /* Zeroing given iblock is in progress */
+	PLOOP_E_INDEX_DELAY,	/* Index update is blocked by already queued
+				 * index update.
+				 */
+	PLOOP_E_INDEX_WB,	/* Index writeback is in progress */
+	PLOOP_E_DATA_WBI,	/* Data writeback is in progress and index
+				 * is not updated.
+				 */
+	PLOOP_E_ZERO_INDEX,	/* Zeroing index of free block; original request
+				   can use .submit on completion */
+	PLOOP_E_DELTA_ZERO_INDEX,/* the same but for PLOOP_E_DELTA_READ */
+	PLOOP_E_FSYNC_PENDED,   /* INDEX_WB needs io->ops->sync() to proceed */
+};
+
+#define BIO_BDEV_REUSED	14	/* io_context is stored in bi_bdev */
+
+struct ploop_request
+{
+	struct list_head	list;	/* List link.
+					 * Req can be on
+					 * - free list
+					 * - entry queue
+					 * - ready queue
+					 * - delay_list of another request
+					 * nowhere
+					 */
+
+	struct ploop_device	*plo;
+
+	cluster_t		req_cluster;
+	sector_t		req_sector;
+	unsigned int		req_size;
+	unsigned int		req_rw;
+	unsigned int		req_index_update_rw;
+	unsigned long		tstamp;
+	struct io_context	*ioc;
+
+	struct bio_list		bl;
+
+	struct bio		*aux_bio;
+
+	atomic_t		io_count;
+
+	unsigned long		state;
+	unsigned long		eng_state;
+	int			error;
+
+	struct map_node		*map;
+	struct map_node		*trans_map;
+
+	iblock_t		iblock;
+
+	/* relocation info */
+	union {
+		struct {
+			iblock_t      src_iblock;
+			iblock_t      dst_iblock;
+		};
+		unsigned long	      ppb_state;
+	};
+	cluster_t		dst_cluster;
+	struct rb_node		reloc_link;
+
+	/* State specific information */
+	union {
+		/* E_INDEX_READ */
+		struct {
+			struct page	* tpage;
+			int		level;
+		} ri;
+
+		/* E_INDEX_WB */
+		struct {
+			struct page	* tpage;
+		} wi;
+	} sinfo;
+
+	u64			verf;
+
+	/* List of requests blocked until completion of this request. */
+	struct list_head	delay_list;
+
+	/* Link to tree of "blocking requests". A blocking request is a
+	 * request which triggers a change in image format that does not
+	 * allow further requests to the same area to proceed. E.g. when
+	 * we do not have a mapping in the delta and the request requires
+	 * a copy of a data block from the previous delta, this request
+	 * locks all subsequent requests to the same virtual block until
+	 * we allocate and initialize the block in the delta.
+	 */
+	struct rb_node		lockout_link;
+	struct rb_node		lockout_pb_link;
+
+	u32			track_cluster;
+
+	/* # bytes in tail of image file to prealloc on behalf of this preq */
+	loff_t			prealloc_size;
+
+	/* if the engine starts an operation on a particular io, let's finish
+	 * the operation on the same io (see io.ops->post_submit) */
+	struct ploop_io	       *eng_io;
+};
+
+static inline struct ploop_delta * ploop_top_delta(struct ploop_device * plo)
+{
+	return list_empty(&plo->map.delta_list) ? NULL :
+		list_first_entry(&plo->map.delta_list,
+				 struct ploop_delta, list);
+}
+
+static inline struct ploop_delta * map_top_delta(struct ploop_map * map)
+{
+	return list_first_entry(&map->delta_list, struct ploop_delta, list);
+}
+
+void ploop_complete_io_state(struct ploop_request * preq);
+void ploop_fail_request(struct ploop_request * preq, int err);
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list);
+
+
+static inline int ploop_req_delay_fua_possible(struct ploop_request *preq)
+{
+	return preq->eng_state == PLOOP_E_DATA_WBI;
+}
+
+static inline void ploop_set_dm_crypt_bdev(struct block_device *ploop_bdev,
+				struct block_device *bdev)
+{
+	if (MAJOR(ploop_bdev->bd_dev) == PLOOP_DEVICE_MAJOR) {
+		struct ploop_device *plo = ploop_bdev->bd_disk->private_data;
+		mutex_lock(&plo->ctl_mutex);
+		plo->dm_crypt_bdev = bdev;
+		mutex_unlock(&plo->ctl_mutex);
+	}
+}
+
+static inline struct block_device *__ploop_get_dm_crypt_bdev(
+	struct ploop_device *plo)
+{
+	if (plo->dm_crypt_bdev)
+		bdgrab(plo->dm_crypt_bdev);
+
+	return plo->dm_crypt_bdev;
+}
+
+static inline struct block_device *ploop_get_dm_crypt_bdev(
+				struct ploop_device *plo)
+{
+	struct block_device *ret;
+
+	mutex_lock(&plo->ctl_mutex);
+	ret = __ploop_get_dm_crypt_bdev(plo);
+	mutex_unlock(&plo->ctl_mutex);
+	return ret;
+}
+
+static inline void ploop_req_set_error(struct ploop_request * preq, int err)
+{
+	if (!preq->error) {
+		preq->error = err;
+		if (!test_bit(PLOOP_S_ABORT, &preq->plo->state)) {
+			if (err != -ENOSPC) {
+				printk("ploop_set_error=%d on ploop%d\n",
+				       err, preq->plo->index);
+				return;
+			}
+			printk("No space left on device! Either free some "
+			       "space on disk or abort ploop%d manually.\n",
+				preq->plo->index);
+		}
+	}
+}
+
+#define PLOOP_TRACE_ERROR 1
+#define PLOOP_TRACE_ERROR_DUMP_STACK_ON 1
+
+#if PLOOP_TRACE_ERROR_DUMP_STACK_ON
+#define PLOOP_TRACE_ERROR_DUMP_STACK()	dump_stack();
+#else
+#define PLOOP_TRACE_ERROR_DUMP_STACK()
+#endif
+
+#if PLOOP_TRACE_ERROR
+#define PLOOP_REQ_TRACE_ERROR(preq, err)					\
+	do {									\
+		if ((err)) {							\
+			printk("%s() %d ploop%d set error %d\n",		\
+			__FUNCTION__, __LINE__, (preq)->plo->index, (int)(err));\
+			PLOOP_TRACE_ERROR_DUMP_STACK();				\
+		}								\
+	} while (0);
+#else
+#define PLOOP_REQ_TRACE_ERROR(preq, err)
+#endif
+
+#define PLOOP_REQ_SET_ERROR(preq, err)			\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_req_set_error(preq, err);		\
+	} while (0);
+
+#define PLOOP_FAIL_REQUEST(preq, err)			\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_fail_request(preq, err);		\
+	} while (0);
+
+static inline void ploop_prepare_io_request(struct ploop_request * preq)
+{
+	atomic_set(&preq->io_count, 1);
+}
+
+static inline void ploop_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		ploop_complete_io_state(preq);
+}
+
+static inline void ploop_prepare_tracker(struct ploop_request * preq,
+					 sector_t sec)
+{
+	if (unlikely(test_bit(PLOOP_S_TRACK, &preq->plo->state))) {
+		BUG_ON(test_bit(PLOOP_REQ_TRACK, &preq->state));
+		set_bit(PLOOP_REQ_TRACK, &preq->state);
+		preq->track_cluster = sec >> preq->plo->cluster_log;
+	}
+}
+
+void ploop_tracker_notify(struct ploop_device *, sector_t sec);
+
+static inline void ploop_acc_ff_in_locked(struct ploop_device *plo,
+					  unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_in++;
+	if (unlikely(rw & REQ_FUA))
+		plo->st.bio_fua_in++;
+}
+static inline void ploop_acc_ff_in(struct ploop_device *plo,
+				   unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & REQ_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_ff_out_locked(struct ploop_device *plo,
+					   unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_out++;
+	if (unlikely(rw & REQ_FUA))
+		plo->st.bio_fua_out++;
+}
+static inline void ploop_acc_ff_out(struct ploop_device *plo,
+				    unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & REQ_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_flush_skip_locked(struct ploop_device *plo,
+					       unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_skip++;
+}
+
+static inline void ploop_entry_add(struct ploop_device * plo, struct ploop_request * preq)
+{
+	list_add_tail(&preq->list, &plo->entry_queue);
+	plo->entry_qlen++;
+	if (test_bit(PLOOP_REQ_SYNC, &preq->state) && (!(preq->req_rw & WRITE) || (preq->req_rw & (REQ_FLUSH|REQ_FUA)))) {
+		__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		plo->read_sync_reqs++;
+	}
+}
+
+static inline void ploop_entry_qlen_dec(struct ploop_request * preq)
+{
+	preq->plo->entry_qlen--;
+	if (test_bit(PLOOP_REQ_RSYNC, &preq->state)) {
+		__clear_bit(PLOOP_REQ_RSYNC, &preq->state);
+		preq->plo->read_sync_reqs--;
+	}
+}
+
+static inline int ploop_map_log(struct ploop_device *plo)
+{
+	switch (plo->fmt_version) {
+	case PLOOP_FMT_V1:
+		return plo->cluster_log;
+	case PLOOP_FMT_V2:
+		return 0;
+	default:
+		BUG();
+	}
+
+	return -1;
+}
+
+struct map_node;
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result);
+void ploop_update_map(struct ploop_map * map, int level, cluster_t block, iblock_t iblk);
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size);
+void map_release(struct map_node * m);
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_check_map(struct ploop_map * map, struct ploop_request * preq);
+cluster_t map_get_mn_end(struct map_node *m);
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int map_index_fault(struct ploop_request * preq);
+void map_read_complete(struct ploop_request * preq);
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec);
+struct ploop_delta * map_writable_delta(struct ploop_request * preq);
+void map_init(struct ploop_device *, struct ploop_map * map);
+void ploop_map_start(struct ploop_map * map, u64 bd_size);
+void ploop_map_destroy(struct ploop_map * map);
+void ploop_map_remove_delta(struct ploop_map * map, int level);
+void ploop_index_wb_proceed(struct ploop_request * preq);
+void ploop_index_update(struct ploop_request * preq);
+void ploop_index_wb_complete(struct ploop_request * preq);
+int __init ploop_map_init(void);
+void ploop_map_exit(void);
+void ploop_add_req_to_fsync_queue(struct ploop_request * preq);
+
+
+void ploop_quiesce(struct ploop_device * plo);
+void ploop_relax(struct ploop_device * plo);
+
+void track_init(struct ploop_device * plo);
+int ploop_tracker_destroy(struct ploop_device *plo, int force);
+int ploop_tracker_stop(struct ploop_device * plo, int force);
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg);
+
+
+int ploop_add_lockout(struct ploop_request *preq, int try);
+void del_lockout(struct ploop_request *preq);
+
+int ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc);
+int ploop_io_open(struct ploop_io *);
+void ploop_io_destroy(struct ploop_io * io);
+void ploop_io_report_fn(struct file * file, char * msg);
+
+int ploop_register_format(struct ploop_delta_ops * ops);
+int ploop_register_io(struct ploop_io_ops * ops);
+void ploop_unregister_format(struct ploop_delta_ops * ops);
+void ploop_unregister_io(struct ploop_io_ops * ops);
+void ploop_format_put(struct ploop_delta_ops * ops);
+
+extern struct kobj_type ploop_delta_ktype;
+void ploop_sysfs_init(struct ploop_device * plo);
+void ploop_sysfs_uninit(struct ploop_device * plo);
+
+void ploop_queue_zero_request(struct ploop_device *plo, struct ploop_request *orig_preq, cluster_t clu);
+
+int ploop_maintenance_wait(struct ploop_device * plo);
+
+extern int max_map_pages;
+
+extern void ploop_msg_once(struct ploop_device *plo, const char *, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+/* Define PLOOP_TRACE to get full trace of ploop state machine.
+ */
+#undef PLOOP_TRACE
+
+
+#ifdef PLOOP_TRACE
+#define __TRACE(a...)  do { printk(a); } while (0)
+#else
+#define __TRACE(a...)  do { } while (0)
+#endif
+
+#endif /* _LINUX_PLOOP_H_ */
--- /dev/null
+++ b/include/linux/ploop/ploop_if.h
@@ -0,0 +1,387 @@
+/*
+ *  include/linux/ploop/ploop_if.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __PLOOP_IF_H__
+#define __PLOOP_IF_H__ 1
+
+#include <linux/ioctl.h>
+
+/* This interface mixes data relevant to the delta layer and the io layer
+ * into one request. It is too simplistic.
+ *
+ * But it allows creating the whole delta atomically and does not
+ * require maintaining incomplete composition state inside the device.
+ */
+
+/* Formats of deltas. */
+
+#define PLOOP_FMT_RAW		1
+#define PLOOP_FMT_PLOOP1	2
+
+/* PLOOP_FMT_PLOOP1 subversions */
+enum {
+	PLOOP_FMT_UNDEFINED = 0,
+	PLOOP_FMT_V1,
+	PLOOP_FMT_V2,
+};
+
+/* Delta flags. */
+#define PLOOP_FMT_RDONLY	1
+#define PLOOP_FMT_FLAGS		1
+
+#define PLOOP_FLAG_FS_SYNC	0x10000000
+
+#define PLOOP_FMT_PREALLOCATED	2
+
+#define PLOOP_FLAG_COOKIE	4
+#define PLOOP_COOKIE_SIZE	64
+
+#define PLOOP_FLAG_CLUBLKS	8
+
+/* IO types. */
+
+#define PLOOP_IO_AUTO		0
+#define PLOOP_IO_DIRECT		1
+#define PLOOP_IO_NFS		2
+#define PLOOP_IO_RESERVED	3	/* reserved, do not use */
+#define PLOOP_IO_KAIO		4
+
+/*
+ * # slots to skip in the very first page of L2 table
+ * (they are reserved for format-specific header)
+ * Assumptions:
+ * 1) sizeof(map_index_t) == sizeof(u32)
+ * 2) PLOOP_MAP_OFFSET == sizeof(struct ploop_pvd_header) / sizeof(u32)
+ */
+#define PLOOP_MAP_OFFSET	16
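As a sanity check on the constant above, the reserved-slot count is just the
header size divided by the entry size. A minimal sketch, assuming a 64-byte
struct ploop_pvd_header, 4-byte map entries and 4096-byte pages (none of these
sizes are defined in this header itself):

	/* 64 / 4 == 16 == PLOOP_MAP_OFFSET */
	static inline unsigned int l2_slots_in_first_page(unsigned int page_size)
	{
		/* e.g. 4096 / 4 - 16 = 1008 usable slots in the first L2 page */
		return page_size / 4 - PLOOP_MAP_OFFSET;
	}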
+
+/*
+ * The in-kernel ploop implementation assumes that L2[index] can never be
+ * equal to this value (this is guaranteed by the limitation on bdsize).
+ * So, in-kernel ploop may encode L2[index] == 0 by this value and keep
+ * the zero value as a special one meaning "iblock is not allocated yet
+ * for the given index". User-space may use this value to denote
+ * uninitialized slots of the L2[] table.
+ */
+#define PLOOP_ZERO_INDEX	0xFFFFFFFFU
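One possible reading of the comment above, as a user-space helper scanning an
L2 table (the helper name is made up for illustration and is not part of the
ABI):

	/* Hypothetical helper: classify a single L2 slot. */
	static const char *l2_slot_state(__u32 idx)
	{
		if (idx == 0)
			return "not allocated";		/* no iblock assigned yet */
		if (idx == PLOOP_ZERO_INDEX)
			return "uninitialized slot";	/* user-space marker */
		return "mapped";			/* idx is the image block */
	}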
+
+struct ploop_ctl_chunk
+{
+	__s32	pctl_fd;	/* FD of backing file */
+	__u32	pctl_type;	/* IO engine */
+	__u32	pctl_flags;	/* Some modifiers, undefined now */
+	__u32	pctl_offset;	/* Starting cluster of this chunk in image */
+
+	__u64	pctl_start;	/* Position of data in file.  */
+	__u64	pctl_len;	/* Length of data area in file. */
+} __attribute__ ((aligned (8)));
+
+struct ploop_ctl
+{
+	/* Description of delta format */
+	__u32	pctl_format;
+	__u32	pctl_flags;
+	__u32	pctl_cluster_log;
+	__u32	pctl_size;
+
+	/* Description of backing files. */
+	__u16	pctl_chunks;
+	__u8	pctl_level;
+	__u8	__mbz1;
+	__u32	__mbz2;
+	struct ploop_ctl_chunk chunks[0];
+} __attribute__ ((aligned (8)));
+
+/* helper for ADD_DELTA */
+struct ploop_ctl_delta {
+	struct ploop_ctl c;
+	struct ploop_ctl_chunk f;
+};
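For the common single-chunk case this helper is all that is needed to compose
a delta from user space. A hedged sketch of issuing PLOOP_IOC_ADD_DELTA
(defined further down in this header); the device fd, error handling and the
concrete cluster_log/size/flags values are illustrative assumptions:

	#include <string.h>
	#include <sys/ioctl.h>

	static int add_raw_delta(int plo_fd, int image_fd, __u32 size_in_sectors)
	{
		struct ploop_ctl_delta d;

		memset(&d, 0, sizeof(d));
		d.c.pctl_format      = PLOOP_FMT_RAW;
		d.c.pctl_flags       = PLOOP_FMT_RDONLY;	/* assumed: read-only base delta */
		d.c.pctl_cluster_log = 11;			/* assumed: 2048 sectors per cluster */
		d.c.pctl_size        = size_in_sectors;
		d.c.pctl_chunks      = 1;
		d.f.pctl_fd          = image_fd;		/* backing file */
		d.f.pctl_type        = PLOOP_IO_DIRECT;

		return ioctl(plo_fd, PLOOP_IOC_ADD_DELTA, &d);
	}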
+
+struct ploop_truncate_ctl
+{
+	int	fd;
+	__u32	alloc_head;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+} __attribute__ ((aligned (8)));
+
+
+/*
+ * Before relocation l2[req_cluster] == old_iblk.
+ * Then user-space decides to relocate old_iblk to new_iblk.
+ * After relocation is done, we need the kernel's help to update the
+ * map_node structure for req_cluster (if present). When the kernel
+ * has accomplished this, user-space may safely nullify old_iblk.
+ */
+struct reloc_map
+{
+	__u32 req_cluster;
+	__u32 iblk;
+} __attribute__ ((aligned (8)));
+
+struct ploop_index_update_ctl
+{
+	__u32	n_maps;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct reloc_map rmap[0];
+} __attribute__ ((aligned (8)));
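After user-space has copied old_iblk to new_iblk it hands the new mapping back
through PLOOP_IOC_UPDATE_INDEX (defined below). A minimal sketch for a single
reloc_map entry; the allocation pattern around the flexible array is an
assumption about how the variable-length ioctl argument is built:

	#include <stdlib.h>
	#include <sys/ioctl.h>

	static int update_one_index(int plo_fd, __u8 top_level,
				    __u32 req_cluster, __u32 new_iblk)
	{
		size_t sz = sizeof(struct ploop_index_update_ctl) +
			    sizeof(struct reloc_map);
		struct ploop_index_update_ctl *ctl = calloc(1, sz);
		int err;

		if (!ctl)
			return -1;

		ctl->n_maps = 1;
		ctl->level  = top_level;
		ctl->rmap[0].req_cluster = req_cluster;
		ctl->rmap[0].iblk        = new_iblk;

		err = ioctl(plo_fd, PLOOP_IOC_UPDATE_INDEX, ctl);
		free(ctl);
		return err;
	}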
+
+/*
+ * User-space found out that some blocks are not used and reports
+ * the list of them to the kernel. From then on, the kernel will
+ * use them as free blocks instead of the alloc_head++ technique.
+ */
+struct ploop_freeblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+
+} __attribute__ ((aligned (8)));
+
+struct ploop_freeblks_ctl
+{
+	__u32	n_extents;
+	__u32	alloc_head; /* out */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_freeblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
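Loading the free-block list follows the same variable-length pattern; a sketch
of reporting one extent via PLOOP_IOC_FREEBLKS (defined below), with the
returned alloc_head available for sanity checking:

	static int load_free_extent(int plo_fd, __u8 top_level,
				    __u32 clu, __u32 iblk, __u32 len)
	{
		size_t sz = sizeof(struct ploop_freeblks_ctl) +
			    sizeof(struct ploop_freeblks_ctl_extent);
		struct ploop_freeblks_ctl *ctl = calloc(1, sz);	/* <stdlib.h> */
		int err;

		if (!ctl)
			return -1;

		ctl->n_extents = 1;
		ctl->level     = top_level;
		ctl->extents[0].clu  = clu;
		ctl->extents[0].iblk = iblk;
		ctl->extents[0].len  = len;

		err = ioctl(plo_fd, PLOOP_IOC_FREEBLKS, ctl);	/* <sys/ioctl.h> */
		/* on success ctl->alloc_head holds the current allocation head */
		free(ctl);
		return err;
	}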
+
+struct ploop_relocblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+	__u32 free; /* this extent is also present in freemap */
+} __attribute__ ((aligned (8)));
+
+struct ploop_relocblks_ctl
+{
+	__u32	n_extents;
+	__u32	n_scanned;  /* # bytes scanned */
+	__u32	alloc_head; /* in, for sanity check */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_relocblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_balloon_ctl
+{
+	__u32	mntn_type;     /* see enum above */
+	__u32	alloc_head;    /* frozen alloc_head */
+	__u8	level;	       /* top-level of ploop device */
+	__u8	inflate;       /* inflate/truncate flag */
+	__u8	keep_intact;   /* keep mntn state intact */
+	__u8	__mbz;
+} __attribute__ ((aligned (8)));
+
+struct ploop_getdevice_ctl
+{
+	__u32	minor;
+	__u32	__mbz1;
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_init_ctl
+{
+	__u8    cbt_uuid[16];
+	__u64	cbt_mask_addr; /* page-aligned space for CBT mask */
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_ctl_extent
+{
+	__u32 clu;
+	__u32 len;
+} __attribute__ ((aligned (8)));
+
+/* ploop_push_backup_io_ctl.direction */
+enum {
+	PLOOP_READ = 0, /* wait for requests */
+	PLOOP_WRITE,    /* ACK requests */
+	PLOOP_PEEK,     /* peek at what is to be backed up */
+};
+
+struct ploop_push_backup_io_ctl
+{
+	__u8    cbt_uuid[16];
+	__u32	direction;
+	__u32	n_extents;
+	struct ploop_push_backup_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_stop_ctl
+{
+	__u8    cbt_uuid[16];
+	__u32	status; /* for sanity: non-zero if pending or active queue is not empty */
+} __attribute__ ((aligned (8)));
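The three directions above suggest a simple user-space pump for out-of-order
push-backup requests. A hedged sketch, one extent per round; whether n_extents
is in/out on PLOOP_READ and how completion is reported are assumptions, and
real code would batch extents:

	#include <string.h>
	#include <sys/ioctl.h>

	static int push_backup_round(int plo_fd, const __u8 uuid[16])
	{
		struct {
			struct ploop_push_backup_io_ctl ctl;
			struct ploop_push_backup_ctl_extent ext[1];
		} io;
		int err;

		memset(&io, 0, sizeof(io));
		memcpy(io.ctl.cbt_uuid, uuid, 16);
		io.ctl.direction = PLOOP_READ;		/* wait for requests */
		io.ctl.n_extents = 1;
		err = ioctl(plo_fd, PLOOP_IOC_PUSH_BACKUP_IO, &io);
		if (err)
			return err;

		/* ... back up clusters [io.ext[0].clu, io.ext[0].clu + io.ext[0].len) ... */

		io.ctl.direction = PLOOP_WRITE;		/* ACK what was copied */
		return ioctl(plo_fd, PLOOP_IOC_PUSH_BACKUP_IO, &io);
	}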
+
+/* maintenance types */
+enum {
+	PLOOP_MNTN_OFF = 0,  /* no maintenance is in progress */
+	PLOOP_MNTN_BALLOON,  /* user-space started ballooning */
+	PLOOP_MNTN_FBLOADED, /* list of free-blocks loaded */
+	PLOOP_MNTN_SNAPSHOT, /* bdev is frozen due to snapshot */
+
+	PLOOP_MNTN_TRACK,    /* tracking is in progress */
+	PLOOP_MNTN_DISCARD,  /* ready to handle discard requests */
+
+	PLOOP_MNTN_NOFAST = 256,
+	/* all types below require fast-path disabled! */
+
+	PLOOP_MNTN_MERGE,    /* merge is in progress */
+	PLOOP_MNTN_GROW,     /* grow is in progress */
+	PLOOP_MNTN_RELOC,    /* relocation is in progress */
+	PLOOP_MNTN_PUSH_BACKUP, /* push backup is in progress */
+};
+
+/*
+ * This define should be in sync with enum above.
+ * NB: PLOOP_MNTN_TRACK is handled separately because
+ * READ-requests may go fast-path even while tracking.
+ */
+#define FAST_PATH_DISABLED(t) (t > PLOOP_MNTN_NOFAST)
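PLOOP_IOC_BALLOON (defined below) doubles as the query for the current
maintenance state, so a management tool can gate fast-path-sensitive
operations on it. A sketch; using keep_intact purely as an "inquire only"
flag is an assumption:

	#include <string.h>
	#include <sys/ioctl.h>

	static int ploop_in_nofast_maintenance(int plo_fd)
	{
		struct ploop_balloon_ctl b;

		memset(&b, 0, sizeof(b));
		b.keep_intact = 1;	/* assumed: inquire without changing state */

		if (ioctl(plo_fd, PLOOP_IOC_BALLOON, &b))
			return -1;

		return FAST_PATH_DISABLED(b.mntn_type);
	}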
+
+#define PLOOPCTLTYPE	'P'
+
+/* Add delta. Device must be offline */
+#define PLOOP_IOC_ADD_DELTA	_IOW(PLOOPCTLTYPE, 0, struct ploop_ctl)
+
+/* Close images, free all data, return the device to initial state  */
+#define PLOOP_IOC_CLEAR		_IO(PLOOPCTLTYPE, 1)
+
+/* Stop/start device. */
+#define PLOOP_IOC_STOP		_IO(PLOOPCTLTYPE, 2)
+#define PLOOP_IOC_START		_IO(PLOOPCTLTYPE, 3)
+
+/* Make new snapshot on running device */
+#define PLOOP_IOC_SNAPSHOT	_IOW(PLOOPCTLTYPE, 4, struct ploop_ctl)
+
+/* Remove delta. Argument is delta level. */
+#define PLOOP_IOC_DEL_DELTA	_IOW(PLOOPCTLTYPE, 5, __u32)
+
+struct ploop_track_extent
+{
+	__u64	start;
+	__u64	end;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+};
+
+/* Start tracking of top delta image. */
+#define PLOOP_IOC_TRACK_INIT	_IOR(PLOOPCTLTYPE, 6, struct ploop_track_extent)
+
+/* Stop tracking of the top delta image. It is the responsibility of the
+ * caller to quiesce the device before stopping tracking. The ioctl
+ * will fail if tracking was aborted or if not all dirty bits have been read.
+ */
+#define PLOOP_IOC_TRACK_STOP	_IO(PLOOPCTLTYPE, 7)
+
+/* Abort tracker, clear the state */
+#define PLOOP_IOC_TRACK_ABORT	_IO(PLOOPCTLTYPE, 8)
+
+/* User -> ploop : transferred up to this position */
+#define PLOOP_IOC_TRACK_SETPOS	_IOW(PLOOPCTLTYPE, 9, __u64)
+
+/* ploop -> user: get modified bits */
+#define PLOOP_IOC_TRACK_READ	_IOR(PLOOPCTLTYPE, 10, struct ploop_track_extent)
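Taken together, the tracking ioctls form the copy loop used while migrating a
live image. A hedged sketch of the consumer side; error handling and the
actual sector copy are elided, and the assumption that "no more dirty bits"
is reported as an ioctl failure may not match the real ABI:

	static int drain_dirty_extents(int plo_fd)
	{
		struct ploop_track_extent e;

		if (ioctl(plo_fd, PLOOP_IOC_TRACK_INIT, &e))
			return -1;

		for (;;) {
			if (ioctl(plo_fd, PLOOP_IOC_TRACK_READ, &e))
				break;		/* no more dirty bits (assumed) */
			/* ... re-copy sectors [e.start, e.end) to the target ... */
			ioctl(plo_fd, PLOOP_IOC_TRACK_SETPOS, &e.end);
		}

		return ioctl(plo_fd, PLOOP_IOC_TRACK_STOP, 0);
	}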
+
+/* sync cacheable state of deltas to disk */
+#define PLOOP_IOC_SYNC		_IO(PLOOPCTLTYPE, 11)
+
+/* Merge top delta to lower one and delete it. */
+#define PLOOP_IOC_MERGE		_IO(PLOOPCTLTYPE, 12)
+
+/* Replace alive delta with equivalent one. */
+#define PLOOP_IOC_REPLACE_DELTA	_IOW(PLOOPCTLTYPE, 13, struct ploop_ctl)
+
+/* Truncate a delta image file. */
+#define PLOOP_IOC_TRUNCATE	_IOW(PLOOPCTLTYPE, 14, struct ploop_truncate_ctl)
+
+/* Update in-core copy of L2 table */
+#define PLOOP_IOC_UPDATE_INDEX  _IOW(PLOOPCTLTYPE, 16, struct ploop_index_update_ctl)
+
+/* Increase size of block device */
+#define PLOOP_IOC_GROW		_IOW(PLOOPCTLTYPE, 17, struct ploop_ctl)
+
+/* Inquire current state of free block extents */
+#define PLOOP_IOC_FBGET		_IOW(PLOOPCTLTYPE, 18, struct ploop_freeblks_ctl)
+
+/* Start ballooning, inquire maintenance_type, or flush stale BALLOON state */
+#define PLOOP_IOC_BALLOON	_IOW(PLOOPCTLTYPE, 19, struct ploop_balloon_ctl)
+
+/* Load free blocks to ploop */
+#define PLOOP_IOC_FREEBLKS      _IOW(PLOOPCTLTYPE, 20, struct ploop_freeblks_ctl)
+
+/* Load blocks to relocate and initiate relocation process */
+#define PLOOP_IOC_RELOCBLKS     _IOW(PLOOPCTLTYPE, 21, struct ploop_relocblks_ctl)
+
+/* Search ploop_device global tree for first unused minor number */
+#define PLOOP_IOC_GETDEVICE    _IOW(PLOOPCTLTYPE, 22, struct ploop_getdevice_ctl)
+
+/* Start handling discard requests */
+#define PLOOP_IOC_DISCARD_INIT _IO(PLOOPCTLTYPE, 23)
+/* Stop handling discard requests */
+#define PLOOP_IOC_DISCARD_FINI _IO(PLOOPCTLTYPE, 24)
+/* Wait for a discard request */
+#define PLOOP_IOC_DISCARD_WAIT _IO(PLOOPCTLTYPE, 25)
+
+/* Drop current state of free block extents */
+#define PLOOP_IOC_FBDROP	_IO(PLOOPCTLTYPE, 26)
+
+/* Filter extents with sizes less than arg */
+#define PLOOP_IOC_FBFILTER	_IOR(PLOOPCTLTYPE, 27, unsigned long)
+
+/* Set maximum size for the top delta. */
+#define PLOOP_IOC_MAX_DELTA_SIZE _IOW(PLOOPCTLTYPE, 28, __u64)
+
+/* Start push backup */
+#define PLOOP_IOC_PUSH_BACKUP_INIT _IOR(PLOOPCTLTYPE, 29, struct ploop_push_backup_init_ctl)
+
+/* Wait for push backup out-of-order requests; or ACK them */
+#define PLOOP_IOC_PUSH_BACKUP_IO _IOR(PLOOPCTLTYPE, 30, struct ploop_push_backup_io_ctl)
+
+/* Stop push backup */
+#define PLOOP_IOC_PUSH_BACKUP_STOP _IOR(PLOOPCTLTYPE, 31, struct ploop_push_backup_stop_ctl)
+
+/* Freeze FS mounted over ploop */
+#define PLOOP_IOC_FREEZE	_IO(PLOOPCTLTYPE, 32)
+
+/* Unfreeze FS mounted over ploop */
+#define PLOOP_IOC_THAW		_IO(PLOOPCTLTYPE, 33)
+
+/* Events exposed via /sys/block/ploopN/pstate/event */
+#define PLOOP_EVENT_ABORTED	1
+#define PLOOP_EVENT_STOPPED	2
+#define PLOOP_EVENT_ENOSPC	3
+
+#ifdef __KERNEL__
+
+#define PLOOP_INTERNAL_MAGIC	0x284cd32c
+struct ploop_xops
+{
+	__u32		magic;
+
+	int		(*get_extent)(struct inode *inode, sector_t isec,
+				      unsigned int nr, sector_t *start,
+				      sector_t *psec, int creat);
+};
+
+#define PLOOP_IOC_INTERNAL	_IOR(PLOOPCTLTYPE, 15, struct ploop_xops)
+
+#endif
+
+#endif /* __PLOOP_IF_H__ */
--- /dev/null
+++ b/include/linux/ploop/ploop_stat.h
@@ -0,0 +1,55 @@
+/*
+ *  include/linux/ploop/ploop_stat.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+__DO(bio_in)
+__DO(bio_fast)
+__DO(bio_full)
+__DO(bio_out)
+__DO(bio_alloc)
+__DO(bio_alloc_whole)
+__DO(bio_splits)
+__DO(coal_back)
+__DO(coal_forw)
+__DO(coal_back2)
+__DO(coal_forw2)
+__DO(coal_oback)
+__DO(coal_oforw)
+__DO(coal_mback)
+__DO(coal_mforw)
+__DO(coal_overlap)
+__DO(coal_flush)
+__DO(bio_barriers)
+__DO(bio_rzero)
+__DO(bio_wzero)
+__DO(bio_syncwait)
+__DO(bio_fsync)
+__DO(bio_cows)
+__DO(bio_whole_cows)
+__DO(merge_neg_cluster)
+__DO(merge_neg_disable)
+__DO(fast_neg_nomap)
+__DO(fast_neg_noem)
+__DO(fast_neg_shortem)
+__DO(fast_neg_backing)
+__DO(bio_lockouts)
+__DO(map_lockouts)
+__DO(merge_lockouts)
+__DO(map_reads)
+__DO(map_merges)
+__DO(map_single_writes)
+__DO(map_multi_writes)
+__DO(map_multi_updates)
+__DO(bio_trans_whole)
+__DO(bio_trans_copy)
+__DO(bio_trans_alloc)
+__DO(bio_trans_index)
+__DO(bio_flush_in)
+__DO(bio_fua_in)
+__DO(bio_flush_out)
+__DO(bio_fua_out)
+__DO(bio_flush_skip)
+
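This file is an X-macro list: every consumer defines __DO() before including
it. A sketch of how the per-device counter structure used as plo->st.<name>
elsewhere in this patch, plus a matching table of printable names, could be
generated (the field type and the identifiers ploop_stats/ploop_stat_names are
illustrative, not taken from the patch):

	/* Generate one counter field per statistic. */
	struct ploop_stats {
	#define __DO(name)	__u32 name;
	#include <linux/ploop/ploop_stat.h>
	#undef __DO
	};

	/* Generate a parallel table of names, e.g. for sysfs output. */
	static const char *ploop_stat_names[] = {
	#define __DO(name)	#name,
	#include <linux/ploop/ploop_stat.h>
	#undef __DO
	};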
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -96,6 +96,8 @@ extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
 #ifdef CONFIG_FS_POSIX_ACL
+extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);
+
 static inline struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
 	switch (type) {
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -43,10 +43,9 @@ extern int console_printk[];
 #define minimum_console_loglevel (console_printk[2])
 #define default_console_loglevel (console_printk[3])
 
-static inline void console_silent(void)
-{
-	console_loglevel = 0;
-}
+#define VE0_LOG		1
+#define VE_LOG		2
+#define VE_LOG_BOTH	(VE0_LOG | VE_LOG)
 
 static inline void console_verbose(void)
 {
@@ -109,6 +108,8 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
+struct ve_struct;
+
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
 int vprintk_emit(int facility, int level,
@@ -126,6 +127,17 @@ asmlinkage int printk_emit(int facility, int level,
 asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
 
+asmlinkage __printf(2, 0)
+int ve_vprintk(int dst, const char *fmt, va_list args);
+
+asmlinkage __printf(2, 3) __cold
+int ve_printk(int dst, const char *fmt, ...);
+
+int ve_log_init(struct ve_struct *ve);
+void ve_log_destroy(struct ve_struct *ve);
+asmlinkage __printf(2, 3) __cold
+int ve_log_printk(struct ve_struct *ve, const char *s, ...);
+
 /*
  * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
  */
@@ -165,6 +177,30 @@ int printk(const char *s, ...)
 {
 	return 0;
 }
+static inline __printf(2, 0)
+int ve_vprintk(int dst, const char *s, va_list args)
+{
+	return 0;
+}
+static inline __printf(2, 3) __cold
+int ve_printk(int dst, const char *s, ...)
+{
+	return 0;
+}
+static inline
+int ve_log_init(struct ve_struct *ve)
+{
+	return 0;
+}
+static inline
+void ve_log_destroy(struct ve_struct *ve)
+{
+}
+static inline __printf(2, 3) __cold
+int ve_log_printk(struct ve_struct *ve, const char *s, ...)
+{
+	return 0;
+}
 static inline __printf(1, 2) __cold
 int printk_deferred(const char *s, ...)
 {
@@ -331,9 +367,21 @@ extern void dump_stack(void) __cold;
 	if (__ratelimit(&_rs))						\
 		printk(fmt, ##__VA_ARGS__);				\
 })
+
+#define ve_printk_ratelimited(dst, fmt, ...)				\
+({									\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+									\
+	if (__ratelimit(&_rs))						\
+		ve_printk(dst, fmt, ##__VA_ARGS__);			\
+})
 #else
 #define printk_ratelimited(fmt, ...)					\
 	no_printk(fmt, ##__VA_ARGS__)
+#define ve_printk_ratelimited(dst, fmt, ...)				\
+	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define pr_emerg_ratelimited(fmt, ...)					\
@@ -351,6 +399,20 @@ extern void dump_stack(void) __cold;
 #define pr_info_ratelimited(fmt, ...)					\
 	printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 /* no pr_cont_ratelimited, don't do that... */
+#define ve_pr_emerg_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_alert_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_crit_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_err_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_warn_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_notice_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_info_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 
 #if defined(DEBUG)
 #define pr_devel_ratelimited(fmt, ...)					\
@@ -402,6 +464,17 @@ extern void print_hex_dump(const char *level, const char *prefix_str,
 extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
 				 const void *buf, size_t len);
 #endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+
+extern int console_silence_loglevel;
+
+static inline void console_silent(void)
+{
+	if (console_loglevel > console_silence_loglevel) {
+		printk(KERN_EMERG "console shuts up ...\n");
+		console_loglevel = 0;
+	}
+}
+
 #else
 static inline void print_hex_dump(const char *level, const char *prefix_str,
 				  int prefix_type, int rowsize, int groupsize,
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -7,15 +7,45 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 
-struct proc_dir_entry;
+struct proc_dir_entry {
+	unsigned int low_ino;
+	umode_t mode;
+	nlink_t nlink;
+	kuid_t uid;
+	kgid_t gid;
+	loff_t size;
+	const struct inode_operations *proc_iops;
+	const struct file_operations *proc_fops;
+#ifdef __GENKSYMS__
+	struct proc_dir_entry *next, *parent, *subdir;
+#else
+	struct proc_dir_entry *parent;
+	struct rb_root subdir;
+	struct rb_node subdir_node;
+#endif
+	void *data;
+	atomic_t count;		/* use count */
+	atomic_t in_use;	/* number of callers into module in progress; */
+			/* negative -> it's going away RSN */
+	struct completion *pde_unload_completion;
+	struct list_head pde_openers;	/* who did ->open, but not ->release */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	u8 namelen;
+	char name[];
+};
 
 #ifdef CONFIG_PROC_FS
 
 extern void proc_root_init(void);
 extern void proc_flush_task(struct task_struct *);
 
-extern struct proc_dir_entry *proc_symlink(const char *,
-		struct proc_dir_entry *, const char *);
+extern struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
+			struct proc_dir_entry *parent, const char *dest);
+static inline struct proc_dir_entry *proc_symlink(const char *name,
+			struct proc_dir_entry *parent, const char *dest)
+{
+	return proc_symlink_mode(name, S_IRWXUGO, parent, dest);
+}
 extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *);
 extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
 					      struct proc_dir_entry *, void *);
@@ -26,6 +56,9 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
 					       struct proc_dir_entry *,
 					       const struct file_operations *,
 					       void *);
+extern struct proc_dir_entry *proc_net_create_data(const char *name,
+				umode_t mode, struct proc_dir_entry *parent,
+				const struct file_operations *fops, void *data);
 
 static inline struct proc_dir_entry *proc_create(
 	const char *name, umode_t mode, struct proc_dir_entry *parent,
@@ -34,6 +67,13 @@ static inline struct proc_dir_entry *proc_create(
 	return proc_create_data(name, mode, parent, proc_fops, NULL);
 }
 
+static inline struct proc_dir_entry *proc_net_create(
+	const char *name, umode_t mode, struct proc_dir_entry *parent,
+	const struct file_operations *fops)
+{
+	return proc_net_create_data(name, mode, parent, fops, NULL);
+}
+
 extern void proc_set_size(struct proc_dir_entry *, loff_t);
 extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
 extern void *PDE_DATA(const struct inode *);
@@ -50,6 +90,8 @@ static inline void proc_flush_task(struct task_struct *task)
 
 static inline struct proc_dir_entry *proc_symlink(const char *name,
 		struct proc_dir_entry *parent,const char *dest) { return NULL;}
+static inline struct proc_dir_entry *proc_symlink_mode(const char *name,
+	umode_t m, struct proc_dir_entry *p, const char *d) { return NULL; }
 static inline struct proc_dir_entry *proc_mkdir(const char *name,
 	struct proc_dir_entry *parent) {return NULL;}
 static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
@@ -75,7 +117,7 @@ struct net;
 static inline struct proc_dir_entry *proc_net_mkdir(
 	struct net *net, const char *name, struct proc_dir_entry *parent)
 {
-	return proc_mkdir_data(name, 0, parent, net);
+	return proc_mkdir_data(name, S_ISVTX|S_IRUGO|S_IXUGO, parent, net);
 }
 
 #endif /* _LINUX_PROC_FS_H */
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -4,11 +4,13 @@
 #ifndef _LINUX_PROC_NS_H
 #define _LINUX_PROC_NS_H
 
+struct super_block;
 struct pid_namespace;
 struct nsproxy;
 
 struct proc_ns_operations {
 	const char *name;
+	const char *real_ns_name;
 	int type;
 	void *(*get)(struct task_struct *task);
 	void (*put)(void *ns);
@@ -25,6 +27,7 @@ extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
 extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
 extern const struct proc_ns_operations userns_operations;
 extern const struct proc_ns_operations mntns_operations;
 
@@ -49,6 +52,8 @@ extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
 extern bool proc_ns_inode(struct inode *inode);
 
+extern bool proc_in_container(struct super_block *sb);
+
 #else /* CONFIG_PROC_FS */
 
 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -36,6 +36,7 @@
 #define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
 #define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
+#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -42,6 +42,10 @@ void inode_add_rsv_space(struct inode *inode, qsize_t number);
 void inode_claim_rsv_space(struct inode *inode, qsize_t number);
 void inode_sub_rsv_space(struct inode *inode, qsize_t number);
 void inode_reclaim_rsv_space(struct inode *inode, qsize_t number);
+qsize_t *inode_reserved_space(struct inode * inode);
+qsize_t inode_get_rsv_space(struct inode *inode);
+void inode_incr_space(struct inode *inode, qsize_t number, int reserve);
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve);
 
 void dquot_initialize(struct inode *inode);
 void dquot_drop(struct inode *inode);
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -72,6 +72,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_ROOT_TAG_MASK	(((1<<RADIX_TREE_MAX_TAGS)-1) << __GFP_BITS_SHIFT)
+
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
 					  RADIX_TREE_MAP_SHIFT))
@@ -286,6 +288,10 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
+void __radix_tree_root_tag_move_all_to_prev(struct radix_tree_root *root);
+void __radix_tree_prev_tag_clear(struct radix_tree_root *root,
+				 unsigned int tag);
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag);
 unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -31,6 +31,7 @@
 
 #include <linux/kernel.h>
 #include <linux/stddef.h>
+#include <linux/rcupdate.h>
 
 struct rb_node {
 	unsigned long  __rb_parent_color;
@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
 
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
-				struct rb_node ** rb_link)
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+				struct rb_node **rb_link)
 {
 	node->__rb_parent_color = (unsigned long)parent;
 	node->rb_left = node->rb_right = NULL;
@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 	*rb_link = node;
 }
 
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+				    struct rb_node **rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	rcu_assign_pointer(*rb_link, node);
+}
+
 #define rb_entry_safe(ptr, type, member) \
 	({ typeof(ptr) ____ptr = (ptr); \
 	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -113,11 +113,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
 {
 	if (parent) {
 		if (parent->rb_left == old)
-			parent->rb_left = new;
+			WRITE_ONCE(parent->rb_left, new);
 		else
-			parent->rb_right = new;
+			WRITE_ONCE(parent->rb_right, new);
 	} else
-		root->rb_node = new;
+		WRITE_ONCE(root->rb_node, new);
 }
 
 extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
@@ -127,7 +127,8 @@ static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		     const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+	struct rb_node *child = node->rb_right;
+	struct rb_node *tmp = node->rb_left;
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
@@ -157,6 +158,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		tmp = parent;
 	} else {
 		struct rb_node *successor = child, *child2;
+
 		tmp = child->rb_left;
 		if (!tmp) {
 			/*
@@ -170,6 +172,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 			 */
 			parent = successor;
 			child2 = successor->rb_right;
+
 			augment->copy(node, successor);
 		} else {
 			/*
@@ -191,19 +194,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 				successor = tmp;
 				tmp = tmp->rb_left;
 			} while (tmp);
-			parent->rb_left = child2 = successor->rb_right;
-			successor->rb_right = child;
+			child2 = successor->rb_right;
+			WRITE_ONCE(parent->rb_left, child2);
+			WRITE_ONCE(successor->rb_right, child);
 			rb_set_parent(child, successor);
+
 			augment->copy(node, successor);
 			augment->propagate(parent, successor);
 		}
 
-		successor->rb_left = tmp = node->rb_left;
+		tmp = node->rb_left;
+		WRITE_ONCE(successor->rb_left, tmp);
 		rb_set_parent(tmp, successor);
 
 		pc = node->__rb_parent_color;
 		tmp = __rb_parent(pc);
 		__rb_change_child(node, successor, tmp, root);
+
 		if (child2) {
 			successor->__rb_parent_color = pc;
 			rb_set_parent_color(child2, parent, RB_BLACK);
--- /dev/null
+++ b/include/linux/rbtree_latch.h
@@ -0,0 +1,212 @@
+/*
+ * Latched RB-trees
+ *
+ * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
+ *
+ * Since RB-trees have non-atomic modifications they're not immediately suited
+ * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
+ * lockless lookups, we cannot guarantee they return a correct result.
+ *
+ * The simplest solution is a seqlock + RB-tree, this will allow lockless
+ * lookups; but has the constraint (inherent to the seqlock) that read sides
+ * cannot nest in write sides.
+ *
+ * If we need to allow unconditional lookups (say as required for NMI context
+ * usage) we need a more complex setup; this data structure provides this by
+ * employing the latch technique -- see @raw_write_seqcount_latch -- to
+ * implement a latched RB-tree which does allow for unconditional lookups by
+ * virtue of always having (at least) one stable copy of the tree.
+ *
+ * However, while we have the guarantee that there is at all times one stable
+ * copy, this does not guarantee an iteration will not observe modifications.
+ * What might have been a stable copy at the start of the iteration, need not
+ * remain so for the duration of the iteration.
+ *
+ * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
+ * see the comment in lib/rbtree.c. Note however that we only require the first
+ * condition -- not seeing partial stores -- because the latch thing isolates
+ * us from loops. If we were to interrupt a modification the lookup would be
+ * pointed at the stable tree and complete while the modification was halted.
+ */
+
+#ifndef RB_TREE_LATCH_H
+#define RB_TREE_LATCH_H
+
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+
+struct latch_tree_node {
+	struct rb_node node[2];
+};
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+/**
+ * latch_tree_ops - operators to define the tree order
+ * @less: used for insertion; provides the (partial) order between two elements.
+ * @comp: used for lookups; provides the order between the search key and an element.
+ *
+ * The operators are related like:
+ *
+ *	comp(a->key,b) < 0  := less(a,b)
+ *	comp(a->key,b) > 0  := less(b,a)
+ *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * latch_tree_find().
+ */
+struct latch_tree_ops {
+	bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
+	int  (*comp)(void *key,                 struct latch_tree_node *b);
+};
+
+static __always_inline struct latch_tree_node *
+__lt_from_rb(struct rb_node *node, int idx)
+{
+	return container_of(node, struct latch_tree_node, node[idx]);
+}
+
+static __always_inline void
+__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
+	    bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
+{
+	struct rb_root *root = &ltr->tree[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *node = &ltn->node[idx];
+	struct rb_node *parent = NULL;
+	struct latch_tree_node *ltp;
+
+	while (*link) {
+		parent = *link;
+		ltp = __lt_from_rb(parent, idx);
+
+		if (less(ltn, ltp))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node_rcu(node, parent, link);
+	rb_insert_color(node, root);
+}
+
+static __always_inline void
+__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
+{
+	rb_erase(&ltn->node[idx], &ltr->tree[idx]);
+}
+
+static __always_inline struct latch_tree_node *
+__lt_find(void *key, struct latch_tree_root *ltr, int idx,
+	  int (*comp)(void *key, struct latch_tree_node *node))
+{
+	struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node);
+	struct latch_tree_node *ltn;
+	int c;
+
+	while (node) {
+		ltn = __lt_from_rb(node, idx);
+		c = comp(key, ltn);
+
+		if (c < 0)
+			node = rcu_dereference_raw(node->rb_left);
+		else if (c > 0)
+			node = rcu_dereference_raw(node->rb_right);
+		else
+			return ltn;
+	}
+
+	return NULL;
+}
+
+/**
+ * latch_tree_insert() - insert @node into the trees @root
+ * @node: nodes to insert
+ * @root: trees to insert @node into
+ * @ops: operators defining the node order
+ *
+ * It inserts @node into @root in an ordered fashion such that we can always
+ * observe one complete tree. See the comment for raw_write_seqcount_latch().
+ *
+ * The inserts use rcu_assign_pointer() to publish the element such that the
+ * tree structure is stored before we can observe the new @node.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_insert(struct latch_tree_node *node,
+		  struct latch_tree_root *root,
+		  const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 0, ops->less);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 1, ops->less);
+}
+
+/**
+ * latch_tree_erase() - removes @node from the trees @root
+ * @node: nodes to remove
+ * @root: trees to remove @node from
+ * @ops: operators defining the node order
+ *
+ * Removes @node from the trees @root in an ordered fashion such that we can
+ * always observe one complete tree. See the comment for
+ * raw_write_seqcount_latch().
+ *
+ * It is assumed that @node will observe one RCU quiescent state before being
+ * reused or freed.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_erase(struct latch_tree_node *node,
+		 struct latch_tree_root *root,
+		 const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 0);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 1);
+}
+
+/**
+ * latch_tree_find() - find the node matching @key in the trees @root
+ * @key: search key
+ * @root: trees to search for @key
+ * @ops: operators defining the node order
+ *
+ * Does a lockless lookup in the trees @root for the node matching @key.
+ *
+ * It is assumed that this is called while holding the appropriate RCU read
+ * side lock.
+ *
+ * If the operators define a partial order on the elements (there are multiple
+ * elements which have the same key value) it is undefined which of these
+ * elements will be found. Nor is it possible to iterate the tree to find
+ * further elements with the same key value.
+ *
+ * Returns: a pointer to the node matching @key or NULL.
+ */
+static __always_inline struct latch_tree_node *
+latch_tree_find(void *key, struct latch_tree_root *root,
+		const struct latch_tree_ops *ops)
+{
+	struct latch_tree_node *node;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount_latch(&root->seq);
+		node = __lt_find(key, root, seq & 1, ops->comp);
+	} while (read_seqcount_retry(&root->seq, seq));
+
+	return node;
+}
+
+#endif /* RB_TREE_LATCH_H */
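A user of this header embeds a latch_tree_node in its own objects and supplies
the two operators. A minimal sketch keyed by an address range, in the spirit of
the comp/less contract documented above (the container type and helpers are
illustrative, not from the kernel):

	struct sample_obj {
		unsigned long start, size;
		struct latch_tree_node ltn;
	};

	static __always_inline struct sample_obj *obj_of(struct latch_tree_node *n)
	{
		return container_of(n, struct sample_obj, ltn);
	}

	static bool obj_less(struct latch_tree_node *a, struct latch_tree_node *b)
	{
		return obj_of(a)->start < obj_of(b)->start;
	}

	static int obj_comp(void *key, struct latch_tree_node *n)
	{
		unsigned long addr = (unsigned long)key;
		struct sample_obj *o = obj_of(n);

		if (addr < o->start)
			return -1;
		if (addr >= o->start + o->size)
			return 1;
		return 0;
	}

	static const struct latch_tree_ops obj_tree_ops = {
		.less = obj_less,
		.comp = obj_comp,
	};

	/*
	 * Writers (externally serialized):
	 *	latch_tree_insert(&o->ltn, &root, &obj_tree_ops);
	 *	latch_tree_erase(&o->ltn, &root, &obj_tree_ops);
	 * Readers (under RCU, or even from NMI context):
	 *	n = latch_tree_find((void *)addr, &root, &obj_tree_ops);
	 */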
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -579,21 +579,6 @@ static inline void rcu_preempt_sleep_check(void)
 		(p) = (typeof(*v) __force space *)(v); \
 	} while (0)
 
-/**
- * lockless_dereference() - safely load a pointer for later dereference
- * @p: The pointer to load
- *
- * Similar to rcu_dereference(), but for situations where the pointed-to
- * object's lifetime is managed by something other than RCU.  That
- * "something other" might be reference counting or simple immortality.
- */
-#define lockless_dereference(p) \
-({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
-	(_________p1); \
-})
-
 /**
  * rcu_access_pointer() - fetch RCU pointer with no dereferencing
  * @p: The pointer to read
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -55,6 +55,7 @@ struct anon_vma {
 	RH_KABI_EXTEND(unsigned degree)
 
 	RH_KABI_EXTEND(struct anon_vma *parent)	/* Parent of this anon_vma */
+	struct user_beancounter *anon_vma_ub;
 };
 
 /*
@@ -178,7 +179,7 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
 			   unsigned long, int);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_file_rmap(struct page *);
+void page_add_file_rmap(struct page *, struct mm_struct *);
 void page_remove_rmap(struct page *);
 
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
@@ -244,15 +245,32 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma_read(struct page *page);
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
+extern struct anon_vma *page_lock_anon_vma_read(struct page *page);
+extern void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
+/*
+ * rmap_walk_control: To control rmap traversal for specific needs
+ *
+ * arg: passed to rmap_one() and invalid_vma()
+ * rmap_one: executed on each vma where page is mapped
+ * done: for checking the traversal termination condition
+ * anon_lock: for taking the anon_vma lock in an optimized way rather
+ *            than the default
+ * invalid_vma: for skipping uninteresting vmas
+ */
+struct rmap_walk_control {
+	void *arg;
+	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+					unsigned long addr, void *arg);
+	int (*done)(struct page *page);
+	struct anon_vma *(*anon_lock)(struct page *page);
+	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
+};
+
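A caller packages its per-vma callback and private state in the control
structure instead of passing a bare function pointer to rmap_walk(). A hedged
sketch; the callback body is elided, the function names are made up, and the
SWAP_AGAIN "keep walking" convention is an assumption about the callback
contract:

	static int fixup_one_mapping(struct page *page, struct vm_area_struct *vma,
				     unsigned long addr, void *arg)
	{
		/* ... adjust the mapping of @page at @addr in @vma ... */
		return SWAP_AGAIN;	/* continue with the remaining vmas */
	}

	static void fixup_all_mappings(struct page *page, void *private)
	{
		struct rmap_walk_control rwc = {
			.arg      = private,
			.rmap_one = fixup_one_mapping,
		};

		rmap_walk(page, &rwc);
	}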
 /*
  * Called by migrate.c to remove migration ptes, but might be used more later.
  */
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg);
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
 
 #else	/* !CONFIG_MMU */
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,15 +47,19 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
 #include <linux/llist.h>
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
+#include <linux/ve_proto.h>
 
 #include <asm/processor.h>
 
+#include <bc/task.h>
+
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 
 /*
@@ -147,6 +151,8 @@ struct filename;
  */
 extern unsigned long avenrun[];		/* Load averages */
 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_avenrun_ve(unsigned long *loads,
+			unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
@@ -160,16 +166,23 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 	load += n*(FIXED_1-exp); \
 	load >>= FSHIFT;
 
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
 extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern bool single_task_running(void);
+extern unsigned long nr_sleeping(void);
+extern unsigned long nr_stopped(void);
+extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
-extern unsigned long this_cpu_load(void);
-
+extern unsigned long nr_active_cpu(void);
+extern atomic_t nr_dead;
+extern unsigned long nr_zombie;
 
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
@@ -213,6 +226,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #define TASK_PARKED		512
 #define TASK_STATE_MAX		1024
 
+#define __TASK_IOTHROTTLED	1024
+
 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
 
 extern char ___assert_task_state[1 - 2*!!(
@@ -240,6 +255,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
 				 (task->flags & PF_FROZEN) == 0)
+#define task_iothrottled(task)	((task->state & __TASK_IOTHROTTLED) != 0)
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -448,7 +464,9 @@ extern int get_dumpable(struct mm_struct *mm);
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
-#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
+/* This ine-shot flag is droped due to necessivity of changing exe once again
+ * on NFS restore */
+//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
@@ -1041,6 +1059,11 @@ struct sched_domain {
 	unsigned int alb_failed;
 	unsigned int alb_pushed;
 
+	/* cpulimit balancing */
+	unsigned int clb_count;
+	unsigned int clb_failed;
+	unsigned int clb_pushed;
+
 	/* SD_BALANCE_EXEC stats */
 	unsigned int sbe_count;
 	unsigned int sbe_balanced;
@@ -1174,6 +1197,7 @@ struct sched_statistics {
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_cpulimit;
 	u64			nr_failed_migrations_hot;
 	u64			nr_forced_migrations;
 
@@ -1193,8 +1217,14 @@ struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
 	struct list_head	group_node;
+	struct list_head	cfs_rq_node;
 	unsigned int		on_rq;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	unsigned int            boosted;
+	struct list_head        boost_node;
+#endif
+
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
@@ -1208,6 +1238,7 @@ struct sched_entity {
 	struct cfs_rq		*cfs_rq;
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
+	int			depth;
 #endif
 
 /*
@@ -1403,17 +1434,23 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
+	/* scheduler bits, serialized by rq lock: */
+	unsigned sched_reset_on_fork:1;
+	/* The two below are really protected by pi_lock, but they are modified
+	 * in a place where nobody else can modify other fields using rq->lock */
+	unsigned sched_contributes_to_load:1;
+	unsigned sched_interruptible_sleep:1;
+	unsigned woken_while_running:1;
+	unsigned sched_iothrottled_sleep:1;
+	unsigned :0; /* force alignment to the next boundary */
+
+	/* unserialized, strictly 'current' */
 	unsigned did_exec:1;
 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
 				 * execve */
 	unsigned in_iowait:1;
-
-	/* task may not gain privileges */
-	unsigned no_new_privs:1;
-
-	/* Revert to default priority/policy when forking */
-	unsigned sched_reset_on_fork:1;
-	unsigned sched_contributes_to_load:1;
+	unsigned no_new_privs:1; /* task may not gain privileges */
+	unsigned may_throttle:1;
 
 	pid_t pid;
 	pid_t tgid;
@@ -1708,6 +1745,9 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+#ifdef CONFIG_KASAN
+	unsigned int kasan_depth;
+#endif
 #if defined(CONFIG_FUNCTION_GRAPH_TRACER) && !defined(CONFIG_S390)
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -1729,13 +1769,23 @@ struct task_struct {
 	/* bitmask and counter of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+	/* Coverage collection mode enabled for this task (0 if disabled). */
+	enum kcov_mode kcov_mode;
+	/* Size of the kcov_area. */
+	unsigned	kcov_size;
+	/* Buffer for coverage collection. */
+	void		*kcov_area;
+	/* kcov descriptor wired with this task or NULL. */
+	struct kcov	*kcov;
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+	struct task_beancounter task_bc;
+#endif
+#ifdef CONFIG_VE
+	struct ve_struct *task_ve;
+#endif
 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
-	struct memcg_batch_info {
-		int do_batch;	/* incremented when batch uncharge started */
-		struct mem_cgroup *memcg; /* target memcg of uncharge */
-		unsigned long nr_pages;	/* uncharged usage */
-		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
-	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -2065,6 +2115,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_MEMCG_RECLAIM  0x01000000	/* We are in memcg reclaim */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
@@ -2427,6 +2478,8 @@ extern void __set_special_pids(struct pid *pid);
 
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(kuid_t);
+extern struct user_struct * alloc_uid_ns(struct user_namespace *ns, kuid_t);
+
 static inline struct user_struct *get_uid(struct user_struct *u)
 {
 	atomic_inc(&u->__count);
@@ -2679,8 +2732,10 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
 
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
-	return list_entry_rcu(p->thread_group.next,
+	struct task_struct *tsk;
+	tsk = list_entry_rcu(p->thread_group.next,
 			      struct task_struct, thread_group);
+	return tsk;
 }
 
 static inline int thread_group_empty(struct task_struct *p)
@@ -2911,6 +2966,13 @@ extern int _cond_resched(void);
 	_cond_resched();			\
 })
 
+extern int _cond_resched_may_throttle(void);
+
+#define cond_resched_may_throttle() ({		\
+	__might_sleep(__FILE__, __LINE__, 0);	\
+	_cond_resched_may_throttle();		\
+})
+
 extern int __cond_resched_lock(spinlock_t *lock);
 
 #ifdef CONFIG_PREEMPT_COUNT
@@ -3120,6 +3182,33 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+extern unsigned int tg_cpu_rate(struct task_group *tg);
+extern unsigned int tg_nr_cpus(struct task_group *tg);
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -87,4 +87,15 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
 	return;
 }
 #endif /* CONFIG_SECCOMP_FILTER */
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+extern long seccomp_get_filter(struct task_struct *task,
+			       unsigned long filter_off, void __user *data);
+#else
+static inline long seccomp_get_filter(struct task_struct *task,
+				      unsigned long n, void __user *data)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
 #endif /* _LINUX_SECCOMP_H */
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -34,6 +34,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <linux/compiler.h>
 #include <asm/processor.h>
 
 /*
@@ -170,10 +171,87 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 	return __read_seqcount_retry(s, start);
 }
 
+static inline int raw_read_seqcount_latch(seqcount_t *s)
+{
+	return lockless_dereference(s->sequence);
+}
 
-/*
+/**
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t
+ *
+ * The latch technique is a multiversion concurrency control method that allows
+ * queries during non-atomic modifications. If you can guarantee queries never
+ * interrupt the modification -- e.g. the concurrency is strictly between CPUs
+ * -- you most likely do not need this.
+ *
+ * Where the traditional RCU/lockless data structures rely on atomic
+ * modifications to ensure queries observe either the old or the new state the
+ * latch allows the same for non-atomic updates. The trade-off is doubling the
+ * cost of storage; we have to maintain two copies of the entire data
+ * structure.
+ *
+ * Very simply put: we first modify one copy and then the other. This ensures
+ * there is always one copy in a stable state, ready to give us an answer.
+ *
+ * The basic form is a data structure like:
+ *
+ * struct latch_struct {
+ *	seqcount_t		seq;
+ *	struct data_struct	data[2];
+ * };
+ *
+ * Where a modification, which is assumed to be externally serialized, does the
+ * following:
+ *
+ * void latch_modify(struct latch_struct *latch, ...)
+ * {
+ *	smp_wmb();	<- Ensure that the last data[1] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[0], ...);
+ *
+ *	smp_wmb();	<- Ensure that the data[0] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[1], ...);
+ * }
+ *
+ * The query will have a form like:
+ *
+ * struct entry *latch_query(struct latch_struct *latch, ...)
+ * {
+ *	struct entry *entry;
+ *	unsigned seq, idx;
+ *
+ *	do {
+ *		seq = lockless_dereference(latch->seq);
+ *
+ *		idx = seq & 0x01;
+ *		entry = data_query(latch->data[idx], ...);
+ *
+ *		smp_rmb();
+ *	} while (seq != latch->seq);
+ *
+ *	return entry;
+ * }
+ *
+ * So during the modification, queries are first redirected to data[1]. Then we
+ * modify data[0]. When that is complete, we redirect queries back to data[0]
+ * and we can modify data[1].
+ *
+ * NOTE: The non-requirement for atomic modifications does _NOT_ include
+ *       the publishing of new entries in the case where data is a dynamic
+ *       data structure.
+ *
+ *       An iteration might start in data[0] and get suspended long enough
+ *       to miss an entire modification sequence, once it resumes it might
+ *       observe the new entry.
+ *
+ * NOTE: When data is a dynamic data structure; one should use regular RCU
+ *       patterns to manage the lifetimes of the objects within.
  */
 static inline void raw_write_seqcount_latch(seqcount_t *s)
 {
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,6 +23,7 @@ struct shmem_inode_info {
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	struct inode		vfs_inode;
+	struct user_beancounter	*shmi_ub;
 };
 
 struct shmem_sb_info {
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,39 +4,71 @@
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extention later.
+ *
+ * The 'gfpmask' refers to the allocation we are currently trying to
+ * fulfil.
  */
 struct shrink_control {
 	gfp_t gfp_mask;
 
-	/* How many slab objects shrinker() should scan and try to reclaim */
+	/*
+	 * How many objects scan_objects should scan and try to reclaim.
+	 * This is reset before every call, so it is safe for callees
+	 * to modify.
+	 */
 	unsigned long nr_to_scan;
+
+	/* current node being shrunk (for NUMA aware shrinkers) */
+	int nid;
+
+	/* current memcg being shrunk (for memcg aware shrinkers) */
+	struct mem_cgroup *memcg;
+
+	bool for_drop_caches;
 };
 
+#define SHRINK_STOP (~0UL)
 /*
  * A callback you can register to apply pressure to ageable caches.
  *
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'.  It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up.  It should return
- * the number of objects which remain in the cache.  If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
+ * @count_objects should return the number of freeable items in the cache. If
+ * there are no objects to free or the number of freeable items cannot be
+ * determined, it should return 0. No deadlock checks should be done during the
+ * count callback - the shrinker relies on aggregating scan counts that couldn't
+ * be executed due to potential deadlocks to be run at a later call when the
+ * deadlock condition is no longer pending.
  *
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
+ * @scan_objects will only be called if @count_objects returned a non-zero
+ * value for the number of freeable objects. The callout should scan the cache
+ * and attempt to free items from the cache. It should then return the number
+ * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
+ * due to potential deadlocks. If SHRINK_STOP is returned, then no further
+ * attempts to call the @scan_objects will be made from the current reclaim
+ * context.
  *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
+ * @flags determine the shrinker abilities, like numa awareness
  */
 struct shrinker {
-	int (*shrink)(struct shrinker *, struct shrink_control *sc);
+	unsigned long (*count_objects)(struct shrinker *,
+				       struct shrink_control *sc);
+	unsigned long (*scan_objects)(struct shrinker *,
+				      struct shrink_control *sc);
+
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
+	unsigned long flags;
 
 	/* These are for internal use */
 	struct list_head list;
-	atomic_long_t nr_in_batch; /* objs pending delete */
+	/* objs pending delete, per node */
+	atomic_long_t *nr_deferred;
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE	(1 << 0)
+#define SHRINKER_MEMCG_AWARE	(1 << 1)
+
+extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
 #endif
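Under the count/scan split a shrinker looks roughly like the sketch below: a
hypothetical object cache with an LRU list and a spinlock (all identifiers are
illustrative). count_objects reports what is freeable; scan_objects frees up
to sc->nr_to_scan items or bails out with SHRINK_STOP when it cannot take its
lock safely:

	static DEFINE_SPINLOCK(my_cache_lock);
	static LIST_HEAD(my_cache_lru);
	static unsigned long my_cache_nr_objects;

	static unsigned long my_cache_count(struct shrinker *s,
					    struct shrink_control *sc)
	{
		return my_cache_nr_objects;	/* 0 when nothing is freeable */
	}

	static unsigned long my_cache_scan(struct shrinker *s,
					   struct shrink_control *sc)
	{
		unsigned long freed = 0;

		if (!spin_trylock(&my_cache_lock))
			return SHRINK_STOP;	/* avoid a potential deadlock */

		while (freed < sc->nr_to_scan && !list_empty(&my_cache_lru)) {
			/* ... evict one object from the LRU ... */
			freed++;
		}
		spin_unlock(&my_cache_lock);

		return freed;
	}

	static struct shrinker my_cache_shrinker = {
		.count_objects = my_cache_count,
		.scan_objects  = my_cache_scan,
		.seeks         = DEFAULT_SEEKS,
	};

	/* Note that register_shrinker(&my_cache_shrinker) now returns an error code. */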
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -17,6 +17,9 @@ struct sigqueue {
 	int flags;
 	siginfo_t info;
 	struct user_struct *user;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *sig_ub;
+#endif
 };
 
 /* flags values. */
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -4,6 +4,8 @@
  * (C) SGI 2006, Christoph Lameter
  * 	Cleaned up and restructured to ease the addition of alternative
  * 	implementations of SLAB allocators.
+ * (C) Linux Foundation 2008-2013
+ *      Unified interface for all slab allocators
  */
 
 #ifndef _LINUX_SLAB_H
@@ -77,10 +79,22 @@
 #else
 # define SLAB_FAILSLAB		0x00000000UL
 #endif
+#ifdef CONFIG_MEMCG_KMEM
+# define SLAB_ACCOUNT		0x04000000UL	/* Account to memcg */
+#else
+# define SLAB_ACCOUNT		0x00000000UL
+#endif
+
+#ifdef CONFIG_KASAN
+#define SLAB_KASAN		0x08000000UL
+#else
+#define SLAB_KASAN		0x00000000UL
+#endif
 
 /* The following flags affect the page allocator grouping pages by mobility */
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL		/* Objects are reclaimable */
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
@@ -94,6 +108,8 @@
 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
 				(unsigned long)ZERO_SIZE_PTR)
 
+#include <linux/kmemleak.h>
+#include <linux/kasan.h>
 
 struct mem_cgroup;
 /*
@@ -105,12 +121,12 @@ int slab_is_available(void);
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
-struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t,
-			unsigned long, void (*)(void *), struct kmem_cache *);
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void kmem_cache_free(struct kmem_cache *, void *);
+
+void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
+void memcg_deactivate_kmem_caches(struct mem_cgroup *);
+void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -146,35 +162,6 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
-#ifdef CONFIG_SLOB
-/*
- * Common fields provided in kmem_cache by all slab allocators
- * This struct is either used directly by the allocator (SLOB)
- * or the allocator must include definitions for all fields
- * provided in kmem_cache_common in their definition of kmem_cache.
- *
- * Once we can do anonymous structs (C11 standard) we could put a
- * anonymous struct definition in these allocators so that the
- * separate allocations in the kmem_cache structure of SLAB and
- * SLUB is no longer needed.
- */
-struct kmem_cache {
-	unsigned int object_size;/* The original size of the object */
-	unsigned int size;	/* The aligned/padded/added on size  */
-	unsigned int align;	/* Alignment as calculated */
-	unsigned long flags;	/* Active flags on the slab */
-	const char *name;	/* Slab name for sysfs */
-	int refcount;		/* Use counter */
-	void (*ctor)(void *);	/* Called on object slot creation */
-	struct list_head list;	/* List of all slab caches on the system */
-};
-
-#define KMALLOC_MAX_SIZE (1UL << 30)
-
-#include <linux/slob_def.h>
-
-#else /* CONFIG_SLOB */
-
 /*
  * Kmalloc array related definitions
  */
@@ -195,7 +182,9 @@ struct kmem_cache {
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
-#else
+#endif
+
+#ifdef CONFIG_SLUB
 /*
  * SLUB allocates up to order 2 pages directly and otherwise
  * passes the request to the page allocator.
@@ -207,6 +196,19 @@ struct kmem_cache {
 #endif
 #endif
 
+#ifdef CONFIG_SLOB
+/*
+ * SLOB passes all page size and larger requests to the page allocator.
+ * No kmalloc array is necessary since objects of different sizes can
+ * be allocated from the same page.
+ */
+#define KMALLOC_SHIFT_MAX	30
+#define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
+#ifndef KMALLOC_SHIFT_LOW
+#define KMALLOC_SHIFT_LOW	3
+#endif
+#endif
+
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
 /* Maximum size for which we actually use a slab cache */
@@ -221,6 +223,7 @@ struct kmem_cache {
 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
 #endif
 
+#ifndef CONFIG_SLOB
 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
 #ifdef CONFIG_ZONE_DMA
 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
@@ -275,15 +278,112 @@ static __always_inline int kmalloc_index(size_t size)
 	/* Will never be reached. Needed because the compiler may complain */
 	return -1;
 }
+#endif /* !CONFIG_SLOB */
 
-#ifdef CONFIG_SLAB
-#include <linux/slab_def.h>
-#elif defined(CONFIG_SLUB)
-#include <linux/slub_def.h>
+void *__kmalloc(size_t size, gfp_t flags);
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *, void *);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t flags, int node);
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+#else
+static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc(size, flags);
+}
+
+static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
+{
+	return kmem_cache_alloc(s, flags);
+}
+#endif
+
+#ifdef CONFIG_TRACING
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
+
+#ifdef CONFIG_NUMA
+extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+					   gfp_t gfpflags,
+					   int node, size_t size);
+#else
+static __always_inline void *
+kmem_cache_alloc_node_trace(struct kmem_cache *s,
+			      gfp_t gfpflags,
+			      int node, size_t size)
+{
+	return kmem_cache_alloc_trace(s, gfpflags, size);
+}
+#endif /* CONFIG_NUMA */
+
+#else /* CONFIG_TRACING */
+static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
+		gfp_t flags, size_t size)
+{
+	void *ret = kmem_cache_alloc(s, flags);
+
+	kasan_kmalloc(s, ret, size, flags);
+	return ret;
+}
+
+static __always_inline void *
+kmem_cache_alloc_node_trace(struct kmem_cache *s,
+			      gfp_t gfpflags,
+			      int node, size_t size)
+{
+	void *ret = kmem_cache_alloc_node(s, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size, gfpflags);
+	return ret;
+}
+#endif /* CONFIG_TRACING */
+
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
+
+#ifdef CONFIG_TRACING
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
 #else
-#error "Unknown slab allocator"
+static __always_inline void *
+kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+	return kmalloc_order(size, flags, order);
+}
 #endif
 
+static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+{
+	unsigned int order = get_order(size);
+	return kmalloc_order_trace(size, flags, order);
+}
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * kmalloc is the normal method of allocating memory
+ * for objects smaller than page size in the kernel.
+ */
+static __always_inline void *kmalloc(size_t size, gfp_t flags)
+{
+	if (__builtin_constant_p(size)) {
+		if (size > KMALLOC_MAX_CACHE_SIZE)
+			return kmalloc_large(size, flags);
+#ifndef CONFIG_SLOB
+		if (!(flags & GFP_DMA)) {
+			int index = kmalloc_index(size);
+
+			if (!index)
+				return ZERO_SIZE_PTR;
+
+			return kmem_cache_alloc_trace(kmalloc_caches[index],
+					flags, size);
+		}
+#endif
+	}
+	return __kmalloc(size, flags);
+}
+
 /*
  * Determine size used for the nth kmalloc cache.
  * return size or 0 if a kmalloc cache for that
@@ -291,6 +391,7 @@ static __always_inline int kmalloc_index(size_t size)
  */
 static __always_inline int kmalloc_size(int n)
 {
+#ifndef CONFIG_SLOB
 	if (n > 2)
 		return 1 << n;
 
@@ -299,10 +400,26 @@ static __always_inline int kmalloc_size(int n)
 
 	if (n == 2 && KMALLOC_MIN_SIZE <= 64)
 		return 192;
-
+#endif
 	return 0;
 }
-#endif /* !CONFIG_SLOB */
+
+static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+#ifndef CONFIG_SLOB
+	if (__builtin_constant_p(size) &&
+		size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
+		int i = kmalloc_index(size);
+
+		if (!i)
+			return ZERO_SIZE_PTR;
+
+		return kmem_cache_alloc_node_trace(kmalloc_caches[i],
+						flags, node, size);
+	}
+#endif
+	return __kmalloc_node(size, flags, node);
+}
 
 /*
  * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
@@ -312,49 +429,42 @@ static __always_inline int kmalloc_size(int n)
 #ifndef ARCH_SLAB_MINALIGN
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 #endif
+
+struct memcg_cache_array {
+	struct rcu_head rcu;
+	struct kmem_cache *entries[0];
+};
+
 /*
  * This is the main placeholder for memcg-related information in kmem caches.
- * struct kmem_cache will hold a pointer to it, so the memory cost while
- * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it
- * would otherwise be if that would be bundled in kmem_cache: we'll need an
- * extra pointer chase. But the trade off clearly lays in favor of not
- * penalizing non-users.
- *
  * Both the root cache and the child caches will have it. For the root cache,
  * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system.
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
  *
  * Child caches will hold extra metadata needed for its operation. Fields are:
  *
  * @memcg: pointer to the memcg this cache belongs to
- * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @dead: set to true after the memcg dies; the cache may still be around.
- * @nr_pages: number of pages that belongs to this cache.
- * @destroy: worker to be called whenever we are ready, or believe we may be
- *           ready, to destroy this cache.
+ *
+ * Both root and child caches of the same kind are linked into a list chained
+ * through @list.
  */
 struct memcg_cache_params {
 	bool is_root_cache;
+	struct list_head list;
 	union {
-		struct kmem_cache *memcg_caches[0];
+		struct memcg_cache_array __rcu *memcg_caches;
 		struct {
 			struct mem_cgroup *memcg;
-			struct list_head list;
 			struct kmem_cache *root_cache;
-			bool dead;
-			atomic_t nr_pages;
-			struct work_struct destroy;
 		};
 	};
 };
 
 int memcg_update_all_caches(int num_memcgs);
 
-struct seq_file;
-int cache_show(struct kmem_cache *s, struct seq_file *m);
-void print_slabinfo_header(struct seq_file *m);
-
 /**
  * kmalloc_array - allocate memory for an array.
  * @n: number of elements.
@@ -434,36 +544,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
-#if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
-/**
- * kmalloc_node - allocate memory from a specific node
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- * @node: node to allocate from.
- *
- * kmalloc() for non-local nodes, used to allocate from a specific node
- * if available. Equivalent to kmalloc() in the non-NUMA single-node
- * case.
- */
-static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __kmalloc(size, flags);
-}
-
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-
-static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
-					gfp_t flags, int node)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
-#endif /* !CONFIG_NUMA && !CONFIG_SLOB */
-
 /*
  * kmalloc_track_caller is a special version of kmalloc that records the
  * calling function of the routine calling it for slab leak tracking instead
@@ -540,14 +620,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
-/*
- * Determine the size of a slab object
- */
-static inline unsigned int kmem_cache_size(struct kmem_cache *s)
-{
-	return s->object_size;
-}
-
+unsigned int kmem_cache_size(struct kmem_cache *s);
 void __init kmem_cache_init_late(void);
 
 #endif	/* _LINUX_SLAB_H */
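
As a usage sketch of the interfaces above (illustrative names, not part of the
patch): a cache created with the new SLAB_ACCOUNT flag has its objects charged
to the allocating memory cgroup, and constant-size kmalloc() calls are resolved
to a kmalloc_caches[] entry at compile time:

#include <linux/slab.h>
#include <linux/init.h>

struct demo_item {
	int id;
	char name[32];
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	/* SLAB_ACCOUNT: objects are charged to the allocating memcg */
	demo_cachep = kmem_cache_create("demo_item", sizeof(struct demo_item),
					0, SLAB_ACCOUNT, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}

static void demo_cache_use(void)
{
	struct demo_item *it = kmem_cache_alloc(demo_cachep, GFP_KERNEL);
	/* constant-size kmalloc() picks a kmalloc_caches[] slot inline */
	void *buf = kmalloc(256, GFP_KERNEL);

	kfree(buf);
	if (it)
		kmem_cache_free(demo_cachep, it);
}
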
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -5,20 +5,6 @@
 
 /*
  * Definitions unique to the original Linux SLAB allocator.
- *
- * What we provide here is a way to optimize the frequent kmalloc
- * calls in the kernel by selecting the appropriate general cache
- * if kmalloc was called with a size that can be established at
- * compile time.
- */
-
-#include <linux/init.h>
-#include <linux/compiler.h>
-
-/*
- * struct kmem_cache
- *
- * manages a cache.
  */
 
 struct kmem_cache {
@@ -82,7 +68,10 @@ struct kmem_cache {
 	int obj_offset;
 #endif /* CONFIG_DEBUG_SLAB */
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
+#endif
+#ifdef CONFIG_KASAN
+	struct kasan_cache kasan_info;
 #endif
 
 /* 6) per-cpu/per-node data, touched during every alloc/free */
@@ -104,96 +93,16 @@ struct kmem_cache {
 	 */
 };
 
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-void *__kmalloc(size_t size, gfp_t flags);
-
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
-#else
-static __always_inline void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
-#endif
-
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	struct kmem_cache *cachep;
-	void *ret;
-
-	if (__builtin_constant_p(size)) {
-		int i;
-
-		if (!size)
-			return ZERO_SIZE_PTR;
-
-		if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
-			return NULL;
-
-		i = kmalloc_index(size);
-
-#ifdef CONFIG_ZONE_DMA
-		if (flags & GFP_DMA)
-			cachep = kmalloc_dma_caches[i];
-		else
-#endif
-			cachep = kmalloc_caches[i];
-
-		ret = kmem_cache_alloc_trace(cachep, flags, size);
-
-		return ret;
-	}
-	return __kmalloc(size, flags);
-}
-
-#ifdef CONFIG_NUMA
-extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
-extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
-					 gfp_t flags,
-					 int nodeid,
-					 size_t size);
-#else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
-			    gfp_t flags,
-			    int nodeid,
-			    size_t size)
-{
-	return kmem_cache_alloc_node(cachep, flags, nodeid);
-}
-#endif
-
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+				void *x)
 {
-	struct kmem_cache *cachep;
-
-	if (__builtin_constant_p(size)) {
-		int i;
-
-		if (!size)
-			return ZERO_SIZE_PTR;
-
-		if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
-			return NULL;
+	void *object = x - (x - page->s_mem) % cache->size;
+	void *last_object = page->s_mem + (cache->num - 1) * cache->size;
 
-		i = kmalloc_index(size);
-
-#ifdef CONFIG_ZONE_DMA
-		if (flags & GFP_DMA)
-			cachep = kmalloc_dma_caches[i];
-		else
-#endif
-			cachep = kmalloc_caches[i];
-
-		return kmem_cache_alloc_node_trace(cachep, flags, node, size);
-	}
-	return __kmalloc_node(size, flags, node);
+	if (unlikely(object > last_object))
+		return last_object;
+	else
+		return object;
 }
 
-#endif	/* CONFIG_NUMA */
-
 #endif	/* _LINUX_SLAB_DEF_H */
--- a/include/linux/slob_def.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __LINUX_SLOB_DEF_H
-#define __LINUX_SLOB_DEF_H
-
-#include <linux/numa.h>
-
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-
-static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep,
-					      gfp_t flags)
-{
-	return kmem_cache_alloc_node(cachep, flags, NUMA_NO_NODE);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __kmalloc_node(size, flags, node);
-}
-
-/**
- * kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- */
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	return __kmalloc_node(size, flags, NUMA_NO_NODE);
-}
-
-static __always_inline void *__kmalloc(size_t size, gfp_t flags)
-{
-	return kmalloc(size, flags);
-}
-
-#endif /* __LINUX_SLOB_DEF_H */
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -6,14 +6,8 @@
  *
  * (C) 2007 SGI, Christoph Lameter
  */
-#include <linux/types.h>
-#include <linux/gfp.h>
-#include <linux/bug.h>
-#include <linux/workqueue.h>
 #include <linux/kobject.h>
 
-#include <linux/kmemleak.h>
-
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
@@ -91,8 +85,11 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 #endif
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
 	int max_attr_size; /* for propagation, maximum size of a stored attr */
+#ifdef CONFIG_SYSFS
+	struct kset *memcg_kset;
+#endif
 #endif
 
 #ifdef CONFIG_NUMA
@@ -101,23 +98,13 @@ struct kmem_cache {
 	 */
 	int remote_node_defrag_ratio;
 #endif
+#ifdef CONFIG_KASAN
+	struct kasan_cache kasan_info;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-void *__kmalloc(size_t size, gfp_t flags);
-
-static __always_inline void *
-kmalloc_order(size_t size, gfp_t flags, unsigned int order)
-{
-	void *ret;
-
-	flags |= (__GFP_COMP | __GFP_KMEMCG);
-	ret = (void *) __get_free_pages(flags, order);
-	kmemleak_alloc(ret, size, 1, flags);
-	return ret;
-}
-
 /**
  * Calling this on allocated memory will check that the memory
  * is expected to be in use, and print warnings if not.
@@ -131,81 +118,43 @@ static inline bool verify_mem_not_deleted(const void *x)
 }
 #endif
 
-#ifdef CONFIG_TRACING
-extern void *
-kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size);
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+#ifdef CONFIG_SYSFS
+#define SLAB_SUPPORTS_SYSFS
+void sysfs_slab_remove(struct kmem_cache *);
 #else
-static __always_inline void *
-kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
-{
-	return kmem_cache_alloc(s, gfpflags);
-}
-
-static __always_inline void *
-kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static inline void sysfs_slab_remove(struct kmem_cache *s)
 {
-	return kmalloc_order(size, flags, order);
 }
 #endif
 
-static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
-{
-	unsigned int order = get_order(size);
-	return kmalloc_order_trace(size, flags, order);
-}
 
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	if (__builtin_constant_p(size)) {
-		if (size > KMALLOC_MAX_CACHE_SIZE)
-			return kmalloc_large(size, flags);
-
-		if (!(flags & GFP_DMA)) {
-			int index = kmalloc_index(size);
-
-			if (!index)
-				return ZERO_SIZE_PTR;
-
-			return kmem_cache_alloc_trace(kmalloc_caches[index],
-					flags, size);
-		}
-	}
-	return __kmalloc(size, flags);
-}
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
-					   gfp_t gfpflags,
-					   int node, size_t size);
-#else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-			      gfp_t gfpflags,
-			      int node, size_t size)
+/**
+ * virt_to_obj - returns address of the beginning of object.
+ * @s: object's kmem_cache
+ * @slab_page: address of slab page
+ * @x: address within object memory range
+ *
+ * Returns address of the beginning of object
+ */
+static inline void *virt_to_obj(struct kmem_cache *s,
+				const void *slab_page,
+				const void *x)
 {
-	return kmem_cache_alloc_node(s, gfpflags, node);
+	return (void *)x - ((x - slab_page) % s->size);
 }
-#endif
 
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	if (__builtin_constant_p(size) &&
-		size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
-		int index = kmalloc_index(size);
-
-		if (!index)
-			return ZERO_SIZE_PTR;
-
-		return kmem_cache_alloc_node_trace(kmalloc_caches[index],
-			       flags, node, size);
-	}
-	return __kmalloc_node(size, flags, node);
+void object_err(struct kmem_cache *s, struct page *page,
+		u8 *object, char *reason);
+
+static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+				void *x) {
+	void *object = x - (x - page_address(page)) % cache->size;
+	void *last_object = page_address(page) +
+		(page->objects - 1) * cache->size;
+	if (unlikely(object > last_object))
+		return last_object;
+	else
+		return object;
 }
-#endif
 
 #endif /* _LINUX_SLUB_DEF_H */
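
A sketch of how a debugging consumer (KASAN-style) might use the helpers
above; demo_report_addr() is hypothetical:

static void demo_report_addr(struct kmem_cache *s, void *addr)
{
	struct page *page = virt_to_head_page(addr);
	u8 *object = nearest_obj(s, page, addr);

	/* report against the start of the containing object */
	object_err(s, page, object, "demo: suspicious access");
}
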
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -305,6 +305,15 @@ struct ucred {
 /* IPX options */
 #define IPX_TYPE	1
 
+#define MAX_SOCK_ADDR  128             /* 108 for Unix domain -
+					  16 for IP, 16 for IPX,
+					  24 for IPv6,
+					  about 80 for AX.25
+					  must be at least one bigger than
+					  the AF_UNIX size (see net/unix/af_unix.c
+					  :unix_mkname()).
+					*/
+
 extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred);
 
 extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
--- /dev/null
+++ b/include/linux/stackdepot.h
@@ -0,0 +1,32 @@
+/*
+ * A generic stack depot implementation
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_STACKDEPOT_H
+#define _LINUX_STACKDEPOT_H
+
+typedef u32 depot_stack_handle_t;
+
+struct stack_trace;
+
+depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags);
+
+void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace);
+
+#endif
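
A usage sketch for the API above, assuming the ordinary <linux/stacktrace.h>
helpers (save_stack_trace(), print_stack_trace()); not part of the patch:

#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

static depot_stack_handle_t demo_save_current_stack(gfp_t flags)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 2,	/* skip the save helpers themselves */
	};

	save_stack_trace(&trace);
	/* identical stacks are deduplicated behind one 32-bit handle */
	return depot_save_stack(&trace, flags);
}

static void demo_print_saved_stack(depot_stack_handle_t handle)
{
	struct stack_trace trace;

	depot_fetch_stack(handle, &trace);
	print_stack_trace(&trace, 0);
}
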
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -212,6 +212,11 @@ int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
+int rpc_task_kill_proc_init(struct net *net);
+void rpc_task_kill_proc_fini(struct net *net);
+
+bool rpc_abort_task(struct rpc_task *task);
+
 void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -108,6 +108,7 @@ struct svc_serv {
 	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
 						 * entries in the svc_cb_list */
 	struct svc_xprt		*sv_bc_xprt;	/* callback on fore channel */
+	void			(*svc_cb_down_net)(struct svc_serv *serv, struct net *net);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
 
@@ -484,6 +485,8 @@ void		   svc_reserve(struct svc_rqst *rqstp, int space);
 struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
 char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
 
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net);
+
 #define	RPC_MAX_ADDRBUFLEN	(63U)
 
 /*
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -242,9 +242,7 @@ struct swap_info_struct {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
-
-void workingset_remember_node(struct radix_tree_node *node);
-void workingset_forget_node(struct radix_tree_node *node);
+extern struct list_lru workingset_shadow_nodes;
 
 static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
 {
@@ -318,13 +316,16 @@ static inline void lru_cache_add_file(struct page *page)
 	ClearPageActive(page);
 	__lru_cache_add(page);
 }
+extern void lru_cache_add_active_or_unevictable(struct page *page,
+						struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-						  gfp_t gfp_mask, bool noswap);
+						  unsigned long nr_pages,
+						  gfp_t gfp_mask, int flags);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						struct zone *zone,
@@ -347,6 +348,10 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_MEMCG
+extern int sysctl_force_scan_thresh;
+#endif
+
 extern int page_evictable(struct page *page);
 extern void check_move_unevictable_pages(struct page **, int nr_pages);
 
@@ -361,9 +366,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 #ifdef CONFIG_MEMCG_SWAP
-extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
+extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
+extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
 #else
-static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
+static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+}
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
 {
 }
 #endif
@@ -423,7 +432,7 @@ extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
-extern void swapcache_free(swp_entry_t, struct page *page);
+extern void swapcache_free(swp_entry_t);
 extern int free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
@@ -486,7 +495,7 @@ static inline void swap_free(swp_entry_t swp)
 {
 }
 
-static inline void swapcache_free(swp_entry_t swp, struct page *page)
+static inline void swapcache_free(swp_entry_t swp)
 {
 }
 
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+	return !pte_none(pte) && !pte_present(pte);
 }
 #endif
 
@@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 {
 	swp_entry_t arch_entry;
 
-	BUG_ON(pte_file(pte));
 	if (pte_swp_soft_dirty(pte))
 		pte = pte_swp_clear_soft_dirty(pte);
 	arch_entry = __pte_to_swp_entry(pte);
@@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	swp_entry_t arch_entry;
 
 	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
-	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
 
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -58,6 +58,17 @@ extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 extern int proc_do_large_bitmap(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
 
+extern int proc_dointvec_virtual(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_doulongvec_minmax_virtual(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_dointvec_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_dostring_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_dointvec_minmax_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+
 /*
  * Register a set of sysctl names by calling register_sysctl_table
  * with an initialised array of struct ctl_table's.  An entry with 
@@ -166,6 +177,8 @@ struct ctl_path {
 	const char *procname;
 };
 
+extern int ve_allow_module_load;
+
 #ifdef CONFIG_SYSCTL
 
 void proc_sys_poll_notify(struct ctl_table_poll *poll);
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -254,6 +254,11 @@ void sysfs_put(struct sysfs_dirent *sd);
 
 int __must_check sysfs_init(void);
 
+#ifdef CONFIG_VE
+struct ve_struct;
+int sysfs_perms_set(char *path, struct ve_struct *ve, int mask);
+#endif
+
 #else /* CONFIG_SYSFS */
 
 static inline int sysfs_schedule_callback(struct kobject *kobj,
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -5,10 +5,13 @@
 #define __TASK_IO_ACCOUNTING_OPS_INCLUDED
 
 #include <linux/sched.h>
+#include <bc/io_acct.h>
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
+
 static inline void task_io_account_read(size_t bytes)
 {
+	ub_io_account_read(bytes);
 	current->ioac.read_bytes += bytes;
 }
 
@@ -22,6 +25,12 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
 }
 
 static inline void task_io_account_write(size_t bytes)
+{
+	ub_io_account_write(bytes);
+	current->ioac.write_bytes += bytes;
+}
+
+static inline void task_io_account_dirty(size_t bytes)
 {
 	current->ioac.write_bytes += bytes;
 }
@@ -73,6 +82,10 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p)
 	return 0;
 }
 
+static inline void task_io_account_dirty(size_t bytes)
+{
+}
+
 static inline void task_io_account_cancelled_write(size_t bytes)
 {
 }
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -14,11 +14,16 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 
 int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
 struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
-void task_work_run(void);
+void __task_work_run(bool exiting);
+
+static inline void task_work_run(void)
+{
+	return __task_work_run(false);
+}
 
 static inline void exit_task_work(struct task_struct *task)
 {
-	task_work_run();
+	__task_work_run(true);
 }
 
 #endif	/* _LINUX_TASK_WORK_H */
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -360,6 +360,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_una != tp->snd_up;
+}
+
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
 	u32			  tw_rcv_nxt;
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -56,13 +56,12 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 #ifdef __KERNEL__
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
-# define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
+				 __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
 #endif
 
-#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
-
 /*
  * flag set/clear/test wrappers
  * - pass TIF_xxxx constants to these functions
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -25,6 +25,7 @@
  * This controls the default maximum pid allocated to a process
  */
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
+#define PID_MAX_NS_DEFAULT	(PID_MAX_DEFAULT)
 
 /*
  * A maximum of 4 million PIDs should be enough for a while.
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -281,6 +281,10 @@ struct tty_struct {
 	struct tty_port *port;
 
 	RH_KABI_EXTEND(struct ld_semaphore ldisc_sem)
+
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+#endif
 };
 
 /* Each of a tty's open files has private_data pointing to tty_file_private */
@@ -313,6 +317,7 @@ struct tty_file_private {
 #define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
 #define TTY_HUPPED 		18	/* Post driver->hangup() */
 #define TTY_LDISC_HALTED	22	/* Line discipline is halted */
+#define TTY_CHARGED		23	/* Charged as ub resource */
 
 #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
 
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -26,6 +26,11 @@ typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
 typedef __kernel_mqd_t		mqd_t;
 
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
 typedef _Bool			bool;
 
 typedef __kernel_uid32_t	uid_t;
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -7,6 +7,9 @@
 #include <linux/err.h>
 #include <linux/rh_kabi.h>
 
+#define UIDHASH_BITS   (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ     (1 << UIDHASH_BITS)
+
 #define UID_GID_MAP_MAX_EXTENTS 5
 
 struct uid_gid_map {	/* 64 bytes -- 1 cache line */
@@ -49,6 +52,7 @@ struct user_namespace {
 	struct uid_gid_map	gid_map;
 	struct uid_gid_map	projid_map;
 	atomic_t		count;
+	struct hlist_head       uidhash_table[UIDHASH_SZ];
 	struct user_namespace	*parent;
 	kuid_t			owner;
 	kgid_t			group;
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -19,14 +19,31 @@ enum uts_proc {
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+#ifdef CONFIG_X86
+struct uts_vdso {
+	void			*addr;
+	struct page		**pages;
+	unsigned int		nr_pages;
+	unsigned int		size;
+	unsigned long		version_off;
+};
+#endif
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
 	unsigned int proc_inum;
 	RH_KABI_EXTEND(struct ucounts *ucounts)
+#ifdef CONFIG_X86
+	struct uts_vdso vdso;
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	struct uts_vdso vdso32;
+#endif
 };
 extern struct uts_namespace init_uts_ns;
+extern struct new_utsname virt_utsname;
 
 #ifdef CONFIG_UTS_NS
 static inline void get_uts_ns(struct uts_namespace *ns)
--- /dev/null
+++ b/include/linux/ve.h
@@ -0,0 +1,286 @@
+/*
+ *  include/linux/ve.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VE_H
+#define _LINUX_VE_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/vzstat.h>
+#include <linux/kobject.h>
+#include <linux/pid.h>
+#include <linux/path.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/ve_proto.h>
+#include <net/inet_frag.h>
+#include <linux/cgroup.h>
+#include <linux/kmapset.h>
+#include <linux/binfmts.h>
+
+struct tty_driver;
+struct file_system_type;
+struct veip_struct;
+struct nsproxy;
+struct user_namespace;
+struct cn_private;
+extern struct user_namespace init_user_ns;
+
+struct ve_struct {
+	struct cgroup_subsys_state	css;
+
+	const char		*ve_name;
+
+	struct list_head	ve_list;
+
+	envid_t			veid;
+
+	unsigned int		class_id;
+	struct rw_semaphore	op_sem;
+	int			is_running;
+	int			is_pseudosuper;
+	atomic_t		suspend;
+	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
+	__u64			features;
+
+	struct task_struct	*ve_kthread_task;
+	struct kthread_worker	ve_kthread_worker;
+
+	struct task_struct	*ve_umh_task;
+	struct kthread_worker	ve_umh_worker;
+
+	struct super_block	*dev_sb;
+	struct super_block	*devpts_sb;
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc	*binfmt_misc;
+#endif
+
+	struct list_head	devices;
+
+#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE)
+	struct veip_struct	*veip;
+	struct net_device	*_venet_dev;
+#endif
+
+/* per VE CPU stats */
+	struct timespec		start_timespec;		/* monotonic time */
+	struct timespec		real_start_timespec;	/* boot based time */
+	u64			start_jiffies;	/* Deprecated */
+
+	struct kstat_lat_pcpu_struct	sched_lat_ve;
+
+#ifdef CONFIG_INET
+	struct venet_stat       *stat;
+#ifdef CONFIG_VE_IPTABLES
+/* core/netfilter.c virtualization */
+	__u64			ipt_mask;
+#endif /* CONFIG_VE_IPTABLES */
+#endif
+
+	void			*log_state;
+#define VE_LOG_BUF_LEN		4096
+
+	unsigned long		down_at;
+	struct list_head	cleanup_list;
+	unsigned char		disable_net;
+	unsigned long		meminfo_val;
+	int _randomize_va_space;
+
+	int			odirect_enable;
+	int			fsync_enable;
+
+	u64			_uevent_seqnum;
+	struct nsproxy __rcu	*ve_ns;
+	struct task_struct	*init_task;
+	struct cred		*init_cred;
+	struct net		*ve_netns;
+
+	struct list_head	devmnt_list;
+	struct mutex		devmnt_mutex;
+
+	struct kmapset_key	ve_sysfs_perms;
+
+#ifdef CONFIG_AIO
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+#endif
+	atomic_t		netns_avail_nr;
+	int			netns_max_nr;
+	atomic_t		netif_avail_nr;
+	int			netif_max_nr;
+	atomic_t		mnt_nr;	/* number of present VE mounts */
+#ifdef CONFIG_COREDUMP
+	char 			core_pattern[CORENAME_MAX_SIZE];
+#endif
+#ifdef CONFIG_CONNECTOR
+	struct cn_private	*cn;
+#endif
+};
+
+struct ve_devmnt {
+	struct list_head	link;
+
+	dev_t                   dev;
+	char			*allowed_options;
+	char			*hidden_options; /* balloon_ino, etc. */
+};
+
+#define NETNS_MAX_NR_DEFAULT	256	/* number of net-namespaces per-VE */
+#define NETIF_MAX_NR_DEFAULT	256	/* number of net-interfaces per-VE */
+
+#define VE_MEMINFO_DEFAULT      1       /* default behaviour */
+#define VE_MEMINFO_SYSTEM       0       /* disable meminfo virtualization */
+
+#define capable_setveid() \
+	(ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
+
+extern int nr_ve;
+extern struct proc_dir_entry *proc_vz_dir;
+extern struct cgroup_subsys ve_subsys;
+
+extern unsigned int sysctl_ve_mount_nr;
+
+#ifdef CONFIG_VE
+#define ve_uevent_seqnum       (get_exec_env()->_uevent_seqnum)
+
+extern struct kobj_ns_type_operations ve_ns_type_operations;
+extern struct kobject * kobject_create_and_add_ve(const char *name,
+						struct kobject *parent);
+
+extern struct kmapset_set ve_sysfs_perms;
+
+extern int vz_security_family_check(struct net *net, int family, int type);
+extern int vz_security_protocol_check(struct net *net, int protocol);
+
+extern struct task_struct *kthread_create_on_node_ve(struct ve_struct *ve,
+					int (*threadfn)(void *data),
+					void *data, int node,
+					const char namefmt[], ...);
+
+#define kthread_create_ve(ve, threadfn, data, namefmt, arg...) \
+	kthread_create_on_node_ve(ve, threadfn, data, -1, namefmt, ##arg)
+
+#define kthread_run_ve(ve, threadfn, data, namefmt, ...)		   \
+({									   \
+	struct task_struct *__k						   \
+		= kthread_create_ve(ve, threadfn, data, namefmt, ## __VA_ARGS__); \
+	if (!IS_ERR(__k))						   \
+		wake_up_process(__k);					   \
+	__k;								   \
+})
+
+struct subprocess_info;
+extern int call_usermodehelper_fns_ve(struct ve_struct *ve,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data);
+
+static inline int
+call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
+		       char **envp, int wait)
+{
+	return call_usermodehelper_fns_ve(ve, path, argv, envp, wait,
+				       NULL, NULL, NULL);
+}
+void do_update_load_avg_ve(void);
+
+extern struct ve_struct *get_ve(struct ve_struct *ve);
+extern void put_ve(struct ve_struct *ve);
+
+struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id);
+
+static inline struct ve_struct *cgroup_ve(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, ve_subsys_id),
+			struct ve_struct, css);
+}
+
+extern unsigned long long ve_relative_clock(struct timespec * ts);
+extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp);
+extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp);
+
+void ve_stop_ns(struct pid_namespace *ns);
+void ve_exit_ns(struct pid_namespace *ns);
+
+extern bool current_user_ns_initial(void);
+struct user_namespace *ve_init_user_ns(void);
+
+int ve_net_hide_sysctl(struct net *net);
+
+#ifdef CONFIG_TTY
+#define MAX_NR_VTTY_CONSOLES	(12)
+extern struct tty_driver *vtty_driver(dev_t dev, int *index);
+extern struct tty_driver *vtty_console_driver(int *index);
+extern int vtty_open_master(envid_t veid, int idx);
+extern void vtty_release(struct tty_struct *tty, struct tty_struct *o_tty,
+			 int *tty_closing, int *o_tty_closing);
+extern bool vtty_is_master(struct tty_struct *tty);
+#endif /* CONFIG_TTY */
+
+extern struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp);
+
+#else	/* CONFIG_VE */
+
+#define ve_uevent_seqnum uevent_seqnum
+
+static inline int vz_security_family_check(struct net *net, int family, int type) { return 0; }
+static inline int vz_security_protocol_check(struct net *net, int protocol) { return 0; }
+
+#define ve_utsname	system_utsname
+#define get_ve(ve)	(NULL)
+#define put_ve(ve)	do { } while (0)
+
+static inline void ve_stop_ns(struct pid_namespace *ns) { }
+static inline void ve_exit_ns(struct pid_namespace *ns) { }
+
+static inline bool current_user_ns_initial(void)
+{
+	return current_user_ns() == init_cred.user_ns;
+}
+
+static inline struct user_namespace *ve_init_user_ns(void)
+{
+	return &init_user_ns;
+}
+
+#define kthread_create_on_node_ve(ve, threadfn, data, node, namefmt...)	\
+	kthread_create_on_node(threadfn, data, node, namefmt)
+
+#define kobject_create_and_add_ve		kobject_create_and_add
+
+static inline void monotonic_abs_to_ve(clockid_t which_clock,
+				struct timespec *tp) { }
+static inline void monotonic_ve_to_abs(clockid_t which_clock,
+				struct timespec *tp) { }
+
+static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+{
+	return NULL;
+}
+#endif	/* CONFIG_VE */
+
+struct seq_file;
+struct kernel_cpustat;
+
+#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avenrun);
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat);
+#else
+static inline int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+static inline int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avenrun) { return -ENOSYS; }
+static inline int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat) { return -ENOSYS; }
+#endif
+
+#endif /* _LINUX_VE_H */
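
To illustrate the per-VE helpers declared above (a hedged sketch; the demo_*
names are invented and error handling is minimal): spawn a kernel thread
attached to a container and run a userspace helper inside it:

#include <linux/ve.h>
#include <linux/kthread.h>
#include <linux/kmod.h>
#include <linux/err.h>

static int demo_ve_worker(void *data)
{
	struct ve_struct *ve = data;

	pr_info("demo worker running in VE %s\n", ve_name(ve));
	return 0;
}

static int demo_start_in_ve(struct ve_struct *ve)
{
	struct task_struct *t;
	char *argv[] = { "/bin/true", NULL };
	char *envp[] = { "HOME=/", NULL };

	t = kthread_run_ve(ve, demo_ve_worker, ve, "demo_ve_worker");
	if (IS_ERR(t))
		return PTR_ERR(t);

	/* executed with the container's namespaces and credentials */
	return call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_PROC);
}
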
--- /dev/null
+++ b/include/linux/ve_proto.h
@@ -0,0 +1,135 @@
+/*
+ *  include/linux/ve_proto.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VE_H__
+#define __VE_H__
+
+struct ve_struct;
+struct task_struct;
+struct seq_file;
+struct net;
+
+#ifdef CONFIG_VE
+
+extern struct ve_struct ve0;
+
+static inline struct ve_struct *get_ve0(void)
+{
+	return &ve0;
+}
+
+static inline bool ve_is_super(struct ve_struct *ve)
+{
+	return ve == &ve0;
+}
+
+#define get_exec_env()		(current->task_ve)
+
+const char *ve_name(struct ve_struct *ve);
+
+/* must be called under rcu_read_lock if task != current */
+const char *task_ve_name(struct task_struct *task);
+
+extern int ve_task_count(struct ve_struct *);
+
+typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *);
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t);
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t);
+
+#if defined(CONFIG_INET) && defined(CONFIG_VE_NETDEV)
+int venet_init(void);
+#endif
+
+extern struct list_head ve_list_head;
+#define for_each_ve(ve)	list_for_each_entry((ve), &ve_list_head, ve_list)
+extern struct mutex ve_list_lock;
+extern struct ve_struct *get_ve_by_id(envid_t);
+
+extern int nr_threads_ve(struct ve_struct *ve);
+
+enum {
+	VE_SS_CHAIN,
+	VE_SHUTDOWN_CHAIN,
+
+	VE_MAX_CHAINS
+};
+
+typedef int ve_hook_init_fn(void *data);
+typedef void ve_hook_fini_fn(void *data);
+
+struct ve_hook
+{
+	ve_hook_init_fn *init;
+	ve_hook_fini_fn *fini;
+	struct module *owner;
+
+	/* Functions are called in ascending priority */
+	int priority;
+
+	/* Private part */
+	struct list_head list;
+};
+
+enum {
+	HOOK_PRIO_DEFAULT = 0,
+
+	HOOK_PRIO_FS = HOOK_PRIO_DEFAULT,
+
+	HOOK_PRIO_NET_PRE,
+	HOOK_PRIO_NET,
+	HOOK_PRIO_NET_POST,
+	HOOK_PRIO_NET_ACCT = 100,
+	HOOK_PRIO_NET_ACCT_V6,
+
+	HOOK_PRIO_AFTERALL = INT_MAX-1,
+	HOOK_PRIO_FINISHING = INT_MAX,
+};
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos);
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void ve_seq_stop(struct seq_file *m, void *v);
+
+extern int ve_hook_iterate_init(int chain, void *data);
+extern void ve_hook_iterate_fini(int chain, void *data);
+
+extern void ve_hook_register(int chain, struct ve_hook *vh);
+extern void ve_hook_unregister(struct ve_hook *vh);
+#else /* CONFIG_VE */
+#define ve_hook_register(ch, vh)	do { } while (0)
+#define ve_hook_unregister(ve)		do { } while (0)
+
+static inline struct ve_struct *get_ve0(void)
+{
+	return NULL;
+}
+
+static inline struct ve_struct *get_exec_env(void)
+{
+	return NULL;
+}
+
+static inline bool ve_is_super(struct ve_struct *ve)
+{
+	return true;
+}
+
+static inline const char *ve_name(struct ve_struct *ve)
+{
+	return "0";
+}
+
+static inline const char *task_ve_name(struct task_struct *task)
+{
+	return "0";
+}
+
+#define nr_threads_ve(ve)	(nr_threads)
+
+#endif /* CONFIG_VE */
+#endif
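
A sketch of how a subsystem would plug into the hook chains above (module
boilerplate omitted; the demo_* names are invented):

static int demo_ve_init(void *data)
{
	struct ve_struct *ve = data;

	pr_debug("setting up demo state for VE %s\n", ve_name(ve));
	return 0;			/* non-zero aborts container start */
}

static void demo_ve_fini(void *data)
{
	/* tear down per-VE state here */
}

static struct ve_hook demo_ve_hook = {
	.init		= demo_ve_init,
	.fini		= demo_ve_fini,
	.owner		= THIS_MODULE,
	.priority	= HOOK_PRIO_DEFAULT,
};

/* at module init/exit:
 *	ve_hook_register(VE_SS_CHAIN, &demo_ve_hook);
 *	ve_hook_unregister(&demo_ve_hook);
 */
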
--- /dev/null
+++ b/include/linux/veip.h
@@ -0,0 +1,22 @@
+/*
+ *  include/linux/veip.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VE_IP_H_
+#define __VE_IP_H_
+
+struct ve_addr_struct {
+	int family;
+	__u32 key[4];
+};
+
+struct sockaddr;
+
+extern void veaddr_print(char *, int, struct ve_addr_struct *);
+extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr);
+
+#endif
--- /dev/null
+++ b/include/linux/venet.h
@@ -0,0 +1,103 @@
+/*
+ *  include/linux/venet.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _VENET_H
+#define _VENET_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/veip.h>
+#include <linux/netdevice.h>
+
+#define VEIP_HASH_SZ 512
+
+struct ve_struct;
+struct venet_stat;
+struct venet_stats {
+	struct net_device_stats	stats;
+	struct net_device_stats	*real_stats;
+};
+
+struct ip_entry_struct
+{
+	struct ve_addr_struct	addr;
+	struct ve_struct	*active_env;
+	struct veip_struct	*tgt_veip;
+	struct hlist_node 	ip_hash;
+	union {
+		struct list_head 	ve_list;
+		struct rcu_head		rcu;
+	};
+};
+
+struct ext_entry_struct
+{
+	struct list_head	list;
+	struct ve_addr_struct	addr;
+	struct rcu_head		rcu;
+};
+
+struct veip_struct
+{
+	struct list_head	src_lh;
+	struct list_head	dst_lh;
+	struct list_head	ip_lh;
+	struct list_head	list;
+	struct list_head	ext_lh;
+	envid_t			veid;
+	struct venet_stat	*stat;
+	struct rcu_head		rcu;
+};
+
+struct veip_pool_ops {
+	int (*veip_create)(struct ve_struct *);
+	void (*veip_release)(struct ve_struct *);
+	void (*veip_free)(struct veip_struct *);
+	struct ve_struct *(*veip_lookup)(struct ve_struct *, struct sk_buff *);
+};
+
+extern struct veip_pool_ops *veip_pool_ops;
+
+static inline struct net_device_stats *
+venet_stats(struct net_device *dev, int cpu)
+{
+	struct venet_stats *stats;
+	stats = (struct venet_stats*)dev->ml_priv;
+	return per_cpu_ptr(stats->real_stats, cpu);
+}
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
+void ip_entry_unhash(struct ip_entry_struct *entry);
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *);
+
+struct veip_struct *veip_findcreate(envid_t veid);
+int veip_put(struct veip_struct *veip);
+void veip_cleanup(void);
+
+int in4_to_veaddr(const char *addr, struct ve_addr_struct *veaddr);
+int in6_to_veaddr(const char *addr, struct ve_addr_struct *veaddr);
+
+extern struct list_head veip_lh;
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr);
+
+extern struct hlist_head ip_entry_hash_table[];
+extern spinlock_t veip_lock;
+
+extern void (*venet_free_stat)(struct ve_struct *);
+
+#define NIPQUAD(addr) \
+	((unsigned char *)&addr)[0], \
+	((unsigned char *)&addr)[1], \
+	((unsigned char *)&addr)[2], \
+	((unsigned char *)&addr)[3]
+
+#endif
--- /dev/null
+++ b/include/linux/virtinfo.h
@@ -0,0 +1,84 @@
+/*
+ *  include/linux/virtinfo.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __LINUX_VIRTINFO_H
+#define __LINUX_VIRTINFO_H
+
+#include <linux/kernel.h>
+#include <linux/page-flags.h>
+#include <linux/notifier.h>
+#include <linux/mmzone.h>
+
+struct vnotifier_block
+{
+	int (*notifier_call)(struct vnotifier_block *self,
+			unsigned long, void *, int);
+	struct vnotifier_block *next;
+	int priority;
+};
+
+extern struct semaphore virtinfo_sem;
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
+int virtinfo_notifier_call(int type, unsigned long n, void *data);
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data);
+
+struct page_info {
+	unsigned long nr_file_dirty;
+	unsigned long nr_writeback;
+	unsigned long nr_anon_pages;
+	unsigned long nr_file_mapped;
+	unsigned long nr_slab_rec;
+	unsigned long nr_slab_unrec;
+	unsigned long nr_pagetable;
+	unsigned long nr_unstable_nfs;
+	unsigned long nr_bounce;
+	unsigned long nr_writeback_temp;
+};
+
+struct sysinfo;
+struct user_beancounter;
+
+struct meminfo {
+	struct sysinfo *si;
+	struct user_beancounter *ub;
+	unsigned long meminfo_val;
+	unsigned long pages[NR_LRU_LISTS];
+	unsigned long cached, dirty_pages, writeback_pages, locked, shmem;
+	unsigned long slab_reclaimable, slab_unreclaimable;
+};
+
+struct seq_file;
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val);
+
+#define VIRTINFO_MEMINFO	0
+#define VIRTINFO_SYSINFO	2
+#define VIRTINFO_VMSTAT		3
+#define VIRTINFO_OOMKILL	4
+
+#define VIRTINFO_IO_ACCOUNT	0
+#define VIRTINFO_IO_PREPARE	1
+#define VIRTINFO_IO_JOURNAL	2
+#define VIRTINFO_IO_READAHEAD	3
+#define VIRTINFO_IO_CONGESTION	4
+#define VIRTINFO_IO_OP_ACCOUNT	5
+#define VIRTINFO_IO_BALANCE_DIRTY	6
+#define VIRTINFO_IO_FUSE_REQ	7
+
+enum virt_info_types {
+	VITYPE_GENERAL,
+	VITYPE_QUOTA,
+	VITYPE_IO,
+
+	VIRT_TYPES
+};
+
+#endif /* __LINUX_VIRTINFO_H */
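
For illustration only, a consumer of the notifier chain above that caps the
RAM a container reports; the return-value convention (passing on the
accumulated result) is an assumption:

static int demo_meminfo_call(struct vnotifier_block *self,
			     unsigned long event, void *arg, int old_ret)
{
	struct meminfo *mi = arg;

	if (event == VIRTINFO_MEMINFO && mi->meminfo_val > 1)
		/* clamp what /proc/meminfo shows inside the container */
		mi->si->totalram = min(mi->si->totalram, mi->meminfo_val);

	return old_ret;
}

static struct vnotifier_block demo_meminfo_nb = {
	.notifier_call	= demo_meminfo_call,
	.priority	= 0,
};

/* virtinfo_notifier_register(VITYPE_GENERAL, &demo_meminfo_nb); */
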
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -16,6 +16,8 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #define VM_USERMAP	0x00000008	/* suitable for remap_vmalloc_range */
 #define VM_VPAGES	0x00000010	/* buffer for pages was vmalloc'ed */
 #define VM_UNLIST	0x00000020	/* vm_struct is not listed in vmlist */
+#define VM_NO_GUARD	0x00000040      /* don't add guard page */
+#define VM_KASAN	0x00000080      /* has allocated kasan shadow memory */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -66,6 +68,8 @@ static inline void vmalloc_init(void)
 
 extern void *vmalloc(unsigned long size);
 extern void *vzalloc(unsigned long size);
+extern void *vmalloc_account(unsigned long size);
+extern void *vzalloc_account(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
@@ -75,7 +79,10 @@ extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller);
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller);
+extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
+
 extern void vfree(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -96,8 +103,12 @@ void vmalloc_sync_all(void);
 
 static inline size_t get_vm_area_size(const struct vm_struct *area)
 {
-	/* return actual size without guard page */
-	return area->size - PAGE_SIZE;
+	if (!(area->flags & VM_NO_GUARD))
+		/* return actual size without guard page */
+		return area->size - PAGE_SIZE;
+	else
+		return area->size;
+
 }
 
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
--- /dev/null
+++ b/include/linux/vzctl.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzctl.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZCTL_H
+#define _LINUX_VZCTL_H
+
+#include <linux/list.h>
+
+struct module;
+struct inode;
+struct file;
+struct vzioctlinfo {
+	unsigned type;
+	int (*ioctl)(struct file *, unsigned int, unsigned long);
+	int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	struct module *owner;
+	struct list_head list;
+};
+
+extern void vzioctl_register(struct vzioctlinfo *inf);
+extern void vzioctl_unregister(struct vzioctlinfo *inf);
+
+#endif
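
A sketch of registering a handler with this dispatcher (illustrative; the
.type value is just an example magic):

static int demo_vzctl_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	switch (cmd) {
	default:
		return -ENOTTY;
	}
}

static struct vzioctlinfo demo_ioctl_info = {
	.type		= 'X',			/* hypothetical ioctl magic */
	.ioctl		= demo_vzctl_ioctl,
	.compat_ioctl	= demo_vzctl_ioctl,
	.owner		= THIS_MODULE,
};

/* vzioctl_register(&demo_ioctl_info) at module init,
 * vzioctl_unregister(&demo_ioctl_info) at exit. */
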
--- /dev/null
+++ b/include/linux/vzevent.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzevent.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __LINUX_VZ_EVENT_H__
+#define __LINUX_VZ_EVENT_H__
+
+#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
+extern int vzevent_send(int msg, const char *attrs_fmt, ...);
+#else
+static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
+{
+	return 0;
+}
+#endif
+
+enum {
+	VE_EVENT_MOUNT,
+	VE_EVENT_UMOUNT,
+	VE_EVENT_START,
+	VE_EVENT_STOP,
+	VE_EVENT_REBOOT,
+};
+
+#endif /* __LINUX_VZ_EVENT_H__ */
--- /dev/null
+++ b/include/linux/vziolimit.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vziolimit.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZIOLIMIT_H
+#define _LINUX_VZIOLIMIT_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VZIOLIMITTYPE 'I'
+
+struct iolimit_state {
+	unsigned int id;
+	unsigned int speed;
+	unsigned int burst;
+	unsigned int latency;
+};
+
+#define VZCTL_SET_IOLIMIT	_IOW(VZIOLIMITTYPE, 0, struct iolimit_state)
+#define VZCTL_GET_IOLIMIT	_IOR(VZIOLIMITTYPE, 1, struct iolimit_state)
+#define VZCTL_SET_IOPSLIMIT	_IOW(VZIOLIMITTYPE, 2, struct iolimit_state)
+#define VZCTL_GET_IOPSLIMIT	_IOR(VZIOLIMITTYPE, 3, struct iolimit_state)
+
+#endif /* _LINUX_VZIOLIMIT_H */
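
From userspace these would typically be issued against the vzctl control
device; a hedged sketch assuming the conventional /dev/vzctl node and that
.speed/.burst are in bytes per second and .latency in milliseconds:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vziolimit.h>

int demo_set_iolimit(unsigned int veid, unsigned int bytes_per_sec)
{
	struct iolimit_state state = {
		.id      = veid,
		.speed   = bytes_per_sec,
		.burst   = bytes_per_sec * 2,	/* allow short bursts */
		.latency = 10 * 1000,		/* assumed: milliseconds */
	};
	int fd = open("/dev/vzctl", O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, VZCTL_SET_IOLIMIT, &state) < 0) {
		perror("VZCTL_SET_IOLIMIT");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
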
--- /dev/null
+++ b/include/linux/vziptable_defs.h
@@ -0,0 +1,21 @@
+/*
+ *  include/linux/vziptable_defs.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZIPTABLE_DEFS_H
+#define _LINUX_VZIPTABLE_DEFS_H
+
+#include <linux/types.h>
+#include <linux/ve.h>
+
+#include <uapi/linux/vziptable_defs.h>
+
+static inline bool mask_ipt_allow(__u64 permitted, __u64 mask)
+{
+	return (permitted & mask) == mask;
+}
+
+#endif /* _LINUX_VZIPTABLE_DEFS_H */
--- /dev/null
+++ b/include/linux/vznetstat.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/vznetstat.h
+ *
+ * Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _VZNETSTAT_H
+#define _VZNETSTAT_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#define TC_CLASS_MAX	16
+
+struct acct_counter {
+	u64	bytes;
+	u32	pkts;
+	u32	__pad;
+};
+
+enum {
+	ACCT_IN,
+	ACCT_OUT,
+	ACCT_MAX
+};
+
+struct acct_stat {
+	struct acct_counter cnt[TC_CLASS_MAX][ACCT_MAX];
+};
+
+struct venet_stat {
+	struct list_head list;
+	envid_t  veid;
+	u16 base;
+	unsigned long flags;
+	atomic_t users;
+
+	struct acct_stat __percpu *ipv4_stat;
+	struct acct_stat __percpu *ipv6_stat;
+};
+
+static inline int venet_acct_skb_size(struct sk_buff *skb)
+{
+	return skb->data_len + (skb->tail - skb->network_header);
+}
+
+struct ve_addr_struct;
+
+#if IS_ENABLED(CONFIG_VE_NETDEV_ACCOUNTING)
+struct venet_stat *venet_acct_find_stat(envid_t veid);
+struct venet_stat *venet_acct_find_create_stat(envid_t veid);
+static inline void venet_acct_get_stat(struct venet_stat *stat)
+{
+	atomic_inc(&stat->users);
+}
+void   venet_acct_put_stat(struct venet_stat *);
+
+void venet_acct_classify_add_incoming(struct venet_stat *, struct sk_buff *skb);
+void venet_acct_classify_add_outgoing(struct venet_stat *, struct sk_buff *skb);
+void venet_acct_classify_sub_outgoing(struct venet_stat *, struct sk_buff *skb);
+
+void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size);
+void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size);
+
+#else /* !CONFIG_VE_NETDEV_ACCOUNTING */
+static inline void venet_acct_get_stat(struct venet_stat *stat) { }
+static inline void venet_acct_put_stat(struct venet_stat *stat) { }
+
+static inline void venet_acct_classify_add_incoming(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+static inline void venet_acct_classify_add_outgoing(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+static inline void venet_acct_classify_sub_outgoing(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+
+static inline void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size) {}
+static inline void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size) {}
+#endif /* CONFIG_VE_NETDEV_ACCOUNTING */
+
+#endif
--- /dev/null
+++ b/include/linux/vzprivnet.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzprivnet.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __IP_VZPRIVNET_H__
+#define __IP_VZPRIVNET_H__
+
+extern int vzpn_handle_bridged;
+extern int vzpn_filter_host;
+
+struct proc_dir_entry;
+extern struct proc_dir_entry *vzpriv_proc_dir;
+
+struct seq_file;
+typedef void (*vzprivnet_show_fn)(struct seq_file *);
+void vzprivnet_reg_show(vzprivnet_show_fn);
+void vzprivnet_unreg_show(vzprivnet_show_fn);
+
+#define is_eol(ch)	((ch) == '\0' || (ch) == '\n')
+
+#define VZPRIVNET_STRONG       0
+#define VZPRIVNET_WEAK         1
+#define VZPRIVNET_INET         2
+
+#endif
--- /dev/null
+++ b/include/linux/vzstat.h
@@ -0,0 +1,127 @@
+/*
+ *  include/linux/vzstat.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VZSTAT_H__
+#define __VZSTAT_H__
+
+#include <linux/mmzone.h>
+
+struct swap_cache_info_struct {
+	unsigned long add_total;
+	unsigned long del_total;
+	unsigned long find_success;
+	unsigned long find_total;
+};
+
+struct kstat_lat_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+};
+struct kstat_lat_pcpu_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+	seqcount_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct kstat_lat_struct {
+	struct kstat_lat_snap_struct cur, last;
+	u64 avg[3];
+};
+struct kstat_lat_pcpu_struct {
+	struct kstat_lat_pcpu_snap_struct *cur;
+	u64 max_snap;
+	struct kstat_lat_snap_struct last;
+	u64 avg[3];
+};
+
+struct kstat_perf_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+};
+
+struct kstat_perf_pcpu_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+	seqcount_t lock;
+};
+
+struct kstat_perf_pcpu_struct {
+	struct kstat_perf_pcpu_snap_struct *cur;
+	struct kstat_perf_snap_struct last;
+};
+
+struct kstat_zone_avg {
+	unsigned long		free_pages_avg[3],
+				nr_active_avg[3],
+				nr_inactive_avg[3];
+};
+
+enum {
+	KSTAT_ALLOCSTAT_ATOMIC,
+	KSTAT_ALLOCSTAT_LOW,
+	KSTAT_ALLOCSTAT_HIGH,
+	KSTAT_ALLOCSTAT_LOW_MP,
+	KSTAT_ALLOCSTAT_HIGH_MP,
+	KSTAT_ALLOCSTAT_NR,
+};
+
+struct kernel_stat_glob {
+	unsigned long nr_unint_avg[3];
+
+	unsigned long alloc_fails[NR_CPUS][KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct sched_lat;
+	struct kstat_lat_pcpu_struct page_in;
+	struct kstat_lat_struct swap_in;
+
+	struct kstat_perf_pcpu_struct ttfp, cache_reap,
+			refill_inact, shrink_icache, shrink_dcache;
+
+	struct kstat_zone_avg zone_avg[MAX_NR_ZONES];
+} ____cacheline_aligned;
+
+extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
+extern spinlock_t kstat_glb_lock;
+
+extern void kstat_init(void);
+
+#ifdef CONFIG_VE
+
+extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
+			   u64 cpu_time);
+
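+/*
+ * KSTAT_PERF_ENTER()/KSTAT_PERF_LEAVE() bracket a code region: ENTER
+ * samples the start time and the task's accumulated sleep time, LEAVE
+ * accounts the wall-clock duration and, as cpu time, the duration minus
+ * the time spent sleeping into kstat_glob.<name>.
+ */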
+#define KSTAT_PERF_ENTER(name)				\
+	u64 start, sleep_time;				\
+							\
+	start = ktime_to_ns(ktime_get());		\
+	sleep_time = current->se.statistics->sum_sleep_runtime; \
+
+#define KSTAT_PERF_LEAVE(name)				\
+	start = ktime_to_ns(ktime_get()) - start;	\
+	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
+	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
+
+extern void KSTAT_LAT_ADD(struct kstat_lat_struct *p, u64 dur);
+extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, u64 dur);
+extern void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p);
+extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
+
+#else
+#define KSTAT_PERF_ADD(ptr, real_time, cpu_time)
+#define KSTAT_PERF_ENTER(name)
+#define KSTAT_PERF_LEAVE(name)
+#define KSTAT_LAT_ADD(p, dur)
+#define KSTAT_LAT_PCPU_ADD(p, cpu, dur)
+#define KSTAT_LAT_UPDATE(p)
+#define KSTAT_LAT_PCPU_UPDATE(p)
+#endif
+
+#endif /* __VZSTAT_H__ */
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -744,6 +744,32 @@ do {									\
 	__ret;								\
 })
 
+#define __wait_event_killable_exclusive(wq, condition, ret)		\
+do {									\
+	DEFINE_WAIT(__wait);						\
+									\
+	for (;;) {							\
+		prepare_to_wait_exclusive(&wq, &__wait, TASK_KILLABLE);	\
+		if (condition)						\
+			break;						\
+		if (!fatal_signal_pending(current)) {			\
+			schedule();					\
+			continue;					\
+		}							\
+		ret = -ERESTARTSYS;					\
+		break;							\
+	}								\
+	finish_wait(&wq, &__wait);					\
+} while (0)
+
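+/*
+ * wait_event_killable_exclusive - sleep as an exclusive waiter until
+ * @condition becomes true or a fatal signal is received.  Returns 0 when
+ * the condition is met and -ERESTARTSYS if the task was killed.
+ */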
+#define wait_event_killable_exclusive(wq, condition)			\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__wait_event_killable_exclusive(wq, condition, __ret);	\
+	__ret;								\
+})
 
 #define __wait_event_lock_irq(wq, condition, lock, cmd)			\
 do {									\
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -92,13 +92,18 @@ struct writeback_control {
  */	
 struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+void writeback_inodes_sb_ub(struct super_block *, struct user_beancounter *,
+							enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb_ub(struct super_block *, struct user_beancounter *ub);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
@@ -131,6 +136,7 @@ extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -147,6 +153,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -10,7 +10,6 @@
 #ifndef _LINUX_XATTR_H
 #define _LINUX_XATTR_H
 
-
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -168,6 +168,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
+void __ipv6_sock_mc_close(struct sock *sk);
 void ipv6_sock_mc_close(struct sock *sk);
 bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr);
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -53,7 +53,7 @@ struct unix_sock {
 	struct sock		sk;
 	struct unix_address     *addr;
 	struct path		path;
-	struct mutex		readlock;
+	struct mutex		iolock, bindlock;
 	struct sock		*peer;
 	struct list_head	link;
 	atomic_long_t		inflight;
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -61,6 +61,7 @@ struct dst_entry {
 #define DST_XFRM_TUNNEL		0x0080
 #define DST_XFRM_QUEUE		0x0100
 #define DST_METADATA		0x0200
+#define DST_FREE		0x0400
 
 	RH_KABI_DEPRECATE(unsigned short, pending_confirm)
 	short			error;
@@ -193,6 +194,11 @@ dst_metric_raw(const struct dst_entry *dst, const int metric)
 	return p[metric-1];
 }
 
+void dst_dump_one(struct dst_entry *d);
+void ip_rt_dump_dsts(void);
+void dst_cache_dump(void);
+extern void (*ip6_rt_dump_dsts)(void);
+
 static inline u32
 dst_metric(const struct dst_entry *dst, const int metric)
 {
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -203,14 +203,27 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd)
 #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt);
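+/*
+ * Fold a per-cpu mib counter at offset @offt over the CPUs in @mask;
+ * snmp_fold_field() keeps the original behaviour by folding over
+ * cpu_possible_mask.
+ */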
+unsigned long __snmp_fold_field(void __percpu *mib[], int offt, const struct cpumask *mask);
+static inline unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+{
+	return __snmp_fold_field(mib, offt, cpu_possible_mask);
+}
 #if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off);
+u64 __snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off,
+			const struct cpumask *mask);
+static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off)
+{
+	return __snmp_fold_field64(mib, offt, sync_off, cpu_possible_mask);
+}
 #else
 static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off)
 {
 	return snmp_fold_field(mib, offt);
 }
+static inline unsigned long __snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off, const struct cpumask *mask)
+{
+	return __snmp_fold_field(mib, offt, mask);
+}
 #endif
 int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align);
 
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 
 void ip6_route_input(struct sk_buff *skb);
+void __ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr);
 
 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
 				   struct flowi6 *fl6);
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -75,6 +75,13 @@ struct net {
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	int			ifindex;
 
+#ifdef CONFIG_VE
+	struct ve_struct	*owner_ve;
+#ifdef CONFIG_VE_IPTABLES
+	__u64			_iptables_modules;
+#endif
+#endif
+
 	/* core fib_rules */
 	struct list_head	rules_ops;
 
@@ -248,6 +255,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 
 extern void net_drop_ns(void *);
 
+/* Returns whether curr can mess with net's objects */
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return net_eq(curr, &init_net) || net_eq(curr, net);
+}
 #else
 
 static inline struct net *get_net(struct net *net)
@@ -271,6 +283,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 }
 
 #define net_drop_ns NULL
+
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return 1;
+}
 #endif
 
 
@@ -310,6 +327,16 @@ static inline struct net *read_pnet(possible_net_t const *pnet)
 #define __net_initconst	__initconst
 #endif
 
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+static inline void allow_conntrack_allocation(struct net *net)
+{
+	net->ct.can_alloc = true;
+	smp_wmb(); /* Pairs with rmb in resolve_normal_ct() */
+}
+#else
+static inline void allow_conntrack_allocation(struct net *net) { }
+#endif
+
 int peernet2id_alloc(struct net *net, struct net *peer);
 int peernet2id(struct net *net, struct net *peer);
 bool peernet_has_id(struct net *net, struct net *peer);
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -9,7 +9,6 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 extern unsigned int nf_ct_expect_hsize;
-extern unsigned int nf_ct_expect_max;
 
 struct nf_conntrack_expect {
 	/* Conntrack expectation list member */
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -93,7 +93,7 @@ struct nf_log_buf;
 
 struct nf_log_buf *nf_log_buf_open(void);
 __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...);
-void nf_log_buf_close(struct nf_log_buf *m);
+void nf_log_buf_close(struct nf_log_buf *m, struct ve_struct *ve);
 
 /* common logging functions */
 int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -525,10 +525,10 @@ static inline int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
 {
 	return 0;
 }
-static int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
-				 u32 offset,
-				 unsigned long bitmap,
-				 gfp_t flags)
+static inline int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
+					u32 offset,
+					unsigned long bitmap,
+					gfp_t flags)
 {
 	return 0;
 }
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -107,8 +107,12 @@ struct ct_pcpu {
 
 struct netns_ct {
 	atomic_t		count;
+	bool			can_alloc; /* Initialized to 0 by net_alloc() */
+	unsigned int		max;
 	unsigned int		expect_count;
+	unsigned int		expect_max;
 #ifdef CONFIG_SYSCTL
+	struct ctl_table_header	*netfilter_header;
 	struct ctl_table_header	*sysctl_header;
 	struct ctl_table_header	*acct_sysctl_header;
 	struct ctl_table_header	*tstamp_sysctl_header;
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -42,7 +42,13 @@ static inline void *net_generic(const struct net *net, int id)
 	ptr = ng->ptr[id - 1];
 	rcu_read_unlock();
 
+#ifndef CONFIG_VE
+	/* May be NULL for disabled VE features */
 	BUG_ON(!ptr);
+#endif
 	return ptr;
 }
+
+extern int net_assign_generic(struct net *net, int id, void *data);
+
 #endif
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -23,6 +23,11 @@
 
 extern struct proto raw_prot;
 
+extern struct raw_hashinfo raw_v4_hashinfo;
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, __be32 raddr,
+			     __be32 laddr, int dif);
+
 void raw_icmp_error(struct sk_buff *, int, u32);
 int raw_local_deliver(struct sk_buff *, int);
 
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,13 @@
 
 #include <net/protocol.h>
 
+extern struct raw_hashinfo raw_v6_hashinfo;
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, const struct in6_addr *loc_addr,
+			     const struct in6_addr *rmt_addr, int dif);
+
+int raw_abort(struct sock *sk, int err);
+
 void raw6_icmp_error(struct sk_buff *, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32);
 bool raw6_local_deliver(struct sk_buff *, int);
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -214,6 +214,7 @@ static inline void ip_rt_put(struct rtable *rt)
 #define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)
 
 extern const __u8 ip_tos2prio[16];
+extern int ip_rt_src_check;
 
 static inline char rt_tos2priority(u8 tos)
 {
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -813,11 +813,15 @@ struct psched_ratecfg {
 	u16	overhead;
 	u8	linklayer;
 	u8	shift;
+	u32	mpu;
 };
 
 static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
 				unsigned int len)
 {
+	if (len < r->mpu)
+		len = r->mpu;
+
 	len += r->overhead;
 
 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,11 +54,7 @@
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
-#ifdef __GENKSYMS__
-#include <linux/res_counter.h>
-#else
 #include <linux/page_counter.h>
-#endif
 #include <linux/memcontrol.h>
 #include <linux/static_key.h>
 #include <linux/aio.h>
@@ -376,8 +372,13 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
 	struct sk_buff_head	sk_write_queue;
+
+	/*
+	 * Because of non atomicity rules, all
+	 * changes are protected by socket lock.
+	 */
 	kmemcheck_bitfield_begin(flags);
-	unsigned int		sk_shutdown  : 2,
+	unsigned int		sk_padding  : 2,
 #ifdef __GENKSYMS__
 				sk_no_check : 2,
 #else
@@ -389,6 +390,7 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
 	kmemcheck_bitfield_end(flags);
+
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
 	u32			sk_pacing_rate; /* bytes per second */
@@ -397,6 +399,7 @@ struct sock {
 	int			sk_gso_type;
 	unsigned int		sk_gso_max_size;
 	u16			sk_gso_max_segs;
+	u8			sk_shutdown;
 	int			sk_rcvlowat;
 	unsigned long	        sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
@@ -1250,6 +1253,7 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 					      unsigned long amt,
 					      int *parent_status)
 {
+	memcg_charge_kmem_nofail(prot->memcg, amt);
 	page_counter_charge(prot->memory_allocated, amt);
 
 	if (page_counter_read(prot->memory_allocated) >
@@ -1261,6 +1265,7 @@ static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
 					      unsigned long amt)
 {
 	page_counter_uncharge(prot->memory_allocated, amt);
+	memcg_uncharge_kmem(prot->memcg, amt);
 }
 
 static inline long
@@ -1473,6 +1478,7 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
 	if (!sk_has_account(sk))
 		return true;
+
 	return size<= sk->sk_forward_alloc ||
 		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
 		skb_pfmemalloc(skb);
@@ -2377,6 +2383,13 @@ static inline void sk_change_net(struct sock *sk, struct net *net)
 	}
 }
 
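+/*
+ * Switch @sk to @net, taking a reference on the new namespace and
+ * dropping the reference held on the old one.
+ */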
+static inline void sk_change_net_get(struct sock *sk, struct net *net)
+{
+	struct net *old_net = sock_net(sk);
+	sock_net_set(sk, get_net(net));
+	put_net(old_net);
+}
+
 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
 	if (skb->sk) {
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,13 @@
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
+#include <net/tcp_memcontrol.h>
+
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+
+#define TW_WSCALE_MASK		0x0f
+#define TW_WSCALE_SPEC		0x10
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -255,10 +262,13 @@ extern int sysctl_tcp_max_orphans;
 extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
+#ifndef sysctl_tcp_adv_win_scale
 extern int sysctl_tcp_adv_win_scale;
+#endif
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
@@ -278,6 +288,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_use_sg;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -287,7 +298,7 @@ extern int tcp_memory_pressure;
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return !!sk->sk_cgrp->memory_pressure;
+		return *sk->sk_cgrp->memory_pressure;
 
 	return tcp_memory_pressure;
 }
@@ -318,11 +329,28 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
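+/*
+ * Orphan accounting helpers: the global per-protocol orphan counter is
+ * always updated; when memcg socket accounting is active for the socket,
+ * its cgroup counter is updated as well.
+ */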
+static inline void orphan_count_inc(struct sock *sk)
+{
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_inc(sk);
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+}
+
+static inline void orphan_count_dec(struct sock *sk)
+{
+	percpu_counter_dec(sk->sk_prot->orphan_count);
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_dec(sk);
+}
+
 static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 {
 	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
 	int orphans = percpu_counter_read_positive(ocp);
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		return cg_too_many_orphans(sk, shift);
+
 	if (orphans << shift > sysctl_tcp_max_orphans) {
 		orphans = percpu_counter_sum_positive(ocp);
 		if (orphans << shift > sysctl_tcp_max_orphans)
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -6,8 +6,10 @@ struct tcp_memcontrol {
 	/* per-cgroup tcp memory pressure knobs */
 	struct page_counter tcp_memory_allocated;
 	struct percpu_counter tcp_sockets_allocated;
+	struct percpu_counter tcp_orphan_count;
 	/* those two are read-mostly, leave them at the end */
 	long tcp_prot_mem[3];
+	int tcp_max_orphans;
 	int tcp_memory_pressure;
 };
 
@@ -15,4 +17,8 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
+
+void cg_orphan_count_inc(struct sock *sk);
+void cg_orphan_count_dec(struct sock *sk);
+bool cg_too_many_orphans(struct sock *sk, int shift);
 #endif /* _TCP_MEMCG_H */
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -259,6 +259,7 @@ extern void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 extern int udp_rcv(struct sk_buff *skb);
 extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 int udp_init_sock(struct sock *sk);
+extern int __udp_disconnect(struct sock *sk, int flags);
 extern int udp_disconnect(struct sock *sk, int flags);
 extern unsigned int udp_poll(struct file *file, struct socket *sock,
 			     poll_table *wait);
@@ -352,4 +353,5 @@ extern void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 extern void udpv6_encap_enable(void);
 #endif
+extern int udp_init_sock(struct sock *sk);
 #endif	/* _UDP_H */
--- /dev/null
+++ b/include/net/udp_memcontrol.h
@@ -0,0 +1,20 @@
+/*
+ *  include/net/udp_memcontrol.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UDP_MEMCG_H
+#define _UDP_MEMCG_H
+
+struct udp_memcontrol {
+	struct cg_proto cg_proto;
+	struct page_counter udp_memory_allocated;
+	long udp_prot_mem[3];
+};
+
+struct cg_proto *udp_proto_cgroup(struct mem_cgroup *memcg);
+int udp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
+void udp_destroy_cgroup(struct mem_cgroup *memcg);
+#endif /* _UDP_MEMCG_H */
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -83,6 +83,36 @@ struct extent_status;
 	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"})
 
 
+TRACE_EVENT(ext4_other_inode_update_time,
+	TP_PROTO(struct inode *inode, ino_t orig_ino),
+
+	TP_ARGS(inode, orig_ino),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	orig_ino		)
+		__field(	uid_t,	uid			)
+		__field(	gid_t,	gid			)
+		__field(	__u16, mode			)
+	),
+
+	TP_fast_assign(
+		__entry->orig_ino = orig_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->uid	= i_uid_read(inode);
+		__entry->gid	= i_gid_read(inode);
+		__entry->mode	= inode->i_mode;
+	),
+
+	TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->orig_ino,
+		  (unsigned long) __entry->ino, __entry->mode,
+		  __entry->uid, __entry->gid)
+);
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
 
@@ -887,6 +917,60 @@ TRACE_EVENT(ext4_sync_file_exit,
 		  __entry->ret)
 );
 
+TRACE_EVENT(ext4_sync_files_iterate,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+
+	TP_ARGS(dentry, tid, datasync),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	datasync		)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->datasync	= datasync;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+	),
+
+	TP_printk("dev %d,%d ino %ld parent %ld datasync %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->datasync,
+		  __entry->tid)
+);
+
+TRACE_EVENT(ext4_sync_files_exit,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+
+	TP_ARGS(dentry, tid, barrier),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	barrier			)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+		__entry->barrier	= barrier;
+	),
+
+	TP_printk("dev %d,%d ino %ld parent %ld explicit_barrier %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->barrier,
+		  __entry->tid)
+);
+
 TRACE_EVENT(ext4_sync_fs,
 	TP_PROTO(struct super_block *sb, int wait),
 
@@ -2377,7 +2461,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
 		  show_extent_status(__entry->found ? __entry->status : 0))
 );
 
-TRACE_EVENT(ext4_es_shrink_enter,
+DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
 	TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),
 
 	TP_ARGS(sb, nr_to_scan, cache_cnt),
@@ -2399,26 +2483,38 @@ TRACE_EVENT(ext4_es_shrink_enter,
 		  __entry->nr_to_scan, __entry->cache_cnt)
 );
 
-TRACE_EVENT(ext4_es_shrink_exit,
-	TP_PROTO(struct super_block *sb, int shrunk_nr, int cache_cnt),
+DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
+	TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),
+
+	TP_ARGS(sb, nr_to_scan, cache_cnt)
+);
+
+DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
+	TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),
+
+	TP_ARGS(sb, nr_to_scan, cache_cnt)
+);
+
+TRACE_EVENT(ext4_es_shrink_scan_exit,
+	TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt),
 
-	TP_ARGS(sb, shrunk_nr, cache_cnt),
+	TP_ARGS(sb, nr_shrunk, cache_cnt),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	int,	shrunk_nr		)
+		__field(	int,	nr_shrunk		)
 		__field(	int,	cache_cnt		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= sb->s_dev;
-		__entry->shrunk_nr	= shrunk_nr;
+		__entry->nr_shrunk	= nr_shrunk;
 		__entry->cache_cnt	= cache_cnt;
 	),
 
-	TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d",
+	TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->shrunk_nr, __entry->cache_cnt)
+		  __entry->nr_shrunk, __entry->cache_cnt)
 );
 
 TRACE_EVENT(ext4_collapse_range,
@@ -2446,6 +2542,34 @@ TRACE_EVENT(ext4_collapse_range,
 		  __entry->offset, __entry->len)
 );
 
+TRACE_EVENT(ext4_es_shrink,
+	TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
+		 int nr_skipped, int retried),
+
+	TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	int,		nr_shrunk	)
+		__field(	unsigned long long, scan_time	)
+		__field(	int,		nr_skipped	)
+		__field(	int,		retried		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= sb->s_dev;
+		__entry->nr_shrunk	= nr_shrunk;
+		__entry->scan_time	= div_u64(scan_time, 1000);
+		__entry->nr_skipped	= nr_skipped;
+		__entry->retried	= retried;
+	),
+
+	TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
+		  "nr_skipped %d retried %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
+		  __entry->scan_time, __entry->nr_skipped, __entry->retried)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
 	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
 	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
-	{(unsigned long)__GFP_KMEMCG,		"GFP_KMEMCG"},		\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
 	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -181,47 +181,44 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
 
 TRACE_EVENT(mm_shrink_slab_start,
 	TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
-		long nr_objects_to_shrink, unsigned long pgs_scanned,
-		unsigned long lru_pgs, unsigned long cache_items,
-		unsigned long long delta, unsigned long total_scan),
+		long nr_objects_to_shrink, unsigned long cache_items,
+		unsigned long long delta, unsigned long total_scan,
+		int priority),
 
-	TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
-		cache_items, delta, total_scan),
+	TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
+		priority),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
 		__field(void *, shrink)
 		__field(long, nr_objects_to_shrink)
 		__field(gfp_t, gfp_flags)
-		__field(unsigned long, pgs_scanned)
-		__field(unsigned long, lru_pgs)
 		__field(unsigned long, cache_items)
 		__field(unsigned long long, delta)
 		__field(unsigned long, total_scan)
+		__field(int, priority)
 	),
 
 	TP_fast_assign(
 		__entry->shr = shr;
-		__entry->shrink = shr->shrink;
+		__entry->shrink = shr->scan_objects;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = sc->gfp_mask;
-		__entry->pgs_scanned = pgs_scanned;
-		__entry->lru_pgs = lru_pgs;
 		__entry->cache_items = cache_items;
 		__entry->delta = delta;
 		__entry->total_scan = total_scan;
+		__entry->priority = priority;
 	),
 
-	TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+	TP_printk("%pF %p: objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->pgs_scanned,
-		__entry->lru_pgs,
 		__entry->cache_items,
 		__entry->delta,
-		__entry->total_scan)
+		__entry->total_scan,
+		__entry->priority)
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
@@ -241,7 +238,7 @@ TRACE_EVENT(mm_shrink_slab_end,
 
 	TP_fast_assign(
 		__entry->shr = shr;
-		__entry->shrink = shr->shrink;
+		__entry->shrink = shr->scan_objects;
 		__entry->unused_scan = unused_scan_cnt;
 		__entry->new_scan = new_scan_cnt;
 		__entry->retval = shrinker_retval;
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
 		{I_FREEING,		"I_FREEING"},		\
 		{I_CLEAR,		"I_CLEAR"},		\
 		{I_SYNC,		"I_SYNC"},		\
+		{I_DIRTY_TIME,		"I_DIRTY_TIME"},	\
+		{I_DIRTY_TIME_EXPIRED,	"I_DIRTY_TIME_EXPIRED"}, \
 		{I_REFERENCED,		"I_REFERENCED"}		\
 	)
 
@@ -69,6 +71,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 	TP_STRUCT__entry (
 		__array(char, name, 32)
 		__field(unsigned long, ino)
+		__field(unsigned long, state)
 		__field(unsigned long, flags)
 	),
 
@@ -79,16 +82,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 		strncpy(__entry->name,
 			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
 		__entry->ino		= inode->i_ino;
+		__entry->state		= inode->i_state;
 		__entry->flags		= flags;
 	),
 
-	TP_printk("bdi %s: ino=%lu flags=%s",
+	TP_printk("bdi %s: ino=%lu state=%s flags=%s",
 		__entry->name,
 		__entry->ino,
+		show_inode_state(__entry->state),
 		show_inode_state(__entry->flags)
 	)
 );
 
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
 DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
 
 	TP_PROTO(struct inode *inode, int flags),
@@ -599,6 +611,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_ARGS(inode, wbc, nr_to_write)
 );
 
+DECLARE_EVENT_CLASS(writeback_lazytime_template,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(unsigned long,	ino			)
+		__field(unsigned long,	state			)
+		__field(	__u16, mode			)
+		__field(unsigned long, dirtied_when		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->state	= inode->i_state;
+		__entry->mode	= inode->i_mode;
+		__entry->dirtied_when = inode->dirtied_when;
+	),
+
+	TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->dirtied_when,
+		  show_inode_state(__entry->state), __entry->mode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
+
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
--- a/include/uapi/asm-generic/ioctls.h
+++ b/include/uapi/asm-generic/ioctls.h
@@ -94,6 +94,8 @@
 #define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
 #define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
 
+#define TIOSAK		_IO('T', 0x66)  /* "Secure Attention Key" */
+
 /*
  * Some arches already define FIOQSIZE due to a historical
  * conflict with a Hayes modem-specific ioctl value.
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -62,6 +62,7 @@ header-y += auxvec.h
 header-y += ax25.h
 header-y += b1lli.h
 header-y += baycom.h
+header-y += beancounter.h
 header-y += bfs_fs.h
 header-y += binfmts.h
 header-y += blkpg.h
@@ -432,6 +433,12 @@ header-y += virtio_vsock.h
 header-y += vm_sockets.h
 header-y += vt.h
 header-y += vtpm_proxy.h
+header-y += vzcalluser.h
+header-y += vzctl_netstat.h
+header-y += vzctl_veth.h
+header-y += vzctl_venet.h
+header-y += vziptable_defs.h
+header-y += vzlist.h
 header-y += wait.h
 header-y += wanrouter.h
 header-y += watchdog.h
@@ -442,3 +449,4 @@ header-y += xattr.h
 header-y += xfrm.h
 header-y += hw_breakpoint.h
 header-y += userfaultfd.h
+header-y += compat.h
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -44,6 +44,8 @@ enum {
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
+	IOCB_CMD_READ_ITER = 9,
+	IOCB_CMD_WRITE_ITER = 10,
 };
 
 /*
--- /dev/null
+++ b/include/uapi/linux/bc/statd.h
@@ -0,0 +1,76 @@
+/*
+ *  include/uapi/linux/bc/statd.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_STATD_H_
+#define __BC_STATD_H_
+
+/* sys_ubstat commands list */
+#define UBSTAT_READ_ONE			0x010000
+#define UBSTAT_READ_ALL			0x020000
+#define UBSTAT_READ_FULL		0x030000
+#define UBSTAT_UBLIST			0x040000
+#define UBSTAT_UBPARMNUM		0x050000
+#define UBSTAT_GETTIME			0x060000
+
+#define UBSTAT_CMD(func)		((func) & 0xF0000)
+#define UBSTAT_PARMID(func)		((func) & 0x0FFFF)
+
+#define TIME_MAX_SEC		(LONG_MAX / HZ)
+#define TIME_MAX_JIF		(TIME_MAX_SEC * HZ)
+
+typedef unsigned long ubstattime_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstattime_t	cur_time;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubgettime_t;
+
+typedef struct {
+	long		maxinterval;
+	int		signum;
+} ubnotifrq_t;
+
+typedef struct {
+	unsigned long	maxheld;
+	unsigned long	failcnt;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubstatparm_t;
+
+typedef struct {
+	unsigned long	barrier;
+	unsigned long	limit;
+	unsigned long	held;
+	unsigned long	maxheld;
+	unsigned long	minheld;
+	unsigned long	failcnt;
+	unsigned long __unused1;
+	unsigned long __unused2;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubstatparmf_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstatparmf_t	param[0];
+} ubstatfull_t;
+
+#endif
--- /dev/null
+++ b/include/uapi/linux/beancounter.h
@@ -0,0 +1,65 @@
+/*
+ *  include/uapi/linux/beancounter.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_BEANCOUNTER_H
+#define _UAPI_LINUX_BEANCOUNTER_H
+
+/*
+ * Resource list.
+ */
+#define UB_KMEMSIZE		0	/* Unswappable kernel memory size including
+					 * struct task, page directories, etc. */
+#define UB_LOCKEDPAGES		1	/* Mlock()ed pages. */
+#define UB_PRIVVMPAGES		2	/* Total number of pages, counting potentially
+					 * private pages as private and used. */
+#define UB_SHMPAGES		3	/* IPC SHM segment size. */
+#define UB_DUMMY		4	/* Dummy resource (compatibility) */
+#define UB_NUMPROC		5	/* Number of processes. */
+#define UB_PHYSPAGES		6	/* All resident pages, for swapout guarantee. */
+#define UB_VMGUARPAGES		7	/* Guarantee for memory allocation,
+					 * checked against PRIVVMPAGES. */
+#define UB_OOMGUARPAGES		8	/* Guarantees against OOM kill.
+					 * Only limit is used, no accounting. */
+#define UB_NUMTCPSOCK		9	/* Number of TCP sockets. */
+#define UB_NUMFLOCK		10	/* Number of file locks. */
+#define UB_NUMPTY		11	/* Number of PTYs. */
+#define UB_NUMSIGINFO		12	/* Number of siginfos. */
+#define UB_TCPSNDBUF		13	/* Total size of tcp send buffers. */
+#define UB_TCPRCVBUF		14	/* Total size of tcp receive buffers. */
+#define UB_OTHERSOCKBUF		15	/* Total size of other socket
+					 * send buffers (all buffers for PF_UNIX). */
+#define UB_DGRAMRCVBUF		16	/* Total size of other socket
+					 * receive buffers. */
+#define UB_NUMOTHERSOCK		17	/* Number of other sockets. */
+#define UB_DCACHESIZE		18	/* Size of busy dentry/inode cache. */
+#define UB_NUMFILE		19	/* Number of open files. */
+
+#define UB_RESOURCES_COMPAT	24
+
+/*
+ * Add new resources here.
+ */
+#define UB_NUMXTENT		23
+#define UB_SWAPPAGES		24
+#define UB_RESOURCES		25
+
+struct ubparm {
+	/*
+	 * A barrier above which resource allocations fail gracefully:
+	 * if the amount of consumed memory exceeds the barrier, further
+	 * sbrk() or mmap() calls fail, but existing processes are not killed.
+	 */
+	unsigned long	barrier;
+	unsigned long	limit;		/* hard resource limit */
+	unsigned long	held;		/* consumed resources */
+	unsigned long	maxheld;	/* maximum amount of consumed resources through the last period */
+	unsigned long	minheld;	/* minimum amount of consumed resources through the last period */
+	unsigned long	failcnt;	/* count of failed charges */
+	int		max_precharge;	/* maximum percpu resource precharge */
+};
+
+#endif /* _UAPI_LINUX_BEANCOUNTER_H */
--- a/include/uapi/linux/blkpg.h
+++ b/include/uapi/linux/blkpg.h
@@ -41,6 +41,7 @@ struct blkpg_ioctl_arg {
 #define BLKPG_ADD_PARTITION	1
 #define BLKPG_DEL_PARTITION	2
 #define BLKPG_RESIZE_PARTITION	3
+#define BLKPG_GET_PARTITION	4
 
 /* Sizes of name fields. Unused at present. */
 #define BLKPG_DEVNAMELTH	64
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -171,12 +171,9 @@ struct vfs_cap_data {
 
 #define CAP_NET_BROADCAST    11
 
-/* Allow interface configuration */
 /* Allow administration of IP firewall, masquerading and accounting */
 /* Allow setting debug option on sockets */
 /* Allow modification of routing tables */
-/* Allow setting arbitrary process / process group ownership on
-   sockets */
 /* Allow binding to any address for transparent proxying (also via NET_RAW) */
 /* Allow setting TOS (type of service) */
 /* Allow setting promiscuous mode */
@@ -207,6 +204,7 @@ struct vfs_cap_data {
 #define CAP_SYS_MODULE       16
 
 /* Allow ioperm/iopl access */
+/* Allow O_DIRECT access */
 /* Allow sending USB messages to any device via /proc/bus/usb */
 
 #define CAP_SYS_RAWIO        17
@@ -225,23 +223,18 @@ struct vfs_cap_data {
 
 /* Allow configuration of the secure attention key */
 /* Allow administration of the random device */
-/* Allow examination and configuration of disk quotas */
 /* Allow setting the domainname */
 /* Allow setting the hostname */
 /* Allow calling bdflush() */
-/* Allow mount() and umount(), setting up new smb connection */
+/* Allow setting up new smb connection */
 /* Allow some autofs root ioctls */
 /* Allow nfsservctl */
 /* Allow VM86_REQUEST_IRQ */
 /* Allow to read/write pci config on alpha */
 /* Allow irix_prctl on mips (setstacksize) */
 /* Allow flushing all cache on m68k (sys_cacheflush) */
-/* Allow removing semaphores */
-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
-   and shared memory */
 /* Allow locking/unlocking of shared memory segment */
 /* Allow turning swap on/off */
-/* Allow forged pids on socket credentials passing */
 /* Allow setting readahead and flushing buffers on block devices */
 /* Allow setting geometry in floppy driver */
 /* Allow turning DMA on/off in xd driver */
--- a/include/uapi/linux/fadvise.h
+++ b/include/uapi/linux/fadvise.h
@@ -17,5 +17,9 @@
 #define POSIX_FADV_DONTNEED	4 /* Don't need these pages.  */
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
+#define FADV_DEACTIVATE		32 /* Mark pages as good candidates for reclaim */
 
+#ifdef __KERNEL__
+extern int generic_fadvise(struct file *file, loff_t off, loff_t len, int adv);
+#endif
 #endif	/* FADVISE_H_INCLUDED */
--- /dev/null
+++ b/include/uapi/linux/fairsched.h
@@ -0,0 +1,7 @@
+/*
+ *  include/uapi/linux/fairsched.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -4,6 +4,8 @@
 #define FALLOC_FL_KEEP_SIZE	0x01 /* default is extend size */
 #define FALLOC_FL_PUNCH_HOLE	0x02 /* de-allocates range */
 #define FALLOC_FL_NO_HIDE_STALE	0x04 /* reserved codepoint */
+#define FALLOC_FL_CONVERT_UNWRITTEN 0x100 /* mark extents as initialized */
+
 
 /*
  * FALLOC_FL_COLLAPSE_RANGE is used to remove a range of a file
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -60,9 +60,9 @@ struct files_stat_struct {
 };
 
 struct inodes_stat_t {
-	int nr_inodes;
-	int nr_unused;
-	int dummy[5];		/* padding for sysctl ABI compatibility */
+	long nr_inodes;
+	long nr_unused;
+	long dummy[5];		/* padding for sysctl ABI compatibility */
 };
 
 
@@ -97,6 +97,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
 #define MS_NOREMOTELOCK	(1<<27)
@@ -108,7 +109,8 @@ struct inodes_stat_t {
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+			 MS_LAZYTIME)
 
 /*
  * Old magic mount flag and mask
@@ -159,6 +161,40 @@ struct inodes_stat_t {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 
+/* Hole from 127..199 */
+struct blk_user_cbt_extent {
+	__u64 ce_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 ce_length;   /* length in bytes for this extent */
+	__u64 ce_reserved64[1];
+};
+
+struct blk_user_cbt_info {
+	__u8  ci_uuid[16];      /* Bitmap UUID */
+	__u64 ci_start;		/* start of the physical range of the
+				 * mapping userspace wants (in) */
+	__u64 ci_length;	/* physical length of the mapping
+				 * userspace wants (in) */
+	__u32 ci_blksize;	/* cbt logical block size */
+	__u32 ci_flags;		/* CI_FLAG_* flags for request (in/out) */
+	__u32 ci_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 ci_extent_count;  /* size of fm_extents array (in) */
+	__u32 ci_reserved;
+	struct blk_user_cbt_extent ci_extents[0]; /* array of mapped extents (out) */
+};
+
+enum CI_FLAGS
+{
+	CI_FLAG_ONCE = 1, /* BLKCBTGET will clear bits */
+	CI_FLAG_NEW_UUID = 2 /* BLKCBTSET update uuid */
+};
+
+#define BLKCBTSTART _IOR(0x12,200,struct blk_user_cbt_info)
+#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
+#define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
+#define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
+
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -368,6 +368,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_STORE = 4,
 	FUSE_NOTIFY_RETRIEVE = 5,
 	FUSE_NOTIFY_DELETE = 6,
+	FUSE_NOTIFY_INVAL_FILES = 77,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -759,4 +760,12 @@ struct fuse_lseek_out {
 	uint64_t	offset;
 };
 
+struct fuse_notify_inval_files_out {
+	__u64	ino;
+};
+
+/* Device ioctls: */
+#define FUSE_DEV_IOC_CLONE	_IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_SETAFF	 _IO(229, 1)
+
 #endif /* _LINUX_FUSE_H */
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -22,6 +22,7 @@ struct genlmsghdr {
 #define GENL_CMD_CAP_DUMP	0x04
 #define GENL_CMD_CAP_HASPOL	0x08
 #define GENL_UNS_ADMIN_PERM	0x10
+#define GENL_VE_ADMIN_PERM	0x80
 
 /*
  * List of reserved static generic netlink identifiers:
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -221,6 +221,7 @@ struct ifreq {
 		char	ifru_newname[IFNAMSIZ];
 		void __user *	ifru_data;
 		struct	if_settings ifru_settings;
+		unsigned int ifru_acctid;
 	} ifr_ifru;
 };
 
@@ -241,6 +242,7 @@ struct ifreq {
 #define ifr_qlen	ifr_ifru.ifru_ivalue	/* Queue length 	*/
 #define ifr_newname	ifr_ifru.ifru_newname	/* New name		*/
 #define ifr_settings	ifr_ifru.ifru_settings	/* Device/proto settings*/
+#define ifr_acctid	ifr_ifru.ifru_acctid	/* New ve accounting identifier */
 
 /*
  * Structure used in SIOCGIFCONF request.
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,9 @@
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
 
+/* CONFIG_VE_TUNTAP_ACCOUNTING should be set */
+#define TUNSETACCTID _IOW('T', 300, struct ifreq)
+
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -174,6 +174,8 @@ struct in6_flowlabel_req {
 #define IPV6_JOIN_ANYCAST	27
 #define IPV6_LEAVE_ANYCAST	28
 
+#define IPV6_HDRINCL		36
+
 /* IPV6_MTU_DISCOVER values */
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -43,6 +43,23 @@ struct inet_diag_req_v2 {
 	struct inet_diag_sockid id;
 };
 
+/*
+ * SOCK_RAW sockets require the underlying protocol to be
+ * additionally specified so we can use the @pad member for
+ * this, but we can't rename it because userspace programs
+ * may still depend on this name. Instead let's use another
+ * structure definition as an alias for struct
+ * @inet_diag_req_v2.
+ */
+struct inet_diag_req_raw {
+	__u8	sdiag_family;
+	__u8	sdiag_protocol;
+	__u8	idiag_ext;
+	__u8	sdiag_raw_protocol;
+	__u32	idiag_states;
+	struct inet_diag_sockid id;
+};
+
 enum {
 	INET_DIAG_REQ_NONE,
 	INET_DIAG_REQ_BYTECODE,
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -147,6 +147,25 @@ struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
+struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+	__u32 type;
+	union {
+		struct {
+			__u32 msr;
+			__u64 control;
+			__u64 evt_page;
+			__u64 msg_page;
+		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
+	} u;
+};
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -173,6 +192,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_EPR              23
 #define KVM_EXIT_SYSTEM_EVENT     24
 #define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_HYPERV           27
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -317,6 +337,8 @@ struct kvm_run {
 			__u64 flags;
 		} system_event;
 
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -702,8 +724,11 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 #define KVM_CAP_SPLIT_IRQCHIP 121
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
+#define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_MAX_VCPU_ID 128
 #define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_HYPERV_SYNIC2 148
+#define KVM_CAP_HYPERV_VP_INDEX 149
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -719,9 +744,15 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_HV_SINT 4
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -731,6 +762,7 @@ struct kvm_irq_routing_entry {
 	union {
 		struct kvm_irq_routing_irqchip irqchip;
 		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_hv_sint hv_sint;
 		__u32 pad[8];
 	} u;
 };
--- a/include/uapi/linux/netfilter/xt_DSCP.h
+++ b/include/uapi/linux/netfilter/xt_DSCP.h
@@ -13,6 +13,12 @@
 #include <linux/netfilter/xt_dscp.h>
 #include <linux/types.h>
 
+#define IPTOS_NORMALSVC 0
+
+struct ipt_tos_target_info {
+	u_int8_t tos;
+};
+
 /* target info */
 struct xt_DSCP_info {
 	__u8 dscp;
--- a/include/uapi/linux/netfilter/xt_connlimit.h
+++ b/include/uapi/linux/netfilter/xt_connlimit.h
@@ -22,8 +22,13 @@ struct xt_connlimit_info {
 #endif
 	};
 	unsigned int limit;
-	/* revision 1 */
-	__u32 flags;
+	union {
+		/* revision 0 */
+		unsigned int inverse;
+
+		/* revision 1 */
+		__u32 flags;
+	};
 
 	/* Used internally by the kernel */
 	struct xt_connlimit_data *data __attribute__((aligned(8)));
--- a/include/uapi/linux/netfilter/xt_connmark.h
+++ b/include/uapi/linux/netfilter/xt_connmark.h
@@ -18,11 +18,22 @@ enum {
 	XT_CONNMARK_RESTORE
 };
 
+struct xt_connmark_target_info {
+	unsigned long mark;
+	unsigned long mask;
+	__u8 mode;
+};
+
 struct xt_connmark_tginfo1 {
 	__u32 ctmark, ctmask, nfmask;
 	__u8 mode;
 };
 
+struct xt_connmark_info {
+	unsigned long mark, mask;
+	__u8 invert;
+};
+
 struct xt_connmark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
--- a/include/uapi/linux/netfilter/xt_conntrack.h
+++ b/include/uapi/linux/netfilter/xt_conntrack.h
@@ -34,6 +34,41 @@ enum {
 	XT_CONNTRACK_STATE_ALIAS  = 1 << 13,
 };
 
+/* This is exposed to userspace, so remains frozen in time. */
+struct ip_conntrack_old_tuple
+{
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+	} src;
+
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+
+		/* The protocol. */
+		__u16 protonum;
+	} dst;
+};
+
+struct xt_conntrack_info
+{
+	unsigned int statemask, statusmask;
+	struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+	struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
+
+	unsigned long expires_min, expires_max;
+
+	/* Flags word */
+	__u8 flags;
+	/* Inverse flags */
+	__u8 invflags;
+};
+
 struct xt_conntrack_mtinfo1 {
 	union nf_inet_addr origsrc_addr, origsrc_mask;
 	union nf_inet_addr origdst_addr, origdst_mask;
--- a/include/uapi/linux/netfilter/xt_mark.h
+++ b/include/uapi/linux/netfilter/xt_mark.h
@@ -3,10 +3,32 @@
 
 #include <linux/types.h>
 
+/* Version 0 */
+struct xt_mark_target_info {
+	unsigned long mark;
+};
+
+/* Version 1 */
+enum {
+	XT_MARK_SET=0,
+	XT_MARK_AND,
+	XT_MARK_OR,
+};
+
+struct xt_mark_target_info_v1 {
+	unsigned long mark;
+	__u8 mode;
+};
+
 struct xt_mark_tginfo2 {
 	__u32 mark, mask;
 };
 
+struct xt_mark_info {
+	unsigned long mark, mask;
+	__u8 invert;
+};
+
 struct xt_mark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
--- a/include/uapi/linux/netfilter/xt_owner.h
+++ b/include/uapi/linux/netfilter/xt_owner.h
@@ -9,6 +9,23 @@ enum {
 	XT_OWNER_SOCKET = 1 << 2,
 };
 
+struct ipt_owner_info {
+	uid_t uid;
+	gid_t gid;
+	pid_t pid;
+	pid_t sid;
+	char comm[16];
+	u_int8_t match, invert;     /* flags */
+};
+
+struct ip6t_owner_info {
+	uid_t uid;
+	gid_t gid;
+	pid_t pid;
+	pid_t sid;
+	u_int8_t match, invert;     /* flags */
+};
+
 struct xt_owner_match_info {
 	__u32 uid_min, uid_max;
 	__u32 gid_min, gid_max;
--- a/include/uapi/linux/netfilter_arp/arp_tables.h
+++ b/include/uapi/linux/netfilter_arp/arp_tables.h
@@ -71,9 +71,9 @@ struct arpt_arp {
 };
 
 /* Values for "flag" field in struct arpt_ip (general arp structure).
- * No flags defined yet.
  */
-#define ARPT_F_MASK		0x00	/* All possible flag bits mask. */
+#define ARPT_WDOGTMO		0x80
+#define ARPT_F_MASK		0x80	/* All possible flag bits mask. */
 
 /* Values for "inv" field in struct arpt_arp. */
 #define ARPT_INV_VIA_IN		0x0001	/* Invert the sense of IN IFACE. */
@@ -86,7 +86,8 @@ struct arpt_arp {
 #define ARPT_INV_ARPHRD		0x0080	/* Invert the sense of ARP HRD. */
 #define ARPT_INV_ARPPRO		0x0100	/* Invert the sense of ARP PRO. */
 #define ARPT_INV_ARPHLN		0x0200	/* Invert the sense of ARP HLN. */
-#define ARPT_INV_MASK		0x03FF	/* All possible flag bits mask. */
+#define ARPT_INV_WDOGTMO	0x8000	/* Invert the sense of the ARPT_WDOGTMO flag */
+#define ARPT_INV_MASK		0x83FF	/* All possible flag bits mask. */
 
 /* This structure defines each of the firewall rules.  Consists of 3
    parts which are 1) general ARP header stuff 2) match specific
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -101,14 +101,17 @@ struct nlmsgerr {
 	struct nlmsghdr msg;
 };
 
-#define NETLINK_ADD_MEMBERSHIP	1
-#define NETLINK_DROP_MEMBERSHIP	2
-#define NETLINK_PKTINFO		3
-#define NETLINK_BROADCAST_ERROR	4
-#define NETLINK_NO_ENOBUFS	5
-#define NETLINK_RX_RING		6
-#define NETLINK_TX_RING		7
-#define NETLINK_LISTEN_ALL_NSID	8
+#define NETLINK_ADD_MEMBERSHIP		1
+#define NETLINK_DROP_MEMBERSHIP		2
+#define NETLINK_PKTINFO			3
+#define NETLINK_BROADCAST_ERROR		4
+#define NETLINK_NO_ENOBUFS		5
+#define NETLINK_RX_RING			6
+#define NETLINK_TX_RING			7
+#define NETLINK_LISTEN_ALL_NSID		8
+#define NETLINK_LIST_MEMBERSHIPS	9
+#define NETLINK_CAP_ACK			10
+#define NETLINK_REPAIR			11
 
 struct nl_pktinfo {
 	__u32	group;
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -37,6 +37,7 @@ enum {
 	NETLINK_DIAG_GROUPS,
 	NETLINK_DIAG_RX_RING,
 	NETLINK_DIAG_TX_RING,
+	NETLINK_DIAG_FLAGS,
 
 	__NETLINK_DIAG_MAX,
 };
@@ -48,5 +49,14 @@ enum {
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#define NDIAG_SHOW_FLAGS	0x00000008 /* show flags of a netlink socket */
+
+/* flags */
+#define NDIAG_FLAG_CB_RUNNING		0x00000001
+#define NDIAG_FLAG_PKTINFO		0x00000002
+#define NDIAG_FLAG_BROADCAST_ERROR	0x00000004
+#define NDIAG_FLAG_NO_ENOBUFS		0x00000008
+#define NDIAG_FLAG_LISTEN_ALL_NSID	0x00000010
+#define NDIAG_FLAG_CAP_ACK		0x00000020
 
 #endif
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -64,6 +64,8 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_GETSIGMASK	0x420a
 #define PTRACE_SETSIGMASK	0x420b
 
+#define PTRACE_SECCOMP_GET_FILTER	0x420c
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
@@ -89,9 +91,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL	(1 << 20)
+#define PTRACE_O_EXITKILL		(1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)
 
-#define PTRACE_O_MASK		(0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK		(\
+	0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include <asm/ptrace.h>
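
Both additions are driven from the tracer side: PTRACE_O_SUSPEND_SECCOMP is simply
OR-ed into the usual PTRACE_SETOPTIONS mask (it needs CAP_SYS_ADMIN), and
PTRACE_SECCOMP_GET_FILTER dumps a stopped tracee's seccomp programs. A minimal
tracer-side sketch; the fallback #define is only needed while libc headers lag
behind, and the tracee must already be in a ptrace-stop:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <linux/filter.h>

	#ifndef PTRACE_SECCOMP_GET_FILTER
	#define PTRACE_SECCOMP_GET_FILTER	0x420c
	#endif

	/* Report the size of seccomp filter 'index' of a stopped tracee. */
	static int dump_filter(pid_t pid, unsigned long index)
	{
		struct sock_filter *insns;
		long n;

		/* With a NULL buffer the call only reports the instruction count. */
		n = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, (void *)index, NULL);
		if (n < 0)
			return -1;

		insns = calloc(n, sizeof(*insns));
		if (!insns)
			return -1;

		/* A second call copies the classic-BPF program out of the kernel. */
		if (ptrace(PTRACE_SECCOMP_GET_FILTER, pid, (void *)index, insns) < 0) {
			free(insns);
			return -1;
		}

		printf("filter %lu: %ld instructions\n", index, n);
		free(insns);
		return 0;
	}
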
 
--- /dev/null
+++ b/include/uapi/linux/venet-netlink.h
@@ -0,0 +1,30 @@
+/*
+ *  include/uapi/linux/venet-netlink.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __NET_VENET_H_
+#define __NET_VENET_H_
+
+enum {
+	VENET_INFO_UNSPEC,
+	VENET_INFO_CMD,
+
+	__VENET_INFO_MAX
+#define VENET_INFO_MAX   (__VENET_INFO_MAX - 1)
+};
+
+enum {
+	VENET_IP_ADD,
+	VENET_IP_DEL,
+};
+
+struct venetaddrmsg {
+	__u8		va_family;
+	__u8		va_cmd;
+	__u32		va_addr[4];
+};
+
+#endif
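
The payload layout is the interesting part of this header; which netlink family and
multicast group actually carry it is OpenVZ-specific plumbing that is not shown here.
A hedged sketch of filling the message body for an IPv4 address-add event, assuming
the header has been exported with "make headers_install":

	#include <string.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/venet-netlink.h>

	/* Fill a VENET_IP_ADD payload for an IPv4 address in dotted-quad form. */
	static void venet_fill_add_ipv4(struct venetaddrmsg *vam, const char *ip)
	{
		memset(vam, 0, sizeof(*vam));
		vam->va_family = AF_INET;
		vam->va_cmd = VENET_IP_ADD;
		/* Only va_addr[0] is meaningful for IPv4; all four words for IPv6. */
		inet_pton(AF_INET, ip, &vam->va_addr[0]);
	}
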
--- a/include/uapi/linux/veth.h
+++ b/include/uapi/linux/veth.h
@@ -1,3 +1,12 @@
+/*
+ *  include/uapi/linux/veth.h
+ *
+ *  Copyright (C) 2007  SWsoft
+ *  All rights reserved.
+ *  
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
 #ifndef __NET_VETH_H_
 #define __NET_VETH_H_
 
@@ -9,4 +18,7 @@ enum {
 #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
 };
 
+#define SIOCSVENET	(SIOCDEVPRIVATE + 0xf)
+#define SIOCSFIXEDADDR	(SIOCDEVPRIVATE + 0xe)
+
 #endif
--- /dev/null
+++ b/include/uapi/linux/vzcalluser.h
@@ -0,0 +1,203 @@
+/*
+ *  include/uapi/linux/vzcalluser.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZCALLUSER_H
+#define _UAPI_LINUX_VZCALLUSER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#include <linux/vziptable_defs.h>
+
+#ifndef __ENVID_T_DEFINED__
+# define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * VE management ioctls
+ */
+
+#define VE_CREATE	1	/* Create VE, VE_ENTER added automatically */
+#define VE_EXCLUSIVE	2	/* Fail if exists */
+#define VE_ENTER	4	/* Enter existing VE */
+#define VE_TEST		8	/* Test if VE exists */
+#define VE_LOCK		16	/* Do not allow entering created VE */
+#define VE_SKIPLOCK	32	/* Allow entering embryonic VE */
+
+struct vzctl_old_env_create {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				addr;
+};
+
+struct vzctl_mark_env_to_down {
+	envid_t			veid;
+};
+
+#define VE_USE_MAJOR	010	/* Test MAJOR supplied in rule */
+#define VE_USE_MINOR	030	/* Test MINOR supplied in rule */
+#define VE_USE_MASK	030	/* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
+
+struct vzctl_setdevperms {
+	envid_t				veid;
+	unsigned int			type;
+	unsigned int			dev;
+	unsigned int			mask;
+};
+
+#define VE_NETDEV_ADD  1
+#define VE_NETDEV_DEL  2
+
+struct vzctl_ve_netdev {
+	envid_t				veid;
+	int				op;
+	char __user			*dev_name;
+};
+
+#define VE_CONFIGURE_OS_RELEASE		2
+#define VE_CONFIGURE_CREATE_PROC_LINK	4
+#define VE_CONFIGURE_OPEN_TTY		5
+
+struct vzctl_ve_configure {
+	envid_t				veid;
+	unsigned int			key;
+	unsigned int			val;
+	unsigned int			size;
+	char				data[0];
+};
+
+struct vzctl_ve_meminfo {
+	envid_t				veid;
+	unsigned long			val;
+};
+
+struct vzctl_env_create_cid {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+};
+
+struct vzctl_env_create {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+};
+
+struct env_create_param {
+	__u64				iptables_mask;
+};
+
+#define VZCTL_ENV_CREATE_DATA_MINLEN	sizeof(struct env_create_param)
+
+struct env_create_param2 {
+	__u64				iptables_mask;
+	__u64				feature_mask;
+	__u32				total_vcpus;	/* 0 - don't care, same as in host */
+};
+
+struct env_create_param3 {
+	__u64				iptables_mask;
+	__u64				feature_mask;
+	__u32				total_vcpus;
+	__u32				pad;
+	__u64				known_features;
+};
+
+#define VE_FEATURE_SYSFS	(1ULL << 0)	/* deprecated */
+#define VE_FEATURE_NFS		(1ULL << 1)
+#define VE_FEATURE_DEF_PERMS	(1ULL << 2)	/* deprecated */
+#define VE_FEATURE_SIT          (1ULL << 3)
+#define VE_FEATURE_IPIP         (1ULL << 4)
+#define VE_FEATURE_PPP		(1ULL << 5)
+#define VE_FEATURE_IPGRE	(1ULL << 6)	/* deprecated */
+#define VE_FEATURE_BRIDGE	(1ULL << 7)
+#define VE_FEATURE_NFSD		(1ULL << 8)
+
+#define VE_FEATURES_OLD		(VE_FEATURE_SYSFS)
+#define VE_FEATURES_DEF		(VE_FEATURE_SYSFS | VE_FEATURE_DEF_PERMS)
+
+typedef struct env_create_param3 env_create_param_t;
+#define VZCTL_ENV_CREATE_DATA_MAXLEN	sizeof(env_create_param_t)
+
+struct vzctl_env_create_data {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+	env_create_param_t __user	*data;
+	int				datalen;
+};
+
+struct vz_load_avg {
+	int				val_int;
+	int				val_frac;
+};
+
+struct vz_cpu_stat {
+	unsigned long			user_jif;
+	unsigned long			nice_jif;
+	unsigned long			system_jif;
+	unsigned long			uptime_jif;
+	__u64				idle_clk;
+	__u64				strv_clk;
+	__u64				uptime_clk;
+	struct vz_load_avg		avenrun[3];	/* loadavg data */
+};
+
+struct vzctl_cpustatctl {
+	envid_t				veid;
+	struct vz_cpu_stat __user	*cpustat;
+};
+
+#define VZCTLTYPE			'.'
+#define VZCTL_OLD_ENV_CREATE		_IOW(VZCTLTYPE,  0, struct vzctl_old_env_create)
+#define VZCTL_MARK_ENV_TO_DOWN		_IOW(VZCTLTYPE,  1, struct vzctl_mark_env_to_down)
+#define VZCTL_SETDEVPERMS		_IOW(VZCTLTYPE,  2, struct vzctl_setdevperms) /* DEPRECATED */
+#define VZCTL_ENV_CREATE_CID		_IOW(VZCTLTYPE,  4, struct vzctl_env_create_cid)
+#define VZCTL_ENV_CREATE		_IOW(VZCTLTYPE,  5, struct vzctl_env_create)
+#define VZCTL_GET_CPU_STAT		_IOW(VZCTLTYPE,  6, struct vzctl_cpustatctl)
+#define VZCTL_ENV_CREATE_DATA		_IOW(VZCTLTYPE, 10, struct vzctl_env_create_data)
+#define VZCTL_VE_NETDEV			_IOW(VZCTLTYPE, 11, struct vzctl_ve_netdev)
+#define VZCTL_VE_MEMINFO		_IOW(VZCTLTYPE, 13, struct vzctl_ve_meminfo)
+#define VZCTL_VE_CONFIGURE		_IOW(VZCTLTYPE, 15, struct vzctl_ve_configure)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_netdev {
+	envid_t				veid;
+	int				op;
+	compat_uptr_t			dev_name;
+};
+
+struct compat_vzctl_ve_meminfo {
+	envid_t				veid;
+	compat_ulong_t			val;
+};
+
+struct compat_vzctl_env_create_data {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+	compat_uptr_t			data;
+	int				datalen;
+};
+
+#define VZCTL_COMPAT_ENV_CREATE_DATA	_IOW(VZCTLTYPE, 10, struct compat_vzctl_env_create_data)
+#define VZCTL_COMPAT_VE_NETDEV		_IOW(VZCTLTYPE, 11, struct compat_vzctl_ve_netdev)
+#define VZCTL_COMPAT_VE_MEMINFO		_IOW(VZCTLTYPE, 13, struct compat_vzctl_ve_meminfo)
+
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* _UAPI_LINUX_VZCALLUSER_H */
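
These ioctls form the legacy control interface consumed by vzctl. A hedged user-space
sketch of moving a host network device into a VE; the /dev/vzctl node name is an
assumption here, and the header (plus linux/vziptable_defs.h, which it includes) must
have been exported to user space:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vzcalluser.h>

	/* Move network device 'dev' into VE 'veid' via the legacy interface. */
	static int move_netdev_to_ve(envid_t veid, char *dev)
	{
		struct vzctl_ve_netdev req = {
			.veid		= veid,
			.op		= VE_NETDEV_ADD,
			.dev_name	= dev,
		};
		int fd, ret;

		fd = open("/dev/vzctl", O_RDWR);	/* node name is an assumption */
		if (fd < 0)
			return -1;
		ret = ioctl(fd, VZCTL_VE_NETDEV, &req);
		close(fd);
		return ret;
	}
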
--- /dev/null
+++ b/include/uapi/linux/vzctl_netstat.h
@@ -0,0 +1,129 @@
+/*
+ *  include/uapi/linux/vzctl_netstat.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VZCTL_NETSTAT_H__
+#define __VZCTL_NETSTAT_H__
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+# define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+/*
+ * Traffic accounting management ioctls
+ */
+
+struct vz_tc_class_info {
+	__u32				cid;	/* class number */
+	__u32				addr;	/* Network byte order */
+	__u32				mask;	/* subnet mask */
+	/*
+	 * If this struct changes, update all copy_to_user() call sites and
+	 * initialize any new fields/padding to avoid leaking kernel memory
+	 * to user space.
+	 */
+};
+
+
+struct vzctl_tc_classes {
+	struct vz_tc_class_info		*info;
+	int				length;
+};
+
+/* For IPv6 */
+struct vz_tc_class_info_v6 {
+	__u32				cid;	/* class number */
+	__u32				addr[4];/* Network byte order */
+	__u32				mask[4];/* subnet mask */
+	/*
+	 * If this struct changes, update all copy_to_user() call sites and
+	 * initialize any new fields/padding to avoid leaking kernel memory
+	 * to user space.
+	 */
+};
+
+struct vzctl_tc_classes_v6 {
+	struct vz_tc_class_info_v6	*info;
+	int				length;
+};
+
+struct vzctl_tc_get_stat {
+	envid_t				veid;
+	__u64				*incoming;
+	__u64				*outgoing;
+	__u32				*incoming_pkt;
+	__u32				*outgoing_pkt;
+	int				length;
+};
+
+struct vzctl_tc_get_stat_list {
+	envid_t				*list;
+	int				length;
+};
+
+struct vzctl_tc_set_base {
+	envid_t				veid;
+	__u16				base;
+};
+
+#define VZTCCTLTYPE			'='
+#define VZCTL_TC_MAX_CLASS		_IO(VZTCCTLTYPE, 1)
+#define VZCTL_TC_CLASS_NUM		_IO(VZTCCTLTYPE, 2)
+#define VZCTL_TC_SET_CLASS_TABLE	_IOW(VZTCCTLTYPE, 3, struct vzctl_tc_classes)
+#define VZCTL_TC_GET_CLASS_TABLE	_IOR(VZTCCTLTYPE, 4, struct vzctl_tc_classes)
+#define VZCTL_TC_STAT_NUM		_IO(VZTCCTLTYPE, 5)
+#define VZCTL_TC_GET_STAT_LIST		_IOR(VZTCCTLTYPE, 6, struct vzctl_tc_get_stat_list)
+#define VZCTL_TC_GET_STAT		_IOR(VZTCCTLTYPE, 7, struct vzctl_tc_get_stat)
+#define VZCTL_TC_DESTROY_STAT		_IO(VZTCCTLTYPE, 8)
+#define VZCTL_TC_DESTROY_ALL_STAT	_IO(VZTCCTLTYPE, 9)
+
+#define VZCTL_TC_GET_BASE		_IO(VZTCCTLTYPE, 11)
+#define VZCTL_TC_SET_BASE		_IOW(VZTCCTLTYPE, 12, struct vzctl_tc_set_base)
+
+#define VZCTL_TC_GET_STAT_V6		_IOR(VZTCCTLTYPE, 13, struct vzctl_tc_get_stat)
+#define VZCTL_TC_SET_CLASS_TABLE_V6	_IOW(VZTCCTLTYPE, 14, struct vzctl_tc_classes_v6)
+#define VZCTL_TC_GET_CLASS_TABLE_V6	_IOR(VZTCCTLTYPE, 15, struct vzctl_tc_classes_v6)
+
+#define VZCTL_TC_CLASS_NUM_V6		_IO(VZTCCTLTYPE, 16)
+
+#define VZCTL_TC_CLEAR_STAT		_IO(VZTCCTLTYPE, 17)
+#define VZCTL_TC_CLEAR_ALL_STAT		_IO(VZTCCTLTYPE, 18)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_tc_classes {
+	compat_uptr_t			info;
+	int				length;
+};
+
+struct compat_vzctl_tc_get_stat {
+	envid_t				veid;
+	compat_uptr_t			incoming;
+	compat_uptr_t			outgoing;
+	compat_uptr_t			incoming_pkt;
+	compat_uptr_t			outgoing_pkt;
+	int				length;
+};
+
+struct compat_vzctl_tc_get_stat_list {
+	compat_uptr_t			list;
+	int				length;
+};
+
+#define COMPAT_VZCTL_TC_SET_CLASS_TABLE	_IOW(VZTCCTLTYPE, 3, struct compat_vzctl_tc_classes)
+#define COMPAT_VZCTL_TC_GET_CLASS_TABLE	_IOR(VZTCCTLTYPE, 4, struct compat_vzctl_tc_classes)
+#define COMPAT_VZCTL_TC_GET_STAT_LIST	_IOR(VZTCCTLTYPE, 6, struct compat_vzctl_tc_get_stat_list)
+#define COMPAT_VZCTL_TC_GET_STAT	_IOR(VZTCCTLTYPE, 7, struct compat_vzctl_tc_get_stat)
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* __VZCTL_NETSTAT_H__ */
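
The GET_STAT_LIST/GET_STAT pair implies a two-step pattern: ask how many VEs are
accounted, then fetch the data into caller-allocated buffers. The sketch below is
illustrative only; it assumes the same /dev/vzctl node as above and that
VZCTL_TC_STAT_NUM returns the current number of accounted VEs, neither of which is
guaranteed by this header alone:

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/vzctl_netstat.h>

	/* Fetch the ids of all VEs that currently have traffic statistics. */
	static int read_stat_veids(int vzctl_fd, envid_t **ids_out)
	{
		struct vzctl_tc_get_stat_list req;
		int n;

		n = ioctl(vzctl_fd, VZCTL_TC_STAT_NUM, 0);	/* assumed semantics */
		if (n <= 0)
			return n;

		req.list = calloc(n, sizeof(envid_t));
		req.length = n;
		if (!req.list)
			return -1;

		n = ioctl(vzctl_fd, VZCTL_TC_GET_STAT_LIST, &req);
		if (n < 0) {
			free(req.list);
			return -1;
		}
		*ids_out = req.list;
		return n;
	}
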
--- /dev/null
+++ b/include/uapi/linux/vzctl_venet.h
@@ -0,0 +1,51 @@
+/*
+ *  include/uapi/linux/vzctl_venet.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_VZCTL_VENET_H
+#define _UAPI_VZCTL_VENET_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#endif
+
+#define VE_IP_ADD	1
+#define VE_IP_DEL	2
+#define VE_IP_EXT_ADD	3
+#define VE_IP_EXT_DEL	4
+
+struct vzctl_ve_ip_map {
+	envid_t		veid;
+	int		op;
+	struct sockaddr *addr;
+	int		addrlen;
+};
+
+#define VENETCTLTYPE		'('
+#define VENETCTL_VE_IP_MAP	_IOW(VENETCTLTYPE, 3, struct vzctl_ve_ip_map)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_ip_map {
+	envid_t		veid;
+	int		op;
+	compat_uptr_t	addr;
+	int		addrlen;
+};
+
+#define VENETCTL_COMPAT_VE_IP_MAP	_IOW(VENETCTLTYPE, 3, struct compat_vzctl_ve_ip_map)
+
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* _UAPI_VZCTL_VENET_H */
--- /dev/null
+++ b/include/uapi/linux/vzctl_veth.h
@@ -0,0 +1,39 @@
+/*
+ *  include/uapi/linux/vzctl_veth.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_VZCTL_VETH_H
+#define _UAPI_VZCTL_VETH_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#endif
+
+#define VE_ETH_ADD			1
+#define VE_ETH_DEL			2
+#define VE_ETH_ALLOW_MAC_CHANGE		3
+#define VE_ETH_DENY_MAC_CHANGE		4
+
+struct vzctl_ve_hwaddr {
+	envid_t		veid;
+	int		op;
+	unsigned char	dev_addr[6];
+	int		addrlen;
+	char		dev_name[16];
+	unsigned char	dev_addr_ve[6];
+	int		addrlen_ve;
+	char		dev_name_ve[16];
+};
+
+#define VETHCTLTYPE		'['
+#define VETHCTL_VE_HWADDR	_IOW(VETHCTLTYPE, 3, struct vzctl_ve_hwaddr)
+
+#endif /* _UAPI_VZCTL_VETH_H */
--- /dev/null
+++ b/include/uapi/linux/vziptable_defs.h
@@ -0,0 +1,79 @@
+/*
+ *  include/uapi/linux/vziptable_defs.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZIPTABLE_DEFS_H
+#define _UAPI_LINUX_VZIPTABLE_DEFS_H
+
+/*
+ * These masks represent modules.
+ *
+ * Strictly speaking, only a small subset of these bits is in use
+ * nowadays, but we MUST RESERVE every bit that was ever used, for the
+ * sake of ABI compatibility (i.e. compatibility with the vzctl
+ * user-space utility).
+ *
+ * DON'T EVER DELETE/MODIFY THESE BITS
+ */
+#define VE_IPT_GENERATE(name, shift)	name = (1U << shift)
+
+enum ve_ipt_mods {
+	VE_IPT_GENERATE(VE_IP_IPTABLES_MOD,		 0),
+	VE_IPT_GENERATE(VE_IP_FILTER_MOD,		 1),
+	VE_IPT_GENERATE(VE_IP_MANGLE_MOD,		 2),
+	VE_IPT_GENERATE(VE_IP_MATCH_LIMIT_MOD,		 3),
+	VE_IPT_GENERATE(VE_IP_MATCH_MULTIPORT_MOD,	 4),
+	VE_IPT_GENERATE(VE_IP_MATCH_TOS_MOD,		 5),
+	VE_IPT_GENERATE(VE_IP_TARGET_TOS_MOD,		 6),
+	VE_IPT_GENERATE(VE_IP_TARGET_REJECT_MOD,	 7),
+	VE_IPT_GENERATE(VE_IP_TARGET_TCPMSS_MOD,	 8),
+	VE_IPT_GENERATE(VE_IP_MATCH_TCPMSS_MOD,		 9),
+	VE_IPT_GENERATE(VE_IP_MATCH_TTL_MOD,		10),
+	VE_IPT_GENERATE(VE_IP_TARGET_LOG_MOD,		11),
+	VE_IPT_GENERATE(VE_IP_MATCH_LENGTH_MOD,		12),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_MOD,		14),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_FTP_MOD,	15),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_IRC_MOD,	16),
+	VE_IPT_GENERATE(VE_IP_MATCH_CONNTRACK_MOD,	17),
+	VE_IPT_GENERATE(VE_IP_MATCH_STATE_MOD,		18),
+	VE_IPT_GENERATE(VE_IP_MATCH_HELPER_MOD,		19),
+	VE_IPT_GENERATE(VE_IP_NAT_MOD,			20),
+	VE_IPT_GENERATE(VE_IP_NAT_FTP_MOD,		21),
+	VE_IPT_GENERATE(VE_IP_NAT_IRC_MOD,		22),
+	VE_IPT_GENERATE(VE_IP_TARGET_REDIRECT_MOD,	23),
+	VE_IPT_GENERATE(VE_IP_MATCH_OWNER_MOD,		24),
+	VE_IPT_GENERATE(VE_IP_MATCH_MAC_MOD,		25),
+	VE_IPT_GENERATE(VE_IP_IPTABLES6_MOD,		26),
+	VE_IPT_GENERATE(VE_IP_FILTER6_MOD,		27),
+	VE_IPT_GENERATE(VE_IP_MANGLE6_MOD,		28),
+	VE_IPT_GENERATE(VE_IP_IPTABLE_NAT_MOD,		29),
+	VE_IPT_GENERATE(VE_NF_CONNTRACK_MOD,		30),
+};
+
+/* these masks represent modules together with their dependencies */
+#define VE_IP_IPTABLES		(VE_IP_IPTABLES_MOD)
+#define VE_IP_FILTER		(VE_IP_FILTER_MOD | VE_IP_IPTABLES)
+#define VE_IP_MANGLE		(VE_IP_MANGLE_MOD | VE_IP_IPTABLES)
+#define VE_IP_IPTABLES6		(VE_IP_IPTABLES6_MOD)
+#define VE_IP_FILTER6		(VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
+#define VE_IP_MANGLE6		(VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
+#define VE_NF_CONNTRACK		(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK		(VE_IP_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK_FTP	(VE_IP_CONNTRACK_FTP_MOD | VE_IP_CONNTRACK)
+#define VE_IP_CONNTRACK_IRC	(VE_IP_CONNTRACK_IRC_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT		(VE_IP_NAT_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT_FTP		(VE_IP_NAT_FTP_MOD | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
+#define VE_IP_NAT_IRC		(VE_IP_NAT_IRC_MOD | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
+#define VE_IP_IPTABLE_NAT	(VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
+
+/* safe iptables mask to be used by default */
+#define VE_IP_DEFAULT		(VE_IP_IPTABLES | VE_IP_FILTER | VE_IP_MANGLE | \
+				 VE_IP_IPTABLES6 | VE_IP_FILTER6 | VE_IP_MANGLE6)
+
+#define VE_IP_NONE		(0ull)
+#define VE_IP_ALL		(~VE_IP_NONE)
+
+#endif /* _UAPI_LINUX_VZIPTABLE_DEFS_H */
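
The dependency macros compose by plain bit-OR, so enabling one feature automatically
pulls in everything it requires. For example, VE_IP_NAT_FTP expands to bits 0, 14, 15,
20 and 21 (iptables core, conntrack, conntrack FTP, NAT and NAT FTP). A tiny
self-check of that expansion, assuming the header is exported to user space:

	#include <assert.h>
	#include <linux/vziptable_defs.h>

	int main(void)
	{
		unsigned long long want =
			(1ULL <<  0) |	/* VE_IP_IPTABLES_MOD */
			(1ULL << 14) |	/* VE_IP_CONNTRACK_MOD */
			(1ULL << 15) |	/* VE_IP_CONNTRACK_FTP_MOD */
			(1ULL << 20) |	/* VE_IP_NAT_MOD */
			(1ULL << 21);	/* VE_IP_NAT_FTP_MOD */

		assert(VE_IP_NAT_FTP == want);
		return 0;
	}
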
--- /dev/null
+++ b/include/uapi/linux/vzlist.h
@@ -0,0 +1,46 @@
+/*
+ *  include/uapi/linux/vzlist.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZLIST_H
+#define _UAPI_LINUX_VZLIST_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+struct vzlist_veidctl {
+	unsigned int	num;
+	envid_t	__user	*id;
+};
+
+struct vzlist_vepidctl {
+	envid_t		veid;
+	unsigned int	num;
+	pid_t __user	*pid;
+};
+
+struct vzlist_veipctl {
+	envid_t		veid;
+	unsigned int	num;
+	void __user	*ip;
+};
+
+#define VZLISTTYPE		'x'
+#define VZCTL_GET_VEIDS		_IOR(VZLISTTYPE, 1, struct vzlist_veidctl)
+#define VZCTL_GET_VEPIDS	_IOR(VZLISTTYPE, 2, struct vzlist_vepidctl)
+#define VZCTL_GET_VEIPS		_IOR(VZLISTTYPE, 3, struct vzlist_veipctl)
+#define VZCTL_GET_VEIP6S	_IOR(VZLISTTYPE, 4, struct vzlist_veipctl)
+
+#endif /* _UAPI_LINUX_VZLIST_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -400,7 +400,7 @@ config TASK_XACCT
 
 config TASK_IO_ACCOUNTING
 	bool "Enable per-task storage I/O accounting"
-	depends on TASK_XACCT
+	depends on TASK_XACCT && BEANCOUNTERS
 	help
 	  Collect information on the number of bytes of storage I/O which this
 	  task has caused.
@@ -1009,9 +1009,13 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_CPULIMIT
+	bool
+
 config CFS_BANDWIDTH
 	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
 	depends on FAIR_GROUP_SCHED
+	select CFS_CPULIMIT
 	default n
 	help
 	  This option allows users to define CPU bandwidth rates (limits) for
--- a/init/main.c
+++ b/init/main.c
@@ -59,7 +59,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
-#include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/kgdb.h>
@@ -79,6 +78,9 @@
 #include <linux/context_tracking.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -117,6 +119,12 @@ bool early_boot_irqs_disabled __read_mostly;
 enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
+#ifdef CONFIG_VE
+extern void init_ve_system(void);
+#else
+#define init_ve_system()		do { } while (0)
+#endif
+
 /*
  * Boot command-line arguments
  */
@@ -519,6 +527,8 @@ asmlinkage void __init start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
+	ub_init_early();
+	kstat_init();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
 	build_all_zonelists(NULL, NULL);
@@ -608,8 +618,8 @@ asmlinkage void __init start_kernel(void)
 	}
 #endif
 	page_cgroup_init();
-	debug_objects_mem_init();
 	kmemleak_init();
+	debug_objects_mem_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
@@ -638,6 +648,7 @@ asmlinkage void __init start_kernel(void)
 	proc_root_init();
 #endif
 	cgroup_init();
+	ub_init_late();
 	cpuset_init();
 	taskstats_init_early();
 	delayacct_init();
@@ -853,7 +864,9 @@ static void __init do_initcalls(void)
  */
 static void __init do_basic_setup(void)
 {
+	init_ve_system();
 	cpuset_init_smp();
+	usermodehelper_init();
 	shmem_init();
 	driver_init();
 	init_irq_proc();
--- a/init/version.c
+++ b/init/version.c
@@ -13,6 +13,7 @@
 #include <generated/utsrelease.h>
 #include <linux/version.h>
 #include <linux/proc_ns.h>
+#include <linux/init_task.h>
 
 #ifndef CONFIG_KALLSYMS
 #define version(a) Version_ ## a
@@ -39,6 +40,12 @@ struct uts_namespace init_uts_ns = {
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
+struct new_utsname virt_utsname = {
+	/* we need only this field */
+	.release        = UTS_RELEASE,
+};
+EXPORT_SYMBOL(virt_utsname);
+
 /* FIXED STRINGS! Don't touch! */
 const char linux_banner[] =
 	"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -165,28 +165,28 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "shmmax",
 		.data		= &init_ipc_ns.shm_ctlmax,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmax),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmall",
 		.data		= &init_ipc_ns.shm_ctlall,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlall),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmmni",
 		.data		= &init_ipc_ns.shm_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmni),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec,
 	},
 	{
 		.procname	= "shm_rmid_forced",
 		.data		= &init_ipc_ns.shm_rmid_forced,
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -195,7 +195,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msgmax",
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -204,7 +204,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msgmni",
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_callback_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -213,7 +213,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	=  "msgmnb",
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmnb),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -222,14 +222,14 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "sem",
 		.data		= &init_ipc_ns.sem_ctls,
 		.maxlen		= 4*sizeof (int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec,
 	},
 	{
 		.procname	= "auto_msgmni",
 		.data		= &init_ipc_ns.auto_msgmni,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipcauto_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -239,7 +239,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -248,7 +248,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -257,7 +257,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -266,18 +266,14 @@ static struct ctl_table ipc_kern_table[] = {
 	{}
 };
 
-static struct ctl_table ipc_root_table[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= ipc_kern_table,
-	},
+static struct ctl_path ipc_path[] = {
+	{ .procname = "kernel", },
 	{}
 };
 
 static int __init ipc_sysctl_init(void)
 {
-	register_sysctl_table(ipc_root_table);
+	register_sysctl_paths(ipc_path, ipc_kern_table);
 	return 0;
 }
 
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
+#include <linux/stat.h>
 
 #ifdef CONFIG_PROC_SYSCTL
 static void *get_mq(ctl_table *table)
@@ -58,14 +59,14 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "queues_max",
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
@@ -74,7 +75,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msgsize_max",
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
@@ -83,7 +84,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msg_default",
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
@@ -92,7 +93,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msgsize_default",
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1396,7 +1396,7 @@ static struct file_system_type mqueue_fs_type = {
 	.name = "mqueue",
 	.mount = mqueue_mount,
 	.kill_sb = kill_litter_super,
-	.fs_flags = FS_USERNS_MOUNT,
+	.fs_flags = FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int mq_init_ns(struct ipc_namespace *ns)
@@ -1433,7 +1433,7 @@ static int __init init_mqueue_fs(void)
 
 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
 				sizeof(struct mqueue_inode_info), 0,
-				SLAB_HWCACHE_ALIGN, init_once);
+				SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -52,7 +52,7 @@ static struct msg_msg *alloc_msg(size_t len)
 	size_t alen;
 
 	alen = min(len, DATALEN_MSG);
-	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
 	if (msg == NULL)
 		return NULL;
 
@@ -64,7 +64,7 @@ static struct msg_msg *alloc_msg(size_t len)
 	while (len > 0) {
 		struct msg_msgseg *seg;
 		alen = min(len, DATALEN_SEG);
-		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL);
+		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
 		if (seg == NULL)
 			goto out_err;
 		*pseg = seg;
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1606,7 +1606,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
 
 	undo_list = current->sysvsem.undo_list;
 	if (!undo_list) {
-		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
 		if (undo_list == NULL)
 			return -ENOMEM;
 		spin_lock_init(&undo_list->lock);
@@ -1690,7 +1690,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	rcu_read_unlock();
 
 	/* step 2: allocate new undo structure */
-	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
+			GFP_KERNEL_ACCOUNT);
 	if (!new) {
 		ipc_rcu_putref(sma, ipc_rcu_free);
 		return ERR_PTR(-ENOMEM);
@@ -1780,7 +1781,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	if (nsops > ns->sc_semopm)
 		return -E2BIG;
 	if(nsops > SEMOPM_FAST) {
-		sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
+		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_ACCOUNT);
 		if(sops==NULL)
 			return -ENOMEM;
 	}
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -155,9 +155,13 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
 {
 	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
 
+	/*
+	 * Callers of shm_lock() must validate the status of the returned ipc
+	 * object pointer (as returned by ipc_lock()), and error out as
+	 * appropriate.
+	 */
 	if (IS_ERR(ipcp))
 		return (struct shmid_kernel *)ipcp;
-
 	return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
@@ -182,19 +186,32 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 }
 
 
-/* This is called by fork, once for every shm attach. */
-static void shm_open(struct vm_area_struct *vma)
+static int __shm_open(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 	struct shmid_kernel *shp;
 
 	shp = shm_lock(sfd->ns, sfd->id);
-	BUG_ON(IS_ERR(shp));
+	if (IS_ERR(shp))
+		return PTR_ERR(shp);
+
 	shp->shm_atim = get_seconds();
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_nattch++;
 	shm_unlock(shp);
+	return 0;
+}
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+	int err = __shm_open(vma);
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	WARN_ON_ONCE(err);
 }
 
 /*
@@ -256,7 +273,14 @@ static void shm_close(struct vm_area_struct *vma)
 	down_write(&shm_ids(ns).rwsem);
 	/* remove from the list of attaches of the shm segment */
 	shp = shm_lock(ns, sfd->id);
-	BUG_ON(IS_ERR(shp));
+
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	if (WARN_ON_ONCE(IS_ERR(shp)))
+		goto done; /* no-op */
+
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_dtim = get_seconds();
 	shp->shm_nattch--;
@@ -264,6 +288,7 @@ static void shm_close(struct vm_area_struct *vma)
 		shm_destroy(ns, shp);
 	else
 		shm_unlock(shp);
+done:
 	up_write(&shm_ids(ns).rwsem);
 }
 
@@ -384,17 +409,25 @@ static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 	struct shm_file_data *sfd = shm_file_data(file);
 	int ret;
 
+	/*
+	 * In case of remap_file_pages() emulation, the file can represent
+	 * removed IPC ID: propagate shm_lock() error to caller.
+	 */
+	ret = __shm_open(vma);
+	if (ret)
+		return ret;
+
 	ret = sfd->file->f_op->mmap(sfd->file, vma);
-	if (ret != 0)
+	if (ret) {
+		shm_close(vma);
 		return ret;
+	}
 	sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
 	BUG_ON(!sfd->vm_ops->fault);
 #endif
 	vma->vm_ops = &shm_vm_ops;
-	shm_open(vma);
-
-	return ret;
+	return 0;
 }
 
 static int shm_release(struct inode *ino, struct file *file)
@@ -1041,8 +1074,8 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
  * "raddr" thing points to kernel space, and there has to be a wrapper around
  * this.
  */
-long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
-	      unsigned long shmlba)
+long do_shmat(int shmid, char __user *shmaddr, int shmflg,
+	      ulong *raddr, unsigned long shmlba)
 {
 	struct shmid_kernel *shp;
 	unsigned long addr;
@@ -1063,8 +1096,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 		goto out;
 	else if ((addr = (ulong)shmaddr)) {
 		if (addr & (shmlba - 1)) {
-			if (shmflg & SHM_RND)
-				addr &= ~(shmlba - 1);	   /* round down */
+			/*
+			 * Round down to the nearest multiple of shmlba.
+			 * For sane do_mmap_pgoff() parameters, avoid
+			 * round downs that trigger nil-page and MAP_FIXED.
+			 */
+			if ((shmflg & SHM_RND) && addr >= shmlba)
+				addr &= ~(shmlba - 1);
 			else
 #ifndef __ARCH_FORCE_SHMLBA
 				if (addr & ~PAGE_MASK)
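
A quick worked example of the new SHM_RND guard, assuming SHMLBA is 0x4000 on the
architecture in question:

	shmat(id, (void *)0x40002345, SHM_RND)  ->  hint rounds down to 0x40000000, as before
	shmat(id, (void *)0x00002345, SHM_RND)  ->  hint is below shmlba, so the round-down is
	                                            skipped; the unaligned address is then
	                                            rejected by the existing alignment check
	                                            instead of silently becoming a MAP_FIXED
	                                            mapping at address 0 (the nil page)
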
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -182,8 +182,8 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
 	iface->show	= show;
 
 	pde = proc_create_data(path,
-			       S_IRUGO,        /* world readable */
-			       NULL,           /* parent dir */
+			       S_ISVTX | S_IRUGO,	/* world readable */
+			       NULL,			/* parent dir */
 			       &sysvipc_proc_fops,
 			       iface);
 	if (!pde) {
@@ -466,9 +466,9 @@ void *ipc_alloc(int size)
 {
 	void *out;
 	if(size > PAGE_SIZE)
-		out = vmalloc(size);
+		out = vmalloc_account(size);
 	else
-		out = kmalloc(size, GFP_KERNEL);
+		out = kmalloc(size, GFP_KERNEL_ACCOUNT);
 	return out;
 }
 
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -130,7 +130,7 @@ int ipc_rcu_getref(void *ptr);
 void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
 void ipc_rcu_free(struct rcu_head *head);
 
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+extern struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
--- /dev/null
+++ b/kernel/Kconfig.openvz
@@ -0,0 +1,120 @@
+# kernel/Kconfig.openvz
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+menu "OpenVZ"
+
+config VE
+	bool "Virtual Environment support"
+	default y
+	select NAMESPACES
+	select PID_NS
+	select IPC_NS
+	select UTS_NS
+	select NET_NS
+	select USER_NS
+	select CGROUPS
+	select CGROUP_DEVICE
+	select CGROUP_FREEZER
+	select CGROUP_PERF
+	select SCHEDSTATS
+	help
+	  This option adds support for Virtual Environments: isolated Linux
+	  instances running on the host, with a fully supported virtual network
+	  driver, a tty subsystem and configurable access to hardware and other
+	  resources.
+
+config VE_CALLS
+	tristate "VE calls interface"
+	depends on VE
+	select VZ_DEV
+	default m
+	help
+	  This option controls how the vzmon code containing the VE calls is
+	  built.  By default it is built as the vzmon.o module.
+
+config VZ_GENCALLS
+	bool
+	default y
+
+config VE_NETDEV
+	tristate "VE network device"
+	depends on VE_CALLS && NET
+	select VZ_DEV
+	default m
+	help
+	  This option controls whether to build the venet device, which is the
+	  common network interface for VEs.
+
+config VZ_DEV
+	tristate "VE device"
+	default m
+	help
+	  This option adds support for the vzdev device, which is used by
+	  user-space applications to control Virtual Environments.
+
+config VE_IPTABLES
+	bool "VE netfiltering"
+	depends on VE && VE_NETDEV && INET && NETFILTER
+	default y
+	help
+	  This option controls whether to build VE netfiltering code.
+
+config VZ_LIST
+	tristate "VE listing/statistics user ioctl interface"
+	depends on VE
+	default m
+	help
+	  This option controls building of the vzlist module, which provides
+	  ioctl interfaces for fetching VE ids, IP addresses and the pids of
+	  running processes.
+
+config VE_NETDEV_ACCOUNTING
+	tristate "VE networking accounting"
+	depends on VE_NETDEV
+	default m
+	help
+	  This option enables traffic accounting on the virtual networking
+	  device and on real devices moved into a Virtual Environment.
+
+config VZ_WDOG
+	tristate "VE watchdog module"
+	depends on VE_CALLS
+	default m
+	help
+	  This option controls building of the vzwdog module, which
+	  periodically dumps useful system information to the console.
+
+config VZ_EVENT
+	tristate "Enable sending notifications of VE status changes through a netlink socket"
+	depends on VE && VE_CALLS && NET
+	default m
+	help
+	  This option enables sending notifications of VE events to
+	  interested user-space applications through a netlink socket,
+	  just like the core kernel networking code does.  For now only
+	  notifications of essential VE status changes are sent.
+
+config FENCE_WATCHDOG
+	bool "Fencing watchdog for HA cluster support"
+	depends on X86_64
+	default n
+
+config VZ_IOLIMIT
+	tristate "Container IO-limiting"
+	depends on VE && VE_CALLS && BC_IO_ACCOUNTING
+	default m
+	help
+	  This option provides the I/O limiting module.
+
+config VE_TUNTAP_ACCOUNTING
+	bool "Accounting for tun/tap devices"
+	depends on VE_NETDEV_ACCOUNTING && TUN
+	default y
+	help
+	  This option enables accounting for tun/tap devices.
+
+endmenu
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,17 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
 obj-y += sched/
 obj-y += power/
 obj-y += cpu/
@@ -32,6 +43,9 @@ obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+obj-$(CONFIG_BEANCOUNTERS) += bc/
+obj-y += ve/
+
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
@@ -83,6 +97,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
@@ -119,6 +134,7 @@ obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_FENCE_WATCHDOG) += fence-watchdog.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -271,7 +271,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 {
 	int error = 0;
 
-	if (!capable(CAP_SYS_PACCT))
+	if (!ve_capable(CAP_SYS_PACCT))
 		return -EPERM;
 
 	if (name) {
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <asm/syscall.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/uaccess.h>
 #include <linux/compat.h>
 #include <linux/ctype.h>
 
@@ -1977,7 +1978,7 @@ static int audit_set_loginuid_perm(kuid_t loginuid)
 	if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
 		return -EPERM;
 	/* it is set, you need permission */
-	if (!capable(CAP_AUDIT_CONTROL))
+	if (!ve_capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 	/* reject if this is not an unset and we don't allow that */
 	if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
--- /dev/null
+++ b/kernel/bc/Kconfig
@@ -0,0 +1,55 @@
+#
+# User resources part (UBC)
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+menu "User resources"
+
+config BEANCOUNTERS
+	bool "Enable user resource accounting"
+	default y
+	select CGROUPS
+	select MEMCG
+	select MEMCG_KMEM
+	select MEMCG_SWAP if SWAP
+	select MEMCG_SWAP_ENABLED if SWAP
+	help
+	  This option provides accounting and allows configuring limits for
+	  a user's consumption of exhaustible system resources.  The most
+	  important resource it controls is unswappable memory (either
+	  mlock'ed or used by internal kernel structures and buffers).  The
+	  main goal is to protect processes from running short of important
+	  resources because of accidental misbehavior of processes or
+	  malicious activity aiming to ``kill'' the system.  It is worth
+	  mentioning that the resource limits configured by setrlimit(2) do
+	  not give an acceptable level of protection, because they cover
+	  only a small fraction of resources and work on a per-process
+	  basis.  Per-process accounting does not prevent malicious users
+	  from spawning a lot of resource-consuming processes.
+
+config BC_IO_ACCOUNTING
+	bool "Account file I/O"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This option allows observing the I/O activity caused by tasks in each UB.
+
+config BC_IO_PRIORITY
+	bool "Disk I/O priority"
+	default y
+	depends on BEANCOUNTERS
+	select BLK_CGROUP
+	help
+	  This option adds a compatibility layer on top of the blkio cgroup
+	  for grouping and prioritizing disk access.
+
+config BC_PROC
+	bool "Report resource usage in /proc"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  Allows a system administrator to inspect resource accounts and limits.
+
+endmenu
--- /dev/null
+++ b/kernel/bc/Makefile
@@ -0,0 +1,13 @@
+#
+# User resources part (UBC)
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-y := sys.o beancounter.o misc.o \
+	 vm_pages.o statd.o
+
+obj-$(CONFIG_BC_PROC)  += proc.o
+obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o
+obj-$(CONFIG_BC_IO_PRIORITY) += io_prio.o
--- /dev/null
+++ b/kernel/bc/beancounter.c
@@ -0,0 +1,1207 @@
+/*
+ *  kernel/bc/beancounter.c
+ *
+ *  Copyright (C) 1998  Alan Cox
+ *                1998-2000  Andrey V. Savochkin <saw@saw.sw.com.sg>
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ * TODO:
+ *   - more intelligent limit check in mremap(): currently the new size is
+ *     charged and _then_ old size is uncharged
+ *     (almost done: !move_vma case is completely done,
+ *      move_vma in its current implementation requires too many conditions to
+ *      do things right, because it may be not only expansion, but shrinking
+ *      also, plus do_munmap will require an additional parameter...)
+ *   - problem: bad pmd page handling
+ *   - consider /proc redesign
+ *   - TCP/UDP ports
+ *   + consider whether __charge_beancounter_locked should be inline
+ *
+ * Changes:
+ *   1999/08/17  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- Set "barrier" and "limit" parts of limits atomically.
+ *   1999/10/06  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- setublimit system call.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <linux/cgroup.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroup.h>
+#include <linux/task_work.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+struct user_beancounter ub0 = {
+};
+EXPORT_SYMBOL(ub0);
+
+const char *ub_rnames[] = {
+	"kmemsize",	/* 0 */
+	"lockedpages",
+	"privvmpages",
+	"shmpages",
+	"dummy",
+	"numproc",	/* 5 */
+	"physpages",
+	"vmguarpages",
+	"oomguarpages",
+	"numtcpsock",
+	"numflock",	/* 10 */
+	"numpty",
+	"numsiginfo",
+	"tcpsndbuf",
+	"tcprcvbuf",
+	"othersockbuf",	/* 15 */
+	"dgramrcvbuf",
+	"numothersock",
+	"dcachesize",
+	"numfile",
+	"dummy",	/* 20 */
+	"dummy",
+	"dummy",
+	"numiptent",
+	"swappages",
+};
+
+/* default maximum perpcu resources precharge */
+int ub_resource_precharge[UB_RESOURCES] = {
+	[UB_PRIVVMPAGES]= 256,
+	[UB_NUMPROC]	= 4,
+	[UB_NUMSIGINFO]	= 4,
+	[UB_NUMFILE]	= 8,
+};
+
+/* natural limits for percpu precharge bounds */
+static int resource_precharge_min = 0;
+static int resource_precharge_max = INT_MAX / NR_CPUS;
+
+static struct vfsmount *ub_cgroup_mnt;
+static struct vfsmount *ub_bound_cgroup_mnt[NR_UB_BOUND_CGROUPS];
+
+#define mem_cgroup_mnt		(ub_bound_cgroup_mnt[UB_MEM_CGROUP])
+#define blkio_cgroup_mnt	(ub_bound_cgroup_mnt[UB_BLKIO_CGROUP])
+
+static void __ub_set_css(struct user_beancounter *ub, int idx,
+			 struct cgroup_subsys_state *css)
+{
+	struct cgroup_subsys_state *old_css;
+	unsigned long flags;
+
+	if (css)
+		css_get(css);
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	old_css = ub->ub_bound_css[idx];
+	ACCESS_ONCE(ub->ub_bound_css[idx]) = css;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	if (old_css)
+		css_put(old_css);
+}
+
+struct cgroup_subsys_state *__ub_get_css(struct user_beancounter *ub, int idx)
+{
+	struct cgroup_subsys_state *css, *root_css;
+	unsigned long flags;
+
+	rcu_read_lock();
+retry:
+	css = ACCESS_ONCE(ub->ub_bound_css[idx]);
+	if (likely(css && css_tryget(css))) {
+		rcu_read_unlock();
+		return css;
+	}
+
+	root_css = ub0.ub_bound_css[idx];
+
+	/* cgroup was removed, fall back to the root */
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (unlikely(ub->ub_bound_css[idx] != css)) {
+		/* someone did it for us, retry */
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+		goto retry;
+	}
+	ACCESS_ONCE(ub->ub_bound_css[idx]) = root_css;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	rcu_read_unlock();
+
+	if (css)
+		css_put(css);
+
+	css_get(root_css);
+	return root_css;
+}
+
+static void ub_set_mem_css(struct user_beancounter *ub,
+				  struct cgroup_subsys_state *css)
+{
+	__ub_set_css(ub, UB_MEM_CGROUP, css);
+}
+
+static void ub_set_blkio_css(struct user_beancounter *ub,
+			     struct cgroup_subsys_state *css)
+{
+	__ub_set_css(ub, UB_BLKIO_CGROUP, css);
+}
+
+/*
+ * Used to attach a task to a beancounter in the legacy API.
+ */
+int ub_attach_task(struct user_beancounter *ub, struct task_struct *tsk)
+{
+	int ret = 0;
+	struct user_beancounter *old_ub = tsk->task_bc.exec_ub;
+	struct cgroup_subsys_state *css;
+
+	if (ub == old_ub)
+		goto out;
+	css = ub_get_mem_css(ub);
+	ret = cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	if (ret)
+		goto out;
+	css = ub_get_blkio_css(ub);
+	ret = cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	if (ret)
+		goto fail_blkio;
+	ret = cgroup_kernel_attach(ub->css.cgroup, tsk);
+	if (ret)
+		goto fail_ub;
+out:
+	return ret;
+fail_ub:
+	css = ub_get_blkio_css(old_ub);
+	cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+fail_blkio:
+	css = ub_get_mem_css(old_ub);
+	cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	goto out;
+}
+
+extern void mem_cgroup_sync_beancounter(struct mem_cgroup *memcg,
+					struct user_beancounter *ub);
+extern int mem_cgroup_apply_beancounter(struct mem_cgroup *memcg,
+					struct user_beancounter *ub);
+extern void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+				    unsigned long *pages);
+extern unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg,
+					    bool swap);
+
+/*
+ * Update memcg limits according to beancounter configuration.
+ */
+int ub_update_memcg(struct user_beancounter *ub)
+{
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_apply_beancounter(mem_cgroup_from_cont(css->cgroup),
+					   ub);
+	css_put(css);
+	return ret;
+}
+
+/*
+ * Synchronize memcg stats with beancounter.
+ */
+void ub_sync_memcg(struct user_beancounter *ub)
+{
+	struct cgroup_subsys_state *css;
+
+	css = ub_get_mem_css(ub);
+	mem_cgroup_sync_beancounter(mem_cgroup_from_cont(css->cgroup), ub);
+	css_put(css);
+}
+
+unsigned long ub_total_pages(struct user_beancounter *ub, bool swap)
+{
+	struct cgroup_subsys_state *css;
+	unsigned long ret;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_total_pages(mem_cgroup_from_cont(css->cgroup), swap);
+	css_put(css);
+	return ret;
+}
+
+void init_beancounter_precharge(struct user_beancounter *ub, int resource)
+{
+	/* limit maximum precharge with one half of current resource excess */
+	ub->ub_parms[resource].max_precharge = min_t(long,
+			ub_resource_precharge[resource],
+			ub_resource_excess(ub, resource, UB_SOFT) /
+			(2 * num_possible_cpus()));
+}
+
+static void init_beancounter_precharges(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		init_beancounter_precharge(ub, resource);
+}
+
+static void __init init_beancounter_precharges_early(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ ) {
+
+		/* DEBUG: sanity checks for initial precharge bounds */
+		BUG_ON(ub_resource_precharge[resource] < resource_precharge_min);
+		BUG_ON(ub_resource_precharge[resource] > resource_precharge_max);
+
+		ub->ub_parms[resource].max_precharge =
+			ub_resource_precharge[resource];
+	}
+}
+
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge)
+{
+	int cpu, resource;
+
+	memset(precharge, 0, sizeof(int) * UB_RESOURCES);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+		for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+			precharge[resource] += pcpu->precharge[resource];
+	}
+}
+
+static void uncharge_beancounter_precharge(struct user_beancounter *ub)
+{
+	int resource, precharge[UB_RESOURCES];
+
+	ub_precharge_snapshot(ub, precharge);
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		ub->ub_parms[resource].held -= precharge[resource];
+}
+
+static void init_beancounter_struct(struct user_beancounter *ub);
+static void init_beancounter_nolimits(struct user_beancounter *ub);
+
+static DEFINE_SPINLOCK(ub_list_lock);
+LIST_HEAD(ub_list_head); /* protected by ub_list_lock */
+EXPORT_SYMBOL(ub_list_head);
+int ub_count;
+
+/*
+ *	Per user resource beancounting. Resources are tied to their luid.
+ *	The resource structure itself is tagged both to the process and
+ *	the charging resources (a socket doesn't want to have to search for
+ *	things at irq time for example). Reference counters keep things in
+ *	hand.
+ *
+ *	The case where a user creates resource, kills all his processes and
+ *	The case where a user creates a resource, kills all his processes and
+ *	then starts new ones is correctly handled this way. The refcounters
+ *	ensure the old entry is still around with the resource tied to it.
+
+static struct user_beancounter *alloc_ub(const char *name)
+{
+	struct user_beancounter *new_ub;
+
+	new_ub = kzalloc(sizeof(*new_ub), GFP_KERNEL);
+	if (new_ub == NULL)
+		return NULL;
+
+	init_beancounter_nolimits(new_ub);
+	init_beancounter_struct(new_ub);
+
+	init_beancounter_precharges(new_ub);
+
+	new_ub->ub_name = kstrdup(name, GFP_KERNEL);
+	if (!new_ub->ub_name)
+		goto fail_name;
+
+	new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
+	if (new_ub->ub_percpu == NULL)
+		goto fail_free;
+
+	return new_ub;
+
+fail_free:
+	kfree(new_ub->ub_name);
+fail_name:
+	kfree(new_ub);
+	return NULL;
+}
+
+static inline void free_ub(struct user_beancounter *ub)
+{
+	free_percpu(ub->ub_percpu);
+	kfree(ub->ub_store);
+	kfree(ub->ub_name);
+	kfree(ub->iolimit);
+	kfree(ub);
+}
+
+/*
+ * Used to lookup or create a beancounter in the legacy API.
+ */
+struct user_beancounter *get_beancounter_by_name(const char *name, int create)
+{
+	struct user_beancounter *ub;
+	struct cgroup *cg, *ub_cg;
+	int err = 0;
+
+	if (!strcmp(name, get_ub0()->ub_name))
+		return get_beancounter(get_ub0());
+
+	ub_cg = cgroup_kernel_open(cgroup_get_root(ub_cgroup_mnt), 0, name);
+	if (IS_ERR(ub_cg))
+		return NULL;
+	if (ub_cg) {
+		ub = cgroup_ub(ub_cg);
+		goto out;
+	}
+	if (!create)
+		return NULL;
+
+	/* The beancounter does not exist and we were asked to create it */
+
+	ub_cg = cgroup_kernel_open(cgroup_get_root(ub_cgroup_mnt),
+				   CGRP_CREAT, name);
+	if (IS_ERR(ub_cg))
+		return ERR_CAST(ub_cg);
+
+	ub = cgroup_ub(ub_cg);
+
+	cg = cgroup_kernel_open(cgroup_get_root(mem_cgroup_mnt),
+				CGRP_CREAT, name);
+	err = PTR_ERR(cg);
+	if (IS_ERR(cg))
+		goto out;
+
+	ub_set_mem_css(ub, cgroup_subsys_state(cg, mem_cgroup_subsys_id));
+	cgroup_kernel_close(cg);
+
+	cg = cgroup_kernel_open(cgroup_get_root(blkio_cgroup_mnt),
+				CGRP_CREAT, name);
+	err = PTR_ERR(cg);
+	if (IS_ERR(cg))
+		goto out;
+
+	ub_set_blkio_css(ub, cgroup_subsys_state(cg, blkio_subsys_id));
+	cgroup_kernel_close(cg);
+
+	err = ub_update_memcg(cgroup_ub(ub_cg));
+	if (err)
+		pr_warn("Failed to init UB %s limits: %d\n", name, err);
+
+out:
+	if (!err)
+		get_beancounter(ub);
+	else
+		ub = NULL;
+
+	/* Don't care about cgroup removal on error, because currently we never
+	 * clean up beancounter cgroups in the legacy mode */
+	cgroup_kernel_close(ub_cg);
+	return ub;
+}
+
+struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
+{
+	char name[32];
+
+	snprintf(name, sizeof(name), "%u", uid);
+	return get_beancounter_by_name(name, create);
+}
+EXPORT_SYMBOL(get_beancounter_byuid);
+
+uid_t ub_legacy_id(struct user_beancounter *ub)
+{
+	uid_t id;
+
+	if (kstrtouint(ub->ub_name, 10, &id) != 0)
+		id = -1;
+	return id;
+}
+
+static int verify_res(struct user_beancounter *ub, const char *name,
+		unsigned long held)
+{
+	if (likely(held == 0))
+		return 1;
+
+	printk(KERN_WARNING "Ub %s holds %ld in %s on put\n",
+			ub->ub_name, held, name);
+	return 0;
+}
+
+static inline int bc_verify_held(struct user_beancounter *ub)
+{
+	int i, clean;
+
+	ub_stat_mod(ub, dirty_pages, __ub_percpu_sum(ub, dirty_pages));
+	ub_stat_mod(ub, writeback_pages, __ub_percpu_sum(ub, writeback_pages));
+	uncharge_beancounter_precharge(ub);
+
+	/* accounted by memcg */
+	ub->ub_parms[UB_KMEMSIZE].held = 0;
+	ub->ub_parms[UB_DCACHESIZE].held = 0;
+	ub->ub_parms[UB_PHYSPAGES].held = 0;
+	ub->ub_parms[UB_SWAPPAGES].held = 0;
+	ub->ub_parms[UB_OOMGUARPAGES].held = 0;
+	ub->ub_parms[UB_NUMTCPSOCK].held = 0;
+	ub->ub_parms[UB_TCPSNDBUF].held = 0;
+	ub->ub_parms[UB_TCPRCVBUF].held = 0;
+	ub->ub_parms[UB_OTHERSOCKBUF].held = 0;
+	ub->ub_parms[UB_DGRAMRCVBUF].held = 0;
+	ub->ub_parms[UB_NUMOTHERSOCK].held = 0;
+
+	clean = 1;
+	for (i = 0; i < UB_RESOURCES; i++)
+		clean &= verify_res(ub, ub_rnames[i], ub->ub_parms[i].held);
+
+	clean &= verify_res(ub, "dirty_pages",
+			__ub_stat_get(ub, dirty_pages));
+	clean &= verify_res(ub, "writeback_pages",
+			__ub_stat_get(ub, writeback_pages));
+
+	return clean;
+}
+
+static struct cgroup_subsys_state *ub_cgroup_css_alloc(struct cgroup *cg)
+{
+	struct user_beancounter *ub;
+
+	if (!cg->parent)
+		return &ub0.css;
+
+	/* forbid nested containers */
+	if (cgroup_ub(cg->parent) != &ub0)
+		return ERR_PTR(-EPERM);
+
+	ub = alloc_ub(cg->dentry->d_name.name);
+	if (!ub)
+		return ERR_PTR(-ENOMEM);
+
+	return &ub->css;
+}
+
+static int ub_cgroup_css_online(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+
+	if (!cg->parent)
+		return 0;
+
+	init_beancounter_nolimits(ub);
+	spin_lock(&ub_list_lock);
+	list_add_rcu(&ub->ub_list, &ub_list_head);
+	ub_count++;
+	spin_unlock(&ub_list_lock);
+	return 0;
+}
+
+static void ub_cgroup_css_offline(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+
+	spin_lock(&ub_list_lock);
+	ub_count--;
+	list_del_rcu(&ub->ub_list);
+	spin_unlock(&ub_list_lock);
+}
+
+static void ub_cgroup_css_free(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	int i;
+
+	for (i = 0; i < NR_UB_BOUND_CGROUPS; i++)
+		__ub_set_css(ub, i, NULL);
+
+	if (!bc_verify_held(ub)) {
+		printk(KERN_ERR "UB: leaked beancounter %s (%p)\n",
+				ub->ub_name, ub);
+		add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
+		return;
+	}
+	free_ub(ub);
+}
+
+static void __ub_cgroup_attach(struct task_struct *tsk)
+{
+	struct user_beancounter *ub;
+
+	rcu_read_lock();
+	do {
+		ub = cgroup_ub(task_cgroup(current, ub_subsys_id));
+		if (tsk->task_bc.exec_ub == ub)
+			goto out;
+	} while (!get_beancounter_rcu(ub));
+	put_beancounter(tsk->task_bc.exec_ub);
+	tsk->task_bc.exec_ub = ub;
+out:
+	rcu_read_unlock();
+}
+
+static void ub_cgroup_attach_work_fn(struct callback_head *ch)
+{
+	__ub_cgroup_attach(current);
+}
+
+static void ub_cgroup_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *p;
+
+	/*
+	 * task_bc->exec_ub can only be modified by the owner task so we use
+	 * task work to get things done
+	 */
+	cgroup_taskset_for_each(p, cg, tset) {
+		/*
+		 * kthreads cannot be kicked to run a task work so we just
+		 * don't change ub for them
+		 */
+		if (p->flags & PF_KTHREAD)
+			return;
+
+		init_task_work(&p->task_bc.cgroup_attach_work,
+			       ub_cgroup_attach_work_fn);
+		task_work_cancel(p, ub_cgroup_attach_work_fn);
+		task_work_add(p, &p->task_bc.cgroup_attach_work, true);
+	}
+}
+
+static void ub_cgroup_fork(struct task_struct *tsk, void *private)
+{
+	/*
+	 * If a forking task is moved between cgroups, the child will have
+	 * exec_ub set to the source cgroup while being attached to the
+	 * destination cgroup, because the parent's exec_ub will only change
+	 * when it returns to userspace (see ub_cgroup_attach). To avoid this
+	 * discrepancy, here we synchronize the child's exec_ub with its
+	 * cgroup. It is safe, because the task is not allowed to run yet and
+	 * therefore cannot get/set its exec_ub.
+	 */
+	__ub_cgroup_attach(tsk);
+}
+
+static ssize_t ub_cgroup_read(struct cgroup *cg, struct cftype *cft,
+			      struct file *file, char __user *buf,
+			      size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct cgroup_subsys_state *bound_css;
+	char *path;
+	int len;
+	ssize_t ret;
+
+	bound_css = __ub_get_css(ub, cft->private);
+
+	ret = -ENOMEM;
+	path = kmalloc(PATH_MAX + 1, GFP_KERNEL);
+	if (!path)
+		goto out;
+	ret = cgroup_path(bound_css->cgroup, path, PATH_MAX);
+	if (!ret) {
+		len = strlen(path);
+		path[len++] = '\n';
+		path[len] = '\0';
+		ret = simple_read_from_buffer(buf, nbytes, ppos, path, len);
+	}
+	kfree(path);
+out:
+	css_put(bound_css);
+	return ret;
+}
+
+static int ub_cgroup_write(struct cgroup *cg, struct cftype *cft,
+			   const char *buf)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct cgroup *bound_cg;
+
+	bound_cg = cgroup_kernel_lookup(ub_bound_cgroup_mnt[cft->private],
+					buf);
+	if (IS_ERR(bound_cg))
+		return PTR_ERR(bound_cg);
+
+	switch (cft->private) {
+	case UB_MEM_CGROUP:
+		ub_set_mem_css(ub, cgroup_subsys_state(bound_cg,
+					mem_cgroup_subsys_id));
+		break;
+	case UB_BLKIO_CGROUP:
+		ub_set_blkio_css(ub, cgroup_subsys_state(bound_cg,
+					blkio_subsys_id));
+		break;
+	}
+
+	cgroup_kernel_close(bound_cg);
+	return 0;
+}
+
+static struct cftype ub_cgroup_files[] = {
+	{
+		.name = "memory",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_MEM_CGROUP,
+		.write_string = ub_cgroup_write,
+		.read = ub_cgroup_read,
+	},
+	{
+		.name = "blkio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_BLKIO_CGROUP,
+		.write_string = ub_cgroup_write,
+		.read = ub_cgroup_read,
+	},
+	{ },	/* terminate */
+};
+
+enum {
+	UB_CGROUP_ATTR_HELD,
+	UB_CGROUP_ATTR_MAXHELD,
+	UB_CGROUP_ATTR_BARRIER,
+	UB_CGROUP_ATTR_LIMIT,
+	UB_CGROUP_ATTR_FAILCNT,
+	UB_CGROUP_NR_ATTRS,
+};
+
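+/* cftype->private packs the resource id and the attribute id together */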
+#define UB_CGROUP_PRIVATE(res, attr)	(((res) << 16) | (attr))
+#define UB_CGROUP_RES(val)		(((val) >> 16) & 0xffff)
+#define UB_CGROUP_ATTR(val)		((val) & 0xffff)
+
+static ssize_t ub_cgroup_resource_read(struct cgroup *cg, struct cftype *cft,
+				       struct file *file, char __user *buf,
+				       size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct ubparm *ubparm;
+	unsigned long val;
+	int res, attr;
+	int len;
+	char str[32];
+
+	res = UB_CGROUP_RES(cft->private);
+	attr = UB_CGROUP_ATTR(cft->private);
+
+	ubparm = &ub->ub_parms[res];
+
+	switch (attr) {
+	case UB_CGROUP_ATTR_HELD:
+		val = ubparm->held;
+		break;
+	case UB_CGROUP_ATTR_MAXHELD:
+		val = ubparm->maxheld;
+		break;
+	case UB_CGROUP_ATTR_BARRIER:
+		val = ubparm->barrier;
+		break;
+	case UB_CGROUP_ATTR_LIMIT:
+		val = ubparm->limit;
+		break;
+	case UB_CGROUP_ATTR_FAILCNT:
+		val = ubparm->failcnt;
+		break;
+	default:
+		BUG();
+	}
+
+	len = scnprintf(str, sizeof(str), "%lu\n", val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int ub_cgroup_resource_write(struct cgroup *cg, struct cftype *cft,
+				    u64 val)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct ubparm *ubparm;
+	int res, attr;
+
+	if (val > UB_MAXVALUE)
+		return -EINVAL;
+
+	res = UB_CGROUP_RES(cft->private);
+	attr = UB_CGROUP_ATTR(cft->private);
+
+	ubparm = &ub->ub_parms[res];
+
+	spin_lock_irq(&ub->ub_lock);
+	switch (attr) {
+	case UB_CGROUP_ATTR_BARRIER:
+		ubparm->barrier = val;
+		break;
+	case UB_CGROUP_ATTR_LIMIT:
+		ubparm->limit = val;
+		break;
+	default:
+		BUG();
+	}
+	init_beancounter_precharge(ub, res);
+	spin_unlock_irq(&ub->ub_lock);
+	return 0;
+}
+
+static __init int ub_cgroup_init(void)
+{
+	static struct cftype cgroup_files[UB_RESOURCES * UB_CGROUP_NR_ATTRS + 1];
+	struct cftype *cft;
+	int i, j;
+
+	for (i = 0, j = 0; i < UB_RESOURCES; i++) {
+		if (!strcmp(ub_rnames[i], "dummy"))
+			continue;
+
+		/* accounted by memcg */
+		switch (i) {
+		case UB_KMEMSIZE:
+		case UB_DCACHESIZE:
+		case UB_PHYSPAGES:
+		case UB_SWAPPAGES:
+		case UB_OOMGUARPAGES:
+		case UB_NUMTCPSOCK:
+		case UB_TCPSNDBUF:
+		case UB_TCPRCVBUF:
+		case UB_OTHERSOCKBUF:
+		case UB_DGRAMRCVBUF:
+		case UB_NUMOTHERSOCK:
+			continue;
+		}
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.held", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_HELD);
+		cft->read = ub_cgroup_resource_read;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 1];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.maxheld", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_MAXHELD);
+		cft->read = ub_cgroup_resource_read;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 2];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.barrier", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_BARRIER);
+		cft->read = ub_cgroup_resource_read;
+		cft->write_u64 = ub_cgroup_resource_write;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 3];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_LIMIT);
+		cft->read = ub_cgroup_resource_read;
+		cft->write_u64 = ub_cgroup_resource_write;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 4];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_FAILCNT);
+		cft->read = ub_cgroup_resource_read;
+
+		j++;
+	}
+
+	WARN_ON(cgroup_add_cftypes(&ub_subsys, cgroup_files));
+
+	return 0;
+}
+module_init(ub_cgroup_init);
+
+struct cgroup_subsys ub_subsys = {
+	.name = "beancounter",
+	.subsys_id = ub_subsys_id,
+	.css_alloc = ub_cgroup_css_alloc,
+	.css_online = ub_cgroup_css_online,
+	.css_offline = ub_cgroup_css_offline,
+	.css_free = ub_cgroup_css_free,
+	.attach = ub_cgroup_attach,
+	.fork = ub_cgroup_fork,
+	.base_cftypes = ub_cgroup_files,
+	.use_id = true,
+};
+EXPORT_SYMBOL(ub_subsys);
+
+/*
+ *	Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	/*
+	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
+	 * is possible at a time, so an overflow is impossible.
+	 */
+	ub->ub_parms[resource].held += val;
+
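+	/*
+	 * Intentional fall-through below: UB_HARD charges must stay under the
+	 * barrier, UB_SOFT charges under the limit, and UB_FORCE always
+	 * succeeds.  Falling out of the switch rolls the charge back and
+	 * fails with -ENOMEM.
+	 */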
+	switch (strict & ~UB_SEV_FLAGS) {
+		case UB_HARD:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].barrier)
+				break;
+		case UB_SOFT:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].limit)
+				break;
+		case UB_FORCE:
+			ub_adjust_maxheld(ub, resource);
+			return 0;
+		default:
+			BUG();
+	}
+
+	if (!(strict & UB_TEST)) {
+		if (strict == UB_SOFT && __ratelimit(&ub->ub_ratelimit))
+			printk(KERN_INFO "Fatal resource shortage: %s, UB %s.\n",
+			       ub_rnames[resource], ub->ub_name);
+		ub->ub_parms[resource].failcnt++;
+	}
+	ub->ub_parms[resource].held -= val;
+	return -ENOMEM;
+}
+
+int charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval;
+	unsigned long flags;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		retval = __charge_beancounter_locked(ub, resource, val, strict);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+out:
+	return retval;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held)
+{
+	printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n",
+			val, held, resource, ub->ub_name);
+}
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (ub->ub_parms[resource].held < val) {
+		uncharge_warn(ub, ub_rnames[resource],
+				val, ub->ub_parms[resource].held);
+		val = ub->ub_parms[resource].held;
+	}
+	ub->ub_parms[resource].held -= val;
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		__uncharge_beancounter_locked(ub, resource, val);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+}
+
+EXPORT_SYMBOL(uncharge_beancounter);
+
+/* called with disabled interrupts */
+static int __precharge_beancounter_percpu(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu = ub_percpu(ub, smp_processor_id());
+	int charge, retval;
+
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val))
+		return 0;
+
+	spin_lock(&ub->ub_lock);
+	charge = max((int)val, ub->ub_parms[resource].max_precharge >> 1) -
+		ub_pcpu->precharge[resource];
+	retval = __charge_beancounter_locked(ub, resource,
+			charge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += charge;
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+
+/* called with disabled interrupts */
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval, precharge;
+
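+	/*
+	 * Top up the per-cpu precharge to half of max_precharge while
+	 * charging; if that fails, reset the precharge limit and charge
+	 * just the requested value with the caller's severity.
+	 */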
+	spin_lock(&ub->ub_lock);
+	precharge = max(0, (ub->ub_parms[resource].max_precharge >> 1) -
+			ub_pcpu->precharge[resource]);
+	retval = __charge_beancounter_locked(ub, resource,
+			val + precharge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += precharge;
+	else {
+		init_beancounter_precharge(ub, resource);
+		retval = __charge_beancounter_locked(ub, resource,
+				val, strict);
+	}
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(__charge_beancounter_percpu);
+
+/* called with disabled interrupts */
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val)
+{
+	int uncharge;
+
+	spin_lock(&ub->ub_lock);
+	if (ub->ub_parms[resource].max_precharge !=
+			ub_resource_precharge[resource])
+		init_beancounter_precharge(ub, resource);
+	uncharge = max(0, ub_pcpu->precharge[resource] -
+			(ub->ub_parms[resource].max_precharge >> 1));
+	ub_pcpu->precharge[resource] -= uncharge;
+	smp_wmb();
+	__uncharge_beancounter_locked(ub, resource, val + uncharge);
+	spin_unlock(&ub->ub_lock);
+}
+EXPORT_SYMBOL(__uncharge_beancounter_percpu);
+
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource)
+{
+	long held, precharge;
+
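+	/*
+	 * Usage is reported with per-cpu precharges subtracted; the barrier
+	 * pairs with the smp_wmb() in __uncharge_beancounter_percpu().
+	 */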
+	held = ub->ub_parms[resource].held;
+	smp_rmb();
+	precharge = __ub_percpu_sum(ub, precharge[resource]);
+
+	return max(0l, held - precharge);
+}
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+	int retval;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	local_irq_save(flags);
+	if (ub)
+		retval = __precharge_beancounter_percpu(ub, resource, val);
+	local_irq_restore(flags);
+out:
+	return retval;
+}
+EXPORT_SYMBOL(precharge_beancounter);
+
+/*
+ *	Initialization
+ *
+ *	struct user_beancounter contains
+ *	 - limits and other configuration settings,
+ *	   with a copy stored for accounting purposes,
+ *	 - structural fields: lists, spinlocks and so on.
+ *
+ *	Before these parts are initialized, the structure should be memset
+ *	to 0 or copied from a known clean structure.  That takes care of a lot
+ *	of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub)
+{
+	ub->ub_magic = UB_MAGIC;
+	spin_lock_init(&ub->ub_lock);
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++) {
+		ub->ub_parms[k].limit = UB_MAXVALUE;
+		ub->ub_parms[k].barrier = UB_MAXVALUE;
+	}
+
+	/*
+	 * Unlimited vmguarpages gives immunity against the systemwide
+	 * overcommit policy. It makes sense in some cases, but by default we
+	 * must obey it.
+	 */
+	ub->ub_parms[UB_VMGUARPAGES].barrier = 0;
+
+	/*
+	 * Unlimited oomguarpages makes a container or the host mostly immune
+	 * to the OOM-killer while other containers exist. Still, we cannot
+	 * set it to zero, otherwise a single unconfigured container would be
+	 * the first target for the OOM-killer. 75% of RAM looks like a sane
+	 * default.
+	 */
+	ub->ub_parms[UB_OOMGUARPAGES].barrier = totalram_pages * 3 / 4;
+
+	/* Ratelimit for messages in the kernel log */
+	ub->ub_ratelimit.burst = 4;
+	ub->ub_ratelimit.interval = 300*HZ;
+}
+
+static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu);
+
+void __init ub_init_early(void)
+{
+	struct user_beancounter *ub;
+
+	ub = get_ub0();
+	ub->ub_name = "0";
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub);
+	init_beancounter_precharges_early(ub);
+	ub->ub_percpu = &ub0_percpu;
+
+	memset(&current->task_bc, 0, sizeof(struct task_beancounter));
+	(void)set_exec_ub(ub);
+	current->task_bc.task_ub = get_beancounter(ub);
+	__charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE);
+	init_mm.mm_ub = get_beancounter(ub);
+
+	list_add(&ub->ub_list, &ub_list_head);
+	ub_count++;
+}
+
+static int proc_resource_precharge(ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(lock);
+	struct user_beancounter *ub;
+	int err;
+
+	mutex_lock(&lock);
+
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		spin_lock_irq(&ub->ub_lock);
+		init_beancounter_precharges(ub);
+		spin_unlock_irq(&ub->ub_lock);
+	}
+	rcu_read_unlock();
+
+out:
+	mutex_unlock(&lock);
+	return err;
+}
+
+static ctl_table ub_sysctl_table[] = {
+	{
+		.procname	= "resource_precharge",
+		.data		= &ub_resource_precharge,
+		.extra1		= &resource_precharge_min,
+		.extra2		= &resource_precharge_max,
+		.maxlen		= sizeof(ub_resource_precharge),
+		.mode		= 0644,
+		.proc_handler	= &proc_resource_precharge,
+	},
+	{
+		.procname	= "overcommit_memory",
+		.data		= &ub_overcommit_memory,
+		.maxlen		= sizeof(ub_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	{
+		.procname	= "dirty_ratio",
+		.data		= &ub_dirty_ratio,
+		.maxlen		= sizeof(ub_dirty_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "dirty_background_ratio",
+		.data		= &ub_dirty_background_ratio,
+		.maxlen		= sizeof(ub_dirty_background_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif /* CONFIG_BC_IO_ACCOUNTING */
+	{ }
+};
+
+static ctl_table ub_sysctl_root[] = {
+       {
+	       .procname	= "ubc",
+	       .mode		= 0555,
+	       .child		= ub_sysctl_table,
+       },
+       { }
+};
+
+void __init ub_init_late(void)
+{
+	ub_set_mem_css(&ub0, task_subsys_state_check(&init_task,
+				mem_cgroup_subsys_id, true));
+	ub_set_blkio_css(&ub0, task_subsys_state_check(&init_task,
+				blkio_subsys_id, true));
+
+	register_sysctl_table(ub_sysctl_root);
+}
+
+int __init ub_init_cgroup(void)
+{
+	struct cgroup_sb_opts blkio_opts = {
+		.subsys_mask    = (1ul << blkio_subsys_id),
+	};
+	struct cgroup_sb_opts mem_opts = {
+		.subsys_mask    = (1ul << mem_cgroup_subsys_id),
+	};
+	struct cgroup_sb_opts ub_opts = {
+		.subsys_mask	= (1ul << ub_subsys_id),
+	};
+
+	blkio_cgroup_mnt = cgroup_kernel_mount(&blkio_opts);
+	if (IS_ERR(blkio_cgroup_mnt))
+		panic("Failed to mount blkio cgroup: %ld\n",
+		      PTR_ERR(blkio_cgroup_mnt));
+
+	mem_cgroup_mnt = cgroup_kernel_mount(&mem_opts);
+	if (IS_ERR(mem_cgroup_mnt))
+		panic("Failed to mount memory cgroup: %ld\n",
+		      PTR_ERR(mem_cgroup_mnt));
+
+	ub_cgroup_mnt = cgroup_kernel_mount(&ub_opts);
+	if (IS_ERR(ub_cgroup_mnt))
+		panic("Failed to mount beancounter cgroup: %ld\n",
+		      PTR_ERR(ub_cgroup_mnt));
+
+	return 0;
+}
+late_initcall(ub_init_cgroup);
--- /dev/null
+++ b/kernel/bc/io_acct.c
@@ -0,0 +1,335 @@
+/*
+ *  kernel/bc/io_acct.c
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mempool.h>
+#include <linux/proc_fs.h>
+#include <linux/virtinfo.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/proc.h>
+#include <bc/vmpages.h>
+
+/*
+ * Writeback thresholds as a percentage of the beancounter's physpages limit:
+ * ub_dirty_ratio bounds dirty memory, ub_dirty_background_ratio starts
+ * background writeback.
+ */
+int ub_dirty_ratio = 50;
+int ub_dirty_background_ratio = 30;
+
+/* called with mapping->tree_lock held for writing */
+void ub_io_account_dirty(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, dirty_pages);
+}
+EXPORT_SYMBOL_GPL(ub_io_account_dirty);
+
+void ub_io_account_clean(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+	size_t bytes = PAGE_SIZE;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_complete);
+
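+	/*
+	 * Temporarily switch exec_ub to the beancounter that dirtied this
+	 * mapping so that the IO notifier accounts the completed write
+	 * against it, then restore the original exec_ub.
+	 */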
+	ub = set_exec_ub(ub);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+	ub = set_exec_ub(ub);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+void ub_io_account_cancel(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_canceled);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+void ub_io_writeback_inc(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, writeback_pages);
+}
+
+void ub_io_writeback_dec(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, writeback_pages);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) ||
+	     !mapping_cap_account_dirty(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+static bool __ub_over_bground_thresh(struct user_beancounter *ub)
+{
+	unsigned long background_thresh, dirty_thresh;
+	unsigned long ub_dirty, ub_writeback;
+
+	ub_dirty_limits(&background_thresh, &dirty_thresh, ub);
+
+	ub_dirty = ub_stat_get(ub, dirty_pages);
+	ub_writeback = ub_stat_get(ub, writeback_pages);
+
+	if (ub_dirty + ub_writeback >= background_thresh)
+		return true;
+
+	return false;
+}
+
+bool ub_over_bground_thresh(void)
+{
+	struct user_beancounter *ub;
+	bool ret = false;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (ub == get_ub0())
+			continue;
+		if (__ub_over_bground_thresh(ub)) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+int ub_dirty_limits(unsigned long *pbackground,
+		    long *pdirty, struct user_beancounter *ub)
+{
+	int dirty_ratio;
+	unsigned long available_memory;
+
+	*pdirty = *pbackground = LONG_MAX;
+
+	dirty_ratio = ub_dirty_ratio;
+	if (!dirty_ratio)
+		return 0;
+
+	available_memory = ub_total_pages(ub, false);
+	if (available_memory == ULONG_MAX || available_memory == 0)
+		return 0;
+
+	*pdirty = (dirty_ratio * available_memory) / 100;
+
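+	/* keep the background threshold strictly below the dirty threshold */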
+	dirty_ratio = ub_dirty_background_ratio;
+	*pbackground = (dirty_ratio * available_memory) / 100;
+	if (!dirty_ratio || *pbackground >= *pdirty)
+		*pbackground = *pdirty / 2;
+
+	return 1;
+}
+
+bool ub_should_skip_writeback(struct user_beancounter *ub, struct inode *inode)
+{
+	struct user_beancounter *dirtied_ub;
+	bool ret;
+
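+	/*
+	 * When writing back on behalf of a particular beancounter, skip
+	 * inodes dirtied by someone else; for global writeback, skip inodes
+	 * whose dirtier is still below its background threshold.
+	 */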
+	rcu_read_lock();
+	dirtied_ub = rcu_dereference(inode->i_mapping->dirtied_ub);
+	if (ub)
+		ret = (ub != dirtied_ub);
+	else
+		ret = (dirtied_ub && !__ub_over_bground_thresh(dirtied_ub));
+	rcu_read_unlock();
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+#define in_flight(var)	(var > var##_done ? var - var##_done : 0)
+
+static int bc_ioacct_show(struct seq_file *f, void *v)
+{
+	int i;
+	unsigned long long read, write, cancel;
+	unsigned long sync, sync_done;
+	unsigned long fsync, fsync_done;
+	unsigned long fdsync, fdsync_done;
+	unsigned long frsync, frsync_done;
+	struct user_beancounter *ub;
+	unsigned long dirty_pages;
+	unsigned long long dirtied;
+	unsigned long fuse_requests, fuse_bytes;
+
+	ub = seq_beancounter(f);
+
+	dirty_pages = __ub_stat_get(ub, dirty_pages);
+
+	read = write = cancel = 0;
+	sync = sync_done = fsync = fsync_done =
+		fdsync = fdsync_done = frsync = frsync_done = 0;
+	fuse_requests = fuse_bytes = 0;
+	for_each_online_cpu(i) {
+		struct ub_percpu_struct *ub_percpu;
+		ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
+
+		read += ub_percpu->sync_read_bytes;
+		write += ub_percpu->sync_write_bytes;
+
+		dirty_pages += ub_percpu->dirty_pages;
+		write += (u64)ub_percpu->async_write_complete << PAGE_SHIFT;
+		cancel += (u64)ub_percpu->async_write_canceled << PAGE_SHIFT;
+
+		sync += ub_percpu->sync;
+		fsync += ub_percpu->fsync;
+		fdsync += ub_percpu->fdsync;
+		frsync += ub_percpu->frsync;
+		sync_done += ub_percpu->sync_done;
+		fsync_done += ub_percpu->fsync_done;
+		fdsync_done += ub_percpu->fdsync_done;
+		frsync_done += ub_percpu->frsync_done;
+
+		fuse_requests += ub_percpu->fuse_requests;
+		fuse_bytes += ub_percpu->fuse_bytes;
+	}
+
+	if ((long)dirty_pages < 0)
+		dirty_pages = 0;
+
+	dirtied = write + cancel;
+	dirtied += (u64)dirty_pages << PAGE_SHIFT;
+
+	seq_printf(f, bc_proc_llu_fmt, "read", read);
+	seq_printf(f, bc_proc_llu_fmt, "write", write);
+	seq_printf(f, bc_proc_llu_fmt, "dirty", dirtied);
+	seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
+	seq_printf(f, bc_proc_llu_fmt, "missed", 0ull);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync));
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
+
+	seq_printf(f, bc_proc_lu_lfmt, "io_pbs", dirty_pages);
+
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_requests", fuse_requests);
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_bytes", fuse_bytes);
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioacct_entry = {
+	.name = "ioacct",
+	.u.show = bc_ioacct_show,
+};
+
+static int bc_ioacct_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	struct user_beancounter *ub;
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long *vm_events;
+	unsigned long long bin, bout;
+	int i;
+
+	if (event != VIRTINFO_VMSTAT)
+		return old_ret;
+
+	ub = get_exec_ub();
+	if (ub == get_ub0())
+		return old_ret;
+
+	/* TODO: should bytes_dirty_missed be accounted here as well? */
+	bout = 0;
+	bin = 0;
+	for_each_online_cpu(i) {
+		ub_pcpu = per_cpu_ptr(ub->ub_percpu, i);
+		bout += (u64)ub_pcpu->async_write_complete << PAGE_SHIFT;
+		bout += ub_pcpu->sync_write_bytes;
+		bin += ub_pcpu->sync_read_bytes;
+	}
+
+	/* convert to Kbytes */
+	bout >>= 10;
+	bin >>= 10;
+
+	vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS;
+	vm_events[PGPGOUT] = (unsigned long)bout;
+	vm_events[PGPGIN] = (unsigned long)bin;
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block bc_ioacct_nb = {
+	.notifier_call = bc_ioacct_notify,
+};
+
+static int __init bc_ioacct_init(void)
+{
+	bc_register_proc_entry(&bc_ioacct_entry);
+
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb);
+	return 0;
+}
+
+late_initcall(bc_ioacct_init);
+#endif
--- /dev/null
+++ b/kernel/bc/io_prio.c
@@ -0,0 +1,173 @@
+/*
+ *  kernel/bc/io_prio.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+static unsigned int ioprio_weight[UB_IOPRIO_MAX] = {
+	320, 365, 410, 460, 500, 550, 600, 640,
+};
+
+extern unsigned int blkcg_get_weight(struct cgroup *cgrp);
+extern int blkcg_set_weight(struct cgroup *cgrp, unsigned int weight);
+extern void blkcg_show_ub_iostat(struct cgroup *cgrp, struct seq_file *sf);
+
+int ub_set_ioprio(int id, int ioprio)
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	ret = -ERANGE;
+	if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX)
+		goto out;
+
+	ret = -ESRCH;
+	ub = get_beancounter_byuid(id, 0);
+	if (!ub)
+		goto out;
+
+	css = ub_get_blkio_css(ub);
+	ret = blkcg_set_weight(css->cgroup, ioprio_weight[ioprio]);
+	css_put(css);
+	put_beancounter(ub);
+out:
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static int bc_iostat(struct seq_file *f, struct user_beancounter *bc)
+{
+	struct cgroup_subsys_state *css;
+
+	seq_printf(f, "flush %s . 0 0 0 0 0 %ld %ld 0 0\n",
+			bc->ub_name,
+			ub_stat_get_exact(bc, wb_requests),
+			ub_stat_get_exact(bc, wb_sectors));
+
+	seq_printf(f, "fuse %s . 0 0 0 0 0 %lu %lu 0 0\n",
+			bc->ub_name,
+			__ub_percpu_sum(bc, fuse_requests),
+			__ub_percpu_sum(bc, fuse_bytes) >> 9);
+
+	css = ub_get_blkio_css(bc);
+	blkcg_show_ub_iostat(css->cgroup, f);
+	css_put(css);
+	return 0;
+}
+
+static int bc_iostat_single(struct seq_file *f, void *v)
+{
+	return bc_iostat(f, seq_beancounter(f));
+}
+
+static struct bc_proc_entry bc_iostat_entry = {
+	.name = "iostat",
+	.u.show = bc_iostat_single,
+};
+
+static void *bc_iostat_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	unsigned long pos = *ppos;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!pos--)
+			return ub;
+	}
+	return NULL;
+}
+
+static void *bc_iostat_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub = v;
+
+	list_for_each_entry_continue_rcu(ub, &ub_list_head, ub_list) {
+		(*ppos)++;
+		return ub;
+	}
+	return NULL;
+}
+
+static int bc_iostat_show(struct seq_file *f, void *v)
+{
+	f->private = v;
+	return bc_iostat(f, v);
+}
+
+static void bc_iostat_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+
+static struct seq_operations iostat_seq_ops = {
+	.start = bc_iostat_start,
+	.next  = bc_iostat_next,
+	.stop  = bc_iostat_stop,
+	.show  = bc_iostat_show,
+};
+
+static int bc_iostat_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &iostat_seq_ops);
+}
+
+static struct file_operations bc_iostat_ops = {
+	.open		= bc_iostat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct bc_proc_entry bc_root_iostat_entry = {
+	.name = "iostat",
+	.u.fops = &bc_iostat_ops,
+};
+
+static int bc_ioprio_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *bc;
+	struct cgroup_subsys_state *css;
+	unsigned int weight;
+	int ioprio;
+
+	bc = seq_beancounter(f);
+
+	css = ub_get_blkio_css(bc);
+	weight = blkcg_get_weight(css->cgroup);
+	css_put(css);
+
+	ioprio = UB_IOPRIO_MAX - 1;
+	while (ioprio && weight < ioprio_weight[ioprio])
+		ioprio--;
+
+	seq_printf(f, "prio: %d\n", ioprio);
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioprio_entry = {
+	.name = "ioprio",
+	.u.show = bc_ioprio_show,
+};
+
+static int __init bc_iostat_init(void)
+{
+	bc_register_proc_entry(&bc_ioprio_entry);
+	bc_register_proc_entry(&bc_iostat_entry);
+	bc_register_proc_root_entry(&bc_root_iostat_entry);
+	return 0;
+}
+late_initcall(bc_iostat_init);
+
+#endif /* CONFIG_PROC_FS */
--- /dev/null
+++ b/kernel/bc/misc.c
@@ -0,0 +1,168 @@
+/*
+ *  kernel/bc/misc.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+/*
+ * Task stuff
+ */
+
+int ub_task_charge(struct user_beancounter *ub)
+{
+	return charge_beancounter_fast(ub, UB_NUMPROC, 1, UB_HARD);
+}
+
+void ub_task_uncharge(struct user_beancounter *ub)
+{
+	uncharge_beancounter_fast(ub, UB_NUMPROC, 1);
+}
+
+void ub_task_get(struct user_beancounter *ub, struct task_struct *task)
+{
+	struct task_beancounter *new_bc = &task->task_bc;
+
+	new_bc->task_ub = get_beancounter(ub);
+	new_bc->exec_ub = get_beancounter(ub);
+}
+
+void ub_task_put(struct task_struct *task)
+{
+	struct task_beancounter *task_bc;
+
+	task_bc = &task->task_bc;
+
+	put_beancounter(task_bc->exec_ub);
+	put_beancounter(task_bc->task_ub);
+
+	task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
+	task_bc->task_ub = (struct user_beancounter *)0xdead100c;
+}
+
+int ub_file_charge(struct file *f)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	int err;
+
+	err = charge_beancounter_fast(ub, UB_NUMFILE, 1, UB_HARD);
+	if (unlikely(err))
+		goto no_file;
+
+	f->f_ub = get_beancounter(ub);
+
+	return 0;
+
+no_file:
+	return err;
+}
+
+void ub_file_uncharge(struct file *f)
+{
+	struct user_beancounter *ub = f->f_ub;
+
+	uncharge_beancounter_fast(ub, UB_NUMFILE, 1);
+	put_beancounter(ub);
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	ub = fl->fl_ub;
+	if (ub == NULL)
+		return 0;
+
+	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
+	if (!err)
+		fl->fl_charged = 1;
+	return err;
+}
+
+void ub_flock_uncharge(struct file_lock *fl)
+{
+	struct user_beancounter *ub;
+
+	ub = fl->fl_ub;
+	if (ub == NULL || !fl->fl_charged)
+		return;
+
+	uncharge_beancounter(ub, UB_NUMFLOCK, 1);
+	fl->fl_charged = 0;
+}
+
+/*
+ * Signal handling
+ */
+
+int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub,
+			gfp_t gfp_mask)
+{
+	if (charge_beancounter_fast(ub, UB_NUMSIGINFO, 1, UB_HARD))
+		goto out_num;
+
+	sq->sig_ub = get_beancounter(ub);
+	return 0;
+
+out_num:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(ub_siginfo_charge);
+
+void ub_siginfo_uncharge(struct sigqueue *sq)
+{
+	struct user_beancounter *ub;
+
+	ub = sq->sig_ub;
+	sq->sig_ub = NULL;
+	uncharge_beancounter_fast(ub, UB_NUMSIGINFO, 1);
+	put_beancounter(ub);
+}
+
+/*
+ * PTYs
+ */
+
+int ub_pty_charge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	int retval;
+
+	retval = 0;
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			!test_bit(TTY_CHARGED, &tty->flags)) {
+		retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
+		if (!retval) {
+			set_bit(TTY_CHARGED, &tty->flags);
+			tty->ub = get_beancounter(ub);
+		}
+	}
+	return retval;
+}
+
+void ub_pty_uncharge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub;
+
+	ub = tty->ub;
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			test_bit(TTY_CHARGED, &tty->flags)) {
+		uncharge_beancounter(ub, UB_NUMPTY, 1);
+		clear_bit(TTY_CHARGED, &tty->flags);
+		put_beancounter(ub);
+	}
+}
--- /dev/null
+++ b/kernel/bc/proc.c
@@ -0,0 +1,760 @@
+/*
+ *  kernel/bc/proc.c
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ve_proto.h>
+#include <linux/virtinfo.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
+#include <linux/lglock.h>
+#include <linux/ve.h>
+#include <linux/memcontrol.h>
+
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+/* Generic output formats */
+#if BITS_PER_LONG == 32
+const char *bc_proc_lu_fmt = "\t%-20s %10lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n";
+#else
+const char *bc_proc_lu_fmt = "\t%-20s %21lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n";
+#endif
+
+#if BITS_PER_LONG == 32
+static const char *head_fmt = "%10s  %-12s %10s %10s %10s %10s %10s\n";
+static const char *res_fmt = "%10s%c %-12s %10lu %10lu %10lu %10lu %10lu\n";
+#else
+static const char *head_fmt = "%10s  %-12s %20s %20s %20s %20s %20s\n";
+static const char *res_fmt = "%10s%c %-12s %20lu %20lu %20lu %20lu %20lu\n";
+#endif
+
+static void ub_show_res(struct seq_file *f, struct user_beancounter *ub,
+		int r, int precharge, int show_uid)
+{
+	struct ubparm *p;
+	unsigned long held;
+
+	p = &ub->ub_parms[r];
+	held = p->held;
+	held = (held > precharge) ? (held - precharge) : 0;
+
+	seq_printf(f, res_fmt,
+			show_uid && r == 0 ? ub->ub_name : "",
+			show_uid && r == 0 ? ':' : ' ',
+			ub_rnames[r],
+			held,
+			p->maxheld,
+			p->barrier,
+			p->limit,
+			p->failcnt);
+}
+
+static void __show_resources(struct seq_file *f, struct user_beancounter *ub,
+		int show_uid)
+{
+	int i, precharge[UB_RESOURCES];
+
+	ub_sync_memcg(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+		if (strcmp(ub_rnames[i], "dummy") != 0)
+			ub_show_res(f, ub, i, precharge[i], show_uid);
+
+	for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++)
+		ub_show_res(f, ub, i, precharge[i], show_uid);
+}
+
+static int bc_resources_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, seq_beancounter(f), 0);
+	return 0;
+}
+
+static struct bc_proc_entry bc_resources_entry = {
+	.name = "resources",
+	.u.show = bc_resources_show,
+};
+
+static int bc_precharge_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+	int i, cpus = num_possible_cpus();
+	int precharge[UB_RESOURCES];
+
+	seq_printf(f, "%-12s %16s %10s %10s\n",
+			"resource", "real_held", "precharge", "max_precharge");
+
+	ub = seq_beancounter(f);
+	ub_precharge_snapshot(ub, precharge);
+	for (i = 0; i < UB_RESOURCES; i++) {
+		if (!strcmp(ub_rnames[i], "dummy"))
+			continue;
+		seq_printf(f, "%-12s %16lu %10d %10d\n", ub_rnames[i],
+				ub->ub_parms[i].held,
+				precharge[i],
+				ub->ub_parms[i].max_precharge * cpus);
+	}
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_precharge_entry = {
+	.name = "precharge",
+	.u.show = bc_precharge_show,
+};
+
+static int bc_proc_meminfo_show(struct seq_file *f, void *v)
+{
+	return meminfo_proc_show_ub(f, NULL,
+			seq_beancounter(f), VE_MEMINFO_DEFAULT);
+}
+
+static struct bc_proc_entry bc_meminfo_entry = {
+	.name = "meminfo",
+	.u.show = bc_proc_meminfo_show,
+};
+
+extern void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+				    unsigned long *pages);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+static int bc_proc_nodeinfo_show(struct seq_file *f, void *v)
+{
+	int nid;
+	struct cgroup_subsys_state *css;
+	unsigned long pages[NR_LRU_LISTS];
+
+	css = ub_get_mem_css(seq_beancounter(f));
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		memset(pages, 0, sizeof(pages));
+		mem_cgroup_get_nr_pages(mem_cgroup_from_cont(css->cgroup),
+					nid, pages);
+		seq_printf(f,
+			"Node %d Active:         %8lu kB\n"
+			"Node %d Inactive:       %8lu kB\n"
+			"Node %d Active(anon):   %8lu kB\n"
+			"Node %d Inactive(anon): %8lu kB\n"
+			"Node %d Active(file):   %8lu kB\n"
+			"Node %d Inactive(file): %8lu kB\n"
+			"Node %d Unevictable:    %8lu kB\n",
+			nid, K(pages[LRU_ACTIVE_ANON] +
+			       pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_ANON] +
+			       pages[LRU_INACTIVE_FILE]),
+			nid, K(pages[LRU_ACTIVE_ANON]),
+			nid, K(pages[LRU_INACTIVE_ANON]),
+			nid, K(pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_FILE]),
+			nid, K(pages[LRU_UNEVICTABLE]));
+	}
+	css_put(css);
+	return 0;
+}
+#undef K
+
+static struct bc_proc_entry bc_nodeinfo_entry = {
+	.name = "nodeinfo",
+	.u.show = bc_proc_nodeinfo_show,
+};
+
+static int ub_show(struct seq_file *f, void *v)
+{
+	int i, precharge[UB_RESOURCES];
+	struct user_beancounter *ub = v;
+
+	ub_sync_memcg(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+		ub_show_res(f, ub, i, precharge[i], 1);
+	return 0;
+}
+
+static int res_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, (struct user_beancounter *)v, 1);
+	return 0;
+}
+
+static int ub_accessible(struct user_beancounter *exec,
+		struct user_beancounter *target)
+{
+	return (exec == get_ub0() || exec == target);
+}
+
+static void ub_show_header(struct seq_file *f)
+{
+	seq_printf(f, "Version: 2.5\n");
+	seq_printf(f, head_fmt, "uid", "resource",
+			"held", "maxheld", "barrier", "limit", "failcnt");
+}
+
+static void *ub_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub, *ret = NULL;
+	struct user_beancounter *exec_ub;
+	unsigned long pos;
+
+	pos = *ppos;
+	if (pos == 0)
+		ub_show_header(f);
+
+	exec_ub = get_exec_ub();
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		if (!get_beancounter_rcu(ub))
+			continue;
+		if (pos-- == 0) {
+			ret = ub;
+			break;
+		}
+		put_beancounter(ub);
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static void *ub_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub, *ret = NULL;
+	struct user_beancounter *exec_ub;
+
+	exec_ub = get_exec_ub();
+	ub = (struct user_beancounter *)v;
+	rcu_read_lock();
+	put_beancounter(ub);
+	list_for_each_entry_continue_rcu(ub, &ub_list_head, ub_list) {
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		if (!get_beancounter_rcu(ub))
+			continue;
+		(*ppos)++;
+		ret = ub;
+		break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static void ub_stop(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+
+	ub = (struct user_beancounter *)v;
+	put_beancounter(ub);
+}
+
+static struct seq_operations ub_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = ub_show,
+};
+
+static int ub_open(struct inode *inode, struct file *filp)
+{
+	if (!(ve_capable(CAP_DAC_OVERRIDE) && ve_capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &ub_seq_ops);
+}
+
+static struct file_operations ub_file_operations = {
+	.open		= ub_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct seq_operations res_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = res_show,
+};
+
+static int res_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &res_seq_ops);
+}
+
+static struct file_operations resources_operations = {
+	.open		= res_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct bc_proc_entry bc_all_resources_entry = {
+	.name = "resources",
+	.u.fops = &resources_operations,
+};
+
+/*
+ * Generic showing stuff
+ */
+
+static int cookies, num_entries;
+static struct bc_proc_entry *bc_entries __read_mostly;
+static struct bc_proc_entry *bc_root_entries __read_mostly;
+static DEFINE_SPINLOCK(bc_entries_lock);
+static struct proc_dir_entry *bc_proc_root;
+
+void bc_register_proc_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_entries;
+	bc_entries = e;
+	num_entries++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_entry);
+
+void bc_register_proc_root_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_root_entries;
+	bc_root_entries = e;
+	bc_proc_root->nlink++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_root_entry);
+
+/*
+ * small helpers
+ */
+
+static inline unsigned long bc_make_ino(struct user_beancounter *ub)
+{
+	return 0xbc000000 | (css_id(&ub->css) + 1);
+}
+
+static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de)
+{
+	return 0xbe000000 + de->cookie;
+}
+
+static int bc_d_delete(const struct dentry *d)
+{
+	return 1;
+}
+
+static void bc_d_release(struct dentry *d)
+{
+	put_beancounter((struct user_beancounter *)d->d_fsdata);
+}
+
+static struct inode_operations bc_entry_iops;
+static struct file_operations bc_entry_fops;
+static struct dentry_operations bc_dentry_ops = {
+	.d_delete = bc_d_delete,
+	.d_release = bc_d_release,
+};
+
+/*
+ * common directory operations' helpers
+ */
+
+static int bc_readdir(struct file *file, filldir_t filler, void *data,
+		struct user_beancounter *parent)
+{
+	int err = 0;
+	loff_t pos, filled;
+	struct user_beancounter *ub, *prev;
+	struct bc_proc_entry *pde;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EPERM;
+
+	pos = file->f_pos;
+	if (pos == 0) {
+		err = (*filler)(data, ".", 1, pos,
+				file->f_dentry->d_inode->i_ino, DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (pos == 1) {
+		err = (*filler)(data, "..", 2, pos,
+				parent_ino(file->f_dentry), DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	filled = 2;
+	for (pde = (parent == NULL ? bc_root_entries : bc_entries);
+			pde != NULL; pde = pde->next) {
+		if (filled++ < pos)
+			continue;
+
+		err = (*filler)(data, pde->name, strlen(pde->name), pos,
+				bc_make_file_ino(pde), DT_REG);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (parent)
+		goto out;
+
+	rcu_read_lock();
+	prev = NULL;
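+	/*
+	 * Walk the beancounter list by hand: the RCU lock is dropped around
+	 * each filler() call, so a reference is kept on the current entry to
+	 * allow the walk to continue from it afterwards.
+	 */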
+	ub = list_entry(&ub_list_head, struct user_beancounter, ub_list);
+	while (1) {
+		ub = list_entry(rcu_dereference(ub->ub_list.next),
+				struct user_beancounter, ub_list);
+		if (&ub->ub_list == &ub_list_head)
+			break;
+
+		if (!get_beancounter_rcu(ub))
+			continue;
+
+		if (filled++ < pos) {
+			put_beancounter(ub);
+			continue;
+		}
+
+		rcu_read_unlock();
+		put_beancounter(prev);
+
+		err = (*filler)(data, ub->ub_name, strlen(ub->ub_name),
+				pos, bc_make_ino(ub), DT_DIR);
+		if (err < 0) {
+			err = 0;
+			put_beancounter(ub);
+			goto out;
+		}
+
+		rcu_read_lock();
+		prev = ub;
+		pos++;
+	}
+	rcu_read_unlock();
+	put_beancounter(prev);
+out:
+	file->f_pos = pos;
+	return err;
+}
+
+static int bc_looktest(struct inode *ino, void *data)
+{
+	return ino->i_op == &bc_entry_iops && ino->i_private == data;
+}
+
+static int bc_lookset(struct inode *ino, void *data)
+{
+	struct user_beancounter *ub;
+
+	ub = (struct user_beancounter *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_ino(ub);
+	ino->i_fop = &bc_entry_fops;
+	ino->i_op = &bc_entry_iops;
+	ino->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+	/* subbeancounters are not included, but who cares? */
+	ino->__i_nlink = num_entries + 2;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *ino;
+
+	ino = iget5_locked(dir->i_sb, css_id(&ub->css), bc_looktest, bc_lookset, ub);
+	if (ino == NULL)
+		goto out_put;
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	d_set_d_op(dentry, &bc_dentry_ops);
+	dentry->d_fsdata = ub;
+	d_add(dentry, ino);
+	return NULL;
+
+out_put:
+	put_beancounter(ub);
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * files (bc_proc_entry) manipulations
+ */
+
+static struct dentry *bc_lookup_file(struct inode *dir,
+		struct dentry *dentry, struct bc_proc_entry *root,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *))
+{
+	struct bc_proc_entry *pde;
+	struct inode *ino;
+
+	for (pde = root; pde != NULL; pde = pde->next)
+		if (strcmp(pde->name, dentry->d_name.name) == 0)
+			break;
+
+	if (pde == NULL)
+		return ERR_PTR(-ESRCH);
+
+	ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde);
+	if (ino == NULL)
+		return ERR_PTR(-ENOENT);
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	d_set_d_op(dentry, &bc_dentry_ops);
+	d_add(dentry, ino);
+	return NULL;
+}
+
+static int bc_file_open(struct inode *ino, struct file *filp)
+{
+	struct bc_proc_entry *de;
+	struct user_beancounter *ub;
+
+	de = (struct bc_proc_entry *)ino->i_private;
+	ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata;
+	BUG_ON(ub->ub_magic != UB_MAGIC);
+
+	/*
+	 * ub can't disappear: we hold d_parent, and it holds the beancounter
+	 */
+	return single_open(filp, de->u.show, ub);
+}
+
+static struct file_operations bc_file_ops = {
+	.open		= bc_file_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int bc_looktest_entry(struct inode *ino, void *data)
+{
+	return ino->i_fop == &bc_file_ops && ino->i_private == data;
+}
+
+static int bc_lookset_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = &bc_file_ops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->__i_nlink = 1;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_entries,
+			bc_looktest_entry, bc_lookset_entry);
+}
+
+static int bc_looktest_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	return ino->i_fop == de->u.fops && ino->i_private == data;
+}
+
+static int bc_lookset_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = de->u.fops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->__i_nlink = 1;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_root_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_root_entries,
+			bc_looktest_root_entry, bc_lookset_root_entry);
+}
+
+/*
+ * /proc/bc/.../<id> directory operations
+ */
+
+static int bc_entry_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data,
+			(struct user_beancounter *)file->f_dentry->d_fsdata);
+}
+
+static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	return ERR_PTR(-ENOENT);
+}
+
+static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct user_beancounter *ub;
+
+	generic_fillattr(dentry->d_inode, stat);
+	ub = (struct user_beancounter *)dentry->d_fsdata;
+	stat->nlink = 2;
+	return 0;
+}
+
+static struct file_operations bc_entry_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_entry_readdir,
+};
+
+static struct inode_operations bc_entry_iops = {
+	.lookup = bc_entry_lookup,
+	.getattr = bc_entry_getattr,
+};
+
+/*
+ * /proc/bc directory operations
+ */
+
+static int bc_root_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data, NULL);
+}
+
+static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct user_beancounter *ub;
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_root_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	ub = get_beancounter_by_name(dentry->d_name.name, 0);
+	if (IS_ERR_OR_NULL(ub))
+		return ub ? ERR_CAST(ub) : ERR_PTR(-ENOENT);
+
+	return bc_lookup(ub, dir, dentry);
+}
+
+static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
+	struct kstat *stat)
+{
+	generic_fillattr(dentry->d_inode, stat);
+	stat->nlink = ub_count + 2;
+	return 0;
+}
+
+static struct file_operations bc_root_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_root_readdir,
+};
+
+static struct inode_operations bc_root_iops = {
+	.lookup = bc_root_lookup,
+	.getattr = bc_root_getattr,
+};
+
+static int ub_vswap_show(struct seq_file *f, void *unused)
+{
+	seq_puts(f, "Version: 1.0\n");
+	return 0;
+}
+
+static int ub_vswap_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ub_vswap_show, NULL);
+}
+
+static struct file_operations ub_vswap_fops = {
+	.open		= ub_vswap_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init ub_init_proc(void)
+{
+	struct proc_dir_entry *entry;
+
+	bc_proc_root = proc_mkdir_mode("bc", 0, NULL);
+	if (bc_proc_root == NULL)
+		panic("Can't create /proc/bc entry");
+
+	bc_proc_root->proc_fops = &bc_root_fops;
+	bc_proc_root->proc_iops = &bc_root_iops;
+
+	bc_register_proc_entry(&bc_resources_entry);
+	bc_register_proc_entry(&bc_precharge_entry);
+	bc_register_proc_root_entry(&bc_all_resources_entry);
+	bc_register_proc_entry(&bc_meminfo_entry);
+	bc_register_proc_entry(&bc_nodeinfo_entry);
+
+	entry = proc_create("user_beancounters",
+			S_IRUSR|S_ISVTX, NULL, &ub_file_operations);
+	proc_create("vswap", S_IRUSR, proc_vz_dir, &ub_vswap_fops);
+	return 0;
+}
+
+core_initcall(ub_init_proc);
--- /dev/null
+++ b/kernel/bc/statd.c
@@ -0,0 +1,527 @@
+/*
+ *  kernel/bc/statd.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+
+#include <bc/beancounter.h>
+#include <uapi/linux/bc/statd.h>
+
+static DEFINE_SPINLOCK(ubs_notify_lock);
+static LIST_HEAD(ubs_notify_list);
+static long ubs_min_interval;
+static ubstattime_t ubs_start_time, ubs_end_time;
+static struct timer_list ubs_timer;
+
+struct ub_stat_notify {
+	struct list_head	list;
+	struct task_struct	*task;
+	int			signum;
+};
+
+static int ubstat_get_list(void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub, *ubp;
+	long *page, *ptr, *end;
+	int len;
+
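+	/*
+	 * Collect legacy ids a page at a time; the RCU lock is dropped while
+	 * copying to userspace, with a reference held on the current
+	 * beancounter so the walk can continue from it.
+	 */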
+	page = (long *)__get_free_page(GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	retval = 0;
+	ubp = NULL;
+	ptr = page;
+	end = page + PAGE_SIZE / sizeof(*ptr);
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		uid_t uid = ub_legacy_id(ub);
+
+		if (uid == -1)
+			continue;
+
+		*ptr++ = uid;
+		if (ptr != end)
+			continue;
+
+		if (!get_beancounter_rcu(ub)) {
+			ptr--;
+			continue;
+		}
+		rcu_read_unlock();
+
+		put_beancounter(ubp);
+		ubp = ub;
+
+		len = min_t(long, (ptr - page) * sizeof(*ptr), size);
+		if (copy_to_user(buf, page, len)) {
+			retval = -EFAULT;
+			goto out_put;
+		}
+		retval += len;
+		if (len < PAGE_SIZE)
+			goto out_put;
+		buf += len;
+		size -= len;
+
+		ptr = page;
+		end = page + PAGE_SIZE / sizeof(*ptr);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	size = min_t(long, (ptr - page) * sizeof(*ptr), size);
+	if (size > 0 && copy_to_user(buf, page, size)) {
+		retval = -EFAULT;
+		goto out_put;
+	}
+	retval += size;
+
+out_put:
+	put_beancounter(ubp);
+	free_page((unsigned long)page);
+	return retval;
+}
+
+static int ubstat_gettime(void __user *buf, long size)
+{
+	ubgettime_t data;
+	int retval;
+
+	spin_lock(&ubs_notify_lock);
+	data.start_time = ubs_start_time;
+	data.end_time = ubs_end_time;
+	data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
+	spin_unlock(&ubs_notify_lock);
+
+	retval = min_t(long, sizeof(data), size);
+	if (copy_to_user(buf, &data, retval))
+		retval = -EFAULT;
+	return retval;
+}
+
+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
+{
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[1];
+	} *data;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+
+	data->param[0].maxheld = ub->ub_store[res].maxheld;
+	data->param[0].failcnt = ub->ub_store[res].failcnt;
+
+	return sizeof(*data);
+}
+
+static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
+		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
+		wrote += sizeof(data->param[resource]);
+	}
+
+	return wrote;
+}
+
+static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
+		int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparmf_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		ubstatparmf_t *p = &data->param[resource];
+		struct ubparm *s = &ub->ub_store[resource];
+
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+
+		p->barrier	= s->barrier;
+		p->limit	= s->limit;
+		p->held		= s->held;
+		p->maxheld	= s->maxheld;
+		p->minheld	= s->minheld;
+		p->failcnt	= s->failcnt;
+		p->__unused1	= 0;
+		p->__unused2	= 0;
+
+		wrote += sizeof(data->param[resource]);
+	}
+	return wrote;
+}
+
+int ubstat_alloc_store(struct user_beancounter *ub)
+{
+	if (ub->ub_store == NULL) {
+		struct ubparm *store;
+
+		store = kmemdup(ub->ub_parms,
+				UB_RESOURCES * sizeof(struct ubparm),
+				GFP_KERNEL);
+		if (store == NULL)
+			return -ENOMEM;
+
+		spin_lock(&ubs_notify_lock);
+		if (ub->ub_store != NULL)
+			kfree(store);
+		else
+			ub->ub_store = store;
+		spin_unlock(&ubs_notify_lock);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ubstat_alloc_store);
+
+static bool ubstat_need_memcg_sync(long cmd)
+{
+	if (UBSTAT_CMD(cmd) != UBSTAT_READ_ONE)
+		return true;
+
+	switch (UBSTAT_PARMID(cmd)) {
+		case UB_KMEMSIZE:
+		case UB_DCACHESIZE:
+		case UB_PHYSPAGES:
+		case UB_SWAPPAGES:
+		case UB_OOMGUARPAGES:
+			return true;
+	}
+	return false;
+}
+
+static int ubstat_check_cmd(long cmd)
+{
+	switch (UBSTAT_CMD(cmd)) {
+		case UBSTAT_READ_ONE:
+			if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
+				break;
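+			/* fall through: a valid parameter id is accepted */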
+		case UBSTAT_READ_ALL:
+		case UBSTAT_READ_FULL:
+			return 0;
+	}
+	return -EINVAL;
+}
+
+static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
+		void __user *buf, long size)
+{
+	void *kbuf;
+	int retval;
+
+	retval = ubstat_check_cmd(cmd);
+	if (retval)
+		return retval;
+
+	kbuf = (void *)__get_free_page(GFP_KERNEL);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	retval = ubstat_alloc_store(ub);
+	if (retval)
+		goto out;
+
+	if (ubstat_need_memcg_sync(cmd))
+		ub_sync_memcg(ub);
+
+	spin_lock(&ubs_notify_lock);
+	switch (UBSTAT_CMD(cmd)) {
+		case UBSTAT_READ_ONE:
+			retval = ubstat_do_read_one(ub,
+					UBSTAT_PARMID(cmd), kbuf);
+			break;
+		case UBSTAT_READ_ALL:
+			retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
+			break;
+		case UBSTAT_READ_FULL:
+			retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
+			break;
+		default:
+			retval = -EINVAL;
+			__WARN_printf("%s: unexpected command %ld\n",
+					__func__, UBSTAT_CMD(cmd));
+	}
+	spin_unlock(&ubs_notify_lock);
+
+	if (retval > 0) {
+		retval = min_t(long, retval, size);
+		if (copy_to_user(buf, kbuf, retval))
+			retval = -EFAULT;
+	}
+out:
+	free_page((unsigned long)kbuf);
+	return retval;
+}
+
+static int ubstat_handle_notifrq(ubnotifrq_t *req)
+{
+	int retval;
+	struct ub_stat_notify *new_notify;
+	struct list_head *entry;
+	struct task_struct *tsk_to_free;
+
+	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
+	if (new_notify == NULL)
+		return -ENOMEM;
+
+	tsk_to_free = NULL;
+	INIT_LIST_HEAD(&new_notify->list);
+
+	spin_lock(&ubs_notify_lock);
+	list_for_each(entry, &ubs_notify_list) {
+		struct ub_stat_notify *notify;
+
+		notify = list_entry(entry, struct ub_stat_notify, list);
+		if (notify->task == current) {
+			kfree(new_notify);
+			new_notify = notify;
+			break;
+		}
+	}
+
+	retval = -EINVAL;
+	if (req->maxinterval < 1)
+		goto out_unlock;
+	if (req->maxinterval > TIME_MAX_SEC)
+		req->maxinterval = TIME_MAX_SEC;
+	if (req->maxinterval < ubs_min_interval) {
+		unsigned long dif;
+
+		ubs_min_interval = req->maxinterval;
+		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
+		if (dif > req->maxinterval)
+			mod_timer(&ubs_timer,
+					ubs_timer.expires -
+					(dif - req->maxinterval) * HZ);
+	}
+
+	if (entry != &ubs_notify_list) {
+		list_del(&new_notify->list);
+		tsk_to_free = new_notify->task;
+	}
+	if (req->signum) {
+		new_notify->task = current;
+		get_task_struct(new_notify->task);
+		new_notify->signum = req->signum;
+		list_add(&new_notify->list, &ubs_notify_list);
+	} else
+		kfree(new_notify);
+	retval = 0;
+out_unlock:
+	spin_unlock(&ubs_notify_lock);
+	if (tsk_to_free != NULL)
+		put_task_struct(tsk_to_free);
+	return retval;
+}
+
+/*
+ * former sys_ubstat
+ */
+long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+		void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub;
+
+	if (func == UBSTAT_UBPARMNUM)
+		return UB_RESOURCES;
+	if (func == UBSTAT_UBLIST)
+		return ubstat_get_list(buf, size);
+
+	if (func == UBSTAT_GETTIME) {
+		retval = ubstat_gettime(buf, size);
+		goto notify;
+	}
+
+	ub = get_exec_ub();
+	if (ub != NULL && ub_legacy_id(ub) == arg1 && (uid_t)arg1 != -1)
+		get_beancounter(ub);
+	else /* FIXME must be if (ve_is_super) */
+		ub = get_beancounter_byuid(arg1, 0);
+
+	if (ub == NULL)
+		return -ESRCH;
+
+	retval = ubstat_get_stat(ub, func, buf, size);
+	put_beancounter(ub);
+notify:
+	/* Handle request for notification */
+	if (retval >= 0) {
+		ubnotifrq_t notifrq;
+		int err;
+
+		err = -EFAULT;
+		if (!copy_from_user(&notifrq, (void __user *)arg2,
+					sizeof(notifrq)))
+			err = ubstat_handle_notifrq(&notifrq);
+		if (err)
+			retval = err;
+	}
+
+	return retval;
+}
+
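+/*
+ * Snapshot the current resource parameters into ub_store and reset the
+ * min/max held watermarks for the next accounting interval.
+ */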
+static void ubstat_save_onestat(struct user_beancounter *ub)
+{
+	int resource;
+
+	if (ub->ub_store == NULL)
+		return;
+
+	/* called with local irq disabled */
+	spin_lock(&ub->ub_lock);
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
+			sizeof(struct ubparm));
+		ub->ub_parms[resource].minheld =
+			ub->ub_parms[resource].maxheld =
+			ub->ub_parms[resource].held;
+	}
+	spin_unlock(&ub->ub_lock);
+}
+
+static void ubstat_save_statistics(void)
+{
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	local_irq_save(flags);
+	for_each_beancounter (ub)
+		ubstat_save_onestat(ub);
+	local_irq_restore(flags);
+}
+
+static void ubstatd_timeout(unsigned long __data)
+{
+	struct task_struct *p;
+
+	p = (struct task_struct *) __data;
+	wake_up_process(p);
+}
+
+/*
+ * Safe wrapper for send_sig. It prevents a race with release_task
+ * for sighand.
+ * Should be called under tasklist_lock.
+ */
+static void task_send_sig(struct ub_stat_notify *notify)
+{
+	if (likely(notify->task->sighand != NULL))
+		send_sig(notify->signum, notify->task, 1);
+}
+
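+/*
+ * Advance the statistics window, re-arm the timer, snapshot the counters
+ * and signal every registered listener; task references are dropped only
+ * after all locks have been released.
+ */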
+static inline void do_notifies(void)
+{
+	LIST_HEAD(notif_free_list);
+	struct ub_stat_notify *notify;
+	struct ub_stat_notify *tmp;
+
+	spin_lock(&ubs_notify_lock);
+	ubs_start_time = ubs_end_time;
+	/*
+	 * the expression below relies on time being unsigned long and
+	 * arithmetic promotion rules
+	 */
+	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
+	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
+	ubs_min_interval = TIME_MAX_SEC;
+	/* save statistics accumulated for the interval */
+	ubstat_save_statistics();
+	/* send signals */
+	qread_lock(&tasklist_lock);
+	list_for_each_entry_safe(notify, tmp, &ubs_notify_list, list) {
+		task_send_sig(notify);
+		list_move(&notify->list, &notif_free_list);
+	}
+	qread_unlock(&tasklist_lock);
+	spin_unlock(&ubs_notify_lock);
+
+	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
+		put_task_struct(notify->task);
+		list_del(&notify->list);
+		kfree(notify);
+	}
+}
+
+/*
+ * Kernel thread
+ */
+static int ubstatd(void *unused)
+{
+	ubs_timer.data = (unsigned long)current;
+	ubs_timer.function = ubstatd_timeout;
+	add_timer(&ubs_timer);
+
+	while (1) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		if (time_after(ubs_timer.expires, jiffies)) {
+			schedule();
+			try_to_freeze();
+			continue;
+		}
+
+		__set_task_state(current, TASK_RUNNING);
+		do_notifies();
+	}
+	return 0;
+}
+
+static int __init ubstatd_init(void)
+{
+	init_timer(&ubs_timer);
+	ubs_timer.expires = TIME_MAX_JIF;
+	ubs_min_interval = TIME_MAX_SEC;
+	ubs_start_time = ubs_end_time = 0;
+	kthread_run(ubstatd, NULL, "ubstatd");
+	return 0;
+}
+
+module_init(ubstatd_init);
--- /dev/null
+++ b/kernel/bc/sys.c
@@ -0,0 +1,161 @@
+/*
+ *  kernel/bc/sys.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/virtinfo.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include <bc/beancounter.h>
+
+/*
+ *	The (rather boring) getluid syscall
+ */
+SYSCALL_DEFINE0(getluid)
+{
+	struct user_beancounter *ub;
+	uid_t uid;
+
+	ub = get_exec_ub();
+	if (ub == NULL)
+		return -EINVAL;
+
+	uid = ub_legacy_id(ub);
+	if (uid == -1)
+		return -EINVAL;
+
+	return uid;
+}
+
+/*
+ *	The setluid syscall
+ */
+SYSCALL_DEFINE1(setluid, uid_t, uid)
+{
+	struct user_beancounter *ub;
+	int error;
+
+	/* You may not disown a setluid */
+	error = -EINVAL;
+	if (uid == (uid_t)-1)
+		goto out;
+
+	/* You may only set an ub as root */
+	error = -EPERM;
+	if (!capable(CAP_SETUID))
+		goto out;
+
+	/* Ok - set up a beancounter entry for this user */
+	error = -ENOBUFS;
+	ub = get_beancounter_byuid(uid, 1);
+	if (ub == NULL)
+		goto out;
+	error = ub_attach_task(ub, current);
+	put_beancounter(ub);
+out:
+	return error;
+}
+
+long do_setublimit(uid_t uid, unsigned long resource,
+		unsigned long *new_limits)
+{
+	int error;
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	error = -EINVAL;
+	if (resource >= UB_RESOURCES)
+		goto out;
+
+	error = -EINVAL;
+	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
+		goto out;
+
+	error = -ENOENT;
+	ub = get_beancounter_byuid(uid, 0);
+	if (ub == NULL)
+		goto out;
+
+	ub_sync_memcg(ub);
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[resource].barrier = new_limits[0];
+	ub->ub_parms[resource].limit = new_limits[1];
+	init_beancounter_precharge(ub, resource);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	error = ub_update_memcg(ub);
+
+	put_beancounter(ub);
+out:
+	return error;
+}
+
+/*
+ *	The setbeanlimit syscall
+ */
+SYSCALL_DEFINE3(setublimit, uid_t, uid, unsigned long, resource,
+		unsigned long __user *, limits)
+{
+	unsigned long new_limits[2];
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
+		return -EFAULT;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+		void __user *buf, long size);
+
+SYSCALL_DEFINE5(ubstat, int, func, unsigned long, arg1, unsigned long, arg2,
+		void __user *, buf, long, size)
+{
+	if (!capable(CAP_DAC_OVERRIDE) && !capable(CAP_DAC_READ_SEARCH))
+		return -EPERM;
+
+	return do_ubstat(func, arg1, arg2, buf, size);
+}
+
+#ifdef CONFIG_COMPAT
+#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1)
+
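+/*
+ * 32-bit callers express "unlimited" as the largest value fitting into
+ * compat_long_t, so translate UB_MAXVALUE_COMPAT into the native
+ * UB_MAXVALUE before applying the limits.
+ */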
+asmlinkage long compat_sys_setublimit(uid_t uid,
+		compat_long_t resource,
+		compat_long_t __user *limits)
+{
+	compat_long_t u_new_limits[2];
+	unsigned long new_limits[2];
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits)))
+		return -EFAULT;
+
+	new_limits[0] = u_new_limits[0];
+	new_limits[1] = u_new_limits[1];
+
+	if (u_new_limits[0] == UB_MAXVALUE_COMPAT)
+		new_limits[0] = UB_MAXVALUE;
+	if (u_new_limits[1] == UB_MAXVALUE_COMPAT)
+		new_limits[1] = UB_MAXVALUE;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+asmlinkage long compat_sys_ubstat(int func, unsigned int arg1,
+		unsigned int arg2, compat_uptr_t *buf, long size)
+{
+	return sys_ubstat(func, arg1, arg2, buf, size);
+}
+#endif
--- /dev/null
+++ b/kernel/bc/vm_pages.c
@@ -0,0 +1,300 @@
+/*
+ *  kernel/bc/vm_pages.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/virtinfo.h>
+#include <linux/module.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/ve.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+int ub_overcommit_memory;
+
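+/*
+ * Charge a mapping of @size bytes against the mm owner's beancounter:
+ * locked pages for VM_LOCKED mappings and privvmpages for private ones.
+ * On failure any partial charge is rolled back.
+ */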
+int ub_memory_charge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file, int sv)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	size >>= PAGE_SHIFT;
+	if (size > UB_MAXVALUE)
+		return -EINVAL;
+
+	BUG_ON(sv != UB_SOFT && sv != UB_HARD);
+
+	if (vm_flags & VM_LOCKED) {
+		if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
+			goto out_err;
+	}
+	if (VM_UB_PRIVATE(vm_flags, vm_file)) {
+		if (charge_beancounter_fast(ub, UB_PRIVVMPAGES, size, sv))
+			goto out_private;
+	}
+	return 0;
+
+out_private:
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+out_err:
+	return -ENOMEM;
+}
+
+void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	size >>= PAGE_SHIFT;
+
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+	if (VM_UB_PRIVATE(vm_flags, vm_file))
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, size);
+}
+
+int ub_locked_charge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+extern int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages);
+
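+/*
+ * Overcommit check: fail once privvmpages crosses the barrier, otherwise
+ * defer to the memory cgroup unless overcommit is enabled or the caller
+ * runs in the host beancounter (ub0).
+ */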
+int ub_enough_memory(struct mm_struct *mm, long pages)
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+	unsigned long flags;
+	int ret;
+
+	if (!mm)
+		return 0;
+
+	ub = mm->mm_ub;
+
+	if (ub->ub_parms[UB_PRIVVMPAGES].held >
+	    ub->ub_parms[UB_PRIVVMPAGES].barrier) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (ub == get_ub0() || ub_overcommit_memory)
+		return 0;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_enough_memory(mem_cgroup_from_cont(css->cgroup), pages);
+	css_put(css);
+out:
+	if (unlikely(ret < 0)) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		ub->ub_parms[UB_PRIVVMPAGES].failcnt++;
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+	return ret;
+}
+
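+/*
+ * Virtualize sysinfo for a container: report RAM and swap totals from
+ * the PHYSPAGES/SWAPPAGES limits and compute the free amounts as
+ * limit minus held.
+ */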
+static int bc_fill_sysinfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct sysinfo *si)
+{
+	unsigned long used, total;
+	unsigned long totalram, totalswap;
+
+	/* No virtualization */
+	if (meminfo_val == VE_MEMINFO_SYSTEM)
+		return NOTIFY_DONE | NOTIFY_STOP_MASK;
+
+	totalram = si->totalram;
+	totalswap = si->totalswap;
+
+	memset(si, 0, sizeof(*si));
+
+	ub_sync_memcg(ub);
+
+	total = ub->ub_parms[UB_PHYSPAGES].limit;
+	used = ub->ub_parms[UB_PHYSPAGES].held;
+
+	if (total == UB_MAXVALUE)
+		total = totalram;
+
+	si->totalram = total;
+	si->freeram = (total > used ? total - used : 0);
+
+	total = ub->ub_parms[UB_SWAPPAGES].limit;
+	used = ub->ub_parms[UB_SWAPPAGES].held;
+
+	if (total == UB_MAXVALUE)
+		total = totalswap;
+
+	si->totalswap = total;
+	si->freeswap = (total > used ? total - used : 0);
+
+	si->mem_unit = PAGE_SIZE;
+
+	return NOTIFY_OK;
+}
+
+extern void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi);
+
+static int bc_fill_meminfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct meminfo *mi)
+{
+	struct cgroup_subsys_state *css;
+	int cpu, ret;
+
+	ret = bc_fill_sysinfo(ub, meminfo_val, mi->si);
+	if (ret & NOTIFY_STOP_MASK)
+		goto out;
+
+	css = ub_get_mem_css(ub);
+	mem_cgroup_fill_meminfo(mem_cgroup_from_cont(css->cgroup), mi);
+	css_put(css);
+
+	mi->locked = ub->ub_parms[UB_LOCKEDPAGES].held;
+
+	mi->dirty_pages = __ub_stat_get(ub, dirty_pages);
+	mi->writeback_pages = __ub_stat_get(ub, writeback_pages);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+
+		mi->dirty_pages	+= pcpu->dirty_pages;
+		mi->writeback_pages += pcpu->writeback_pages;
+	}
+
+	mi->dirty_pages = max_t(long, 0, mi->dirty_pages);
+	mi->writeback_pages = max_t(long, 0, mi->writeback_pages);
+out:
+	return ret;
+}
+
+static int bc_fill_vmstat(struct user_beancounter *ub, unsigned long *stat)
+{
+	/* FIXME: show swapin/swapout? */
+	return NOTIFY_OK;
+}
+
+static int bc_mem_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	switch (event) {
+	case VIRTINFO_MEMINFO: {
+		struct meminfo *mi = arg;
+		return bc_fill_meminfo(mi->ub, mi->meminfo_val, mi);
+	}
+	case VIRTINFO_SYSINFO:
+		return bc_fill_sysinfo(get_exec_ub(),
+				get_exec_env()->meminfo_val, arg);
+	case VIRTINFO_VMSTAT:
+		return bc_fill_vmstat(get_exec_ub(), arg);
+	}
+
+	return old_ret;
+}
+
+static struct vnotifier_block bc_mem_notifier_block = {
+	.notifier_call = bc_mem_notify,
+};
+
+static int __init init_vmguar_notifier(void)
+{
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_mem_notifier_block);
+	return 0;
+}
+
+static void __exit fini_vmguar_notifier(void)
+{
+	virtinfo_notifier_unregister(VITYPE_GENERAL, &bc_mem_notifier_block);
+}
+
+module_init(init_vmguar_notifier);
+module_exit(fini_vmguar_notifier);
+
+#ifdef CONFIG_PROC_FS
+static int bc_vmaux_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+
+	ub = seq_beancounter(f);
+
+	ub_sync_memcg(ub);
+
+	seq_printf(f, bc_proc_lu_fmt, "swapin", ub->swapin);
+	seq_printf(f, bc_proc_lu_fmt, "swapout", ub->swapout);
+
+	seq_printf(f, bc_proc_lu_fmt, "ram", ub->ub_parms[UB_PHYSPAGES].held);
+
+	return 0;
+}
+static struct bc_proc_entry bc_vmaux_entry = {
+	.name = "vmaux",
+	.u.show = bc_vmaux_show,
+};
+
+static int __init bc_vmaux_init(void)
+{
+	bc_register_proc_entry(&bc_vmaux_entry);
+	return 0;
+}
+
+late_initcall(bc_vmaux_init);
+#endif
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -16,6 +16,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <asm/uaccess.h>
+#include <linux/ve.h>
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -397,6 +398,25 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+#ifdef CONFIG_VE
+bool ve_capable(int cap)
+{
+	struct cred *cred = get_exec_env()->init_cred;
+
+	if (cred == NULL) /* ve isn't running */
+		cred = ve0.init_cred;
+
+	return ns_capable(cred->user_ns, cap);
+}
+#else
+bool ve_capable(int cap)
+{
+	return capable(cap);
+}
+#endif
+
+EXPORT_SYMBOL_GPL(ve_capable);
+
 /**
  * file_ns_capable - Determine if the file's opener had a capability in effect
  * @file:  The file we want to check
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -39,7 +39,6 @@
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
-#include <linux/sched.h>
 #include <linux/backing-dev.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -60,12 +59,10 @@
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/ve.h>
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -215,23 +212,11 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -838,7 +823,7 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
@@ -883,7 +868,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	queue_work(cgroup_destroy_wq, &cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1103,6 +1089,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (strlen(root->release_agent_path))
 		seq_show_option(seq, "release_agent",
 				root->release_agent_path);
+
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
@@ -1111,19 +1098,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	return 0;
 }
 
-struct cgroup_sb_opts {
-	unsigned long subsys_mask;
-	unsigned long flags;
-	char *release_agent;
-	bool cpuset_clone_children;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-
-	struct cgroupfs_root *new_root;
-
-};
-
 /*
  * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
  * with cgroup_mutex held to protect the subsys[] array. This function takes
@@ -1400,10 +1374,60 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	return ret;
 }
 
+#ifdef CONFIG_VE
+static int cgroup_show_path(struct seq_file *m, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct cgroup *cgrp = __d_cgrp(dentry);
+	char *buf;
+	int ret;
+
+	/*
+	 * The dentry may refer to a cgroup file, not only to a directory,
+	 * so use the parent's cgroup to build the directory path and
+	 * append the dentry name at the end.
+	 */
+	if (!inode || !S_ISDIR(inode->i_mode))
+		cgrp = __d_cgrp(dentry->d_parent);
+
+	ret = -ENOMEM;
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = cgroup_path_ve(cgrp, buf, PATH_MAX);
+	if (ret < 0)
+		goto out_free;
+
+	ret = seq_puts(m, buf);
+	if (ret < 0)
+		goto out_free;
+
+	if (!inode || !S_ISDIR(inode->i_mode)) {
+		if (buf[1] != '\0') {
+			ret = seq_putc(m, '/');
+			if (ret < 0)
+				goto out_free;
+		}
+
+		ret = seq_puts(m, dentry->d_name.name);
+		if (ret < 0)
+			goto out_free;
+	}
+out_free:
+	kfree(buf);
+out:
+	return ret;
+}
+#endif
+
 static const struct super_operations cgroup_ops = {
 	.statfs = simple_statfs,
 	.drop_inode = generic_delete_inode,
 	.show_options = cgroup_show_options,
+#ifdef CONFIG_VE
+	.show_path = cgroup_show_path,
+#endif
 	.remount_fs = cgroup_remount,
 };
 
@@ -1416,7 +1440,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1586,10 +1609,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 	struct inode *inode;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
+		if (!get_exec_env()->is_pseudosuper)
+			return ERR_PTR(-EACCES);
+	}
+#endif
+
 	/* First find the desired set of subsystems */
-	mutex_lock(&cgroup_mutex);
-	ret = parse_cgroupfs_options(data, &opts);
-	mutex_unlock(&cgroup_mutex);
+	if (!(flags & MS_KERNMOUNT)) {
+		mutex_lock(&cgroup_mutex);
+		ret = parse_cgroupfs_options(data, &opts);
+		mutex_unlock(&cgroup_mutex);
+	} else {
+		opts = *(struct cgroup_sb_opts *)data;
+		opts.name = kstrdup(opts.name, GFP_KERNEL);
+		opts.release_agent = kstrdup(opts.release_agent, GFP_KERNEL);
+	}
 	if (ret)
 		goto out_err;
 
@@ -1781,6 +1817,7 @@ static struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
+	.fs_flags = FS_VIRTUALIZED,
 };
 
 static struct kobject *cgroup_kobj;
@@ -1798,7 +1835,8 @@ static struct kobject *cgroup_kobj;
  * inode's i_mutex, while on the other hand cgroup_path() can be called
  * with some irq-safe spinlocks held.
  */
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+static int __cgroup_path(const struct cgroup *cgrp, char *buf, int buflen,
+			 bool virt)
 {
 	int ret = -ENAMETOOLONG;
 	char *start;
@@ -1817,6 +1855,22 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		const char *name = cgroup_name(cgrp);
 		int len;
 
+#ifdef CONFIG_VE
+		if (virt && test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			/*
+			 * Container cgroups are bind-mounted from the node,
+			 * so from the inside they look like '/'; thus we
+			 * have to mangle the cgroup path output.
+			 */
+			if (*start != '/') {
+				if (--start < buf)
+					goto out;
+				*start = '/';
+			}
+			break;
+		}
+#endif
+
 		len = strlen(name);
 		if ((start -= len) < buf)
 			goto out;
@@ -1834,8 +1888,18 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	rcu_read_unlock();
 	return ret;
 }
+
+int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+{
+	return __cgroup_path(cgrp, buf, buflen, false);
+}
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
+{
+	return __cgroup_path(cgrp, buf, buflen, !ve_is_super(get_exec_env()));
+}
+
 /*
  * Control Group taskset
  */
@@ -2228,10 +2292,13 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
+
 	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
@@ -2334,6 +2401,31 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
+#ifdef CONFIG_VE
+	/*
+	 * For the sake of Docker we might bind-mount cgroups so
+	 * that they look like
+	 *
+	 * Node				Container
+	 * /sys/fs/cgroup/memory/CTID	/sys/fs/cgroup/memory
+	 *
+	 * but we must not allow modifying these toplevel cgroups,
+	 * only nested ones, because the toplevel cgroup carries the
+	 * container's resource limits, settings, etc.
+	 *
+	 * At the same time the ve cgroup must stay writable during
+	 * container startup (to modify the @ve.state entry which
+	 * kicks the container to run), but once the ve is up and
+	 * running, userspace from ve0 should *never* bind-mount it
+	 * inside a container FS.
+	 */
+	if (!ve_is_super(get_exec_env())
+	    && test_bit(CGRP_VE_ROOT, &cgrp->flags)
+	    && !get_exec_env()->is_pseudosuper
+	    && !(cft->flags & CFTYPE_VE_WRITABLE))
+		return -EPERM;
+#endif
+
 	if (cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
@@ -2880,6 +2972,7 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 	cgroup_cfts_commit(ss, NULL, false);
 	return -ENOENT;
 }
+EXPORT_SYMBOL_GPL(cgroup_rm_cftypes);
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3973,6 +4066,22 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 	return 0;
 }
 
+static u64 cgroup_read_subgroups_limit(struct cgroup *cgrp,
+				struct cftype *cft)
+{
+	return cgrp->subgroups_limit;
+}
+static int cgroup_write_subgroups_limit(struct cgroup *cgrp,
+					struct cftype *cft,
+					u64 val)
+{
+	if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		return -EACCES;
+
+	cgrp->subgroups_limit = val;
+	return 0;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -3981,6 +4090,7 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 static struct cftype files[] = {
 	{
 		.name = "tasks",
+		.flags = CFTYPE_VE_WRITABLE,
 		.open = cgroup_tasks_open,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_pidlist_release,
@@ -3988,6 +4098,7 @@ static struct cftype files[] = {
 	},
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
+		.flags = CFTYPE_VE_WRITABLE,
 		.open = cgroup_procs_open,
 		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
@@ -4021,6 +4132,12 @@ static struct cftype files[] = {
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
+	{
+		.name = "cgroup.subgroups_limit",
+		.read_u64 = cgroup_read_subgroups_limit,
+		.write_u64 = cgroup_write_subgroups_limit,
+		.mode = S_IRUGO | S_IWUSR,
+	},
 	{ }	/* terminate */
 };
 
@@ -4072,15 +4189,23 @@ static void css_dput_fn(struct work_struct *work)
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, dput_work);
 
+	percpu_ref_exit(&css->refcnt);
 	cgroup_dput(css->cgroup);
 }
 
+static void css_release(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	queue_work(cgroup_destroy_wq, &css->dput_work);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
@@ -4128,6 +4253,49 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
 
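+/* Count all descendant cgroups of @cgroup; used to enforce subgroups_limit. */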
+static int subgroups_count(struct cgroup *cgroup)
+{
+	struct cgroup *pos;
+	int cgrps_count = 0;
+
+	rcu_read_lock();
+	cgroup_for_each_descendant_post(pos, cgroup)
+		cgrps_count++;
+	rcu_read_unlock();
+
+	return cgrps_count;
+}
+
+#ifdef CONFIG_VE
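+/*
+ * Mark the container's current cgroup in every active hierarchy as a VE
+ * root, so that path virtualization and the per-VE write restrictions
+ * can key off the CGRP_VE_ROOT flag.
+ */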
+void cgroup_mark_ve_root(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = task_cgroup_from_root(ve->init_task, root);
+		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
+struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+{
+	struct cgroup *ve_root = NULL;
+
+	do {
+		if (test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			ve_root = cgrp;
+			break;
+		}
+		cgrp = cgrp->parent;
+	} while (cgrp);
+
+	return ve_root;
+}
+#endif
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -4145,6 +4313,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	int err = 0;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
+	struct cgroup *ve_root = parent;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
@@ -4160,6 +4329,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (cgrp->id < 0)
 		goto err_free_name;
 
+	ve_root = cgroup_get_ve_root(parent);
+	if (ve_root && ve_root->subgroups_limit > 0 &&
+			subgroups_count(ve_root) >= ve_root->subgroups_limit) {
+		err = -EACCES;
+		goto err_free_name;
+	}
+
 	/*
 	 * Only live parents can have children.  Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
@@ -4201,7 +4377,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			err = PTR_ERR(css);
 			goto err_free_all;
 		}
+
+		err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
+		if (err)
+			goto err_free_all;
+
 		init_cgroup_css(css, ss, cgrp);
+
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
@@ -4258,8 +4440,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_free_all:
 	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_exit(&css->refcnt);
 			ss->css_free(cgrp);
+		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	/* Release the reference count that we took on the superblock */
@@ -4287,63 +4473,122 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+		return;
+
+	/* percpu ref's of all css's are killed, kick off the next step */
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup *parent = cgrp->parent;
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
+	struct cgroup *child;
+	bool empty;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+	if (atomic_read(&cgrp->count))
+		return -EBUSY;
+
+	/*
+	 * Make sure there's no live children.  We can't test ->children
+	 * emptiness as dead children linger on it while being destroyed;
+	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
+	 */
+	empty = true;
+	rcu_read_lock();
+	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
+		empty = cgroup_is_removed(child);
+		if (!empty)
+			break;
+	}
+	rcu_read_unlock();
+	if (!empty)
 		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
-	 * removed.  This makes future css_tryget() and child creation
-	 * attempts fail thus maintaining the removal conditions verified
-	 * above.
+	 * Block new css_tryget() by killing css refcnts.  cgroup core
+	 * guarantees that, by the time ->css_offline() is invoked, no new
+	 * css reference will be given out via css_tryget().  We can't
+	 * simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
+	 * as killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each
+	 * css is confirmed to be seen as killed on all CPUs.  The
+	 * notification callback keeps track of the number of css's to be
+	 * killed and schedules cgroup_offline_fn() to perform the rest of
+	 * destruction once the percpu refs of all css's are confirmed to
+	 * be killed.
 	 */
+	atomic_set(&cgrp->css_kill_cnt, 1);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-	}
-	set_bit(CGRP_REMOVED, &cgrp->flags);
+		/*
+		 * Killing would put the base ref, but we need to keep it
+		 * alive until after ->css_offline.
+		 */
+		percpu_ref_get(&css->refcnt);
 
-	/* tell subsystems to initate destruction */
-	for_each_subsys(cgrp->root, ss)
-		offline_css(ss, cgrp);
+		atomic_inc(&cgrp->css_kill_cnt);
+		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
+	}
+	cgroup_css_killed(cgrp);
 
-	/*
-	 * Put all the base refs.  Each css holds an extra reference to the
-	 * cgroup's dentry and cgroup removal proceeds regardless of css
-	 * refs.  On the last put of each css, whenever that may be, the
-	 * extra dentry ref is put so that dentry destruction happens only
-	 * after all css's are released.
-	 */
-	for_each_subsys(cgrp->root, ss)
-		css_put(cgrp->subsys[ss->subsys_id]);
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
 	raw_spin_lock(&release_list_lock);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
-	/* delete this cgroup from parent->children */
-	list_del_rcu(&cgrp->sibling);
-	list_del_init(&cgrp->allcg_node);
-
+	/*
+	 * Remove @cgrp directory.  The removal puts the base ref but we
+	 * aren't quite done with @cgrp yet, so hold onto it.
+	 */
 	dget(d);
 	cgroup_d_remove_dir(d);
-	dput(d);
-
-	set_bit(CGRP_RELEASABLE, &parent->flags);
-	check_for_release(parent);
 
 	/*
 	 * Unregister events and notify userspace.
@@ -4358,6 +4603,54 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	spin_unlock(&cgrp->event_list_lock);
 
 	return 0;
+}
+
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_free_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
+static void cgroup_offline_fn(struct work_struct *work)
+{
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+	struct cgroup *parent = cgrp->parent;
+	struct dentry *d = cgrp->dentry;
+	struct cgroup_subsys *ss;
+
+	mutex_lock(&cgroup_mutex);
+
+	/*
+	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+	 * initiate destruction.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		offline_css(ss, cgrp);
+
+	/*
+	 * Put the css refs from cgroup_destroy_locked().  Each css holds
+	 * an extra reference to the cgroup's dentry and cgroup removal
+	 * proceeds regardless of css refs.  On the last put of each css,
+	 * whenever that may be, the extra dentry ref is put so that dentry
+	 * destruction happens only after all css's are released.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
+
+	/* delete this cgroup from parent->children */
+	list_del_rcu(&cgrp->sibling);
+	list_del_init(&cgrp->allcg_node);
+
+	dput(d);
+
+	set_bit(CGRP_RELEASABLE, &parent->flags);
+	check_for_release(parent);
+
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4693,7 +4986,7 @@ int __init cgroup_init(void)
 		goto out;
 	}
 
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+	proc_create("cgroups", S_ISVTX, NULL, &proc_cgroupstats_operations);
 
 out:
 	if (err)
@@ -4718,6 +5011,13 @@ static int __init cgroup_wq_init(void)
 }
 core_initcall(cgroup_wq_init);
 
+static int ve_hide_cgroups(struct cgroupfs_root *root)
+{
+	/* Hide cpuset cgroup in CT for docker */
+	return !ve_is_super(get_exec_env())
+	       && (root->subsys_mask & (1UL << cpuset_subsys_id));
+}
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4759,6 +5059,8 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 		struct cgroup *cgrp;
 		int count = 0;
 
+		if (ve_hide_cgroups(root))
+			continue;
 		seq_printf(m, "%d:", root->hierarchy_id);
 		for_each_subsys(root, ss)
 			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -4767,7 +5069,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 				   root->name);
 		seq_putc(m, ':');
 		cgrp = task_cgroup_from_root(tsk, root);
-		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
+		retval = cgroup_path_ve(cgrp, buf, PAGE_SIZE);
 		if (retval < 0)
 			goto out_unlock;
 		seq_puts(m, buf);
@@ -4783,6 +5085,8 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	return retval;
 }
 
+#define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
+
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
@@ -4797,11 +5101,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		int num;
+
 		if (ss == NULL)
 			continue;
+		if (ve_hide_cgroups(ss->root))
+			continue;
+		num = _cg_virtualized(ss->root->number_of_cgroups);
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
-			   ss->root->number_of_cgroups, !ss->disabled);
+			   num, !ss->disabled);
 	}
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5077,34 +5386,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		queue_work(cgroup_destroy_wq, &css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
@@ -5135,7 +5416,7 @@ static void cgroup_release_agent(struct work_struct *work)
 	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
-		int i;
+		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
@@ -5166,7 +5447,12 @@ static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		if (err < 0)
+			pr_warn_ratelimited("cgroup release_agent "
+					    "%s %s failed: %d\n",
+					    agentbuf, pathbuf, err);
+
 		mutex_lock(&cgroup_mutex);
  continue_free:
 		kfree(pathbuf);
@@ -5224,7 +5510,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;
@@ -5232,18 +5518,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_id);
 
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
-	struct css_id *cssid;
-
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
-
-	if (cssid)
-		return cssid->depth;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
 /**
  *  css_is_ancestor - test "root" css is an ancestor of "child"
  * @child: the css to be tested.
@@ -5569,3 +5843,85 @@ struct cgroup_subsys debug_subsys = {
 	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
+
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts)
+{
+	return kern_mount_data(&cgroup_fs_type, opts);
+}
+
+struct cgroup *cgroup_get_root(struct vfsmount *mnt)
+{
+	return mnt->mnt_root->d_fsdata;
+}
+
+struct cgroup *cgroup_kernel_lookup(struct vfsmount *mnt,
+				    const char *pathname)
+{
+	int err;
+	struct path path;
+	struct dentry *dentry;
+	struct cgroup *cgrp;
+
+	err = vfs_path_lookup(mnt->mnt_root, mnt, pathname,
+			      LOOKUP_DIRECTORY, &path);
+	if (err)
+		return ERR_PTR(err);
+	dentry = path.dentry;
+	if (dentry->d_inode) {
+		cgrp = __d_cgrp(dentry);
+		atomic_inc(&cgrp->count);
+	} else
+		cgrp = ERR_PTR(-ENOENT);
+	path_put(&path);
+	return cgrp;
+}
+
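+/*
+ * Look up (and, with CGRP_CREAT, optionally create) a child cgroup by
+ * name.  CGRP_EXCL makes creation fail with -EEXIST if the directory
+ * already exists; on success the cgroup is returned with its count
+ * elevated, to be dropped via cgroup_kernel_close().
+ */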
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, const char *name)
+{
+	struct dentry *dentry;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, parent->dentry, strlen(name));
+	cgrp = ERR_CAST(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+
+	if (flags & CGRP_CREAT) {
+		if ((flags & CGRP_EXCL) && dentry->d_inode)
+			ret = -EEXIST;
+		else if (!dentry->d_inode)
+			ret = vfs_mkdir(parent->dentry->d_inode, dentry, 0755);
+	}
+	if (!ret && dentry->d_inode) {
+		cgrp = __d_cgrp(dentry);
+		atomic_inc(&cgrp->count);
+	} else
+		cgrp = ret ? ERR_PTR(ret) : NULL;
+	dput(dentry);
+out:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	return cgrp;
+}
+
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk)
+{
+	int ret;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = cgroup_attach_task(cgrp, tsk, true);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+void cgroup_kernel_close(struct cgroup *cgrp)
+{
+	if (atomic_dec_and_test(&cgrp->count)) {
+		set_bit(CGRP_RELEASABLE, &cgrp->flags);
+		check_for_release(cgrp);
+	}
+}
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -875,14 +875,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	cgroup_scan_tasks(&scan);
 }
 
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
-			  const char *buf)
+static int __update_cpumask(struct cpuset *cs,
+			    const struct cpumask *cpus_allowed)
 {
+	struct cpuset *trialcs;
 	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
@@ -891,33 +887,26 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	/*
-	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
-	 * Since cpulist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have cpus.
-	 */
-	if (!*buf) {
-		cpumask_clear(trialcs->cpus_allowed);
-	} else {
-		retval = cpulist_parse(buf, trialcs->cpus_allowed);
-		if (retval < 0)
-			return retval;
-
-		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
-			return -EINVAL;
-	}
+	if (!cpumask_subset(cpus_allowed, cpu_active_mask))
+		return -EINVAL;
+
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
+	cpumask_copy(trialcs->cpus_allowed, cpus_allowed);
+
 	retval = validate_change(cs, trialcs);
 	if (retval < 0)
-		return retval;
+		goto done;
 
 	/* Nothing to do if the cpus didn't change */
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
-		return 0;
+		goto done;
 
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
-		return retval;
+		goto done;
 
 	is_load_balanced = is_sched_load_balance(trialcs);
 
@@ -935,7 +924,41 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
 	if (is_load_balanced)
 		rebuild_sched_domains_locked();
-	return 0;
+
+	retval = 0;
+done:
+	free_trial_cpuset(trialcs);
+	return retval;
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, const char *buf)
+{
+	cpumask_var_t cpus_allowed;
+	int retval = 0;
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have cpus.
+	 */
+	if (!*buf)
+		cpumask_clear(cpus_allowed);
+	else
+		retval = cpulist_parse(buf, cpus_allowed);
+
+	if (retval == 0)
+		retval = __update_cpumask(cs, cpus_allowed);
+
+	free_cpumask_var(cpus_allowed);
+	return retval;
 }
 
 /*
@@ -1103,9 +1126,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
  */
-static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
-			   const char *buf)
+static int __update_nodemask(struct cpuset *cs,
+			   const nodemask_t *mems_allowed)
 {
+	struct cpuset *trialcs = NULL;
 	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
 	int retval;
 	struct ptr_heap heap;
@@ -1122,25 +1146,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 		goto done;
 	}
 
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	if (!*buf) {
-		nodes_clear(trialcs->mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs->mems_allowed);
-		if (retval < 0)
-			goto done;
+	if (!nodes_subset(*mems_allowed, node_states[N_MEMORY])) {
+		retval = -EINVAL;
+		goto done;
+	}
 
-		if (!nodes_subset(trialcs->mems_allowed,
-				node_states[N_MEMORY])) {
-			retval =  -EINVAL;
-			goto done;
-		}
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs) {
+		retval = -ENOMEM;
+		goto done;
 	}
+
+	trialcs->mems_allowed = *mems_allowed;
+
 	*oldmem = cs->mems_allowed;
 	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
@@ -1162,10 +1180,38 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	heap_free(&heap);
 done:
+	if (trialcs)
+		free_trial_cpuset(trialcs);
 	NODEMASK_FREE(oldmem);
 	return retval;
 }
 
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+	NODEMASK_ALLOC(nodemask_t, mems_allowed, GFP_KERNEL);
+	int retval = 0;
+
+	if (!mems_allowed)
+		return -ENOMEM;
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf)
+		nodes_clear(*mems_allowed);
+	else
+		retval = nodelist_parse(buf, *mems_allowed);
+
+	if (retval == 0)
+		retval = __update_nodemask(cs, mems_allowed);
+
+	NODEMASK_FREE(mems_allowed);
+	return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
 	int ret;
@@ -1593,7 +1639,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 				const char *buf)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
-	struct cpuset *trialcs;
 	int retval = -ENODEV;
 
 	/*
@@ -1618,25 +1663,18 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
-	trialcs = alloc_trial_cpuset(cs);
-	if (!trialcs) {
-		retval = -ENOMEM;
-		goto out_unlock;
-	}
-
 	switch (cft->private) {
 	case FILE_CPULIST:
-		retval = update_cpumask(cs, trialcs, buf);
+		retval = update_cpumask(cs, buf);
 		break;
 	case FILE_MEMLIST:
-		retval = update_nodemask(cs, trialcs, buf);
+		retval = update_nodemask(cs, buf);
 		break;
 	default:
 		retval = -EINVAL;
 		break;
 	}
 
-	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return retval;
@@ -2648,7 +2686,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
 
 	rcu_read_lock();
 	css = task_subsys_state(tsk, cpuset_subsys_id);
-	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+	retval = cgroup_path_ve(css->cgroup, buf, PAGE_SIZE);
 	rcu_read_unlock();
 	if (retval < 0)
 		goto out_put_task;
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -55,6 +55,7 @@ struct cred init_cred = {
 	.user_ns		= &init_user_ns,
 	.group_info		= &init_groups,
 };
+EXPORT_SYMBOL(init_cred);
 
 static inline void set_cred_subscribers(struct cred *cred, int n)
 {
@@ -561,8 +562,8 @@ EXPORT_SYMBOL(revert_creds);
 void __init cred_init(void)
 {
 	/* allocate a slab in which we can store credentials */
-	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
-				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 }
 
 /**
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
-	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
 }
 
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -123,6 +123,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	/* For mmu_notifiers */
 	const unsigned long mmun_start = addr;
 	const unsigned long mmun_end   = addr + PAGE_SIZE;
+	struct mem_cgroup *memcg;
+
+	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+	if (err)
+		return err;
 
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(page);
@@ -135,6 +140,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr);
+	mem_cgroup_commit_charge(kpage, memcg, false);
+	lru_cache_add_active_or_unevictable(kpage, vma);
 
 	if (!PageAnon(page)) {
 		dec_mm_counter(mm, mm_counter_file(page));
@@ -156,6 +163,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
+	mem_cgroup_cancel_charge(kpage, memcg);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,20 +48,22 @@
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
+#include <linux/ve.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/kcov.h>
+
+#include <bc/misc.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
-static void exit_mm(struct task_struct * tsk);
-
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
@@ -186,6 +188,8 @@ void release_task(struct task_struct * p)
 	tasklist_write_lock_irq();
 	ptrace_release_task(p);
 	__exit_signal(p);
+	nr_zombie--;
+	atomic_inc(&nr_dead);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -208,6 +212,7 @@ void release_task(struct task_struct * p)
 	qwrite_unlock_irq(&tasklist_lock);
 	cgroup_pids_release(p);
 	release_thread(p);
+	ub_task_uncharge(get_task_ub(p));
 	call_rcu(&p->rcu, delayed_put_task_struct);
 
 	p = leader;
@@ -381,6 +386,7 @@ void mm_update_next_owner(struct mm_struct *mm)
 	 */
 	if (mm->owner != p)
 		return;
+
 	/*
 	 * The current owner is exiting/execing and there are no other
 	 * candidates.  Do not leave the mm pointing to a possibly
@@ -463,6 +469,19 @@ static void exit_mm(struct task_struct * tsk)
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
+
+#ifdef CONFIG_VE
+#define K(x) ((x) << (PAGE_SHIFT-10))
+	if (tsk->task_ve != &ve0 &&
+	    test_tsk_thread_flag(tsk, TIF_MEMDIE))
+		ve_printk(VE_LOG, KERN_ERR "OOM killed process %d (%s) "
+			  "total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+			  task_pid_vnr(tsk), tsk->comm, K(mm->total_vm),
+			  K(get_mm_counter(mm, MM_ANONPAGES)),
+			  K(get_mm_counter(mm, MM_FILEPAGES)));
+#undef K
+#endif
+
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -679,6 +698,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	}
 
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+	nr_zombie++;
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
@@ -721,6 +741,7 @@ void do_exit(long code)
 	int group_dead;
 
 	profile_task_exit(tsk);
+	kcov_task_exit(tsk);
 
 	WARN_ON(blk_needs_flush_plug(tsk));
 
@@ -807,8 +828,21 @@ void do_exit(long code)
 	exit_fs(tsk);
 	if (group_dead)
 		disassociate_ctty(1);
+
+	/*
+	 * task_work_run() has to be called before exit_task_namespaces(),
+	 * because fuse_abort_conn() is called from __fput(). If it is not
+	 * executed, we can hang in request_wait_answer(). We have seen this
+	 * happen when a process was the last member of a mount namespace
+	 * and the mount namespace had a vstorage fuse mount.
+	 */
+	task_work_run();
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
+
+	if (test_thread_flag(TIF_MEMDIE))
+		exit_oom_victim();
+
 	check_stack_usage();
 	exit_thread();
 
@@ -832,12 +866,7 @@ void do_exit(long code)
 	ptrace_put_breakpoints(tsk);
 
 	exit_notify(tsk, group_dead);
-#ifdef CONFIG_NUMA
-	task_lock(tsk);
-	mpol_put(tsk->mempolicy);
-	tsk->mempolicy = NULL;
-	task_unlock(tsk);
-#endif
+	mpol_put_task_policy(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);
@@ -903,7 +932,6 @@ void complete_and_exit(struct completion *comp, long code)
 
 	do_exit(code);
 }
-
 EXPORT_SYMBOL(complete_and_exit);
 
 SYSCALL_DEFINE1(exit, int, error_code)
@@ -975,12 +1003,40 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 	return task->pids[type].pid;
 }
 
-static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+static int __eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	return	wo->wo_type == PIDTYPE_MAX ||
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
+static int __entered_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	struct pid *pid, *wo_pid;
+
+	wo_pid = wo->wo_pid;
+	if ((wo_pid == NULL) || (wo_pid->level != 0))
+		return 0;
+
+	pid = task_pid_type(p, wo->wo_type);
+	if (pid->level != 1)
+		return 0;
+
+	if (wo_pid->numbers[0].nr != pid->numbers[0].nr)
+		return 0;
+
+	wo->wo_pid = get_pid(pid);
+	put_pid(wo_pid);
+	return 1;
+}
+
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	if (__eligible_pid(wo, p))
+		return 1;
+	else
+		return __entered_pid(wo, p);
+}
+
 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
@@ -1640,7 +1696,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 			ret = put_user(0, &infop->si_status);
 	}
 
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 	return ret;
 }
 
@@ -1676,7 +1732,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 	wo.wo_stat	= stat_addr;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 
 	return ret;
 }
--- /dev/null
+++ b/kernel/fence-watchdog.c
@@ -0,0 +1,313 @@
+/*
+ *  kernel/fence-watchdog.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Provide userspace with an interface that forbids the kernel to keep
+ * running without a userspace daemon.
+ *
+ * The daemon should write the number of seconds before fencing to
+ * /sys/kernel/watchdog_timer and must keep renewing that value before
+ * the time elapses.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/jiffies.h>
+#include <linux/reboot.h>
+#include <linux/fence-watchdog.h>
+#include <linux/device.h>
+#include <linux/kmsg_dump.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+
+#define MAX_U64			(~(u64)0)
+#define MAX_JIFFIES_DELTA	(10 * 365UL * 24UL * 3600UL * HZ)
+#define ACTION_NAME_LEN		16
+
+enum {
+	FENCE_WDOG_CRASH = 0,
+	FENCE_WDOG_REBOOT = 1,
+	FENCE_WDOG_POWEROFF = 2,
+	FENCE_WDOG_NETFILTER = 3,
+};
+
+const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
+
+
+DEFINE_VVAR(volatile unsigned long, fence_wdog_jiffies64) = MAX_U64;
+static int fence_wdog_action = FENCE_WDOG_CRASH;
+
+enum {
+	NOT_FENCED = 0,
+	FENCED = 1,
+	FENCED_TIMEOUT = 2,
+};
+
+static atomic_t fence_stage = ATOMIC_INIT(NOT_FENCED);
+static char fence_wdog_log_path[PATH_MAX] = "/fence_wdog.log";
+
+#define SECS_PER_MIN	60
+#define PREFIX_LEN	39
+
+static int print_prefix(char *msg)
+{
+	struct timeval tv;
+	struct tm tm;
+
+	do_gettimeofday(&tv);
+	time_to_tm(tv.tv_sec - sys_tz.tz_minuteswest * SECS_PER_MIN, 0, &tm);
+
+	return snprintf(msg, PREFIX_LEN, "[%02d:%02d:%02d/%04ld-%02d-%02d] fence-watchdog: ",
+			tm.tm_hour, tm.tm_min, tm.tm_sec,
+			tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday);
+}
+
+#define MSG_LEN (PREFIX_LEN + 10)
+
+void fence_wdog_log(void)
+{
+	char msg[MSG_LEN];
+	struct file *file;
+	int ret, len;
+
+	ret = print_prefix(msg);
+	if (ret < 0)
+		return;
+
+	len = strlen(msg);
+
+	ret = snprintf(msg + len, MSG_LEN - len, "%s\n", action_names[fence_wdog_action]);
+	if (ret != strlen(action_names[fence_wdog_action]) + 1) {
+		printk(KERN_EMERG"fence-watchdog: Failed to sprintf msg\n");
+		return;
+	}
+
+	file = filp_open(fence_wdog_log_path,
+			 O_CREAT | O_WRONLY | O_APPEND | O_NOFOLLOW | O_LARGEFILE,
+			 0600);
+	if (IS_ERR(file)) {
+		printk(KERN_EMERG"fence-watchdog: Failed to open log path\n");
+		return;
+	}
+
+	if (!S_ISREG(file_inode(file)->i_mode)) {
+		printk(KERN_EMERG"fence-watchdog: Wrong type of log file\n");
+		goto close;
+	}
+
+	ret = kernel_write(file, msg, strlen(msg), file->f_pos);
+	if (ret < 0) {
+		printk(KERN_EMERG"fence-watchdog: Failed to write msg, ret=%d\n", ret);
+		goto close;
+	}
+
+	ret = vfs_fsync(file, 0);
+	if (ret < 0)
+		printk(KERN_EMERG"fence-watchdog: Failed to fsync log file ret=%d\n", ret);
+
+close:
+	ret = filp_close(file, NULL);
+	if (ret < 0)
+		printk(KERN_EMERG"fence-watchdog: Failed to close log file ret=%d\n", ret);
+
+	return;
+}
+
+static void do_halt_or_reboot(struct work_struct *dummy)
+{
+	printk(KERN_EMERG"fence-watchdog: %s\n",
+	       action_names[fence_wdog_action]);
+
+	fence_wdog_log();
+
+	switch (fence_wdog_action) {
+	case FENCE_WDOG_REBOOT:
+		emergency_restart();
+		break;
+	case FENCE_WDOG_POWEROFF:
+		kernel_halt();
+		break;
+	}
+}
+
+static DECLARE_WORK(halt_or_reboot_work, do_halt_or_reboot);
+
+void fence_wdog_do_fence(void)
+{
+	if (fence_wdog_action == FENCE_WDOG_CRASH ||
+			atomic_read(&fence_stage) == FENCED_TIMEOUT)
+		panic("fence-watchdog: %s\n",
+		      action_names[fence_wdog_action]);
+	else
+		schedule_work(&halt_or_reboot_work);
+}
+
+#define FENCE_WDOG_TIMEOUT 30
+
+inline int fence_wdog_check_timer(void)
+{
+	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
+			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
+		if (atomic_cmpxchg(&fence_stage, NOT_FENCED, FENCED) == NOT_FENCED
+		    || (get_jiffies_64() > fence_wdog_jiffies64
+		    + FENCE_WDOG_TIMEOUT * HZ
+		    && atomic_cmpxchg(&fence_stage, FENCED, FENCED_TIMEOUT) == FENCED))
+			fence_wdog_do_fence();
+
+		return 1;
+	}
+
+	return 0;
+}
+
+bool fence_wdog_tmo_match(void)
+{
+	return get_jiffies_64() > fence_wdog_jiffies64;
+}
+EXPORT_SYMBOL(fence_wdog_tmo_match);
+
+static ssize_t fence_wdog_timer_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	ssize_t ret;
+	u64 jiffies_delta = fence_wdog_jiffies64 - get_jiffies_64();
+	struct timespec t;
+
+	if (jiffies_delta > MAX_JIFFIES_DELTA) {
+		ret =  sprintf(buf, "inf\n");
+	} else {
+		jiffies_to_timespec(jiffies_delta, &t);
+		ret =  sprintf(buf, "%ld\n", t.tv_sec);
+	}
+
+	return ret;
+}
+
+static ssize_t fence_wdog_timer_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long long val;
+	unsigned long jiffies_delta;
+	struct timespec t;
+
+	if (strict_strtoull(buf, 10, &val))
+		return -EINVAL;
+
+	if (val == 0) {
+		fence_wdog_jiffies64 = MAX_U64;
+		return count;
+	}
+
+	t.tv_sec = val;
+	t.tv_nsec = 0;
+
+	jiffies_delta = timespec_to_jiffies(&t);
+	if (jiffies_delta > MAX_JIFFIES_DELTA)
+		return -EINVAL;
+
+	fence_wdog_jiffies64 = get_jiffies_64() + jiffies_delta;
+
+	return count;
+}
+
+static ssize_t fence_wdog_action_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", action_names[fence_wdog_action]);
+}
+
+static ssize_t fence_wdog_action_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char str_action[ACTION_NAME_LEN];
+	int i = 0;
+
+	if (sscanf(buf, "%15s", str_action) != 1)
+		return -EINVAL;
+
+	for (i = 0; action_names[i]; i++) {
+		if ((!strnicmp(str_action, action_names[i], ACTION_NAME_LEN))) {
+			fence_wdog_action = i;
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t fence_wdog_available_actions_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	int i, ret = 0;
+
+	for (i = 0; action_names[i] != NULL; i++)
+		ret += sprintf(&buf[ret], "%s ", action_names[i]);
+
+	ret += sprintf(&buf[ret], "\n");
+	return ret;
+}
+
+static ssize_t fence_wdog_log_path_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", fence_wdog_log_path);
+}
+
+#define STORE_FORMAT_LEN 16
+
+static ssize_t fence_wdog_log_path_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char format[STORE_FORMAT_LEN];
+	int ret;
+
+	ret = snprintf(format, STORE_FORMAT_LEN, "%%%ds", PATH_MAX - 1);
+	if (ret < 0)
+		return ret;
+
+
+	if (sscanf(buf, format, fence_wdog_log_path) != 1)
+		return -EINVAL;
+	return count;
+}
+
+static struct kobj_attribute fence_wdog_timer_attr =
+	__ATTR(watchdog_timer, 0644,
+		fence_wdog_timer_show, fence_wdog_timer_store);
+
+static struct kobj_attribute fence_wdog_action_attr =
+	__ATTR(watchdog_action, 0644,
+		fence_wdog_action_show, fence_wdog_action_store);
+
+static struct kobj_attribute fence_wdog_available_actions_attr =
+	__ATTR(watchdog_available_actions, 0644,
+		fence_wdog_available_actions_show, NULL);
+
+static struct kobj_attribute fence_wdog_log_path_attr =
+	__ATTR(watchdog_log_path, 0644,
+		fence_wdog_log_path_show, fence_wdog_log_path_store);
+
+static struct attribute *fence_wdog_attrs[] = {
+	&fence_wdog_timer_attr.attr,
+	&fence_wdog_action_attr.attr,
+	&fence_wdog_available_actions_attr.attr,
+	&fence_wdog_log_path_attr.attr,
+	NULL,
+};
+
+static struct attribute_group fence_wdog_attr_group = {
+	.attrs = fence_wdog_attrs,
+};
+
+static int __init fence_wdog_init(void)
+{
+	sysfs_update_group(kernel_kobj, &fence_wdog_attr_group);
+	return 0;
+}
+
+module_init(fence_wdog_init)
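
A usage sketch (editorial, not part of the patch): the sysfs files created above are meant to be driven by a userspace heartbeat daemon, as the header comment of this file describes. A minimal such daemon could look roughly like the C program below; the 60-second fence timeout and 20-second renewal period are arbitrary example values and error handling is kept to a bare minimum.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *timer = "/sys/kernel/watchdog_timer";

	for (;;) {
		FILE *f = fopen(timer, "w");

		if (!f)
			return 1;	/* kernel lacks fence-watchdog support */
		fprintf(f, "60\n");	/* fence in 60s unless renewed */
		fclose(f);
		sleep(20);		/* renew well before expiry */
	}
}

Writing 0 instead of a timeout disarms the timer again, matching the val == 0 case in fence_wdog_timer_store() above.
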
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,6 +18,7 @@
 #include <linux/vmalloc.h>
 #include <linux/completion.h>
 #include <linux/personality.h>
+#include <linux/ratelimit.h>
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
 #include <linux/file.h>
@@ -56,6 +57,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
+#include <linux/kcov.h>
 #include <linux/freezer.h>
 #include <linux/kaiser.h>
 #include <linux/delayacct.h>
@@ -78,6 +80,7 @@
 #ifndef __GENKSYMS__
 #include <linux/user_namespace.h>
 #endif
+#include <linux/ve.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -86,6 +89,9 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/misc.h>
+#include <bc/vmpages.h>
+
 #include <trace/events/sched.h>
 
 #define CREATE_TRACE_POINTS
@@ -96,6 +102,7 @@
  */
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
 int nr_threads;			/* The idle threads do not count.. */
+EXPORT_SYMBOL(nr_threads);
 
 int max_threads;		/* tunable limit on nr_threads */
 
@@ -105,6 +112,7 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 __attribute__((__section__(".data..cacheline_aligned")))
 static atomic_t tasklist_waiters = ATOMIC_INIT(0);
 __cacheline_aligned DEFINE_QRWLOCK(tasklist_lock);  /* outer */
+EXPORT_SYMBOL(tasklist_lock);
 
 void tasklist_write_lock_irq(void)
 {
@@ -182,7 +190,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 						  int node)
 {
-	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
@@ -190,7 +198,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
-	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_info_cache;
@@ -228,7 +236,7 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+struct kmem_cache *__vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
@@ -272,11 +280,13 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	ub_task_put(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
 
+	atomic_dec(&nr_dead);
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
@@ -292,9 +302,9 @@ void __init fork_init(unsigned long mempages)
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
-	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
-			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+	task_struct_cachep = kmem_cache_create("task_struct",
+			sizeof(struct task_struct), ARCH_MIN_TASKALIGN,
+			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -377,6 +387,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 	account_kernel_stack(ti, 1);
 
+	kcov_task_init(tsk);
+
 	return tsk;
 
 free_ti:
@@ -436,6 +448,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			continue;
 		}
 		charge = 0;
+		if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
+					mpnt->vm_flags & ~VM_LOCKED,
+					mpnt->vm_file, UB_HARD))
+			goto fail_noch;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len = vma_pages(mpnt);
 
@@ -443,7 +459,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = allocate_vma(mm, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -461,6 +477,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			goto fail_nomem_anon_vma_fork;
 		tmp->vm_flags &= ~(VM_LOCKED);
 		tmp->vm_next = tmp->vm_prev = NULL;
+		tmp->vm_private_data2 = NULL;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
@@ -474,12 +491,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				atomic_inc(&mapping->i_mmap_writable);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
-			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
-				vma_nonlinear_insert(tmp,
-						&mapping->i_mmap_nonlinear);
-			else
-				vma_interval_tree_insert_after(tmp, mpnt,
-							&mapping->i_mmap);
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
 			flush_dcache_mmap_unlock(mapping);
 			mutex_unlock(&mapping->i_mmap_mutex);
 		}
@@ -526,8 +539,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 fail_nomem_anon_vma_fork:
 	mpol_put(pol);
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	free_vma(mm, tmp);
 fail_nomem:
+	ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
+			mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
+fail_noch:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
 	goto out;
@@ -559,7 +575,32 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#ifdef CONFIG_BEANCOUNTERS
+
+static inline struct mm_struct *allocate_mm(struct user_beancounter *ub)
+{
+	return kmem_cache_alloc(mm_cachep, GFP_KERNEL);
+}
+
+static inline void set_mm_ub(struct mm_struct *mm, struct user_beancounter *ub)
+{
+	mm->mm_ub = get_beancounter(ub);
+}
+
+static inline void put_mm_ub(struct mm_struct *mm)
+{
+	put_beancounter(mm->mm_ub);
+	mm->mm_ub = NULL;
+}
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define allocate_mm(ub)  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define set_mm_ub(mm, ub)
+#define put_mm_ub(mm)
+
+#endif /* CONFIG_BEANCOUNTERS */
+
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
@@ -614,6 +655,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 		return mm;
 	}
 
+	put_mm_ub(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -642,11 +684,12 @@ struct mm_struct *mm_alloc(void)
 {
 	struct mm_struct *mm;
 
-	mm = allocate_mm();
+	mm = allocate_mm(get_exec_ub());
 	if (!mm)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
+	set_mm_ub(mm, get_exec_ub());
 	mm_init_cpumask(mm);
 	return mm_init(mm, current);
 }
@@ -659,6 +702,8 @@ struct mm_struct *mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
+	if (unlikely(atomic_read(&mm->mm_users)))
+		put_mm_ub(mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	hmm_mm_destroy(mm);
@@ -689,6 +734,7 @@ void mmput(struct mm_struct *mm)
 		}
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
+		put_mm_ub(mm);
 		mmdrop(mm);
 	}
 }
@@ -919,7 +965,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!oldmm)
 		return NULL;
 
-	mm = allocate_mm();
+	mm = allocate_mm(tsk->task_bc.task_ub);
 	if (!mm)
 		goto fail_nomem;
 
@@ -929,6 +975,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
 #endif
+	set_mm_ub(mm, tsk->task_bc.task_ub);
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -960,6 +1007,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
+	put_mm_ub(mm);
 	mm_free_pgd(mm);
 	free_mm(mm);
 	return NULL;
@@ -1289,10 +1337,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/*
 	 * If the new process will be in a different pid or user namespace
-	 * do not allow it to share a thread group or signal handlers or
-	 * parent with the forking task.
+	 * do not allow it to share a thread group with the forking task.
 	 */
-	if (clone_flags & CLONE_SIGHAND) {
+	if (clone_flags & CLONE_THREAD) {
 		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
 		    (task_active_pid_ns(current) !=
 				current->nsproxy->pid_ns))
@@ -1304,9 +1351,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	retval = -ENOMEM;
+	if (ub_task_charge(get_exec_ub()))
+		goto fork_out;
+
 	p = dup_task_struct(current, node);
 	if (!p)
-		goto fork_out;
+		goto bad_fork_uncharge;
+
+	ub_task_get(get_exec_ub(), p);
 
 	ftrace_graph_init_task(p);
 	get_seccomp_filter(p);
@@ -1421,10 +1473,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_MEMCG
-	p->memcg_batch.do_batch = 0;
-	p->memcg_batch.memcg = NULL;
-#endif
 #ifdef CONFIG_BCACHE
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
@@ -1572,11 +1620,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	*/
 	recalc_sigpending();
 	if (signal_pending(current)) {
-		spin_unlock(&current->sighand->siglock);
-		qwrite_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
+	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+		retval = -ENOMEM;
+		goto bad_fork_cancel_cgroup;
+	}
 
 	if (likely(p->pid)) {
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1626,6 +1676,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	return p;
 
 bad_fork_cancel_cgroup:
+	spin_unlock(&current->sighand->siglock);
+	qwrite_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p, cgrp_ss_priv);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
@@ -1667,7 +1719,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	ub_task_put(p);
 	free_task(p);
+bad_fork_uncharge:
+	ub_task_uncharge(get_exec_ub());
 fork_out:
 	return ERR_PTR(retval);
 }
@@ -1845,16 +1900,19 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
-			SLAB_NOTRACK, sighand_ctor);
+			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
 	 * whole struct cpumask for the OFFSTACK case. We could change
@@ -1864,8 +1922,9 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
+	__vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
 	nsproxy_cache_init();
 }
@@ -1951,7 +2010,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	int err;
 
 	/*
-	 * If unsharing a user namespace must also unshare the thread.
+	 * If unsharing a user namespace, we must also unshare the thread
+	 * group and the filesystem root and working directories.
 	 */
 	if (unshare_flags & CLONE_NEWUSER)
 		unshare_flags |= CLONE_THREAD | CLONE_FS;
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -36,6 +36,12 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
+	if (test_thread_flag(TIF_MEMDIE))
+		return false;
+
+	if (p->jobctl & JOBCTL_TRAPPING)
+		return false;
+
 	if (pm_nosig_freezing || cgroup_freezing(p))
 		return true;
 
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
 #include <linux/freezer.h>
 #include <linux/bootmem.h>
 #include <linux/hugetlb.h>
+#include <linux/ve.h>
 
 #include <asm/futex.h>
 
@@ -2576,7 +2577,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
-	struct rt_mutex *pi_mutex = NULL;
 	struct futex_hash_bucket *hb;
 	union futex_key key2 = FUTEX_KEY_INIT;
 	struct futex_q q = futex_q_init;
@@ -2662,6 +2662,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 			spin_unlock(q.lock_ptr);
 		}
 	} else {
+		struct rt_mutex *pi_mutex;
+
 		/*
 		 * We have been woken up by futex_unlock_pi(), a timeout, or a
 		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2685,18 +2687,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (res)
 			ret = (res < 0) ? res : 0;
 
+		/*
+		 * If fixup_pi_state_owner() faulted and was unable to handle
+		 * the fault, unlock the rt_mutex and return the fault to
+		 * userspace.
+		 */
+		if (ret && rt_mutex_owner(pi_mutex) == current)
+			rt_mutex_unlock(pi_mutex);
+
 		/* Unqueue and drop the lock. */
 		unqueue_me_pi(&q);
 	}
 
-	/*
-	 * If fixup_pi_state_owner() faulted and was unable to handle the
-	 * fault, unlock the rt_mutex and return the fault to userspace.
-	 */
-	if (ret == -EFAULT) {
-		if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
-			rt_mutex_unlock(pi_mutex);
-	} else if (ret == -EINTR) {
+	if (ret == -EINTR) {
 		/*
 		 * We've already been requeued, but cannot restart by calling
 		 * futex_lock_pi() directly. We could restart this syscall, but
@@ -2944,6 +2947,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 {
 	int cmd = op & FUTEX_CMD_MASK;
 	unsigned int flags = 0;
+	ktime_t abs_time;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		flags |= FLAGS_SHARED;
@@ -2952,6 +2956,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		flags |= FLAGS_CLOCKRT;
 		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
+	} else if (timeout) {
+		if (cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI) {
+			abs_time = ktime_add(*timeout, timespec_to_ktime(
+					     get_exec_env()->start_timespec));
+			timeout = &abs_time;
+		}
 	}
 
 	switch (cmd) {
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -21,7 +21,8 @@ struct group_info *groups_alloc(int gidsetsize)
 	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
 	/* Make sure we always allocate at least one indirect block pointer */
 	nblocks = nblocks ? : 1;
-	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *),
+			GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
 	if (!group_info)
 		return NULL;
 	group_info->ngroups = gidsetsize;
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -286,9 +286,9 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
 			return -EFAULT;
 	} else {
 		memset(&set_buffer, 0, sizeof(set_buffer));
-		printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+		printk_once(KERN_WARNING "cmd: %s CT: %s calls setitimer() with new_value NULL pointer."
 			    " Misfeature support will be removed\n",
-			    current->comm);
+			    current->comm, task_ve_name(current));
 	}
 
 	error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
 #include <linux/bug.h>
 #include <linux/err.h>
 #include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
 
 #include <asm/unistd.h>
 
@@ -44,11 +48,12 @@ static long kptr_obfuscate(long v, int type)
  */
 static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
 {
-	long ret;
+	long t1, t2;
 
-	ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+	t1 = kptr_obfuscate((long)v1, type);
+	t2 = kptr_obfuscate((long)v2, type);
 
-	return (ret < 0) | ((ret > 0) << 1);
+	return (t1 < t2) | ((t1 > t2) << 1);
 }
 
 /* The caller must have pinned the task */
@@ -93,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
 	return err;
 }
 
+#ifdef CONFIG_EPOLL
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	struct file *filp, *filp_epoll, *filp_tgt;
+	struct kcmp_epoll_slot slot;
+	struct files_struct *files;
+
+	if (copy_from_user(&slot, uslot, sizeof(slot)))
+		return -EFAULT;
+
+	filp = get_file_raw_ptr(task1, idx1);
+	if (!filp)
+		return -EBADF;
+
+	files = get_files_struct(task2);
+	if (!files)
+		return -EBADF;
+
+	spin_lock(&files->file_lock);
+	filp_epoll = fcheck_files(files, slot.efd);
+	if (filp_epoll)
+		get_file(filp_epoll);
+	else
+		filp_tgt = ERR_PTR(-EBADF);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (filp_epoll) {
+		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+		fput(filp_epoll);
+	}
+
+	if (IS_ERR(filp_tgt))
+		return PTR_ERR(filp_tgt);
+
+	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		unsigned long, idx1, unsigned long, idx2)
 {
@@ -164,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		ret = -EOPNOTSUPP;
 #endif
 		break;
+	case KCMP_EPOLL_TFD:
+		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
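
For reference (editorial, not part of the patch), the new KCMP_EPOLL_TFD comparison above can be exercised from userspace roughly as sketched below. The sketch assumes the uapi <linux/kcmp.h> shipped with this series defines KCMP_EPOLL_TFD and struct kcmp_epoll_slot with the efd/tfd/toff fields used by kcmp_epoll_target(), and that the libc headers provide SYS_kcmp; the helper name is made up.

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kcmp.h>

/*
 * Does fd 'target' in process 'pid' refer to the same file that the
 * calling process' epoll instance 'efd' has registered under 'tfd'?
 * kcmp() returns 0 for "same file", other small non-negative values
 * for ordered inequality, and -1 with errno set on error.
 */
static int fd_matches_epoll_entry(pid_t pid, int target, int efd, int tfd)
{
	struct kcmp_epoll_slot slot = {
		.efd  = efd,	/* epoll fd in the calling process */
		.tfd  = tfd,	/* fd number stored in that epoll */
		.toff = 0,	/* first entry registered under this tfd */
	};

	return syscall(SYS_kcmp, pid, getpid(), KCMP_EPOLL_TFD,
		       target, (unsigned long)&slot);
}
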
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,283 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/preempt_mask.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ *  - initial state after open()
+ *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ *  - then, mmap() call (several calls are allowed but not useful)
+ *  - then, repeated enable/disable for a task (only one task at a time allowed)
+ */
+struct kcov {
+	/*
+	 * Reference counter. We keep one for:
+	 *  - opened file descriptor
+	 *  - task with enabled coverage (we can't unwire it from another task)
+	 */
+	atomic_t		refcount;
+	/* The lock protects mode, size, area and t. */
+	spinlock_t		lock;
+	enum kcov_mode		mode;
+	/* Size of arena (in long's for KCOV_MODE_TRACE). */
+	unsigned		size;
+	/* Coverage buffer shared with user space. */
+	void			*area;
+	/* Task for which we collect coverage, or NULL. */
+	struct task_struct	*t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	enum kcov_mode mode;
+
+	t = current;
+	/*
+	 * We are interested in code coverage as a function of syscall inputs,
+	 * so we ignore code executed in interrupts.
+	 * The checks for whether we are in an interrupt are open-coded, because
+	 * 1. We can't use in_interrupt() here, since it also returns true
+	 *    when we are inside local_bh_disable() section.
+	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+	 *    since that leads to slower generated code (three separate tests,
+	 *    one for each of the flags).
+	 */
+	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+							| NMI_MASK)))
+		return;
+	mode = READ_ONCE(t->kcov_mode);
+	if (mode == KCOV_MODE_TRACE) {
+		unsigned long *area;
+		unsigned long pos;
+
+		/*
+		 * There is some code that runs in interrupts but for which
+		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+		 * interrupts, there are paired barrier()/WRITE_ONCE() in
+		 * kcov_ioctl_locked().
+		 */
+		barrier();
+		area = t->kcov_area;
+		/* The first word is number of subsequent PCs. */
+		pos = READ_ONCE(area[0]) + 1;
+		if (likely(pos < t->kcov_size)) {
+			area[pos] = _RET_IP_;
+			WRITE_ONCE(area[0], pos);
+		}
+	}
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+	atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+	if (atomic_dec_and_test(&kcov->refcount)) {
+		vfree(kcov->area);
+		kfree(kcov);
+	}
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+	t->kcov_mode = KCOV_MODE_DISABLED;
+	t->kcov_size = 0;
+	t->kcov_area = NULL;
+	t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+	struct kcov *kcov;
+
+	kcov = t->kcov;
+	if (kcov == NULL)
+		return;
+	spin_lock(&kcov->lock);
+	if (WARN_ON(kcov->t != t)) {
+		spin_unlock(&kcov->lock);
+		return;
+	}
+	/* Just to not leave dangling references behind. */
+	kcov_task_init(t);
+	kcov->t = NULL;
+	spin_unlock(&kcov->lock);
+	kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	int res = 0;
+	void *area;
+	struct kcov *kcov = vma->vm_file->private_data;
+	unsigned long size, off;
+	struct page *page;
+
+	area = vmalloc_user(vma->vm_end - vma->vm_start);
+	if (!area)
+		return -ENOMEM;
+
+	spin_lock(&kcov->lock);
+	size = kcov->size * sizeof(unsigned long);
+	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	    vma->vm_end - vma->vm_start != size) {
+		res = -EINVAL;
+		goto exit;
+	}
+	if (!kcov->area) {
+		kcov->area = area;
+		vma->vm_flags |= VM_DONTEXPAND;
+		spin_unlock(&kcov->lock);
+		for (off = 0; off < size; off += PAGE_SIZE) {
+			page = vmalloc_to_page(kcov->area + off);
+			if (vm_insert_page(vma, vma->vm_start + off, page))
+				WARN_ONCE(1, "vm_insert_page() failed");
+		}
+		return 0;
+	}
+exit:
+	spin_unlock(&kcov->lock);
+	vfree(area);
+	return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+	struct kcov *kcov;
+
+	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+	if (!kcov)
+		return -ENOMEM;
+	atomic_set(&kcov->refcount, 1);
+	spin_lock_init(&kcov->lock);
+	filep->private_data = kcov;
+	return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+	kcov_put(filep->private_data);
+	return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_TRACE;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point the user must have enabled trace mode,
+		 * and mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntarily by KCOV_DISABLE. After that it
+		 * can be enabled for another task.
+		 */
+		unused = arg;
+		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+		    kcov->area == NULL)
+			return -EINVAL;
+		if (kcov->t != NULL)
+			return -EBUSY;
+		t = current;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct kcov *kcov;
+	int res;
+
+	kcov = filep->private_data;
+	spin_lock(&kcov->lock);
+	res = kcov_ioctl_locked(kcov, cmd, arg);
+	spin_unlock(&kcov->lock);
+	return res;
+}
+
+static const struct file_operations kcov_fops = {
+	.open		= kcov_open,
+	.unlocked_ioctl	= kcov_ioctl,
+	.mmap		= kcov_mmap,
+	.release        = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+		pr_err("failed to create kcov in debugfs\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+device_initcall(kcov_init);
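
For reference (editorial, not part of the patch), the calling sequence spelled out in the descriptor comment above, open, KCOV_INIT_TRACE, mmap, KCOV_ENABLE, run the code of interest, KCOV_DISABLE, looks roughly like this from userspace. It assumes debugfs is mounted at /sys/kernel/debug and that the uapi <linux/kcov.h> accompanying this patch provides the ioctl definitions; the buffer size and the traced syscall are example values.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kcov.h>

#define COVER_SIZE (64 << 10)	/* in unsigned longs; must be >= 2 */

int main(void)
{
	unsigned long *cover, n, i;
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);

	if (fd == -1)
		return 1;
	/* Size the buffer first; required before mmap() and enable. */
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
		return 1;
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED)
		return 1;
	if (ioctl(fd, KCOV_ENABLE, 0))
		return 1;
	cover[0] = 0;			/* word 0 counts the collected PCs */

	read(-1, NULL, 0);		/* the code we want coverage for */

	n = cover[0];
	for (i = 0; i < n; i++)
		printf("0x%lx\n", cover[i + 1]);

	if (ioctl(fd, KCOV_DISABLE, 0))
		return 1;
	close(fd);
	return 0;
}
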
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -28,7 +28,7 @@
 #include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
-#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/kernel.h>
@@ -39,12 +39,17 @@
 #include <linux/rwsem.h>
 #include <linux/ptrace.h>
 #include <linux/async.h>
+#include <linux/ve.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
 #include <asm/uaccess.h>
 
 #include <trace/events/module.h>
 
 extern int max_threads;
 
+static DEFINE_KTHREAD_WORKER(khelper_worker);
+
 /*
  * kmod_thread_locker is used for deadlock avoidance.  There is no explicit
  * locking to protect this global - it is private to the singleton khelper
@@ -69,11 +74,14 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
-	kfree(info->argv[3]); /* check call_modprobe() */
+	kfree(info->argv[4]); /* check call_modprobe() */
 	kfree(info->argv);
 }
 
-static int call_modprobe(char *module_name, int wait)
+static int __call_usermodehelper_exec(struct kthread_worker *worker,
+		struct subprocess_info *sub_info, int wait);
+
+static int call_modprobe(char *module_name, int wait, int blacklist)
 {
 	struct subprocess_info *info;
 	static char *envp[] = {
@@ -83,7 +91,7 @@ static int call_modprobe(char *module_name, int wait)
 		NULL
 	};
 
-	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+	char **argv = kmalloc(sizeof(char *[6]), GFP_KERNEL);
 	if (!argv)
 		goto out;
 
@@ -93,16 +101,24 @@ static int call_modprobe(char *module_name, int wait)
 
 	argv[0] = modprobe_path;
 	argv[1] = "-q";
-	argv[2] = "--";
-	argv[3] = module_name;	/* check free_modprobe_argv() */
-	argv[4] = NULL;
+	if (blacklist)
+		argv[2] = "-b";
+	else
+		argv[2] = "-q"; /* just repeat argv[1] */
+	argv[3] = "--";
+	argv[4] = module_name;	/* check free_modprobe_argv() */
+	argv[5] = NULL;
 
 	info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
 					 NULL, free_modprobe_argv, NULL);
 	if (!info)
 		goto free_module_name;
 
-	return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+	/*
+	 * We enter this function with the right permissions, so
+	 * it's possible to call __call_usermodehelper_exec() directly.
+	 */
+	return __call_usermodehelper_exec(&khelper_worker, info, wait | UMH_KILLABLE);
 
 free_module_name:
 	kfree(module_name);
@@ -113,10 +129,10 @@ static int call_modprobe(char *module_name, int wait)
 }
 
 /**
- * __request_module - try to load a kernel module
+ * ___request_module - try to load a kernel module
  * @wait: wait (or not) for the operation to complete
- * @fmt: printf style format string for the name of the module
- * @...: arguments as specified in the format string
+ * @blacklist: tell modprobe to honour the module blacklist
+ * @module_name: name of the requested module
  *
  * Load a module using the user mode module loader. The function returns
  * zero on success or a negative errno code on failure. Note that a
@@ -127,10 +143,8 @@ static int call_modprobe(char *module_name, int wait)
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
  */
-int __request_module(bool wait, const char *fmt, ...)
+static int ___request_module(bool wait, bool blacklist, char *module_name)
 {
-	va_list args;
-	char module_name[MODULE_NAME_LEN];
 	unsigned int max_modprobes;
 	int ret;
 	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
@@ -145,12 +159,6 @@ int __request_module(bool wait, const char *fmt, ...)
 	 */
 	WARN_ON_ONCE(wait && current_is_async());
 
-	va_start(args, fmt);
-	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
-	va_end(args);
-	if (ret >= MODULE_NAME_LEN)
-		return -ENAMETOOLONG;
-
 	ret = security_kernel_module_request(module_name);
 	if (ret)
 		return ret;
@@ -183,11 +191,319 @@ int __request_module(bool wait, const char *fmt, ...)
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
-	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, blacklist);
 
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
+
+#ifdef CONFIG_VE_IPTABLES
+
+/* ve0 allowed iptables modules */
+static struct {
+	const char *name;
+	u64 perm;
+} ve0_ipt_am[] = {
+	{ "ip_tables",		VE_IP_IPTABLES	},
+	{ "ip6_tables",		VE_IP_IPTABLES6	},
+	{ "iptable_filter",	VE_IP_FILTER	},
+	{ "iptable_raw",	VE_IP_IPTABLES	},
+	{ "iptable_nat",	VE_IP_NAT	},
+	{ "iptable_mangle",	VE_IP_MANGLE	},
+	{ "ip6table_filter",	VE_IP_FILTER6	},
+	{ "ip6table_nat",	VE_IP_NAT	},
+	{ "ip6table_mangle",	VE_IP_MANGLE6	},
+	{ "ip6table_raw",	VE_IP_IPTABLES6	},
+
+	{ "xt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_NOTRACK",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_cluster",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "xt_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+
+	{ "ipt_CLUSTERIP",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ipt_MASQUERADE",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_NETMAP",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_REDIRECT",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ipt_SYNPROXY",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+
+	{ "ip6t_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_helper",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_socket",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ip6t_MASQUERADE",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT|VE_IP_IPTABLES6	},
+	{ "ip6t_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ip6t_SYNPROXY",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+
+	{ "nf-nat-ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf-nat",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf_conntrack-2",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack-10",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv6",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+
+	{ "nft-set",		VE_IP_IPTABLES			},
+	{ "nft-afinfo-2",	VE_IP_IPTABLES			}, /* IPV4 */
+	{ "nft-afinfo-3",	VE_IP_IPTABLES			}, /* ARP  */
+	{ "nft-afinfo-10",	VE_IP_IPTABLES6			}, /* IPV6 */
+
+	{ "nft-chain-2-nat",	VE_IP_IPTABLES|VE_IP_NAT	},
+	{ "nft-chain-2-route",	VE_IP_IPTABLES			},
+
+	{ "nft-chain-10-nat",	VE_IP_IPTABLES6|VE_IP_NAT	},
+	{ "nft-chain-10-route",	VE_IP_IPTABLES6		},
+
+	{ "nft-expr-2-reject",	VE_IP_IPTABLES			},
+	{ "nft-expr-10-reject",	VE_IP_IPTABLES6			},
+	{ "nf-logger-2-0",	VE_IP_IPTABLES			},
+	{ "nf-logger-10-0",	VE_IP_IPTABLES6			},
+};
+
+/*
+ *  Check if a module named nft-expr-<name> is allowed.
+ *  Only the tail part of the name is passed to this function.
+ */
+static bool nft_expr_allowed(const char *name)
+{
+	u64 permitted = get_exec_env()->ipt_mask;
+
+	if (!name[0])
+		return false;
+
+	if (!strcmp(name, "ct"))
+		return mask_ipt_allow(permitted, VE_IP_CONNTRACK);
+
+	if (!strcmp(name, "nat"))
+		return mask_ipt_allow(permitted, VE_IP_NAT);
+
+	/*
+	 * We are interested in modules like nft-expr-xxx.
+	 * Expressions like nft-expr-xxx-yyy are currently
+	 * handled in the ve0_ipt_am table, so the expression
+	 * name here does not contain a minus sign.
+	 */
+	if (!strchr(name, '-'))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) ||
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+	return false;
+}
+
+/*
+ * module_payload_iptable_allowed - check if iptables functionality is allowed
+ *			    to be used inside the current virtual environment.
+ *
+ * Returns:
+ *   0 if the iptables module is not allowed to load
+ *   1 if it is allowed or we're in ve0
+ *   -1 if the module isn't an iptables module
+ */
+static inline int module_payload_iptable_allowed(const char *module)
+{
+	u64 permitted = get_exec_env()->ipt_mask;
+	int i;
+
+	/* Look for full module name in ve0_ipt_am table */
+	for (i = 0; i < ARRAY_SIZE(ve0_ipt_am); i++) {
+		if (!strcmp(ve0_ipt_am[i].name, module))
+			return mask_ipt_allow(permitted, ve0_ipt_am[i].perm);
+	}
+
+	/* The rest of xt_* modules is allowed in both ipv4 and ipv6 modes */
+	if (!strncmp("xt_", module, 3))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) ||
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of ipt_* modules */
+	if (!strncmp("ipt_", module, 4))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES);
+
+	/* The rest of ip6t_* modules */
+	if (!strncmp("ip6t_", module, 5))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of arpt_* modules */
+	if (!strncmp("arpt_", module, 5))
+		return 1;
+
+	/* The rest of ebt* modules */
+	if (!strncmp("ebt", module, 3))
+		return 1;
+
+	/* The rest of nft- modules */
+	if (!strncmp("nft-expr-", module, 9))
+		return nft_expr_allowed(module + 9);
+
+	return -1;
+}
+
+/* ve0 allowed modules */
+static const char * const ve0_allowed_mod[] = {
+	"fs-binfmt_misc",
+	"fs-overlay",
+
+	/* inet_diag, inet6_diag  */
+	"net-pf-16-proto-4-type-2",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET */
+	"net-pf-16-proto-4-type-10",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET6 */
+
+	/* tcp_diag */
+	"net-pf-16-proto-4-type-2-6",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_TCP */
+
+	/* udp_diag */
+	"net-pf-16-proto-4-type-2-17",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDP */
+	"net-pf-16-proto-4-type-2-136",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDPLITE */
+
+	/* nfnetlink  */
+	"net-pf-16-proto-12",		/* PF_NETLINK, NETLINK_NETFILTER */
+	"nfnetlink-subsys-1",		/* NFNL_SUBSYS_CTNETLINK */
+	"nfnetlink-subsys-2",		/* NFNL_SUBSYS_CTNETLINK_EXP */
+
+	/* unix_diag */
+	"net-pf-16-proto-4-type-1",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */
+
+	/* af_packet_diag */
+	"net-pf-16-proto-4-type-17",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_PACKET */
+
+	/* netlink_diag */
+	"net-pf-16-proto-4-type-16",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */
+
+	/* ip_set */
+	"nfnetlink-subsys-6",		/* NFNL_SUBSYS_IPSET */
+	"ip_set_bitmap:ip",
+	"ip_set_bitmap:ip,mac",
+	"ip_set_bitmap:port",
+	"ip_set_hash:ip",
+	"ip_set_hash:ip,port",
+	"ip_set_hash:ip,port,ip",
+	"ip_set_hash:net",
+	"ip_set_hash:net,port",
+	"ip_set_hash:ip,port,net",
+	"ip_set_hash:net,iface",
+	"ip_set_list:set",
+
+	"rtnl-link-dummy",
+	"rtnl-link-vxlan",
+
+	/* NFS */
+	"nfsv3",
+	"nfsv4",
+
+	/* IPVS */
+	"ip_vs_ftp",
+	"ip_vs_nq",
+	"ip_vs_wlc",
+	"ip6t_ipvs",
+	"ipt_ipvs",
+	"ip_vs_rr",
+	"ip_vs_pe_sip",
+	"ip_vs_lblc",
+	"ip_vs_wrr",
+	"ip_vs_sed",
+	"ip_vs_dh",
+	"ip_vs_sh",
+	"ip_vs_lblcr",
+	"ip_vs_lc",
+};
+
+/*
+ * module_payload_allowed - check if module functionality is allowed
+ *			    to be used inside the current virtual environment.
+ *
+ * Returns true if it is allowed or we're in ve0, false otherwise.
+ */
+bool module_payload_allowed(const char *module)
+{
+	int i;
+	int ret;
+
+	if (ve_is_super(get_exec_env()))
+		return true;
+
+	ret = module_payload_iptable_allowed(module);
+	if (ret >= 0)
+		return !!ret;
+
+	for (i = 0; i < ARRAY_SIZE(ve0_allowed_mod); i++) {
+		if (!strcmp(ve0_allowed_mod[i], module))
+			return true;
+	}
+
+	return false;
+}
+
+#endif
+
+int __request_module(bool wait, const char *fmt, ...)
+{
+	char module_name[MODULE_NAME_LEN];
+	bool blacklist;
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+	va_end(args);
+
+	if (ret >= MODULE_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	/* Check that autoload is not prohibited via the /proc interface */
+	if (!ve_is_super(get_exec_env()) &&
+	    !ve_allow_module_load)
+		return -EPERM;
+
+	/* Check that module functionality is permitted */
+	if (!module_payload_allowed(module_name))
+		return -EPERM;
+
+	/*
+	 * This function may be called from ve0, where the standard behaviour
+	 * is not to use the blacklist. So we request blacklist handling only
+	 * if we're inside a CT.
+	 */
+	blacklist = !ve_is_super(get_exec_env());
+
+	return ___request_module(wait, blacklist, module_name);
+}
 EXPORT_SYMBOL(__request_module);
 #endif /* CONFIG_MODULES */
 
@@ -311,7 +627,7 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(struct work_struct *work)
+static void __call_usermodehelper(struct kthread_work *work)
 {
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
@@ -533,7 +849,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	if (!sub_info)
 		goto out;
 
-	INIT_WORK(&sub_info->work, __call_usermodehelper);
+	init_kthread_work(&sub_info->work, __call_usermodehelper);
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
@@ -558,7 +874,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
  */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+static int __call_usermodehelper_exec(struct kthread_worker *worker,
+		struct subprocess_info *sub_info, int wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
@@ -590,7 +907,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	sub_info->complete = &done;
 	sub_info->wait = wait;
 
-	queue_work(system_unbound_wq, &sub_info->work);
+	queue_kthread_work(worker, &sub_info->work);
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
 
@@ -614,6 +931,14 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	helper_unlock();
 	return retval;
 }
+
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	return __call_usermodehelper_exec(&khelper_worker, sub_info, wait);
+}
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
 /**
@@ -630,18 +955,76 @@ EXPORT_SYMBOL(call_usermodehelper_exec);
  * call_usermodehelper_exec().
  */
 int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+	return call_usermodehelper_by(&khelper_worker, path, argv, envp,
+			wait, NULL, NULL, NULL);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+#ifdef CONFIG_VE
+int call_usermodehelper_fns_ve(struct ve_struct *ve,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data)
+{
+	int err;
+	struct kthread_worker *khelper;
+
+	ve = get_ve(ve);
+	if (!ve)
+		return -EFAULT;
+
+	khelper = ve_is_super(ve) ? &khelper_worker : &ve->ve_umh_worker;
+
+	if (ve_is_super(ve) || (get_exec_env() == ve)) {
+		err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
+					     cleanup, data);
+		goto out_put;
+	}
+
+	if (wait > UMH_WAIT_EXEC) {
+		printk(KERN_ERR "VE#%s: Sleeping call for containers UMH is "
+				"not supported\n", ve->ve_name);
+		err = -EINVAL;
+		goto out_put;
+	}
+
+	down_read(&ve->op_sem);
+	err = -EPIPE;
+	if (!ve->is_running)
+		goto out;
+
+	err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
+				     cleanup, data);
+
+out:
+	up_read(&ve->op_sem);
+out_put:
+	put_ve(ve);
+	return err;
+}
+EXPORT_SYMBOL(call_usermodehelper_fns_ve);
+#endif
+
+int call_usermodehelper_by(struct kthread_worker *worker,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data)
 {
 	struct subprocess_info *info;
 	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
 
+	if (worker == &khelper_worker && !ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
-					 NULL, NULL, NULL);
+					 init, cleanup, data);
 	if (info == NULL)
 		return -ENOMEM;
 
-	return call_usermodehelper_exec(info, wait);
+	return __call_usermodehelper_exec(worker, info, wait);
 }
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_by);
 
 static int proc_cap_handler(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -720,3 +1103,11 @@ struct ctl_table usermodehelper_table[] = {
 	},
 	{ }
 };
+
+void __init usermodehelper_init(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(kthread_worker_fn, &khelper_worker, "khelper");
+	BUG_ON(IS_ERR(t));
+}
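
An editorial sketch (not part of the patch) of how an in-kernel caller might use the per-VE entry point call_usermodehelper_fns_ve() added in this file. The helper path, its arguments and the "VE event" notion are made up for illustration, and the include that carries the declaration is assumed to be <linux/kmod.h>; as enforced above, only non-sleeping waits (UMH_WAIT_EXEC or lower) are accepted when the target VE is not the caller's.

#include <linux/kmod.h>
#include <linux/ve.h>

/* Hypothetical example: spawn a helper inside the given container. */
static int notify_ve_event(struct ve_struct *ve, char *event)
{
	char *argv[] = { "/sbin/ve-event-helper", event, NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/* Run the helper inside 've'; do not wait for it to finish. */
	return call_usermodehelper_fns_ve(ve, argv[0], argv, envp,
					  UMH_WAIT_EXEC, NULL, NULL, NULL);
}
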
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/capability.h>
 #include <linux/compiler.h>
+#include <linux/ve.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -31,7 +32,7 @@ static struct kobj_attribute _name##_attr = \
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+	return sprintf(buf, "%llu\n", (unsigned long long)ve_uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -24,20 +24,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-struct kthread_create_info
-{
-	/* Information passed to kthread() from kthreadd. */
-	int (*threadfn)(void *data);
-	void *data;
-	int node;
-
-	/* Result passed back to kthread_create() from kthreadd. */
-	struct task_struct *result;
-	struct completion done;
-
-	struct list_head list;
-};
-
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
@@ -215,7 +201,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
 	return NUMA_NO_NODE;
 }
 
-static void create_kthread(struct kthread_create_info *create)
+void create_kthread(struct kthread_create_info *create)
 {
 	int pid;
 
@@ -229,6 +215,16 @@ static void create_kthread(struct kthread_create_info *create)
 		complete(&create->done);
 	}
 }
+EXPORT_SYMBOL(create_kthread);
+
+static void kthread_add_to_kthreadd(void *data, struct kthread_create_info *create)
+{
+	spin_lock(&kthread_create_lock);
+	list_add_tail(&create->list, &kthread_create_list);
+	spin_unlock(&kthread_create_lock);
+	wake_up_process(kthreadd_task);
+	wait_for_completion(&create->done);
+}
 
 /**
  * kthread_create_on_node - create a kthread.
@@ -252,10 +248,13 @@ static void create_kthread(struct kthread_create_info *create)
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data, int node,
-					   const char namefmt[],
-					   ...)
+struct task_struct *__kthread_create_on_node(
+		void (*addfn)(void *data, struct kthread_create_info *create),
+		void *add_data,
+		int (*threadfn)(void *data),
+		void *data, int node,
+		const char namefmt[],
+		...)
 {
 	struct kthread_create_info create;
 
@@ -264,12 +263,10 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	create.node = node;
 	init_completion(&create.done);
 
-	spin_lock(&kthread_create_lock);
-	list_add_tail(&create.list, &kthread_create_list);
-	spin_unlock(&kthread_create_lock);
+	if (addfn == NULL)
+		addfn = kthread_add_to_kthreadd;
 
-	wake_up_process(kthreadd_task);
-	wait_for_completion(&create.done);
+	addfn(add_data, &create);
 
 	if (!IS_ERR(create.result)) {
 		static const struct sched_param param = { .sched_priority = 0 };
@@ -288,7 +285,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	return create.result;
 }
-EXPORT_SYMBOL(kthread_create_on_node);
+EXPORT_SYMBOL(__kthread_create_on_node);
 
 static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
 {
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,108 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * Because modules have two address ranges: init and core, we need two
+ * latch_tree_nodes entries. Therefore we need the back-pointer from
+ * mod_tree_node.
+ *
+ * Because init ranges are short lived we mark them unlikely and have placed
+ * them outside the critical cacheline in struct module.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
+
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->module_init;
+
+	return (unsigned long)mod->module_core;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
+
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->init_size;
+
+	return (unsigned long)mod->core_size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+	return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long start, end;
+
+	start = __mod_tree_val(n);
+	if (val < start)
+		return -1;
+
+	end = start + __mod_tree_size(n);
+	if (val >= end)
+		return 1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+	.less = mod_tree_less,
+	.comp = mod_tree_comp,
+};
+
+static struct latch_tree_root mod_tree __cacheline_aligned;
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+	mod->mtn_core.mod = mod;
+	mod->mtn_init.mod = mod;
+
+	latch_tree_insert(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+	if (mod->init_size)
+		latch_tree_insert(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	if (mod->init_size)
+		latch_tree_erase(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	latch_tree_erase(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+	mod_tree_remove_init(mod);
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+	struct latch_tree_node *ltn;
+
+	ltn = latch_tree_find((void *)addr, &mod_tree, &mod_tree_ops);
+	if (!ltn)
+		return NULL;
+
+	return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
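In practice the latched tree makes __module_address() a lock-free range lookup instead of an RCU list walk, so it stays cheap even on hot paths such as backtrace generation. A usage sketch (the helper name is illustrative; it assumes the usual convention that callers of __module_address() run with preemption disabled as the RCU-sched read side):

    static bool example_addr_to_module_name(unsigned long addr,
                                            char *name, size_t len)
    {
            struct module *mod;
            bool found = false;

            preempt_disable();              /* RCU-sched read-side section */
            mod = __module_address(addr);   /* latched-tree lookup, see below */
            if (mod) {
                    strlcpy(name, mod->name, len);
                    found = true;
            }
            preempt_enable();

            return found;
    }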
@@ -715,8 +817,6 @@ static int module_unload_init(struct module *mod)
 
 	/* Hold reference count during initialization. */
 	__this_cpu_write(mod->refptr->incs, 1);
-	/* Backwards compatibility macros put refcount during init. */
-	mod->waiter = current;
 
 	return 0;
 }
@@ -842,16 +942,9 @@ static int __try_stop_module(void *_sref)
 
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-	if (flags & O_NONBLOCK) {
-		struct stopref sref = { mod, flags, forced };
+	struct stopref sref = { mod, flags, forced };
 
-		return stop_machine(__try_stop_module, &sref, NULL);
-	} else {
-		/* We don't need to stop the machine for this. */
-		mod->state = MODULE_STATE_GOING;
-		synchronize_sched();
-		return 0;
-	}
+	return stop_machine(__try_stop_module, &sref, NULL);
 }
 
 unsigned long module_refcount(struct module *mod)
@@ -884,21 +977,6 @@ EXPORT_SYMBOL(module_refcount);
 /* This exists whether we can unload or not */
 static void free_module(struct module *mod);
 
-static void wait_for_zero_refcount(struct module *mod)
-{
-	/* Since we might sleep for some time, release the mutex first */
-	mutex_unlock(&module_mutex);
-	for (;;) {
-		pr_debug("Looking at refcount...\n");
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (module_refcount(mod) == 0)
-			break;
-		schedule();
-	}
-	current->state = TASK_RUNNING;
-	mutex_lock(&module_mutex);
-}
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
@@ -932,8 +1010,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
 	/* Doing init or already dying? */
 	if (mod->state != MODULE_STATE_LIVE) {
-		/* FIXME: if (force), slam module count and wake up
-                   waiter --RR */
+		/* FIXME: if (force), slam module count damn the torpedoes */
 		pr_debug("%s already dying\n", mod->name);
 		ret = -EBUSY;
 		goto out;
@@ -949,18 +1026,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		}
 	}
 
-	/* Set this up before setting mod->state */
-	mod->waiter = current;
-
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
 	if (ret != 0)
 		goto out;
 
-	/* Never wait if forced. */
-	if (!forced && module_refcount(mod) != 0)
-		wait_for_zero_refcount(mod);
-
 	mutex_unlock(&module_mutex);
 	/* Final destruction now no one is using it. */
 	if (mod->exit != NULL)
@@ -983,8 +1053,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	struct module_use *use;
 	int printed_something = 0;
+	bool in_container = !ve_is_super(get_exec_env());
 
-	seq_printf(m, " %lu ", module_refcount(mod));
+	seq_printf(m, " %lu ", in_container ? 1 : module_refcount(mod));
 
 	/* Always include a trailing , so userspace can differentiate
            between this and the old multi-field proc format. */
@@ -1078,9 +1149,6 @@ void module_put(struct module *module)
 		__this_cpu_inc(module->refptr->decs);
 
 		trace_module_put(module, _RET_IP_);
-		/* Maybe they're waiting for us to drop reference? */
-		if (unlikely(!module_is_live(module)))
-			wake_up_process(module->waiter);
 		preempt_enable();
 	}
 }
@@ -1713,6 +1781,49 @@ static int mod_sysfs_init(struct module *mod)
 	return err;
 }
 
+#ifdef CONFIG_VE
+
+static ssize_t module_sysfs_perm_set_ve(struct module *mod, char *subdir, int mask)
+{
+	static char path[PATH_MAX];
+
+	if (snprintf(path, sizeof(path) - 1, "module/%s/%s",
+		     mod->name, (subdir) ? subdir : "") >= sizeof(path) - 1)
+		return -E2BIG;
+
+	return sysfs_perms_set(path, NULL, mask);
+}
+
+static ssize_t module_sysfs_hide_dir_ve(struct module *mod, char *subdir)
+{
+	return module_sysfs_perm_set_ve(mod, subdir, -1);
+}
+
+static ssize_t module_sysfs_expose_dir_ve(struct module *mod, char *subdir)
+{
+	return module_sysfs_perm_set_ve(mod, subdir, MAY_READ | MAY_EXEC);
+}
+
+static int module_sysfs_ve_init(struct module *mod)
+{
+	int err;
+
+	err = module_sysfs_expose_dir_ve(mod, NULL);
+	if (!err)
+		err = module_sysfs_expose_dir_ve(mod, "holders");
+	return err;
+}
+
+static void module_sysfs_ve_fini(struct module *mod)
+{
+	(void) module_sysfs_hide_dir_ve(mod, "holders");
+	(void) module_sysfs_hide_dir_ve(mod, NULL);
+}
+#else
+static __always_inline int module_sysfs_ve_init(struct module *mod) { return 0; }
+static __always_inline void module_sysfs_ve_fini(struct module *mod) { }
+#endif
+
 static int mod_sysfs_setup(struct module *mod,
 			   const struct load_info *info,
 			   struct kernel_param *kparam,
@@ -1743,6 +1854,7 @@ static int mod_sysfs_setup(struct module *mod,
 	add_notes_attrs(mod, info);
 
 	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+	(void) module_sysfs_ve_init(mod);
 	return 0;
 
 out_unreg_param:
@@ -1757,6 +1869,7 @@ static int mod_sysfs_setup(struct module *mod,
 
 static void mod_sysfs_fini(struct module *mod)
 {
+	module_sysfs_ve_fini(mod);
 	remove_notes_attrs(mod);
 	remove_sect_attrs(mod);
 	kobject_put(&mod->mkobj.kobj);
@@ -1963,12 +2076,14 @@ static void free_module(struct module *mod)
 	/* Now we can delete it from the lists */
 	mutex_lock(&module_mutex);
 	stop_machine(__unlink_module, mod, NULL);
+	mod_tree_remove(mod);
 	mutex_unlock(&module_mutex);
 
 	mutex_lock(&module_ext_mutex);
 	mod_ext = find_module_ext(mod);
 	list_del(&mod_ext->next);
 	mutex_unlock(&module_ext_mutex);
+	kfree(mod_ext);
 
 	/* This may be NULL, but that's OK */
 	unset_module_init_ro_nx(mod);
@@ -2334,7 +2449,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
 	}
 	if (sym->st_shndx == SHN_UNDEF)
 		return 'U';
-	if (sym->st_shndx == SHN_ABS)
+	if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
 		return 'a';
 	if (sym->st_shndx >= SHN_LORESERVE)
 		return '?';
@@ -2363,7 +2478,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
 }
 
 static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
-                           unsigned int shnum)
+			   unsigned int shnum, unsigned int pcpundx)
 {
 	const Elf_Shdr *sec;
 
@@ -2372,6 +2487,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
 	    || !src->st_name)
 		return false;
 
+#ifdef CONFIG_KALLSYMS_ALL
+	if (src->st_shndx == pcpundx)
+		return true;
+#endif
+
 	sec = sechdrs + src->st_shndx;
 	if (!(sec->sh_flags & SHF_ALLOC)
 #ifndef CONFIG_KALLSYMS_ALL
@@ -2409,7 +2529,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	/* Compute total space required for the core symbols' strtab. */
 	for (ndst = i = 0; i < nsrc; i++) {
 		if (i == 0 ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
 			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
 			ndst++;
 		}
@@ -2449,7 +2570,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 	src = mod->symtab;
 	for (ndst = i = 0; i < mod->num_symtab; i++) {
 		if (i == 0 ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
 			dst[ndst] = src[i];
 			dst[ndst++].st_name = s - mod->core_strtab;
 			s += strlcpy(s, &mod->strtab[src[i].st_name],
@@ -3255,6 +3377,7 @@ static int do_init_module(struct module *mod)
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
@@ -3304,6 +3427,7 @@ static int add_unformed_module(struct module *mod)
 		goto out;
 	}
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3512,12 +3636,14 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	mutex_lock(&module_ext_mutex);
 	list_del(&mod_ext->next);
 	mutex_unlock(&module_ext_mutex);
+	kfree(mod_ext);
  free_unload:
 	module_unload_free(mod);
  unlink_mod:
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
 	mutex_unlock(&module_mutex);
  free_module:
@@ -3850,6 +3976,7 @@ static void m_stop(struct seq_file *m, void *p)
 static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
+	bool in_container = !ve_is_super(get_exec_env());
 	char buf[MODULE_FLAGS_BUF_SIZE];
 
 	/* We always ignore unformed modules. */
@@ -3857,7 +3984,7 @@ static int m_show(struct seq_file *m, void *p)
 		return 0;
 
 	seq_printf(m, "%s %u",
-		   mod->name, mod->init_size + mod->core_size);
+		   mod->name, in_container ? 4242 : mod->init_size + mod->core_size);
 	print_unload_info(m, mod);
 
 	/* Informative for users. */
@@ -3869,7 +3996,7 @@ static int m_show(struct seq_file *m, void *p)
 	seq_printf(m, " 0x%pK", mod->module_core);
 
 	/* Taints info */
-	if (mod->taints)
+	if (mod->taints && !in_container)
 		seq_printf(m, " %s", module_flags(mod, buf));
 
 	seq_printf(m, "\n");
@@ -3902,7 +4029,7 @@ static const struct file_operations proc_modules_operations = {
 
 static int __init proc_modules_init(void)
 {
-	proc_create("modules", 0, NULL, &proc_modules_operations);
+	proc_create("modules", S_ISVTX, NULL, &proc_modules_operations);
 	return 0;
 }
 module_init(proc_modules_init);
@@ -3966,14 +4093,14 @@ struct module *__module_address(unsigned long addr)
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	mod = mod_tree_find(addr);
+	if (mod) {
+		BUG_ON(!(within_module_core(addr, mod)
+		    || within_module_init(addr, mod)));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module_core(addr, mod)
-		    || within_module_init(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -165,6 +165,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	put_nsproxy(old_ns);
 	return err;
 }
+EXPORT_SYMBOL(copy_namespaces);
 
 void free_nsproxy(struct nsproxy *ns)
 {
@@ -179,6 +180,7 @@ void free_nsproxy(struct nsproxy *ns)
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
+EXPORT_SYMBOL(free_nsproxy);
 
 /*
  * Called from unshare. Unshare all the namespaces part of nsproxy.
@@ -223,6 +225,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 	if (ns && atomic_dec_and_test(&ns->count))
 		free_nsproxy(ns);
 }
+EXPORT_SYMBOL_GPL(switch_task_namespaces);
 
 void exit_task_namespaces(struct task_struct *p)
 {
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -387,6 +387,12 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
 		printk(KERN_WARNING
 		       "Disabling lock debugging due to kernel taint\n");
 
+	/* Do not confuse people with call traces on proprietary modules */
+	if (flag != TAINT_PROPRIETARY_MODULE && flag != TAINT_OOT_MODULE &&
+	    flag != TAINT_UNSIGNED_MODULE) {
+		printk(KERN_WARNING "Tainting kernel with flag 0x%x\n", flag);
+		dump_stack();
+	}
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
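The practical effect is that an unexpected taint can be traced to its source from the log alone, while the common module-related taints stay quiet. A hypothetical caller:

    static void example_report_firmware_bug(void)
    {
            /* logs "Tainting kernel with flag 0x..." plus a backtrace */
            add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
    }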
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -45,8 +45,6 @@ static struct hlist_head *pid_hash;
 static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
-int pid_max = PID_MAX_DEFAULT;
-
 #define RESERVED_PIDS		300
 
 int pid_max_min = RESERVED_PIDS + 1;
@@ -153,7 +151,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	struct pidmap *map;
 
 	pid = last + 1;
-	if (pid >= pid_max)
+	if (pid >= pid_ns->pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
@@ -162,7 +160,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	 * want to scan this bitmap block twice, the second time
 	 * we start with offset == 0 (or RESERVED_PIDS).
 	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
+	max_scan = DIV_ROUND_UP(pid_ns->pid_max, BITS_PER_PAGE) - !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
 			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -191,11 +189,11 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 				if (offset >= BITS_PER_PAGE)
 					break;
 				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
+				if (pid >= pid_ns->pid_max)
 					break;
 			}
 		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pid_ns->pidmap[(pid_ns->pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
@@ -229,6 +227,7 @@ int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
 	}
 	return -1;
 }
+EXPORT_SYMBOL(next_pidmap);
 
 void put_pid(struct pid *pid)
 {
@@ -262,7 +261,9 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
+
 		hlist_del_rcu(&upid->pid_chain);
+
 		switch(--ns->nr_hashed) {
 		case 2:
 		case 1:
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
+		case PIDNS_HASH_ADDING:
+			/* Handle a fork failure of the first process */
+			WARN_ON(ns->child_reaper);
+			ns->nr_hashed = 0;
+			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
 			break;
@@ -310,8 +316,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
+		if (pid_ns_prepare_proc(ns)) {
+			disable_pid_allocation(ns);
 			goto out_free;
+		}
 	}
 
 	get_pid_ns(ns);
@@ -436,6 +444,51 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 }
 EXPORT_SYMBOL(pid_task);
 
+int change_active_pid_ns(struct task_struct *task, struct pid_namespace *ns)
+{
+	struct pid *pid = task_pid(task);
+	struct upid *upid = pid->numbers + pid->level;
+	int nr, err;
+
+	if (upid->ns != ns->parent || upid->ns->pid_cachep != ns->pid_cachep)
+		return -EINVAL;
+
+	nr = alloc_pidmap(ns);
+	if (nr < 0)
+		return -ENOMEM;
+
+	get_pid_ns(ns);
+	put_pid_ns(upid->ns);
+
+	upid++;
+	upid->nr = nr;
+	upid->ns = ns;
+	smp_wmb();
+	pid->level++;
+
+	if (is_child_reaper(pid)) {
+		err = pid_ns_prepare_proc(ns);
+		if (err)
+			goto undo;
+		ns->child_reaper = task;
+	}
+
+	spin_lock_irq(&pidmap_lock);
+	hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(nr, ns)]);
+	ns->nr_hashed++;
+	spin_unlock_irq(&pidmap_lock);
+
+	return 0;
+
+undo:
+	pid->level--;
+	free_pidmap(upid);
+	get_pid_ns(ns->parent);
+	put_pid_ns(ns);
+	return err;
+}
+EXPORT_SYMBOL_GPL(change_active_pid_ns);
+
 /*
  * Must be called under rcu_read_lock().
  */
@@ -458,7 +511,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 	rcu_read_lock();
 	if (type != PIDTYPE_PID)
 		task = task->group_leader;
-	pid = get_pid(task->pids[type].pid);
+	pid = get_pid(rcu_dereference(task->pids[type].pid));
 	rcu_read_unlock();
 	return pid;
 }
@@ -519,7 +572,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	if (likely(pid_alive(task))) {
 		if (type != PIDTYPE_PID)
 			task = task->group_leader;
-		nr = pid_nr_ns(task->pids[type].pid, ns);
+		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
 	}
 	rcu_read_unlock();
 
@@ -527,6 +580,18 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 }
 EXPORT_SYMBOL(__task_pid_nr_ns);
 
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t ppid;
+	rcu_read_lock();
+	ppid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+	/* It's a dirty hack: some old utilities don't work if ppid is zero */
+	if (ppid == 0 && ns->child_reaper != tsk)
+		ppid = 1;
+	return ppid;
+}
+
 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return pid_nr_ns(task_tgid(tsk), ns);
@@ -583,11 +648,11 @@ void __init pidmap_init(void)
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	pid_max = min(pid_max_max, max_t(int, pid_max,
+	init_pid_ns.pid_max = min(pid_max_max, max_t(int, PID_MAX_DEFAULT,
 				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
 
 	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	/* Reserve PID 0. We never call free_pidmap(0) */
@@ -596,5 +661,5 @@ void __init pidmap_init(void)
 	init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 }
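change_active_pid_ns() pushes an already running task one level down into a child pid namespace: the target must be a direct child of the task's current namespace (the upid->ns != ns->parent check), and if the task gets PID 1 there it also becomes the namespace's child reaper. A hedged sketch of a caller (the function name is illustrative; the real call sites are in the VE code elsewhere in this patch):

    static int example_enter_ct_pidns(struct task_struct *tsk,
                                      struct pid_namespace *ct_ns)
    {
            int err;

            err = change_active_pid_ns(tsk, ct_ns);
            if (err)
                    pr_err("ve: cannot enter container pid ns: %d\n", err);
            return err;
    }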
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -18,6 +18,9 @@
 #include <linux/proc_ns.h>
 #include <linux/reboot.h>
 #include <linux/export.h>
+#include <linux/module.h>
+#include <linux/ve.h>
+#include <linux/kthread.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -40,6 +43,9 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
 	struct pid_cache *pcache;
 	struct kmem_cache *cachep;
 
+	if (nr_ids <= 2)
+		return init_pid_ns.pid_cachep;
+
 	mutex_lock(&pid_caches_mutex);
 	list_for_each_entry(pcache, &pid_caches_lh, list)
 		if (pcache->nr_ids == nr_ids)
@@ -51,7 +57,7 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
 
 	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
 	cachep = kmem_cache_create(pcache->name,
-			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+			sizeof(struct pid) + (nr_ids - 2) * sizeof(struct upid),
 			0, SLAB_HWCACHE_ALIGN, NULL);
 	if (cachep == NULL)
 		goto err_cachep;
@@ -129,6 +135,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->ucounts = ucounts;
 	ns->nr_hashed = PIDNS_HASH_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
+	ns->pid_max = PID_MAX_NS_DEFAULT;
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -212,6 +219,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
 	spin_unlock_irq(&me->sighand->siglock);
 
+	ve_stop_ns(pid_ns);
+
 	/*
 	 * The last thread in the cgroup-init thread group is terminating.
 	 * Find remaining pid_ts in the namespace, signal and wait for them
@@ -251,7 +260,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * Make sure they all go away, see free_pid().
 	 */
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (pid_ns->nr_hashed == init_pids)
 			break;
 		schedule();
@@ -262,6 +271,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 		current->signal->group_exit_code = pid_ns->reboot;
 
 	acct_exit_ns(pid_ns);
+
+	ve_exit_ns(pid_ns);
+
 	return;
 }
 
@@ -282,6 +294,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 */
 
 	tmp.data = &pid_ns->last_pid;
+	tmp.extra2 = &pid_ns->pid_max;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
@@ -291,10 +304,9 @@ static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
 		.maxlen = sizeof(int),
-		.mode = 0666, /* permissions are checked in the handler */
+		.mode = 0666 | S_ISVTX, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
 		.extra1 = &zero,
-		.extra2 = &pid_max,
 	},
 	{ }
 };
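Since extra2 now points at the namespace's own pid_max instead of the former global pid_max, writes to ns_last_pid are validated per container. A userspace illustration (the value written is only assumed to exceed this namespace's pid_max):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

            if (!f) {
                    perror("ns_last_pid");
                    return 1;
            }
            fprintf(f, "%d\n", 999999);
            if (fclose(f) != 0)
                    perror("write rejected");   /* EINVAL when above this ns's pid_max */
            return 0;
    }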
@@ -343,6 +355,29 @@ static void *pidns_get(struct task_struct *task)
 	return ns;
 }
 
+static void *pidns_for_children_get(struct task_struct *task)
+{
+	struct pid_namespace *ns = NULL;
+
+	task_lock(task);
+	if (task->nsproxy) {
+		ns = task->nsproxy->pid_ns;
+		get_pid_ns(ns);
+	}
+	task_unlock(task);
+
+	if (ns) {
+		qread_lock(&tasklist_lock);
+		if (!ns->child_reaper) {
+			put_pid_ns(ns);
+			ns = NULL;
+		}
+		qread_unlock(&tasklist_lock);
+	}
+
+	return ns;
+}
+
 static void pidns_put(void *ns)
 {
 	put_pid_ns(ns);
@@ -394,6 +429,16 @@ const struct proc_ns_operations pidns_operations = {
 	.inum		= pidns_inum,
 };
 
+const struct proc_ns_operations pidns_for_children_operations = {
+	.name		= "pid_for_children",
+	.real_ns_name	= "pid",
+	.type		= CLONE_NEWPID,
+	.get		= pidns_for_children_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+	.inum		= pidns_inum,
+};
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
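The new pid_for_children entry lets userspace see which namespace future children will be created in; judging by the get op above, the link presumably reads as an error until the namespace has a child reaper. A userspace sketch:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char link[64];
            ssize_t n;

            if (unshare(CLONE_NEWPID))      /* children, not the caller, enter the new ns */
                    return 1;

            n = readlink("/proc/self/ns/pid_for_children", link, sizeof(link) - 1);
            if (n < 0) {
                    perror("pid_for_children");     /* no child reaper yet */
                    return 1;
            }
            link[n] = '\0';
            printf("children will start in %s\n", link);
            return 0;
    }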
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -12,6 +12,7 @@
 #include <linux/random.h>
 #include <linux/tick.h>
 #include <linux/workqueue.h>
+#include <linux/module.h>
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,9 @@
 #include <linux/workqueue.h>
 #include <linux/export.h>
 #include <linux/hashtable.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include "time/timekeeping.h"
 
@@ -126,6 +129,39 @@ static DEFINE_SPINLOCK(hash_lock);
 
 static struct k_clock posix_clocks[MAX_CLOCKS];
 
+#define clock_is_monotonic(which_clock) \
+	((which_clock) == CLOCK_MONOTONIC || \
+	 (which_clock) == CLOCK_MONOTONIC_RAW || \
+	 (which_clock) == CLOCK_MONOTONIC_COARSE)
+
+#ifdef CONFIG_VE
+static struct timespec zero_time;
+
+void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec - ve->start_timespec.tv_sec,
+				tp->tv_nsec - ve->start_timespec.tv_nsec);
+}
+
+void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec + ve->start_timespec.tv_sec,
+				tp->tv_nsec + ve->start_timespec.tv_nsec);
+	if (timespec_compare(tp, &zero_time) <= 0) {
+		tp->tv_sec =  0;
+		tp->tv_nsec = 1;
+	}
+}
+#endif
+
 /*
  * These ones are defined below.
  */
@@ -341,8 +377,8 @@ static __init int init_posix_timers(void)
 	posix_timers_register_clock(CLOCK_TAI, &clock_tai);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
-					sizeof (struct k_itimer), 0, SLAB_PANIC,
-					NULL);
+					sizeof (struct k_itimer), 0,
+					SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	return 0;
 }
 
@@ -416,8 +452,14 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	rcu_read_lock();
 	task = pid_task(timr->it_pid, PIDTYPE_PID);
 	if (task) {
+		struct user_beancounter *ub;
+
+		ub = set_exec_ub(task->task_bc.task_ub);
+
 		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
 		ret = send_sigqueue(timr->sigq, task, shared);
+
+		(void)set_exec_ub(ub);
 	}
 	rcu_read_unlock();
 	/* If we failed to send the signal the timer stops. */
@@ -895,6 +937,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 	if (!timr)
 		return -EINVAL;
 
+	if ((flags & TIMER_ABSTIME) &&
+	    (new_spec.it_value.tv_sec || new_spec.it_value.tv_nsec))
+		monotonic_ve_to_abs(timr->it_clock, &new_spec.it_value);
 	kc = clockid_to_kclock(timr->it_clock);
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
@@ -1028,6 +1073,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 
 	error = kc->clock_get(which_clock, &kernel_tp);
 
+	monotonic_abs_to_ve(which_clock, &kernel_tp);
 	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		error = -EFAULT;
 
@@ -1104,6 +1150,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	if (!timespec_valid(&t))
 		return -EINVAL;
 
+	if (flags & TIMER_ABSTIME)
+		monotonic_ve_to_abs(which_clock, &t);
+
 	return kc->nsleep(which_clock, flags, &t, rmtp);
 }
 
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -45,6 +45,8 @@
 #include <linux/poll.h>
 #include <linux/irq_work.h>
 #include <linux/utsname.h>
+#include <linux/vermagic.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -125,6 +127,7 @@ EXPORT_SYMBOL(console_set_on_cmdline);
 
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
+int console_silence_loglevel;
 
 /*
  * The printk log buffer consists of a chain of concatenated variable
@@ -218,30 +221,6 @@ struct log {
 static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-/* the next printk record to read by syslog(READ) or /proc/kmsg */
-static u64 syslog_seq;
-static u32 syslog_idx;
-static enum log_flags syslog_prev;
-static size_t syslog_partial;
-
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags console_prev;
-
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
-
 #define PREFIX_MAX		32
 #define LOG_LINE_MAX		1024 - PREFIX_MAX
 
@@ -253,8 +232,92 @@ static u32 clear_idx;
 #endif
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
+
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+struct cont {
+	char buf[LOG_LINE_MAX];
+	size_t len;			/* length == 0 means unused buffer */
+	size_t cons;			/* bytes written to console */
+	struct task_struct *owner;	/* task of first print */
+	u64 ts_nsec;			/* time of first print */
+	u8 level;			/* log level of first message */
+	u8 facility;			/* log facility of first message */
+	enum log_flags flags;		/* prefix, newline flags */
+	bool flushed:1;			/* buffer sealed and committed */
+};
+
+static struct log_state {
+	char *buf;
+	u32 buf_len;
+
+	/* the next printk record to read by syslog(READ) or /proc/kmsg */
+	u64 syslog_seq;
+	u32 syslog_idx;
+	enum log_flags syslog_prev;
+	size_t syslog_partial;
+
+	/* index and sequence number of the first record stored in the buffer */
+	u64 first_seq;
+	u32 first_idx;
+
+	/* index and sequence number of the next record to store in the buffer */
+	u64 next_seq;
+	u32 next_idx;
+
+	/* the next printk record to write to the console */
+	u64 console_seq;
+	u32 console_idx;
+	enum log_flags console_prev;
+
+	/* the next printk record to read after the last 'clear' command */
+	u64 clear_seq;
+	u32 clear_idx;
+
+	u64 seen_seq;
+
+	struct cont cont;
+
+	wait_queue_head_t wait;
+} init_log_state = {
+	.buf = __log_buf,
+	.buf_len = __LOG_BUF_LEN,
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_log_state.wait),
+};
+
+/* kdump relies on some log_* symbols, let's make it happy */
+#define DEFINE_STRUCT_MEMBER_ALIAS(name, inst, memb)			\
+static void ____ ## name ## _definition(void) __attribute__((used));	\
+static void ____ ## name ## _definition(void)				\
+{									\
+	asm (".globl " #name "\n\t.set " #name ", " #inst "+%c0"	\
+	     : : "g" (offsetof(typeof(inst), memb)));			\
+}									\
+extern typeof(inst.memb) name;
+DEFINE_STRUCT_MEMBER_ALIAS(log_buf, init_log_state, buf);
+DEFINE_STRUCT_MEMBER_ALIAS(log_buf_len, init_log_state, buf_len);
+DEFINE_STRUCT_MEMBER_ALIAS(log_first_idx, init_log_state, first_idx);
+DEFINE_STRUCT_MEMBER_ALIAS(log_next_idx, init_log_state, next_idx);
+#undef DEFINE_STRUCT_MEMBER_ALIAS
+
+static inline struct log_state *ve_log_state(void)
+{
+	struct log_state *log = &init_log_state;
+#ifdef CONFIG_VE
+	if (get_exec_env()->log_state)
+		log = get_exec_env()->log_state;
+#endif
+	return log;
+}
+
+void log_poll_wait(struct file *filp, poll_table *p)
+{
+	poll_wait(filp, &ve_log_state()->wait, p);
+}
 
 /* Return log buffer address */
 char *log_buf_addr_get(void)
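Hand-expanding the macro for the log_buf case makes the trick easier to see: the dummy function exists only to carry an asm() that defines a global assembler symbol aliasing a member of init_log_state, so crash/kdump tools that resolve "log_buf" from the symbol table keep working even though the plain variable is gone. Roughly:

    static void ____log_buf_definition(void) __attribute__((used));
    static void ____log_buf_definition(void)
    {
            /* ".set log_buf, init_log_state + offsetof(struct log_state, buf)" */
            asm (".globl log_buf\n\t.set log_buf, init_log_state+%c0"
                 : : "g" (offsetof(struct log_state, buf)));
    }
    extern char *log_buf;           /* typeof(init_log_state.buf) */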
@@ -281,23 +344,23 @@ static char *log_dict(const struct log *msg)
 }
 
 /* get record by index; idx must point to valid msg */
-static struct log *log_from_idx(u32 idx)
+static struct log *log_from_idx(struct log_state *log, u32 idx)
 {
-	struct log *msg = (struct log *)(log_buf + idx);
+	struct log *msg = (struct log *)(log->buf + idx);
 
 	/*
 	 * A length == 0 record is the end of buffer marker. Wrap around and
 	 * read the message at the start of the buffer.
 	 */
 	if (!msg->len)
-		return (struct log *)log_buf;
+		return (struct log *)log->buf;
 	return msg;
 }
 
 /* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
+static u32 log_next(struct log_state *log, u32 idx)
 {
-	struct log *msg = (struct log *)(log_buf + idx);
+	struct log *msg = (struct log *)(log->buf + idx);
 
 	/* length == 0 indicates the end of the buffer; wrap */
 	/*
@@ -306,14 +369,15 @@ static u32 log_next(u32 idx)
 	 * return the one after that.
 	 */
 	if (!msg->len) {
-		msg = (struct log *)log_buf;
+		msg = (struct log *)log->buf;
 		return msg->len;
 	}
 	return idx + msg->len;
 }
 
 /* insert record into the buffer, discard old ones, update heads */
-static void log_store(int facility, int level,
+static void log_store(struct log_state *log,
+		      int facility, int level,
 		      enum log_flags flags, u64 ts_nsec,
 		      const char *dict, u16 dict_len,
 		      const char *text, u16 text_len)
@@ -326,34 +390,35 @@ static void log_store(int facility, int level,
 	pad_len = (-size) & (LOG_ALIGN - 1);
 	size += pad_len;
 
-	while (log_first_seq < log_next_seq) {
+	while (log->first_seq < log->next_seq) {
 		u32 free;
 
-		if (log_next_idx > log_first_idx)
-			free = max(log_buf_len - log_next_idx, log_first_idx);
+		if (log->next_idx > log->first_idx)
+			free = max(log->buf_len - log->next_idx,
+				   log->first_idx);
 		else
-			free = log_first_idx - log_next_idx;
+			free = log->first_idx - log->next_idx;
 
 		if (free > size + sizeof(struct log))
 			break;
 
 		/* drop old messages until we have enough contiuous space */
-		log_first_idx = log_next(log_first_idx);
-		log_first_seq++;
+		log->first_idx = log_next(log, log->first_idx);
+		log->first_seq++;
 	}
 
-	if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+	if (log->next_idx + size + sizeof(struct log) >= log->buf_len) {
 		/*
 		 * This message + an additional empty header does not fit
 		 * at the end of the buffer. Add an empty header with len == 0
 		 * to signify a wrap around.
 		 */
-		memset(log_buf + log_next_idx, 0, sizeof(struct log));
-		log_next_idx = 0;
+		memset(log->buf + log->next_idx, 0, sizeof(struct log));
+		log->next_idx = 0;
 	}
 
 	/* fill message */
-	msg = (struct log *)(log_buf + log_next_idx);
+	msg = (struct log *)(log->buf + log->next_idx);
 	memcpy(log_text(msg), text, text_len);
 	msg->text_len = text_len;
 	memcpy(log_dict(msg), dict, dict_len);
@@ -369,8 +434,8 @@ static void log_store(int facility, int level,
 	msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
 
 	/* insert message */
-	log_next_idx += msg->len;
-	log_next_seq++;
+	log->next_idx += msg->len;
+	log->next_seq++;
 }
 
 #ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -401,13 +466,13 @@ static int check_syslog_permissions(int type, bool from_file)
 		return 0;
 
 	if (syslog_action_restricted(type)) {
-		if (capable(CAP_SYSLOG))
+		if (ve_capable(CAP_SYSLOG))
 			return 0;
 		/*
 		 * For historical reasons, accept CAP_SYS_ADMIN too, with
 		 * a warning.
 		 */
-		if (capable(CAP_SYS_ADMIN)) {
+		if (ve_capable(CAP_SYS_ADMIN)) {
 			pr_warn_once("%s (%d): Attempt to access syslog with "
 				     "CAP_SYS_ADMIN but no CAP_SYSLOG "
 				     "(deprecated).\n",
@@ -439,6 +504,9 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
 	size_t len = iov_length(iv, count);
 	ssize_t ret = len;
 
+	if (ve_log_state() != &init_log_state)
+		return count;
+
 	if (len > LOG_LINE_MAX)
 		return -EINVAL;
 	buf = kmalloc(len+1, GFP_KERNEL);
@@ -488,6 +556,7 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	struct log *msg;
 	u64 ts_usec;
@@ -503,7 +572,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 	raw_spin_lock_irq(&logbuf_lock);
-	while (user->seq == log_next_seq) {
+	while (user->seq == log->next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
 			raw_spin_unlock_irq(&logbuf_lock);
@@ -511,23 +580,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		}
 
 		raw_spin_unlock_irq(&logbuf_lock);
-		ret = wait_event_interruptible(log_wait,
-					       user->seq != log_next_seq);
+		ret = wait_event_interruptible(log->wait,
+				user->seq != log->next_seq);
 		if (ret)
 			goto out;
 		raw_spin_lock_irq(&logbuf_lock);
 	}
 
-	if (user->seq < log_first_seq) {
+	if (user->seq < log->first_seq) {
 		/* our last seen message is gone, return error and reset */
-		user->idx = log_first_idx;
-		user->seq = log_first_seq;
+		user->idx = log->first_idx;
+		user->seq = log->first_seq;
 		ret = -EPIPE;
 		raw_spin_unlock_irq(&logbuf_lock);
 		goto out;
 	}
 
-	msg = log_from_idx(user->idx);
+	msg = log_from_idx(log, user->idx);
 	ts_usec = msg->ts_nsec;
 	do_div(ts_usec, 1000);
 
@@ -588,7 +657,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		user->buf[len++] = '\n';
 	}
 
-	user->idx = log_next(user->idx);
+	user->idx = log_next(log, user->idx);
 	user->seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
 
@@ -609,6 +678,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 
 static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	loff_t ret = 0;
 
@@ -621,8 +691,8 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
-		user->idx = log_first_idx;
-		user->seq = log_first_seq;
+		user->idx = log->first_idx;
+		user->seq = log->first_seq;
 		break;
 	case SEEK_DATA:
 		/*
@@ -630,13 +700,13 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
 		 * changes no global state, and does not clear anything.
 		 */
-		user->idx = clear_idx;
-		user->seq = clear_seq;
+		user->idx = log->clear_idx;
+		user->seq = log->clear_seq;
 		break;
 	case SEEK_END:
 		/* after the last record */
-		user->idx = log_next_idx;
-		user->seq = log_next_seq;
+		user->idx = log->next_idx;
+		user->seq = log->next_seq;
 		break;
 	default:
 		ret = -EINVAL;
@@ -647,18 +717,19 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 
 static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	int ret = 0;
 
 	if (!user)
 		return POLLERR|POLLNVAL;
 
-	poll_wait(file, &log_wait, wait);
+	poll_wait(file, &log->wait, wait);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	if (user->seq < log_next_seq) {
+	if (user->seq < log->next_seq) {
 		/* return error when data has vanished underneath us */
-		if (user->seq < log_first_seq)
+		if (user->seq < log->first_seq)
 			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
 		else
 			ret = POLLIN|POLLRDNORM;
@@ -670,6 +741,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 
 static int devkmsg_open(struct inode *inode, struct file *file)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user;
 	int err;
 
@@ -689,8 +761,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	mutex_init(&user->lock);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	user->idx = log_first_idx;
-	user->seq = log_first_seq;
+	user->idx = log->first_idx;
+	user->seq = log->first_seq;
 	raw_spin_unlock_irq(&logbuf_lock);
 
 	file->private_data = user;
@@ -745,6 +817,19 @@ void log_buf_vmcoreinfo_setup(void)
 }
 #endif
 
+static int __init setup_console_silencelevel(char *str)
+{
+	int level;
+
+	if (get_option(&str, &level) != 1)
+		return 0;
+
+	console_silence_loglevel = level;
+	return 1;
+}
+
+__setup("silencelevel=", setup_console_silencelevel);
+
 /* requested log_buf_len from kernel cmdline */
 static unsigned long __initdata new_log_buf_len;
 
@@ -755,7 +840,7 @@ static int __init log_buf_len_setup(char *str)
 
 	if (size)
 		size = roundup_pow_of_two(size);
-	if (size > log_buf_len)
+	if (size > init_log_state.buf_len)
 		new_log_buf_len = size;
 
 	return 0;
@@ -764,6 +849,7 @@ early_param("log_buf_len", log_buf_len_setup);
 
 void __init setup_log_buf(int early)
 {
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	char *new_log_buf;
 	int free;
@@ -789,14 +875,14 @@ void __init setup_log_buf(int early)
 	}
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	log_buf_len = new_log_buf_len;
-	log_buf = new_log_buf;
+	log->buf_len = new_log_buf_len;
+	log->buf = new_log_buf;
 	new_log_buf_len = 0;
-	free = __LOG_BUF_LEN - log_next_idx;
-	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+	free = __LOG_BUF_LEN - log->next_idx;
+	memcpy(log->buf, __log_buf, __LOG_BUF_LEN);
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
-	pr_info("log_buf_len: %d\n", log_buf_len);
+	pr_info("log_buf_len: %d\n", log->buf_len);
 	pr_info("early log buf free: %d(%d%%)\n",
 		free, (free * 100) / __LOG_BUF_LEN);
 }
@@ -976,7 +1062,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
 	return len;
 }
 
-static int syslog_print(char __user *buf, int size)
+static int syslog_print(struct log_state *log, char __user *buf, int size)
 {
 	char *text;
 	struct log *msg;
@@ -991,33 +1077,33 @@ static int syslog_print(char __user *buf, int size)
 		size_t skip;
 
 		raw_spin_lock_irq(&logbuf_lock);
-		if (syslog_seq < log_first_seq) {
+		if (log->syslog_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			syslog_seq = log_first_seq;
-			syslog_idx = log_first_idx;
-			syslog_prev = 0;
-			syslog_partial = 0;
+			log->syslog_seq = log->first_seq;
+			log->syslog_idx = log->first_idx;
+			log->syslog_prev = 0;
+			log->syslog_partial = 0;
 		}
-		if (syslog_seq == log_next_seq) {
+		if (log->syslog_seq == log->next_seq) {
 			raw_spin_unlock_irq(&logbuf_lock);
 			break;
 		}
 
-		skip = syslog_partial;
-		msg = log_from_idx(syslog_idx);
-		n = msg_print_text(msg, syslog_prev, true, text,
+		skip = log->syslog_partial;
+		msg = log_from_idx(log, log->syslog_idx);
+		n = msg_print_text(msg, log->syslog_prev, true, text,
 				   LOG_LINE_MAX + PREFIX_MAX);
-		if (n - syslog_partial <= size) {
+		if (n - log->syslog_partial <= size) {
 			/* message fits into buffer, move forward */
-			syslog_idx = log_next(syslog_idx);
-			syslog_seq++;
-			syslog_prev = msg->flags;
-			n -= syslog_partial;
-			syslog_partial = 0;
+			log->syslog_idx = log_next(log, log->syslog_idx);
+			log->syslog_seq++;
+			log->syslog_prev = msg->flags;
+			n -= log->syslog_partial;
+			log->syslog_partial = 0;
 		} else if (!len){
 			/* partial read(), remember position */
 			n = size;
-			syslog_partial += n;
+			log->syslog_partial += n;
 		} else
 			n = 0;
 		raw_spin_unlock_irq(&logbuf_lock);
@@ -1040,7 +1126,8 @@ static int syslog_print(char __user *buf, int size)
 	return len;
 }
 
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int syslog_print_all(struct log_state *log,
+			    char __user *buf, int size, bool clear)
 {
 	char *text;
 	int len = 0;
@@ -1056,48 +1143,48 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		u32 idx;
 		enum log_flags prev;
 
-		if (clear_seq < log_first_seq) {
+		if (log->clear_seq < log->first_seq) {
 			/* messages are gone, move to first available one */
-			clear_seq = log_first_seq;
-			clear_idx = log_first_idx;
+			log->clear_seq = log->first_seq;
+			log->clear_idx = log->first_idx;
 		}
 
 		/*
 		 * Find first record that fits, including all following records,
 		 * into the user-provided buffer for this dump.
 		 */
-		seq = clear_seq;
-		idx = clear_idx;
+		seq = log->clear_seq;
+		idx = log->clear_idx;
 		prev = 0;
-		while (seq < log_next_seq) {
-			struct log *msg = log_from_idx(idx);
+		while (seq < log->next_seq) {
+			struct log *msg = log_from_idx(log, idx);
 
 			len += msg_print_text(msg, prev, true, NULL, 0);
 			prev = msg->flags;
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 		}
 
 		/* move first record forward until length fits into the buffer */
-		seq = clear_seq;
-		idx = clear_idx;
+		seq = log->clear_seq;
+		idx = log->clear_idx;
 		prev = 0;
-		while (len > size && seq < log_next_seq) {
-			struct log *msg = log_from_idx(idx);
+		while (len > size && seq < log->next_seq) {
+			struct log *msg = log_from_idx(log, idx);
 
 			len -= msg_print_text(msg, prev, true, NULL, 0);
 			prev = msg->flags;
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 		}
 
 		/* last message fitting into this dump */
-		next_seq = log_next_seq;
+		next_seq = log->next_seq;
 
 		len = 0;
 		prev = 0;
 		while (len >= 0 && seq < next_seq) {
-			struct log *msg = log_from_idx(idx);
+			struct log *msg = log_from_idx(log, idx);
 			int textlen;
 
 			textlen = msg_print_text(msg, prev, true, text,
@@ -1106,7 +1193,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				len = textlen;
 				break;
 			}
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 			prev = msg->flags;
 
@@ -1117,18 +1204,18 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				len += textlen;
 			raw_spin_lock_irq(&logbuf_lock);
 
-			if (seq < log_first_seq) {
+			if (seq < log->first_seq) {
 				/* messages are gone, move to next one */
-				seq = log_first_seq;
-				idx = log_first_idx;
+				seq = log->first_seq;
+				idx = log->first_idx;
 				prev = 0;
 			}
 		}
 	}
 
 	if (clear) {
-		clear_seq = log_next_seq;
-		clear_idx = log_next_idx;
+		log->clear_seq = log->next_seq;
+		log->clear_idx = log->next_idx;
 	}
 	raw_spin_unlock_irq(&logbuf_lock);
 
@@ -1138,6 +1225,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 
 int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
+	struct log_state *log = ve_log_state();
 	bool clear = false;
 	static int saved_console_loglevel = LOGLEVEL_DEFAULT;
 	int error;
@@ -1150,6 +1238,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	if (error)
 		return error;
 
+	error = 0;
+	if (log != &init_log_state &&
+	    (type == SYSLOG_ACTION_CONSOLE_OFF ||
+	     type == SYSLOG_ACTION_CONSOLE_ON))
+		goto out;
+
 	switch (type) {
 	case SYSLOG_ACTION_CLOSE:	/* Close log */
 		break;
@@ -1166,11 +1260,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
-		error = wait_event_interruptible(log_wait,
-						 syslog_seq != log_next_seq);
+		error = wait_event_interruptible(log->wait,
+				log->syslog_seq != log->next_seq);
 		if (error)
 			goto out;
-		error = syslog_print(buf, len);
+		error = syslog_print(log, buf, len);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
@@ -1188,11 +1282,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
-		error = syslog_print_all(buf, len, clear);
+		error = syslog_print_all(log, buf, len, clear);
 		break;
 	/* Clear ring buffer */
 	case SYSLOG_ACTION_CLEAR:
-		syslog_print_all(NULL, 0, true);
+		syslog_print_all(log, NULL, 0, true);
 		break;
 	/* Disable logging to console */
 	case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1212,6 +1306,10 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		error = -EINVAL;
 		if (len < 1 || len > 8)
 			goto out;
+		error = 0;
+		/* VE has no console, so return success */
+		if (log != &init_log_state)
+			goto out;
 		if (len < minimum_console_loglevel)
 			len = minimum_console_loglevel;
 		console_loglevel = len;
@@ -1222,12 +1320,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
 		raw_spin_lock_irq(&logbuf_lock);
-		if (syslog_seq < log_first_seq) {
+		if (log->syslog_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			syslog_seq = log_first_seq;
-			syslog_idx = log_first_idx;
-			syslog_prev = 0;
-			syslog_partial = 0;
+			log->syslog_seq = log->first_seq;
+			log->syslog_idx = log->first_idx;
+			log->syslog_prev = 0;
+			log->syslog_partial = 0;
 		}
 		if (from_file) {
 			/*
@@ -1235,28 +1333,28 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			 * for pending data, not the size; return the count of
 			 * records, not the length.
 			 */
-			error = log_next_idx - syslog_idx;
+			error = log->next_idx - log->syslog_idx;
 		} else {
-			u64 seq = syslog_seq;
-			u32 idx = syslog_idx;
-			enum log_flags prev = syslog_prev;
+			u64 seq = log->syslog_seq;
+			u32 idx = log->syslog_idx;
+			enum log_flags prev = log->syslog_prev;
 
 			error = 0;
-			while (seq < log_next_seq) {
-				struct log *msg = log_from_idx(idx);
+			while (seq < log->next_seq) {
+				struct log *msg = log_from_idx(log, idx);
 
 				error += msg_print_text(msg, prev, true, NULL, 0);
-				idx = log_next(idx);
+				idx = log_next(log, idx);
 				seq++;
 				prev = msg->flags;
 			}
-			error -= syslog_partial;
+			error -= log->syslog_partial;
 		}
 		raw_spin_unlock_irq(&logbuf_lock);
 		break;
 	/* Size of the log buffer */
 	case SYSLOG_ACTION_SIZE_BUFFER:
-		error = log_buf_len;
+		error = log->buf_len;
 		break;
 	default:
 		error = -EINVAL;
@@ -1389,113 +1487,143 @@ static inline void printk_delay(void)
 	}
 }
 
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
-	char buf[LOG_LINE_MAX];
-	size_t len;			/* length == 0 means unused buffer */
-	size_t cons;			/* bytes written to console */
-	struct task_struct *owner;	/* task of first print*/
-	u64 ts_nsec;			/* time of first print */
-	u8 level;			/* log level of first message */
-	u8 facility;			/* log level of first message */
-	enum log_flags flags;		/* prefix, newline flags */
-	bool flushed:1;			/* buffer sealed and committed */
-} cont;
-
-static void cont_flush(enum log_flags flags)
+static void cont_flush(struct log_state *log, enum log_flags flags)
 {
-	if (cont.flushed)
+	struct cont *c = &log->cont;
+
+	if (c->flushed)
 		return;
-	if (cont.len == 0)
+	if (c->len == 0)
 		return;
 
-	if (cont.cons) {
+	if (c->cons) {
 		/*
 		 * If a fragment of this line was directly flushed to the
 		 * console; wait for the console to pick up the rest of the
 		 * line. LOG_NOCONS suppresses a duplicated output.
 		 */
-		log_store(cont.facility, cont.level, flags | LOG_NOCONS,
-			  cont.ts_nsec, NULL, 0, cont.buf, cont.len);
-		cont.flags = flags;
-		cont.flushed = true;
+		log_store(log, c->facility, c->level, flags | LOG_NOCONS,
+			  c->ts_nsec, NULL, 0, c->buf, c->len);
+		c->flags = flags;
+		c->flushed = true;
 	} else {
 		/*
 		 * If no fragment of this line ever reached the console,
 		 * just submit it to the store and free the buffer.
 		 */
-		log_store(cont.facility, cont.level, flags, 0,
-			  NULL, 0, cont.buf, cont.len);
-		cont.len = 0;
+		log_store(log, c->facility, c->level, flags, 0,
+			  NULL, 0, c->buf, c->len);
+		c->len = 0;
 	}
 }
 
-static bool cont_add(int facility, int level, const char *text, size_t len)
+static bool cont_add(struct log_state *log,
+		     int facility, int level, const char *text, size_t len)
 {
-	if (cont.len && cont.flushed)
+	struct cont *c = &log->cont;
+
+	if (c->len && c->flushed)
 		return false;
 
-	if (cont.len + len > sizeof(cont.buf)) {
+	if (c->len + len > sizeof(c->buf)) {
 		/* the line gets too long, split it up in separate records */
-		cont_flush(LOG_CONT);
+		cont_flush(log, LOG_CONT);
 		return false;
 	}
 
-	if (!cont.len) {
-		cont.facility = facility;
-		cont.level = level;
-		cont.owner = current;
-		cont.ts_nsec = local_clock();
-		cont.flags = 0;
-		cont.cons = 0;
-		cont.flushed = false;
+	if (!c->len) {
+		c->facility = facility;
+		c->level = level;
+		c->owner = current;
+		c->ts_nsec = local_clock();
+		c->flags = 0;
+		c->cons = 0;
+		c->flushed = false;
 	}
 
-	memcpy(cont.buf + cont.len, text, len);
-	cont.len += len;
+	memcpy(c->buf + c->len, text, len);
+	c->len += len;
 
-	if (cont.len > (sizeof(cont.buf) * 80) / 100)
-		cont_flush(LOG_CONT);
+	if (c->len > (sizeof(c->buf) * 80) / 100)
+		cont_flush(log, LOG_CONT);
 
 	return true;
 }
 
-static size_t cont_print_text(char *text, size_t size)
+static size_t cont_print_text(struct log_state *log, char *text, size_t size)
 {
+	struct cont *c = &log->cont;
 	size_t textlen = 0;
 	size_t len;
 
-	if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
-		textlen += print_time(cont.ts_nsec, text);
+	if (c->cons == 0 && (log->console_prev & LOG_NEWLINE)) {
+		textlen += print_time(c->ts_nsec, text);
 		size -= textlen;
 	}
 
-	len = cont.len - cont.cons;
+	len = c->len - c->cons;
 	if (len > 0) {
 		if (len+1 > size)
 			len = size-1;
-		memcpy(text + textlen, cont.buf + cont.cons, len);
+		memcpy(text + textlen, c->buf + c->cons, len);
 		textlen += len;
-		cont.cons = cont.len;
+		c->cons = c->len;
 	}
 
-	if (cont.flushed) {
-		if (cont.flags & LOG_NEWLINE)
+	if (c->flushed) {
+		if (c->flags & LOG_NEWLINE)
 			text[textlen++] = '\n';
 		/* got everything, release buffer */
-		cont.len = 0;
+		c->len = 0;
 	}
 	return textlen;
 }
 
-asmlinkage int vprintk_emit(int facility, int level,
-			    const char *dict, size_t dictlen,
-			    const char *fmt, va_list args)
+#ifdef CONFIG_VE
+int ve_log_init(struct ve_struct *ve)
+{
+	struct log_state *log;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	init_waitqueue_head(&log->wait);
+	log->buf_len = VE_LOG_BUF_LEN;
+	/* buf will be initialized later by log_state_init() */
+
+	ve->log_state = log;
+	return 0;
+}
+EXPORT_SYMBOL(ve_log_init);
+
+void ve_log_destroy(struct ve_struct *ve)
+{
+	struct log_state *log = ve->log_state;
+
+	kfree(log->buf);
+	kfree(log);
+}
+EXPORT_SYMBOL(ve_log_destroy);
+#endif
+
+static int log_state_init(struct log_state *log)
+{
+#ifdef CONFIG_VE
+	if (log->buf)
+		return 0;
+
+	log->buf = kzalloc(log->buf_len, GFP_ATOMIC);
+	if (!log->buf)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+static int __vprintk_emit(struct log_state *log,
+			  int facility, int level,
+			  const char *dict, size_t dictlen,
+			  const char *fmt, va_list args)
 {
 	static int recursion_bug;
 	static char textbuf[LOG_LINE_MAX];
@@ -1508,6 +1636,8 @@ asmlinkage int vprintk_emit(int facility, int level,
 	bool in_sched = false;
 	/* cpu currently holding logbuf_lock in this function */
 	static volatile unsigned int logbuf_cpu = UINT_MAX;
+	bool need_wake = false;
+	int err;
 
 	if (level == LOGLEVEL_SCHED) {
 		level = LOGLEVEL_DEFAULT;
@@ -1544,6 +1674,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 	raw_spin_lock(&logbuf_lock);
 	logbuf_cpu = this_cpu;
 
+	err = log_state_init(log);
+	if (err) {
+		logbuf_cpu = UINT_MAX;
+		raw_spin_unlock(&logbuf_lock);
+		lockdep_on();
+		local_irq_restore(flags);
+		return err;
+	}
+
 	if (recursion_bug) {
 		static const char recursion_msg[] =
 			"BUG: recent printk recursion!";
@@ -1551,7 +1690,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 		recursion_bug = 0;
 		printed_len += strlen(recursion_msg);
 		/* emit KERN_CRIT message */
-		log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+		log_store(log, 0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
 			  NULL, 0, recursion_msg, printed_len);
 	}
 
@@ -1599,12 +1738,13 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * Flush the conflicting buffer. An earlier newline was missing,
 		 * or another task also prints continuation lines.
 		 */
-		if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
-			cont_flush(LOG_NEWLINE);
+		if (log->cont.len && (lflags & LOG_PREFIX ||
+				     log->cont.owner != current))
+			cont_flush(log, LOG_NEWLINE);
 
 		/* buffer line if possible, otherwise store it right away */
-		if (!cont_add(facility, level, text, text_len))
-			log_store(facility, level, lflags | LOG_CONT, 0,
+		if (!cont_add(log, facility, level, text, text_len))
+			log_store(log, facility, level, lflags | LOG_CONT, 0,
 				  dict, dictlen, text, text_len);
 	} else {
 		bool stored = false;
@@ -1615,14 +1755,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * there was a race with interrupts (prefix == true) then just
 		 * flush it out and store this line separately.
 		 */
-		if (cont.len && cont.owner == current) {
+		if (log->cont.len && log->cont.owner == current) {
 			if (!(lflags & LOG_PREFIX))
-				stored = cont_add(facility, level, text, text_len);
-			cont_flush(LOG_NEWLINE);
+				stored = cont_add(log, facility, level,
+						  text, text_len);
+			cont_flush(log, LOG_NEWLINE);
 		}
 
 		if (!stored)
-			log_store(facility, level, lflags, 0,
+			log_store(log, facility, level, lflags, 0,
 				  dict, dictlen, text, text_len);
 	}
 	printed_len += text_len;
@@ -1648,14 +1789,39 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * semaphore.  The release will print out buffers and wake up
 		 * /dev/kmsg and syslog() users.
 		 */
-		if (console_trylock_for_printk())
+		if (log != &init_log_state) {
+			raw_spin_lock_irqsave(&logbuf_lock, flags);
+			if (log->seen_seq != log->next_seq && !oops_in_progress) {
+				log->seen_seq = log->next_seq;
+				need_wake = true;
+			}
+			logbuf_cpu = UINT_MAX;
+			raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		} else if (console_trylock_for_printk())
 			console_unlock();
 		preempt_enable();
 		lockdep_on();
+
+		if (need_wake)
+			wake_up_interruptible(&log->wait);
 	}
 
 	return printed_len;
 }
+
+static int __vprintk(const char *fmt, va_list args)
+{
+	return __vprintk_emit(ve_log_state(), 0, -1, NULL, 0, fmt, args);
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+			    const char *dict, size_t dictlen,
+			    const char *fmt, va_list args)
+{
+	return __vprintk_emit(&init_log_state,
+			      facility, level, dict, dictlen, fmt, args);
+}
+
 EXPORT_SYMBOL(vprintk_emit);
 
 asmlinkage int vprintk(const char *fmt, va_list args)
@@ -1703,6 +1869,53 @@ EXPORT_SYMBOL_GPL(vprintk_default);
  */
 DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+{
+	va_list args2;
+	int r = 0;
+
+	va_copy(args2, args);
+	if (ve_is_super(get_exec_env()) || (dst & VE0_LOG))
+		r = vprintk(fmt, args);
+	if (!ve_is_super(get_exec_env()) && (dst & VE_LOG))
+		r = __vprintk(fmt, args2);
+
+	return r;
+}
+
+/*
+ * Do not use it from scheduler code - can lead to deadlocks.
+ */
+
+asmlinkage int ve_printk(int dst, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = ve_vprintk(dst, fmt, args);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(ve_printk);
+
+asmlinkage int ve_log_printk(struct ve_struct *ve, const char *fmt, ...)
+{
+	struct log_state *log = &init_log_state;
+	va_list args;
+	int r;
+
+	if (likely(ve && ve->log_state))
+		log = ve->log_state;
+
+	va_start(args, fmt);
+	r = __vprintk_emit(log, 0, -1, NULL, 0, fmt, args);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(ve_log_printk);
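A usage sketch for the per-VE entry points (the callers are hypothetical): from inside a container, VE_LOG targets the container's own ring buffer and VE0_LOG the host buffer, and combining them duplicates the message; from the host everything goes to the host buffer. ve_log_printk() lets host-side code log into a specific container's buffer.

    static void example_ve_messages(struct ve_struct *ve)
    {
            /* only the calling container's dmesg */
            ve_printk(VE_LOG, "CT: veth link is up\n");

            /* duplicated into the container buffer and the host (VE0) buffer */
            ve_printk(VE_LOG | VE0_LOG, "CT: disk quota exceeded\n");

            /* host-side code logging into a specific container's buffer */
            ve_log_printk(ve, "container is being stopped\n");
    }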
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -1751,28 +1964,36 @@ EXPORT_SYMBOL(printk);
 
 #define LOG_LINE_MAX		0
 #define PREFIX_MAX		0
-#define LOG_LINE_MAX 0
-static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags syslog_prev;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static enum log_flags console_prev;
-static struct cont {
+#define LOG_LINE_MAX		0
+struct cont {
 	size_t len;
 	size_t cons;
 	u8 level;
 	bool flushed:1;
-} cont;
-static struct log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
+};
+static struct log_state {
+	u64 syslog_seq;
+	u32 syslog_idx;
+	enum log_flags syslog_prev;
+	u64 first_seq;
+	u32 first_idx;
+	u64 next_seq;
+	u64 console_seq;
+	u32 console_idx;
+	enum log_flags console_prev;
+	u64 seen_seq;
+	struct cont cont;
+	wait_queue_head_t wait;
+} init_log_state = {
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_log_state.wait),
+};
+static struct log *log_from_idx(struct log_state *log, u32 idx) { return NULL; }
+static u32 log_next(struct log_state *log, u32 idx) { return 0; }
 static void call_console_drivers(int level, const char *text, size_t len) {}
 static size_t msg_print_text(const struct log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
+static size_t cont_print_text(struct log_state *log,
+			      char *text, size_t size) { return 0; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -2032,14 +2253,14 @@ int is_console_locked(void)
 	return console_locked;
 }
 
-static void console_cont_flush(char *text, size_t size)
+static void console_cont_flush(struct log_state *log, char *text, size_t size)
 {
 	unsigned long flags;
 	size_t len;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
 
-	if (!cont.len)
+	if (!log->cont.len)
 		goto out;
 
 	/*
@@ -2047,13 +2268,13 @@ static void console_cont_flush(char *text, size_t size)
 	 * busy. The earlier ones need to be printed before this one, we
 	 * did not flush any fragment so far, so just let it queue up.
 	 */
-	if (console_seq < log_next_seq && !cont.cons)
+	if (log->console_seq < log->next_seq && !log->cont.cons)
 		goto out;
 
-	len = cont_print_text(text, size);
+	len = cont_print_text(log, text, size);
 	raw_spin_unlock(&logbuf_lock);
 	stop_critical_timings();
-	call_console_drivers(cont.level, text, len);
+	call_console_drivers(log->cont.level, text, len);
 	start_critical_timings();
 	local_irq_restore(flags);
 	return;
@@ -2078,9 +2299,10 @@ static void console_cont_flush(char *text, size_t size)
 void console_unlock(void)
 {
 	static char text[LOG_LINE_MAX + PREFIX_MAX];
-	static u64 seen_seq;
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	bool wake_klogd = false;
+	bool first = true;
 	bool retry;
 	unsigned cnt;
 
@@ -2092,7 +2314,7 @@ void console_unlock(void)
 	console_may_schedule = 0;
 
 	/* flush buffered message fragment immediately to console */
-	console_cont_flush(text, sizeof(text));
+	console_cont_flush(log, text, sizeof(text));
 again:
 	cnt = 5;
 	for (;;) {
@@ -2100,49 +2322,54 @@ void console_unlock(void)
 		size_t len;
 		int level;
 
+		if (first)
+			first = false;
+		else
+			touch_all_softlockup_watchdogs();
+
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		if (seen_seq != log_next_seq) {
+		if (log->seen_seq != log->next_seq) {
 			wake_klogd = true;
-			seen_seq = log_next_seq;
+			log->seen_seq = log->next_seq;
 		}
 
-		if (console_seq < log_first_seq) {
+		if (log->console_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			console_seq = log_first_seq;
-			console_idx = log_first_idx;
-			console_prev = 0;
+			log->console_seq = log->first_seq;
+			log->console_idx = log->first_idx;
+			log->console_prev = 0;
 		}
 skip:
-		if (console_seq == log_next_seq)
+		if (log->console_seq == log->next_seq)
 			break;
 
 		if (--cnt == 0)
 			break;	/* Someone else printk's like crazy */
 
-		msg = log_from_idx(console_idx);
+		msg = log_from_idx(log, log->console_idx);
 		if (msg->flags & LOG_NOCONS) {
 			/*
 			 * Skip record we have buffered and already printed
 			 * directly to the console when we received it.
 			 */
-			console_idx = log_next(console_idx);
-			console_seq++;
+			log->console_idx = log_next(log, log->console_idx);
+			log->console_seq++;
 			/*
 			 * We will get here again when we register a new
 			 * CON_PRINTBUFFER console. Clear the flag so we
 			 * will properly dump everything later.
 			 */
 			msg->flags &= ~LOG_NOCONS;
-			console_prev = msg->flags;
+			log->console_prev = msg->flags;
 			goto skip;
 		}
 
 		level = msg->level;
-		len = msg_print_text(msg, console_prev, false,
+		len = msg_print_text(msg, log->console_prev, false,
 				     text, sizeof(text));
-		console_idx = log_next(console_idx);
-		console_seq++;
-		console_prev = msg->flags;
+		log->console_idx = log_next(log, log->console_idx);
+		log->console_seq++;
+		log->console_prev = msg->flags;
 		raw_spin_unlock(&logbuf_lock);
 
 		stop_critical_timings();	/* don't trace print latency */
@@ -2168,7 +2395,7 @@ void console_unlock(void)
 	 * flush, no worries.
 	 */
 	raw_spin_lock(&logbuf_lock);
-	retry = console_seq != log_next_seq;
+	retry = log->console_seq != log->next_seq;
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 	if (retry && console_trylock())
@@ -2183,7 +2410,7 @@ void console_unlock(void)
 		cnt = 9999;
 		while (--cnt != 0) {
 			cpu_relax();
-			if (console_seq == log_next_seq) {
+			if (log->console_seq == log->next_seq) {
 				/* Good, other CPU entered "for(;;)" loop */
 				goto out;
 			}
@@ -2311,6 +2538,7 @@ early_param("keep_bootcon", keep_bootcon_setup);
  */
 void register_console(struct console *newcon)
 {
+	struct log_state *log = &init_log_state;
 	int i;
 	unsigned long flags;
 	struct console *bcon = NULL;
@@ -2431,9 +2659,9 @@ void register_console(struct console *newcon)
 		 * for us.
 		 */
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		console_seq = syslog_seq;
-		console_idx = syslog_idx;
-		console_prev = syslog_prev;
+		log->console_seq = log->syslog_seq;
+		log->console_idx = log->syslog_idx;
+		log->console_prev = log->syslog_prev;
 		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 		/*
 		 * We're about to replay the log buffer.  Only do this to the
@@ -2546,7 +2774,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
-		wake_up_interruptible(&log_wait);
+		wake_up_interruptible(&init_log_state.wait);
 }
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
@@ -2557,7 +2785,7 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 void wake_up_klogd(void)
 {
 	preempt_disable();
-	if (waitqueue_active(&log_wait)) {
+	if (waitqueue_active(&init_log_state.wait)) {
 		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
 		irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
 	}
@@ -2689,6 +2917,7 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
+	struct log_state *log = &init_log_state;
 	struct kmsg_dumper *dumper;
 	unsigned long flags;
 
@@ -2704,10 +2933,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		dumper->active = true;
 
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		dumper->cur_seq = clear_seq;
-		dumper->cur_idx = clear_idx;
-		dumper->next_seq = log_next_seq;
-		dumper->next_idx = log_next_idx;
+		dumper->cur_seq = log->clear_seq;
+		dumper->cur_idx = log->clear_idx;
+		dumper->next_seq = log->next_seq;
+		dumper->next_idx = log->next_idx;
 		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 		/* invoke dumper which will iterate over records */
@@ -2741,6 +2970,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 			       char *line, size_t size, size_t *len)
 {
+	struct log_state *log = &init_log_state;
 	struct log *msg;
 	size_t l = 0;
 	bool ret = false;
@@ -2748,20 +2978,20 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 	if (!dumper->active)
 		goto out;
 
-	if (dumper->cur_seq < log_first_seq) {
+	if (dumper->cur_seq < log->first_seq) {
 		/* messages are gone, move to first available one */
-		dumper->cur_seq = log_first_seq;
-		dumper->cur_idx = log_first_idx;
+		dumper->cur_seq = log->first_seq;
+		dumper->cur_idx = log->first_idx;
 	}
 
 	/* last entry */
-	if (dumper->cur_seq >= log_next_seq)
+	if (dumper->cur_seq >= log->next_seq)
 		goto out;
 
-	msg = log_from_idx(dumper->cur_idx);
+	msg = log_from_idx(log, dumper->cur_idx);
 	l = msg_print_text(msg, 0, syslog, line, size);
 
-	dumper->cur_idx = log_next(dumper->cur_idx);
+	dumper->cur_idx = log_next(log, dumper->cur_idx);
 	dumper->cur_seq++;
 	ret = true;
 out:
@@ -2823,6 +3053,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 			  char *buf, size_t size, size_t *len)
 {
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	u64 seq;
 	u32 idx;
@@ -2836,10 +3067,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	if (dumper->cur_seq < log_first_seq) {
+	if (dumper->cur_seq < log->first_seq) {
 		/* messages are gone, move to first available one */
-		dumper->cur_seq = log_first_seq;
-		dumper->cur_idx = log_first_idx;
+		dumper->cur_seq = log->first_seq;
+		dumper->cur_idx = log->first_idx;
 	}
 
 	/* last entry */
@@ -2853,10 +3084,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	idx = dumper->cur_idx;
 	prev = 0;
 	while (seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l += msg_print_text(msg, prev, true, NULL, 0);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2866,10 +3097,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	idx = dumper->cur_idx;
 	prev = 0;
 	while (l > size && seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l -= msg_print_text(msg, prev, true, NULL, 0);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2881,10 +3112,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	l = 0;
 	prev = 0;
 	while (seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l += msg_print_text(msg, prev, syslog, buf + l, size - l);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2912,10 +3143,12 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
  */
 void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
 {
-	dumper->cur_seq = clear_seq;
-	dumper->cur_idx = clear_idx;
-	dumper->next_seq = log_next_seq;
-	dumper->next_idx = log_next_idx;
+	struct log_state *log = &init_log_state;
+
+	dumper->cur_seq = log->clear_seq;
+	dumper->cur_idx = log->clear_idx;
+	dumper->next_seq = log->next_seq;
+	dumper->next_idx = log->next_idx;
 }
 
 /**
@@ -2967,11 +3200,11 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...)
  */
 void dump_stack_print_info(const char *log_lvl)
 {
-	printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+	printk("%sCPU: %d PID: %d Comm: %.20s ve: %s %s %s %.*s %s\n",
 	       log_lvl, raw_smp_processor_id(), current->pid, current->comm,
-	       print_tainted(), init_utsname()->release,
+	       task_ve_name(current), print_tainted(), init_utsname()->release,
 	       (int)strcspn(init_utsname()->version, " "),
-	       init_utsname()->version);
+	       init_utsname()->version, VZVERSION);
 
 	if (dump_stack_arch_desc_str[0] != '\0')
 		printk("%sHardware name: %s\n",
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -238,6 +238,8 @@ int ___ptrace_may_access(struct task_struct *tracer,
 	 * or halting the specified task is impossible.
 	 */
 	int dumpable = 0;
+	int vps_dumpable = 0;
+
 	/* Don't let security modules deny introspection */
 	if (task == tracer)
 		return 0;
@@ -265,14 +267,20 @@ int ___ptrace_may_access(struct task_struct *tracer,
 ok:
 	rcu_read_unlock();
 	smp_rmb();
-	if (task->mm)
+	if (task->mm) {
 		dumpable = get_dumpable(task->mm);
+		vps_dumpable = (task->mm->vps_dumpable == VD_PTRACE_COREDUMP);
+	}
 	rcu_read_lock();
 	if (dumpable != SUID_DUMP_USER &&
 	    !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
 		rcu_read_unlock();
 		return -EPERM;
 	}
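+	/*
+	 * When the tracer runs inside a container, additionally require the
+	 * target's mm->vps_dumpable to permit ptrace access.
+	 */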
+	if (!vps_dumpable && !ve_is_super(get_exec_env())) {
+		rcu_read_unlock();
+		return -EPERM;
+	}
 	rcu_read_unlock();
 
 	if (!(mode & PTRACE_MODE_NOACCESS_CHK))
@@ -332,6 +340,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	task_lock(task);
 	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+	if (!retval) {
+		if (!task->mm || task->mm->vps_dumpable == VD_LICDATA_ACCESS)
+			retval = -EACCES;
+	}
 	task_unlock(task);
 	if (retval)
 		goto unlock_creds;
@@ -605,6 +617,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 	if (data & ~(unsigned long)PTRACE_O_MASK)
 		return -EINVAL;
 
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+		    !config_enabled(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;
 	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
@@ -1032,6 +1057,11 @@ int ptrace_request(struct task_struct *child, long request,
 		break;
 	}
 #endif
+
+	case PTRACE_SECCOMP_GET_FILTER:
+		ret = seccomp_get_filter(child, addr, datavp);
+		break;
+
 	default:
 		break;
 	}
@@ -1043,6 +1073,10 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
 {
 	struct task_struct *child;
 
+	/* ptracing of init from inside CT is dangerous */
+	if (pid == 1 && !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	rcu_read_lock();
 	child = find_task_by_vpid(pid);
 	if (child)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_clock.o = -pg
 endif
 
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/ve.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -135,6 +136,37 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
+struct kernel_stat_glob kstat_glob;
+DEFINE_SPINLOCK(kstat_glb_lock);
+EXPORT_SYMBOL(kstat_glob);
+EXPORT_SYMBOL(kstat_glb_lock);
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_page_in);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, alloc_kstat_lat[KSTAT_ALLOCSTAT_NR]);
+
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_ttfp);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_cache_reap);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_icache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_dcache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_refill_inact);
+
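+/*
+ * Wire the global kstat latency/perf counters up to their per-cpu
+ * snapshot buffers.
+ */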
+void __init kstat_init(void)
+{
+	int i;
+
+	kstat_glob.sched_lat.cur = &glob_kstat_lat;
+	kstat_glob.page_in.cur = &glob_kstat_page_in;
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		kstat_glob.alloc_lat[i].cur = &alloc_kstat_lat[i];
+
+	kstat_glob.ttfp.cur = &kstat_pcpu_ttfp;
+	kstat_glob.cache_reap.cur = &kstat_pcpu_cache_reap;
+	kstat_glob.shrink_icache.cur = &kstat_pcpu_shrink_icache;
+	kstat_glob.shrink_dcache.cur = &kstat_pcpu_shrink_dcache;
+	kstat_glob.refill_inact.cur = &kstat_pcpu_refill_inact;
+}
+
 /*
  * Debugging: various feature bits
  */
@@ -308,6 +340,87 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp);
+
+static struct task_group *ve_root_tg(struct task_group *tg)
+{
+	struct cgroup *cg;
+
+	if (!tg)
+		return NULL;
+
+	cg = cgroup_get_ve_root(tg->css.cgroup);
+	return cg ? cgroup_tg(cg) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+	unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		cpu_rate = tg->cpu_rate;
+#endif
+	return cpu_rate;
+}
+
+unsigned int tg_nr_cpus(struct task_group *tg)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		nr_cpus = tg->nr_cpus;
+#endif
+
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return tg_nr_cpus(task_group(p));
+}
+
+static unsigned int task_cpu_rate(struct task_struct *p)
+{
+	return tg_cpu_rate(task_group(p));
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
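+/*
+ * Scale the CPU frequency reported to a rate-limited task by the ratio
+ * of its cpu rate limit to the total capacity of all online vcpus.
+ */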
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	unsigned long rate, max_rate;
+
+	if (!sysctl_sched_cpulimit_scale_cpufreq)
+		return freq;
+
+	rate = task_cpu_rate(current);
+
+	max_rate = num_online_vcpus() * MAX_CPU_RATE;
+	if (!rate || rate >= max_rate)
+		return freq;
+
+	return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
+
+unsigned long nr_zombie = 0;	/* protected by tasklist_lock */
+EXPORT_SYMBOL(nr_zombie);
+
+atomic_t nr_dead = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_dead);
+
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
@@ -887,18 +1000,48 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
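+/* Keep rq->nr_sleeping in sync with TASK_INTERRUPTIBLE tasks on this rq. */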
+static inline void check_inc_sleeping(struct rq *rq, struct task_struct *t)
+{
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping++;
+}
+
+static inline void check_dec_sleeping(struct rq *rq, struct task_struct *t)
+{
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping--;
+}
+
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible--;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled--;
+		task_cfs_rq(p)->nr_unint--;
+	}
+
+	check_dec_sleeping(rq, p);
 
 	enqueue_task(rq, p, flags);
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	check_inc_sleeping(rq, p);
+
+#if 0 /* this is broken */
+	if (p->state == TASK_STOPPED) {
+		rq->nr_stopped++;
+	}
+#endif
+
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible++;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled++;
+		task_cfs_rq(p)->nr_unint++;
+	}
 
 	dequeue_task(rq, p, flags);
 }
@@ -1568,10 +1711,17 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 #ifdef CONFIG_SMP
-	if (p->sched_contributes_to_load)
+	if (p->sched_contributes_to_load) {
 		rq->nr_uninterruptible--;
+		if (p->sched_iothrottled_sleep)
+			rq->nr_iothrottled--;
+		task_cfs_rq(p)->nr_unint--;
+	}
 #endif
 
+	if (p->sched_interruptible_sleep)
+		rq->nr_sleeping--;
+
 	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 	ttwu_do_wakeup(rq, p, wake_flags);
 }
@@ -1590,6 +1740,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	rq = __task_rq_lock(p);
 	if (p->on_rq) {
 		ttwu_do_wakeup(rq, p, wake_flags);
+		p->woken_while_running = 1;
 		ret = 1;
 	}
 	__task_rq_unlock(rq);
@@ -1745,6 +1896,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * current.
 	 */
 	smp_rmb();
+
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
@@ -1779,7 +1931,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
+	if (p->in_iowait && p->sched_class->nr_iowait_dec) {
+		struct rq *rq = __task_rq_lock(p);
+		p->sched_class->nr_iowait_dec(p);
+		__task_rq_unlock(rq);
+	}
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->sched_interruptible_sleep = (p->state == TASK_INTERRUPTIBLE);
+	p->sched_iothrottled_sleep = !!task_iothrottled(p);
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking)
@@ -1898,6 +2058,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	p->se.boosted = 0;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* Even if schedstat is disabled, there should not be garbage */
 	p->se.statistics = &p->statistics;
@@ -2417,6 +2581,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+
+	/* kernel threads don't care about cpuid faulting */
+	if (current->mm)
+		set_cpuid_faulting(!ve_is_super(get_exec_env()));
 }
 
 #ifdef CONFIG_SMP
@@ -2462,20 +2630,18 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
+	struct rq *rq;
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+	/* finish_task_switch() drops rq->lock and enables preemption */
+	preempt_disable();
+#endif
+	rq = this_rq();
 	finish_task_switch(rq, prev);
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
 	post_schedule(rq);
-
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
-#endif
+
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
@@ -2535,21 +2701,28 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	finish_task_switch(this_rq(), prev);
 }
 
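+/*
+ * Define a helper that sums a per-runqueue counter over all online CPUs,
+ * clamping the result at zero in case the per-cpu counters are
+ * transiently inconsistent.
+ */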
+#define DECLARE_NR_ONLINE(varname)			\
+	unsigned long varname(void)			\
+	{						\
+		unsigned long i, sum = 0;		\
+		for_each_online_cpu(i)			\
+			sum += cpu_rq(i)->varname;	\
+		if (unlikely((long)sum < 0))		\
+			return 0;			\
+		return sum;				\
+	}						\
+	EXPORT_SYMBOL(varname);
+
 /*
  * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
-unsigned long nr_running(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_online_cpu(i)
-		sum += cpu_rq(i)->nr_running;
-
-	return sum;
-}
+DECLARE_NR_ONLINE(nr_running);
+DECLARE_NR_ONLINE(nr_sleeping);
+DECLARE_NR_ONLINE(nr_stopped);
+DECLARE_NR_ONLINE(nr_uninterruptible);
 
 /*
  * Check if only the current task is running on the cpu.
@@ -2590,13 +2763,12 @@ unsigned long nr_iowait_cpu(int cpu)
 	return atomic_read(&this->nr_iowait);
 }
 
-unsigned long this_cpu_load(void)
+unsigned long nr_active_cpu(void)
 {
 	struct rq *this = this_rq();
-	return this->cpu_load[0];
+	return this->nr_active;
 }
 
-
 /*
  * Global load-average calculations
  *
@@ -2665,12 +2837,21 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
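+/*
+ * Like get_avenrun(), but report the load averages of the current task's
+ * task group (used for per-container load averages).
+ */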
+void get_avenrun_ve(unsigned long *loads, unsigned long offset, int shift)
+{
+	struct task_group *tg = task_group(current);
+	loads[0] = (tg->avenrun[0] + offset) << shift;
+	loads[1] = (tg->avenrun[1] + offset) << shift;
+	loads[2] = (tg->avenrun[2] + offset) << shift;
+}
+
 static long calc_load_fold_active(struct rq *this_rq)
 {
 	long nr_active, delta = 0;
 
 	nr_active = this_rq->nr_running;
 	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active -= (long) this_rq->nr_iothrottled;
 
 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
@@ -2692,6 +2873,42 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 	return load >> FSHIFT;
 }
 
+#ifdef CONFIG_VE
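+/*
+ * Update per-task-group load averages from the number of runnable and
+ * uninterruptible tasks on each group's cfs_rq, and refresh the global
+ * uninterruptible-task average under kstat_glb_lock.
+ */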
+static void calc_load_ve(void)
+{
+	unsigned long flags, nr_unint, nr_active;
+	struct task_group *tg;
+	int i;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		nr_active = 0;
+		for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			nr_active += tg->cfs_rq[i]->nr_running;
+			nr_active += tg->cfs_rq[i]->nr_unint;
+#endif
+		}
+		nr_active *= FIXED_1;
+
+		tg->avenrun[0] = calc_load(tg->avenrun[0], EXP_1, nr_active);
+		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
+		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
+	}
+	rcu_read_unlock();
+
+	nr_unint = nr_uninterruptible() * FIXED_1;
+	spin_lock_irqsave(&kstat_glb_lock, flags);
+	CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+	spin_unlock_irqrestore(&kstat_glb_lock, flags);
+
+}
+#else
+#define calc_load_ve()	do { } while (0)
+#endif
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * Handle NO_HZ for the global load-average.
@@ -2772,7 +2989,10 @@ void calc_load_enter_idle(void)
 	 * We're going into NOHZ mode, if there's any pending delta, fold it
 	 * into the pending idle delta.
 	 */
+
+	raw_spin_lock(&this_rq->lock);
 	delta = calc_load_fold_active(this_rq);
+	raw_spin_unlock(&this_rq->lock);
 	if (delta) {
 		int idx = calc_load_write_idx();
 		atomic_long_add(delta, &calc_load_idle[idx]);
@@ -2952,6 +3172,8 @@ void calc_global_load(unsigned long ticks)
 
 	calc_load_update += LOAD_FREQ;
 
+	calc_load_ve();
+
 	/*
 	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
@@ -3060,6 +3282,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 	this_rq->nr_load_updates++;
 
 	/* Update our load: */
+	this_rq->nr_active = this_rq->nr_running;
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 		unsigned long old_load, new_load;
@@ -3476,6 +3699,7 @@ static void __sched __schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
+	int resched_next;
 	int cpu;
 
 need_resched:
@@ -3551,8 +3775,14 @@ static void __sched __schedule(void)
 
 	post_schedule(rq);
 
+	resched_next = READ_ONCE(rq->resched_next);
+	if (resched_next) {
+		set_tsk_need_resched(current);
+		rq->resched_next = 0;
+	}
+
 	sched_preempt_enable_no_resched();
-	if (need_resched())
+	if (!resched_next && need_resched())
 		goto need_resched;
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
@@ -4572,7 +4802,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (user && !capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_ADMIN)) {
 		if (fair_policy(policy)) {
 			if (attr->sched_nice < TASK_NICE(p) &&
 			    !can_nice(p, attr->sched_nice))
@@ -5134,6 +5364,9 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	struct task_struct *p;
 	int retval;
 
+	if (!ve_is_super(get_exec_env()))
+		return 0;
+
 	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
@@ -5268,6 +5501,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	if (retval)
 		goto out_unlock;
 
+	if (!ve_is_super(get_exec_env())) {
+		cpumask_clear(mask);
+		bitmap_fill(cpumask_bits(mask), num_online_vcpus());
+		goto out_unlock;
+	}
+
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -5348,23 +5587,38 @@ static inline int should_resched(void)
 	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
 }
 
-static void __cond_resched(void)
+static void __cond_resched(bool may_throttle)
 {
 	add_preempt_count(PREEMPT_ACTIVE);
+	if (may_throttle)
+		current->may_throttle = 1;
 	__schedule();
+	if (may_throttle)
+		current->may_throttle = 0;
 	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		__cond_resched(false);
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
 
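+/*
+ * Like cond_resched(), but marks the task (via current->may_throttle) as
+ * eligible for throttling across the voluntary reschedule.
+ */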
+int __sched _cond_resched_may_throttle(void)
+{
+	if (should_resched()) {
+		__cond_resched(true);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched_may_throttle);
+
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
@@ -5383,7 +5637,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			__cond_resched(false);
 		else
 			cpu_relax();
 		ret = 1;
@@ -5399,7 +5653,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		__cond_resched(false);
 		local_bh_disable();
 		return 1;
 	}
@@ -5656,27 +5910,16 @@ void sched_show_task(struct task_struct *p)
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
-	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
-	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
+	printk(KERN_CONT " %p ", p);
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
 	rcu_read_lock();
 	ppid = task_pid_nr(rcu_dereference(p->real_parent));
-	rcu_read_unlock();
-	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+	printk(KERN_CONT "%5lu %5d %6d %4s 0x%08lx\n", free,
 		task_pid_nr(p), ppid,
-		(unsigned long)task_thread_info(p)->flags);
-
+		task_ve_name(p), (unsigned long)task_thread_info(p)->flags);
+	rcu_read_unlock();
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 }
@@ -5687,25 +5930,33 @@ void show_state_filter(unsigned long state_filter)
 
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
-		"  task                PC stack   pid father\n");
+		"  task          taskaddr stack   pid father veid\n");
 #else
 	printk(KERN_INFO
-		"  task                        PC stack   pid father\n");
+		"  task                  taskaddr stack   pid father veid\n");
 #endif
 	rcu_read_lock();
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	} while_each_thread(g, p);
 
-	touch_all_softlockup_watchdogs();
-
-#ifdef CONFIG_SCHED_DEBUG
+#if 0
+	/*
+	 * This results in soft lockups, because it writes too much data to
+	 * console. At the same time information it shows is only useful for
+	 * sched debugging and can be obtained via /proc/sched_debug anyway.
+	 * So disable it.
+	 */
 	sysrq_sched_debug_show();
 #endif
 	rcu_read_unlock();
@@ -6058,6 +6309,9 @@ void idle_task_exit(void)
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
 	mmdrop(mm);
+
+	/* disable cpuid faulting when a cpu goes offline */
+	set_cpuid_faulting(false);
 }
 
 /*
@@ -6358,6 +6612,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 			set_rq_online(rq);
 		}
+		start_cfs_idle_time_accounting(cpu);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
@@ -6372,6 +6627,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		}
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
+		stop_cfs_idle_time_accounting(cpu);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
@@ -8189,6 +8445,8 @@ void __init sched_init(void)
 	init_dl_bandwidth(&def_dl_bandwidth,
 			global_rt_period(), global_rt_runtime());
 
+	root_task_group.taskstats = alloc_percpu(struct taskstats);
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
@@ -8204,8 +8462,14 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
 
+	root_task_group.start_time = (struct timespec){0, 0};
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+	root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8484,6 +8748,9 @@ static void free_sched_group(struct task_group *tg)
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
+	free_percpu(tg->taskstats);
+	kfree(tg->cpustat_last);
+	kfree(tg->vcpustat);
 	kfree(tg);
 }
 
@@ -8502,6 +8769,27 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	tg->taskstats = alloc_percpu(struct taskstats);
+	if (!tg->taskstats)
+		goto err;
+
+	tg->cpustat_last = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+				   GFP_KERNEL);
+	if (!tg->cpustat_last)
+		goto err;
+
+	tg->vcpustat = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+			       GFP_KERNEL);
+	if (!tg->vcpustat)
+		goto err;
+
+	tg->vcpustat_last_update = ktime_set(0, 0);
+	spin_lock_init(&tg->vcpustat_lock);
+
+	/* start_time is the CT0 (host) uptime at group creation */
+	do_posix_clock_monotonic_gettime(&tg->start_time);
+	monotonic_to_bootbased(&tg->start_time);
+
 	return tg;
 
 err:
@@ -8509,6 +8797,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return ERR_PTR(-ENOMEM);
 }
 
+static void tg_update_topmost_limited_ancestor(struct task_group *tg);
+
 void sched_online_group(struct task_group *tg, struct task_group *parent)
 {
 	unsigned long flags;
@@ -8521,6 +8811,9 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
+
+	tg_update_topmost_limited_ancestor(tg);
+
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	online_fair_sched_group(tg);
@@ -8574,6 +8867,17 @@ void sched_move_task(struct task_struct *tsk)
 
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_dec)
+			tsk->sched_class->nr_iowait_dec(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint--;
+
+		check_dec_sleeping(rq, tsk);
+	}
+
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
@@ -8598,6 +8902,16 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_inc)
+			tsk->sched_class->nr_iowait_inc(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint++;
+
+		check_inc_sleeping(rq, tsk);
+	}
 
 	task_rq_unlock(rq, tsk, &flags);
 }
@@ -9015,6 +9329,11 @@ static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 	sched_offline_group(tg);
 }
 
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
+{
+	sched_move_task(task);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
@@ -9055,6 +9374,19 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		return;
 
 	sched_move_task(task);
+
+	if (thread_group_leader(task)) {
+		struct task_group *tg = cgroup_tg(old_cgrp);
+		struct taskstats *stats = get_cpu_ptr(tg->taskstats);
+		struct signal_struct *sig = task->signal;
+
+		if (sig->stats)
+			delayacct_add_stats(stats, sig->stats);
+		else
+			delayacct_add_tsk(stats, task);
+
+		put_cpu_ptr(stats);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9079,7 +9411,10 @@ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static void tg_limit_toggled(struct task_group *tg);
+
+/* call with cfs_constraints_mutex held */
+static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -9103,10 +9438,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
-	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -9134,15 +9468,28 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
 		raw_spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
-		cfs_rq->runtime_remaining = 0;
+		cfs_rq->runtime_remaining = 1;
 
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
+	if (runtime_enabled != runtime_was_enabled)
+		tg_limit_toggled(tg);
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
-out_unlock:
+	return ret;
+}
+
+static void tg_update_cpu_limit(struct task_group *tg);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int ret;
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	tg_update_cpu_limit(tg);
 	mutex_unlock(&cfs_constraints_mutex);
 
 	return ret;
@@ -9306,6 +9653,135 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 
 	return 0;
 }
+
+#ifdef CONFIG_CFS_CPULIMIT
+static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void *unused)
+{
+	struct task_group *parent = tg->parent;
+
+	/*
+	 * Neither the parent nor any of its ancestors is limited? Then this
+	 * task group becomes the topmost limited ancestor itself, provided
+	 * it has a limit set.  Otherwise inherit the topmost limited
+	 * ancestor from the parent.
+	 */
+	if (parent->topmost_limited_ancestor == parent &&
+	    parent->cfs_bandwidth.quota == RUNTIME_INF)
+		tg->topmost_limited_ancestor = tg;
+	else
+		tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
+	return 0;
+}
+
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+	__tg_update_topmost_limited_ancestor(tg, NULL);
+}
+
+static void tg_limit_toggled(struct task_group *tg)
+{
+	if (tg->topmost_limited_ancestor != tg) {
+		/*
+		 * This task group is not a topmost limited ancestor, so both
+		 * it and all its children must already point to their topmost
+		 * limited ancestor, and we have nothing to do.
+		 */
+		return;
+	}
+
+	/*
+	 * This task group is a topmost limited ancestor. Walk over all its
+	 * children and update their pointers to the topmost limited ancestor.
+	 */
+
+	spin_lock_irq(&task_group_lock);
+	walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop, NULL);
+	spin_unlock_irq(&task_group_lock);
+}
+
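+/*
+ * Keep cpu.rate in sync when the CFS quota/period are written directly:
+ * derive an equivalent rate from them and drop the nr_cpus limit.
+ */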
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+	long quota, period;
+	unsigned long rate = 0;
+
+	quota = tg_get_cfs_quota(tg);
+	period = tg_get_cfs_period(tg);
+
+	if (quota > 0 && period > 0) {
+		rate = quota * MAX_CPU_RATE / period;
+		rate = max(rate, 1UL);
+	}
+
+	tg->cpu_rate = rate;
+	tg->nr_cpus = 0;
+}
+
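+/*
+ * Translate a cpu.rate / cpu.nr_cpus limit into an equivalent CFS
+ * bandwidth quota.  If both are set, the stricter of the two applies.
+ */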
+static int tg_set_cpu_limit(struct task_group *tg,
+			    unsigned long cpu_rate, unsigned int nr_cpus)
+{
+	int ret;
+	unsigned long rate;
+	u64 quota = RUNTIME_INF;
+	u64 period = default_cfs_period();
+
+	rate = (cpu_rate && nr_cpus) ?
+		min_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE) :
+		max_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE);
+	if (rate) {
+		quota = div_u64(period * rate, MAX_CPU_RATE);
+		quota = max(quota, min_cfs_quota_period);
+	}
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	if (!ret) {
+		tg->cpu_rate = cpu_rate;
+		tg->nr_cpus = nr_cpus;
+	}
+	mutex_unlock(&cfs_constraints_mutex);
+
+	return ret;
+}
+
+static u64 cpu_rate_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cgroup_tg(cgrp)->cpu_rate;
+}
+
+static int cpu_rate_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			      u64 rate)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (rate > num_online_cpus() * MAX_CPU_RATE)
+		rate = num_online_cpus() * MAX_CPU_RATE;
+	return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
+}
+
+static u64 nr_cpus_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cgroup_tg(cgrp)->nr_cpus;
+}
+
+static int nr_cpus_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			     u64 nr_cpus)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (nr_cpus > num_online_cpus())
+		nr_cpus = num_online_cpus();
+	return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
+}
+#else
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+}
+static void tg_limit_toggled(struct task_group *tg)
+{
+}
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+}
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -9333,6 +9809,432 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+static void cpu_cgroup_update_stat(struct cgroup *cgrp, int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct sched_entity *se = tg->se[i];
+	struct kernel_cpustat *kcpustat = cpuacct_cpustat(cgrp, i);
+	u64 now = cpu_clock(i);
+	u64 delta, idle, iowait, steal, used;
+
+	/* root_task_group has not sched entities */
+	if (tg == &root_task_group)
+		return;
+
+	iowait = se->statistics->iowait_sum;
+	idle = se->statistics->sum_sleep_runtime;
+	steal = se->statistics->wait_sum;
+	used = se->sum_exec_runtime;
+
+	if (idle > iowait)
+		idle -= iowait;
+	else
+		idle = 0;
+
+	if (se->statistics->sleep_start) {
+		delta = now - se->statistics->sleep_start;
+		if ((s64)delta > 0)
+			idle += delta;
+	} else if (se->statistics->block_start) {
+		delta = now - se->statistics->block_start;
+		if ((s64)delta > 0)
+			iowait += delta;
+	} else if (se->statistics->wait_start) {
+		delta = now - se->statistics->wait_start;
+		if ((s64)delta > 0)
+			steal += delta;
+	}
+
+	kcpustat->cpustat[CPUTIME_IDLE] =
+			max(kcpustat->cpustat[CPUTIME_IDLE],
+			    nsecs_to_cputime(idle));
+	kcpustat->cpustat[CPUTIME_IOWAIT] =
+			max(kcpustat->cpustat[CPUTIME_IOWAIT],
+			    nsecs_to_cputime(iowait));
+	kcpustat->cpustat[CPUTIME_STEAL] = nsecs_to_cputime(steal);
+	kcpustat->cpustat[CPUTIME_USED] = nsecs_to_cputime(used);
+#endif
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/* distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+	}
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage,
+			   max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = max_usage - target_usage;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	cur->cpustat[CPUTIME_USED] = target_usage;
+
+	/* do not show steal time inside ve */
+	cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
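+/*
+ * Fold per-physical-cpu cpuacct statistics into the group's nr_vcpus
+ * virtual cpus, capping each vcpu's usage at what it could have consumed
+ * given the group's cpu rate and redistributing the excess among the
+ * remaining vcpus.
+ */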
+static void cpu_cgroup_update_vcpustat(struct cgroup *cgrp)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 abs_delta_ns, max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	struct task_group *tg = cgroup_tg(cgrp);
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg_nr_cpus(tg);
+	vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/* on the first read initialize vcpu i stat as a sum of stats
+		 * over pcpus j such that j % nr_vcpus == i */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+						   cpuacct_cpustat(cgrp, j),
+						   tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	abs_delta_ns = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = nsecs_to_cputime(abs_delta_ns);
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+	/* don't update stats too often, to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy per cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(cpuacct_cpustat(cgrp, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating per vcpu delta */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+					   tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+								max_usage;
+		/*
+		 * On the first pass calculate delta for vcpus with usage >
+		 * max_usage in order to accumulate excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * vcpus so that it will be distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *cpuacct_cpustat(cgrp, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
+
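+/*
+ * Produce a /proc/stat-style report for the cgroup: per-(v)cpu times,
+ * context switches, boot time, fork count and run/blocked task counts.
+ */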
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *p)
+{
+	int i;
+	unsigned long jif;
+	u64 user, nice, system, idle, iowait, steal;
+	struct timespec boottime;
+	struct task_group *tg = cgroup_tg(cgrp);
+	bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+	int nr_vcpus = tg_nr_cpus(tg);
+	struct kernel_cpustat *kcpustat;
+	unsigned long tg_nr_running = 0;
+	unsigned long tg_nr_iowait = 0;
+	unsigned long long tg_nr_switches = 0;
+	unsigned long tg_nr_forks = 0;
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec + tg->start_time.tv_sec;
+
+	for_each_possible_cpu(i) {
+		cpu_cgroup_update_stat(cgrp, i);
+
+		/* root task group has autogrouping, so this doesn't hold */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		tg_nr_running += tg->cfs_rq[i]->nr_running;
+		tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
+		tg_nr_switches += tg->cfs_rq[i]->nr_switches;
+		tg_nr_forks += tg->cfs_rq[i]->nr_forks;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	if (virt)
+		cpu_cgroup_update_vcpustat(cgrp);
+
+	user = nice = system = idle = iowait = steal = 0;
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_possible(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i : cpuacct_cpustat(cgrp, i);
+		user += kcpustat->cpustat[CPUTIME_USER];
+		nice += kcpustat->cpustat[CPUTIME_NICE];
+		system += kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle += kcpustat->cpustat[CPUTIME_IDLE];
+		iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal += kcpustat->cpustat[CPUTIME_STEAL];
+	}
+
+	if (!ve_is_super(get_exec_env()))
+		steal = 0;
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 %llu\n",
+		(unsigned long long)cputime64_to_clock_t(user),
+		(unsigned long long)cputime64_to_clock_t(nice),
+		(unsigned long long)cputime64_to_clock_t(system),
+		(unsigned long long)cputime64_to_clock_t(idle),
+		(unsigned long long)cputime64_to_clock_t(iowait),
+		virt ? 0ULL :
+		(unsigned long long)cputime64_to_clock_t(steal));
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_online(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i : cpuacct_cpustat(cgrp, i);
+		user = kcpustat->cpustat[CPUTIME_USER];
+		nice = kcpustat->cpustat[CPUTIME_NICE];
+		system = kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle = kcpustat->cpustat[CPUTIME_IDLE];
+		iowait = kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal = kcpustat->cpustat[CPUTIME_STEAL];
+		if (!ve_is_super(get_exec_env()))
+			steal = 0;
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+			i,
+			(unsigned long long)cputime64_to_clock_t(user),
+			(unsigned long long)cputime64_to_clock_t(nice),
+			(unsigned long long)cputime64_to_clock_t(system),
+			(unsigned long long)cputime64_to_clock_t(idle),
+			(unsigned long long)cputime64_to_clock_t(iowait),
+			virt ? 0ULL :
+			(unsigned long long)cputime64_to_clock_t(steal));
+	}
+	seq_printf(p, "intr 0\nswap 0 0\n");
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		tg_nr_switches,
+		(unsigned long)jif,
+		tg_nr_forks,
+		tg_nr_running,
+		tg_nr_iowait);
+
+	return 0;
+}
+
+int cpu_cgroup_proc_loadavg(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *p)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long avnrun[3];
+	int nr_running = 0;
+	int i;
+
+	avnrun[0] = tg->avenrun[0] + FIXED_1/200;
+	avnrun[1] = tg->avenrun[1] + FIXED_1/200;
+	avnrun[2] = tg->avenrun[2] + FIXED_1/200;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		nr_running += tg->cfs_rq[i]->nr_running;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running, cgroup_task_count(cgrp),
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+int cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int nr_vcpus = tg_nr_cpus(tg);
+	int i;
+
+	kernel_cpustat_zero(kstat);
+
+	if (tg == &root_task_group)
+		return -ENOENT;
+
+	for_each_possible_cpu(i)
+		cpu_cgroup_update_stat(cgrp, i);
+
+	cpu_cgroup_update_vcpustat(cgrp);
+
+	for (i = 0; i < nr_vcpus; i++)
+		kernel_cpustat_add(tg->vcpustat + i, kstat, kstat);
+
+	return 0;
+}
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avnrun)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (tg == &root_task_group)
+		return -ENOSYS;
+
+	avnrun[0] = tg->avenrun[0];
+	avnrun[1] = tg->avenrun[1];
+	avnrun[2] = tg->avenrun[2];
+
+	return 0;
+}
+
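+/*
+ * Report aggregate delay-accounting statistics for the cgroup: stats
+ * accumulated from departed tasks (tg->taskstats) plus those of all
+ * tasks currently in the group.
+ */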
+static int cpu_cgroup_delay_show(struct cgroup *cgrp, struct cftype *cft,
+				 struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct taskstats stats;
+	struct cgroup_iter it;
+	struct task_struct *p;
+	int cpu;
+
+	memset(&stats, 0, sizeof stats);
+
+	for_each_present_cpu(cpu)
+		delayacct_add_stats(&stats, per_cpu_ptr(tg->taskstats, cpu));
+
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		if (thread_group_leader(p) && p->signal->stats)
+			delayacct_add_stats(&stats, p->signal->stats);
+		delayacct_add_tsk(&stats, p);
+	}
+	cgroup_iter_end(cgrp, &it);
+
+	cb->fill(cb, "cpu_count", stats.cpu_count);
+	cb->fill(cb, "cpu_delay", stats.cpu_delay_total);
+	cb->fill(cb, "cpu_run_real", stats.cpu_run_real_total);
+	cb->fill(cb, "cpu_run_virtual", stats.cpu_run_virtual_total);
+	cb->fill(cb, "cpu_scaled_run_real", stats.cpu_scaled_run_real_total);
+	cb->fill(cb, "blkio_count", stats.blkio_count);
+	cb->fill(cb, "blkio_delay", stats.blkio_delay_total);
+	cb->fill(cb, "swapin_count", stats.swapin_count);
+	cb->fill(cb, "swapin_delay", stats.swapin_delay_total);
+	cb->fill(cb, "freepages_count", stats.freepages_count);
+	cb->fill(cb, "freepages_delay", stats.freepages_delay_total);
+
+	return 0;
+}
+
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -9357,6 +10259,18 @@ static struct cftype cpu_files[] = {
 		.read_map = cpu_stats_show,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.name = "rate",
+		.read_u64 = cpu_rate_read_u64,
+		.write_u64 = cpu_rate_write_u64,
+	},
+	{
+		.name = "nr_cpus",
+		.read_u64 = nr_cpus_read_u64,
+		.write_u64 = nr_cpus_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9369,6 +10283,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+	{
+		.name = "proc.stat",
+		.read_seq_string = cpu_cgroup_proc_stat,
+	},
+	{
+		.name = "proc.loadavg",
+		.read_seq_string = cpu_cgroup_proc_loadavg,
+	},
+	{
+		.name = "delayacct.total",
+		.read_map = cpu_cgroup_delay_show,
+	},
 	{ }	/* terminate */
 };
 
@@ -9378,6 +10304,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.css_free	= cpu_cgroup_css_free,
 	.css_online	= cpu_cgroup_css_online,
 	.css_offline	= cpu_cgroup_css_offline,
+	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -286,6 +286,11 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
 	rcu_read_unlock();
 }
 
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgroup_ca(cgrp)->cpustat, cpu);
+}
+
 struct cgroup_subsys cpuacct_subsys = {
 	.name		= "cpuacct",
 	.css_alloc	= cpuacct_css_alloc,
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -3,6 +3,9 @@
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
 
+struct cgroup;
+extern struct kernel_cpustat *cpuacct_cpustat(struct cgroup *cgrp, int cpu);
+
 #else
 
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -657,6 +658,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -236,6 +236,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->throttle_count);
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+	SEQ_printf(m, "  .%-30s: %d\n", "nr_cpus_active",
+		   atomic_read(&cfs_rq->tg->nr_cpus_active));
+#endif
+
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -595,6 +600,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 		P(se.statistics->nr_migrations_cold);
 		P(se.statistics->nr_failed_migrations_affine);
 		P(se.statistics->nr_failed_migrations_running);
+		P(se.statistics->nr_failed_migrations_cpulimit);
 		P(se.statistics->nr_failed_migrations_hot);
 		P(se.statistics->nr_forced_migrations);
 		P(se.statistics->nr_wakeups);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -29,6 +29,8 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
 
 #include <trace/events/sched.h>
 
@@ -113,6 +115,10 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
+#endif
+
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -248,11 +254,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return p->se.cfs_rq;
-}
-
 /* runqueue on which this entity is (to be) queued */
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
@@ -320,16 +321,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 }
 
 /* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -343,8 +334,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -379,11 +370,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 #define for_each_sched_entity(se) \
 		for (; se; se = NULL)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return &task_rq(p)->cfs;
-}
-
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
 	struct task_struct *p = task_of(se);
@@ -427,6 +413,142 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct static_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+	return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+	static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+	static_key_slow_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+	return true;
+}
+
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
+#endif /* HAVE_JUMP_LABEL */
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return !list_empty(&cfs_rq->boosted_entities);
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return se->boosted;
+}
+
+#else /* !CONFIG_CFS_BANDWIDTH */
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#ifdef CONFIG_CFS_CPULIMIT
+static inline int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->active;
+}
+
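+/*
+ * Track on how many cpus a task group currently has runnable entities
+ * (tg->nr_cpus_active).  A decrement may be deferred by the active_timer
+ * for sysctl_sched_vcpu_hotslice ns so that a short sleep does not
+ * immediately give up the group's cpu slot.
+ */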
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+	/* if we canceled delayed dec, there is no need to do inc */
+	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+		atomic_inc(&cfs_rq->tg->nr_cpus_active);
+	cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+		postpone = 0;
+
+	if (!postpone) {
+		cfs_rq->active = 0;
+		atomic_dec(&cfs_rq->tg->nr_cpus_active);
+	} else {
+		hrtimer_start_range_ns(&cfs_rq->active_timer,
+				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+				HRTIMER_MODE_REL_PINNED);
+	}
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	struct cfs_rq *cfs_rq =
+		container_of(timer, struct cfs_rq, active_timer);
+	struct rq *rq = rq_of(cfs_rq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	cfs_rq->active = !list_empty(&cfs_rq->tasks);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+	return HRTIMER_NORESTART;
+}
+
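+/*
+ * Check how the task group's usage relates to its cpu limit:
+ *   1  - unlimited or below the limit, another cpu may become active
+ *   0  - at the limit, but @target_cpu is already active for the group
+ *  -1  - over the limit, or at the limit with @target_cpu inactive
+ */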
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+	int nr_cpus_limit = DIV_ROUND_UP(tg_cpu_rate(tg), MAX_CPU_RATE);
+	int nr_vcpus = tg_nr_cpus(tg);
+
+	nr_cpus_limit = nr_cpus_limit && nr_vcpus ?
+		min_t(int, nr_cpus_limit, nr_vcpus) :
+		max_t(int, nr_cpus_limit, nr_vcpus);
+
+	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+		return 1;
+
+	if (nr_cpus_active > nr_cpus_limit)
+		return -1;
+
+	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -727,6 +849,27 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+static void dequeue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
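+	/*
+	 * Record when the entity went to sleep or blocked.  For a group
+	 * entity, the child runqueue's iowait count decides whether it is
+	 * accounted as blocked (iowait) or merely sleeping.
+	 */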
+#ifdef CONFIG_SCHEDSTATS
+	if (entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+
+		if (tsk->state & TASK_INTERRUPTIBLE)
+			se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
+		if (tsk->state & TASK_UNINTERRUPTIBLE)
+			se->statistics->block_start = rq_clock(rq_of(cfs_rq));
+		if (tsk->in_iowait)
+			cfs_rq->nr_iowait++;
+	} else if (!cfs_rq_throttled(group_cfs_rq(se))) {
+		if (group_cfs_rq(se)->nr_iowait)
+			se->statistics->block_start = rq_clock(rq_of(cfs_rq));
+		else
+			se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
+	}
+#endif
+}
+
 #ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -734,6 +877,25 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->statistics->wait_start, rq_clock(rq_of(cfs_rq)));
 }
 
+static inline void update_sched_lat(struct task_struct *t, u64 now)
+{
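+	/*
+	 * Charge the time the task spent waiting on the runqueue (since
+	 * wait_start) to the global and per-VE scheduling latency stats.
+	 */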
+#ifdef CONFIG_VE
+	int cpu;
+	u64 ve_wstamp;
+
+	/* safe due to runqueue lock */
+	cpu = smp_processor_id();
+	ve_wstamp = t->se.statistics->wait_start;
+
+	if (ve_wstamp && now > ve_wstamp) {
+		KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
+				cpu, now - ve_wstamp);
+		KSTAT_LAT_PCPU_ADD(&t->task_ve->sched_lat_ve,
+				cpu, now - ve_wstamp);
+	}
+#endif
+}
+
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -743,8 +905,11 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->statistics->wait_sum, se->statistics->wait_sum +
 			rq_clock(rq_of(cfs_rq)) - se->statistics->wait_start);
 	if (entity_is_task(se)) {
-		trace_sched_stat_wait(task_of(se),
-			rq_clock(rq_of(cfs_rq)) - se->statistics->wait_start);
+		u64 now = rq_clock(rq_of(cfs_rq));
+		struct task_struct *p = task_of(se);
+
+		trace_sched_stat_wait(p, now - se->statistics->wait_start);
+		update_sched_lat(p, now);
 	}
 	schedstat_set(se->statistics->wait_start, 0);
 }
@@ -773,17 +938,8 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		update_stats_wait_end(cfs_rq, se);
 
-	if (flags & DEQUEUE_SLEEP) {
-		if (entity_is_task(se)) {
-			struct task_struct *tsk = task_of(se);
-
-			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
-			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->statistics->block_start = rq_clock(rq_of(cfs_rq));
-		}
-	}
-
+	if (flags & DEQUEUE_SLEEP)
+		dequeue_sleeper(cfs_rq, se);
 }
 #else
 static inline void
@@ -819,6 +975,106 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
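+/*
+ * A "boosted" entity may keep running briefly even after its group has used
+ * up its runtime: throttling is deferred while boosted entities are queued
+ * and pick_next_entity() prefers them, so a task preempted or woken in
+ * kernel mode can finish its work before the group is throttled.  A group
+ * entity stays boosted as long as it contains boosted entities.
+ */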
+static inline void update_entity_boost(struct sched_entity *se)
+{
+	if (!entity_is_task(se)) {
+		se->boosted = cfs_rq_has_boosted_entities(group_cfs_rq(se));
+		WARN_ON(se->boosted && cfs_rq_throttled(group_cfs_rq(se)));
+	} else {
+		struct task_struct *p = task_of(se);
+
+		if (unlikely(p != current))
+			return;
+
+		if (!(preempt_count() & PREEMPT_ACTIVE)) {
+			se->boosted = sched_feat(BOOST_WAKEUPS) &&
+					p->woken_while_running;
+			p->woken_while_running = 0;
+		} else
+			se->boosted = sched_feat(BOOST_PREEMPT) &&
+				      !p->may_throttle;
+	}
+}
+
+static int check_enqueue_boost(struct rq *rq, struct task_struct *p, int flags)
+{
+	if (sched_feat(BOOST_WAKEUPS) && (flags & ENQUEUE_WAKEUP))
+		p->se.boosted = 1;
+	return p->se.boosted;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_add(&se->boost_node, &cfs_rq->boosted_entities);
+}
+
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	if (WARN_ON(se->boost_node.next == LIST_POISON1))
+		return;
+	list_del(&se->boost_node);
+}
+
+static int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) || !entity_boosted(se)) {
+		if (se != cfs_rq->curr)
+			__enqueue_boosted_entity(cfs_rq, se);
+		se->boosted = 1;
+		WARN_ON(!entity_is_task(se) &&
+			cfs_rq_throttled(group_cfs_rq(se)));
+		return 1;
+	} else
+		WARN_ON(cfs_rq_throttled(group_cfs_rq(se)));
+
+	return 0;
+}
+
+static int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) ||
+	    !cfs_rq_has_boosted_entities(group_cfs_rq(se))) {
+		if (se != cfs_rq->curr)
+			__dequeue_boosted_entity(cfs_rq, se);
+		if (!entity_is_task(se))
+			se->boosted = 0;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline void update_entity_boost(struct sched_entity *se) {}
+
+static inline int check_enqueue_boost(struct rq *rq,
+				      struct task_struct *p, int flags)
+{
+	return 0;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+
+static inline int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+#endif
+
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -1133,8 +1389,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
 	if (env->best_task)
 		put_task_struct(env->best_task);
-	if (p)
-		get_task_struct(p);
 
 	env->best_task = p;
 	env->best_imp = imp;
@@ -1156,11 +1410,30 @@ static void task_numa_compare(struct task_numa_env *env,
 	long dst_load, src_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
+	bool assigned = false;
 
 	rcu_read_lock();
-	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	raw_spin_lock_irq(&dst_rq->lock);
+	cur = dst_rq->curr;
+	/*
+	 * No need to move the exiting task or idle task.
+	 */
+	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
 		cur = NULL;
+	else {
+		/*
+		 * The task_struct must be protected here to protect the
+		 * p->numa_faults access in the task_weight since the
+		 * numa_faults could already be freed in the following path:
+		 * finish_task_switch()
+		 *     --> put_task_struct()
+		 *         --> __put_task_struct()
+		 *             --> task_numa_free()
+		 */
+		get_task_struct(cur);
+	}
+
+	raw_spin_unlock_irq(&dst_rq->lock);
 
 	/*
 	 * "imp" is the fault differential for the source task between the
@@ -1249,9 +1522,16 @@ static void task_numa_compare(struct task_numa_env *env,
 		goto unlock;
 
 assign:
+	assigned = true;
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
+	/*
+	 * The dst_rq->curr isn't assigned. The protection for task_struct is
+	 * finished.
+	 */
+	if (cur && !assigned)
+		put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2054,6 +2334,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+		list_add(&se->cfs_rq_node, &cfs_rq->tasks);
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -2068,6 +2349,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+		list_del_init(&se->cfs_rq_node);
 	}
 	cfs_rq->nr_running--;
 }
@@ -2678,12 +2960,13 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			se->statistics->sleep_max = delta;
 
 		se->statistics->sleep_start = 0;
-		se->statistics->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
 			trace_sched_stat_sleep(tsk, delta);
 		}
+
+		se->statistics->sum_sleep_runtime += delta;
 	}
 	if (se->statistics->block_start) {
 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics->block_start;
@@ -2695,7 +2978,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			se->statistics->block_max = delta;
 
 		se->statistics->block_start = 0;
-		se->statistics->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			if (tsk->in_iowait) {
@@ -2717,11 +2999,42 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 						delta >> 20);
 			}
 			account_scheduler_latency(tsk, delta >> 10, 0);
-		}
+		} else
+			se->statistics->iowait_sum += delta;
+
+		se->statistics->sum_sleep_runtime += delta;
 	}
 #endif
 }
 
+void start_cfs_idle_time_accounting(int cpu)
+{
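+	/*
+	 * Begin idle-time accounting: every task group with no runnable
+	 * tasks on this cpu gets a sleep/block start stamp on its group
+	 * entity (see dequeue_sleeper()); stop_cfs_idle_time_accounting()
+	 * folds the accumulated time back in via enqueue_sleeper().
+	 */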
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			dequeue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
+
+void stop_cfs_idle_time_accounting(int cpu)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			enqueue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
+
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -2767,7 +3080,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags);
 
 static inline void check_schedstat_required(void)
 {
@@ -2792,6 +3105,9 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	if (!cfs_rq->load.weight)
+		inc_nr_active_cfs_rqs(cfs_rq);
+
 	/*
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through callig update_curr().
@@ -2824,7 +3140,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
-		check_enqueue_throttle(cfs_rq);
+		check_enqueue_throttle(cfs_rq, flags);
 	}
 }
 
@@ -2889,6 +3205,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (cfs_rq->prev == se)
+		cfs_rq->prev = NULL;
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
@@ -2902,11 +3221,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
-	/* return excess runtime on last dequeue */
-	return_cfs_rq_runtime(cfs_rq);
+	if (!cfs_rq->nr_running) {
+		/* return excess runtime on last dequeue */
+		return_cfs_rq_runtime(cfs_rq);
+		/* account switch to idle task */
+		cfs_rq->nr_switches++;
+	}
 
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
+
+	if (!cfs_rq->load.weight)
+		dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -2962,10 +3288,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		if (schedstat_enabled())
 			update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		if (entity_boosted(se))
+			__dequeue_boosted_entity(cfs_rq, se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
+	if (cfs_rq->prev != se)
+		cfs_rq->nr_switches++;
 #ifdef CONFIG_SCHEDSTATS
 	/*
 	 * Track our maximum slice length, if the CPU's load is at
@@ -3017,6 +3347,20 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 		se = cfs_rq->next;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	/*
+	 * Give boosted tasks a chance to finish their kernel-mode execution in
+	 * order to avoid prio inversion in case they hold a lock, but resched
+	 * them asap for the sake of fairness.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) {
+		if (cfs_rq_has_boosted_entities(cfs_rq))
+			se = list_first_entry(&cfs_rq->boosted_entities,
+					      struct sched_entity, boost_node);
+		rq_of(cfs_rq)->resched_next = 1;
+	}
+#endif
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -3033,6 +3377,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	update_entity_boost(prev);
+	if (entity_boosted(prev) && prev->on_rq) {
+		__enqueue_boosted_entity(cfs_rq, prev);
+		if (unlikely(cfs_rq_throttled(cfs_rq)))
+			/* prev was moved to throttled cfs_rq */
+			unthrottle_cfs_rq(cfs_rq);
+	}
+
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
@@ -3047,7 +3399,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_entity_load_avg(prev, 1);
-	}
+		cfs_rq->prev = prev;
+	} else
+		cfs_rq->prev = NULL;
 	cfs_rq->curr = NULL;
 }
 
@@ -3094,42 +3448,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 
 #ifdef CONFIG_CFS_BANDWIDTH
 
-#ifdef HAVE_JUMP_LABEL
-static struct static_key __cfs_bandwidth_used;
-
-static inline bool cfs_bandwidth_used(void)
-{
-	return static_key_false(&__cfs_bandwidth_used);
-}
-
-void cfs_bandwidth_usage_inc(void)
-{
-	static_key_slow_inc(&__cfs_bandwidth_used);
-}
-
-void cfs_bandwidth_usage_dec(void)
-{
-	static_key_slow_dec(&__cfs_bandwidth_used);
-}
-#else /* HAVE_JUMP_LABEL */
-static bool cfs_bandwidth_used(void)
-{
-	return true;
-}
-
-void cfs_bandwidth_usage_inc(void) {}
-void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
-	return 100000000ULL;
-}
-
 static inline u64 sched_cfs_bandwidth_slice(void)
 {
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
@@ -3275,11 +3593,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
-}
-
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -3348,6 +3661,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
+	cfs_rq->throttled = 1;
+
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -3366,7 +3681,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!se)
 		rq->nr_running -= task_delta;
 
-	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	/*
@@ -3660,11 +3974,36 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
  * runtime as update_curr() throttling can not not trigger until it's on-rq.
  */
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags)
 {
+	WARN_ON(cfs_rq_has_boosted_entities(cfs_rq));
+
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
+	if (flags & ENQUEUE_BOOST)
+		return;
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
@@ -3712,6 +4051,9 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_rq_throttled(cfs_rq))
 		return;
 
+	if (cfs_rq_has_boosted_entities(cfs_rq))
+		return;
+
 	throttle_cfs_rq(cfs_rq);
 }
 
@@ -3765,10 +4107,17 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer);
+
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+	INIT_LIST_HEAD(&cfs_rq->boosted_entities);
+#ifdef CONFIG_CFS_CPULIMIT
+	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 /* requires cfs_b->lock, may release to reprogram timer */
@@ -3827,15 +4176,10 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags) {}
 static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return 0;
-}
-
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -3924,11 +4268,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boost = check_enqueue_boost(rq, p, flags);
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
+		if (boost)
+			flags |= ENQUEUE_BOOST;
 		enqueue_entity(cfs_rq, se, flags);
 
 		/*
@@ -3941,6 +4288,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running++;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -3951,6 +4301,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -3958,6 +4311,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
+	} else if (boost) {
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			if (!enqueue_boosted_entity(cfs_rq, se)) {
+				WARN_ON(throttled_hierarchy(cfs_rq));
+				break;
+			}
+			if (cfs_rq_throttled(cfs_rq))
+				unthrottle_cfs_rq(cfs_rq);
+		}
 	}
 	hrtick_update(rq);
 }
@@ -3973,8 +4336,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boosted = entity_boosted(se);
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
+	if (task_sleep)
+		flags |= DEQUEUE_TASK_SLEEP;
+
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
@@ -3989,6 +4356,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running--;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -4008,8 +4378,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
 
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+			WARN_ON(boosted);
 			break;
+		}
+
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
 
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
@@ -4467,6 +4842,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	return target;
 }
 
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
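+	/*
+	 * If the task's topmost limited group has reached its cpu limit,
+	 * override the generic choice with a cpu where the group still has
+	 * an active cfs_rq (preferring the chosen cpu, then the previous
+	 * one, then any cpu in the sched domains).
+	 */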
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg;
+	struct sched_domain *sd;
+	int prev_cpu = task_cpu(p);
+	int cpu;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, *new_cpu) > 0)
+		return false;
+
+	if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+		return true;
+
+	if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+		*new_cpu = prev_cpu;
+		return true;
+	}
+
+	for_each_domain(*new_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				*new_cpu = cpu;
+				return true;
+			}
+		}
+	}
+#endif
+	return false;
+}
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -4514,9 +4921,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (affine_sd) {
 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-			prev_cpu = cpu;
+			new_cpu = cpu;
+		else
+			new_cpu = prev_cpu;
+	}
+
+	if (select_runnable_cpu(p, &new_cpu))
+		goto unlock;
 
-		new_cpu = select_idle_sibling(p, prev_cpu);
+	if (affine_sd) {
+		new_cpu = select_idle_sibling(p, new_cpu);
 		goto unlock;
 	}
 
@@ -4775,9 +5189,70 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
+	if (rq->resched_next && !entity_boosted(&p->se)) {
+		/*
+		 * The boosted tasks seem to have left the throttled cfs_rq,
+		 * so pick another task.
+		 */
+		resched_curr(rq);
+		rq->resched_next = 0;
+	}
+
 	return p;
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
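+/*
+ * Called when @p is descheduled.  If its topmost limited group occupies a
+ * cpu it should give up, kick cpulimit_balance_cpu_stop() to push the
+ * group's tasks from this cpu to one where the group is already active.
+ */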
+static inline void trigger_cpulimit_balance(struct task_struct *p)
+{
+	struct rq *this_rq;
+	struct task_group *tg;
+	int this_cpu, cpu, target_cpu = -1;
+	struct sched_domain *sd;
+
+	this_rq = rq_of(cfs_rq_of(&p->se));
+	this_cpu = cpu_of(this_rq);
+
+	if (!p->se.on_rq || this_rq->active_balance)
+		return;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, this_cpu) >= 0)
+		return;
+
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+		for_each_cpu_and(cpu, sched_domain_span(sd),
+				 tsk_cpus_allowed(p)) {
+			if (cpu != this_cpu &&
+			    cfs_rq_active(tg->cfs_rq[cpu])) {
+				target_cpu = cpu;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	if (target_cpu >= 0) {
+		this_rq->active_balance = 1;
+		this_rq->push_cpu = target_cpu;
+		raw_spin_unlock(&this_rq->lock);
+		stop_one_cpu_nowait(this_rq->cpu,
+				    cpulimit_balance_cpu_stop, this_rq,
+				    &this_rq->active_balance_work);
+		raw_spin_lock(&this_rq->lock);
+	}
+}
+#else
+static inline void trigger_cpulimit_balance(struct task_struct *p)
+{
+}
+#endif
+
 /*
  * Account for a descheduled task:
  */
@@ -5113,6 +5588,37 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
+static inline int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
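+	/*
+	 * Refuse to migrate the task to a cpu that its limited group is not
+	 * allowed to occupy; where possible, redirect the balancer to a
+	 * destination cpu where the group is already active (LBF_DST_PINNED).
+	 */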
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+		int cpu;
+
+		schedstat_inc(p, se.statistics->nr_failed_migrations_cpulimit);
+
+		env->flags |= LBF_SOME_PINNED;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+			return 0;
+
+		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+			return 0;
+
+		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				env->flags |= LBF_DST_PINNED;
+				env->new_dst_cpu = cpu;
+				break;
+			}
+		}
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -5120,6 +5626,10 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	if (!can_migrate_task_cpulimit(p, env))
+		return 0;
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5312,6 +5822,158 @@ static int move_tasks(struct lb_env *env)
 	return pulled;
 }
 
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+
+	list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+		p = task_of(se);
+		if (task_curr(p) ||
+		    !cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p)))
+			return 0;
+	}
+	env->flags &= ~LBF_ALL_PINNED;
+	return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se, *tmp;
+	int moved = 0;
+
+	list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+		move_task(task_of(se), env);
+		moved++;
+	}
+	return moved;
+}
+
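+/*
+ * Move whole task groups off the busiest runqueue: a group that is at its
+ * cpu limit cannot be rebalanced one task at a time, so load_balance()
+ * falls back to this when no individual task could be moved.
+ */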
+static int move_task_groups(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+	unsigned long load;
+	int cur_pulled, pulled = 0;
+
+	if (env->imbalance <= 0)
+		return 0;
+
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/*
+		 * A child always goes before its parent in a leaf_cfs_rq_list.
+		 * Therefore, if we encounter a cfs_rq that still has a child
+		 * cfs_rq, the child could not be migrated, so we should not
+		 * even try to migrate the parent.
+		 */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+			continue;
+
+		load = entity_h_load(tg->se[env->src_cpu]);
+		if ((load / 2) > env->imbalance)
+			continue;
+
+		if (!can_migrate_task_group(cfs_rq, env))
+			continue;
+
+		cur_pulled = move_task_group(cfs_rq, env);
+		pulled += cur_pulled;
+		env->imbalance -= load;
+
+		env->loop += cur_pulled;
+		if (env->loop > env->loop_max)
+			break;
+
+		if (env->imbalance <= 0)
+			break;
+	}
+	return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+	int pushed = 0;
+
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/* see move_task_groups for why we skip such groups */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+		    can_migrate_task_group(cfs_rq, env))
+			pushed += move_task_group(cfs_rq, env);
+	}
+	return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+	struct rq *rq = data;
+	int cpu = cpu_of(rq);
+	int target_cpu = rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct sched_domain *sd;
+
+	raw_spin_lock_irq(&rq->lock);
+
+	if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+		     !cpu_online(target_cpu)))
+		goto out_unlock;
+
+	if (unlikely(!rq->nr_running))
+		goto out_unlock;
+
+	BUG_ON(rq == target_rq);
+
+	double_lock_balance(rq, target_rq);
+	rcu_read_lock();
+	for_each_domain(target_cpu, sd) {
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+		    cpumask_test_cpu(cpu, sched_domain_span(sd)))
+				break;
+	}
+	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= cpu,
+			.src_rq		= rq,
+		};
+
+		schedstat_inc(sd, clb_count);
+
+		if (do_cpulimit_balance(&env))
+			schedstat_inc(sd, clb_pushed);
+		else
+			schedstat_inc(sd, clb_failed);
+	}
+	rcu_read_unlock();
+	double_unlock_balance(rq, target_rq);
+
+out_unlock:
+	rq->active_balance = 0;
+	raw_spin_unlock_irq(&rq->lock);
+	return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * update tg->load_weight by folding this cpu's load_avg
@@ -5408,12 +6070,12 @@ static void update_h_load(long cpu)
 	rcu_read_unlock();
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long entity_h_load(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	unsigned long load;
 
-	load = p->se.load.weight;
+	load = se->load.weight;
 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
 	return load;
@@ -5427,12 +6089,17 @@ static inline void update_h_load(long cpu)
 {
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long entity_h_load(struct sched_entity *se)
 {
-	return p->se.load.weight;
+	return se->load.weight;
 }
 #endif
 
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return entity_h_load(&p->se);
+}
+
 /********** Helpers for find_busiest_group ************************/
 
 enum group_type {
@@ -6188,7 +6855,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	/* How much load to actually move to equalise the imbalance */
 	env->imbalance = min(
 		max_pull * busiest->group_power,
-		(sds->avg_load - local->avg_load) * local->group_power
+		(busiest->avg_load - local->avg_load) * local->group_power
 	) / SCHED_POWER_SCALE;
 
 	/*
@@ -6265,13 +6932,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
-	/*
-	 * Don't pull any tasks if this group is already above the domain
-	 * average load.
-	 */
-	if (local->avg_load >= sds.avg_load)
-		goto out_balanced;
-
 	if (env->idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group is not overloaded
@@ -6519,6 +7179,17 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
 
+#ifdef CONFIG_CFS_CPULIMIT
+		if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+			env.loop = 0;
+			local_irq_save(flags);
+			double_rq_lock(env.dst_rq, busiest);
+			cur_ld_moved = ld_moved = move_task_groups(&env);
+			double_rq_unlock(env.dst_rq, busiest);
+			local_irq_restore(flags);
+		}
+#endif
+
 		/*
 		 * some other cpu did the load balance for us.
 		 */
@@ -6853,6 +7524,11 @@ static int active_load_balance_cpu_stop(void *data)
 	return 0;
 }
 
+static void pre_schedule_fair(struct rq *rq, struct task_struct *prev)
+{
+	trigger_cpulimit_balance(prev);
+}
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * idle load balancing details
@@ -7352,6 +8028,8 @@ static void task_fork_fair(struct task_struct *p)
 
 	se->vruntime -= cfs_rq->min_vruntime;
 
+	cfs_rq->nr_forks++;
+
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -7420,6 +8098,13 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Since the real-depth could have been changed (only FAIR
+	 * class maintains the depth value), reset depth properly.
+	 */
+	p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
+#endif
 	if (!p->se.on_rq)
 		return;
 
@@ -7455,6 +8140,7 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
+	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7513,6 +8199,12 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
 #endif
 	}
+
+	/*
+	 * Since the real-depth could have been changed (only FAIR
+	 * class maintains the depth value), reset depth properly.
+	 */
+	p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -7631,15 +8323,23 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cpu_online(cpu))
+		se->statistics->sleep_start = cpu_clock(cpu);
+#endif
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7708,6 +8408,69 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	return rr_interval;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
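+/*
+ * Propagate a task's iowait state into its cfs_rq so that the parent group
+ * entity's sleep time can be split between plain sleep and iowait; see
+ * dequeue_sleeper() and enqueue_sleeper().
+ */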
+static void nr_iowait_dec_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait--;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (!cfs_rq->nr_iowait && se && se->statistics->block_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->statistics->block_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->statistics->block_max))
+			se->statistics->block_max = delta;
+
+		se->statistics->block_start = 0;
+		se->statistics->sleep_start = rq->clock;
+
+		se->statistics->iowait_sum += delta;
+		se->statistics->sum_sleep_runtime += delta;
+	}
+#endif
+}
+
+static void nr_iowait_inc_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait++;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cfs_rq->nr_iowait && se && se->statistics->sleep_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->statistics->sleep_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->statistics->sleep_max))
+			se->statistics->sleep_max = delta;
+
+		se->statistics->sleep_start = 0;
+		se->statistics->block_start = rq->clock;
+
+		se->statistics->sum_sleep_runtime += delta;
+	}
+#endif
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -7732,6 +8495,7 @@ const struct sched_class fair_sched_class = {
 	.rq_offline		= rq_offline_fair,
 
 	.task_waking		= task_waking_fair,
+	.pre_schedule		= pre_schedule_fair,
 #endif
 
 	.set_curr_task          = set_curr_task_fair,
@@ -7748,6 +8512,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group	= task_move_group_fair,
+	.nr_iowait_inc		= nr_iowait_inc_fair,
+	.nr_iowait_dec		= nr_iowait_dec_fair,
 #endif
 };
 
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,3 +83,6 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+SCHED_FEAT(BOOST_WAKEUPS, true)
+SCHED_FEAT(BOOST_PREEMPT, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -52,7 +52,7 @@ extern __read_mostly int scheduler_running;
  * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  * increased costs.
  */
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
+#if BITS_PER_LONG > 32
 # define SCHED_LOAD_RESOLUTION	10
 # define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
 # define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
@@ -260,8 +260,26 @@ struct task_group {
 	struct autogroup *autogroup;
 #endif
 
+	struct taskstats __percpu *taskstats;
+	unsigned long avenrun[3];	/* loadavg data */
+	struct timespec start_time;
+
+	struct kernel_cpustat *cpustat_last;
+	struct kernel_cpustat *vcpustat;
+	ktime_t vcpustat_last_update;
+	spinlock_t vcpustat_lock;
+
 	struct cfs_bandwidth cfs_bandwidth;
 
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+	unsigned long cpu_rate;
+	unsigned int nr_cpus;
+	atomic_t nr_cpus_active;
+	struct task_group *topmost_limited_ancestor; /* self if none of the
+							ancestors is limited */
+#endif
+
 #if defined(CONFIG_FAIR_GROUP_SCHED)
 	/*
 	 * Put load_avg/runnable_avg in its own cacheline to avoid
@@ -348,6 +366,9 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned int nr_running, h_nr_running;
 
+	unsigned long nr_iowait;
+	unsigned long nr_unint;
+
 	u64 exec_clock;
 	u64 min_vruntime;
 #ifndef CONFIG_64BIT
@@ -357,11 +378,16 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 
+	struct list_head tasks;
+
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last, *skip;
+	struct sched_entity *curr, *next, *last, *skip, *prev;
+
+	u64 nr_switches;
+	unsigned long nr_forks;
 
 #ifdef	CONFIG_SCHED_DEBUG
 	unsigned int nr_spread_over;
@@ -421,9 +447,15 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
+
+	struct list_head boosted_entities;
 #endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+	int active;
+	struct hrtimer active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
 
@@ -570,7 +602,8 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
 #endif
-	int skip_clock_update;
+	signed char skip_clock_update;
+	unsigned char resched_next;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -592,6 +625,9 @@ struct rq {
 	struct list_head leaf_rt_rq_list;
 #endif
 
+	/* nr_running last seen in update_cpu_load() */
+	unsigned long nr_active;
+
 	/*
 	 * This is part of a global counter where only the total sum
 	 * over all CPUs matters. A task can increase this counter on
@@ -599,6 +635,10 @@ struct rq {
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
 	unsigned long nr_uninterruptible;
+	unsigned long nr_iothrottled;
+
+	unsigned long nr_sleeping;
+	unsigned long nr_stopped;
 
 	struct task_struct *curr, *idle, *stop;
 	unsigned long next_balance;
@@ -1166,8 +1206,10 @@ static const u32 prio_to_wmult[40] = {
 #define ENQUEUE_WAKING		0
 #endif
 #define ENQUEUE_REPLENISH	8
+#define ENQUEUE_BOOST		16
 
 #define DEQUEUE_SLEEP		1
+#define DEQUEUE_TASK_SLEEP	2
 
 struct sched_class {
 	const struct sched_class *next;
@@ -1218,6 +1260,8 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
+	void (*nr_iowait_inc) (struct task_struct *p);
+	void (*nr_iowait_dec) (struct task_struct *p);
 	RH_KABI_EXTEND(void (*update_curr) (struct rq *rq))
 	RH_KABI_EXTEND(void (*task_dead) (struct task_struct *p))
 };
@@ -1621,6 +1665,32 @@ extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
 extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+#else
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return &task_rq(p)->cfs;
+}
+#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
+extern void start_cfs_idle_time_accounting(int cpu);
+extern void stop_cfs_idle_time_accounting(int cpu);
+
 #ifdef CONFIG_NO_HZ_COMMON
 enum rq_nohz_flag_bits {
 	NOHZ_TICK_STOPPED,
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -66,8 +66,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_nobusyg[itype]);
 			}
 			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+				   " %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u\n",
 			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
+			    sd->clb_count, sd->clb_failed, sd->clb_pushed,
 			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
 			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -54,6 +54,9 @@
 struct seccomp_filter {
 	atomic_t usage;
 	struct seccomp_filter *prev;
+#ifdef CONFIG_VE
+	struct sock_fprog orig_prog;
+#endif
 	unsigned short len;  /* Instruction count */
 	struct sock_filter insns[];
 };
@@ -265,6 +268,16 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	if (copy_from_user(filter->insns, fprog->filter, fp_size))
 		goto fail;
 
+#ifdef CONFIG_VE
+	filter->orig_prog.len = fprog->len;
+	filter->orig_prog.filter = kmemdup(filter->insns, fp_size,
+					   GFP_KERNEL|__GFP_NOWARN);
+	if (!filter->orig_prog.filter) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+#endif
+
 	/* Check and rewrite the fprog via the skb checker */
 	ret = sk_chk_filter(filter->insns, filter->len);
 	if (ret)
@@ -283,6 +296,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	current->seccomp.filter = filter;
 	return 0;
 fail:
+#ifdef CONFIG_VE
+	kfree(filter->orig_prog.filter);
+#endif
 	kfree(filter);
 	return ret;
 }
@@ -332,6 +348,9 @@ void put_seccomp_filter(struct task_struct *tsk)
 	while (orig && atomic_dec_and_test(&orig->usage)) {
 		struct seccomp_filter *freeme = orig;
 		orig = orig->prev;
+#ifdef CONFIG_VE
+		kfree(freeme->orig_prog.filter);
+#endif
 		kfree(freeme);
 	}
 }
@@ -381,6 +400,10 @@ int __secure_computing(int this_syscall)
 	int *syscall;
 	u32 ret;
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return 0;
+
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
@@ -511,3 +534,71 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 out:
 	return ret;
 }
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
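+/*
+ * Dump @task's @filter_off-th seccomp filter program to @data, or just
+ * return its length in BPF instructions when @data is NULL.  The caller
+ * must have CAP_SYS_ADMIN and must not be confined by seccomp itself; this
+ * is used by checkpoint/restore.
+ */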
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+			void __user *data)
+{
+	struct seccomp_filter *filter;
+	long ret;
+	unsigned long count = 0;
+
+	if (!capable(CAP_SYS_ADMIN) ||
+	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+		return -EACCES;
+	}
+
+	spin_lock_irq(&task->sighand->siglock);
+	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	filter = task->seccomp.filter;
+	while (filter) {
+		filter = filter->prev;
+		count++;
+	}
+
+	if (filter_off >= count) {
+		ret = -ENOENT;
+		goto out;
+	}
+	count -= filter_off;
+
+	filter = task->seccomp.filter;
+	while (filter && count > 1) {
+		filter = filter->prev;
+		count--;
+	}
+
+	if (WARN_ON(count != 1 || !filter)) {
+		/* The filter tree shouldn't shrink while we're using it. */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = filter->len;
+	if (!data)
+		goto out;
+
+	get_seccomp_filter(task);
+	spin_unlock_irq(&task->sighand->siglock);
+
+#ifdef CONFIG_VE
+	if (copy_to_user(data, filter->orig_prog.filter,
+			 filter->orig_prog.len * sizeof(filter->orig_prog.filter[0])))
+		ret = -EFAULT;
+#else
+	if (copy_to_user(data, filter->insns, filter->len * sizeof(filter->insns[0])))
+		ret = -EFAULT;
+#endif
+
+	put_seccomp_filter(task);
+	return ret;
+
+out:
+	spin_unlock_irq(&task->sighand->siglock);
+	return ret;
+}
+#endif
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -34,7 +34,8 @@
 #include <linux/compat.h>
 #include <linux/cn_proc.h>
 #include <linux/compiler.h>
-
+#include <linux/interrupt.h>
+#include <linux/ve.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -43,6 +44,7 @@
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
 #include <asm/cacheflush.h>
+#include <bc/misc.h>
 #include "audit.h"	/* audit_signal_info() */
 
 /*
@@ -50,6 +52,7 @@
  */
 
 static struct kmem_cache *sigqueue_cachep;
+static inline int is_si_special(const struct siginfo *info);
 
 int print_fatal_signals __read_mostly;
 
@@ -376,6 +379,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	    atomic_read(&user->sigpending) <=
 			task_rlimit(t, RLIMIT_SIGPENDING)) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		if (q && ub_siginfo_charge(q, get_task_ub(t), flags)) {
+			kmem_cache_free(sigqueue_cachep, q);
+			q = NULL;
+		}
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -398,6 +405,7 @@ static void __sigqueue_free(struct sigqueue *q)
 		return;
 	atomic_dec(&q->user->sigpending);
 	free_uid(q->user);
+	ub_siginfo_uncharge(q);
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
@@ -583,7 +591,18 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			siginfo_t *info)
 {
-	int sig = next_signal(pending, mask);
+	int sig = 0;
+
+	/*
+	 * SIGKILL must have priority, otherwise it is quite easy to create
+	 * an unkillable process by sending itself a signal < SIGKILL.
+	 */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 
 	if (sig) {
 		if (current->notifier) {
@@ -662,7 +681,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		 */
 		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
-	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+	if (info->si_code == SI_TIMER && info->si_sys_private) {
 		/*
 		 * Release the siglock to ensure proper locking order
 		 * of timer locks outside of siglocks.  Note, we leave
@@ -3069,6 +3088,12 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
 		return -EPERM;
 
 	info->si_signo = sig;
+	/*
+	 * If this is a posix timer signal, prevent the dequeue path from
+	 * entering the timer rearming code.
+	 */
+	if (info->si_code == SI_TIMER)
+		info->si_sys_private = 0;
 
 	return do_send_specific(tgid, pid, sig, info);
 }
@@ -3099,6 +3124,11 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
 }
 #endif
 
+void __weak sigaction_compat_abi(struct k_sigaction *act,
+		struct k_sigaction *oact)
+{
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct task_struct *t = current;
@@ -3114,6 +3144,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 	if (oact)
 		*oact = *k;
 
+	sigaction_compat_abi(act, oact);
+
 	if (act) {
 		sigdelsetmask(&act->sa.sa_mask,
 			      sigmask(SIGKILL) | sigmask(SIGSTOP));
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,8 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 
+#include <bc/beancounter.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
@@ -202,8 +204,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
 #define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void __do_softirq(void)
+asmlinkage void __softirq_entry __do_softirq(void)
 {
+	struct user_beancounter *ub;
 	struct softirq_action *h;
 	__u32 pending;
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
@@ -233,6 +236,7 @@ asmlinkage void __do_softirq(void)
 
 	h = softirq_vec;
 
+	ub = set_exec_ub(get_ub0());
 	do {
 		if (pending & 1) {
 			unsigned int vec_nr = h - softirq_vec;
@@ -257,6 +261,7 @@ asmlinkage void __do_softirq(void)
 		h++;
 		pending >>= 1;
 	} while (pending);
+	(void)set_exec_ub(ub);
 
 	local_irq_disable();
 
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
+#include <linux/virtinfo.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
 #include <linux/highuid.h>
@@ -42,6 +43,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/ve.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -49,7 +51,6 @@
 #include <linux/user_namespace.h>
 #include <linux/binfmts.h>
 
-#include <linux/sched.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
@@ -129,6 +130,122 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
+DEFINE_SEMAPHORE(virtinfo_sem);
+EXPORT_SYMBOL(virtinfo_sem);
+static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
+
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+
+	for (p = &virtinfo_chain[type];
+	     *p != NULL && nb->priority < (*p)->priority;
+	     p = &(*p)->next);
+	nb->next = *p;
+	smp_wmb();
+	*p = nb;
+}
+
+EXPORT_SYMBOL(__virtinfo_notifier_register);
+
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	down(&virtinfo_sem);
+	__virtinfo_notifier_register(type, nb);
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_register);
+
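+/*
+ * Lockless read-side tracking for the notifier chains: a caller of
+ * virtinfo_notifier_call() bumps ->entry on the cpu it starts on and the
+ * matching ->exit[] slot on the cpu it finishes on, so that
+ * virtinfo_notifier_unregister() can wait until the counts match before
+ * the unlinked block may be freed by its owner.
+ */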
+struct virtinfo_cnt_struct {
+	volatile unsigned long exit[NR_CPUS];
+	volatile unsigned long entry;
+};
+static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
+
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+	int entry_cpu, exit_cpu;
+	unsigned long cnt, ent;
+
+	down(&virtinfo_sem);
+	for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
+	*p = nb->next;
+	smp_mb();
+
+	for_each_possible_cpu(entry_cpu) {
+		while (1) {
+			cnt = 0;
+			for_each_possible_cpu(exit_cpu)
+				cnt +=
+				    per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
+			smp_rmb();
+			ent = per_cpu(virtcnt, entry_cpu).entry;
+			if (cnt == ent)
+				break;
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	/* FIXME: replace virtinfo with srcu-notifier-chains */
+	rcu_barrier_sched();
+
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_unregister);
+
+static int do_virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	struct vnotifier_block *nb;
+
+	nb = virtinfo_chain[type];
+	ret = NOTIFY_DONE;
+	while (nb) {
+		ret = nb->notifier_call(nb, n, data, ret);
+		if (ret & NOTIFY_STOP_MASK) {
+			ret &= ~NOTIFY_STOP_MASK;
+			break;
+		}
+		nb = nb->next;
+	}
+
+	return ret;
+}
+
+int virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	int entry_cpu, exit_cpu;
+
+	entry_cpu = get_cpu();
+	per_cpu(virtcnt, entry_cpu).entry++;
+	smp_wmb();
+	put_cpu();
+
+	ret = do_virtinfo_notifier_call(type, n, data);
+
+	exit_cpu = get_cpu();
+	smp_wmb();
+	per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
+	put_cpu();
+
+	return ret;
+}
+EXPORT_SYMBOL(virtinfo_notifier_call);
+
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data)
+{
+	if (!in_interrupt())
+		return virtinfo_notifier_call(type, n, data);
+	return do_virtinfo_notifier_call(type, n, data);
+}
+EXPORT_SYMBOL(virtinfo_notifier_call_irq);
+
 /*
  * If set, this is used for preparing the system to power off.
  */
@@ -1109,7 +1226,7 @@ SYSCALL_DEFINE0(getppid)
 	int pid;
 
 	rcu_read_lock();
-	pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+	pid = ve_task_ppid_nr_ns(current, current->nsproxy->pid_ns);
 	rcu_read_unlock();
 
 	return pid;
@@ -1152,8 +1269,27 @@ void do_sys_times(struct tms *tms)
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
 
+#ifdef CONFIG_VE
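+/*
+ * Convert an absolute monotonic timespec into clock ticks elapsed since
+ * the current VE was started; times before the VE start yield 0.
+ */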
+unsigned long long ve_relative_clock(struct timespec * ts)
+{
+	unsigned long long offset = 0;
+
+	if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec ||
+	    (ts->tv_sec == get_exec_env()->start_timespec.tv_sec &&
+	     ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec))
+		offset = (unsigned long long)(ts->tv_sec -
+			get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC
+			+ ts->tv_nsec -	get_exec_env()->start_timespec.tv_nsec;
+	return nsec_to_clock_t(offset);
+}
+#endif
+
 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 {
+#ifdef CONFIG_VE
+	struct timespec now;
+#endif
+
 	if (tbuf) {
 		struct tms tmp;
 
@@ -1161,8 +1297,15 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
+#ifndef CONFIG_VE
 	force_successful_syscall_return();
 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
+#else
+	/* Compare to calculation in fs/proc/array.c */
+	do_posix_clock_monotonic_gettime(&now);
+	force_successful_syscall_return();
+	return ve_relative_clock(&now);
+#endif
 }
 
 /*
@@ -1347,6 +1490,7 @@ SYSCALL_DEFINE0(setsid)
 }
 
 DECLARE_RWSEM(uts_sem);
+EXPORT_SYMBOL_GPL(uts_sem);
 
 #ifdef COMPAT_UTS_MACHINE
 #define override_architecture(name) \
@@ -1636,7 +1780,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		/* Keep the capable check against init_user_ns until
 		   cgroups can contain all limits */
 		if (new_rlim->rlim_max > rlim->rlim_max &&
-				!capable(CAP_SYS_RESOURCE))
+				!ve_capable(CAP_SYS_RESOURCE))
 			retval = -EPERM;
 		if (!retval)
 			retval = security_task_setrlimit(tsk->group_leader,
@@ -1946,16 +2090,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 		fput(exe_file);
 	}
 
-	/*
-	 * The symlink can be changed only once, just to disallow arbitrary
-	 * transitions malicious software might bring in. This means one
-	 * could make a snapshot over all processes running and monitor
-	 * /proc/pid/exe changes to notice unusual activity if needed.
-	 */
-	err = -EPERM;
-	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
-		goto exit;
-
 	err = 0;
 	/* set the new file, lockless */
 	get_file(exe.file);
@@ -2049,15 +2183,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
 
 	/*
 	 * Finally, make sure the caller has the rights to
-	 * change /proc/pid/exe link: only local root should
+	 * change /proc/pid/exe link: only local sys admin should
 	 * be allowed to.
 	 */
 	if (prctl_map->exe_fd != (u32)-1) {
-		struct user_namespace *ns = current_user_ns();
-		const struct cred *cred = current_cred();
-
-		if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
-		    !gid_eq(cred->gid, make_kgid(ns, 0)))
+		if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 			goto out;
 	}
 
@@ -2199,12 +2329,12 @@ static int prctl_set_mm(int opt, unsigned long addr,
 		return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
 #endif
 
-	if (!capable(CAP_SYS_RESOURCE))
-		return -EPERM;
-
 	if (opt == PR_SET_MM_EXE_FILE)
 		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
+	if (!ve_capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (opt == PR_SET_MM_AUXV)
 		return prctl_set_auxv(mm, addr, arg4);
 
@@ -2636,18 +2766,34 @@ static int do_sysinfo(struct sysinfo *info)
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
 	struct timespec tp;
+	struct ve_struct *ve;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
+	si_meminfo(info);
+	si_swapinfo(info);
+
+#ifdef CONFIG_BEANCOUNTERS
+	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info)
+			& NOTIFY_FAIL)
+		return -ENOMSG;
+#endif
+	ve = get_exec_env();
+
 	get_monotonic_boottime(&tp);
 	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	if (ve_is_super(ve)) {
+		get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+
+		info->procs = nr_threads;
+	} else {
+		info->uptime -= ve->real_start_timespec.tv_sec;
 
-	info->procs = nr_threads;
+		info->procs = nr_threads_ve(ve);
 
-	si_meminfo(info);
-	si_swapinfo(info);
+		get_avenrun_ve(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	}
 
 	/*
 	 * If the sum of all the available memory (i.e. ram + swap)
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,6 +203,14 @@ cond_syscall(sys_userfaultfd);
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
 
+/* user-beancounters */
+cond_syscall(sys_getluid);
+cond_syscall(sys_setluid);
+cond_syscall(sys_setublimit);
+cond_syscall(sys_ubstat);
+cond_syscall(compat_sys_setublimit);
+cond_syscall(compat_sys_ubstat);
+
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
 cond_syscall(sys_fanotify_mark);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -91,6 +92,7 @@
 #ifdef CONFIG_LOCKUP_DETECTOR
 #include <linux/nmi.h>
 #endif
+extern unsigned relatime_interval; /* fs/inode.c */
 
 
 #if defined(CONFIG_SYSCTL)
@@ -101,10 +103,8 @@ extern int max_threads;
 extern int suid_dumpable;
 #ifdef CONFIG_COREDUMP
 extern int core_uses_pid;
-extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 #endif
-extern int pid_max;
 extern int pid_max_min, pid_max_max;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
@@ -114,6 +114,9 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 extern int sysctl_nr_trim_pages;
 #endif
 
+int ve_allow_module_load = 1;
+EXPORT_SYMBOL(ve_allow_module_load);
+
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
@@ -171,6 +174,17 @@ extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 #endif
 
+static bool virtual_ptr(void **ptr, void *base, size_t size, void *cur);
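+/*
+ * Generate a "<handler>_virtual" wrapper: if the table's .data points into
+ * the ve0 template, redirect it to the same field of the calling VE (see
+ * virtual_ptr() below) before invoking the real handler; otherwise the
+ * access is rejected with -EINVAL.
+ */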
+#define sysctl_virtual(sysctl)							\
+int sysctl ## _virtual(struct ctl_table *table, int write,			\
+		        void __user *buffer, size_t *lenp, loff_t *ppos)	\
+{										\
+	struct ctl_table tmp = *table;						\
+	if (virtual_ptr(&tmp.data, &ve0, sizeof(ve0), get_exec_env()))		\
+		return sysctl(&tmp, write, buffer, lenp, ppos);			\
+	return -EINVAL;								\
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -273,6 +287,20 @@ static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
 #endif
 
+static int proc_dointvec_pidmax(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp;
+
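+	/* pid_max is kept per pid namespace; operate on the caller's copy */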
+	tmp = *table;
+	tmp.data = &current->nsproxy->pid_ns->pid_max;
+	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_COREDUMP
+sysctl_virtual(proc_dostring_coredump);
+#endif
+
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "sched_child_runs_first",
@@ -458,6 +486,25 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.procname	= "sched_vcpu_hotslice",
+		.data		= &sysctl_sched_vcpu_hotslice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "sched_cpulimit_scale_cpufreq",
+		.data		= &sysctl_sched_cpulimit_scale_cpufreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -493,10 +540,10 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "core_pattern",
-		.data		= core_pattern,
+		.data		= ve0.core_pattern,
 		.maxlen		= CORENAME_MAX_SIZE,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring_coredump,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dostring_coredump_virtual,
 	},
 	{
 		.procname	= "core_pipe_limit",
@@ -666,8 +713,8 @@ static struct ctl_table kern_table[] = {
 		.procname	= "hotplug",
 		.data		= &uevent_helper,
 		.maxlen		= UEVENT_HELPER_PATH_LEN,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dostring_immutable,
 	},
 
 #ifdef CONFIG_CHR_DEV_SG
@@ -761,10 +808,10 @@ static struct ctl_table kern_table[] = {
 #endif
 	{
 		.procname	= "pid_max",
-		.data		= &pid_max,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_pidmax,
 		.extra1		= &pid_max_min,
 		.extra2		= &pid_max_max,
 	},
@@ -780,8 +827,8 @@ static struct ctl_table kern_table[] = {
 		.procname	= "printk",
 		.data		= &console_loglevel,
 		.maxlen		= 4*sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_immutable,
 	},
 	{
 		.procname	= "printk_ratelimit",
@@ -824,6 +871,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
+#endif
+#ifdef CONFIG_VE
+	{
+		.procname	= "ve_allow_module_load",
+		.data		= &ve_allow_module_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{
 		.procname	= "ngroups_max",
@@ -988,10 +1046,10 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_MMU)
 	{
 		.procname	= "randomize_va_space",
-		.data		= &randomize_va_space,
+		.data		= &ve0._randomize_va_space,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_virtual,
 	},
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
@@ -1198,6 +1256,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "oom_relaxation",
+		.data		= &sysctl_oom_relaxation,
+		.maxlen		= sizeof(sysctl_oom_relaxation),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
 	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
@@ -1269,6 +1334,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "dirtytime_expire_seconds",
+		.data		= &dirtytime_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
+		.mode		= 0644,
+		.proc_handler	= dirtytime_interval_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname       = "nr_pdflush_threads",
 		.mode           = 0444 /* read-only */,
@@ -1421,6 +1494,15 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "vfs_cache_min_ratio",
+		.data		= &sysctl_vfs_cache_min_ratio,
+		.maxlen		= sizeof(sysctl_vfs_cache_min_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.procname	= "legacy_va_layout",
@@ -1473,7 +1555,7 @@ static struct ctl_table vm_table[] = {
 		.procname	= "mmap_min_addr",
 		.data		= &dac_mmap_min_addr,
 		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= mmap_min_addr_handler,
 	},
 #endif
@@ -1563,6 +1645,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= (void *)&mmap_rnd_compat_bits_min,
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
+#endif
+#ifdef CONFIG_MEMCG
+	{
+		.procname	= "force_scan_thresh",
+		.data		= &sysctl_force_scan_thresh,
+		.maxlen		= sizeof(sysctl_force_scan_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 	{ }
 };
@@ -1574,17 +1667,24 @@ static struct ctl_table binfmt_misc_table[] = {
 #endif
 
 static struct ctl_table fs_table[] = {
+	{
+		.procname	= "relatime_interval",
+		.data		= &relatime_interval,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.procname	= "inode-nr",
 		.data		= &inodes_stat,
-		.maxlen		= 2*sizeof(int),
+		.maxlen		= 2*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_inodes,
 	},
 	{
 		.procname	= "inode-state",
 		.data		= &inodes_stat,
-		.maxlen		= 7*sizeof(int),
+		.maxlen		= 7*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_inodes,
 	},
@@ -1614,7 +1714,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "dentry-state",
 		.data		= &dentry_stat,
-		.maxlen		= 6*sizeof(int),
+		.maxlen		= 6*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_dentry,
 	},
@@ -1667,17 +1767,17 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_max_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
@@ -1729,6 +1829,13 @@ static struct ctl_table fs_table[] = {
 		.child		= binfmt_misc_table,
 	},
 #endif
+	{
+		.procname	= "odirect_enable",
+		.data		= &ve0.odirect_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_virtual,
+	},
 	{
 		.procname	= "pipe-max-size",
 		.data		= &pipe_max_size,
@@ -1755,8 +1862,8 @@ static struct ctl_table fs_table[] = {
 		.procname	= "may_detach_mounts",
 		.data		= &may_detach_mounts,
 		.maxlen		= sizeof(may_detach_mounts),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_minmax_immutable,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -2275,9 +2382,10 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 
 static void validate_coredump_safety(void)
 {
+	struct ve_struct *ve = get_exec_env();
 #ifdef CONFIG_COREDUMP
 	if (suid_dumpable == SUID_DUMP_ROOT &&
-	    core_pattern[0] != '/' && core_pattern[0] != '|') {
+	    ve->core_pattern[0] != '/' && ve->core_pattern[0] != '|') {
 		printk(KERN_WARNING "Unsafe core_pattern used with "\
 			"suid_dumpable=2. Pipe handler or fully qualified "\
 			"core dump path required.\n");
@@ -2751,6 +2859,50 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 	}
 }
 
+static bool virtual_ptr(void **ptr, void *base, size_t size, void *cur)
+{
+	unsigned long addr = (unsigned long)*ptr;
+	unsigned long base_addr = (unsigned long)base;
+
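+	/*
+	 * A pointer into the ve0 template is rebased to the same offset
+	 * inside 'cur', the calling VE's private copy.
+	 */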
+	if (addr >= base_addr && addr < base_addr + size) {
+		*ptr = (char *)cur + (addr - base_addr);
+		return true;
+	}
+	return false;
+}
+
+sysctl_virtual(proc_dointvec);
+sysctl_virtual(proc_doulongvec_minmax);
+
+static inline bool sysctl_in_container(void)
+{
+	return !ve_is_super(get_exec_env());
+}
+
+int proc_dointvec_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && sysctl_in_container())
+		return 0;
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+
+int proc_dostring_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && sysctl_in_container())
+		return 0;
+	return proc_dostring(table, write, buffer, lenp, ppos);
+}
+
+int proc_dointvec_minmax_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && sysctl_in_container())
+		return 0;
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
 #else /* CONFIG_PROC_SYSCTL */
 
 int proc_dostring(struct ctl_table *table, int write,
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -46,7 +46,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	return work;
 }
 
-void task_work_run(void)
+void __task_work_run(bool exiting)
 {
 	struct task_struct *task = current;
 	struct callback_head *work, *head, *next;
@@ -58,7 +58,7 @@ void task_work_run(void)
 		 */
 		do {
 			work = ACCESS_ONCE(task->task_works);
-			head = !work && (task->flags & PF_EXITING) ?
+			head = !work && exiting ?
 				&work_exited : NULL;
 		} while (cmpxchg(&task->task_works, work, head) != work);
 
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,6 +46,7 @@ static struct genl_family family = {
 	.name		= TASKSTATS_GENL_NAME,
 	.version	= TASKSTATS_GENL_VERSION,
 	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+	.netnsok	= true,
 };
 
 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
@@ -640,7 +641,7 @@ static const struct genl_ops taskstats_ops[] = {
 		.cmd		= TASKSTATS_CMD_GET,
 		.doit		= taskstats_user_cmd,
 		.policy		= taskstats_cmd_get_policy,
-		.flags		= GENL_ADMIN_PERM,
+		.flags		= GENL_VE_ADMIN_PERM,
 	},
 	{
 		.cmd		= CGROUPSTATS_CMD_GET,
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -229,32 +229,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
  *
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb();	<- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb();	<- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- *	seq = tkf->seq;
- *	smp_rmb();
- *	idx = seq & 0x01;
- *	now = now(tkf->base[idx]);
- *	smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
  *
  * So if a NMI hits the update of base[0] then it will use base[1]
  * which is still consistent. In the worst case this can result is a
@@ -317,7 +292,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	u64 now;
 
 	do {
-		seq = raw_read_seqcount(&tkf->seq);
+		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
 	} while (read_seqcount_retry(&tkf->seq, seq));
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -38,10 +38,11 @@
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
-#include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4313,6 +4313,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array *tr = iter->tr;
 	ssize_t sret;
+	size_t loops = 0;
+	enum print_line_t ret = 0;
 
 	/* return any leftover data */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -4363,7 +4365,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	trace_event_read_lock();
 	trace_access_lock(iter->cpu_file);
 	while (trace_find_next_entry_inc(iter) != NULL) {
-		enum print_line_t ret;
 		int len = iter->seq.len;
 
 		ret = print_trace_line(iter);
@@ -4398,8 +4399,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	 * If there was nothing to send to user, in spite of consuming trace
 	 * entries, go back to wait for more entries.
 	 */
-	if (sret == -EBUSY)
+	if (sret == -EBUSY) {
+		if ((loops % 10000) == 0) {
+			WARN_ON(1);
+			printk("%zuk loops in tracing_read_pipe\n", loops / 1000);
+			printk("trace_empty(iter): %d\n", trace_empty(iter));
+			printk("iter->seq.len    : %d\n", iter->seq.len);
+			printk("iter->seq.readpos: %d\n", iter->seq.readpos);
+			printk("iter->cpu_file   : %d\n", iter->cpu_file);
+			printk("iter->lost_events: %ld\n", iter->lost_events);
+			printk("ret              : %d\n", ret);
+			printk("cnt              : %ld\n", cnt);
+		}
 		goto waitagain;
+	}
 
 out:
 	mutex_unlock(&iter->mutex);
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1066,6 +1066,9 @@ static void parse_init(struct filter_parse_state *ps,
 
 static char infix_next(struct filter_parse_state *ps)
 {
+	if (!ps->infix.cnt)
+		return 0;
+
 	ps->infix.cnt--;
 
 	return ps->infix.string[ps->infix.tail++];
@@ -1081,6 +1084,9 @@ static char infix_peek(struct filter_parse_state *ps)
 
 static void infix_advance(struct filter_parse_state *ps)
 {
+	if (!ps->infix.cnt)
+		return;
+
 	ps->infix.cnt--;
 	ps->infix.tail++;
 }
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -241,11 +241,10 @@ static __init int user_namespace_sysctl_init(void)
 	 * properly.
 	 */
 	user_header = register_sysctl("user", empty);
+	kmemleak_ignore(user_header);
 	BUG_ON(!user_header);
 	BUG_ON(!setup_userns_sysctls(&init_user_ns));
 #endif
 	return 0;
 }
 subsys_initcall(user_namespace_sysctl_init);
-
-
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -66,14 +66,11 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  * when changing user ID's (ie setuid() and friends).
  */
 
-#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
-#define UIDHASH_SZ	(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))
+#define uidhashentry(ns, uid)  ((ns)->uidhash_table + __uidhashfn((__kuid_val(uid))))
 
 static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
 
 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -147,9 +144,10 @@ struct user_struct *find_user(kuid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
+	struct user_namespace *ns = current_user_ns();
 
 	spin_lock_irqsave(&uidhash_lock, flags);
-	ret = uid_hash_find(uid, uidhashentry(uid));
+	ret = uid_hash_find(uid, uidhashentry(ns, uid));
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	return ret;
 }
@@ -168,9 +166,9 @@ void free_uid(struct user_struct *up)
 		local_irq_restore(flags);
 }
 
-struct user_struct *alloc_uid(kuid_t uid)
+struct user_struct *alloc_uid_ns(struct user_namespace *ns, kuid_t uid)
 {
-	struct hlist_head *hashent = uidhashentry(uid);
+	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up, *new;
 
 	spin_lock_irq(&uidhash_lock);
@@ -208,6 +206,11 @@ struct user_struct *alloc_uid(kuid_t uid)
 	return NULL;
 }
 
+struct user_struct *alloc_uid(kuid_t uid)
+{
+	return alloc_uid_ns(current_user_ns(), uid);
+}
+
 static int __init uid_cache_init(void)
 {
 	int n;
@@ -216,11 +219,11 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_HLIST_HEAD(uidhash_table + n);
+		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
-	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
+	uid_hash_insert(&root_user, uidhashentry(&init_user_ns, GLOBAL_ROOT_UID));
 	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -71,6 +71,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
+	struct user_struct *new_user;
 	kuid_t owner = new->euid;
 	kgid_t group = new->egid;
 	struct ucounts *ucounts;
@@ -103,7 +104,7 @@ int create_user_ns(struct cred *new)
 	 * mount namespace which allows all files to be accessed.
 	 */
 	ret = -EPERM;
-	if (current_chrooted())
+	if (!IS_ENABLED(CONFIG_VE) && current_chrooted())
 		goto fail_dec;
 
 	/* The creator needs a mapping in the parent user namespace
@@ -124,6 +125,19 @@ int create_user_ns(struct cred *new)
 	if (ret)
 		goto fail_free;
 
+	for (i = 0; i < UIDHASH_SZ; ++i)
+		INIT_HLIST_HEAD(ns->uidhash_table + i);
+
+	new_user = alloc_uid_ns(ns, owner);
+	if (!new_user) {
+		proc_free_inum(ns->proc_inum);
+		kmem_cache_free(user_ns_cachep, ns);
+		return -ENOMEM;
+	}
+
+	free_uid(new->user);
+	new->user = new_user;
+
 	atomic_set(&ns->count, 1);
 	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
@@ -164,6 +178,7 @@ int create_user_ns(struct cred *new)
 fail:
 	return ret;
 }
+EXPORT_SYMBOL(create_user_ns);
 
 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
 {
@@ -1025,8 +1040,8 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
 	if (user_ns == current_user_ns())
 		return -EINVAL;
 
-	/* Threaded processes may not enter a different user namespace */
-	if (atomic_read(&current->mm->mm_users) > 1)
+	/* Tasks that share a thread group must share a user namespace */
+	if (!thread_group_empty(current))
 		return -EINVAL;
 
 	if (current->fs->users != 1)
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
@@ -32,8 +33,17 @@ static struct uts_namespace *create_uts_ns(void)
 	struct uts_namespace *uts_ns;
 
 	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
-	if (uts_ns)
+	if (uts_ns) {
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+		memset(&uts_ns->vdso, 0, sizeof(uts_ns->vdso));
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+		memset(&uts_ns->vdso32, 0, sizeof(uts_ns->vdso32));
+#endif
+#endif
 		kref_init(&uts_ns->kref);
+	}
 	return uts_ns;
 }
 
@@ -110,6 +120,25 @@ void free_uts_ns(struct kref *kref)
 	dec_uts_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	proc_free_inum(ns->proc_inum);
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+	if (ns->vdso.pages) {
+		int i;
+		vunmap(ns->vdso.addr);
+		for (i = 0; i < ns->vdso.nr_pages; i++)
+			put_page(ns->vdso.pages[i]);
+		kfree(ns->vdso.pages);
+	}
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	if (ns->vdso32.pages) {
+		int i;
+		for (i = 0; i < ns->vdso32.nr_pages; i++)
+			put_page(ns->vdso32.pages[i]);
+		kfree(ns->vdso32.pages);
+	}
+#endif
+#endif
 	kfree(ns);
 }
 
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/wait.h>
+#include <linux/stat.h>
 
 #ifdef CONFIG_PROC_SYSCTL
 
@@ -29,6 +30,14 @@ static void *get_uts(ctl_table *table, int write)
 		down_read(&uts_sem);
 	else
 		down_write(&uts_sem);
+
+	if (table->data == &virt_utsname.release) {
+		if (uts_ns == &init_uts_ns)
+			return virt_utsname.release;
+		else
+			return uts_ns->name.release;
+	}
+
 	return which;
 }
 
@@ -92,7 +101,7 @@ static struct ctl_table uts_kern_table[] = {
 		.procname	= "hostname",
 		.data		= init_uts_ns.name.nodename,
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_do_uts_string,
 		.poll		= &hostname_poll,
 	},
@@ -100,18 +109,20 @@ static struct ctl_table uts_kern_table[] = {
 		.procname	= "domainname",
 		.data		= init_uts_ns.name.domainname,
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_do_uts_string,
 		.poll		= &domainname_poll,
 	},
 	{}
 };
 
-static struct ctl_table uts_root_table[] = {
+static struct ctl_table uts_virt_osrelease_table[] = {
 	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= uts_kern_table,
+		.procname       = "virt_osrelease",
+		.data           = virt_utsname.release,
+		.maxlen         = sizeof(virt_utsname.release),
+		.mode           = 0644,
+		.proc_handler   = &proc_do_uts_string,
 	},
 	{}
 };
@@ -129,9 +140,15 @@ void uts_proc_notify(enum uts_proc proc)
 }
 #endif
 
+static struct ctl_path uts_path[] = {
+	{ .procname = "kernel", },
+	{ }
+};
+
 static int __init utsname_sysctl_init(void)
 {
-	register_sysctl_table(uts_root_table);
+	register_sysctl_paths(uts_path, uts_kern_table);
+	register_sysctl_paths(uts_path, uts_virt_osrelease_table);
 	return 0;
 }
 
--- /dev/null
+++ b/kernel/ve/Makefile
@@ -0,0 +1,26 @@
+#
+# kernel/ve/Makefile
+#
+# Copyright (c) 2000-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-$(CONFIG_VE) = ve.o veowner.o hooks.o vzstat_core.o ve-kobject.o
+obj-$(CONFIG_VZ_WDOG) += vzwdog.o
+obj-$(CONFIG_VE_CALLS) += vzmon.o
+
+vzmon-objs = vecalls.o
+
+obj-$(CONFIG_VZ_DEV) += vzdev.o
+obj-$(CONFIG_VZ_EVENT) += vzevent.o
+
+obj-$(CONFIG_VE_NETDEV_ACCOUNTING) += vznetstat/
+
+obj-$(CONFIG_VZ_LIST) += vzlist.o
+obj-$(CONFIG_VE_CALLS) += vzstat.o
+
+obj-$(CONFIG_VZ_IOLIMIT) += vziolimit.o
+
+obj-$(CONFIG_VE_IPTABLES) += ve.o
+
+obj-y += dummy/
--- /dev/null
+++ b/kernel/ve/dummy/Makefile
@@ -0,0 +1,10 @@
+#
+# kernel/ve/dummy/Makefile
+#
+# Copyright (c) 2000-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-m += ip6_vzprivnet.o
+obj-m += ip_vzprivnet.o
+obj-m += pio_nfs.o
--- /dev/null
+++ b/kernel/ve/dummy/ip6_vzprivnet.c
@@ -0,0 +1,22 @@
+/*
+ *  kernel/ve/dummy/ip6_vzprivnet.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
--- /dev/null
+++ b/kernel/ve/dummy/ip_vzprivnet.c
@@ -0,0 +1,22 @@
+/*
+ *  kernel/ve/dummy/ip_vzprivnet.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
--- /dev/null
+++ b/kernel/ve/dummy/pio_nfs.c
@@ -0,0 +1,23 @@
+/*
+ *  kernel/ve/dummy/pio_nfs.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/hooks.c
@@ -0,0 +1,111 @@
+/*
+ *  kernel/ve/hooks.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ve_proto.h>
+#include <linux/module.h>
+
+static struct list_head ve_hooks[VE_MAX_CHAINS];
+static DECLARE_RWSEM(ve_hook_sem);
+
+void ve_hook_register(int chain, struct ve_hook *vh)
+{
+	struct list_head *lh;
+	struct ve_hook *tmp;
+
+	BUG_ON(chain > VE_MAX_CHAINS);
+
+	down_write(&ve_hook_sem);
+	list_for_each(lh, &ve_hooks[chain]) {
+		tmp = list_entry(lh, struct ve_hook, list);
+		if (vh->priority < tmp->priority)
+			break;
+	}
+
+	list_add_tail(&vh->list, lh);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_register);
+
+void ve_hook_unregister(struct ve_hook *vh)
+{
+	down_write(&ve_hook_sem);
+	list_del(&vh->list);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_unregister);
+
+static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve)
+{
+	int err;
+
+	err = 0;
+	if (vh->init != NULL && try_module_get(vh->owner)) {
+		err = vh->init(ve);
+		module_put(vh->owner);
+	}
+	return err;
+}
+
+static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve)
+{
+	if (vh->fini != NULL && try_module_get(vh->owner)) {
+		vh->fini(ve);
+		module_put(vh->owner);
+	}
+}
+
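+/*
+ * Run init hooks in priority order; on failure, roll back the hooks that
+ * already ran by calling their fini callbacks in reverse order.
+ */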
+int ve_hook_iterate_init(int chain, void *ve)
+{
+	struct ve_hook *vh;
+	int err;
+
+	err = 0;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry(vh, &ve_hooks[chain], list)
+		if ((err = ve_hook_init(vh, ve)) < 0)
+			break;
+
+	if (err)
+		list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list)
+			ve_hook_fini(vh, ve);
+
+	up_read(&ve_hook_sem);
+	return err;
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_init);
+
+void ve_hook_iterate_fini(int chain, void *ve)
+{
+	struct ve_hook *vh;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry_reverse(vh, &ve_hooks[chain], list)
+		ve_hook_fini(vh, ve);
+	up_read(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_fini);
+
+static int __init ve_hooks_init(void)
+{
+	int i;
+
+	for (i = 0; i < VE_MAX_CHAINS; i++)
+		INIT_LIST_HEAD(&ve_hooks[i]);
+	return 0;
+}
+
+core_initcall(ve_hooks_init);
--- /dev/null
+++ b/kernel/ve/ve-kobject.c
@@ -0,0 +1,48 @@
+/*
+ *  kernel/ve/ve-kobject.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/kobject_ns.h>
+
+static const struct kobj_ns_type_operations *ve_child_ns_type(struct kobject *kobj)
+{
+	return &ve_ns_type_operations;
+}
+
+static void ve_kobj_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type ve_kobj_ktype = {
+	.release	= ve_kobj_release,
+	.sysfs_ops	= &kobj_sysfs_ops,
+	.child_ns_type	= ve_child_ns_type,
+};
+
+struct kobject *kobject_create_and_add_ve(const char *name, struct kobject *parent)
+{
+	struct kobject *kobj;
+	int retval;
+
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	if (!kobj)
+		return NULL;
+
+	kobject_init(kobj, &ve_kobj_ktype);
+
+	retval = kobject_add(kobj, parent, "%s", name);
+	if (retval) {
+		printk(KERN_WARNING "%s: kobject_add error: %d\n",
+		       __func__, retval);
+		kobject_put(kobj);
+		kobj = NULL;
+	}
+	return kobj;
+}
--- /dev/null
+++ b/kernel/ve/ve.c
@@ -0,0 +1,1579 @@
+/*
+ *  kernel/ve/ve.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * 've.c' helper file performing VE sub-system initialization
+ */
+
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/init.h>
+
+#include <linux/aio.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/kdev_t.h>
+#include <linux/termios.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/ve_proto.h>
+#include <linux/devpts_fs.h>
+#include <linux/user_namespace.h>
+#include <linux/init_task.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
+#include <linux/ctype.h>
+
+#include <uapi/linux/vzcalluser.h>
+#include <linux/venet.h>
+#include <linux/vziptable_defs.h>
+#include <net/rtnetlink.h>
+
+static struct kmem_cache *ve_cachep;
+
+unsigned long vz_rstamp = 0x37e0f59d;
+EXPORT_SYMBOL(vz_rstamp);
+
+#ifdef CONFIG_MODULES
+struct module no_module = { .state = MODULE_STATE_GOING };
+EXPORT_SYMBOL(no_module);
+#endif
+
+struct kmapset_set ve_sysfs_perms;
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, ve0_lat_stats);
+
+struct ve_struct ve0 = {
+	.ve_name		= "0",
+	.start_jiffies		= INITIAL_JIFFIES,
+	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
+	.ve_netns		= &init_net,
+	.is_running		= 1,
+	.is_pseudosuper		= 1,
+#ifdef CONFIG_VE_IPTABLES
+	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
+#endif
+	.features		= -1,
+	.fsync_enable		= FSYNC_FILTERED,
+	.meminfo_val		= VE_MEMINFO_SYSTEM,
+	._randomize_va_space	=
+#ifdef CONFIG_COMPAT_BRK
+					1,
+#else
+					2,
+#endif
+	.sched_lat_ve.cur	= &ve0_lat_stats,
+	.init_cred		= &init_cred,
+	.mnt_nr			= ATOMIC_INIT(0),
+	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
+	.netns_max_nr		= INT_MAX,
+	.netif_avail_nr		= ATOMIC_INIT(INT_MAX),
+	.netif_max_nr		= INT_MAX,
+};
+EXPORT_SYMBOL(ve0);
+
+LIST_HEAD(ve_list_head);
+DEFINE_MUTEX(ve_list_lock);
+
+int nr_ve = 1;	/* One VE always exists. Compatibility with vestat */
+EXPORT_SYMBOL(nr_ve);
+
+static DEFINE_IDR(ve_idr);
+
+struct ve_struct *get_ve(struct ve_struct *ve)
+{
+	if (ve)
+		css_get(&ve->css);
+	return ve;
+}
+EXPORT_SYMBOL(get_ve);
+
+void put_ve(struct ve_struct *ve)
+{
+	if (ve)
+		css_put(&ve->css);
+}
+EXPORT_SYMBOL(put_ve);
+
+struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
+
+	rcu_read_lock();
+	task = ve->ve_ns ? ve->init_task : &init_task;
+	while (true) {
+		css = task_subsys_state(task, subsys_id);
+		if (likely(css_tryget(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
+static int ve_list_add(struct ve_struct *ve)
+{
+	int err;
+
+	mutex_lock(&ve_list_lock);
+	err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
+	if (err < 0) {
+		if (err == -ENOSPC)
+			err = -EEXIST;
+		goto out;
+	}
+	list_add(&ve->ve_list, &ve_list_head);
+	nr_ve++;
+	err = 0;
+out:
+	mutex_unlock(&ve_list_lock);
+	return err;
+}
+
+static void ve_list_del(struct ve_struct *ve)
+{
+	mutex_lock(&ve_list_lock);
+	idr_remove(&ve_idr, ve->veid);
+	list_del_init(&ve->ve_list);
+	nr_ve--;
+	mutex_unlock(&ve_list_lock);
+}
+
+/* caller provides a reference to the ve struct */
+const char *ve_name(struct ve_struct *ve)
+{
+	return ve->ve_name;
+}
+EXPORT_SYMBOL(ve_name);
+
+/* under rcu_read_lock if task != current */
+const char *task_ve_name(struct task_struct *task)
+{
+	return rcu_dereference_check(task->task_ve, task == current)->ve_name;
+}
+EXPORT_SYMBOL(task_ve_name);
+
+struct ve_struct *get_ve_by_id(envid_t veid)
+{
+	struct ve_struct *ve;
+	rcu_read_lock();
+	ve = idr_find(&ve_idr, veid);
+	if (ve && !css_tryget(&ve->css))
+		ve = NULL;
+	rcu_read_unlock();
+	return ve;
+}
+EXPORT_SYMBOL(get_ve_by_id);
+
+EXPORT_SYMBOL(ve_list_lock);
+EXPORT_SYMBOL(ve_list_head);
+
+int vz_security_family_check(struct net *net, int family, int type)
+{
+	if (ve_is_super(net->owner_ve))
+		return 0;
+
+	switch (family) {
+	case PF_UNSPEC:
+	case PF_PACKET:
+	case PF_NETLINK:
+	case PF_UNIX:
+	case PF_INET:
+	case PF_INET6:
+	case PF_PPPOX:
+	case PF_KEY:
+		return 0;
+	case PF_BRIDGE:
+		if (type)
+			switch (type) {
+				case RTM_NEWNEIGH:
+				case RTM_DELNEIGH:
+				case RTM_GETNEIGH:
+					return 0;
+			}
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+EXPORT_SYMBOL_GPL(vz_security_family_check);
+
+int vz_security_protocol_check(struct net *net, int protocol)
+{
+	if (ve_is_super(net->owner_ve))
+		return 0;
+
+	switch (protocol) {
+	case  IPPROTO_IP:
+	case  IPPROTO_ICMP:
+	case  IPPROTO_TCP:
+	case  IPPROTO_UDP:
+	case  IPPROTO_RAW:
+	case  IPPROTO_DCCP:
+	case  IPPROTO_GRE:
+	case  IPPROTO_ESP:
+	case  IPPROTO_AH:
+	case  IPPROTO_SCTP:
+		return 0;
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
+/* Check if current user_ns is initial for current ve */
+bool current_user_ns_initial(void)
+{
+	struct ve_struct *ve = get_exec_env();
+	bool ret = false;
+
+	if (current_user_ns() == &init_user_ns)
+		return true;
+
+	rcu_read_lock();
+	if (ve->ve_ns && ve->init_cred->user_ns == current_user_ns())
+		ret = true;
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(current_user_ns_initial);
+
+struct user_namespace *ve_init_user_ns(void)
+{
+	struct cred *init_cred;
+
+	init_cred = get_exec_env()->init_cred;
+	return init_cred ? init_cred->user_ns : &init_user_ns;
+}
+EXPORT_SYMBOL(ve_init_user_ns);
+
+int ve_net_hide_sysctl(struct net *net)
+{
+	/*
+	 * This can happen only on VE creation, when the process that created
+	 * the VE cgroup clones a child with a new network namespace.
+	 */
+	if (net->owner_ve->init_cred == NULL)
+		return 0;
+
+	/*
+	 * Expose sysctl only for container's init user namespace
+	 */
+	return net->user_ns != net->owner_ve->init_cred->user_ns;
+}
+EXPORT_SYMBOL(ve_net_hide_sysctl);
+
+int nr_threads_ve(struct ve_struct *ve)
+{
+	return cgroup_task_count(ve->css.cgroup);
+}
+EXPORT_SYMBOL(nr_threads_ve);
+
+struct kthread_attach_work {
+	struct kthread_work work;
+	struct completion done;
+	struct task_struct *target;
+	int result;
+};
+
+static void kthread_attach_fn(struct kthread_work *w)
+{
+	struct kthread_attach_work *work = container_of(w,
+			struct kthread_attach_work, work);
+	struct task_struct *target = work->target;
+	struct cred *cred;
+	int err;
+
+	switch_task_namespaces(current, get_nsproxy(target->nsproxy));
+
+	err = unshare_fs_struct();
+	if (err)
+		goto out;
+	set_fs_root(current->fs, &target->fs->root);
+	set_fs_pwd(current->fs, &target->fs->root);
+
+	err = -ENOMEM;
+	cred = prepare_kernel_cred(target);
+	if (!cred)
+		goto out;
+	err = commit_creds(cred);
+	if (err)
+		goto out;
+
+	err = change_active_pid_ns(current, task_active_pid_ns(target));
+	if (err)
+		goto out;
+
+	err = cgroup_attach_task_all(target, current);
+	if (err)
+		goto out;
+out:
+	work->result = err;
+	complete(&work->done);
+}
+
+struct kthread_create_work {
+	struct kthread_work work;
+	struct kthread_create_info *info;
+};
+
+extern void create_kthread(struct kthread_create_info *create);
+
+static void kthread_create_fn(struct kthread_work *w)
+{
+	struct kthread_create_work *work = container_of(w,
+			struct kthread_create_work, work);
+
+	create_kthread(work->info);
+}
+
+static void kthread_create_queue(void *data, struct kthread_create_info *info)
+{
+	struct ve_struct *ve = data;
+	struct kthread_create_work create = {
+		KTHREAD_WORK_INIT(create.work, kthread_create_fn),
+		.info = info,
+	};
+	queue_kthread_work(&ve->ve_kthread_worker, &create.work);
+	wait_for_completion(&info->done);
+}
+
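+/*
+ * Kernel threads for a non-super VE are forked by the VE's own kthreadd
+ * worker (see ve_start_kthread()), so they inherit the container's
+ * namespaces, cgroups and credentials set up in kthread_attach_fn().
+ */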
+struct task_struct *kthread_create_on_node_ve(struct ve_struct *ve,
+					int (*threadfn)(void *data),
+					void *data, int node,
+					const char namefmt[], ...)
+{
+	va_list args;
+	struct task_struct *task;
+	void (*queue)(void *data, struct kthread_create_info *info) = NULL;
+
+	if (!ve_is_super(ve))
+		queue = kthread_create_queue;
+
+	va_start(args, namefmt);
+	task = __kthread_create_on_node(queue, ve, threadfn, data,
+					node, namefmt, args);
+	va_end(args);
+	return task;
+}
+EXPORT_SYMBOL(kthread_create_on_node_ve);
+
+static int ve_start_umh(struct ve_struct *ve)
+{
+	struct task_struct *t;
+
+	init_kthread_worker(&ve->ve_umh_worker);
+	t = kthread_run_ve(ve, kthread_worker_fn, &ve->ve_umh_worker,
+			"khelper");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	ve->ve_umh_task = t;
+	return 0;
+}
+
+static void ve_stop_umh(struct ve_struct *ve)
+{
+	flush_kthread_worker(&ve->ve_umh_worker);
+	kthread_stop(ve->ve_umh_task);
+	ve->ve_umh_task = NULL;
+}
+
+static int ve_start_kthread(struct ve_struct *ve)
+{
+	struct task_struct *t;
+	struct kthread_attach_work attach = {
+		KTHREAD_WORK_INIT(attach.work, kthread_attach_fn),
+		COMPLETION_INITIALIZER_ONSTACK(attach.done),
+		.target = current,
+	};
+
+	init_kthread_worker(&ve->ve_kthread_worker);
+	t = kthread_run(kthread_worker_fn, &ve->ve_kthread_worker,
+			"kthreadd/%s", ve_name(ve));
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	queue_kthread_work(&ve->ve_kthread_worker, &attach.work);
+	wait_for_completion(&attach.done);
+	if (attach.result) {
+		kthread_stop(t);
+		return attach.result;
+	}
+
+	ve->ve_kthread_task = t;
+	return 0;
+}
+
+static void ve_stop_kthread(struct ve_struct *ve)
+{
+	flush_kthread_worker(&ve->ve_kthread_worker);
+	kthread_stop(ve->ve_kthread_task);
+	ve->ve_kthread_task = NULL;
+}
+
+static void ve_grab_context(struct ve_struct *ve)
+{
+	struct task_struct *tsk = current;
+
+	get_task_struct(tsk);
+	ve->init_task = tsk;
+	ve->init_cred = (struct cred *)get_current_cred();
+	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
+	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
+	synchronize_rcu();
+}
+
+static void ve_drop_context(struct ve_struct *ve)
+{
+	struct nsproxy *ve_ns = ve->ve_ns;
+
+	put_net(ve->ve_netns);
+	ve->ve_netns = NULL;
+
+	/* ve_ns being set is what allows dereferencing init_cred and init_task */
+	rcu_assign_pointer(ve->ve_ns, NULL);
+	synchronize_rcu();
+	put_nsproxy(ve_ns);
+
+	ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
+
+	put_cred(ve->init_cred);
+	ve->init_cred = NULL;
+
+	put_task_struct(ve->init_task);
+	ve->init_task = NULL;
+
+}
+
+static const struct timespec zero_time = { };
+
+extern void cgroup_mark_ve_root(struct ve_struct *ve);
+
+/* under ve->op_sem write-lock */
+static int ve_start_container(struct ve_struct *ve)
+{
+	struct task_struct *tsk = current;
+	int err;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	if (ve->is_running || ve->ve_ns)
+		return -EBUSY;
+
+	if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
+		return -ECHILD;
+
+	/*
+	 * Set up uptime for new containers only. If the container is being
+	 * restored, the value is already non-zero here: it was set up via a
+	 * cgroup write while resuming the container.
+	 */
+	if (timespec_equal(&ve->start_timespec, &zero_time)) {
+		ve->start_timespec = tsk->start_time;
+		ve->real_start_timespec = tsk->real_start_time;
+	}
+
+	/* The value is wrong, but it is never compared to process
+	 * start times */
+	ve->start_jiffies = get_jiffies_64();
+
+	ve_grab_context(ve);
+
+	err = ve_list_add(ve);
+	if (err)
+		goto err_list;
+
+	err = ve_start_kthread(ve);
+	if (err)
+		goto err_kthread;
+
+	err = ve_start_umh(ve);
+	if (err)
+		goto err_umh;
+
+	err = ve_hook_iterate_init(VE_SS_CHAIN, ve);
+	if (err < 0)
+		goto err_iterate;
+
+	cgroup_mark_ve_root(ve);
+
+	ve->is_running = 1;
+
+	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
+
+	get_ve(ve); /* for ve_exit_ns() */
+
+	return 0;
+
+err_iterate:
+	ve_stop_umh(ve);
+err_umh:
+	ve_stop_kthread(ve);
+err_kthread:
+	ve_list_del(ve);
+err_list:
+	ve_drop_context(ve);
+	return err;
+}
+
+void ve_stop_ns(struct pid_namespace *pid_ns)
+{
+	struct ve_struct *ve = current->task_ve;
+
+	/*
+	 * current->cgroups already switched to init_css_set in cgroup_exit(),
+	 * but current->task_ve still points to our exec ve.
+	 */
+	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
+		return;
+
+	down_write(&ve->op_sem);
+	/*
+	 * Here the VE changes its state into "not running".
+	 * op_sem works as barrier for vzctl ioctls.
+	 * ve_mutex works as barrier for ve_can_attach().
+	 */
+	ve->is_running = 0;
+
+	/*
+	 * Neither it can be in pseudosuper state
+	 * anymore, setup it again if needed.
+	 */
+	ve->is_pseudosuper = 0;
+
+	ve_stop_umh(ve);
+	/*
+	 * Stop the kernel thread, or zap_pid_ns_processes() would wait for it forever.
+	 */
+	ve_stop_kthread(ve);
+	up_write(&ve->op_sem);
+}
+
+void ve_exit_ns(struct pid_namespace *pid_ns)
+{
+	struct ve_struct *ve = current->task_ve;
+
+	/*
+	 * current->cgroups already switched to init_css_set in cgroup_exit(),
+	 * but current->task_ve still points to our exec ve.
+	 */
+	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
+		return;
+
+	/*
+	 * At this point all userspace tasks in container are dead.
+	 */
+
+	if (ve->dev_sb) {
+		deactivate_super(ve->dev_sb);
+		ve->dev_sb = NULL;
+	}
+	if (ve->devpts_sb) {
+		deactivate_super(ve->devpts_sb);
+		ve->devpts_sb = NULL;
+	}
+
+	down_write(&ve->op_sem);
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
+
+	ve_list_del(ve);
+	ve_drop_context(ve);
+	up_write(&ve->op_sem);
+
+	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
+
+	put_ve(ve); /* from ve_start_container() */
+}
+
+#ifdef CONFIG_VE_IPTABLES
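+/*
+ * Userspace currently supplies only IPv4-related bits; derive the matching
+ * IPv6, NAT and conntrack bits from the IPv4 permissions.
+ */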
+static __u64 ve_setup_iptables_mask(__u64 init_mask)
+{
+	/* Remove this once userspace starts supplying IPv6-related bits. */
+	init_mask &= ~VE_IP_IPTABLES6;
+	init_mask &= ~VE_IP_FILTER6;
+	init_mask &= ~VE_IP_MANGLE6;
+	init_mask &= ~VE_IP_IPTABLE_NAT_MOD;
+	init_mask &= ~VE_NF_CONNTRACK_MOD;
+
+	if (mask_ipt_allow(init_mask, VE_IP_IPTABLES))
+		init_mask |= VE_IP_IPTABLES6;
+	if (mask_ipt_allow(init_mask, VE_IP_FILTER))
+		init_mask |= VE_IP_FILTER6;
+	if (mask_ipt_allow(init_mask, VE_IP_MANGLE))
+		init_mask |= VE_IP_MANGLE6;
+	if (mask_ipt_allow(init_mask, VE_IP_NAT))
+		init_mask |= VE_IP_IPTABLE_NAT;
+	if (mask_ipt_allow(init_mask, VE_IP_CONNTRACK))
+		init_mask |= VE_NF_CONNTRACK;
+
+	return init_mask;
+}
+#endif
+
+static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
+{
+	struct ve_struct *ve = &ve0;
+	int err;
+
+	if (!cg->parent)
+		goto do_init;
+
+	/* forbid nested containers */
+	if (cgroup_ve(cg->parent) != ve)
+		return ERR_PTR(-ENOTDIR);
+
+	err = -ENOMEM;
+	ve = kmem_cache_zalloc(ve_cachep, GFP_KERNEL);
+	if (!ve)
+		goto err_ve;
+
+	ve->ve_name = kstrdup(cg->dentry->d_name.name, GFP_KERNEL);
+	if (!ve->ve_name)
+		goto err_name;
+
+	ve->_randomize_va_space = ve0._randomize_va_space;
+
+	ve->features = VE_FEATURES_DEF;
+
+	ve->odirect_enable = 2;
+	ve->fsync_enable = 2;
+
+#ifdef CONFIG_VE_IPTABLES
+	ve->ipt_mask = ve_setup_iptables_mask(VE_IP_DEFAULT);
+#endif
+
+	ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct);
+	if (!ve->sched_lat_ve.cur)
+		goto err_lat;
+
+	err = ve_log_init(ve);
+	if (err)
+		goto err_log;
+
+	ve->meminfo_val = VE_MEMINFO_DEFAULT;
+
+	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
+	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
+
+	atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT);
+	ve->netif_max_nr = NETIF_MAX_NR_DEFAULT;
+
+do_init:
+	init_rwsem(&ve->op_sem);
+	INIT_LIST_HEAD(&ve->devices);
+	INIT_LIST_HEAD(&ve->ve_list);
+	INIT_LIST_HEAD(&ve->devmnt_list);
+	mutex_init(&ve->devmnt_mutex);
+	kmapset_init_key(&ve->ve_sysfs_perms);
+
+#ifdef CONFIG_AIO
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+	atomic_set(&ve->mnt_nr, 0);
+
+#ifdef CONFIG_COREDUMP
+	strcpy(ve->core_pattern, "core");
+#endif
+
+	return &ve->css;
+
+err_log:
+	free_percpu(ve->sched_lat_ve.cur);
+err_lat:
+	kfree(ve->ve_name);
+err_name:
+	kmem_cache_free(ve_cachep, ve);
+err_ve:
+	return ERR_PTR(err);
+}
+
+static void ve_devmnt_free(struct ve_devmnt *devmnt)
+{
+	if (!devmnt)
+		return;
+
+	kfree(devmnt->allowed_options);
+	kfree(devmnt->hidden_options);
+	kfree(devmnt);
+}
+
+static void free_ve_devmnts(struct ve_struct *ve)
+{
+	while (!list_empty(&ve->devmnt_list)) {
+		struct ve_devmnt *devmnt;
+
+		devmnt = list_first_entry(&ve->devmnt_list, struct ve_devmnt, link);
+		list_del(&devmnt->link);
+		ve_devmnt_free(devmnt);
+	}
+}
+
+static bool ve_task_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *task = cgroup_taskset_first(tset);
+
+	if (cgroup_taskset_size(tset) > 1) {
+		pr_err_ratelimited("ve_cgroup#%s: attach of a thread group is not supported\n",
+				cg->name->name);
+		return false;
+	}
+	if (!thread_group_leader(task)) {
+		pr_err_ratelimited("ve_cgroup#%s: only thread group leader is allowed to attach\n",
+				cg->name->name);
+		return false;
+	}
+	if (!thread_group_empty(task)) {
+		pr_err_ratelimited("ve_cgroup#%s: only single-threaded process is allowed to attach\n",
+				cg->name->name);
+		return false;
+	}
+	return true;
+}
+
+static int ve_is_attachable(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *task = cgroup_taskset_first(tset);
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (ve->is_running)
+		return 0;
+
+	if (!ve->veid) {
+		pr_err_ratelimited("ve_cgroup#%s: container's veid is not set\n",
+				cg->name->name);
+		return -EINVAL;
+	}
+
+	if (task->flags & PF_KTHREAD) {
+		/* Paranoia check: allow attaching a kthread only if the cgroup
+		 * is not empty.
+		 * This check is required for kthreadd, which is created on CT
+		 * start.
+		 */
+		if (nr_threads_ve(ve))
+			return 0;
+		pr_err_ratelimited("ve_cgroup#%s: can't attach kthread - empty group\n",
+				cg->name->name);
+	} else {
+		/* For a regular task, only one is allowed to enter a
+		 * non-running container: init.
+		 */
+		if (nr_threads_ve(ve) == 0)
+			return 0;
+		pr_err_ratelimited("ve_cgroup#%s: can't attach more than 1 task to "
+				"non-running container\n",
+				cg->name->name);
+	}
+	return -EINVAL;
+}
+
+static void ve_destroy(struct cgroup *cg)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	kmapset_unlink(&ve->ve_sysfs_perms, &ve_sysfs_perms);
+	free_ve_devmnts(ve);
+
+	ve_log_destroy(ve);
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	kfree(ve->binfmt_misc);
+#endif
+	free_percpu(ve->sched_lat_ve.cur);
+	kfree(ve->ve_name);
+	kmem_cache_free(ve_cachep, ve);
+}
+
+static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	if (!ve_task_can_attach(cg, tset))
+		return -EINVAL;
+
+	return ve_is_attachable(cg, tset);
+}
+
+static void ve_update_cpuid_faulting(void *dummy)
+{
+	set_cpuid_faulting(!ve_is_super(get_exec_env()));
+}
+
+static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, cg, tset) {
+		/* this prohibits ptracing of a task that entered the VE from the host system */
+		if (ve->is_running && task->mm)
+			task->mm->vps_dumpable = VD_VE_ENTER_TASK;
+
+		/* Drop OOM protection. */
+		task->signal->oom_score_adj = 0;
+		task->signal->oom_score_adj_min = 0;
+
+		/* Leave parent exec domain */
+		task->parent_exec_id--;
+
+		task->task_ve = ve;
+	}
+
+	/* Adjust cpuid faulting */
+	on_each_cpu(ve_update_cpuid_faulting, NULL, 1);
+}
+
+static int ve_state_read(struct cgroup *cg, struct cftype *cft,
+			 struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (ve->is_running)
+		seq_puts(m, "RUNNING");
+	else if (!ve->init_task)
+		seq_puts(m, "STOPPED");
+	else if (ve->ve_ns)
+		seq_puts(m, "STOPPING");
+	else
+		seq_puts(m, "STARTING");
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+struct ve_start_callback {
+	struct callback_head head;
+	struct ve_struct *ve;
+};
+
+static void ve_start_work(struct callback_head *head)
+{
+	struct ve_start_callback *work;
+	struct ve_struct *ve;
+	int ret;
+
+	work = container_of(head, struct ve_start_callback, head);
+	ve = work->ve;
+
+	down_write(&ve->op_sem);
+	ret = ve_start_container(ve);
+	up_write(&ve->op_sem);
+	put_ve(ve);
+	if (ret)
+		force_sig(SIGKILL, current);
+
+	kfree(work);
+}
+
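+/*
+ * Writing "START" starts the container synchronously in the caller's
+ * context; "START <pid>" queues ve_start_work() as task_work on the given
+ * task, so the container is started from that task's context instead.
+ */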
+static int ve_state_write(struct cgroup *cg, struct cftype *cft,
+			  const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct ve_start_callback *work = NULL;
+	struct task_struct *tsk;
+	int ret = -EINVAL;
+	pid_t pid;
+
+	if (!strcmp(buffer, "START")) {
+		down_write(&ve->op_sem);
+		ret = ve_start_container(ve);
+		up_write(&ve->op_sem);
+
+		return ret;
+	}
+
+	ret = sscanf(buffer, "START %d", &pid);
+	if (ret != 1)
+		return -EINVAL;
+
+	work = kmalloc(sizeof(struct ve_start_callback), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	tsk = find_task_by_vpid(pid);
+	if (!tsk) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	init_task_work(&work->head, ve_start_work);
+
+	work->ve = get_ve(ve);
+	ret = task_work_add(tsk, &work->head, 1);
+	if (ret)
+		put_ve(ve);
+
+out_unlock:
+	rcu_read_unlock();
+	if (ret)
+		kfree(work);
+
+	return ret;
+}
+
+static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
+{
+	return cgroup_ve(cg)->veid;
+}
+
+static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	int err = 0;
+
+	if (value <= 0 || value > INT_MAX)
+		return -EINVAL;
+
+	down_write(&ve->op_sem);
+	if (ve->is_running || ve->ve_ns) {
+		if (ve->veid != value)
+			err = -EBUSY;
+	} else
+		ve->veid = value;
+	up_write(&ve->op_sem);
+	return err;
+}
+
+static void *ve_mount_opts_start(struct seq_file *m, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct ve_devmnt *devmnt;
+	loff_t pos = *ppos;
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		if (!pos--)
+			return devmnt;
+	}
+	return NULL;
+}
+
+static void *ve_mount_opts_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct ve_devmnt *devmnt = v;
+
+	(*ppos)++;
+	if (list_is_last(&devmnt->link, &ve->devmnt_list))
+		return NULL;
+	return list_entry(devmnt->link.next, struct ve_devmnt, link);
+}
+
+static void ve_mount_opts_stop(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = m->private;
+
+	mutex_unlock(&ve->devmnt_mutex);
+}
+
+static int ve_mount_opts_show(struct seq_file *m, void *v)
+{
+	struct ve_devmnt *devmnt = v;
+	dev_t dev = devmnt->dev;
+
+	seq_printf(m, "0 %u:%u;", MAJOR(dev), MINOR(dev));
+	if (devmnt->hidden_options)
+		seq_printf(m, "1 %s;", devmnt->hidden_options);
+	if (devmnt->allowed_options)
+		seq_printf(m, "2 %s;", devmnt->allowed_options);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+struct seq_operations ve_mount_opts_sops = {
+	.start = ve_mount_opts_start,
+	.stop = ve_mount_opts_stop,
+	.next = ve_mount_opts_next,
+	.show = ve_mount_opts_show,
+};
+
+static int ve_mount_opts_open(struct inode *inode, struct file *file)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	struct seq_file *m;
+	int ret;
+
+	if (ve_is_super(ve))
+		return -ENODEV;
+
+	ret = seq_open(file, &ve_mount_opts_sops);
+	if (!ret) {
+		m = file->private_data;
+		m->private = ve;
+	}
+	return ret;
+}
+
+static ssize_t ve_mount_opts_read(struct cgroup *cgrp, struct cftype *cft,
+				  struct file *file, char __user *buf,
+				  size_t nbytes, loff_t *ppos)
+{
+	return seq_read(file, buf, nbytes, ppos);
+}
+
+static int ve_mount_opts_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+/*
+ * 'data' for VE_CONFIGURE_MOUNT_OPTIONS is a zero-terminated string
+ * consisting of substrings separated by MNTOPT_DELIM.
+ */
+#define MNTOPT_DELIM ';'
+#define MNTOPT_MAXLEN 256
+
+/*
+ * Each substring has the form of "<type> <comma-separated-list-of-options>"
+ * where types are:
+ */
+enum {
+	MNTOPT_DEVICE = 0,
+	MNTOPT_HIDDEN = 1,
+	MNTOPT_ALLOWED = 2,
+};
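+
+/*
+ * For example (values are illustrative only), writing
+ *
+ *	"0 8:1;1 barrier=1;2 noatime,nodiratime"
+ *
+ * to the ve.mount_opts file binds the entry to device 8:1 and stores
+ * "barrier=1" as the hidden options and "noatime,nodiratime" as the
+ * allowed options for that device.
+ */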
+
+/*
+ * 'ptr' points to the first character of buffer to parse
+ * 'endp' points to the last character of buffer to parse
+ */
+static int ve_parse_mount_options(const char *ptr, const char *endp,
+				  struct ve_devmnt *devmnt)
+{
+	while (*ptr) {
+		const char *delim = strchr(ptr, MNTOPT_DELIM) ? : endp;
+		char *space = strchr(ptr, ' ');
+		int type;
+		char *options, c, s;
+		int options_size = delim - space;
+		char **opts_pp = NULL; /* where to store 'options' */
+
+		if (delim == ptr || !space || options_size <= 1 ||
+		    !isdigit(*ptr) || space > delim)
+			return -EINVAL;
+
+		if (sscanf(ptr, "%d%c", &type, &c) != 2 || c != ' ')
+			return -EINVAL;
+
+		if (type == MNTOPT_DEVICE) {
+			unsigned major, minor;
+			if (devmnt->dev)
+				return -EINVAL; /* Already set */
+			if (sscanf(space + 1, "%u%c%u%c", &major, &c,
+							  &minor, &s) != 4 ||
+			    c != ':' || s != MNTOPT_DELIM)
+				return -EINVAL;
+			devmnt->dev = MKDEV(major, minor);
+			goto next;
+		}
+
+		options = kmalloc(options_size, GFP_KERNEL);
+		if (!options)
+			return -ENOMEM;
+
+		strncpy(options, space + 1, options_size - 1);
+		options[options_size - 1] = 0;
+
+		switch (type) {
+		case MNTOPT_ALLOWED:
+			opts_pp = &devmnt->allowed_options;
+			break;
+		case MNTOPT_HIDDEN:
+			opts_pp = &devmnt->hidden_options;
+			break;
+		}
+
+		/* wrong type or already set */
+		if (!opts_pp || *opts_pp) {
+			kfree(options);
+			return -EINVAL;
+		}
+
+		*opts_pp = options;
+next:
+		if (!*delim)
+			break;
+
+		ptr = delim + 1;
+	}
+
+	if (!devmnt->dev)
+		return -EINVAL;
+	return 0;
+}
+
+static int ve_mount_opts_write(struct cgroup *cg, struct cftype *cft,
+			       const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct ve_devmnt *devmnt, *old;
+	int size, err;
+
+	size = strlen(buffer);
+	if (size <= 1)
+		return -EINVAL;
+
+	devmnt = kzalloc(sizeof(*devmnt), GFP_KERNEL);
+	if (!devmnt)
+		return -ENOMEM;
+
+	err = ve_parse_mount_options(buffer, buffer + size, devmnt);
+	if (err) {
+		ve_devmnt_free(devmnt);
+		return err;
+	}
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(old, &ve->devmnt_list, link) {
+		/* Delete old devmnt */
+		if (old->dev == devmnt->dev) {
+			list_del(&old->link);
+			ve_devmnt_free(old);
+			break;
+		}
+	}
+	list_add(&devmnt->link, &ve->devmnt_list);
+	mutex_unlock(&ve->devmnt_mutex);
+
+	return 0;
+}
+
+static int ve_os_release_read(struct cgroup *cg, struct cftype *cft,
+			      struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	int ret = 0;
+
+	down_read(&ve->op_sem);
+
+	if (!ve->ve_ns) {
+		ret = -ENOENT;
+		goto up_opsem;
+	}
+
+	down_read(&uts_sem);
+	seq_puts(m, ve->ve_ns->uts_ns->name.release);
+	seq_putc(m, '\n');
+	up_read(&uts_sem);
+up_opsem:
+	up_read(&ve->op_sem);
+
+	return ret;
+}
+
+static int ve_os_release_write(struct cgroup *cg, struct cftype *cft,
+			       const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	char *release;
+	int ret = 0;
+
+	down_read(&ve->op_sem);
+
+	if (!ve->ve_ns) {
+		ret = -ENOENT;
+		goto up_opsem;
+	}
+
+	down_write(&uts_sem);
+	release = ve->ve_ns->uts_ns->name.release;
+	strncpy(release, buffer, __NEW_UTS_LEN);
+	release[__NEW_UTS_LEN] = '\0';
+	up_write(&uts_sem);
+up_opsem:
+	up_read(&ve->op_sem);
+
+	return ret;
+}
+
+enum {
+	VE_CF_STATE,
+	VE_CF_FEATURES,
+	VE_CF_IPTABLES_MASK,
+	VE_CF_PSEUDOSUPER,
+	VE_CF_CLOCK_MONOTONIC,
+	VE_CF_CLOCK_BOOTBASED,
+	VE_CF_AIO_MAX_NR,
+	VE_CF_PID_MAX,
+	VE_CF_NETNS_MAX_NR,
+	VE_CF_NETNS_NR,
+	VE_CF_NETIF_MAX_NR,
+	VE_CF_NETIF_NR,
+};
+
+static int ve_ts_read(struct cgroup *cg, struct cftype *cft, struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct timespec ts, *delta;
+
+	do_posix_clock_monotonic_gettime(&ts);
+	if (cft->private == VE_CF_CLOCK_MONOTONIC) {
+		delta = &ve->start_timespec;
+	} else if (cft->private == VE_CF_CLOCK_BOOTBASED) {
+		delta = &ve->real_start_timespec;
+		monotonic_to_bootbased(&ts);
+	} else {
+		delta = &ts;
+		memset(&ts, 0, sizeof(ts));
+		WARN_ON_ONCE(1);
+	}
+
+	set_normalized_timespec(&ts, ts.tv_sec - delta->tv_sec,
+				ts.tv_nsec - delta->tv_nsec);
+	seq_printf(m, "%ld %ld", ts.tv_sec, ts.tv_nsec);
+	return 0;
+}
+
+static int ve_ts_write(struct cgroup *cg, struct cftype *cft, const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct timespec ts, delta, *target;
+
+	if (sscanf(buffer, "%ld %ld", &delta.tv_sec, &delta.tv_nsec) != 2)
+		return -EINVAL;
+
+	do_posix_clock_monotonic_gettime(&ts);
+	if (cft->private == VE_CF_CLOCK_MONOTONIC) {
+		target = &ve->start_timespec;
+	} else if (cft->private == VE_CF_CLOCK_BOOTBASED) {
+		target = &ve->real_start_timespec;
+		monotonic_to_bootbased(&ts);
+	} else {
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	set_normalized_timespec(target, ts.tv_sec - delta.tv_sec,
+				ts.tv_nsec - delta.tv_nsec);
+	return 0;
+}
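+
+/*
+ * Example (values are illustrative): writing "3600 0" to
+ * ve.clock_monotonic rewinds ve->start_timespec so that a subsequent
+ * read reports roughly 3600 seconds and keeps counting, which is
+ * useful, for example, when restoring a container's uptime.  Both
+ * reads and writes use the "<seconds> <nanoseconds>" format.
+ */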
+
+static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
+{
+	if (cft->private == VE_CF_FEATURES)
+		return cgroup_ve(cg)->features;
+#ifdef CONFIG_VE_IPTABLES
+	else if (cft->private == VE_CF_IPTABLES_MASK)
+		return cgroup_ve(cg)->ipt_mask;
+#endif
+	else if (cft->private == VE_CF_PSEUDOSUPER)
+		return cgroup_ve(cg)->is_pseudosuper;
+	else if (cft->private == VE_CF_AIO_MAX_NR)
+		return cgroup_ve(cg)->aio_max_nr;
+	else if (cft->private == VE_CF_PID_MAX) {
+		struct ve_struct *ve = cgroup_ve(cg);
+		if (ve->ve_ns && ve->ve_ns->pid_ns)
+			return ve->ve_ns->pid_ns->pid_max;
+	} else if (cft->private == VE_CF_NETNS_MAX_NR)
+		return cgroup_ve(cg)->netns_max_nr;
+	else if (cft->private == VE_CF_NETNS_NR)
+		return atomic_read(&cgroup_ve(cg)->netns_avail_nr);
+	else if (cft->private == VE_CF_NETIF_MAX_NR)
+		return cgroup_ve(cg)->netif_max_nr;
+	else if (cft->private == VE_CF_NETIF_NR)
+		return atomic_read(&cgroup_ve(cg)->netif_avail_nr);
+	return 0;
+}
+
+/*
+ * Move the VE into the pseudosuper state, in which some privileged
+ * operations, such as mounting cgroups from inside the VE context,
+ * are allowed, e.g. for the sake of container restore.
+ *
+ * Dropping the pseudosuper privilege is allowed from any context,
+ * but setting it requires being a real owner of the node.
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+				struct cftype *cft,
+				u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!ve_is_super(get_exec_env()) && value)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (value && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+	ve->is_pseudosuper = value;
+	up_write(&ve->op_sem);
+
+	return 0;
+}
+
+extern int pid_max_min, pid_max_max;
+
+static int ve_write_pid_max(struct cgroup *cg,
+			    struct cftype *cft,
+			    u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	if (!ve->ve_ns || !ve->ve_ns->pid_ns)
+		return -EBUSY;
+
+	if (pid_max_min > value ||
+	     pid_max_max < value)
+		return -EINVAL;
+
+	ve->ve_ns->pid_ns->pid_max = value;
+	return 0;
+}
+
+static int _ve_write_u64(struct cgroup *cg, struct cftype *cft,
+                         u64 value, int running)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_is_super(get_exec_env()) &&
+	    !ve->is_pseudosuper)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (!running && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+
+	if (cft->private == VE_CF_FEATURES)
+		ve->features = value;
+#ifdef CONFIG_VE_IPTABLES
+	else if (cft->private == VE_CF_IPTABLES_MASK)
+		ve->ipt_mask = ve_setup_iptables_mask(value);
+#endif
+	else if (cft->private == VE_CF_AIO_MAX_NR)
+		ve->aio_max_nr = value;
+	else if (cft->private == VE_CF_PID_MAX) {
+		int ret;
+		ret = ve_write_pid_max(cg, cft, value);
+		up_write(&ve->op_sem);
+		return ret;
+	} else if (cft->private == VE_CF_NETNS_MAX_NR) {
+		int delta = value - ve->netns_max_nr;
+
+		ve->netns_max_nr = value;
+		atomic_add(delta, &ve->netns_avail_nr);
+	} else if (cft->private == VE_CF_NETIF_MAX_NR) {
+		int delta = value - ve->netif_max_nr;
+
+		ve->netif_max_nr = value;
+		atomic_add(delta, &ve->netif_avail_nr);
+	}
+	up_write(&ve->op_sem);
+	return 0;
+}
+
+static int ve_write_u64(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	return _ve_write_u64(cg, cft, value, 0);
+}
+
+static int ve_write_running_u64(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	return _ve_write_u64(cg, cft, value, 1);
+}
+
+static struct cftype ve_cftypes[] = {
+	{
+		.name			= "state",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_state_read,
+		.write_string		= ve_state_write,
+		.private		= VE_CF_STATE,
+	},
+	{
+		.name			= "veid",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_id_read,
+		.write_u64		= ve_id_write,
+	},
+	{
+		.name			= "features",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_FEATURES,
+	},
+	{
+		.name			= "mount_opts",
+		.max_write_len		= MNTOPT_MAXLEN,
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.open			= ve_mount_opts_open,
+		.read			= ve_mount_opts_read,
+		.release		= ve_mount_opts_release,
+		.write_string		= ve_mount_opts_write,
+	},
+	{
+		.name			= "os_release",
+		.max_write_len		= __NEW_UTS_LEN + 1,
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_os_release_read,
+		.write_string		= ve_os_release_write,
+	},
+	{
+		.name			= "iptables_mask",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_IPTABLES_MASK,
+	},
+	{
+		.name			= "pseudosuper",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_pseudosuper,
+		.private		= VE_CF_PSEUDOSUPER,
+	},
+	{
+		.name			= "clock_monotonic",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_ts_read,
+		.write_string		= ve_ts_write,
+		.private		= VE_CF_CLOCK_MONOTONIC,
+	},
+	{
+		.name			= "clock_bootbased",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_ts_read,
+		.write_string		= ve_ts_write,
+		.private		= VE_CF_CLOCK_BOOTBASED,
+	},
+	{
+		.name			= "aio_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_AIO_MAX_NR,
+	},
+	{
+		.name			= "pid_max",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_running_u64,
+		.private		= VE_CF_PID_MAX,
+	},
+	{
+		.name			= "netns_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_NETNS_MAX_NR,
+	},
+	{
+		.name			= "netns_avail_nr",
+		.read_u64		= ve_read_u64,
+		.private		= VE_CF_NETNS_NR,
+	},
+	{
+		.name			= "netif_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_NETIF_MAX_NR,
+	},
+	{
+		.name			= "netif_avail_nr",
+		.read_u64		= ve_read_u64,
+		.private		= VE_CF_NETIF_NR,
+	},
+	{ }
+};
+
+struct cgroup_subsys ve_subsys = {
+	.name		= "ve",
+	.subsys_id	= ve_subsys_id,
+	.css_alloc	= ve_create,
+	.css_free	= ve_destroy,
+	.can_attach	= ve_can_attach,
+	.attach		= ve_attach,
+	.base_cftypes	= ve_cftypes,
+};
+EXPORT_SYMBOL(ve_subsys);
+
+static int __init ve_subsys_init(void)
+{
+	ve_cachep = KMEM_CACHE(ve_struct, SLAB_PANIC);
+	list_add(&ve0.ve_list, &ve_list_head);
+	return 0;
+}
+late_initcall(ve_subsys_init);
+
+#ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+			 struct seq_file *p);
+
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_proc_stat(css->cgroup, NULL, p);
+	css_put(css);
+	return err;
+}
+
+int cpu_cgroup_proc_loadavg(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *p);
+
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_proc_loadavg(css->cgroup, NULL, p);
+	css_put(css);
+	return err;
+}
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avnrun);
+
+int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avnrun)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_get_avenrun(css->cgroup, avnrun);
+	css_put(css);
+	return err;
+}
+EXPORT_SYMBOL(ve_get_cpu_avenrun);
+
+int cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat);
+
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_get_stat(css->cgroup, kstat);
+	css_put(css);
+	return err;
+}
+EXPORT_SYMBOL(ve_get_cpu_stat);
+#endif /* CONFIG_CGROUP_SCHED */
--- /dev/null
+++ b/kernel/ve/vecalls.c
@@ -0,0 +1,763 @@
+/*
+ *  kernel/ve/vecalls.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * 'vecalls.c' is the file with basic VE support. It provides basic
+ * primitives along with the initialization code.
+ */
+
+#include <linux/ve.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/mount.h>
+#include <generated/utsrelease.h>
+
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/device_cgroup.h>
+
+static s64 ve_get_uptime(struct ve_struct *ve)
+{
+	struct timespec uptime;
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	uptime = timespec_sub(uptime, ve->real_start_timespec);
+	return timespec_to_ns(&uptime);
+}
+
+static int fill_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
+{
+	struct ve_struct *ve;
+	struct vz_cpu_stat *vstat;
+	int retval;
+	int i;
+	unsigned long tmp;
+	unsigned long avnrun[3];
+	struct kernel_cpustat kstat;
+
+	if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
+		return -EPERM;
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -ESRCH;
+
+	retval = -ENOMEM;
+	vstat = kzalloc(sizeof(*vstat), GFP_KERNEL);
+	if (!vstat)
+		goto out_put_ve;
+
+	retval = ve_get_cpu_stat(ve, &kstat);
+	if (retval)
+		goto out_free;
+
+	retval = ve_get_cpu_avenrun(ve, avnrun);
+	if (retval)
+		goto out_free;
+
+	vstat->user_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_USER]);
+	vstat->nice_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_NICE]);
+	vstat->system_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_SYSTEM]);
+	vstat->idle_clk += cputime_to_usecs(kstat.cpustat[CPUTIME_IDLE]) * NSEC_PER_USEC;
+
+	vstat->uptime_clk = ve_get_uptime(ve);
+
+	vstat->uptime_jif = (unsigned long)jiffies_64_to_clock_t(
+				get_jiffies_64() - ve->start_jiffies);
+	for (i = 0; i < 3; i++) {
+		tmp = avnrun[i] + (FIXED_1/200);
+		vstat->avenrun[i].val_int = LOAD_INT(tmp);
+		vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
+	}
+
+	retval = 0;
+	if (copy_to_user(buf, vstat, sizeof(*vstat)))
+		retval = -EFAULT;
+out_free:
+	kfree(vstat);
+out_put_ve:
+	put_ve(ve);
+	return retval;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * /proc/meminfo virtualization
+ *
+ **********************************************************************
+ **********************************************************************/
+static int ve_set_meminfo(envid_t veid, unsigned long val)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	struct ve_struct *ve;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	if (val == 0)
+		val = VE_MEMINFO_SYSTEM;
+	else if (val == 1)
+		val = VE_MEMINFO_DEFAULT;
+
+	ve->meminfo_val = val;
+	put_ve(ve);
+	return 0;
+#else
+	return -ENOTTY;
+#endif
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Pieces of VE network
+ *
+ **********************************************************************
+ **********************************************************************/
+
+#ifdef CONFIG_NET
+#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#endif
+
+static int ve_dev_add(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *dst_ve;
+	struct net *dst_net;
+	int err = -ESRCH;
+
+	dst_ve = get_ve_by_id(veid);
+	if (dst_ve == NULL)
+		goto out;
+
+	dst_net = dst_ve->ve_netns;
+
+	rtnl_lock();
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(&init_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = dev_change_net_namespace(dev, dst_net, dev_name);
+out_unlock:
+	rtnl_unlock();
+	put_ve(dst_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+static int ve_dev_del(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *src_ve;
+	struct net *src_net;
+	int err = -ESRCH;
+
+	src_ve = get_ve_by_id(veid);
+	if (src_ve == NULL)
+		goto out;
+
+	src_net = src_ve->ve_netns;
+
+	rtnl_lock();
+
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(src_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = dev_change_net_namespace(dev, &init_net, dev_name);
+out_unlock:
+	rtnl_unlock();
+	put_ve(src_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+int real_ve_dev_map(envid_t veid, int op, char *dev_name)
+{
+	if (!capable_setveid())
+		return -EPERM;
+	switch (op) {
+	case VE_NETDEV_ADD:
+		return ve_dev_add(veid, dev_name);
+	case VE_NETDEV_DEL:
+		return ve_dev_del(veid, dev_name);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE information via /proc
+ *
+ **********************************************************************
+ **********************************************************************/
+#ifdef CONFIG_PROC_FS
+#if BITS_PER_LONG == 32
+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
+#define VESTAT_LINE_FMT "%10s %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
+#else
+#define VESTAT_LINE_WIDTH (12 * 21)
+#define VESTAT_LINE_FMT "%20s %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
+#endif
+
+static int vestat_seq_show(struct seq_file *m, void *v)
+{
+	struct list_head *entry;
+	struct ve_struct *ve;
+	struct ve_struct *curve;
+	int ret;
+	unsigned long user_ve, nice_ve, system_ve;
+	unsigned long long uptime;
+	u64 uptime_cycles, idle_time, strv_time, used;
+	struct kernel_cpustat kstat;
+
+	entry = (struct list_head *)v;
+	ve = list_entry(entry, struct ve_struct, ve_list);
+
+	curve = get_exec_env();
+	if (entry == ve_list_head.next ||
+	    (!ve_is_super(curve) && ve == curve)) {
+		/* print header */
+		seq_printf(m, "%-*s\n",
+			VESTAT_LINE_WIDTH - 1,
+			"Version: 2.2");
+		seq_printf(m, VESTAT_HEAD_FMT, "VEID",
+					"user", "nice", "system",
+					"uptime", "idle",
+					"strv", "uptime", "used",
+					"maxlat", "totlat", "numsched");
+	}
+
+	if (ve == get_ve0())
+		return 0;
+
+	ret = ve_get_cpu_stat(ve, &kstat);
+	if (ret)
+		return ret;
+
+	strv_time = 0;
+	user_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_USER]);
+	nice_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_NICE]);
+	system_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_SYSTEM]);
+	used = cputime_to_usecs(kstat.cpustat[CPUTIME_USED]) * NSEC_PER_USEC;
+	idle_time = cputime_to_usecs(kstat.cpustat[CPUTIME_IDLE]) *
+							NSEC_PER_USEC;
+
+	uptime_cycles = ve_get_uptime(ve);
+	uptime = get_jiffies_64() - ve->start_jiffies;
+
+	seq_printf(m, VESTAT_LINE_FMT, ve_name(ve),
+				user_ve, nice_ve, system_ve,
+				(unsigned long long)uptime,
+				(unsigned long long)idle_time,
+				(unsigned long long)strv_time,
+				(unsigned long long)uptime_cycles,
+				(unsigned long long)used,
+				(unsigned long long)ve->sched_lat_ve.last.maxlat,
+				(unsigned long long)ve->sched_lat_ve.last.totlat,
+				ve->sched_lat_ve.last.count);
+	return 0;
+}
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ve_struct *curve;
+
+	curve = get_exec_env();
+	mutex_lock(&ve_list_lock);
+	if (!ve_is_super(curve)) {
+		if (*pos != 0)
+			return NULL;
+		return &curve->ve_list;
+	}
+
+	return seq_list_start(&ve_list_head, *pos);
+}
+EXPORT_SYMBOL(ve_seq_start);
+
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+	else
+		return seq_list_next(v, &ve_list_head, pos);
+}
+EXPORT_SYMBOL(ve_seq_next);
+
+void ve_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&ve_list_lock);
+}
+EXPORT_SYMBOL(ve_seq_stop);
+
+static struct seq_operations vestat_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= vestat_seq_show
+};
+
+static int vestat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vestat_seq_op);
+}
+
+static struct file_operations proc_vestat_operations = {
+	.open	 = vestat_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release
+};
+
+static int devperms_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
+
+	if (m->private == (void *)0) {
+		seq_printf(m, "Version: 2.7\n");
+		m->private = (void *)-1;
+	}
+
+	if (ve_is_super(ve))
+		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
+	else
+		devcgroup_seq_show_ve(ve, m);
+
+	return 0;
+}
+
+static struct seq_operations devperms_seq_op = {
+	.start  = ve_seq_start,
+	.next   = ve_seq_next,
+	.stop   = ve_seq_stop,
+	.show   = devperms_seq_show,
+};
+
+static int devperms_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &devperms_seq_op);
+}
+
+static struct file_operations proc_devperms_ops = {
+	.open           = devperms_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static int vz_version_show(struct seq_file *file, void* v)
+{
+	static const char ver[] = VZVERSION "\n";
+
+	return seq_puts(file, ver);
+}
+
+static int vz_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, vz_version_show, NULL);
+}
+
+static struct file_operations proc_vz_version_operations = {
+	.open    = vz_version_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+/* /proc/vz/veinfo */
+
+static ve_seq_print_t veaddr_seq_print_cb;
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, cb);
+}
+EXPORT_SYMBOL(vzmon_register_veaddr_print_cb);
+
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb);
+
+static int veinfo_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve;
+	ve_seq_print_t veaddr_seq_print;
+
+	ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
+
+	seq_printf(m, "%10s %5u %5u", ve_name(ve), ve->class_id, nr_threads_ve(ve));
+
+	rcu_read_lock();
+	veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb);
+	if (veaddr_seq_print)
+		veaddr_seq_print(m, ve);
+	rcu_read_unlock();
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static struct seq_operations veinfo_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= veinfo_seq_show,
+};
+
+static int veinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veinfo_seq_op);
+}
+
+static struct file_operations proc_veinfo_operations = {
+	.open		= veinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init init_vecalls_proc(void)
+{
+	struct proc_dir_entry *de;
+
+	de = proc_create("vestat", S_IFREG | S_IRUSR | S_ISVTX, proc_vz_dir,
+			&proc_vestat_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make vestat proc entry\n");
+
+	de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_devperms_ops);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
+
+	de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
+			&proc_vz_version_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make version proc entry\n");
+
+	de = proc_create("veinfo", S_IFREG | S_IRUSR | S_ISVTX, proc_vz_dir,
+			&proc_veinfo_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n");
+
+	return 0;
+}
+
+static void fini_vecalls_proc(void)
+{
+	remove_proc_entry("version", proc_vz_dir);
+	remove_proc_entry("devperms", proc_vz_dir);
+	remove_proc_entry("vestat", proc_vz_dir);
+	remove_proc_entry("veinfo", proc_vz_dir);
+}
+#else
+#define init_vecalls_proc()	(0)
+#define fini_vecalls_proc()	do { } while (0)
+#endif /* CONFIG_PROC_FS */
+
+static int init_ve_osrelease(struct ve_struct *ve, char *release)
+{
+	if (!release)
+		return -ENODATA;
+
+	if (strlen(release) >= sizeof(ve->ve_ns->uts_ns->name.release))
+		return -EMSGSIZE;
+
+	down_write(&uts_sem);
+	strcpy(ve->ve_ns->uts_ns->name.release, release);
+	up_write(&uts_sem);
+
+	return 0;
+}
+
+static int ve_configure(envid_t veid, unsigned int key,
+			unsigned int val, unsigned int size, char *data)
+{
+	struct ve_struct *ve;
+	int err = -ENOKEY;
+
+	if (key == VE_CONFIGURE_OPEN_TTY)
+		return vtty_open_master(veid, val);
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	switch(key) {
+	case VE_CONFIGURE_OS_RELEASE:
+		err = init_ve_osrelease(ve, data);
+		break;
+	}
+
+	put_ve(ve);
+	return err;
+}
+
+static int ve_configure_ioctl(struct vzctl_ve_configure *arg)
+{
+	int err;
+	struct vzctl_ve_configure s;
+	char *data = NULL;
+
+	err = -EFAULT;
+	if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+		goto out;
+	if (s.size) {
+		if (s.size > PAGE_SIZE)
+			return -EMSGSIZE;
+
+		data = kzalloc(s.size + 1, GFP_KERNEL);
+		if (unlikely(!data))
+			return -ENOMEM;
+
+		if (copy_from_user(data, (void __user *) &arg->data, s.size))
+			goto out;
+	}
+	err = ve_configure(s.veid, s.key, s.val, s.size, data);
+out:
+	kfree(data);
+	return err;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * User ctl
+ *
+ **********************************************************************
+ **********************************************************************/
+
+int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch(cmd) {
+	    case VZCTL_MARK_ENV_TO_DOWN: {
+		        /* Compatibility issue */
+		        err = 0;
+		}
+		break;
+#ifdef CONFIG_INET
+	    case VZCTL_VE_NETDEV: {
+			struct vzctl_ve_netdev d;
+			char *s;
+			err = -EFAULT;
+			if (copy_from_user(&d, (void __user *)arg, sizeof(d)))
+				break;
+			err = -ENOMEM;
+			s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
+			if (s == NULL)
+				break;
+			err = -EFAULT;
+			if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
+				s[IFNAMSIZ] = 0;
+				err = real_ve_dev_map(d.veid, d.op, s);
+			}
+			kfree(s);
+		}
+		break;
+#endif
+	    case VZCTL_ENV_CREATE: {
+			err = -ENOTSUPP;
+		}
+		break;
+	    case VZCTL_ENV_CREATE_DATA: {
+			err = -ENOTSUPP;
+		}
+		break;
+	    case VZCTL_GET_CPU_STAT: {
+			struct vzctl_cpustatctl s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = fill_cpu_stat(s.veid, s.cpustat);
+		}
+		break;
+	    case VZCTL_VE_MEMINFO: {
+			struct vzctl_ve_meminfo s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = ve_set_meminfo(s.veid, s.val);
+		}
+		break;
+	    case VZCTL_VE_CONFIGURE:
+		err = ve_configure_ioctl((struct vzctl_ve_configure *)arg);
+		break;
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_vzcalls_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+
+	switch(cmd) {
+	case VZCTL_GET_CPU_STAT: {
+		/* FIXME */
+	}
+	case VZCTL_COMPAT_ENV_CREATE_DATA: {
+		struct compat_vzctl_env_create_data cs;
+		struct vzctl_env_create_data __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.flags, &s->flags) ||
+		    put_user(cs.class_id, &s->class_id) ||
+		    put_user(compat_ptr(cs.data), &s->data) ||
+		    put_user(cs.datalen, &s->datalen))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA,
+						(unsigned long)s);
+		break;
+	}
+#ifdef CONFIG_NET
+	case VZCTL_COMPAT_VE_NETDEV: {
+		struct compat_vzctl_ve_netdev cs;
+		struct vzctl_ve_netdev __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.op, &s->op) ||
+		    put_user(compat_ptr(cs.dev_name), &s->dev_name))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s);
+		break;
+	}
+#endif
+	case VZCTL_COMPAT_VE_MEMINFO: {
+		struct compat_vzctl_ve_meminfo cs;
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		err = ve_set_meminfo(cs.veid, cs.val);
+		break;
+	}
+	default:
+		err = vzcalls_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo vzcalls = {
+	.type		= VZCTLTYPE,
+	.ioctl		= vzcalls_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_vzcalls_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Init/exit stuff
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static inline __init int init_vecalls_ioctls(void)
+{
+	vzioctl_register(&vzcalls);
+	return 0;
+}
+
+static inline void fini_vecalls_ioctls(void)
+{
+	vzioctl_unregister(&vzcalls);
+}
+
+static int __init vecalls_init(void)
+{
+	int err;
+
+	err = init_vecalls_proc();
+	if (err < 0)
+		goto out_proc;
+
+	err = init_vecalls_ioctls();
+	if (err < 0)
+		goto out_ioctls;
+
+	/*
+	 * This one can also be dereferenced, since a not yet freed
+	 * VE holds a reference on the module.
+	 */
+
+	return 0;
+
+out_ioctls:
+	fini_vecalls_proc();
+out_proc:
+	return err;
+}
+
+static void __exit vecalls_exit(void)
+{
+	fini_vecalls_ioctls();
+	fini_vecalls_proc();
+}
+
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Control");
+MODULE_LICENSE("GPL v2");
+
+module_init(vecalls_init)
+module_exit(vecalls_exit)
--- /dev/null
+++ b/kernel/ve/veowner.c
@@ -0,0 +1,133 @@
+/*
+ *  kernel/ve/veowner.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/ipc.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/inetdevice.h>
+#include <linux/pid_namespace.h>
+#include <linux/xattr.h>
+#include <asm/io.h>
+
+#include <net/tcp.h>
+
+/*
+ * ------------------------------------------------------------------------
+ * proc entries
+ * ------------------------------------------------------------------------
+ */
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_vz_dir;
+EXPORT_SYMBOL(proc_vz_dir);
+
+static int proc_fairsched_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static ssize_t proc_fairsched_read(struct file *file, char __user *buf,
+				   size_t size, loff_t *ppos)
+{
+	return 0;
+}
+
+static struct file_operations proc_fairsched_operations = {
+	.open		= proc_fairsched_open,
+	.read		= proc_fairsched_read,
+	.llseek		= noop_llseek,
+};
+
+static void prepare_proc(void)
+{
+	proc_vz_dir = proc_mkdir_mode("vz", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
+	if (!proc_vz_dir)
+		panic("Can't create /proc/vz dir\n");
+
+	/* Legacy files. They are not really needed and should be removed
+	 * sooner or later, but leave the stubs for now as they may be required
+	 * by userspace */
+
+	proc_mkdir_mode("fairsched", 0, proc_vz_dir);
+
+	proc_create("fairsched", S_ISVTX, NULL,	&proc_fairsched_operations);
+	proc_create("fairsched2", S_ISVTX, NULL, &proc_fairsched_operations);
+}
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * OpenVZ sysctl
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Operations on a large number of mount points can take a lot of time.
+ * These operations take the global namespace_sem lock, so they can affect
+ * other containers. Let us allow no more than sysctl_ve_mount_nr mount
+ * points per VE.
+ */
+unsigned int sysctl_ve_mount_nr = 4096;
+static int ve_mount_nr_min = 0;
+static int ve_mount_nr_max = INT_MAX;
+
+static struct ctl_table vz_fs_table[] = {
+	{
+		.procname	= "fsync-enable",
+		.data		= &ve0.fsync_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_virtual,
+	},
+	{
+		.procname       = "ve-mount-nr",
+		.data           = &sysctl_ve_mount_nr,
+		.maxlen         = sizeof(sysctl_ve_mount_nr),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1		= &ve_mount_nr_min,
+		.extra2		= &ve_mount_nr_max,
+	},
+	{ 0 }
+};
+
+static struct ctl_path fs_path[] = {
+	{ .procname = "fs", },
+	{ }
+};
+
+static void prepare_sysctl(void)
+{
+	register_sysctl_paths(fs_path, vz_fs_table);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * XXX init_ve_system
+ * ------------------------------------------------------------------------
+ */
+
+void init_ve_system(void)
+{
+#ifdef CONFIG_PROC_FS
+	prepare_proc();
+#endif
+	prepare_sysctl();
+
+	kobj_ns_type_register(&ve_ns_type_operations);
+}
--- /dev/null
+++ b/kernel/ve/vzdev.c
@@ -0,0 +1,151 @@
+/*
+ *  kernel/ve/vzdev.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vzctl.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <uapi/linux/vzcalluser.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <linux/device.h>
+
+#define VZCTL_MAJOR 126
+#define VZCTL_NAME "vzctl"
+
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Interface");
+MODULE_LICENSE("GPL v2");
+
+static LIST_HEAD(ioctls);
+static DEFINE_SPINLOCK(ioctl_lock);
+
+static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd)
+{
+	struct vzioctlinfo *h;
+
+	spin_lock(&ioctl_lock);
+	list_for_each_entry(h, &ioctls, list) {
+		if (h->type == _IOC_TYPE(cmd))
+			goto found;
+	}
+	h = NULL;
+found:
+	if (h && !try_module_get(h->owner))
+		h = NULL;
+	spin_unlock(&ioctl_lock);
+	return h;
+}
+
+static void vzctl_put_handler(struct vzioctlinfo *h)
+{
+	if (!h)
+		return;
+
+	module_put(h->owner);
+}
+
+long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOTTY;
+	h = vzctl_get_handler(cmd);
+	if (h && h->ioctl)
+		err = (*h->ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOIOCTLCMD;
+	h = vzctl_get_handler(cmd);
+	if (h && h->compat_ioctl)
+		err = (*h->compat_ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+void vzioctl_register(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_add(&inf->list, &ioctls);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_register);
+
+void vzioctl_unregister(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_del_init(&inf->list);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_unregister);
+
+/*
+ * Init/exit stuff.
+ */
+static struct file_operations vzctl_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= vzctl_ioctl,
+	.compat_ioctl	= compat_vzctl_ioctl,
+};
+
+static struct class *vzctl_class;
+
+static void __exit vzctl_exit(void)
+{
+	device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
+	class_destroy(vzctl_class);
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+}
+
+static int __init vzctl_init(void)
+{
+	int ret;
+	struct device *class_err;
+
+	ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
+	if (ret < 0)
+		goto out;
+
+	vzctl_class = class_create(THIS_MODULE, "vzctl");
+	if (IS_ERR(vzctl_class)) {
+		ret = PTR_ERR(vzctl_class);
+		goto out_cleandev;
+	}
+
+	class_err = device_create(vzctl_class, NULL,
+			MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME);
+	if (IS_ERR(class_err)) {
+		ret = PTR_ERR(class_err);
+		goto out_rmclass;
+	}
+
+	goto out;
+
+out_rmclass:
+	class_destroy(vzctl_class);
+out_cleandev:
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+out:
+	return ret;
+}
+
+module_init(vzctl_init)
+module_exit(vzctl_exit);
--- /dev/null
+++ b/kernel/ve/vzevent.c
@@ -0,0 +1,144 @@
+/*
+ *  kernel/ve/vzevent.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzevent.h>
+#include <linux/pid_namespace.h>
+
+#define NETLINK_UEVENT	31
+#define VZ_EVGRP_ALL	0x01
+
+static int reboot_event;
+module_param(reboot_event, int, 0644);
+MODULE_PARM_DESC(reboot_event, "Enable reboot events");
+
+/*
+ * NOTE: the original idea was to send events via kobject_uevent(),
+ * however, it turns out that this has negative consequences, like
+ * starting /sbin/hotplug, which tries to react to our events in an
+ * inadequate manner.
+ */
+
+static struct sock *vzev_sock;
+
+static char *action_to_string(int action)
+{
+	switch (action) {
+	case VE_EVENT_MOUNT:
+		return "ve-mount";
+	case VE_EVENT_UMOUNT:
+		return "ve-umount";
+	case VE_EVENT_START:
+		return "ve-start";
+	case VE_EVENT_STOP:
+		return "ve-stop";
+	case VE_EVENT_REBOOT:
+		return "ve-reboot";
+	default:
+		return NULL;
+	}
+}
+
+static int do_vzevent_send(int event, char *msg, int len)
+{
+	struct sk_buff *skb;
+	char *buf, *action;
+	int alen;
+
+	action = action_to_string(event);
+	if (!action)
+		return -EINVAL;
+
+	alen = strlen(action);
+
+	skb = alloc_skb(len + 1 + alen, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	buf = skb_put(skb, len + 1 + alen);
+	memcpy(buf, action, alen);
+	buf[alen] = '@';
+	memcpy(buf + alen + 1, msg, len);
+	(void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL);
+	return 0;
+}
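+
+/*
+ * The broadcast payload is "<action>@<attrs>"; e.g. for a VE named
+ * "101", a start event goes out as the bytes "ve-start@101".
+ */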
+
+int vzevent_send(int event, const char *attrs_fmt, ...)
+{
+	va_list args;
+	int len, err;
+	char *page;
+
+	err = -ENOMEM;
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	va_start(args, attrs_fmt);
+	len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args);
+	va_end(args);
+
+	err = do_vzevent_send(event, page, len);
+	free_page((unsigned long)page);
+out:
+	return err;
+}
+EXPORT_SYMBOL(vzevent_send);
+
+static int ve_start(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	vzevent_send(VE_EVENT_START, "%s", ve_name(ve));
+	return 0;
+}
+
+static void ve_stop(void *data)
+{
+	struct ve_struct *ve = data;
+	int event = VE_EVENT_STOP;
+
+	if (ve->ve_ns->pid_ns->reboot == SIGHUP && reboot_event)
+		event = VE_EVENT_REBOOT;
+
+	vzevent_send(event, "%s", ve_name(ve));
+}
+
+static struct ve_hook ve_start_stop_hook = {
+	.init		= ve_start,
+	.fini		= ve_stop,
+	.owner		= THIS_MODULE,
+	.priority	= HOOK_PRIO_AFTERALL,
+};
+
+static int __init init_vzevent(void)
+{
+	vzev_sock = netlink_kernel_create(&init_net, NETLINK_UEVENT, NULL);
+	if (vzev_sock == NULL)
+		return -ENOMEM;
+	ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook);
+	return 0;
+}
+
+static void __exit exit_vzevent(void)
+{
+	ve_hook_unregister(&ve_start_stop_hook);
+	netlink_kernel_release(vzev_sock);
+}
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+
+module_init(init_vzevent);
+module_exit(exit_vzevent);
--- /dev/null
+++ b/kernel/ve/vziolimit.c
@@ -0,0 +1,519 @@
+/*
+ *  kernel/ve/vziolimit.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/virtinfo.h>
+#include <linux/vzctl.h>
+#include <linux/vziolimit.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <asm/uaccess.h>
+#include <bc/beancounter.h>
+
+struct throttle {
+	unsigned speed;		/* maximum speed, units per second */
+	unsigned burst;		/* maximum burst, units */
+	unsigned latency;	/* maximum wait delay, jiffies */
+	unsigned remain;	/* units/HZ */
+	unsigned long time;	/* wall time in jiffies */
+	long long state;	/* current state in units */
+};
+
+enum {
+	UB_CGROUP_IOLIMIT_SPEED		= 0,
+	UB_CGROUP_IOLIMIT_BURST		= 1,
+	UB_CGROUP_IOLIMIT_LATENCY	= 2,
+	UB_CGROUP_IOPSLIMIT_SPEED	= 3,
+	UB_CGROUP_IOPSLIMIT_BURST	= 4,
+	UB_CGROUP_IOPSLIMIT_LATENCY	= 5,
+};
+
+/**
+ * set throttler initial state, externally serialized
+ * @speed	maximum speed (1/sec)
+ * @burst	maximum burst chunk
+ * @latency	maximum timeout (ms)
+ */
+static void throttle_setup(struct throttle *th, unsigned speed,
+		unsigned burst, unsigned latency)
+{
+	th->time = jiffies;
+	th->burst = burst;
+	th->latency = msecs_to_jiffies(latency);
+	wmb();
+	th->speed = speed;
+}
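+
+/*
+ * A minimal setup sketch (the numbers are illustrative only): limit a
+ * beancounter to 10 MB/s with a 1 MB burst and at most 100 ms of added
+ * delay:
+ *
+ *	throttle_setup(&iolimit->throttle, 10 << 20, 1 << 20, 100);
+ */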
+
+/* externally serialized */
+static void throttle_charge(struct throttle *th, long long charge)
+{
+	unsigned long time, now = jiffies;
+	long long step, ceiling = charge + th->burst;
+
+	if (time_before(th->time, now)) {
+		step = (u64)th->speed * (now - th->time);
+		do_div(step, HZ);
+		step += th->state;
+		/* feed throttler as much as we can */
+		if (step <= ceiling)
+			th->state = step;
+		else if (th->state < ceiling)
+			th->state = ceiling;
+		th->time = now;
+	}
+
+	if (charge > th->state) {
+		charge -= th->state;
+		step = charge * HZ;
+		if (do_div(step, th->speed))
+			step++;
+		time = th->time + step;
+		/* limit maximum latency */
+		if (time_after(time, now + th->latency))
+			time = now + th->latency;
+		th->time = time;
+		step *= th->speed;
+		step += th->remain;
+		th->remain = do_div(step, HZ);
+		th->state += step;
+	}
+}
+
+/* lockless */
+static unsigned long throttle_timeout(struct throttle *th, unsigned long now)
+{
+	unsigned long time;
+
+	if (!th->speed)
+		return 0;
+	rmb();
+	time = th->time;
+	if (time_before(time, now))
+		return 0;
+	return min(time - now, (unsigned long)th->latency);
+}
+
+struct iolimit {
+	struct throttle throttle;
+	struct throttle iops;
+	wait_queue_head_t wq;
+};
+
+static void iolimit_wait(struct iolimit *iolimit, unsigned long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	do {
+		prepare_to_wait(&iolimit->wq, &wait,
+				TASK_KILLABLE | __TASK_IOTHROTTLED);
+		timeout = schedule_timeout(timeout);
+		if (fatal_signal_pending(current))
+			break;
+		if (unlikely(timeout))
+			timeout = min(throttle_timeout(&iolimit->throttle,
+						jiffies), timeout);
+	} while (timeout);
+	finish_wait(&iolimit->wq, &wait);
+}
+
+static unsigned long iolimit_timeout(struct iolimit *iolimit)
+{
+	unsigned long now = jiffies;
+
+	return max(throttle_timeout(&iolimit->throttle, now),
+			throttle_timeout(&iolimit->iops, now));
+}
+
+static void iolimit_balance_dirty(struct iolimit *iolimit,
+				  struct user_beancounter *ub,
+				  unsigned long write_chunk)
+{
+	struct throttle *th = &iolimit->throttle;
+	unsigned long flags, dirty, state;
+
+	if (!th->speed)
+		return;
+
+	/* can be non-atomic on i386, but that's ok: this is just a hint */
+	state = th->state >> PAGE_SHIFT;
+	dirty = ub_stat_get(ub, dirty_pages) + write_chunk;
+	/* protect against ub-stat percpu drift */
+	if (dirty + UB_STAT_BATCH * num_possible_cpus() < state)
+		return;
+	/* get the exact value for smooth throttling */
+	dirty = ub_stat_get_exact(ub, dirty_pages) + write_chunk;
+	if (dirty < state)
+		return;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	/* precharge dirty pages */
+	throttle_charge(th, (long long)dirty << PAGE_SHIFT);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static int iolimit_virtinfo(struct vnotifier_block *nb,
+		unsigned long cmd, void *arg, int old_ret)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	struct iolimit *iolimit = ub->iolimit;
+	unsigned long flags, timeout;
+	struct request_queue *q;
+
+	if (!iolimit)
+		return old_ret;
+
+	if (!iolimit->throttle.speed && !iolimit->iops.speed)
+		return NOTIFY_OK;
+
+	switch (cmd) {
+		case VIRTINFO_IO_ACCOUNT:
+			if (!iolimit->throttle.speed)
+				break;
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->throttle.speed) {
+				long long charge = *(size_t*)arg;
+
+				throttle_charge(&iolimit->throttle, charge);
+				iolimit->throttle.state -= charge;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_FUSE_REQ:
+		case VIRTINFO_IO_OP_ACCOUNT:
+
+			if (!iolimit->iops.speed)
+				break;
+
+			q = (struct request_queue *) arg;
+			if (q)
+				blk_add_trace_msg(q, "vziolimit iops ub:%s speed:%u remain:%u ",
+						  ub->ub_name, iolimit->iops.speed,
+						  iolimit->iops.remain);
+
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->iops.speed) {
+				throttle_charge(&iolimit->iops, 1);
+				/*
+				 * Writeback doesn't use the last iops token from
+				 * the stash, to avoid choking future sync operations.
+				 */
+				if (iolimit->iops.state > 1 ||
+				    !(current->flags & PF_SWAPWRITE))
+					iolimit->iops.state--;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_PREPARE:
+		case VIRTINFO_IO_JOURNAL:
+
+			if (current->flags & PF_SWAPWRITE)
+				break;
+
+			timeout = iolimit_timeout(iolimit);
+			q = (struct request_queue *) arg;
+			if (q)
+				blk_add_trace_msg(q, "vziolimit sleep ub:%s timeout:%lu ",
+						  ub->ub_name, timeout);
+
+			if (timeout && !fatal_signal_pending(current))
+				iolimit_wait(iolimit, timeout);
+			break;
+		case VIRTINFO_IO_READAHEAD:
+		case VIRTINFO_IO_CONGESTION:
+			timeout = iolimit_timeout(iolimit);
+			if (timeout)
+				return NOTIFY_FAIL;
+			break;
+		case VIRTINFO_IO_BALANCE_DIRTY:
+			iolimit_balance_dirty(iolimit, ub, (unsigned long)arg);
+			break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block iolimit_virtinfo_nb = {
+	.notifier_call = iolimit_virtinfo,
+};
+
+
+static void throttle_state(struct user_beancounter *ub,
+		struct throttle *throttle, struct iolimit_state *state)
+{
+	spin_lock_irq(&ub->ub_lock);
+	state->speed = throttle->speed;
+	state->burst = throttle->burst;
+	state->latency = jiffies_to_msecs(throttle->latency);
+	spin_unlock_irq(&ub->ub_lock);
+}
+
+static struct iolimit *iolimit_get(struct user_beancounter *ub)
+{
+	struct iolimit *iolimit = ub->iolimit;
+
+	if (iolimit)
+		return iolimit;
+
+	iolimit = kzalloc(sizeof(struct iolimit), GFP_KERNEL);
+	if (!iolimit)
+		return NULL;
+	init_waitqueue_head(&iolimit->wq);
+
+	spin_lock_irq(&ub->ub_lock);
+	if (ub->iolimit) {
+		kfree(iolimit);
+		iolimit = ub->iolimit;
+	} else
+		ub->iolimit = iolimit;
+	spin_unlock_irq(&ub->ub_lock);
+
+	return iolimit;
+}
+
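+/*
+ * vzctl ioctl interface: userspace passes a struct iolimit_state
+ * carrying the beancounter id, speed, burst and latency, and selects
+ * the byte throttler with VZCTL_SET_IOLIMIT/VZCTL_GET_IOLIMIT or the
+ * iops throttler with VZCTL_SET_IOPSLIMIT/VZCTL_GET_IOPSLIMIT.
+ */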
+static int iolimit_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct user_beancounter *ub;
+	struct iolimit *iolimit;
+	struct iolimit_state state;
+	int err;
+
+	if (cmd != VZCTL_SET_IOLIMIT && cmd != VZCTL_GET_IOLIMIT &&
+	    cmd != VZCTL_SET_IOPSLIMIT && cmd != VZCTL_GET_IOPSLIMIT)
+		return -ENOTTY;
+
+	if (copy_from_user(&state, (void __user *)arg, sizeof(state)))
+		return -EFAULT;
+
+	ub = get_beancounter_byuid(state.id, 0);
+	if (!ub)
+		return -ENOENT;
+
+	iolimit = ub->iolimit;
+
+	switch (cmd) {
+		case VZCTL_SET_IOLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->throttle, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_SET_IOPSLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->iops, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_GET_IOLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->throttle, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		case VZCTL_GET_IOPSLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->iops, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		default:
+			err = -ENOTTY;
+	}
+
+	put_beancounter(ub);
+	return err;
+}
+
+static struct vzioctlinfo iolimit_vzioctl = {
+	.type		= VZIOLIMITTYPE,
+	.ioctl		= iolimit_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= iolimit_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+static ssize_t iolimit_cgroup_read(struct cgroup *cg, struct cftype *cft,
+			      struct file *file, char __user *buf,
+			      size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct iolimit *iolimit = ub->iolimit;
+	unsigned long val = 0;
+	int len;
+	char str[32];
+
+	if (!iolimit)
+		goto out;
+
+	spin_lock_irq(&ub->ub_lock);
+	switch (cft->private) {
+	case UB_CGROUP_IOLIMIT_SPEED:
+		val = iolimit->throttle.speed;
+		break;
+	case UB_CGROUP_IOLIMIT_BURST:
+		val = iolimit->throttle.burst;
+		break;
+	case UB_CGROUP_IOLIMIT_LATENCY:
+		val = iolimit->throttle.latency;
+		break;
+
+	case UB_CGROUP_IOPSLIMIT_SPEED:
+		val = iolimit->iops.speed;
+		break;
+	case UB_CGROUP_IOPSLIMIT_BURST:
+		val = iolimit->iops.burst;
+		break;
+	case UB_CGROUP_IOPSLIMIT_LATENCY:
+		val = iolimit->iops.latency;
+		break;
+	default:
+		BUG();
+	}
+	spin_unlock_irq(&ub->ub_lock);
+out:
+	len = scnprintf(str, sizeof(str), "%lu\n", val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int iolimit_cgroup_write_u64(struct cgroup *cg, struct cftype *cft, u64 val)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct iolimit *iolimit;
+
+	iolimit = iolimit_get(ub);
+	if (!iolimit)
+		return -ENOMEM;
+
+	spin_lock_irq(&ub->ub_lock);
+	iolimit->throttle.time = iolimit->iops.time = jiffies;
+
+	switch (cft->private) {
+	case UB_CGROUP_IOLIMIT_SPEED:
+		wmb();
+		iolimit->throttle.speed = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_SPEED:
+		wmb();
+		iolimit->iops.speed = val;
+		break;
+	case UB_CGROUP_IOLIMIT_BURST:
+		iolimit->throttle.burst = val;
+		break;
+	case UB_CGROUP_IOLIMIT_LATENCY:
+		iolimit->throttle.latency = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_BURST:
+		iolimit->iops.burst = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_LATENCY:
+		iolimit->iops.latency = val;
+		break;
+	default:
+		BUG();
+	}
+	wake_up_all(&iolimit->wq);
+	spin_unlock_irq(&ub->ub_lock);
+	return 0;
+}
+
+static struct cftype vziolimit_cftypes[] = {
+	{
+		.name = "iolimit.speed",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_SPEED,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iolimit.burst",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_BURST,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iolimit.latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_LATENCY,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+
+	{
+		.name = "iopslimit.speed",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_SPEED,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iopslimit.burst",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_BURST,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iopslimit.latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_LATENCY,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{ }
+};
+
+static int __init iolimit_init(void)
+{
+	int err;
+	virtinfo_notifier_register(VITYPE_IO, &iolimit_virtinfo_nb);
+	vzioctl_register(&iolimit_vzioctl);
+	err = cgroup_add_cftypes(&ub_subsys, vziolimit_cftypes);
+	if (err)
+		goto err_cgroup;
+	return 0;
+
+err_cgroup:
+	vzioctl_unregister(&iolimit_vzioctl);
+	virtinfo_notifier_unregister(VITYPE_IO, &iolimit_virtinfo_nb);
+	return err;
+}
+
+static void __exit iolimit_exit(void)
+{
+	cgroup_rm_cftypes(&ub_subsys, vziolimit_cftypes);
+	vzioctl_unregister(&iolimit_vzioctl);
+	virtinfo_notifier_unregister(VITYPE_IO, &iolimit_virtinfo_nb);
+}
+
+module_init(iolimit_init)
+module_exit(iolimit_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vzlist.c
@@ -0,0 +1,303 @@
+/*
+ *  kernel/ve/vzlist.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/ve.h>
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzlist.h>
+#include <linux/vmalloc.h>
+#include <linux/ve_proto.h>
+#include <linux/veip.h>
+#include <linux/uaccess.h>
+#include <linux/pid_namespace.h>
+
+static DEFINE_SEMAPHORE(vzlist_sem);
+
+static int get_veids(struct vzlist_veidctl *s)
+{
+	int ret;
+	int ves;
+	unsigned long size;
+	envid_t *buf;
+	struct ve_struct *ve;
+
+	ves = nr_ve + 1;
+	if (!s->num || s->id == NULL)
+		return ves;
+
+	down(&vzlist_sem);
+again:
+	size = (ves + 20)*sizeof(envid_t);
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	ves = 0;
+	mutex_lock(&ve_list_lock);
+	for_each_ve(ve) {
+		if (size >= (ves + 1)*sizeof(envid_t))
+			buf[ves] = ve->veid;
+		ves++;
+	}
+	mutex_unlock(&ve_list_lock);
+
+	ret = ves;
+	if (ves > s->num)
+		goto out;
+	if (size < ves*sizeof(envid_t)) {
+		vfree(buf);
+		goto again;
+	}
+	if (copy_to_user(s->id, buf, ves*sizeof(envid_t)))
+		ret = -EFAULT;
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	return ret;
+}
+
+static int get_vepids(struct vzlist_vepidctl *s)
+{
+	int ret;
+	int tasks = 0;
+	unsigned long size;
+	envid_t *buf;
+	struct ve_struct *ve;
+	struct task_struct *tsk;
+	struct pid_namespace *ns;
+	int nr;
+
+	ret = -ESRCH;
+	ve = get_ve_by_id(s->veid);
+	if (!ve)
+		goto out_no_ve;
+	ns = ve->ve_ns->pid_ns;
+
+	down(&vzlist_sem);
+again:
+	size = (tasks + 512)*(2*sizeof(pid_t));
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	tasks = 0;
+	qread_lock(&tasklist_lock);
+	nr = next_pidmap(ns, 0);
+	while (nr > 0) {
+		rcu_read_lock();
+
+		tsk = pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
+		if (tsk) {
+			if (size >= (tasks + 1)*(2*sizeof(pid_t))) {
+				buf[2*tasks] = tsk->pid;
+				buf[2*tasks + 1] = task_pid_nr_ns(tsk, ns);
+			}
+			tasks++;
+		}
+
+		rcu_read_unlock();
+		nr = next_pidmap(ns, nr);
+	}
+	qread_unlock(&tasklist_lock);
+
+	ret = tasks;
+	if ((tasks > s->num) || (!tasks))
+		goto out;
+	if (size < tasks*(2*sizeof(pid_t))) {
+		vfree(buf);
+		goto again;
+	}
+	if (copy_to_user(s->pid, buf, tasks*(2*sizeof(pid_t))))
+		ret = -EFAULT;
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	put_ve(ve);
+out_no_ve:
+	return ret;
+}
+
+static int get_veips(struct vzlist_veipctl *s, unsigned int cmd)
+{
+	int ret;
+	int ips;
+	unsigned long size;
+	u32 *buf, *pos;
+	struct ve_struct *ve;
+	struct veip_struct *veip;
+	struct ip_entry_struct *entry;
+	struct ve_addr_struct *addr;
+
+	ret = -ESRCH;
+	ve = get_ve_by_id(s->veid);
+	if (!ve)
+		goto out_no_ve;
+
+	size = PAGE_SIZE;
+	down(&vzlist_sem);
+again:
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	ips = 0;
+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
+	rcu_read_lock();
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		goto noip;
+
+	pos = buf;
+	list_for_each_entry_rcu(entry, &veip->ip_lh, ve_list) {
+		if (entry->active_env == NULL)
+			continue;
+
+		addr = &entry->addr;
+
+		if (cmd == VZCTL_GET_VEIPS && addr->family == AF_INET) {
+			if (size >= (ips + 1) * sizeof(addr->key[3])) {
+				pos[0] = addr->key[3];
+				pos++;
+			}
+			ips++;
+		}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (cmd == VZCTL_GET_VEIP6S && addr->family == AF_INET6) {
+			if (size >= (ips + 1) * sizeof(addr->key)) {
+				memcpy(pos, addr->key, sizeof(addr->key));
+				pos += 4;
+			}
+			ips++;
+		}
+#endif
+	}
+noip:
+	rcu_read_unlock();
+#endif
+
+	ret = ips;
+	if (ips > s->num)
+		goto out;
+
+	if (cmd == VZCTL_GET_VEIPS) {
+		if (size < ips * sizeof(u32)) {
+			size = ips * sizeof(u32);
+			vfree(buf);
+			goto again;
+		}
+		if (copy_to_user(s->ip, buf, ips * sizeof(u32)))
+			ret = -EFAULT;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else {
+		if (size < ips * sizeof(u32) * 4) {
+			size = ips * sizeof(u32) * 4;
+			vfree(buf);
+			goto again;
+		}
+		if (copy_to_user(s->ip, buf, ips * sizeof(u32) * 4))
+			ret = -EFAULT;
+	}
+#endif
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	put_ve(ve);
+out_no_ve:
+	return ret;
+}
+
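+/*
+ * ioctl entry point: VZCTL_GET_VEIDS with a NULL argument simply returns
+ * the current number of VEs; the other commands copy a small control
+ * structure from user space and hand it to the helpers above.
+ */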
+static int vzlist_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err = -ENOTTY;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case VZCTL_GET_VEIDS: {
+			struct vzlist_veidctl s;
+
+			if (arg) {
+				err = -EFAULT;
+				if (copy_from_user(&s, argp, sizeof(s)))
+					break;
+				err = get_veids(&s);
+			} else
+				err = nr_ve;
+		}
+		break;
+	case VZCTL_GET_VEPIDS: {
+			struct vzlist_vepidctl s;
+
+			err = -EFAULT;
+			if (copy_from_user(&s, argp, sizeof(s)))
+				break;
+			err = get_vepids(&s);
+		}
+		break;
+	case VZCTL_GET_VEIP6S:
+	case VZCTL_GET_VEIPS: {
+			struct vzlist_veipctl s;
+
+			err = -EFAULT;
+			if (copy_from_user(&s, argp, sizeof(s)))
+				break;
+			err = get_veips(&s, cmd);
+		}
+		break;
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int vzlist_ioctl_compat(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	/* do we need this? */
+	return -ENOTTY;
+}
+#endif
+
+static struct vzioctlinfo vzid_calls = {
+	.type		= VZLISTTYPE,
+	.ioctl		= vzlist_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vzlist_ioctl_compat,
+#endif
+	.owner		= THIS_MODULE
+};
+
+static int __init init_vzlist(void)
+{
+	vzioctl_register(&vzid_calls);
+	return 0;
+}
+
+static void __exit exit_vzlist(void)
+{
+	vzioctl_unregister(&vzid_calls);
+}
+
+module_init(init_vzlist);
+module_exit(exit_vzlist);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/Makefile
@@ -0,0 +1,8 @@
+#
+# kernel/ve/vznetstat/Makefile
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-$(CONFIG_VE_NETDEV_ACCOUNTING) += vznetstat.o ip_vznetstat.o ip6_vznetstat.o
--- /dev/null
+++ b/kernel/ve/vznetstat/ip6_vznetstat.c
@@ -0,0 +1,102 @@
+/*
+ *  kernel/ve/vznetstat/ip6_vznetstat.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics for IPv6
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/vznetstat.h>
+
+static unsigned int
+venet_acct_in_hook_v6(const struct nf_hook_ops *hook,
+		      struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_hook_state *state)
+{
+	int res = NF_ACCEPT;
+
+	if (in->flags & IFF_LOOPBACK)
+		goto out;
+
+	venet_acct_classify_add_incoming(in->nd_net->owner_ve->stat, skb);
+out:
+	return res;
+}
+
+static unsigned int
+venet_acct_out_hook_v6(const struct nf_hook_ops *hook,
+		    struct sk_buff *skb,
+		    const struct net_device *in,
+		    const struct net_device *out,
+		    const struct nf_hook_state *state)
+{
+	int res = NF_ACCEPT;
+
+	if (out->flags & IFF_LOOPBACK)
+		goto out;
+
+	skb->protocol = __constant_htons(ETH_P_IPV6);
+	venet_acct_classify_add_outgoing(out->nd_net->owner_ve->stat, skb);
+out:
+	return res;
+}
+
+static struct nf_hook_ops venet_acct_in_ops = {
+	.hook		= venet_acct_in_hook_v6,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_INET_LOCAL_IN,
+	.priority	= NF_IP6_PRI_FIRST,
+};
+
+static struct nf_hook_ops venet_acct_out_ops = {
+	.hook		= venet_acct_out_hook_v6,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_INET_LOCAL_OUT,
+	.priority	= NF_IP6_PRI_LAST,
+};
+
+int __init ip6_venetstat_init(void)
+{
+	int ret;
+
+	ret = nf_register_hook(&venet_acct_in_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hook(&venet_acct_out_ops);
+	if (ret < 0) {
+		nf_unregister_hook(&venet_acct_in_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+void __exit ip6_venetstat_exit(void)
+{
+	nf_unregister_hook(&venet_acct_out_ops);
+	nf_unregister_hook(&venet_acct_in_ops);
+}
+
+module_init(ip6_venetstat_init);
+module_exit(ip6_venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/ip_vznetstat.c
@@ -0,0 +1,170 @@
+/*
+ *  kernel/ve/vznetstat/ip_vznetstat.c
+ *
+ *  Copyright (c) 2004-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics for IPv4.
+ */
+
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <net/ip.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/venet.h>
+#include <linux/vznetstat.h>
+
+#define VZNS_DEBUG 0
+
+static unsigned int venet_acct_in_hook(const struct nf_hook_ops *hook,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       const struct nf_hook_state *state)
+{
+	int res;
+
+	res = NF_ACCEPT;
+
+	/* Skip loopback dev */
+	if (in == dev_net(in)->loopback_dev)
+		goto out;
+
+#if VZNS_DEBUG
+	printk("%s: in %s, out %s, size %d, in->owner_env=%s\n",
+		 __func__, in ? in->name : "NULL", out ? out->name : "NULL",
+		 venet_acct_skb_size(skb),
+		 in ? in->nd_net->owner_ve->ve_name : "NULL");
+#endif
+
+	/*
+	 * Basically, pskb_may_pull() isn't necessary here, because it's done
+	 * in ip_rcv() before calling NF_IP_PRE_ROUTING NF_HOOK, but let's
+	 * have some insurance for the future.
+	 */
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+		goto out_hdr_error;
+
+	venet_acct_classify_add_incoming(in->nd_net->owner_ve->stat, skb);
+
+out:
+	return res;
+
+out_hdr_error:
+	if (net_ratelimit())
+		printk("%s: IN accounting: IP header error\n", in->name);
+	res = NF_DROP;
+	goto out;
+}
+
+static unsigned int venet_acct_out_hook(const struct nf_hook_ops *hook,
+				        struct sk_buff *skb,
+				        const struct net_device *in,
+				        const struct net_device *out,
+				        const struct nf_hook_state *state)
+{
+	int res;
+
+	res = NF_ACCEPT;
+
+	/* Skip loopback dev */
+	if (out == dev_net(out)->loopback_dev)
+		goto out;
+
+	/* Paranoia */
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+		goto out_hdr_error;
+
+#if VZNS_DEBUG
+	printk("%s: in %s, out %s, size %d, out->owner_env=%s\n",
+		 __func__, in ? in->name : "NULL", out ? out->name : "NULL",
+		 venet_acct_skb_size(skb),
+		 out ? out->nd_net->owner_ve->ve_name : "NULL");
+#endif
+
+	/*
+	 * kproxy does its own accounting with kp_account_check_in() for
+	 * packets entering it and kp_account_check_out() for packets
+	 * leaving it, in both directions (from VE and to VE).
+	 *
+	 * So, for packets going out of a VE, kp_account_check_in() at the
+	 * kproxy entrance subtracts the packet from the accounting and
+	 * kp_account_check_out() adds it back.  Thus we do not need to
+	 * worry about double accounting here.
+	 *
+	 * kproxy's accounting cannot be moved into this module, because
+	 * the traffic volume between kproxy and the outside world differs
+	 * slightly from the traffic volume between the VE and kproxy.
+	 */
+	skb->protocol = __constant_htons(ETH_P_IP);
+	venet_acct_classify_add_outgoing(out->nd_net->owner_ve->stat, skb);
+
+out:
+	return res;
+
+out_hdr_error:
+	if (net_ratelimit())
+		printk("%s: OUT accounting: IP header error\n", out->name);
+	res = NF_DROP;
+	goto out;
+}
+
+static struct nf_hook_ops venet_acct_in_ops = {
+	.hook		= venet_acct_in_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_INET_LOCAL_IN,
+	.priority	= NF_IP_PRI_FIRST,
+};
+
+static struct nf_hook_ops venet_acct_out_ops = {
+	.hook		= venet_acct_out_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_INET_LOCAL_OUT,
+	.priority	= NF_IP_PRI_LAST,
+};
+
+int __init ip_venetstat_init(void)
+{
+	int ret;
+
+	ret = nf_register_hook(&venet_acct_in_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hook(&venet_acct_out_ops);
+	if (ret < 0) {
+		nf_unregister_hook(&venet_acct_in_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+void __exit ip_venetstat_exit(void)
+{
+	nf_unregister_hook(&venet_acct_out_ops);
+	nf_unregister_hook(&venet_acct_in_ops);
+}
+
+#if defined(MODULE) && defined(VZ_AUDIT)
+VZ_AUDIT;
+#endif
+module_init(ip_venetstat_init);
+module_exit(ip_venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/vznetstat.c
@@ -0,0 +1,1173 @@
+/*
+ *  kernel/ve/vznetstat/vznetstat.c
+ *
+ *  Copyright (c) 2004-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics.
+ * Traffic classes support.
+ * Persistent (independent from VE struct storage)
+ */
+
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <net/ip.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+
+#include <linux/ve.h>
+#include <linux/venet.h>
+#include <linux/vznetstat.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzctl_netstat.h>
+#include <uapi/linux/vzcalluser.h>
+
+/*
+ * ---------------------------------------------------------------------------
+ * Traffic classes storage
+ * ---------------------------------------------------------------------------
+ */
+
+static int stat_num = 0;
+static DEFINE_RWLOCK(tc_lock);
+
+struct class_info_set {
+	unsigned int len;
+	union {
+		struct vz_tc_class_info info_v4[0];
+		struct vz_tc_class_info_v6 info_v6[0];
+		char data[0];
+	};
+};
+
+static struct class_info_set *info_v4 = NULL;
+/*
+ * info_v6 is referenced unconditionally by the set/get/len helpers below,
+ * so its declaration is not guarded by CONFIG_IPV6.
+ */
+static struct class_info_set *info_v6 = NULL;
+
+/* v6: nonzero - set IPv6 classes, zero - set IPv4 classes */
+static int venet_acct_set_classes(const void __user *user_info, int length, int v6)
+{
+	struct class_info_set *info, *old;
+	int size;
+	int err, i;
+
+	if (v6)
+		size = sizeof(struct vz_tc_class_info_v6);
+	else
+		size = sizeof(struct vz_tc_class_info);
+
+	info = __vmalloc((sizeof(struct class_info_set) + size * length),
+			 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN,
+			 PAGE_KERNEL);
+
+	if (info == NULL)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	info->len = length;
+	if (copy_from_user(info->data, user_info, size * length))
+		goto out_free;
+
+	/* Verify incoming data */
+	err = -EINVAL;
+	for (i = 0; i < length; i++) {
+		unsigned int cid;
+
+		if (v6)
+			cid = info->info_v6[i].cid;
+		else
+			cid = info->info_v4[i].cid;
+
+		if (cid >= TC_CLASS_MAX)
+			goto out_free;
+	}
+
+	if (v6)
+		old = xchg(&info_v6, info);
+	else
+		old = xchg(&info_v4, info);
+	/* xchg() implies rcu_assign_pointer() barriers */
+
+	synchronize_net();
+	/* IMPORTANT: the statistics collected so far are intentionally
+	 * not reset here. */
+	vfree(old);
+	return 0;
+
+out_free:
+	vfree(info);
+	return err;
+}
+
+/* all records */
+static int venet_acct_get_classes(void __user *ret, int length, int v6)
+{
+	void *info;
+	struct class_info_set *rinfo;
+	int len, err;
+	unsigned int size;
+
+	if (v6)
+		size = sizeof(struct vz_tc_class_info_v6);
+	else
+		size = sizeof(struct vz_tc_class_info);
+
+	/* allocate outside the RCU read-side section below: __vmalloc() may sleep */
+	info = __vmalloc(size * length,
+			 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN,
+			 PAGE_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	if (v6)
+		rinfo = rcu_dereference(info_v6);
+	else
+		rinfo = rcu_dereference(info_v4);
+
+	len = min(length, (int)rinfo->len);
+	memcpy(info, rinfo->data, size * len);
+	rcu_read_unlock();
+
+	err = -EFAULT;
+	if (!copy_to_user(ret, info, size * len))
+		err = len;
+	vfree(info);
+	return err;
+}
+
+static inline int class_info_len(int v6)
+{
+	int ret = 0;
+	struct class_info_set *info;
+
+	rcu_read_lock();
+	if (v6)
+		info = rcu_dereference(info_v6);
+	else
+		info = rcu_dereference(info_v4);
+
+	if (info)
+		ret = info->len;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Persistent statistics storage
+ * ---------------------------------------------------------------------------
+ */
+
+/* The hash does not have to be efficient: it only serves user-space requests */
+#define STAT_HASH_LEN	128
+
+static struct list_head stat_hash_list[STAT_HASH_LEN];
+static int stat_hash(envid_t veid)
+{
+	return veid & (STAT_HASH_LEN - 1);
+}
+
+/* tc_lock is taken by the caller! */
+static inline struct venet_stat *__find(envid_t veid)
+{
+	int hash;
+	struct venet_stat *ptr;
+
+	hash = stat_hash(veid);
+	list_for_each_entry(ptr, stat_hash_list + hash, list) {
+		if (ptr->veid == veid)
+			return ptr;
+	}
+	return NULL;
+}
+
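+/*
+ * Walk the statistics hash table: return the entry following @item (or the
+ * first entry of bucket *hash when @item is NULL), advancing *hash to the
+ * next non-empty bucket as needed.  Returns NULL when the table is
+ * exhausted.  tc_lock must be held by the caller.
+ */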
+static struct venet_stat *next_stat(int *hash, struct venet_stat *item)
+{
+	struct list_head *ptr;
+
+	ptr = item != NULL ? &item->list : (stat_hash_list + *hash);
+	while (*hash < STAT_HASH_LEN) {
+		if (ptr->next != stat_hash_list + *hash)
+			return list_entry(ptr->next, struct venet_stat, list);
+		(*hash)++;
+		ptr = stat_hash_list + *hash;
+	}
+	return NULL;
+}
+
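+/*
+ * Look up the per-VE statistics, creating them if necessary.  The percpu
+ * counters are allocated outside of tc_lock (allocation may sleep), so the
+ * lookup is repeated under the write lock to resolve the race with a
+ * concurrent creator; the loser frees its allocation and reuses the
+ * existing entry.  The returned stat has its user count elevated.
+ */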
+struct venet_stat *venet_acct_find_create_stat(envid_t veid)
+{
+	struct venet_stat *ptr;
+	unsigned long flags;
+	struct venet_stat *stat;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL) {
+		venet_acct_get_stat(ptr);
+		read_unlock(&tc_lock);
+		return ptr;
+	}
+	read_unlock(&tc_lock);
+
+	ptr = kzalloc(sizeof(struct venet_stat), GFP_KERNEL);
+	if (ptr == NULL)
+		goto out;
+	ptr->veid = veid;
+
+	ptr->ipv4_stat = alloc_percpu(struct acct_stat);
+	if (ptr->ipv4_stat == NULL)
+		goto out_free;
+
+	ptr->ipv6_stat = alloc_percpu(struct acct_stat);
+	if (ptr->ipv6_stat == NULL)
+		goto out_free_v4;
+
+	write_lock_irqsave(&tc_lock, flags);
+	stat = __find(veid);
+	if (stat != NULL) {
+		free_percpu(ptr->ipv6_stat);
+		free_percpu(ptr->ipv4_stat);
+		kfree(ptr);
+		ptr = stat;
+	} else {
+		list_add(&ptr->list, stat_hash_list + stat_hash(veid));
+		stat_num++;
+	}
+	venet_acct_get_stat(ptr);
+	write_unlock_irqrestore(&tc_lock, flags);
+	return ptr;
+
+out_free_v4:
+	free_percpu(ptr->ipv4_stat);
+out_free:
+	kfree(ptr);
+out:
+	return NULL;
+}
+
+struct venet_stat *venet_acct_find_stat(envid_t veid)
+{
+	struct venet_stat *ptr;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL)
+		venet_acct_get_stat(ptr);
+	read_unlock(&tc_lock);
+	return ptr;
+}
+
+void venet_acct_put_stat(struct venet_stat *stat)
+{
+	if (stat == NULL)
+		return;
+	atomic_dec(&stat->users);
+}
+
+static inline struct acct_stat *
+__choose_acct(struct venet_stat *stat, int v6)
+{
+	if (v6)
+		return stat->ipv6_stat;
+	else
+		return stat->ipv4_stat;
+}
+
+/*
+ * v6: flag selecting IPv6 (nonzero) or IPv4 statistics
+ * Returns arrays of counters indexed by traffic class.
+ */
+static int venet_acct_get_ve_stat(struct vzctl_tc_get_stat *data, int v6)
+{
+	struct venet_stat *stat;
+	void *buf;
+	u64 *incoming, *outgoing;
+	u32 *incoming_pkt, *outgoing_pkt;
+	int err, size, cpu;
+	struct acct_stat *acct;
+
+	if (data->length < 0 || data->length > TC_CLASS_MAX)
+		return -EINVAL;
+
+	buf = kzalloc(2 * TC_CLASS_MAX * (sizeof(u64) + sizeof(u32)), GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	incoming = buf;
+	outgoing = incoming + TC_CLASS_MAX;
+	incoming_pkt = (u32 *)(outgoing + TC_CLASS_MAX);
+	outgoing_pkt = incoming_pkt + TC_CLASS_MAX;
+
+	read_lock(&tc_lock);
+	err = -ESRCH;
+	stat = __find(data->veid);
+	if (stat == NULL)
+		goto out_unlock;
+
+	acct = __choose_acct(stat, v6);
+
+	for_each_possible_cpu(cpu) {
+		struct acct_stat *stat;
+		int i;
+
+		stat = per_cpu_ptr(acct, cpu);
+		for (i = 0; i < data->length; i++) {
+			incoming[i] += stat->cnt[i][ACCT_IN].bytes;
+			outgoing[i] += stat->cnt[i][ACCT_OUT].bytes;
+			incoming_pkt[i] += stat->cnt[i][ACCT_IN].pkts;
+			outgoing_pkt[i] += stat->cnt[i][ACCT_OUT].pkts;
+		}
+	}
+
+	read_unlock(&tc_lock);
+
+	err = -EFAULT;
+	size = data->length * sizeof(u64);
+	if (copy_to_user(data->incoming, incoming, size))
+		goto out_free;
+	if (copy_to_user(data->outgoing, outgoing, size))
+		goto out_free;
+	size = data->length * sizeof(u32);
+	if (copy_to_user(data->incoming_pkt, incoming_pkt, size))
+		goto out_free;
+	if (copy_to_user(data->outgoing_pkt, outgoing_pkt, size))
+		goto out_free;
+
+	err = data->length;
+
+out_free:
+	kfree(buf);
+	return err;
+
+out_unlock:
+	read_unlock(&tc_lock);
+	goto out_free;
+}
+
+static int __tc_destroy_stat(struct venet_stat *stat)
+{
+	if (atomic_read(&stat->users))
+		return -EBUSY;
+	stat_num--;
+	list_del(&stat->list);
+	free_percpu(stat->ipv6_stat);
+	free_percpu(stat->ipv4_stat);
+	kfree(stat);
+	return 0;
+}
+
+/* Removes the VE's counters from memory; fails with -EBUSY if they are still in use */
+static int venet_acct_destroy_stat(envid_t veid)
+{
+	struct venet_stat *stat;
+	int err;
+
+	err = -ESRCH;
+	write_lock_irq(&tc_lock);
+	stat = __find(veid);
+	if (stat != NULL)
+		err = __tc_destroy_stat(stat);
+	write_unlock_irq(&tc_lock);
+	return err;
+}
+
+static void venet_acct_destroy_all_stat(void)
+{
+	int hash;
+	struct list_head *ptr, *tmp;
+
+	write_lock_irq(&tc_lock);
+	for (hash = 0; hash < STAT_HASH_LEN; hash++) {
+		list_for_each_safe(ptr, tmp, stat_hash_list + hash)
+			__tc_destroy_stat(list_entry(ptr,
+						struct venet_stat, list));
+	}
+	write_unlock_irq(&tc_lock);
+}
+
+static DEFINE_MUTEX(req_mutex);
+static struct venet_stat *req_stat;
+
+static void zero_venet_stat(struct venet_stat *stat, unsigned cpu)
+{
+	struct acct_stat *acct;
+
+	acct = per_cpu_ptr(stat->ipv4_stat, cpu);
+	memset(acct, 0, sizeof(*acct));
+	acct = per_cpu_ptr(stat->ipv6_stat, cpu);
+	memset(acct, 0, sizeof(*acct));
+}
+
+static void clear_one_percpu_statistics(struct work_struct *dummy)
+{
+	unsigned cpu, this_cpu = get_cpu();
+
+	zero_venet_stat(req_stat, this_cpu);
+
+	if (cpumask_first(cpu_online_mask) != this_cpu)
+		goto out;
+
+	/* First cpu clears statistics on all offline cpus */
+	for_each_possible_cpu(cpu)
+		if (!cpu_online(cpu))
+			zero_venet_stat(req_stat, cpu);
+out:
+	put_cpu();
+}
+
+/* Clear VE's statistics */
+static int venet_acct_clear_stat(envid_t veid)
+{
+	int ret = -EINTR;
+
+	if (mutex_lock_interruptible(&req_mutex))
+		goto out;
+
+	req_stat = venet_acct_find_stat(veid);
+	if (!req_stat) {
+		ret = -ESRCH;
+		goto unlock;
+	}
+
+	ret = schedule_on_each_cpu(clear_one_percpu_statistics);
+
+	venet_acct_put_stat(req_stat);
+unlock:
+	mutex_unlock(&req_mutex);
+out:
+	return ret;
+}
+
+static void clear_all_percpu_statistics(struct work_struct *dummy)
+{
+	unsigned cpu, this_cpu = smp_processor_id();
+	struct venet_stat *stat = NULL;
+	int other = 0, hash = 0;
+
+	/*
+	 * Some cpus may be offline, and schedule_on_each_cpu()
+	 * does not schedule work on them.
+	 * Work on the first online CPU clears their statistics.
+	 * Hotplug is disabled by schedule_on_each_cpu().
+	 */
+	if (cpumask_first(cpu_online_mask) == this_cpu)
+		other = 1;
+
+	read_lock(&tc_lock);
+
+	while ((stat = next_stat(&hash, stat)) != NULL) {
+		zero_venet_stat(stat, this_cpu);
+
+		if (!other)
+			continue;
+
+		/* Clear statistics on not active cpus */
+		for_each_possible_cpu(cpu)
+			if (!cpu_online(cpu))
+				zero_venet_stat(stat, cpu);
+	}
+
+	read_unlock(&tc_lock);
+}
+
+/* Clear all present statistics */
+static int venet_acct_clear_all_stat(void)
+{
+	int ret = -EINTR;
+
+	if (mutex_lock_interruptible(&req_mutex))
+		goto out;
+
+	ret = schedule_on_each_cpu(clear_all_percpu_statistics);
+
+	mutex_unlock(&req_mutex);
+out:
+	return ret;
+}
+
+static int venet_acct_get_stat_list(envid_t *__list, int length)
+{
+	int hash;
+	struct venet_stat *ptr;
+	int i, err;
+	envid_t *list;
+
+	if (length <= 0)
+		return -EINVAL;
+
+	list = __vmalloc(sizeof(envid_t) * length,
+			 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN,
+			 PAGE_KERNEL);
+	if (list == NULL)
+		return -ENOMEM;
+
+	i = 0;
+	read_lock(&tc_lock);
+	for (hash = 0; hash < STAT_HASH_LEN; hash++) {
+		list_for_each_entry(ptr, stat_hash_list + hash, list) {
+			list[i++] = ptr->veid;
+			if (i == length)
+				break;
+		}
+	}
+	read_unlock(&tc_lock);
+
+	err = -EFAULT;
+	if (!copy_to_user(__list, list, sizeof(envid_t) * i))
+		err = i;
+	vfree(list);
+	return err;
+}
+
+static int venet_acct_get_base(envid_t veid)
+{
+	int err = -ESRCH;
+	struct venet_stat *ptr;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL)
+		err = ptr->base;
+	read_unlock(&tc_lock);
+	return err;
+}
+
+static int __check_base(__u16 base)
+{
+	int hash;
+	struct venet_stat *stat;
+
+	hash = 0;
+	stat = NULL;
+	while ((stat = next_stat(&hash, stat)) != NULL) {
+		if (stat->base == 0 || stat->base != base)
+			continue;
+		return 1;
+	}
+	return 0;
+}
+
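+/*
+ * Set or allocate the VE's "base", the per-VE offset used to build
+ * skb->mark for the traffic shaper (see venet_acct_mark()).  A nonzero
+ * @base is stored as-is; for @base == 0 a free value is picked with a
+ * simple rover scan, returning -ERANGE when the whole range is taken.
+ */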
+static int venet_acct_set_base(envid_t veid, __u16 base)
+{
+	static __u16 rover = 1;
+	int err, pos;
+	struct venet_stat *stat;
+
+	stat = venet_acct_find_create_stat(veid);
+	if (stat == NULL)
+		return -ENOMEM;
+
+	write_lock_irq(&tc_lock);
+	if (base != 0)
+		goto done;
+
+	err = -ERANGE;
+	pos = rover;
+	do {
+		rover++;
+		if (rover == 0)
+			rover = 1;
+		if (__check_base(rover))
+			continue;
+		base = rover;
+done:
+		err = base;
+		stat->base = base;
+		break;
+	} while (pos != rover);
+
+	write_unlock_irq(&tc_lock);
+	venet_acct_put_stat(stat);
+	return err;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Accounting engine
+ * ---------------------------------------------------------------------------
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int match_v6_class(const __u32 *addr, struct vz_tc_class_info_v6 *class)
+{
+	return !(
+			((addr[0] & class->mask[0]) ^ class->addr[0]) |
+			((addr[1] & class->mask[1]) ^ class->addr[1]) |
+			((addr[2] & class->mask[2]) ^ class->addr[2]) |
+			((addr[3] & class->mask[3]) ^ class->addr[3])
+		);
+}
+
+static noinline int venet_acct_classify_v6(struct sk_buff *skb, int dir)
+{
+	int i, ret = 0;
+	struct class_info_set *info;
+	const __u32 *addr;
+
+	if (dir == ACCT_IN)
+		addr = ipv6_hdr(skb)->saddr.s6_addr32;
+	else
+		addr = ipv6_hdr(skb)->daddr.s6_addr32;
+
+	rcu_read_lock();
+	info = rcu_dereference(info_v6);
+	if (info == NULL)
+		goto out_unlock;
+
+	for (i = info->len - 1; i >= 0; i--) {
+		if (match_v6_class(addr, &info->info_v6[i])) {
+			ret = info->info_v6[i].cid;
+			break;
+		}
+	}
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+#else
+#define venet_acct_classify_v6(skb, dir)	(0)
+#endif
+
+static int __venet_acct_classify(__u32 daddr)
+{
+	int ret, i;
+	struct class_info_set *info;
+
+	ret = 0;
+	rcu_read_lock();
+	info = rcu_dereference(info_v4);
+	if (info == NULL)
+		goto out_unlock;
+	for (i = info->len - 1; i >= 0; i--) {
+		if ((daddr & info->info_v4[i].mask) == info->info_v4[i].addr) {
+			ret = info->info_v4[i].cid;
+			break;
+		}
+	}
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+static int venet_acct_classify(struct sk_buff *skb, int dir)
+{
+	__u32 addr;
+
+	if (dir == ACCT_IN)
+		addr = ip_hdr(skb)->saddr;
+	else
+		addr = ip_hdr(skb)->daddr;
+
+	return __venet_acct_classify(addr);
+}
+
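+/*
+ * Account one packet of @size bytes in traffic class @class.  A negative
+ * @size reverts a previously accounted packet (used by
+ * venet_acct_classify_sub_outgoing()).
+ */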
+static void __do_acct_one(struct acct_stat *acct, int class, int dir, int size)
+{
+	int cpu;
+	struct acct_counter *cnt;
+
+	cpu = get_cpu();
+
+	acct = per_cpu_ptr(acct, cpu);
+	cnt = &acct->cnt[class][dir];
+
+	cnt->bytes += size;
+	if (size > 0)
+		cnt->pkts++;
+	else
+		cnt->pkts--;
+
+	put_cpu();
+}
+
+static int acct_one_skb(struct venet_stat *stat, struct sk_buff *skb, int dir, int size)
+{
+	int class;
+	struct acct_stat *acct;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		class = venet_acct_classify(skb, dir);
+		acct = stat->ipv4_stat;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		class = venet_acct_classify_v6(skb, dir);
+		acct = stat->ipv6_stat;
+		break;
+	default:
+		return 0;
+	}
+
+	__do_acct_one(acct, class, dir, size);
+
+	return class;
+}
+
+void venet_acct_classify_add_incoming(struct venet_stat *stat, struct sk_buff *skb)
+{
+	acct_one_skb(stat, skb, ACCT_IN, venet_acct_skb_size(skb));
+}
+
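+/*
+ * Encode the VE and traffic class into skb->mark so that the traffic
+ * shaper can tell flows apart: mark = class + base * TC_CLASS_MAX, or
+ * class + veid * 2 * TC_CLASS_MAX when no base is set (compatibility
+ * mode).
+ */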
+static inline void venet_acct_mark(struct venet_stat *stat,
+	       struct sk_buff *skb, int class)
+{
+#ifdef CONFIG_NETFILTER
+	if (stat->base == 0)	/* compatibility mode */
+		skb->mark = class + stat->veid*2*TC_CLASS_MAX;
+	else
+		skb->mark = class + stat->base*TC_CLASS_MAX;
+#endif
+}
+
+/* FIXME: hard header accounting */
+void venet_acct_classify_add_outgoing(struct venet_stat *stat, struct sk_buff *skb)
+{
+	int class;
+
+	class = acct_one_skb(stat, skb, ACCT_OUT, venet_acct_skb_size(skb));
+	/* Do not forget to mark skb for traffic shaper */
+	venet_acct_mark(stat, skb, class);
+}
+
+void venet_acct_classify_sub_outgoing(struct venet_stat *stat, struct sk_buff *skb)
+{
+	int class;
+
+	class = acct_one_skb(stat, skb, ACCT_OUT, -venet_acct_skb_size(skb));
+	/* Do not forget to mark skb for traffic shaper */
+	venet_acct_mark(stat, skb, class);
+}
+
+void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size)
+{
+	int class;
+
+	class = __venet_acct_classify(src_addr->key[3]);
+	__do_acct_one(stat->ipv4_stat, class, ACCT_IN, data_size);
+}
+
+void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size)
+{
+	int class;
+
+	class = __venet_acct_classify(dst_addr->key[3]);
+	__do_acct_one(stat->ipv4_stat, class, ACCT_OUT, data_size);
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * IOCTL interface for user
+ * ---------------------------------------------------------------------------
+ */
+
+static int venet_acct_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+	struct vzctl_tc_classes		tcl;
+	struct vzctl_tc_classes_v6	tcl_v6;
+	struct vzctl_tc_get_stat 	tcnt;
+	struct vzctl_tc_get_stat_list	tcsl;
+
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	err = -ENOTTY;
+	switch(cmd) {
+		case VZCTL_TC_MAX_CLASS:
+			err = TC_CLASS_MAX;
+			break;
+		case VZCTL_TC_CLASS_NUM:
+			err = class_info_len(0);
+			break;
+		case VZCTL_TC_SET_CLASS_TABLE:
+			err = -EFAULT;
+			if (copy_from_user(&tcl, (void *)arg, sizeof(tcl)))
+				break;
+			err = venet_acct_set_classes(tcl.info, tcl.length, 0);
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case VZCTL_TC_CLASS_NUM_V6:
+			err = class_info_len(1);
+			break;
+		case VZCTL_TC_SET_CLASS_TABLE_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcl_v6, (void *)arg, sizeof(tcl_v6)))
+				break;
+			err = venet_acct_set_classes(tcl_v6.info, tcl_v6.length, 1);
+			break;
+#endif
+		case VZCTL_TC_GET_CLASS_TABLE:
+			err = -EFAULT;
+			if (copy_from_user(&tcl, (void *)arg, sizeof(tcl)))
+				break;
+			err = venet_acct_get_classes(tcl.info, tcl.length, 0);
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case VZCTL_TC_GET_CLASS_TABLE_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcl_v6, (void *)arg, sizeof(tcl_v6)))
+				break;
+			err = venet_acct_get_classes(tcl_v6.info, tcl_v6.length, 1);
+			break;
+#endif
+
+		case VZCTL_TC_STAT_NUM:
+			err = stat_num;
+			break;
+		case VZCTL_TC_GET_STAT_LIST:
+			err = -EFAULT;
+			if (copy_from_user(&tcsl, (void *)arg, sizeof(tcsl)))
+				break;
+			err = venet_acct_get_stat_list(tcsl.list, tcsl.length);
+			break;
+		case VZCTL_TC_GET_STAT:
+		case VZCTL_TC_GET_STAT_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcnt, (void *)arg, sizeof(tcnt)))
+				break;
+			err = venet_acct_get_ve_stat(&tcnt, cmd == VZCTL_TC_GET_STAT_V6);
+			break;
+		case VZCTL_TC_DESTROY_STAT:
+			err = venet_acct_destroy_stat(arg);
+			break;
+		case VZCTL_TC_DESTROY_ALL_STAT:
+			err = 0;
+			venet_acct_destroy_all_stat();
+			break;
+		case VZCTL_TC_CLEAR_STAT:
+			err = venet_acct_clear_stat(arg);
+			break;
+		case VZCTL_TC_CLEAR_ALL_STAT:
+			err = venet_acct_clear_all_stat();
+			break;
+
+		case VZCTL_TC_GET_BASE:
+			err = venet_acct_get_base(arg);
+			break;
+		case VZCTL_TC_SET_BASE:
+		{
+			struct vzctl_tc_set_base tcb;
+			err = -EFAULT;
+			if (copy_from_user(&tcb, (void *)arg, sizeof(tcb)))
+				break;
+			err = venet_acct_set_base(tcb.veid, tcb.base);
+			break;
+		}
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_venet_acct_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	switch (cmd) {
+	case COMPAT_VZCTL_TC_GET_STAT: {
+		struct compat_vzctl_tc_get_stat cs;
+		struct vzctl_tc_get_stat __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(compat_ptr(cs.incoming), &s->incoming) ||
+		    put_user(compat_ptr(cs.outgoing), &s->outgoing) ||
+		    put_user(compat_ptr(cs.incoming_pkt), &s->incoming_pkt) ||
+		    put_user(compat_ptr(cs.outgoing_pkt), &s->outgoing_pkt) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file, VZCTL_TC_GET_STAT,
+				(unsigned long)s);
+		break;
+	}
+	case COMPAT_VZCTL_TC_GET_STAT_LIST: {
+		struct compat_vzctl_tc_get_stat_list cs;
+		struct vzctl_tc_get_stat_list __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(compat_ptr(cs.list), &s->list) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file, VZCTL_TC_GET_STAT_LIST,
+				(unsigned long)s);
+		break;
+	}
+	case COMPAT_VZCTL_TC_SET_CLASS_TABLE:
+	case COMPAT_VZCTL_TC_GET_CLASS_TABLE: {
+		struct compat_vzctl_tc_classes cs;
+		struct vzctl_tc_classes __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(compat_ptr(cs.info), &s->info) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file,
+				cmd == COMPAT_VZCTL_TC_GET_CLASS_TABLE ?
+					VZCTL_TC_GET_CLASS_TABLE :
+					VZCTL_TC_SET_CLASS_TABLE,
+				(unsigned long)s);
+		break;
+	}
+	default:
+		/* should be OK */
+		err = venet_acct_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo tc_ioctl_info = {
+	.type 		= VZTCCTLTYPE,
+	.ioctl		= venet_acct_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_venet_acct_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+
+/*
+ * ---------------------------------------------------------------------------
+ * /proc interface for user
+ * ---------------------------------------------------------------------------
+ */
+
+static char seq_buffer[1024];
+static DEFINE_SPINLOCK(seq_buffer_lock);
+
+static int stat_seq_show_common(struct seq_file *m, void *v, int v6)
+{
+	struct venet_stat *ptr = (struct venet_stat *)v;
+	struct acct_stat *acct = __choose_acct(ptr, v6);
+	int i;
+
+	spin_lock(&seq_buffer_lock);
+	*seq_buffer = 0;
+	for (i = 0; i < TC_CLASS_MAX; i++) {
+		u64 incoming = 0;
+		u64 outgoing = 0;
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct acct_stat *stat;
+
+			stat = per_cpu_ptr(acct, cpu);
+			incoming += stat->cnt[i][ACCT_IN].bytes;
+			outgoing += stat->cnt[i][ACCT_OUT].bytes;
+		}
+
+		sprintf(seq_buffer + strlen(seq_buffer), " %20Lu/%20Lu",
+				incoming, outgoing);
+	}
+
+	seq_printf(m, "%u %s\n", ptr->veid, seq_buffer);
+	spin_unlock(&seq_buffer_lock);
+	return 0;
+}
+
+static int stat_seq_show_v4(struct seq_file *m, void *v)
+{
+	return stat_seq_show_common(m, v, 0);
+}
+
+static int stat_seq_show_v6(struct seq_file *m, void *v)
+{
+	return stat_seq_show_common(m, v, 1);
+}
+
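+/*
+ * The seq_file iterator holds tc_lock for reading across the whole
+ * traversal: it is taken in stat_seq_start() and dropped in
+ * stat_seq_stop().
+ */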
+static void *stat_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct venet_stat *stat;
+	int hash;
+	loff_t l;
+
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+
+	read_lock(&tc_lock);
+	hash = 0;
+	stat = NULL;
+	stat = next_stat(&hash, stat);
+	for (l = *pos; stat && l > 0; l--)
+		stat = next_stat(&hash, stat);
+	return stat;
+}
+
+static void *stat_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct venet_stat *ptr = (struct venet_stat *)v;
+	int hash;
+
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+	hash = stat_hash(ptr->veid);
+	(*pos)++;
+	return next_stat(&hash, ptr);
+}
+
+static void stat_seq_stop(struct seq_file *m, void *v)
+{
+	read_unlock(&tc_lock);
+}
+
+static struct seq_operations stat_seq_op = {
+        .start = stat_seq_start,
+        .next  = stat_seq_next,
+        .stop  = stat_seq_stop,
+        .show  = stat_seq_show_v4,
+};
+
+static struct seq_operations stat_v6_seq_op = {
+        .start = stat_seq_start,
+        .next  = stat_seq_next,
+        .stop  = stat_seq_stop,
+        .show  = stat_seq_show_v6,
+};
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+        return seq_open(file, &stat_seq_op);
+}
+
+static int stat_v6_open(struct inode *inode, struct file *file)
+{
+        return seq_open(file, &stat_v6_seq_op);
+}
+
+static struct file_operations proc_venetstat_operations = {
+        .open		= stat_open,
+        .read		= seq_read,
+        .llseek		= seq_lseek,
+        .release	= seq_release,
+};
+
+static struct file_operations proc_venetstat_v6_operations = {
+        .open		= stat_v6_open,
+        .read		= seq_read,
+        .llseek		= seq_lseek,
+        .release	= seq_release,
+};
+
+static int __net_init net_init_acct(struct net *net)
+{
+	struct ve_struct *ve = net->owner_ve;
+
+	if (!ve->stat) {
+		ve->stat = venet_acct_find_create_stat(ve->veid);
+		if (!ve->stat)
+			return -ENOMEM;
+	} else
+		venet_acct_get_stat(ve->stat);
+
+	return 0;
+}
+
+static void __net_exit net_exit_acct(struct net *net)
+{
+	struct ve_struct *ve = net->owner_ve;
+
+	if (ve->stat) {
+		venet_acct_put_stat(ve->stat);
+		if (ve->ve_netns == net)
+			ve->stat = NULL;
+	}
+}
+
+static struct pernet_operations __net_initdata net_acct_ops = {
+	.init	= net_init_acct,
+	.exit	= net_exit_acct,
+};
+
+int __init venetstat_init(void)
+{
+	int i, ret;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *de;
+#endif
+
+	for (i = 0; i < STAT_HASH_LEN; i++)
+		INIT_LIST_HEAD(stat_hash_list + i);
+
+	ret = register_pernet_subsys(&net_acct_ops);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_PROC_FS
+	de = proc_create("venetstat", S_IFREG|S_IRUSR, proc_vz_dir,
+			&proc_venetstat_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "VENET: can't make venetstat proc entry\n");
+
+	de = proc_create("venetstat_v6", S_IFREG|S_IRUSR, proc_vz_dir,
+			&proc_venetstat_v6_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "VENET: can't make venetstat_v6 proc entry\n");
+
+#endif
+	vzioctl_register(&tc_ioctl_info);
+	return 0;
+}
+
+void __exit venetstat_exit(void)
+{
+	unregister_pernet_subsys(&net_acct_ops);
+	vzioctl_unregister(&tc_ioctl_info);
+	venet_acct_destroy_all_stat();
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("venetstat_v6", proc_vz_dir);
+	remove_proc_entry("venetstat", proc_vz_dir);
+#endif
+	vfree(info_v4);
+	vfree(info_v6);
+}
+
+module_init(venetstat_init);
+module_exit(venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+
+EXPORT_SYMBOL(venet_acct_find_create_stat);
+EXPORT_SYMBOL(venet_acct_find_stat);
+EXPORT_SYMBOL(venet_acct_put_stat);
+EXPORT_SYMBOL(venet_acct_classify);
+EXPORT_SYMBOL(venet_acct_classify_add_outgoing);
+EXPORT_SYMBOL(venet_acct_classify_sub_outgoing);
+EXPORT_SYMBOL(venet_acct_classify_add_incoming);
+EXPORT_SYMBOL(venet_acct_classify_add_incoming_plain);
+EXPORT_SYMBOL(venet_acct_classify_add_outgoing_plain);
--- /dev/null
+++ b/kernel/ve/vzstat.c
@@ -0,0 +1,763 @@
+/*
+ *  kernel/ve/vzstat.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/interrupt.h>
+#include <linux/mmzone.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/vzstat.h>
+
+/* local variables */
+static struct task_struct *vzstat_thread_tsk;
+
+static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+	"alocatomic:",
+	"aloclow:",
+	"alochigh:",
+	"aloclowmp:",
+	"alochighmp:"
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * Kernel protection: kernel code checksumming
+ * ------------------------------------------------------------------------
+ */
+#ifdef CONFIG_VE_KERNEL_CSUM
+
+#ifdef __x86_64__
+/* skip init_level4_pgt */
+#define KERNEL_PROT_START	((unsigned long)(&_stext) + 0x2000)
+#else
+#define KERNEL_PROT_START	((unsigned long)(&_stext))
+#endif
+#define KERNEL_PROT_END		((unsigned long)(&_etext))
+#define CSALIGN(value, size)	((value + (size - 1)) & ~(size - 1))
+
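+/*
+ * Recompute two independent checksums over the kernel text no more than
+ * once a minute and compare them with the values captured on the first
+ * run; any mismatch is reported and counted in kernel_text_csum_broken.
+ */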
+void kernel_text_csum_check(void)
+{
+#define CSUM_NR	2
+	static unsigned long text_csum[CSUM_NR], text_csumed, csum_time;
+	unsigned long start, end, ptr, csum[CSUM_NR];
+	int i;
+
+	if (jiffies - csum_time < 60*HZ)
+		return;
+
+	csum_time = jiffies;
+	for (i = 0; i < CSUM_NR; i++) csum[i] = 0;
+	start = CSALIGN(KERNEL_PROT_START, sizeof(csum[0]));
+	end = CSALIGN(KERNEL_PROT_END, sizeof(csum[0]));
+
+	for (ptr = start; ptr < end; ptr += sizeof(csum[0])) {
+		unsigned long i = *(unsigned long*)ptr;
+		csum[0] = csum[0] + i;
+		csum[1] = (csum[1] ^ i) + ((csum[1] << 1) + (csum[1] >> 31));
+		cond_resched();
+	}
+
+	if (!text_csumed) {
+		for (i = 0; i < CSUM_NR; i++) text_csum[i] = csum[i];
+		text_csumed = 1;
+		return;
+	}
+	for (i = 0; i < CSUM_NR; i++)
+		if (text_csum[i] != csum[i]) {
+			printk(KERN_EMERG "Kernel checksum %d changed "
+				"(csum%d=%08lx, onboot csum%d=%08lx)\n",
+				i, i, csum[i], i, text_csum[i]);
+			kernel_text_csum_broken++;
+		}
+}
+
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * Latency update and show functions
+ * ------------------------------------------------------------------------
+ */
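+
+/*
+ * How long a runnable task has been waiting for a CPU: the delta between
+ * now and se.statistics->wait_start, or 0 if the stamp is unset or looks
+ * bogus.
+ */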
+static inline u64 get_task_lat(struct task_struct *t, u64 now)
+{
+	u64 wstamp;
+
+	wstamp = t->se.statistics->wait_start;
+	if (wstamp && now > wstamp && now - wstamp < (1ULL << 63))
+		return now - wstamp;
+	return 0;
+}
+
+static void update_max_sched_latency_snap(void)
+{
+	struct task_struct *t, *g;
+	u64 now, max, tmp;
+	struct kstat_lat_pcpu_struct *st;
+
+	max = 0;
+	qread_lock(&tasklist_lock);
+	now = ktime_to_ns(ktime_get());
+	do_each_thread(g, t) {
+		if (likely(t->state != TASK_RUNNING))
+			continue;
+
+		tmp = get_task_lat(t, now);
+		if (max < tmp)
+			max = tmp;
+		st = &t->task_ve->sched_lat_ve;
+		if (st->max_snap < tmp)
+			st->max_snap = tmp;
+	} while_each_thread(g, t);
+	qread_unlock(&tasklist_lock);
+	kstat_glob.sched_lat.max_snap = max;
+}
+
+static void update_schedule_latency(void)
+{
+	/*
+	 * global scheduling latency is updated in schedule() and
+	 * update_max_sched_latency_snap(). The latter function guarantees
+	 * that tasks which do not receive CPU time are still accounted in
+	 * scheduling latency
+	 */
+	update_max_sched_latency_snap();
+
+	spin_lock_irq(&kstat_glb_lock);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.sched_lat);
+	spin_unlock_irq(&kstat_glb_lock);
+	/* Note: per-VE latency is updated in update_venum() */
+}
+
+static void update_alloc_latency(void)
+{
+	int i;
+
+	spin_lock_irq(&kstat_glb_lock);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		KSTAT_LAT_PCPU_UPDATE(&kstat_glob.alloc_lat[i]);
+	KSTAT_LAT_UPDATE(&kstat_glob.swap_in);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.page_in);
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void lastlat_seq_show(struct seq_file *m,
+		const char *name,
+		struct kstat_lat_snap_struct *snap)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20lu\n", name,
+			snap->maxlat, snap->totlat, snap->count);
+}
+
+static void avglat_seq_show(struct seq_file *m,
+		const char *name,
+		u64 *avg)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20Lu\n", name,
+			avg[0], avg[1], avg[2]);
+}
+
+static int latency_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	if (!v)
+		return 0;
+
+	seq_puts(m, "Version: 2.5\n");
+
+	seq_puts(m, "\nLatencies:\n");
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Lat", "Total_lat", "Calls");
+	lastlat_seq_show(m, "scheduling:", &kstat_glob.sched_lat.last);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		lastlat_seq_show(m, alloc_descr[i],
+				&kstat_glob.alloc_lat[i].last);
+	lastlat_seq_show(m, "swap_in:", &kstat_glob.swap_in.last);
+	lastlat_seq_show(m, "page_in:", &kstat_glob.page_in.last);
+
+	seq_puts(m, "\nAverages:\n");
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Avg1", "Avg5", "Avg15");
+	avglat_seq_show(m, "scheduling:", kstat_glob.sched_lat.avg);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		avglat_seq_show(m, alloc_descr[i],
+				kstat_glob.alloc_lat[i].avg);
+	avglat_seq_show(m, "swap_in:", kstat_glob.swap_in.avg);
+	avglat_seq_show(m, "page_in:", kstat_glob.page_in.avg);
+
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * General system info: processes, memory, VE
+ * ------------------------------------------------------------------------
+ */
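+
+/*
+ * Refresh the per-zone load averages: free, active and inactive page
+ * counts are summed over all online nodes and fed into CALC_LOAD() with
+ * the standard 1/5/15 minute decay factors.
+ */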
+static void update_memory(void)
+{
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned type;
+	unsigned long nr_free, nr_active, nr_inactive;
+	unsigned present;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+		nr_free = 0;
+		nr_active = 0;
+		nr_inactive = 0;
+
+		for_each_online_pgdat (pgdat) {
+			zone = pgdat->node_zones + type;
+			if (!zone->present_pages)
+				continue;
+
+			present++;
+			nr_free += zone_page_state(zone, NR_FREE_PAGES);
+			nr_active +=  zone_page_state(zone, NR_ACTIVE_ANON) +
+				zone_page_state(zone, NR_ACTIVE_FILE);
+			nr_inactive += zone_page_state(zone, NR_INACTIVE_ANON) +
+				zone_page_state(zone, NR_INACTIVE_FILE);
+		}
+
+		if (!present)
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+
+		CALC_LOAD(zone_avg->free_pages_avg[0], EXP_1, nr_free);
+		CALC_LOAD(zone_avg->free_pages_avg[1], EXP_5, nr_free);
+		CALC_LOAD(zone_avg->free_pages_avg[2], EXP_15, nr_free);
+
+		CALC_LOAD(zone_avg->nr_active_avg[0], EXP_1, nr_active);
+		CALC_LOAD(zone_avg->nr_active_avg[1], EXP_5, nr_active);
+		CALC_LOAD(zone_avg->nr_active_avg[2], EXP_15, nr_active);
+
+		CALC_LOAD(zone_avg->nr_inactive_avg[0], EXP_1, nr_inactive);
+		CALC_LOAD(zone_avg->nr_inactive_avg[1], EXP_5, nr_inactive);
+		CALC_LOAD(zone_avg->nr_inactive_avg[2], EXP_15, nr_inactive);
+	}
+}
+
+static void mem_avg_show(struct seq_file *m, void *v)
+{
+	unsigned type;
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned present;
+	int zone_id;
+
+	zone_id = 0;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+
+		for_each_online_pgdat (pgdat) {
+			zone = pgdat->node_zones + type;
+			if (zone->present_pages) {
+				present++;
+				break;
+			}
+		}
+		if (!present)
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+		seq_printf(m, "ZONE%u %s averages: "
+			"active %lu %lu %lu, "
+			"inactive %lu %lu %lu, "
+			"free %lu %lu %lu\n",
+			zone_id++,
+			zone->name,
+			zone_avg->nr_active_avg[0],
+			zone_avg->nr_active_avg[1],
+			zone_avg->nr_active_avg[2],
+			zone_avg->nr_inactive_avg[0],
+			zone_avg->nr_inactive_avg[1],
+			zone_avg->nr_inactive_avg[2],
+			zone_avg->free_pages_avg[0],
+			zone_avg->free_pages_avg[1],
+			zone_avg->free_pages_avg[2]);
+	}
+}
+
+static void update_venum(void)
+{
+	struct ve_struct *ve;
+
+	mutex_lock(&ve_list_lock);
+	spin_lock_irq(&kstat_glb_lock);
+	for_each_ve(ve)
+		/* max_snap is already set in update_schedule_latency */
+		KSTAT_LAT_PCPU_UPDATE(&ve->sched_lat_ve);
+	spin_unlock_irq(&kstat_glb_lock);
+	mutex_unlock(&ve_list_lock);
+}
+
+static void task_counts_seq_show(struct seq_file *m, void *v)
+{
+	unsigned long _nr_running, _nr_sleeping, _nr_unint,
+				_nr_zombie, _nr_dead, _nr_stopped;
+	unsigned long avg[3];
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = nr_stopped();
+
+	spin_lock_irq(&kstat_glb_lock);
+	memcpy(avg, kstat_glob.nr_unint_avg, sizeof(avg));
+	spin_unlock_irq(&kstat_glb_lock);
+
+	seq_printf(m, "VEs: %d\n", nr_ve);
+	seq_printf(m, "Processes: R %lu, S %lu, D %lu, "
+		"Z %lu, T %lu, X %lu\n",
+			_nr_running,
+			_nr_sleeping,
+			_nr_unint,
+			_nr_zombie,
+			_nr_stopped,
+			_nr_dead);
+	seq_printf(m, "Processes avg: unint %lu %lu %lu\n",
+			avg[0] >> FSHIFT, avg[1] >> FSHIFT, avg[2] >> FSHIFT);
+}
+
+static void cycles_per_jiffy_show(struct seq_file *m, void *v)
+{
+	/* Now all time slices are measured in nanoseconds */
+	seq_printf(m, "cycles_per_jiffy: %llu\n", ((u64) jiffies_to_usecs(1)) * 1000);
+}
+
+static void jiffies_per_second_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "jiffies_per_second: %u\n", HZ);
+}
+
+static void kernel_text_csum_seq_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "kernel_text_csum_broken: %d\n", 0);
+}
+
+static void swap_cache_seq_show(struct seq_file *m, void *v)
+{
+	struct swap_cache_info_struct *swpcache;
+	extern struct swap_cache_info_struct swap_cache_info;
+
+	swpcache = &swap_cache_info;
+	seq_printf(m, "Swap cache: add %lu, del %lu, find %lu/%lu\n",
+			swpcache->add_total,
+			swpcache->del_total,
+			swpcache->find_success,
+			swpcache->find_total);
+}
+
+/*
+ * Declare a special structure to store the summarized statistics.  'struct
+ * zone' is not used here because of its tremendous size.
+ */
+struct zonestat {
+	const char *name;
+	unsigned long free_pages;
+	unsigned long nr_free[MAX_ORDER];
+	unsigned long pages_min;
+	unsigned long pages_low;
+	unsigned long pages_high;
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long present_pages;
+};
+
+/*
+ * Show information about all memory zones.
+ */
+static void mem_free_areas_show_zonestat(struct seq_file *m,
+						struct zonestat *zstat)
+{
+	unsigned int order;
+	unsigned type;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		struct zonestat *zone = &zstat[type];
+
+		if (!zone->name)
+			continue;
+
+		/* Skip empty zones */
+		if (!zone->present_pages)
+			continue;
+
+		seq_printf(m, "%s free %lu (", zone->name, zone->free_pages);
+		for (order = 0; order < MAX_ORDER; order++)
+			seq_printf(m, "%lu*%lu ", zone->nr_free[order],
+								1UL << order);
+
+		seq_printf(m, ") min %lu low %lu high %lu "
+			"active %lu inactive %lu size %lu\n",
+				zone->pages_min,
+				zone->pages_low,
+				zone->pages_high,
+				zone->nr_active,
+				zone->nr_inactive,
+				zone->present_pages);
+	}
+}
+
+/*
+ * Scan all registered pgdat's (i.e. memory nodes) and summarize
+ * values for identical zones.
+ */
+static void mem_free_areas_show(struct seq_file *m, void *v)
+{
+	pg_data_t *pgdat;
+	struct zonestat zones[MAX_NR_ZONES];
+	struct zonestat *zdst;
+	struct zone *zsrc;
+	int type, order;
+
+	memset(zones, 0, sizeof(zones));
+
+	for_each_online_pgdat (pgdat) {
+		for (type = 0; type < MAX_NR_ZONES; type++) {
+			unsigned long flags;
+
+			zdst = &zones[type];
+			zsrc = pgdat->node_zones + type;
+			if (!zsrc || !zsrc->name)
+				continue;
+
+			if (!zdst->name)
+				zdst->name = zsrc->name;
+			else if (strcmp(zsrc->name, zdst->name))
+				/* This shouldn't happen! */
+				printk("Warning: names mismatch for "
+					"zone %d: %s != %s\n",
+					type, zsrc->name, zdst->name);
+
+			spin_lock_irqsave(&zsrc->lock, flags);
+			for (order = 0; order < MAX_ORDER; order++)
+				zdst->nr_free[order] += zsrc->free_area[order].nr_free;
+			spin_unlock_irqrestore(&zsrc->lock, flags);
+
+			zdst->nr_active     += zone_page_state(zsrc, NR_ACTIVE_ANON) +
+						zone_page_state(zsrc, NR_ACTIVE_FILE);
+			zdst->nr_inactive   += zone_page_state(zsrc, NR_INACTIVE_ANON) +
+						zone_page_state(zsrc, NR_INACTIVE_FILE);
+			zdst->pages_min     += min_wmark_pages(zsrc);
+			zdst->pages_low     += low_wmark_pages(zsrc);
+			zdst->pages_high    += high_wmark_pages(zsrc);
+			zdst->present_pages += zsrc->present_pages;
+			zdst->free_pages    += zone_page_state(zsrc, NR_FREE_PAGES);
+		}
+	}
+	mem_free_areas_show_zonestat(m, zones);
+}
+
+static void mem_fails_show(struct seq_file *m, void *v)
+{
+	int i, cpu;
+	unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
+
+	memset(alloc_fails, 0, sizeof(alloc_fails));
+	for_each_online_cpu(cpu)
+		for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+			alloc_fails[i] += kstat_glob.alloc_fails[cpu][i];
+
+	seq_puts(m, "\nMemory fails:\n");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		seq_printf(m, "%-11s %20lu\n", alloc_descr[i],
+				alloc_fails[i]);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Memory management profiling
+ * ------------------------------------------------------------------------
+ */
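+
+/*
+ * Fold the per-cpu performance snapshots into p->last: each cpu's counters
+ * are copied consistently under its seqcount, maxima are taken over the
+ * cpus (and the per-cpu maxima reset), totals and counts are summed.
+ */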
+static void KSTAT_PERF_UPDATE(struct kstat_perf_pcpu_struct *p)
+{
+	unsigned i, cpu;
+	struct kstat_perf_pcpu_snap_struct snap, *cur;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+
+		if (p->last.wall_maxdur < snap.wall_maxdur)
+			p->last.wall_maxdur = snap.wall_maxdur;
+		if (p->last.cpu_maxdur < snap.cpu_maxdur)
+			p->last.cpu_maxdur = snap.cpu_maxdur;
+		cur->wall_maxdur = cur->cpu_maxdur = 0;
+
+		p->last.count += snap.count;
+		p->last.wall_tottime += snap.wall_tottime;
+		p->last.cpu_tottime += snap.cpu_tottime;
+	}
+}
+
+static void update_mmperf(void)
+{
+	KSTAT_PERF_UPDATE(&kstat_glob.ttfp);
+	KSTAT_PERF_UPDATE(&kstat_glob.cache_reap);
+	KSTAT_PERF_UPDATE(&kstat_glob.refill_inact);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_icache);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_dcache);
+}
+
+static void perf_seq_show(struct seq_file *m,
+		const char *name,
+		struct kstat_perf_pcpu_struct *p)
+{
+	seq_printf(m, "%-14s %10lu %20Lu %20Lu %20Lu %20Lu\n",
+			name,
+			p->last.count,
+			p->last.cpu_maxdur,
+			p->last.wall_maxdur,
+			p->last.cpu_tottime,
+			p->last.wall_tottime);
+}
+
+static int mmperf_seq_show(struct seq_file *m, void *v)
+{
+	if (!v)
+		return 0;
+	seq_puts(m, "Version: 2.5.1\n");
+	seq_printf(m, "%-14s %10s %20s %20s %20s %20s\n",
+			"Type",
+			"Count",
+			"CPU_max_dur",
+			"Wall_max_dur",
+			"CPU_tot_time",
+			"Wall_tot_time");
+	perf_seq_show(m, "ttfp:", &kstat_glob.ttfp);
+	perf_seq_show(m, "cache_reap:", &kstat_glob.cache_reap);
+	perf_seq_show(m, "refill_inact:", &kstat_glob.refill_inact);
+	perf_seq_show(m, "shrink_icache:", &kstat_glob.shrink_icache);
+	perf_seq_show(m, "shrink_dcache:", &kstat_glob.shrink_dcache);
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Main loop
+ * ------------------------------------------------------------------------
+ */
+static int vzstat_mon_loop(void* data)
+{
+	while (1) {
+		try_to_freeze();
+#ifdef CONFIG_VE_KERNEL_CSUM
+		kernel_text_csum_check();
+#endif
+		update_alloc_latency();
+		update_schedule_latency();
+		update_memory();
+		update_venum();
+		update_mmperf();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+		schedule_timeout(LOAD_FREQ);
+	}
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * default sequential files methods
+ * ------------------------------------------------------------------------
+ */
+static void *empty_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos == 0)
+		return (void*)1;
+	else
+		return NULL;
+}
+
+static void *empty_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static void empty_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/latency sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static struct seq_operations latency_seq_op = {
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= latency_seq_show,
+};
+
+static int latency_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &latency_seq_op);
+}
+
+static struct file_operations proc_latency_operations = {
+	.open = latency_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/stats sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static int stats_seq_show(struct seq_file *m, void *v)
+{
+	if (!v)
+		return 0;
+	seq_puts(m, "Version: 2.6\n");
+	cycles_per_jiffy_show(m, v);
+	jiffies_per_second_show(m, v);
+	seq_puts(m, "\nLoad info:\n");
+	task_counts_seq_show(m, v);
+	seq_puts(m, "\nMemory info:\n");
+	kernel_text_csum_seq_show(m, v);
+	swap_cache_seq_show(m, v);
+	mem_free_areas_show(m, v);
+	mem_avg_show(m, v);
+	mem_fails_show(m, v);
+	return 0;
+}
+
+static struct seq_operations stats_seq_op = {
+	start:	empty_seq_start,
+	next:	empty_seq_next,
+	stop:	empty_seq_stop,
+	show:	stats_seq_show
+};
+
+static int stats_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &stats_seq_op);
+}
+
+static struct file_operations proc_stats_operations = {
+	.open = stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/mmperf sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static struct seq_operations mmperf_seq_op = {
+	start:	empty_seq_start,
+	next:	empty_seq_next,
+	stop:	empty_seq_stop,
+	show:	mmperf_seq_show
+};
+
+static int mmperf_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &mmperf_seq_op);
+}
+
+static struct file_operations proc_mmperf_operations = {
+	.open = mmperf_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * module init/exit code
+ * ------------------------------------------------------------------------
+ */
+
+int __init vzstat_mon_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("latency", S_IRUGO, proc_vz_dir, &proc_latency_operations);
+	if (entry == NULL) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_lat;
+	}
+
+	entry = proc_create("stats", S_IRUGO, proc_vz_dir, &proc_stats_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_stat;
+	}
+
+	entry = proc_create("mmperf", S_IRUGO, proc_vz_dir, &proc_mmperf_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_perf;
+	}
+
+	vzstat_thread_tsk = kthread_run(vzstat_mon_loop, NULL, "vzstat");
+	if (IS_ERR(vzstat_thread_tsk))
+		goto fail_thread;
+
+	printk(KERN_INFO "VZSTAT: initialized successfully\n");
+
+	return 0;
+
+fail_thread:
+	remove_proc_entry("mmperf", proc_vz_dir);
+fail_perf:
+	remove_proc_entry("stats", proc_vz_dir);
+fail_stat:
+	remove_proc_entry("latency", proc_vz_dir);
+fail_lat:
+	return -EBUSY;
+}
+
+void __exit vzstat_mon_exit(void)
+{
+	kthread_stop(vzstat_thread_tsk);
+
+	remove_proc_entry("mmperf", proc_vz_dir);
+	remove_proc_entry("stats", proc_vz_dir);
+	remove_proc_entry("latency", proc_vz_dir);
+}
+
+module_init(vzstat_mon_init);
+module_exit(vzstat_mon_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vzstat_core.c
@@ -0,0 +1,122 @@
+/*
+ *  kernel/ve/vzstat_core.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/vzstat.h>
+
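+/*
+ * Account one completed operation in the per-cpu perf counters.
+ * The seqcount write section lets readers take consistent snapshots.
+ */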
+void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time, u64 cpu_time)
+{
+	struct kstat_perf_pcpu_snap_struct *cur = get_cpu_ptr(ptr->cur);
+
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->wall_maxdur < real_time)
+		cur->wall_maxdur = real_time;
+	cur->wall_tottime += real_time;
+	if (cur->cpu_maxdur < cpu_time)
+		cur->cpu_maxdur = cpu_time;
+	cur->cpu_tottime += cpu_time;
+	write_seqcount_end(&cur->lock);
+	put_cpu_ptr(cur);
+}
+
+/*
+ * Add another statistics reading.
+ * Serialization is the caller's responsibility.
+ */
+void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
+		u64 dur)
+{
+	p->cur.count++;
+	if (p->cur.maxlat < dur)
+		p->cur.maxlat = dur;
+	p->cur.totlat += dur;
+}
+
+/*
+ * Must be called with interrupts disabled, so that no locks or seqcounts
+ * are taken under the write-locked seqcount, avoiding this 3-way deadlock:
+ *
+ * timer interrupt:
+ *	write_seqlock(&xtime_lock);
+ *	 spin_lock_irqsave(&kstat_glb_lock);
+ *
+ * update_schedule_latency():
+ *	spin_lock_irq(&kstat_glb_lock);
+ *	 read_seqcount_begin(&cur->lock)
+ *
+ * some-interrupt during KSTAT_LAT_PCPU_ADD()
+ *   KSTAT_LAT_PCPU_ADD()
+ *    write_seqcount_begin(&cur->lock);
+ *     <interrupt>
+ *      ktime_get()
+ *       read_seqcount_begin(&xtime_lock);
+ */
+void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
+		u64 dur)
+{
+	struct kstat_lat_pcpu_snap_struct *cur;
+
+	cur = per_cpu_ptr(p->cur, cpu);
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->maxlat < dur)
+		cur->maxlat = dur;
+	cur->totlat += dur;
+	write_seqcount_end(&cur->lock);
+}
+
+/*
+ * Move the current statistics to last and reset the current peak.
+ * Serialization is the caller's responsibility.
+ */
+void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
+{
+	u64 m;
+	memcpy(&p->last, &p->cur, sizeof(p->last));
+	p->cur.maxlat = 0;
+	m = p->last.maxlat;
+	CALC_LOAD(p->avg[0], EXP_1, m);
+	CALC_LOAD(p->avg[1], EXP_5, m);
+	CALC_LOAD(p->avg[2], EXP_15, m);
+}
+EXPORT_SYMBOL(KSTAT_LAT_UPDATE);
+
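+/*
+ * Fold the per-cpu latency snapshots into ->last, reset the per-cpu
+ * peaks and update the 1/5/15 load-style averages from the peak value.
+ */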
+void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
+{
+	unsigned i, cpu;
+	struct kstat_lat_pcpu_snap_struct snap, *cur;
+	u64 m;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+		/*
+		 * The read above and this update of maxlat are not atomic,
+		 * but this is OK, since it happens rarely and losing
+		 * a couple of peaks is not essential. xemul
+		 */
+		cur->maxlat = 0;
+
+		p->last.count += snap.count;
+		p->last.totlat += snap.totlat;
+		if (p->last.maxlat < snap.maxlat)
+			p->last.maxlat = snap.maxlat;
+	}
+
+	m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
+	CALC_LOAD(p->avg[0], EXP_1, m);
+	CALC_LOAD(p->avg[1], EXP_5, m);
+	CALC_LOAD(p->avg[2], EXP_15, m);
+	/* reset max_snap to calculate it correctly next time */
+	p->max_snap = 0;
+}
+EXPORT_SYMBOL(KSTAT_LAT_PCPU_UPDATE);
--- /dev/null
+++ b/kernel/ve/vzwdog.c
@@ -0,0 +1,353 @@
+/*
+ *  kernel/ve/vzwdog.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/kobject.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
+#include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/* Stuff regarding the kernel thread that polls VE validity */
+static int sleep_timeout = 60;
+static struct task_struct *wdog_thread_tsk;
+
+static struct file *intr_file;
+static char page[PAGE_SIZE];
+
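+/*
+ * Print the lines of /proc/interrupts, skipping interrupts whose
+ * counters are all zero.
+ */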
+static void parse_irq_list(int len)
+{
+	int i, k, skip;
+	for (i = 0; i < len; ) {
+		k = i;
+		while (i < len && page[i] != '\n' && page[i] != ':')
+			i++;
+		skip = 0;
+		if (i < len && page[i] != '\n') {
+			i++; /* skip ':' */
+			while (i < len && (page[i] == ' ' || page[i] == '0'))
+				i++;
+			skip = (i < len && (page[i] < '0' || page[i] > '9'));
+			while (i < len && page[i] != '\n')
+				i++;
+		}
+		if (!skip)
+			printk("%.*s\n", i - k, page + k);
+		if (i < len)
+			i++; /* skip '\n' */
+	}
+}
+
+static void show_irq_list(void)
+{
+	mm_segment_t fs;
+	int r;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	vfs_llseek(intr_file, 0, 0);
+	r = vfs_read(intr_file, (void __user *)page, sizeof(page),
+			&intr_file->f_pos);
+	set_fs(fs);
+
+	if (r > 0)
+		parse_irq_list(r);
+}
+
+static u64 max_sched_lat;
+static u64 max_alloc_lat[KSTAT_ALLOCSTAT_NR];
+
+static void update_max_alloc_latency(void)
+{
+	int i;
+
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		max_alloc_lat[i] = max(max_alloc_lat[i],
+				kstat_glob.alloc_lat[i].last.maxlat);
+}
+
+static void update_max_schedule_latency(void)
+{
+	max_sched_lat = max(max_sched_lat, kstat_glob.sched_lat.last.maxlat);
+}
+
+static void update_max_latencies(void)
+{
+	spin_lock_irq(&kstat_glb_lock);
+	update_max_alloc_latency();
+	update_max_schedule_latency();
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void reset_max_latencies(void)
+{
+	max_sched_lat = 0;
+	memset(max_alloc_lat, 0, sizeof(max_alloc_lat));
+}
+
+static void show_alloc_latency(void)
+{
+	static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+		"A0",
+		"L0",
+		"H0",
+		"L1",
+		"H1"
+	};
+	int i;
+
+	printk("lat: ");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		struct kstat_lat_pcpu_struct *p;
+		u64 maxlat, avg0, avg1, avg2;
+
+		p = &kstat_glob.alloc_lat[i];
+		spin_lock_irq(&kstat_glb_lock);
+		maxlat = p->last.maxlat;
+		avg0 = p->avg[0];
+		avg1 = p->avg[1];
+		avg2 = p->avg[2];
+		spin_unlock_irq(&kstat_glb_lock);
+
+		printk("%s %Lu %Lu (%Lu %Lu %Lu)",
+				alloc_descr[i],
+				(unsigned long long)max_alloc_lat[i],
+				(unsigned long long)maxlat,
+				(unsigned long long)avg0,
+				(unsigned long long)avg1,
+				(unsigned long long)avg2);
+	}
+	printk("\n");
+}
+
+static void show_schedule_latency(void)
+{
+	struct kstat_lat_pcpu_struct *p;
+	cycles_t maxlat, totlat, avg0, avg1, avg2;
+	unsigned long count;
+
+	p = &kstat_glob.sched_lat;
+	spin_lock_irq(&kstat_glb_lock);
+	maxlat = p->last.maxlat;
+	totlat = p->last.totlat;
+	count = p->last.count;
+	avg0 = p->avg[0];
+	avg1 = p->avg[1];
+	avg2 = p->avg[2];
+	spin_unlock_irq(&kstat_glb_lock);
+
+	printk("sched lat: %Lu/%Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
+			(unsigned long long)max_sched_lat,
+			(unsigned long long)maxlat,
+			(unsigned long long)totlat,
+			count,
+			(unsigned long long)avg0,
+			(unsigned long long)avg1,
+			(unsigned long long)avg2);
+}
+
+static void show_header(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	preempt_disable();
+	printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
+			tv.tv_sec, (long)tv.tv_usec,
+			(unsigned long long)get_jiffies_64(),
+			smp_processor_id());
+	printk("*** jiffies_per_second %u ***\n", HZ);
+	preempt_enable();
+}
+
+static void show_pgdatinfo(void)
+{
+	pg_data_t *pgdat;
+
+	printk("pgdat:");
+	for_each_online_pgdat(pgdat) {
+		printk(" %d: %lu,%lu,%lu",
+				pgdat->node_id,
+				pgdat->node_start_pfn,
+				pgdat->node_present_pages,
+				pgdat->node_spanned_pages);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+		printk(",%p", pgdat->node_mem_map);
+#endif
+	}
+	printk("\n");
+}
+
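+/* Dump I/O counters for every partition of @gp in a diskstats-like format. */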
+static int show_partitions_io(struct gendisk *gp)
+{
+	struct disk_part_iter piter;
+	struct hd_struct *hd;
+	char buf[BDEVNAME_SIZE];
+	int cpu;
+
+	/*
+	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
+		seq_puts(seqf,	"major minor name"
+				"     rio rmerge rsect ruse wio wmerge "
+				"wsect wuse running use aveq"
+				"\n\n");
+	*/
+ 
+	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
+	while ((hd = disk_part_iter_next(&piter))) {
+		cpu = part_stat_lock();
+		part_round_stats(cpu, hd);
+		part_stat_unlock();
+		printk("%4d %7d %s %lu %lu %llu "
+			   "%u %lu %lu %llu %u %u %u %u\n",
+			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
+			   disk_name(gp, hd->partno, buf),
+			   part_stat_read(hd, ios[0]),
+			   part_stat_read(hd, merges[0]),
+			   (unsigned long long)part_stat_read(hd, sectors[0]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[0])),
+			   part_stat_read(hd, ios[1]),
+			   part_stat_read(hd, merges[1]),
+			   (unsigned long long)part_stat_read(hd, sectors[1]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+			   part_in_flight(hd),
+			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
+			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+			);
+	}
+	disk_part_iter_exit(&piter);
+ 
+	return 0;
+}
+
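+/* Per-device callback: print I/O stats, skipping loop%d and ram%d devices. */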
+static int show_one_disk_io(struct device *dev, void *x)
+{
+	char *name;
+	char buf[BDEVNAME_SIZE];
+	struct gendisk *gd;
+
+	if (dev->type != &disk_type)
+		return 0;
+
+	gd = dev_to_disk(dev);
+
+	name = disk_name(gd, 0, buf);
+	if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
+			isdigit(name[4]))
+		return 0;
+
+	if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
+			isdigit(name[3]))
+		return 0;
+
+	show_partitions_io(gd);
+
+	return 0;
+}
+
+static void show_diskio(void)
+{
+	printk("disk_io: ");
+	class_for_each_device(&block_class, NULL, NULL, show_one_disk_io);
+	printk("\n");
+}
+
+static void show_nrprocs(void)
+{
+	unsigned long _nr_running, _nr_sleeping,
+			_nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = nr_stopped();
+
+	printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
+		"Z %lu, X %lu, T %lu (tot %d)\n",
+		nr_ve,	_nr_running, _nr_sleeping, _nr_unint,
+		_nr_zombie, _nr_dead, _nr_stopped, nr_threads);
+}
+
+static void wdog_print(void)
+{
+	show_header();
+	show_irq_list();
+	show_pgdatinfo();
+	show_mem(SHOW_MEM_FILTER_NODES);
+	show_diskio();
+	show_schedule_latency();
+	show_alloc_latency();
+	show_nrprocs();
+}
+
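+/*
+ * Main watchdog loop: sample peak latencies every LOAD_FREQ ticks and
+ * print a full report every sleep_timeout seconds.
+ */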
+static int wdog_loop(void* data)
+{
+	unsigned long next_print;
+	long timeout;
+
+	next_print = jiffies;
+	while (1) {
+		update_max_latencies();
+		if (time_is_before_eq_jiffies(next_print)) {
+			wdog_print();
+			reset_max_latencies();
+			next_print = jiffies + sleep_timeout * HZ;
+		}
+		try_to_freeze();
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+		timeout = clamp_t(long, next_print - jiffies, 0, LOAD_FREQ);
+		schedule_timeout(timeout);
+	}
+	return 0;
+}
+
+static int __init wdog_init(void)
+{
+	struct file *file;
+
+	file = filp_open("/proc/interrupts", 0, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	intr_file = file;
+
+	wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog");
+	if (IS_ERR(wdog_thread_tsk)) {
+		filp_close(intr_file, NULL);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static void __exit wdog_exit(void)
+{
+	kthread_stop(wdog_thread_tsk);
+	filp_close(intr_file, NULL);
+}
+
+module_param(sleep_timeout, int, 0660);
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo WDOG");
+MODULE_LICENSE("GPL v2");
+
+module_init(wdog_init)
+module_exit(wdog_exit)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -652,6 +652,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 */
 	smp_wmb();
 	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+	/*
+	 * The following mb guarantees that previous clear of a PENDING bit
+	 * will not be reordered with any speculative LOADS or STORES from
+	 * work->current_func, which is executed afterwards.  This possible
+	 * reordering can lead to a missed execution on an attempt to queue
+	 * the same @work.  E.g. consider this case:
+	 *
+	 *   CPU#0                         CPU#1
+	 *   ----------------------------  --------------------------------
+	 *
+	 * 1  STORE event_indicated
+	 * 2  queue_work_on() {
+	 * 3    test_and_set_bit(PENDING)
+	 * 4 }                             set_..._and_clear_pending() {
+	 * 5                                 set_work_data() # clear bit
+	 * 6                                 smp_mb()
+	 * 7                               work->current_func() {
+	 * 8				      LOAD event_indicated
+	 *				   }
+	 *
+	 * Without an explicit full barrier the speculative LOAD on line 8
+	 * can be executed before CPU#0 does the STORE on line 1.  If that
+	 * happens, CPU#0 observes that the PENDING bit is still set and a
+	 * new execution of @work is not queued, in the hope that CPU#1 will
+	 * eventually finish the queued @work.  Meanwhile CPU#1 does not see
+	 * that event_indicated is set, because the speculative LOAD was
+	 * executed before the actual STORE.
+	 */
+	smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -3099,6 +3128,7 @@ int schedule_on_each_cpu(work_func_t func)
 	free_percpu(works);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(schedule_on_each_cpu);
 
 /**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -445,6 +445,14 @@ config MPILIB
 	  It is used to implement RSA digital signature verification,
 	  which is used by IMA/EVM digital signature extension.
 
+config MPILIB_EXTRA
+	bool
+	depends on MPILIB
+	help
+	  Additional sources of multiprecision maths library from GnuPG.
+	  This code is unnecessary for RSA digital signature verification,
+	  but can be compiled if needed.
+
 config SIGNATURE
 	tristate
 	depends on KEYS && CRYPTO
@@ -484,4 +492,8 @@ config ARCH_HAS_MMIO_FLUSH
 config PARMAN
 	tristate
 
+config STACKDEPOT
+	bool
+	select STACKTRACE
+
 endmenu
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -174,6 +174,27 @@ config DEBUG_KERNEL
 	  Say Y here if you are developing drivers or trying to debug and
 	  identify kernel problems.
 
+config ARCH_HAS_KCOV
+	bool
+	help
+	  KCOV does not have any arch-specific code, but currently it is enabled
+	  only for x86_64. KCOV requires testing on other archs, and most likely
+	  disabling of instrumentation for some early boot code.
+
+config KCOV
+	bool "Code coverage for fuzzing"
+	depends on ARCH_HAS_KCOV
+	select DEBUG_FS
+	help
+	  KCOV exposes kernel code coverage information in a form suitable
+	  for coverage-guided fuzzing (randomized testing).
+
+	  If RANDOMIZE_BASE is enabled, PC values will not be stable across
+	  different machines and across reboots. If you need stable PC values,
+	  disable RANDOMIZE_BASE.
+
+	  For more details, see Documentation/kcov.txt.
+
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
 	depends on DEBUG_KERNEL && GENERIC_HARDIRQS
@@ -1550,6 +1571,8 @@ source "lib/Kconfig.kgdb"
 
 source "lib/Kconfig.kmemcheck"
 
+source "lib/Kconfig.kasan"
+
 config TEST_STRING_HELPERS
 	tristate "Test functions located in the string_helpers module at runtime"
 
--- /dev/null
+++ b/lib/Kconfig.kasan
@@ -0,0 +1,61 @@
+config HAVE_ARCH_KASAN
+	bool
+
+if HAVE_ARCH_KASAN
+
+config KASAN
+	bool "KASan: runtime memory debugger"
+	depends on SLUB || (SLAB && !DEBUG_SLAB)
+	select CONSTRUCTORS
+	select STACKDEPOT
+	help
+	  Enables kernel address sanitizer - runtime memory debugger,
+	  designed to find out-of-bounds accesses and use-after-free bugs.
+	  This is strictly a debugging feature and it requires a gcc version
+	  of 4.9.2 or later. Detection of out of bounds accesses to stack or
+	  global variables requires gcc 5.0 or later.
+	  This feature consumes about 1/8 of available memory and causes
+	  a ~3x performance slowdown.
+
+	  For better error detection enable CONFIG_STACKTRACE.
+	  Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB
+	  (the resulting kernel does not boot).
+
+config KASAN_SHADOW_OFFSET
+	hex
+	default 0xdffffc0000000000 if X86_64
+
+choice
+	prompt "Instrumentation type"
+	depends on KASAN
+	default KASAN_OUTLINE
+
+config KASAN_OUTLINE
+	bool "Outline instrumentation"
+	help
+	  Before every memory access the compiler inserts a call to
+	  __asan_load*/__asan_store*.  These functions check the shadow
+	  memory.  This is slower than inline instrumentation, but it
+	  does not bloat the size of the kernel's .text section as much
+	  as inline instrumentation does.
+
+config KASAN_INLINE
+	bool "Inline instrumentation"
+	help
+	  The compiler directly inserts code that checks the shadow memory
+	  before memory accesses.  This is faster than outline
+	  instrumentation (in some workloads it gives about a 2x boost),
+	  but makes the kernel's .text section much bigger.
+	  This requires a gcc version of 5.0 or later.
+
+endchoice
+
+config TEST_KASAN
+	tristate "Module for testing kasan for bug detection"
+	depends on m && KASAN
+	help
+	  This is a test module doing various nasty things like
+	  out-of-bounds accesses and use-after-free errors. It is useful for
+	  testing kernel debugging features like the kernel address sanitizer.
+
+endif
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif
 
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
@@ -34,6 +46,9 @@ obj-y += kstrtox.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
 obj-$(CONFIG_TEST_PARMAN) += test_parman.o
+obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+
+obj-y += kmapset.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
@@ -151,7 +166,12 @@ obj-$(CONFIG_SG_POOL) += sg_pool.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
+obj-$(CONFIG_STACKDEPOT) += stackdepot.o
+KASAN_SANITIZE_stackdepot.o := n
+KCOV_INSTRUMENT_stackdepot.o := n
+
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o
+
 $(foreach file, $(libfdt_files), \
 	$(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt))
 lib-$(CONFIG_LIBFDT) += $(libfdt_files)
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -597,21 +597,31 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit,
 		if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0)
 			goto all_leaves_cluster_together;
 
-		/* Otherwise we can just insert a new node ahead of the old
-		 * one.
+		/* Otherwise all the old leaves cluster in the same slot, but
+		 * the new leaf wants to go into a different slot - so we
+		 * create a new node (n0) to hold the new leaf and a pointer to
+		 * a new node (n1) holding all the old leaves.
+		 *
+		 * This can be done by falling through to the node splitting
+		 * path.
 		 */
-		goto present_leaves_cluster_but_not_new_leaf;
+		pr_devel("present leaves cluster but not new leaf\n");
 	}
 
 split_node:
 	pr_devel("split node\n");
 
-	/* We need to split the current node; we know that the node doesn't
-	 * simply contain a full set of leaves that cluster together (it
-	 * contains meta pointers and/or non-clustering leaves).
+	/* We need to split the current node.  The node must contain anything
+	 * from a single leaf (in the one leaf case, this leaf will cluster
+	 * with the new leaf) and the rest meta-pointers, to all leaves, some
+	 * of which may cluster.
+	 *
+	 * It won't contain the case in which all the current leaves plus the
+	 * new leaves want to cluster in the same slot.
 	 *
 	 * We need to expel at least two leaves out of a set consisting of the
-	 * leaves in the node and the new leaf.
+	 * leaves in the node and the new leaf.  The current meta pointers can
+	 * just be copied as they shouldn't cluster with any of the leaves.
 	 *
 	 * We need a new node (n0) to replace the current one and a new node to
 	 * take the expelled nodes (n1).
@@ -716,33 +726,6 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit,
 	pr_devel("<--%s() = ok [split node]\n", __func__);
 	return true;
 
-present_leaves_cluster_but_not_new_leaf:
-	/* All the old leaves cluster in the same slot, but the new leaf wants
-	 * to go into a different slot, so we create a new node to hold the new
-	 * leaf and a pointer to a new node holding all the old leaves.
-	 */
-	pr_devel("present leaves cluster but not new leaf\n");
-
-	new_n0->back_pointer = node->back_pointer;
-	new_n0->parent_slot = node->parent_slot;
-	new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
-	new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
-	new_n1->parent_slot = edit->segment_cache[0];
-	new_n1->nr_leaves_on_branch = node->nr_leaves_on_branch;
-	edit->adjust_count_on = new_n0;
-
-	for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++)
-		new_n1->slots[i] = node->slots[i];
-
-	new_n0->slots[edit->segment_cache[0]] = assoc_array_node_to_ptr(new_n0);
-	edit->leaf_p = &new_n0->slots[edit->segment_cache[ASSOC_ARRAY_FAN_OUT]];
-
-	edit->set[0].ptr = &assoc_array_ptr_to_node(node->back_pointer)->slots[node->parent_slot];
-	edit->set[0].to = assoc_array_node_to_ptr(new_n0);
-	edit->excised_meta[0] = assoc_array_node_to_ptr(node);
-	pr_devel("<--%s() = ok [insert node before]\n", __func__);
-	return true;
-
 all_leaves_cluster_together:
 	/* All the leaves, new and old, want to cluster together in this node
 	 * in the same slot, so we have to replace this node with a shortcut to
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -42,6 +42,13 @@ int debug_locks_off(void)
 			console_verbose();
 			return 1;
 		}
+
+		/*
+		 * We want to taint the kernel so that tests can easily
+		 * detect that a lockdep-related problem was reported.
+		 */
+
+		add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
 	}
 	return 0;
 }
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -14,6 +14,7 @@
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
+#include <linux/kmemleak.h>
 
 #define ODEBUG_HASH_BITS	14
 #define ODEBUG_HASH_SIZE	(1 << ODEBUG_HASH_BITS)
@@ -106,6 +107,7 @@ static void fill_pool(void)
 		if (!new)
 			return;
 
+		kmemleak_ignore(new);
 		raw_spin_lock_irqsave(&pool_lock, flags);
 		hlist_add_head(&new->node, &obj_pool);
 		debug_objects_alloc++;
@@ -1047,6 +1049,7 @@ static int __init debug_objects_replace_static_objects(void)
 		obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL);
 		if (!obj)
 			goto free;
+		kmemleak_ignore(obj);
 		hlist_add_head(&obj->node, &objects);
 	}
 
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -250,7 +250,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa,
 			id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
 
 			/* if already at the top layer, we need to grow */
-			if (id >= 1 << (idp->layers * IDR_BITS)) {
+			if (id > idr_max(idp->layers)) {
 				*starting_id = id;
 				return -EAGAIN;
 			}
@@ -524,9 +524,7 @@ EXPORT_SYMBOL(idr_alloc_cyclic);
 
 static void idr_remove_warning(int id)
 {
-	printk(KERN_WARNING
-		"idr_remove called for id=%d which is not allocated.\n", id);
-	dump_stack();
+	WARN(1, "idr_remove called for id=%d which is not allocated.\n", id);
 }
 
 static void sub_remove(struct idr *idp, int shift, int id)
@@ -832,12 +830,10 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 	if (!p)
 		return ERR_PTR(-EINVAL);
 
-	n = (p->layer+1) * IDR_BITS;
-
-	if (id >= (1 << n))
+	if (id > idr_max(p->layer + 1))
 		return ERR_PTR(-EINVAL);
 
-	n -= IDR_BITS;
+	n = p->layer * IDR_BITS;
 	while ((n > 0) && p) {
 		p = p->ary[(id >> n) & IDR_MASK];
 		n -= IDR_BITS;
@@ -1077,8 +1073,7 @@ void ida_remove(struct ida *ida, int id)
 	return;
 
  err:
-	printk(KERN_WARNING
-	       "ida_remove called for id=%d which is not allocated.\n", id);
+	WARN(1, "ida_remove called for id=%d which is not allocated.\n", id);
 }
 EXPORT_SYMBOL(ida_remove);
 
--- /dev/null
+++ b/lib/kmapset.c
@@ -0,0 +1,339 @@
+/*
+ *  lib/kmapset.c
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kmapset.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+struct kmapset_map *kmapset_new(struct kmapset_set *set)
+{
+	struct kmapset_map *map;
+
+	map = kmalloc(sizeof(struct kmapset_map), GFP_KERNEL);
+	if (!map)
+		return NULL;
+	kmapset_init_map(map, set);
+	return map;
+}
+
+static void kmapset_free(struct kmapset_map *map)
+{
+	struct kmapset_link *link;
+	struct hlist_node *next;
+
+	hlist_for_each_entry_safe(link, next, &map->links, map_link)
+		kfree_rcu(link, rcu_head);
+	kfree_rcu(map, rcu_head);
+}
+
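+/*
+ * Total order over maps: compare by hash, then by size, then by the
+ * key/value pairs in list order, and finally by the default value.
+ */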
+static long kmapset_cmp(struct kmapset_map *map_a, struct kmapset_map *map_b)
+{
+	struct kmapset_link *link_a, *link_b;
+
+	if (map_a->hash != map_b->hash)
+		return map_a->hash - map_b->hash;
+
+	if (map_a->size != map_b->size)
+		return map_a->size - map_b->size;
+
+	link_a = hlist_entry(map_a->links.first,
+			struct kmapset_link, map_link);
+	link_b = hlist_entry(map_b->links.first,
+			struct kmapset_link, map_link);
+	while (&link_a->map_link) {
+		if (link_a->key != link_b->key)
+			return (long)link_a->key - (long)link_b->key;
+		if (link_a->value != link_b->value)
+			return link_a->value - link_b->value;
+		link_a = list_entry(link_a->map_link.next,
+				struct kmapset_link, map_link);
+		link_b = list_entry(link_b->map_link.next,
+				struct kmapset_link, map_link);
+	}
+
+	return map_a->default_value - map_b->default_value;
+}
+
+static inline bool kmapset_hashed(struct kmapset_map *map)
+{
+	return !RB_EMPTY_NODE(&map->node);
+}
+
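+/*
+ * Compute the map's hash and link it into the set's rb-tree.
+ * If an equal map is already hashed and @old is given, report it via
+ * @old and return true without inserting; otherwise return false.
+ */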
+static bool kmapset_hash(struct kmapset_map *map, struct kmapset_map **old)
+{
+	struct rb_node **p = &map->set->tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct kmapset_map *cur;
+	struct kmapset_link *link;
+	long diff;
+
+	map->hash = hash_long(map->default_value, BITS_PER_LONG);
+	hlist_for_each_entry(link, &map->links, map_link)
+		map->hash ^= hash_ptr(link->key, BITS_PER_LONG) *
+			     hash_long(link->value, BITS_PER_LONG);
+
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct kmapset_map, node);
+		diff = kmapset_cmp(map, cur);
+		if (diff < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+		if (!diff && old) {
+			*old = cur;
+			return true;
+		}
+	}
+	rb_link_node(&map->node, parent, p);
+	rb_insert_color(&map->node, &map->set->tree);
+	return false;
+}
+
+static void kmapset_unhash(struct kmapset_map *map)
+{
+	rb_erase(&map->node, &map->set->tree);
+	RB_CLEAR_NODE(&map->node);
+}
+
+static void kmapset_rehash(struct kmapset_map *map)
+{
+	if (kmapset_hashed(map)) {
+		kmapset_unhash(map);
+		kmapset_hash(map, NULL);
+	}
+}
+
+struct kmapset_map *kmapset_get(struct kmapset_map *map)
+{
+	if (map)
+		kref_get(&map->kref);
+	return map;
+}
+
+static void kmapset_release(struct kref *kref)
+{
+	struct kmapset_map *map = container_of(kref, struct kmapset_map, kref);
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *link;
+
+	if (kmapset_hashed(map))
+		kmapset_unhash(map);
+	hlist_for_each_entry(link, &map->links, map_link)
+		hlist_del(&link->key_link);
+	mutex_unlock(&set->mutex);
+
+	kmapset_free(map);
+}
+
+void kmapset_put(struct kmapset_map *map)
+{
+	if (map)
+		kref_put_mutex(&map->kref, kmapset_release, &map->set->mutex);
+}
+
+/*
+ * kmapset_commit - hash a new map into the set or look up an existing copy
+ *
+ * After committing, the map must stay immutable.
+ */
+struct kmapset_map *kmapset_commit(struct kmapset_map *map)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_map *ret = map;
+
+	kmapset_lock(set);
+	if (kmapset_hash(map, &ret)) {
+		kmapset_get(ret);
+		kmapset_release(&map->kref);
+	} else
+		kmapset_unlock(set);
+
+	return ret;
+}
+
+/*
+ * kmapset_copy - copy the contents of one map into another
+ */
+static int kmapset_copy(struct kmapset_map *dst, struct kmapset_map *src)
+{
+	struct kmapset_set *set = src->set;
+	struct kmapset_link *old_link, *new_link;
+	struct hlist_node *next;
+	int i;
+
+	for (i = src->size; i; i--) {
+		new_link = kmalloc(sizeof(struct kmapset_link), GFP_KERNEL);
+		if (!new_link)
+			return -ENOMEM;
+		hlist_add_head(&new_link->map_link, &dst->links);
+	}
+
+	kmapset_lock(set);
+	dst->default_value = src->default_value;
+	new_link = hlist_entry(dst->links.first, struct kmapset_link, map_link);
+	hlist_for_each_entry(old_link, &src->links, map_link) {
+		new_link->key = old_link->key;
+		new_link->value = old_link->value;
+		new_link->map = dst;
+		dst->size++;
+		hlist_add_head(&new_link->key_link, &new_link->key->links);
+		new_link = hlist_entry(new_link->map_link.next,
+				struct kmapset_link, map_link);
+	}
+	kmapset_unlock(set);
+
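+	/* Free preallocated links that were not consumed by the copy. */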
+	while (&new_link->map_link) {
+		next = new_link->map_link.next;
+		hlist_del(&new_link->map_link);
+		kfree(new_link);
+		new_link = hlist_entry(next, struct kmapset_link, map_link);
+	}
+
+	return 0;
+}
+
+struct kmapset_map *kmapset_dup(struct kmapset_map *map)
+{
+	struct kmapset_map *new;
+
+	new = kmapset_new(map->set);
+	if (!new)
+		return NULL;
+
+	if (kmapset_copy(new, map)) {
+		kmapset_free(new);
+		return NULL;
+	}
+
+	return new;
+}
+
+/*
+ * kmapset_lookup - look up the link object for a given key
+ *
+ * requires kmapset_lock or rcu_read_lock
+ */
+struct kmapset_link *
+kmapset_lookup(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_link *link;
+
+	hlist_for_each_entry_rcu(link, &map->links, map_link) {
+		if (link->key == key)
+			return link;
+		if (link->key > key)
+			break;
+	}
+	return NULL;
+}
+
+/*
+ * kmapset_get_value - retrieve the value for a key (or the default value)
+ */
+unsigned long
+kmapset_get_value(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_link *link;
+	unsigned long value;
+
+	rcu_read_lock();
+	link = kmapset_lookup(map, key);
+	value = link ? link->value : map->default_value;
+	rcu_read_unlock();
+	return value;
+}
+
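+/*
+ * kmapset_set_value - set or update the value for @key,
+ * keeping the map's links sorted by key pointer.
+ */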
+int kmapset_set_value(struct kmapset_map *map,
+		struct kmapset_key *key, unsigned long value)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *new_link, *old_link, *last_link = NULL;
+
+	new_link = kmalloc(sizeof(struct kmapset_link), GFP_KERNEL);
+	if (!new_link)
+		return -ENOMEM;
+
+	new_link->key = key;
+	new_link->value = value;
+	new_link->map = map;
+
+	kmapset_lock(set);
+	if (hlist_empty(&map->links)) {
+		hlist_add_head_rcu(&new_link->map_link, &map->links);
+	} else {
+		hlist_for_each_entry(old_link, &map->links, map_link) {
+			last_link = old_link;
+			if (old_link->key < key)
+				continue;
+			if (old_link->key == key) {
+				old_link->value = value;
+				kfree(new_link);
+				goto out;
+			}
+			hlist_add_before_rcu(&new_link->map_link,
+					     &old_link->map_link);
+			goto add;
+		}
+		hlist_add_behind_rcu(&new_link->map_link, &last_link->map_link);
+	}
+add:
+	hlist_add_head(&new_link->key_link, &new_link->key->links);
+	map->size++;
+out:
+	kmapset_unlock(set);
+
+	return 0;
+}
+
+bool kmapset_del_value(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *link;
+	bool ret = false;
+
+	kmapset_lock(set);
+	link = kmapset_lookup(map, key);
+	if (link) {
+		hlist_del_rcu(&link->map_link);
+		hlist_del(&link->key_link);
+		kfree_rcu(link, rcu_head);
+		ret = true;
+	}
+	kmapset_unlock(set);
+	return ret;
+}
+
+void kmapset_set_default(struct kmapset_map *map, unsigned long value)
+{
+	struct kmapset_set *set = map->set;
+
+	kmapset_lock(set);
+	map->default_value = value;
+	kmapset_unlock(set);
+}
+
+/*
+ * kmapset_unlink - unlink key from all maps in set
+ */
+void kmapset_unlink(struct kmapset_key *key, struct kmapset_set *set)
+{
+	struct kmapset_link *link;
+	struct kmapset_map *map;
+	struct hlist_node *next;
+
+	kmapset_lock(set);
+	hlist_for_each_entry_safe(link, next, &key->links, key_link) {
+		map = link->map;
+		hlist_del(&link->key_link);
+		hlist_del_rcu(&link->map_link);
+		map->size--;
+		kfree_rcu(link, rcu_head);
+		kmapset_rehash(map);
+	}
+	kmapset_unlock(set);
+}
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -652,6 +652,7 @@ struct kobject *kobject_create(void)
 	kobject_init(kobj, &dynamic_kobj_ktype);
 	return kobj;
 }
+EXPORT_SYMBOL(kobject_create);
 
 /**
  * kobject_create_and_add - create a struct kobject dynamically and register it with sysfs
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -27,6 +27,7 @@
 #include <net/sock.h>
 #include <net/net_namespace.h>
 
+#include <linux/ve.h>
 
 u64 uevent_seqnum;
 char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
@@ -128,7 +129,7 @@ static int kobj_usermode_filter(struct kobject *kobj)
  * Returns 0 if kobject_uevent_env() is completed with success or the
  * corresponding error when it fails.
  */
-int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
 		       char *envp_ext[])
 {
 	struct kobj_uevent_env *env;
@@ -246,7 +247,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 
 	mutex_lock(&uevent_sock_mutex);
 	/* we will send an event, so request a new sequence number */
-	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
+	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++ve_uevent_seqnum);
 	if (retval) {
 		mutex_unlock(&uevent_sock_mutex);
 		goto exit;
@@ -258,10 +259,15 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 		struct sock *uevent_sock = ue_sk->sk;
 		struct sk_buff *skb;
 		size_t len;
+		struct ve_struct *owner_ve;
 
 		if (!netlink_has_listeners(uevent_sock, 1))
 			continue;
 
+		owner_ve = sock_net(uevent_sock)->owner_ve;
+		if (!ve_is_super(owner_ve) && owner_ve != get_exec_env())
+			continue;
+
 		/* allocate message with the maximum possible size */
 		len = strlen(action_string) + strlen(devpath) + 2;
 		skb = alloc_skb(len + env->buflen, GFP_KERNEL);
@@ -319,6 +325,12 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 }
 EXPORT_SYMBOL_GPL(kobject_uevent_env);
 
+int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+			char *envp_ext[])
+{
+	return kobject_uevent_env_one(kobj, action, envp_ext);
+}
+
 /**
  * kobject_uevent - notify userspace by sending an uevent
  *
--- a/lib/mpi/Makefile
+++ b/lib/mpi/Makefile
@@ -20,3 +20,14 @@ mpi-y = \
 	mpih-mul.o			\
 	mpi-pow.o			\
 	mpiutil.o
+
+mpi-$(CONFIG_MPILIB_EXTRA) += \
+	mpi-add.o			\
+	mpi-div.o			\
+	mpi-cmp.o			\
+	mpi-gcd.o			\
+	mpi-inline.o			\
+	mpi-inv.o			\
+	mpi-mpow.o			\
+	mpi-mul.o			\
+	mpi-scan.o
--- /dev/null
+++ b/lib/mpi/generic_mpi-asm-defs.h
@@ -0,0 +1,4 @@
+/* This file defines some basic constants for the MPI machinery.  We
+ * need to define the types on a per-CPU basis, so it is done with
+ * this file here.  */
+#define BYTES_PER_MPI_LIMB  (SIZEOF_UNSIGNED_LONG)
--- /dev/null
+++ b/lib/mpi/mpi-add.c
@@ -0,0 +1,234 @@
+/* mpi-add.c  -  MPI functions
+ *	Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Add the unsigned integer V to the mpi-integer U and store the
+ * result in W. U and V may be the same.
+ */
+int mpi_add_ui(MPI w, const MPI u, unsigned long v)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize, wsize;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+	wsign = 0;
+
+	/* If not space for W (and possible carry), increase space.  */
+	wsize = usize + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+
+	/* These must be after realloc (U may be the same as W).  */
+	up = u->d;
+	wp = w->d;
+
+	if (!usize) {		/* simple */
+		wp[0] = v;
+		wsize = v ? 1 : 0;
+	} else if (!usign) {	/* mpi is not negative */
+		mpi_limb_t cy;
+		cy = mpihelp_add_1(wp, up, usize, v);
+		wp[usize] = cy;
+		wsize = usize + cy;
+	} else {		/* The signs are different.  Need exact comparison to determine
+				 * which operand to subtract from which.  */
+		if (usize == 1 && up[0] < v) {
+			wp[0] = v - up[0];
+			wsize = 1;
+		} else {
+			mpihelp_sub_1(wp, up, usize, v);
+			/* Size can decrease with at most one limb. */
+			wsize = usize - (wp[usize - 1] == 0);
+			wsign = 1;
+		}
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_add(MPI w, MPI u, MPI v)
+{
+	mpi_ptr_t wp, up, vp;
+	mpi_size_t usize, vsize, wsize;
+	int usign, vsign, wsign;
+
+	if (u->nlimbs < v->nlimbs) {	/* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		wsize = usize + 1;
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = v->d;
+		vp = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		wsize = usize + 1;
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = u->d;
+		vp = v->d;
+	}
+	wp = w->d;
+	wsign = 0;
+
+	if (!vsize) {		/* simple */
+		MPN_COPY(wp, up, usize);
+		wsize = usize;
+		wsign = usign;
+	} else if (usign != vsign) {	/* different sign */
+		/* This test is right since USIZE >= VSIZE */
+		if (usize != vsize) {
+			mpihelp_sub(wp, up, usize, vp, vsize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			wsign = usign;
+		} else if (mpihelp_cmp(up, vp, usize) < 0) {
+			mpihelp_sub_n(wp, vp, up, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (!usign)
+				wsign = 1;
+		} else {
+			mpihelp_sub_n(wp, up, vp, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (usign)
+				wsign = 1;
+		}
+	} else {		/* U and V have same sign. Add them. */
+		mpi_limb_t cy = mpihelp_add(wp, up, usize, vp, vsize);
+		wp[usize] = cy;
+		wsize = usize + cy;
+		if (usign)
+			wsign = 1;
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+/****************
+ * Subtract the unsigned integer V from the mpi-integer U and store the
+ * result in W.
+ */
+int mpi_sub_ui(MPI w, MPI u, unsigned long v)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize, wsize;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+	wsign = 0;
+
+	/* If not space for W (and possible carry), increase space.  */
+	wsize = usize + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+
+	/* These must be after realloc (U may be the same as W).  */
+	up = u->d;
+	wp = w->d;
+
+	if (!usize) {		/* simple */
+		wp[0] = v;
+		wsize = v ? 1 : 0;
+		wsign = 1;
+	} else if (usign) {	/* mpi and v are negative */
+		mpi_limb_t cy;
+		cy = mpihelp_add_1(wp, up, usize, v);
+		wp[usize] = cy;
+		wsize = usize + cy;
+	} else {		/* The signs are different.  Need exact comparison to determine
+				 * which operand to subtract from which.  */
+		if (usize == 1 && up[0] < v) {
+			wp[0] = v - up[0];
+			wsize = 1;
+			wsign = 1;
+		} else {
+			mpihelp_sub_1(wp, up, usize, v);
+			/* Size can decrease with at most one limb. */
+			wsize = usize - (wp[usize - 1] == 0);
+		}
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_sub(MPI w, MPI u, MPI v)
+{
+	int rc;
+
+	if (w == v) {
+		MPI vv;
+		if (mpi_copy(&vv, v) < 0)
+			return -ENOMEM;
+		vv->sign = !vv->sign;
+		rc = mpi_add(w, u, vv);
+		mpi_free(vv);
+	} else {
+		/* fixme: this is not thread-safe (we temporarily modify v) */
+		v->sign = !v->sign;
+		rc = mpi_add(w, u, v);
+		v->sign = !v->sign;
+	}
+	return rc;
+}
+
+int mpi_addm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_add(w, u, v) < 0 || mpi_fdiv_r(w, w, m) < 0)
+		return -ENOMEM;
+	return 0;
+}
+
+int mpi_subm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_sub(w, u, v) < 0 || mpi_fdiv_r(w, w, m) < 0)
+		return -ENOMEM;
+	return 0;
+}
--- a/lib/mpi/mpi-bit.c
+++ b/lib/mpi/mpi-bit.c
@@ -54,3 +54,165 @@ unsigned mpi_get_nbits(MPI a)
 	return n;
 }
 EXPORT_SYMBOL_GPL(mpi_get_nbits);
+
+/****************
+ * Test whether bit N is set.
+ */
+int mpi_test_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+	mpi_limb_t limb;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return 0;	/* too far left: this is a 0 */
+	limb = a->d[limbno];
+	return (limb & (A_LIMB_1 << bitno)) ? 1 : 0;
+}
+
+/****************
+ * Set bit N of A.
+ */
+int mpi_set_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs) {	/* resize */
+		if (a->alloced >= limbno)
+			if (mpi_resize(a, limbno + 1) < 0)
+				return -ENOMEM;
+		a->nlimbs = limbno + 1;
+	}
+	a->d[limbno] |= (A_LIMB_1 << bitno);
+	return 0;
+}
+
+/****************
+ * Set bit N of A and clear all bits above
+ */
+int mpi_set_highbit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs) {	/* resize */
+		if (a->alloced >= limbno)
+			if (mpi_resize(a, limbno + 1) < 0)
+				return -ENOMEM;
+		a->nlimbs = limbno + 1;
+	}
+	a->d[limbno] |= (A_LIMB_1 << bitno);
+	for (bitno++; bitno < BITS_PER_MPI_LIMB; bitno++)
+		a->d[limbno] &= ~(A_LIMB_1 << bitno);
+	a->nlimbs = limbno + 1;
+	return 0;
+}
+
+/****************
+ * clear bit N of A and all bits above
+ */
+void mpi_clear_highbit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return;		/* not allocated, so no need to clear bits :-) */
+
+	for (; bitno < BITS_PER_MPI_LIMB; bitno++)
+		a->d[limbno] &= ~(A_LIMB_1 << bitno);
+	a->nlimbs = limbno + 1;
+}
+
+/****************
+ * Clear bit N of A.
+ */
+void mpi_clear_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return;		/* don't need to clear this bit, it's too far to the left */
+	a->d[limbno] &= ~(A_LIMB_1 << bitno);
+}
+
+/****************
+ * Shift A by N bits to the right
+ * FIXME: should use alloc_limb if X and A are same.
+ */
+int mpi_rshift(MPI x, MPI a, unsigned n)
+{
+	mpi_ptr_t xp;
+	mpi_size_t xsize;
+
+	xsize = a->nlimbs;
+	x->sign = a->sign;
+	if (RESIZE_IF_NEEDED(x, (size_t) xsize) < 0)
+		return -ENOMEM;
+	xp = x->d;
+
+	if (xsize) {
+		mpihelp_rshift(xp, a->d, xsize, n);
+		MPN_NORMALIZE(xp, xsize);
+	}
+	x->nlimbs = xsize;
+	return 0;
+}
+
+/****************
+ * Shift A by COUNT limbs to the left
+ * This is used only within the MPI library
+ */
+int mpi_lshift_limbs(MPI a, unsigned int count)
+{
+	const int n = a->nlimbs;
+	mpi_ptr_t ap;
+	int i;
+
+	if (!count || !n)
+		return 0;
+
+	if (RESIZE_IF_NEEDED(a, n + count) < 0)
+		return -ENOMEM;
+
+	ap = a->d;
+	for (i = n - 1; i >= 0; i--)
+		ap[i + count] = ap[i];
+	for (i = 0; i < count; i++)
+		ap[i] = 0;
+	a->nlimbs += count;
+	return 0;
+}
+
+/****************
+ * Shift A by COUNT limbs to the right
+ * This is used only within the MPI library
+ */
+void mpi_rshift_limbs(MPI a, unsigned int count)
+{
+	mpi_ptr_t ap = a->d;
+	mpi_size_t n = a->nlimbs;
+	unsigned int i;
+
+	if (count >= n) {
+		a->nlimbs = 0;
+		return;
+	}
+
+	for (i = 0; i < n - count; i++)
+		ap[i] = ap[i + count];
+	ap[i] = 0;
+	a->nlimbs -= count;
+}
--- /dev/null
+++ b/lib/mpi/mpi-div.c
@@ -0,0 +1,339 @@
+/* mpi-div.c  -  MPI functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/string.h>
+#include "mpi-internal.h"
+#include "longlong.h"
+
+int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor)
+{
+	int rc = -ENOMEM;
+	int divisor_sign = divisor->sign;
+	MPI temp_divisor = NULL;
+
+	/* We need the original value of the divisor after the remainder has been
+	 * preliminarily calculated.  We have to copy it to temporary space if it's
+	 * the same variable as REM.  */
+	if (rem == divisor) {
+		if (mpi_copy(&temp_divisor, divisor) < 0)
+			goto nomem;
+		divisor = temp_divisor;
+	}
+
+	if (mpi_tdiv_qr(NULL, rem, dividend, divisor) < 0)
+		goto nomem;
+	if (((divisor_sign ? 1 : 0) ^ (dividend->sign ? 1 : 0)) && rem->nlimbs)
+		if (mpi_add(rem, rem, divisor) < 0)
+			goto nomem;
+
+	rc = 0;
+
+nomem:
+	if (temp_divisor)
+		mpi_free(temp_divisor);
+	return rc;
+}
+EXPORT_SYMBOL(mpi_fdiv_r);
+
+/****************
+ * Division rounding the quotient towards -infinity.
+ * The remainder gets the same sign as the denominator.
+ * rem is optional
+ */
+
+ulong mpi_fdiv_r_ui(MPI rem, MPI dividend, ulong divisor)
+{
+	mpi_limb_t rlimb;
+
+	rlimb = mpihelp_mod_1(dividend->d, dividend->nlimbs, divisor);
+	if (rlimb && dividend->sign)
+		rlimb = divisor - rlimb;
+
+	if (rem) {
+		rem->d[0] = rlimb;
+		rem->nlimbs = rlimb ? 1 : 0;
+	}
+	return rlimb;
+}
+
+int mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor)
+{
+	MPI tmp = mpi_alloc(mpi_get_nlimbs(quot));
+	if (!tmp)
+		return -ENOMEM;
+	mpi_fdiv_qr(quot, tmp, dividend, divisor);
+	mpi_free(tmp);
+	return 0;
+}
+
+int mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor)
+{
+	int divisor_sign = divisor->sign;
+	MPI temp_divisor = NULL;
+
+	if (quot == divisor || rem == divisor) {
+		if (mpi_copy(&temp_divisor, divisor) < 0)
+			return -ENOMEM;
+		divisor = temp_divisor;
+	}
+
+	if (mpi_tdiv_qr(quot, rem, dividend, divisor) < 0)
+		goto nomem;
+
+	if ((divisor_sign ^ dividend->sign) && rem->nlimbs) {
+		if (mpi_sub_ui(quot, quot, 1) < 0)
+			goto nomem;
+		if (mpi_add(rem, rem, divisor) < 0)
+			goto nomem;
+	}
+
+	if (temp_divisor)
+		mpi_free(temp_divisor);
+
+	return 0;
+
+nomem:
+	mpi_free(temp_divisor);
+	return -ENOMEM;
+}
+
+/* If den == quot, den needs temporary storage.
+ * If den == rem, den needs temporary storage.
+ * If num == quot, num needs temporary storage.
+ * If den has temporary storage, it can be normalized while being copied,
+ *   i.e no extra storage should be allocated.
+ */
+
+int mpi_tdiv_r(MPI rem, MPI num, MPI den)
+{
+	return mpi_tdiv_qr(NULL, rem, num, den);
+}
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den)
+{
+	int rc = -ENOMEM;
+	mpi_ptr_t np, dp;
+	mpi_ptr_t qp, rp;
+	mpi_size_t nsize = num->nlimbs;
+	mpi_size_t dsize = den->nlimbs;
+	mpi_size_t qsize, rsize;
+	mpi_size_t sign_remainder = num->sign;
+	mpi_size_t sign_quotient = num->sign ^ den->sign;
+	unsigned normalization_steps;
+	mpi_limb_t q_limb;
+	mpi_ptr_t marker[5];
+	int markidx = 0;
+
+	if (!dsize)
+		return -EINVAL;
+
+	memset(marker, 0, sizeof(marker));
+
+	/* Ensure space is enough for quotient and remainder.
+	 * We need space for an extra limb in the remainder, because it's
+	 * up-shifted (normalized) below.  */
+	rsize = nsize + 1;
+	if (mpi_resize(rem, rsize) < 0)
+		goto nomem;
+
+	qsize = rsize - dsize;	/* qsize cannot be bigger than this.  */
+	if (qsize <= 0) {
+		if (num != rem) {
+			rem->nlimbs = num->nlimbs;
+			rem->sign = num->sign;
+			MPN_COPY(rem->d, num->d, nsize);
+		}
+		if (quot) {
+			/* This needs to follow the assignment to rem, in case the
+			 * numerator and quotient are the same.  */
+			quot->nlimbs = 0;
+			quot->sign = 0;
+		}
+		return 0;
+	}
+
+	if (quot)
+		if (mpi_resize(quot, qsize) < 0)
+			goto nomem;
+
+	/* Read pointers here, when reallocation is finished.  */
+	np = num->d;
+	dp = den->d;
+	rp = rem->d;
+
+	/* Optimize division by a single-limb divisor.  */
+	if (dsize == 1) {
+		mpi_limb_t rlimb;
+		if (quot) {
+			qp = quot->d;
+			rlimb = mpihelp_divmod_1(qp, np, nsize, dp[0]);
+			qsize -= qp[qsize - 1] == 0;
+			quot->nlimbs = qsize;
+			quot->sign = sign_quotient;
+		} else
+			rlimb = mpihelp_mod_1(np, nsize, dp[0]);
+		rp[0] = rlimb;
+		rsize = rlimb != 0 ? 1 : 0;
+		rem->nlimbs = rsize;
+		rem->sign = sign_remainder;
+		return 0;
+	}
+
+	if (quot) {
+		qp = quot->d;
+		/* Make sure QP and NP point to different objects.  Otherwise the
+		 * numerator would be gradually overwritten by the quotient limbs.  */
+		if (qp == np) {	/* Copy NP object to temporary space.  */
+			np = marker[markidx++] = mpi_alloc_limb_space(nsize);
+			if (!np)
+				goto nomem;
+			MPN_COPY(np, qp, nsize);
+		}
+	} else			/* Put quotient at top of remainder. */
+		qp = rp + dsize;
+
+	normalization_steps = count_leading_zeros(dp[dsize - 1]);
+
+	/* Normalize the denominator, i.e. make its most significant bit set by
+	 * shifting it NORMALIZATION_STEPS bits to the left.  Also shift the
+	 * numerator the same number of steps (to keep the quotient the same!).
+	 */
+	if (normalization_steps) {
+		mpi_ptr_t tp;
+		mpi_limb_t nlimb;
+
+		/* Shift up the denominator setting the most significant bit of
+		 * the most significant word.  Use temporary storage not to clobber
+		 * the original contents of the denominator.  */
+		tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+		if (!tp)
+			goto nomem;
+		mpihelp_lshift(tp, dp, dsize, normalization_steps);
+		dp = tp;
+
+		/* Shift up the numerator, possibly introducing a new most
+		 * significant word.  Move the shifted numerator in the remainder
+		 * meanwhile.  */
+		nlimb = mpihelp_lshift(rp, np, nsize, normalization_steps);
+		if (nlimb) {
+			rp[nsize] = nlimb;
+			rsize = nsize + 1;
+		} else
+			rsize = nsize;
+	} else {
+		/* The denominator is already normalized, as required.  Copy it to
+		 * temporary space if it overlaps with the quotient or remainder.  */
+		if (dp == rp || (quot && (dp == qp))) {
+			mpi_ptr_t tp;
+
+			tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+			if (!tp)
+				goto nomem;
+			MPN_COPY(tp, dp, dsize);
+			dp = tp;
+		}
+
+		/* Move the numerator to the remainder.  */
+		if (rp != np)
+			MPN_COPY(rp, np, nsize);
+
+		rsize = nsize;
+	}
+
+	q_limb = mpihelp_divrem(qp, 0, rp, rsize, dp, dsize);
+
+	if (quot) {
+		qsize = rsize - dsize;
+		if (q_limb) {
+			qp[qsize] = q_limb;
+			qsize += 1;
+		}
+
+		quot->nlimbs = qsize;
+		quot->sign = sign_quotient;
+	}
+
+	rsize = dsize;
+	MPN_NORMALIZE(rp, rsize);
+
+	if (normalization_steps && rsize) {
+		mpihelp_rshift(rp, rp, rsize, normalization_steps);
+		rsize -= rp[rsize - 1] == 0 ? 1 : 0;
+	}
+
+	rem->nlimbs = rsize;
+	rem->sign = sign_remainder;
+
+	rc = 0;
+nomem:
+	while (markidx)
+		mpi_free_limb_space(marker[--markidx]);
+	return rc;
+}
+
+int mpi_tdiv_q_2exp(MPI w, MPI u, unsigned count)
+{
+	mpi_size_t usize, wsize;
+	mpi_size_t limb_cnt;
+
+	usize = u->nlimbs;
+	limb_cnt = count / BITS_PER_MPI_LIMB;
+	wsize = usize - limb_cnt;
+	if (limb_cnt >= usize)
+		w->nlimbs = 0;
+	else {
+		mpi_ptr_t wp;
+		mpi_ptr_t up;
+
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		wp = w->d;
+		up = u->d;
+
+		count %= BITS_PER_MPI_LIMB;
+		if (count) {
+			mpihelp_rshift(wp, up + limb_cnt, wsize, count);
+			wsize -= !wp[wsize - 1];
+		} else {
+			MPN_COPY_INCR(wp, up + limb_cnt, wsize);
+		}
+
+		w->nlimbs = wsize;
+	}
+	return 0;
+}
+
+/****************
+ * Check whether dividend is divisible by divisor
+ * (note: divisor must fit into a limb)
+ */
+int mpi_divisible_ui(MPI dividend, ulong divisor)
+{
+	return !mpihelp_mod_1(dividend->d, dividend->nlimbs, divisor);
+}
--- /dev/null
+++ b/lib/mpi/mpi-gcd.c
@@ -0,0 +1,59 @@
+/* mpi-gcd.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Find the greatest common divisor G of A and B.
+ * Return: true if the GCD is 1, false in all other cases
+ */
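+/*
+ * Worked example: for A = 12, B = 18 the loop below computes the remainders
+ * 12, 6 and 0, leaving G = 6, so the function returns false; for A = 7,
+ * B = 9 it ends with G = 1 and returns true.
+ */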
+int mpi_gcd(MPI g, const MPI xa, const MPI xb)
+{
+	MPI a = NULL, b = NULL;
+
+	if (mpi_copy(&a, xa) < 0)
+		goto nomem;
+
+	if (mpi_copy(&b, xb) < 0)
+		goto nomem;
+
+	/* TAOCP Vol II, 4.5.2, Algorithm A */
+	a->sign = 0;
+	b->sign = 0;
+	while (mpi_cmp_ui(b, 0)) {
+		if (mpi_fdiv_r(g, a, b) < 0)	/* g used as temporary variable */
+			goto nomem;
+		if (mpi_set(a, b) < 0)
+			goto nomem;
+		if (mpi_set(b, g) < 0)
+			goto nomem;
+	}
+	if (mpi_set(g, a) < 0)
+		goto nomem;
+
+	mpi_free(a);
+	mpi_free(b);
+	return !mpi_cmp_ui(g, 1);
+
+nomem:
+	mpi_free(a);
+	mpi_free(b);
+	return -ENOMEM;
+}
--- /dev/null
+++ b/lib/mpi/mpi-inline.c
@@ -0,0 +1,31 @@
+/* mpi-inline.c
+ * Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+/* put the inline functions as real functions into the lib */
+#define G10_MPI_INLINE_DECL
+
+#include "mpi-internal.h"
+
+/* Always include the header because it is only
+ * included by mpi-internal.h if __GCC__ is defined, but we
+ * need it here in all cases and the above definition of
+ * the macro allows us to do so.
+ */
+#include "mpi-inline.h"
--- /dev/null
+++ b/lib/mpi/mpi-inv.c
@@ -0,0 +1,188 @@
+/* mpi-inv.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Calculate the multiplicative inverse X of A mod N
+ * That is: Find the solution x for
+ *		1 = (a*x) mod n
+ */
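+/*
+ * Worked example: for a = 3 and n = 11 the routine yields x = 4, since
+ * 3 * 4 = 12 == 1 (mod 11).
+ */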
+int mpi_invm(MPI x, const MPI a, const MPI n)
+{
+	/* Extended Euclid's algorithm (see TAOCP Vol II, 4.5.2, Alg X),
+	 * modified according to Michael Penk's solution for Exercise 35,
+	 * with further enhancements */
+	MPI u = NULL, v = NULL;
+	MPI u1 = NULL, u2 = NULL, u3 = NULL;
+	MPI v1 = NULL, v2 = NULL, v3 = NULL;
+	MPI t1 = NULL, t2 = NULL, t3 = NULL;
+	unsigned k;
+	int sign;
+	int odd = 0;
+	int rc = -ENOMEM;
+
+	if (mpi_copy(&u, a) < 0)
+		goto cleanup;
+	if (mpi_copy(&v, n) < 0)
+		goto cleanup;
+
+	for (k = 0; !mpi_test_bit(u, 0) && !mpi_test_bit(v, 0); k++) {
+		if (mpi_rshift(u, u, 1) < 0)
+			goto cleanup;
+		if (mpi_rshift(v, v, 1) < 0)
+			goto cleanup;
+	}
+	odd = mpi_test_bit(v, 0);
+
+	u1 = mpi_alloc_set_ui(1);
+	if (!u1)
+		goto cleanup;
+	if (!odd) {
+		u2 = mpi_alloc_set_ui(0);
+		if (!u2)
+			goto cleanup;
+	}
+	if (mpi_copy(&u3, u) < 0)
+		goto cleanup;
+	if (mpi_copy(&v1, v) < 0)
+		goto cleanup;
+	if (!odd) {
+		v2 = mpi_alloc(mpi_get_nlimbs(u));
+		if (!v2)
+			goto cleanup;
+		if (mpi_sub(v2, u1, u) < 0)
+			goto cleanup;	/* U is used as const 1 */
+	}
+	if (mpi_copy(&v3, v) < 0)
+		goto cleanup;
+	if (mpi_test_bit(u, 0)) {	/* u is odd */
+		t1 = mpi_alloc_set_ui(0);
+		if (!t1)
+			goto cleanup;
+		if (!odd) {
+			t2 = mpi_alloc_set_ui(1);
+			if (!t2)
+				goto cleanup;
+			t2->sign = 1;
+		}
+		if (mpi_copy(&t3, v) < 0)
+			goto cleanup;
+		t3->sign = !t3->sign;
+		goto Y4;
+	} else {
+		t1 = mpi_alloc_set_ui(1);
+		if (!t1)
+			goto cleanup;
+		if (!odd) {
+			t2 = mpi_alloc_set_ui(0);
+			if (!t2)
+				goto cleanup;
+		}
+		if (mpi_copy(&t3, u) < 0)
+			goto cleanup;
+	}
+	do {
+		do {
+			if (!odd) {
+				if (mpi_test_bit(t1, 0) || mpi_test_bit(t2, 0)) {	/* one is odd */
+					if (mpi_add(t1, t1, v) < 0)
+						goto cleanup;
+					if (mpi_sub(t2, t2, u) < 0)
+						goto cleanup;
+				}
+				if (mpi_rshift(t1, t1, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t2, t2, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t3, t3, 1) < 0)
+					goto cleanup;
+			} else {
+				if (mpi_test_bit(t1, 0))
+					if (mpi_add(t1, t1, v) < 0)
+						goto cleanup;
+				if (mpi_rshift(t1, t1, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t3, t3, 1) < 0)
+					goto cleanup;
+			}
+Y4:
+			;
+		} while (!mpi_test_bit(t3, 0));	/* while t3 is even */
+
+		if (!t3->sign) {
+			if (mpi_set(u1, t1) < 0)
+				goto cleanup;
+			if (!odd)
+				if (mpi_set(u2, t2) < 0)
+					goto cleanup;
+			if (mpi_set(u3, t3) < 0)
+				goto cleanup;
+		} else {
+			if (mpi_sub(v1, v, t1) < 0)
+				goto cleanup;
+			sign = u->sign;
+			u->sign = !u->sign;
+			if (!odd)
+				if (mpi_sub(v2, u, t2) < 0)
+					goto cleanup;
+			u->sign = sign;
+			sign = t3->sign;
+			t3->sign = !t3->sign;
+			if (mpi_set(v3, t3) < 0)
+				goto cleanup;
+			t3->sign = sign;
+		}
+		if (mpi_sub(t1, u1, v1) < 0)
+			goto cleanup;
+		if (!odd)
+			if (mpi_sub(t2, u2, v2) < 0)
+				goto cleanup;
+		if (mpi_sub(t3, u3, v3) < 0)
+			goto cleanup;
+		if (t1->sign) {
+			if (mpi_add(t1, t1, v) < 0)
+				goto cleanup;
+			if (!odd)
+				if (mpi_sub(t2, t2, u) < 0)
+					goto cleanup;
+		}
+	} while (mpi_cmp_ui(t3, 0));	/* while t3 != 0 */
+	/* mpi_lshift( u3, k ); */
+	rc = mpi_set(x, u1);
+
+cleanup:
+	mpi_free(u1);
+	mpi_free(v1);
+	mpi_free(t1);
+	if (!odd) {
+		mpi_free(u2);
+		mpi_free(v2);
+		mpi_free(t2);
+	}
+	mpi_free(u3);
+	mpi_free(v3);
+	mpi_free(t3);
+
+	mpi_free(u);
+	mpi_free(v);
+	return rc;
+}
+EXPORT_SYMBOL(mpi_invm);
--- /dev/null
+++ b/lib/mpi/mpi-mpow.c
@@ -0,0 +1,135 @@
+/* mpi-mpow.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
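+/*
+ * build_index() gathers, for bit position (t - i), one bit from each of the
+ * k exponents and packs them into a k-bit index (exparray[0] supplies the
+ * least significant bit).  The index selects a precomputed product of the
+ * corresponding bases from the table G in mpi_mulpowm() below.
+ */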
+static int build_index(const MPI *exparray, int k, int i, int t)
+{
+	int j, bitno;
+	int index = 0;
+
+	bitno = t - i;
+	for (j = k - 1; j >= 0; j--) {
+		index <<= 1;
+		if (mpi_test_bit(exparray[j], bitno))
+			index |= 1;
+	}
+	return index;
+}
+
+/****************
+ * RES = ((BASE[0] ^ EXP[0]) * (BASE[1] ^ EXP[1]) * ...) mod M
+ */
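+/*
+ * For example, with k = 2 this computes (BASE[0]^EXP[0] * BASE[1]^EXP[1]) mod M
+ * in a single left-to-right square-and-multiply pass over the exponent bits,
+ * multiplying by the precomputed product G[idx] selected by the current bit
+ * of each exponent (Shamir's trick for simultaneous exponentiation).
+ */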
+int mpi_mulpowm(MPI res, MPI *basearray, MPI *exparray, MPI m)
+{
+	int rc = -ENOMEM;
+	int k;			/* number of elements */
+	int t;			/* bit size of largest exponent */
+	int i, j, idx;
+	MPI *G = NULL;		/* table with precomputed values of size 2^k */
+	MPI tmp = NULL;
+
+	for (k = 0; basearray[k]; k++)
+		;
+	if (!k) {
+		pr_emerg("mpi_mulpowm: assert(k) failed\n");
+		BUG();
+	}
+	for (t = 0, i = 0; (tmp = exparray[i]); i++) {
+		j = mpi_get_nbits(tmp);
+		if (j > t)
+			t = j;
+	}
+	if (i != k) {
+		pr_emerg("mpi_mulpowm: assert(i==k) failed\n");
+		BUG();
+	}
+	if (!t) {
+		pr_emerg("mpi_mulpowm: assert(t) failed\n");
+		BUG();
+	}
+	if (k >= 10) {
+		pr_emerg("mpi_mulpowm: assert(k<10) failed\n");
+		BUG();
+	}
+
+	G = kzalloc((1 << k) * sizeof *G, GFP_KERNEL);
+	if (!G)
+		goto err_out;
+
+	/* and calculate */
+	tmp = mpi_alloc(mpi_get_nlimbs(m) + 1);
+	if (!tmp)
+		goto nomem;
+	if (mpi_set_ui(res, 1) < 0)
+		goto nomem;
+	for (i = 1; i <= t; i++) {
+		if (mpi_mulm(tmp, res, res, m) < 0)
+			goto nomem;
+		idx = build_index(exparray, k, i, t);
+		if (!(idx >= 0 && idx < (1 << k))) {
+			pr_emerg("mpi_mulpowm: assert(idx >= 0 && idx < (1<<k)) failed\n");
+			BUG();
+		}
+		if (!G[idx]) {
+			if (!idx) {
+				G[0] = mpi_alloc_set_ui(1);
+				if (!G[0])
+					goto nomem;
+			} else {
+				for (j = 0; j < k; j++) {
+					if (idx & (1 << j)) {
+						if (!G[idx]) {
+							if (mpi_copy(&G[idx],
+								     basearray[j]) < 0)
+								goto nomem;
+						} else {
+							if (mpi_mulm(G[idx],
+								     G[idx],
+								     basearray[j],
+								     m) < 0)
+								goto nomem;
+						}
+					}
+				}
+				if (!G[idx]) {
+					G[idx] = mpi_alloc(0);
+					if (!G[idx])
+						goto nomem;
+				}
+			}
+		}
+		if (mpi_mulm(res, tmp, G[idx], m) < 0)
+			goto nomem;
+	}
+
+	rc = 0;
+nomem:
+	/* cleanup */
+	mpi_free(tmp);
+	for (i = 0; i < (1 << k); i++)
+		mpi_free(G[i]);
+	kfree(G);
+err_out:
+	return rc;
+}
+EXPORT_SYMBOL(mpi_mulpowm);
--- /dev/null
+++ b/lib/mpi/mpi-mul.c
@@ -0,0 +1,195 @@
+/* mpi-mul.c  -  MPI functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+int mpi_mul_ui(MPI prod, MPI mult, unsigned long small_mult)
+{
+	mpi_size_t size, prod_size;
+	mpi_ptr_t prod_ptr;
+	mpi_limb_t cy;
+	int sign;
+
+	size = mult->nlimbs;
+	sign = mult->sign;
+
+	if (!size || !small_mult) {
+		prod->nlimbs = 0;
+		prod->sign = 0;
+		return 0;
+	}
+
+	prod_size = size + 1;
+	if (prod->alloced < prod_size)
+		if (mpi_resize(prod, prod_size) < 0)
+			return -ENOMEM;
+	prod_ptr = prod->d;
+
+	cy = mpihelp_mul_1(prod_ptr, mult->d, size, (mpi_limb_t) small_mult);
+	if (cy)
+		prod_ptr[size++] = cy;
+	prod->nlimbs = size;
+	prod->sign = sign;
+	return 0;
+}
+
+int mpi_mul_2exp(MPI w, MPI u, unsigned long cnt)
+{
+	mpi_size_t usize, wsize, limb_cnt;
+	mpi_ptr_t wp;
+	mpi_limb_t wlimb;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+
+	if (!usize) {
+		w->nlimbs = 0;
+		w->sign = 0;
+		return 0;
+	}
+
+	limb_cnt = cnt / BITS_PER_MPI_LIMB;
+	wsize = usize + limb_cnt + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+	wp = w->d;
+	wsize = usize + limb_cnt;
+	wsign = usign;
+
+	cnt %= BITS_PER_MPI_LIMB;
+	if (cnt) {
+		wlimb = mpihelp_lshift(wp + limb_cnt, u->d, usize, cnt);
+		if (wlimb) {
+			wp[wsize] = wlimb;
+			wsize++;
+		}
+	} else {
+		MPN_COPY_DECR(wp + limb_cnt, u->d, usize);
+	}
+
+	/* Zero all whole limbs at the low end.  Do it here and not before
+	 * calling mpn_lshift, so as not to lose data when U == W.  */
+	MPN_ZERO(wp, limb_cnt);
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_mul(MPI w, MPI u, MPI v)
+{
+	int rc = -ENOMEM;
+	mpi_size_t usize, vsize, wsize;
+	mpi_ptr_t up, vp, wp;
+	mpi_limb_t cy;
+	int usign, vsign, sign_product;
+	int assign_wp = 0;
+	mpi_ptr_t tmp_limb = NULL;
+
+	if (u->nlimbs < v->nlimbs) {	/* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		up = v->d;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		vp = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		up = u->d;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		vp = v->d;
+	}
+	sign_product = usign ^ vsign;
+	wp = w->d;
+
+	/* Ensure W has space enough to store the result.  */
+	wsize = usize + vsize;
+	if (w->alloced < (size_t) wsize) {
+		if (wp == up || wp == vp) {
+			wp = mpi_alloc_limb_space(wsize);
+			if (!wp)
+				goto nomem;
+			assign_wp = 1;
+		} else {
+			if (mpi_resize(w, wsize) < 0)
+				goto nomem;
+			wp = w->d;
+		}
+	} else {		/* Make U and V not overlap with W.      */
+		if (wp == up) {
+			/* W and U are identical.  Allocate temporary space for U.      */
+			up = tmp_limb = mpi_alloc_limb_space(usize);
+			if (!up)
+				goto nomem;
+			/* Is V identical too?  Keep it identical with U.  */
+			if (wp == vp)
+				vp = up;
+			/* Copy to the temporary space.  */
+			MPN_COPY(up, wp, usize);
+		} else if (wp == vp) {
+			/* W and V are identical.  Allocate temporary space for V.      */
+			vp = tmp_limb = mpi_alloc_limb_space(vsize);
+			if (!vp)
+				goto nomem;
+			/* Copy to the temporary space.  */
+			MPN_COPY(vp, wp, vsize);
+		}
+	}
+
+	if (!vsize)
+		wsize = 0;
+	else {
+		if (mpihelp_mul(wp, up, usize, vp, vsize, &cy) < 0)
+			goto nomem;
+		wsize -= cy ? 0 : 1;
+	}
+
+	if (assign_wp)
+		mpi_assign_limb_space(w, wp, wsize);
+
+	w->nlimbs = wsize;
+	w->sign = sign_product;
+	rc = 0;
+nomem:
+	if (tmp_limb)
+		mpi_free_limb_space(tmp_limb);
+	return rc;
+}
+
+int mpi_mulm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_mul(w, u, v) < 0)
+		return -ENOMEM;
+	return mpi_fdiv_r(w, w, m);
+}
+EXPORT_SYMBOL(mpi_mulm);
--- /dev/null
+++ b/lib/mpi/mpi-scan.c
@@ -0,0 +1,136 @@
+/* mpi-scan.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+/****************
+ * Scan through an MPI and return it byte by byte.  A value of -1 is returned
+ * to indicate the end of the MPI.  Scanning is done from the LSB to the MSB;
+ * returned values are in the range 0 .. 255.
+ *
+ * FIXME: This code is VERY ugly!
+ */
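+/*
+ * Example: for an MPI holding the value 0x1234, mpi_getbyte(a, 0) returns
+ * 0x34 and mpi_getbyte(a, 1) returns 0x12; once idx runs past the last limb
+ * the function returns -1.
+ */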
+int mpi_getbyte(const MPI a, unsigned idx)
+{
+	int i, j;
+	unsigned n;
+	mpi_ptr_t ap;
+	mpi_limb_t limb;
+
+	ap = a->d;
+	for (n = 0, i = 0; i < a->nlimbs; i++) {
+		limb = ap[i];
+		for (j = 0; j < BYTES_PER_MPI_LIMB; j++, n++)
+			if (n == idx)
+				return (limb >> j * 8) & 0xff;
+	}
+	return -1;
+}
+
+/****************
+ * Put the byte value XC at position IDX into A; IDX counts from the LSB to the MSB
+ */
+void mpi_putbyte(MPI a, unsigned idx, int xc)
+{
+	int i, j;
+	unsigned n;
+	mpi_ptr_t ap;
+	mpi_limb_t limb, c;
+
+	c = xc & 0xff;
+	ap = a->d;
+	for (n = 0, i = 0; i < a->alloced; i++) {
+		limb = ap[i];
+		for (j = 0; j < BYTES_PER_MPI_LIMB; j++, n++)
+			if (n == idx) {
+#if BYTES_PER_MPI_LIMB == 4
+				if (j == 0)
+					limb = (limb & 0xffffff00) | c;
+				else if (j == 1)
+					limb = (limb & 0xffff00ff) | (c << 8);
+				else if (j == 2)
+					limb = (limb & 0xff00ffff) | (c << 16);
+				else
+					limb = (limb & 0x00ffffff) | (c << 24);
+#elif BYTES_PER_MPI_LIMB == 8
+				if (j == 0)
+					limb = (limb & 0xffffffffffffff00) | c;
+				else if (j == 1)
+					limb = (limb & 0xffffffffffff00ff) | (c << 8);
+				else if (j == 2)
+					limb = (limb & 0xffffffffff00ffff) | (c << 16);
+				else if (j == 3)
+					limb = (limb & 0xffffffff00ffffff) | (c << 24);
+				else if (j == 4)
+					limb = (limb & 0xffffff00ffffffff) | (c << 32);
+				else if (j == 5)
+					limb = (limb & 0xffff00ffffffffff) | (c << 40);
+				else if (j == 6)
+					limb = (limb & 0xff00ffffffffffff) | (c << 48);
+				else
+					limb = (limb & 0x00ffffffffffffff) | (c << 56);
+#else
+#error please enhance this function, its ugly - i know.
+#endif
+				if (a->nlimbs <= i)
+					a->nlimbs = i + 1;
+				ap[i] = limb;
+				return;
+			}
+	}
+	log_bug("index out of range\n");
+}
+
+/****************
+ * Count the number of zero bits at the low end of A
+ */
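+/* Example: for a value of 0x50 (binary 1010000) this returns 4. */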
+unsigned mpi_trailing_zeros(const MPI a)
+{
+	unsigned n, count = 0;
+
+	for (n = 0; n < a->nlimbs; n++) {
+		if (a->d[n]) {
+			unsigned nn;
+			mpi_limb_t alimb = a->d[n];
+
+			nn = count_trailing_zeros(alimb);
+			count += nn;
+			break;
+		}
+		count += BITS_PER_MPI_LIMB;
+	}
+	return count;
+
+}
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -128,6 +128,81 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread)
 }
 EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
 
+/****************
+ * Make an mpi from a character string.
+ */
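+/*
+ * Only hexadecimal strings are accepted, optionally with a leading minus
+ * sign, e.g. "0x1a2b" or "-0xFF"; strings in any other base are rejected
+ * with -EINVAL below.
+ */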
+int mpi_fromstr(MPI val, const char *str)
+{
+	int hexmode = 0, sign = 0, prepend_zero = 0, i, j, c, c1, c2;
+	unsigned nbits, nbytes, nlimbs;
+	mpi_limb_t a;
+
+	if (*str == '-') {
+		sign = 1;
+		str++;
+	}
+	if (*str == '0' && str[1] == 'x')
+		hexmode = 1;
+	else
+		return -EINVAL;	/* other bases are not yet supported */
+	str += 2;
+
+	nbits = strlen(str) * 4;
+	if (nbits % 8)
+		prepend_zero = 1;
+	nbytes = (nbits + 7) / 8;
+	nlimbs = (nbytes + BYTES_PER_MPI_LIMB - 1) / BYTES_PER_MPI_LIMB;
+	if (val->alloced < nlimbs)
+		if (!mpi_resize(val, nlimbs))
+			return -ENOMEM;
+	i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+	i %= BYTES_PER_MPI_LIMB;
+	j = val->nlimbs = nlimbs;
+	val->sign = sign;
+	for (; j > 0; j--) {
+		a = 0;
+		for (; i < BYTES_PER_MPI_LIMB; i++) {
+			if (prepend_zero) {
+				c1 = '0';
+				prepend_zero = 0;
+			} else
+				c1 = *str++;
+			assert(c1);
+			c2 = *str++;
+			assert(c2);
+			if (c1 >= '0' && c1 <= '9')
+				c = c1 - '0';
+			else if (c1 >= 'a' && c1 <= 'f')
+				c = c1 - 'a' + 10;
+			else if (c1 >= 'A' && c1 <= 'F')
+				c = c1 - 'A' + 10;
+			else {
+				mpi_clear(val);
+				return 1;
+			}
+			c <<= 4;
+			if (c2 >= '0' && c2 <= '9')
+				c |= c2 - '0';
+			else if (c2 >= 'a' && c2 <= 'f')
+				c |= c2 - 'a' + 10;
+			else if (c2 >= 'A' && c2 <= 'F')
+				c |= c2 - 'A' + 10;
+			else {
+				mpi_clear(val);
+				return 1;
+			}
+			a <<= 8;
+			a |= c;
+		}
+		i = 0;
+
+		val->d[j - 1] = a;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_fromstr);
+
 /**
  * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
  *
--- a/lib/mpi/mpih-div.c
+++ b/lib/mpi/mpih-div.c
@@ -37,6 +37,159 @@
 #define UDIV_TIME UMUL_TIME
 #endif
 
+/* FIXME: We should be using invert_limb (or invert_normalized_limb)
+ * here (not udiv_qrnnd).
+ */
+
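+/****************
+ * Return DIVIDEND modulo DIVISOR_LIMB without storing a quotient; for
+ * example, a dividend holding 1000 with divisor_limb == 7 yields 6.
+ */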
+mpi_limb_t
+mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+	      mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	int dummy;
+
+	/* Botch: Should this be handled at all?  Rely on callers?  */
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 *   Does it ever help to use udiv_qrnnd_preinv?
+	 *     && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+	    && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						  ((n1 << normalization_steps)
+						   | (n0 >>
+						      (BITS_PER_MPI_LIMB -
+						       normalization_steps))),
+						  divisor_limb,
+						  divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(dummy, r, r,
+					  n1 << normalization_steps,
+					  divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				i--;
+
+			for (; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						  n0, divisor_limb,
+						  divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB -
+					   normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(dummy, r, r,
+						   ((n1 << normalization_steps)
+						    | (n0 >>
+						       (BITS_PER_MPI_LIMB -
+							normalization_steps))),
+						   divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(dummy, r, r,
+					   n1 << normalization_steps,
+					   divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.  */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			i--;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(dummy, r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
+
 /* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
  * the NSIZE-DSIZE least significant quotient limbs at QP
  * and the DSIZE long remainder at NP.	If QEXTRA_LIMBS is
@@ -234,3 +387,159 @@ mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs,
 
 	return most_significant_q_limb;
 }
+
+/****************
+ * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ * Return the single-limb remainder.
+ * There are no constraints on the value of the divisor.
+ *
+ * QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+ */
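+/*
+ * For example, with divisor_limb == 10 this writes floor(N / 10) limb by limb
+ * to QUOT_PTR and returns N mod 10, where N is the value held in DIVIDEND_PTR.
+ */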
+
+mpi_limb_t
+mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+		 mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+		 mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	int dummy;
+
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 * Does it ever help to use udiv_qrnnd_preinv?
+	 * && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+	    && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i + 1], r, r,
+						  ((n1 << normalization_steps)
+						   | (n0 >>
+						      (BITS_PER_MPI_LIMB -
+						       normalization_steps))),
+						  divisor_limb,
+						  divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(quot_ptr[0], r, r,
+					  n1 << normalization_steps,
+					  divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				quot_ptr[i--] = 0;
+
+			for (; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i], r, r,
+						  n0, divisor_limb,
+						  divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB -
+					   normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(quot_ptr[i + 1], r, r,
+						   ((n1 << normalization_steps)
+						    | (n0 >>
+						       (BITS_PER_MPI_LIMB -
+							normalization_steps))),
+						   divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(quot_ptr[0], r, r,
+					   n1 << normalization_steps,
+					   divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.  */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			quot_ptr[i--] = 0;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(quot_ptr[i], r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
--- a/lib/mpi/mpih-mul.c
+++ b/lib/mpi/mpih-mul.c
@@ -330,6 +330,36 @@ mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace)
 	}
 }
 
+/* This should be made into an inline function in gmp.h.  */
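+/*
+ * When both operands are the same limb array the cheaper squaring path is
+ * taken; either path uses the quadratic base case below KARATSUBA_THRESHOLD
+ * limbs and otherwise allocates 2 * size limbs of scratch space for the
+ * recursive (Karatsuba) multiplication.
+ */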
+int mpihelp_mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
+{
+	if (up == vp) {
+		if (size < KARATSUBA_THRESHOLD)
+			mpih_sqr_n_basecase(prodp, up, size);
+		else {
+			mpi_ptr_t tspace;
+			tspace = mpi_alloc_limb_space(2 * size);
+			if (!tspace)
+				return -ENOMEM;
+			mpih_sqr_n(prodp, up, size, tspace);
+			mpi_free_limb_space(tspace);
+		}
+	} else {
+		if (size < KARATSUBA_THRESHOLD)
+			mul_n_basecase(prodp, up, vp, size);
+		else {
+			mpi_ptr_t tspace;
+			tspace = mpi_alloc_limb_space(2 * size);
+			if (!tspace)
+				return -ENOMEM;
+			mul_n(prodp, up, vp, size, tspace);
+			mpi_free_limb_space(tspace);
+		}
+	}
+
+	return 0;
+}
+
 int
 mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
 			   mpi_ptr_t up, mpi_size_t usize,
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -106,6 +106,13 @@ int mpi_resize(MPI a, unsigned nlimbs)
 	return 0;
 }
 
+void mpi_clear(MPI a)
+{
+	a->nlimbs = 0;
+	a->nbits = 0;
+	a->flags = 0;
+}
+
 void mpi_free(MPI a)
 {
 	if (!a)
@@ -122,5 +129,86 @@ void mpi_free(MPI a)
 }
 EXPORT_SYMBOL_GPL(mpi_free);
 
+/****************
+ * Note: This copy function should not interpret the MPI
+ *	 but copy it transparently.
+ */
+int mpi_copy(MPI *copied, const MPI a)
+{
+	size_t i;
+	MPI b;
+
+	*copied = NULL;
+
+	if (a) {
+		b = mpi_alloc(a->nlimbs);
+		if (!b)
+			return -ENOMEM;
+
+		b->nlimbs = a->nlimbs;
+		b->sign = a->sign;
+		b->flags = a->flags;
+		b->nbits = a->nbits;
+
+		for (i = 0; i < b->nlimbs; i++)
+			b->d[i] = a->d[i];
+
+		*copied = b;
+	}
+
+	return 0;
+}
+
+int mpi_set(MPI w, const MPI u)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize = u->nlimbs;
+	int usign = u->sign;
+
+	if (RESIZE_IF_NEEDED(w, (size_t) usize) < 0)
+		return -ENOMEM;
+
+	wp = w->d;
+	up = u->d;
+	MPN_COPY(wp, up, usize);
+	w->nlimbs = usize;
+	w->nbits = u->nbits;
+	w->flags = u->flags;
+	w->sign = usign;
+	return 0;
+}
+
+int mpi_set_ui(MPI w, unsigned long u)
+{
+	if (RESIZE_IF_NEEDED(w, 1) < 0)
+		return -ENOMEM;
+	w->d[0] = u;
+	w->nlimbs = u ? 1 : 0;
+	w->sign = 0;
+	w->nbits = 0;
+	w->flags = 0;
+	return 0;
+}
+
+MPI mpi_alloc_set_ui(unsigned long u)
+{
+	MPI w = mpi_alloc(1);
+	if (!w)
+		return w;
+	w->d[0] = u;
+	w->nlimbs = u ? 1 : 0;
+	w->sign = 0;
+	return w;
+}
+
+void mpi_swap(MPI a, MPI b)
+{
+	struct gcry_mpi tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
 MODULE_DESCRIPTION("Multiprecision maths library");
 MODULE_LICENSE("GPL");
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -201,8 +201,9 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
 	}
 
 	if (unlikely(rem > 0))
-		pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
-				    rem, current->comm);
+		ve_pr_warn_ratelimited(VE_LOG,
+			"netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
+			rem, current->comm);
 
 	err = 0;
 errout:
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -34,7 +34,6 @@
 #include <linux/rcupdate.h>
 #include <linux/hardirq.h>		/* in_interrupt() */
 
-
 /*
  * The height_to_maxindex array needs to be one deeper than the maximum
  * path as height 0 holds only 1 entry.
@@ -111,9 +110,15 @@ static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag
 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
 }
 
+static inline void root_tag_move_all_to_prev(struct radix_tree_root *root)
+{
+	root->gfp_mask = (root->gfp_mask & __GFP_BITS_MASK) |
+		(root->gfp_mask & RADIX_ROOT_TAG_MASK) << RADIX_TREE_MAX_TAGS;
+}
+
 static inline void root_tag_clear_all(struct radix_tree_root *root)
 {
-	root->gfp_mask &= __GFP_BITS_MASK;
+	root->gfp_mask &= (__force gfp_t)~RADIX_ROOT_TAG_MASK;
 }
 
 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
@@ -121,6 +126,27 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
 }
 
+static inline void prev_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= ~(1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK | RADIX_ROOT_TAG_MASK;
+}
+
+static inline int prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	BUILD_BUG_ON(__GFP_BITS_SHIFT + RADIX_TREE_MAX_TAGS * 2 > 32);
+	return root->gfp_mask & (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
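+
+/*
+ * The prev_tag_* helpers above keep a second set of tag bits, directly above
+ * the ordinary root tag bits in root->gfp_mask.  They record whether the tag
+ * was set before the most recent change; radix_tree_prev_tag_get() further
+ * below reads them back.
+ */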
+
 /*
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
@@ -583,6 +609,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
 
 	height = root->height;
 	BUG_ON(index > radix_tree_maxindex(height));
@@ -590,11 +618,15 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 	slot = indirect_to_ptr(root->rnode);
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
+	if (!height)
+		prev = root_tag_get(root, tag);
+
 	while (height > 0) {
 		int offset;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
+		prev = tag_get(slot, tag, offset);
+		if (!prev)
 			tag_set(slot, tag, offset);
 		slot = slot->slots[offset];
 		BUG_ON(slot == NULL);
@@ -602,6 +634,13 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 		height--;
 	}
 
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	/* set the root's tag bit */
 	if (slot && !root_tag_get(root, tag))
 		root_tag_set(root, tag);
@@ -631,6 +670,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
 	int uninitialized_var(offset);
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -639,6 +680,13 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 	shift = height * RADIX_TREE_MAP_SHIFT;
 	slot = indirect_to_ptr(root->rnode);
 
+	if (!height) {
+		prev = root_tag_get(root, tag);
+		if (prev)
+			root_tag_clear(root, tag);
+		goto out;
+	}
+
 	while (shift) {
 		if (slot == NULL)
 			goto out;
@@ -653,7 +701,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		goto out;
 
 	while (node) {
-		if (!tag_get(node, tag, offset))
+		prev = tag_get(node, tag, offset);
+		if (!prev)
 			goto out;
 		tag_clear(node, tag, offset);
 		if (any_tag_set(node, tag))
@@ -669,10 +718,27 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		root_tag_clear(root, tag);
 
 out:
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
+void __radix_tree_root_tag_move_all_to_prev(struct radix_tree_root *root)
+{
+	root_tag_move_all_to_prev(root);
+}
+
+void __radix_tree_prev_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	prev_tag_clear(root, tag);
+}
+
 /**
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:		radix tree root
@@ -1369,18 +1435,22 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	void **slot;
 	void *entry;
 	int tag;
+	int right_prev[RADIX_TREE_MAX_TAGS] = {0,};
 
 	entry = __radix_tree_lookup(root, index, &node, &slot);
 	if (!entry)
-		return NULL;
+		goto out_none;
 
 	if (item && entry != item)
-		return NULL;
+		goto out_none;
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		right_prev[tag] = radix_tree_tag_get(root, index, tag);
 
 	if (!node) {
-		root_tag_clear_all(root);
+		root_tag_move_all_to_prev(root);
 		root->rnode = NULL;
-		return entry;
+		goto out;
 	}
 
 	offset = index & RADIX_TREE_MAP_MASK;
@@ -1392,6 +1462,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		if (tag_get(node, tag, offset))
 			radix_tree_tag_clear(root, index, tag);
+		else
+			prev_tag_clear(root, tag);
 	}
 
 	node->slots[offset] = NULL;
@@ -1399,7 +1471,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 
 	__radix_tree_delete_node(root, node);
 
+out:
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		BUG_ON(!right_prev[tag] != !prev_tag_get(root, tag));
 	return entry;
+out_none:
+	prev_tag_clear_all(root);
+	goto out;
 }
 EXPORT_SYMBOL(radix_tree_delete_item);
 
@@ -1429,6 +1507,19 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
+/**
+ *	radix_tree_prev_tag_get - get the previous tag status of the last changed item
+ *			The call is only valid right after radix_tree_tag_set/clear
+ *			for the changed tag, and after radix_tree_delete for all tags.
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return prev_tag_get(root, tag);
+}
+EXPORT_SYMBOL(radix_tree_prev_tag_get);
+
 static void
 radix_tree_node_ctor(void *arg)
 {
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -44,6 +44,30 @@
  *  parentheses and have some accompanying text comment.
  */
 
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- it does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
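+
+/*
+ * A minimal sketch of the lookup shape the note above has in mind, assuming
+ * a caller-provided cmp() on the key (illustration only, not used here):
+ *
+ *	struct rb_node *node = READ_ONCE(root->rb_node);
+ *
+ *	while (node) {
+ *		int c = cmp(key, node);
+ *
+ *		if (c < 0)
+ *			node = READ_ONCE(node->rb_left);
+ *		else if (c > 0)
+ *			node = READ_ONCE(node->rb_right);
+ *		else
+ *			return node;
+ *	}
+ *	return NULL;
+ */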
+
 static inline void rb_set_black(struct rb_node *rb)
 {
 	rb->__rb_parent_color |= RB_BLACK;
@@ -129,8 +153,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 				 * This still leaves us in violation of 4), the
 				 * continuation into Case 3 will fix that.
 				 */
-				parent->rb_right = tmp = node->rb_left;
-				node->rb_left = parent;
+				tmp = node->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp);
+				WRITE_ONCE(node->rb_left, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -149,8 +174,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			 *     /                 \
 			 *    n                   U
 			 */
-			gparent->rb_left = tmp;  /* == parent->rb_right */
-			parent->rb_right = gparent;
+			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+			WRITE_ONCE(parent->rb_right, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -171,8 +196,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			tmp = parent->rb_left;
 			if (node == tmp) {
 				/* Case 2 - right rotate at parent */
-				parent->rb_left = tmp = node->rb_right;
-				node->rb_right = parent;
+				tmp = node->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp);
+				WRITE_ONCE(node->rb_right, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -183,8 +209,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			}
 
 			/* Case 3 - left rotate at gparent */
-			gparent->rb_right = tmp;  /* == parent->rb_left */
-			parent->rb_left = gparent;
+			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+			WRITE_ONCE(parent->rb_left, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -224,8 +250,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *      / \         / \
 				 *     Sl  Sr      N   Sl
 				 */
-				parent->rb_right = tmp1 = sibling->rb_left;
-				sibling->rb_left = parent;
+				tmp1 = sibling->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp1);
+				WRITE_ONCE(sibling->rb_left, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -275,9 +302,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *                       \
 				 *                        Sr
 				 */
-				sibling->rb_left = tmp1 = tmp2->rb_right;
-				tmp2->rb_right = sibling;
-				parent->rb_right = tmp2;
+				tmp1 = tmp2->rb_right;
+				WRITE_ONCE(sibling->rb_left, tmp1);
+				WRITE_ONCE(tmp2->rb_right, sibling);
+				WRITE_ONCE(parent->rb_right, tmp2);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -297,8 +325,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			 *        / \         / \
 			 *      (sl) sr      N  (sl)
 			 */
-			parent->rb_right = tmp2 = sibling->rb_left;
-			sibling->rb_left = parent;
+			tmp2 = sibling->rb_left;
+			WRITE_ONCE(parent->rb_right, tmp2);
+			WRITE_ONCE(sibling->rb_left, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);
@@ -310,8 +339,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			sibling = parent->rb_left;
 			if (rb_is_red(sibling)) {
 				/* Case 1 - right rotate at parent */
-				parent->rb_left = tmp1 = sibling->rb_right;
-				sibling->rb_right = parent;
+				tmp1 = sibling->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp1);
+				WRITE_ONCE(sibling->rb_right, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -336,9 +366,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 					break;
 				}
 				/* Case 3 - right rotate at sibling */
-				sibling->rb_right = tmp1 = tmp2->rb_left;
-				tmp2->rb_left = sibling;
-				parent->rb_left = tmp2;
+				tmp1 = tmp2->rb_left;
+				WRITE_ONCE(sibling->rb_right, tmp1);
+				WRITE_ONCE(tmp2->rb_left, sibling);
+				WRITE_ONCE(parent->rb_left, tmp2);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -347,8 +378,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				sibling = tmp2;
 			}
 			/* Case 4 - left rotate at parent + color flips */
-			parent->rb_left = tmp2 = sibling->rb_right;
-			sibling->rb_right = parent;
+			tmp2 = sibling->rb_right;
+			WRITE_ONCE(parent->rb_left, tmp2);
+			WRITE_ONCE(sibling->rb_right, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -198,3 +198,4 @@ void sha_init(__u32 *buf)
 	buf[3] = 0x10325476;
 	buf[4] = 0xc3d2e1f0;
 }
+EXPORT_SYMBOL(sha_init);
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/quicklist.h>
+#include <linux/module.h>
 
 void show_mem(unsigned int filter)
 {
@@ -47,3 +48,4 @@ void show_mem(unsigned int filter)
 		quicklist_total_size());
 #endif
 }
+EXPORT_SYMBOL(show_mem);
--- /dev/null
+++ b/lib/stackdepot.c
@@ -0,0 +1,285 @@
+/*
+ * Generic stack depot for storing stack traces.
+ *
+ * Some debugging tools need to save stack traces of certain events which can
+ * be presented to the user later. For example, KASAN needs to save alloc and
+ * free stacks for each object, but storing two stack traces per object
+ * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for
+ * that).
+ *
+ * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc
+ * and free stacks repeat a lot, we save about 100x space.
+ * Stacks are never removed from the depot, so we store them one after
+ * another in contiguous memory allocations.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/stackdepot.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
+
+#define STACK_ALLOC_NULL_PROTECTION_BITS 1
+#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */
+#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER))
+#define STACK_ALLOC_ALIGN 4
+#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \
+					STACK_ALLOC_ALIGN)
+#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \
+		STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS)
+#define STACK_ALLOC_SLABS_CAP 8192
+#define STACK_ALLOC_MAX_SLABS \
+	(((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
+	 (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
+
+/* The compact structure to store the reference to stacks. */
+union handle_parts {
+	depot_stack_handle_t handle;
+	struct {
+		u32 slabindex : STACK_ALLOC_INDEX_BITS;
+		u32 offset : STACK_ALLOC_OFFSET_BITS;
+		u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS;
+	};
+};
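+
+/*
+ * A handle thus encodes the slab a stack lives in plus its 16-byte-aligned
+ * offset within that slab; the single 'valid' bit keeps any real handle from
+ * being all zeroes, so a zero handle can safely mean "no stack".
+ */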
+
+struct stack_record {
+	struct stack_record *next;	/* Link in the hashtable */
+	u32 hash;			/* Hash in the hashtable */
+	u32 size;			/* Number of frames in the stack */
+	union handle_parts handle;
+	unsigned long entries[1];	/* Variable-sized array of entries. */
+};
+
+static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
+
+static int depot_index;
+static int next_slab_inited;
+static size_t depot_offset;
+static DEFINE_SPINLOCK(depot_lock);
+
+static bool init_stack_slab(void **prealloc)
+{
+	if (!*prealloc)
+		return false;
+	/*
+	 * This smp_load_acquire() pairs with smp_store_release() to
+	 * |next_slab_inited| below and in depot_alloc_stack().
+	 */
+	if (smp_load_acquire(&next_slab_inited))
+		return true;
+	if (stack_slabs[depot_index] == NULL) {
+		stack_slabs[depot_index] = *prealloc;
+	} else {
+		stack_slabs[depot_index + 1] = *prealloc;
+		/*
+		 * This smp_store_release pairs with smp_load_acquire() from
+		 * |next_slab_inited| above and in depot_save_stack().
+		 */
+		smp_store_release(&next_slab_inited, 1);
+	}
+	*prealloc = NULL;
+	return true;
+}
+
+/* Allocation of a new stack in raw storage */
+static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
+		u32 hash, void **prealloc, gfp_t alloc_flags)
+{
+	int required_size = offsetof(struct stack_record, entries) +
+		sizeof(unsigned long) * size;
+	struct stack_record *stack;
+
+	required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN);
+
+	if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) {
+		if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) {
+			WARN_ONCE(1, "Stack depot reached limit capacity");
+			return NULL;
+		}
+		depot_index++;
+		depot_offset = 0;
+		/*
+		 * smp_store_release() here pairs with smp_load_acquire() from
+		 * |next_slab_inited| in depot_save_stack() and
+		 * init_stack_slab().
+		 */
+		if (depot_index + 1 < STACK_ALLOC_MAX_SLABS)
+			smp_store_release(&next_slab_inited, 0);
+	}
+	init_stack_slab(prealloc);
+	if (stack_slabs[depot_index] == NULL)
+		return NULL;
+
+	stack = stack_slabs[depot_index] + depot_offset;
+
+	stack->hash = hash;
+	stack->size = size;
+	stack->handle.slabindex = depot_index;
+	stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN;
+	stack->handle.valid = 1;
+	memcpy(stack->entries, entries, size * sizeof(unsigned long));
+	depot_offset += required_size;
+
+	return stack;
+}
+
+#define STACK_HASH_ORDER 20
+#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER)
+#define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
+#define STACK_HASH_SEED 0x9747b28c
+
+static struct stack_record *stack_table[STACK_HASH_SIZE] = {
+	[0 ...	STACK_HASH_SIZE - 1] = NULL
+};
+
+/* Calculate hash for a stack */
+static inline u32 hash_stack(unsigned long *entries, unsigned int size)
+{
+	return jhash2((u32 *)entries,
+			       size * sizeof(unsigned long) / sizeof(u32),
+			       STACK_HASH_SEED);
+}
+
+/* Find a stack that is equal to the one stored in entries in the hash */
+static inline struct stack_record *find_stack(struct stack_record *bucket,
+					     unsigned long *entries, int size,
+					     u32 hash)
+{
+	struct stack_record *found;
+
+	for (found = bucket; found; found = found->next) {
+		if (found->hash == hash &&
+		    found->size == size &&
+		    !memcmp(entries, found->entries,
+			    size * sizeof(unsigned long))) {
+			return found;
+		}
+	}
+	return NULL;
+}
+
+void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
+{
+	union handle_parts parts = { .handle = handle };
+	void *slab = stack_slabs[parts.slabindex];
+	size_t offset = parts.offset << STACK_ALLOC_ALIGN;
+	struct stack_record *stack = slab + offset;
+
+	trace->nr_entries = trace->max_entries = stack->size;
+	trace->entries = stack->entries;
+	trace->skip = 0;
+}
+
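+/*
+ * A minimal usage sketch (illustration only): a caller such as KASAN captures
+ * the current stack into a struct stack_trace (typically via
+ * save_stack_trace()), stores the depot_stack_handle_t returned by
+ * depot_save_stack() in its per-object metadata, and later recovers the
+ * entries with depot_fetch_stack().
+ */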
+/**
+ * depot_save_stack - save stack in a stack depot.
+ * @trace - the stacktrace to save.
+ * @alloc_flags - flags for allocating additional memory if required.
+ *
+ * Returns the handle of the stack struct stored in depot.
+ */
+depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
+				    gfp_t alloc_flags)
+{
+	u32 hash;
+	depot_stack_handle_t retval = 0;
+	struct stack_record *found = NULL, **bucket;
+	unsigned long flags;
+	struct page *page = NULL;
+	void *prealloc = NULL;
+
+	if (unlikely(trace->nr_entries == 0))
+		goto fast_exit;
+
+	hash = hash_stack(trace->entries, trace->nr_entries);
+	bucket = &stack_table[hash & STACK_HASH_MASK];
+
+	/*
+	 * Fast path: look the stack trace up without locking.
+	 * The smp_load_acquire() here pairs with smp_store_release() to
+	 * |bucket| below.
+	 */
+	found = find_stack(smp_load_acquire(bucket), trace->entries,
+			   trace->nr_entries, hash);
+	if (found)
+		goto exit;
+
+	/*
+	 * Check if the current or the next stack slab need to be initialized.
+	 * If so, allocate the memory - we won't be able to do that under the
+	 * lock.
+	 *
+	 * The smp_load_acquire() here pairs with smp_store_release() to
+	 * |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
+	 */
+	if (unlikely(!smp_load_acquire(&next_slab_inited))) {
+		/*
+		 * Zero out zone modifiers, as we don't have specific zone
+		 * requirements. Keep the flags related to allocation in atomic
+		 * contexts and I/O.
+		 */
+		alloc_flags &= ~GFP_ZONEMASK;
+		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
+		alloc_flags |= __GFP_NOWARN;
+		page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
+		if (page)
+			prealloc = page_address(page);
+	}
+
+	spin_lock_irqsave(&depot_lock, flags);
+
+	found = find_stack(*bucket, trace->entries, trace->nr_entries, hash);
+	if (!found) {
+		struct stack_record *new =
+			depot_alloc_stack(trace->entries, trace->nr_entries,
+					  hash, &prealloc, alloc_flags);
+		if (new) {
+			new->next = *bucket;
+			/*
+			 * This smp_store_release() pairs with
+			 * smp_load_acquire() from |bucket| above.
+			 */
+			smp_store_release(bucket, new);
+			found = new;
+		}
+	} else if (prealloc) {
+		/*
+		 * We didn't need to store this stack trace, but let's keep
+		 * the preallocated memory for the future.
+		 */
+		WARN_ON(!init_stack_slab(&prealloc));
+	}
+
+	spin_unlock_irqrestore(&depot_lock, flags);
+exit:
+	if (prealloc) {
+		/* Nobody used this memory, ok to free it. */
+		free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER);
+	}
+	if (found)
+		retval = found->handle.handle;
+fast_exit:
+	return retval;
+}
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/kasan-checks.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 
@@ -106,6 +107,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	src_addr = (unsigned long)src;
 	if (likely(src_addr < max_addr)) {
 		unsigned long max = max_addr - src_addr;
+		kasan_check_write(dst, count);
 		return do_strncpy_from_user(dst, src, count, max);
 	}
 	return -EFAULT;
--- /dev/null
+++ b/lib/test_kasan.c
@@ -0,0 +1,277 @@
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+static noinline void __init kmalloc_oob_right(void)
+{
+	char *ptr;
+	size_t size = 123;
+
+	pr_info("out-of-bounds to right\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 'x';
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_left(void)
+{
+	char *ptr;
+	size_t size = 15;
+
+	pr_info("out-of-bounds to left\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	*ptr = *(ptr - 1);
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_node_oob_right(void)
+{
+	char *ptr;
+	size_t size = 4096;
+
+	pr_info("kmalloc_node(): out-of-bounds to right\n");
+	ptr = kmalloc_node(size, GFP_KERNEL, 0);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 0;
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_large_oob_rigth(void)
+{
+	char *ptr;
+	size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+	pr_info("kmalloc large allocation: out-of-bounds to right\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 0;
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_krealloc_more(void)
+{
+	char *ptr1, *ptr2;
+	size_t size1 = 17;
+	size_t size2 = 19;
+
+	pr_info("out-of-bounds after krealloc more\n");
+	ptr1 = kmalloc(size1, GFP_KERNEL);
+	ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		return;
+	}
+
+	ptr2[size2] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_krealloc_less(void)
+{
+	char *ptr1, *ptr2;
+	size_t size1 = 17;
+	size_t size2 = 15;
+
+	pr_info("out-of-bounds after krealloc less\n");
+	ptr1 = kmalloc(size1, GFP_KERNEL);
+	ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		return;
+	}
+	ptr2[size1] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_16(void)
+{
+	struct {
+		u64 words[2];
+	} *ptr1, *ptr2;
+
+	pr_info("kmalloc out-of-bounds for 16-bytes access\n");
+	ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+	ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		kfree(ptr2);
+		return;
+	}
+	*ptr1 = *ptr2;
+	kfree(ptr1);
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_in_memset(void)
+{
+	char *ptr;
+	size_t size = 666;
+
+	pr_info("out-of-bounds in memset\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	memset(ptr, 0, size+5);
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_uaf(void)
+{
+	char *ptr;
+	size_t size = 10;
+
+	pr_info("use-after-free\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr);
+	*(ptr + 8) = 'x';
+}
+
+static noinline void __init kmalloc_uaf_memset(void)
+{
+	char *ptr;
+	size_t size = 33;
+
+	pr_info("use-after-free in memset\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr);
+	memset(ptr, 0, size);
+}
+
+static noinline void __init kmalloc_uaf2(void)
+{
+	char *ptr1, *ptr2;
+	size_t size = 43;
+
+	pr_info("use-after-free after another kmalloc\n");
+	ptr1 = kmalloc(size, GFP_KERNEL);
+	if (!ptr1) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr1);
+	ptr2 = kmalloc(size, GFP_KERNEL);
+	if (!ptr2) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr1[40] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmem_cache_oob(void)
+{
+	char *p;
+	size_t size = 200;
+	struct kmem_cache *cache = kmem_cache_create("test_cache",
+						size, 0,
+						0, NULL);
+	if (!cache) {
+		pr_err("Cache allocation failed\n");
+		return;
+	}
+	pr_info("out-of-bounds in kmem_cache_alloc\n");
+	p = kmem_cache_alloc(cache, GFP_KERNEL);
+	if (!p) {
+		pr_err("Allocation failed\n");
+		kmem_cache_destroy(cache);
+		return;
+	}
+
+	*p = p[size];
+	kmem_cache_free(cache, p);
+	kmem_cache_destroy(cache);
+}
+
+static char global_array[10];
+
+static noinline void __init kasan_global_oob(void)
+{
+	volatile int i = 3;
+	char *p = &global_array[ARRAY_SIZE(global_array) + i];
+
+	pr_info("out-of-bounds global variable\n");
+	*(volatile char *)p;
+}
+
+static noinline void __init kasan_stack_oob(void)
+{
+	char stack_array[10];
+	volatile int i = 0;
+	char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
+
+	pr_info("out-of-bounds on stack\n");
+	*(volatile char *)p;
+}
+
+static int __init kmalloc_tests_init(void)
+{
+	kmalloc_oob_right();
+	kmalloc_oob_left();
+	kmalloc_node_oob_right();
+	kmalloc_large_oob_right();
+	kmalloc_oob_krealloc_more();
+	kmalloc_oob_krealloc_less();
+	kmalloc_oob_16();
+	kmalloc_oob_in_memset();
+	kmalloc_uaf();
+	kmalloc_uaf_memset();
+	kmalloc_uaf2();
+	kmem_cache_oob();
+	kasan_stack_oob();
+	kasan_global_oob();
+	return -EAGAIN;
+}
+
+module_init(kmalloc_tests_init);
+MODULE_LICENSE("GPL");
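
Every test above follows the same shape: allocate, perform exactly one deliberately invalid access so KASAN emits a report, then release the object; kmalloc_tests_init() returns -EAGAIN so the module is never left loaded after the tests run. A hypothetical extra test in the same style (the name, size and memcpy pattern are made up for illustration; it would also need a call added to kmalloc_tests_init()):

static noinline void __init kmalloc_oob_memcpy(void)
{
	char dst[80];
	char *ptr;
	size_t size = 64;

	pr_info("out-of-bounds in memcpy\n");
	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("Allocation failed\n");
		return;
	}

	/* Reads 4 bytes past the end of the 64-byte heap object. */
	memcpy(dst, ptr, size + 4);
	kfree(ptr);
}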
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -549,6 +549,37 @@ config MEM_SOFT_DIRTY
 
 	  See Documentation/vm/soft-dirty.txt for more details.
 
+config TCACHE
+	bool "Transcendent file cache"
+	depends on CLEANCACHE
+	default n
+	help
+	  Transcendent file cache is a simple backend for cleancache, which
+	  stores reclaimed pages in memory without any modifications. It is
+	  only worth enabling along with memory cgroups, in order to cache
+	  pages that were reclaimed due to local memory pressure.
+
+config TSWAP
+	bool "Transcendent swap cache"
+	depends on FRONTSWAP
+	default n
+	help
+	  Transcendent swap cache is a simple backend for frontswap, which
+	  stores reclaimed pages in memory without any modifications. It is
+	  only worth enabling along with memory cgroups, in order to cache
+	  pages that were reclaimed due to local memory pressure.
+
+config IDLE_PAGE_TRACKING
+	bool "Enable idle page tracking"
+	depends on SYSFS && MMU && 64BIT
+	help
+	  This feature allows one to estimate the number of user pages that
+	  have not been touched during a given period of time. This information
+	  can be useful for tuning memory cgroup limits and/or for job placement
+	  within a compute cluster.
+
+	  See Documentation/vm/idle_page_tracking.txt for more details.
+
 config ZSWAP
 	bool "Compressed cache for swap pages (EXPERIMENTAL)"
 	depends on FRONTSWAP && CRYPTO=y
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,8 +2,27 @@
 # Makefile for the linux memory manager.
 #
 
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slab.o := n
+KASAN_SANITIZE_slub.o := n
+
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
 mmu-y			:= nommu.o
-mmu-$(CONFIG_MMU)	:= fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU)	:= gup.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o pgtable-generic.o
 
@@ -17,7 +36,8 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o \
-			   interval_tree.o list_lru.o workingset.o $(mmu-y)
+			   interval_tree.o list_lru.o workingset.o oom_group.o \
+			   iov-iter.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -45,6 +65,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MIGRATION) += migrate.o
@@ -65,3 +86,6 @@ obj-$(CONFIG_ZBUD)	+= zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_TCACHE) += tcache.o
+obj-$(CONFIG_TSWAP) += tswap.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = 0;
+	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
+	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+		if (inode->i_state & I_DIRTY_TIME)
+			nr_dirty_time++;
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty:            %10lu\n"
 		   "b_io:               %10lu\n"
 		   "b_more_io:          %10lu\n"
+		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
+		   nr_dirty_time,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -184,43 +189,52 @@ static ssize_t name##_show(struct device *dev,				\
 
 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
 
-static ssize_t min_ratio_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
+static inline ssize_t generic_uint_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count,
+		int (*set_func) (struct backing_dev_info *, unsigned int))
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
+	unsigned int val;
 	ssize_t ret;
 
-	ret = kstrtouint(buf, 10, &ratio);
+	ret = kstrtouint(buf, 10, &val);
 	if (ret < 0)
 		return ret;
 
-	ret = bdi_set_min_ratio(bdi, ratio);
+	ret = set_func(bdi, val);
 	if (!ret)
 		ret = count;
 
 	return ret;
 }
+
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_ratio);
+}
 BDI_SHOW(min_ratio, bdi->min_ratio)
 
 static ssize_t max_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
-	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
-	ssize_t ret;
-
-	ret = kstrtouint(buf, 10, &ratio);
-	if (ret < 0)
-		return ret;
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_ratio);
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
 
-	ret = bdi_set_max_ratio(bdi, ratio);
-	if (!ret)
-		ret = count;
+static ssize_t min_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_dirty);
+}
+BDI_SHOW(min_dirty_pages, bdi->min_dirty_pages)
 
-	return ret;
+static ssize_t max_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_dirty);
 }
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_dirty_pages, bdi->max_dirty_pages)
 
 static ssize_t stable_pages_required_show(struct device *dev,
 					  struct device_attribute *attr,
@@ -236,6 +250,8 @@ static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(min_dirty_pages),
+	__ATTR_RW(max_dirty_pages),
 	__ATTR_RO(stable_pages_required),
 	__ATTR_NULL,
 };
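
With generic_uint_store() factored out, every unsigned-int bdi knob reduces to a setter in the core plus a two-line sysfs wrapper, which is exactly how min_dirty_pages and max_dirty_pages are wired up above. A sketch of what one further knob would look like (the name "foo" and bdi_set_foo() are hypothetical; generic_uint_store(), BDI_SHOW() and __ATTR_RW() are the helpers used above):

static ssize_t foo_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	return generic_uint_store(dev, attr, buf, count, bdi_set_foo);
}
BDI_SHOW(foo, bdi->foo)

/* plus one more entry in bdi_dev_attrs[]: __ATTR_RW(foo), */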
@@ -427,6 +443,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	INIT_LIST_HEAD(&wb->b_dirty_time);
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
@@ -445,9 +462,12 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
+	bdi->min_dirty_pages = 0;
+	bdi->max_dirty_pages = 0;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
+	init_waitqueue_head(&bdi->cong_waitq);
 
 	bdi_wb_init(&bdi->wb, bdi);
 
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,11 +15,12 @@
 #include <linux/fs.h>
 #include <linux/exportfs.h>
 #include <linux/mm.h>
+#include <linux/memcontrol.h>
 #include <linux/debugfs.h>
 #include <linux/cleancache.h>
 
 /*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
 static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +35,107 @@ static u64 cleancache_failed_gets;
 static u64 cleancache_puts;
 static u64 cleancache_invalidates;
 
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- *	a) mount /	-> __cleancache_init_fs is called. We set the
- *		[shared_|]fs_poolid_map and uuids for.
- *
- *	b). user does I/Os -> we call the rest of __cleancache_* functions
- *		which return immediately as cleancache_ops is false.
- *
- *	c). modprobe zcache -> cleancache_register_ops. We init the backend
- *		and set cleancache_ops to true, and for any fs_poolid_map
- *		(which is set by __cleancache_init_fs) we initialize the poolid.
- *
- *	d). user does I/Os -> now that cleancache_ops is true all the
- *		__cleancache_* functions can call the backend. They all check
- *		that fs_poolid_map is valid and if so invoke the backend.
- *
- *	e). umount /	-> __cleancache_invalidate_fs, the fs_poolid_map is
- *		reset (which is the second check in the __cleancache_* ops
- *		to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+	switch (sb->cleancache_poolid) {
+	case CLEANCACHE_NO_BACKEND:
+		__cleancache_init_fs(sb);
+		break;
+	case CLEANCACHE_NO_BACKEND_SHARED:
+		__cleancache_init_shared_fs(sb);
+		break;
+	}
+}
 
 /*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
  */
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
 {
-	struct cleancache_ops *old = cleancache_ops;
-	int i;
+	if (cmpxchg(&cleancache_ops, NULL, ops))
+		return -EBUSY;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (fs_poolid_map[i] == FS_NO_BACKEND)
-			fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
-		if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
-			shared_fs_poolid_map[i] = ops->init_shared_fs
-					(uuids[i], PAGE_SIZE);
-	}
 	/*
-	 * We MUST set cleancache_ops _after_ we have called the backends
-	 * init_fs or init_shared_fs functions. Otherwise the compiler might
-	 * re-order where cleancache_ops is set in this function.
+	 * A cleancache backend can be built as a module and hence loaded after
+	 * a cleancache-enabled filesystem has called cleancache_init_fs. To
+	 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+	 * for each active super block. To differentiate between local and
+	 * shared filesystems, we temporarily initialize sb->cleancache_poolid
+	 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+	 * respectively in case there is no backend registered at the time
+	 * cleancache_init_fs or cleancache_init_shared_fs is called.
+	 *
+	 * Since filesystems can be mounted concurrently with cleancache
+	 * backend registration, we have to be careful to guarantee that all
+	 * cleancache-enabled filesystems that have been mounted by the time
+	 * cleancache_register_ops is called have got, and all mounted later
+	 * will get, a cleancache_poolid. This is assured by the following
+	 * statements tied together:
+	 *
+	 * a) iterate_supers skips only those super blocks that have started
+	 *    ->kill_sb
+	 *
+	 * b) if iterate_supers encounters a super block that has not finished
+	 *    ->mount yet, it waits until it is finished
+	 *
+	 * c) cleancache_init_fs is called from ->mount and
+	 *    cleancache_invalidate_fs is called from ->kill_sb
+	 *
+	 * d) we call iterate_supers after cleancache_ops has been set
+	 *
+	 * From a) it follows that if iterate_supers skips a super block, then
+	 * either the super block is already dead, in which case we do not need
+	 * to bother initializing cleancache for it, or it was mounted after we
+	 * initiated iterate_supers. In the latter case, it must have seen
+	 * cleancache_ops set according to d) and initialized cleancache from
+	 * ->mount by itself according to c). This proves that we call
+	 * ->init_fs at least once for each active super block.
+	 *
+	 * From b) and c) it follows that if iterate_supers encounters a super
+	 * block that has already started ->init_fs, it will wait until ->mount
+	 * and hence ->init_fs has finished, then check cleancache_poolid, see
+	 * that it has already been set and therefore do nothing. This proves
+	 * that we call ->init_fs no more than once for each super block.
+	 *
+	 * Taken together, the last two paragraphs prove the correctness of
+	 * this function.
+	 *
+	 * Note that various cleancache callbacks may proceed before this
+	 * function is called or even concurrently with it, but since
+	 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+	 * until the corresponding ->init_fs has been actually called and
+	 * cleancache_ops has been set.
 	 */
-	barrier();
-	cleancache_ops = ops;
-	mutex_unlock(&poolid_mutex);
-	return old;
+	iterate_supers(cleancache_register_ops_sb, NULL);
+	return 0;
 }
 EXPORT_SYMBOL(cleancache_register_ops);
 
 /* Called by a cleancache-enabled filesystem at time of mount */
 void __cleancache_init_fs(struct super_block *sb)
 {
-	int i;
+	int pool_id = CLEANCACHE_NO_BACKEND;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (fs_poolid_map[i] == FS_UNKNOWN) {
-			sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
-			if (cleancache_ops)
-				fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
-			else
-				fs_poolid_map[i] = FS_NO_BACKEND;
-			break;
-		}
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
 	}
-	mutex_unlock(&poolid_mutex);
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_fs);
 
 /* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
 {
-	int i;
+	int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
-			sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
-			uuids[i] = uuid;
-			if (cleancache_ops)
-				shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
-						(uuid, PAGE_SIZE);
-			else
-				shared_fs_poolid_map[i] = FS_NO_BACKEND;
-			break;
-		}
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
 	}
-	mutex_unlock(&poolid_mutex);
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_shared_fs);
 
@@ -201,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
 	return 0;
 }
 
-/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
-	if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
-		return shared_fs_poolid_map[fake_pool_id -
-			FAKE_SHARED_FS_POOLID_OFFSET];
-	else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
-		return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
-	return FS_NO_BACKEND;
-}
-
 /*
  * "Get" data from cleancache associated with the poolid/inode/index
  * that were specified when the data was put to cleanache and, if
@@ -229,7 +179,6 @@ int __cleancache_get_page(struct page *page)
 {
 	int ret = -1;
 	int pool_id;
-	int fake_pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops) {
@@ -238,17 +187,14 @@ int __cleancache_get_page(struct page *page)
 	}
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
-	if (fake_pool_id < 0)
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id < 0)
 		goto out;
-	pool_id = get_poolid_from_fake(fake_pool_id);
 
 	if (cleancache_get_key(page->mapping->host, &key) < 0)
 		goto out;
 
-	if (pool_id >= 0)
-		ret = cleancache_ops->get_page(pool_id,
-				key, page->index, page);
+	ret = cleancache_ops->get_page(pool_id, key, page->index, page);
 	if (ret == 0)
 		cleancache_succ_gets++;
 	else
@@ -271,25 +217,21 @@ EXPORT_SYMBOL(__cleancache_get_page);
 void __cleancache_put_page(struct page *page)
 {
 	int pool_id;
-	int fake_pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
-	if (!cleancache_ops) {
-		cleancache_puts++;
+	if (!cleancache_ops)
 		return;
-	}
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
-	if (fake_pool_id < 0)
-		return;
-
-	pool_id = get_poolid_from_fake(fake_pool_id);
-
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (pool_id >= 0 &&
 		cleancache_get_key(page->mapping->host, &key) >= 0) {
-		cleancache_ops->put_page(pool_id, key, page->index, page);
-		cleancache_puts++;
+		if (!mem_cgroup_cleancache_disabled(page)) {
+			cleancache_puts += cleancache_ops->put_page(pool_id, key,
+						 page->index, page);
+		} else
+			cleancache_ops->invalidate_page(pool_id, key,
+							page->index);
 	}
 }
 EXPORT_SYMBOL(__cleancache_put_page);
@@ -306,18 +248,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
 					struct page *page)
 {
 	/* careful... page->mapping is NULL sometimes when this is called */
-	int pool_id;
-	int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops)
 		return;
 
-	if (fake_pool_id >= 0) {
-		pool_id = get_poolid_from_fake(fake_pool_id);
-		if (pool_id < 0)
-			return;
-
+	if (pool_id >= 0) {
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		if (cleancache_get_key(mapping->host, &key) >= 0) {
 			cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +276,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
  */
 void __cleancache_invalidate_inode(struct address_space *mapping)
 {
-	int pool_id;
-	int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops)
 		return;
 
-	if (fake_pool_id < 0)
-		return;
-
-	pool_id = get_poolid_from_fake(fake_pool_id);
-
 	if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
 		cleancache_ops->invalidate_inode(pool_id, key);
 }
@@ -363,32 +294,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
  */
 void __cleancache_invalidate_fs(struct super_block *sb)
 {
-	int index;
-	int fake_pool_id = sb->cleancache_poolid;
-	int old_poolid = fake_pool_id;
+	int pool_id;
 
-	mutex_lock(&poolid_mutex);
-	if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
-		index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
-		old_poolid = shared_fs_poolid_map[index];
-		shared_fs_poolid_map[index] = FS_UNKNOWN;
-		uuids[index] = NULL;
-	} else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
-		index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
-		old_poolid = fs_poolid_map[index];
-		fs_poolid_map[index] = FS_UNKNOWN;
-	}
-	sb->cleancache_poolid = -1;
-	if (cleancache_ops)
-		cleancache_ops->invalidate_fs(old_poolid);
-	mutex_unlock(&poolid_mutex);
+	pool_id = sb->cleancache_poolid;
+	sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+	if (cleancache_ops && pool_id >= 0)
+		cleancache_ops->invalidate_fs(pool_id);
 }
 EXPORT_SYMBOL(__cleancache_invalidate_fs);
 
 static int __init init_cleancache(void)
 {
-	int i;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *root = debugfs_create_dir("cleancache", NULL);
 	if (root == NULL)
@@ -400,10 +317,6 @@ static int __init init_cleancache(void)
 	debugfs_create_u64("invalidates", S_IRUGO,
 				root, &cleancache_invalidates);
 #endif
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		fs_poolid_map[i] = FS_UNKNOWN;
-		shared_fs_poolid_map[i] = FS_UNKNOWN;
-	}
 	return 0;
 }
 module_init(init_cleancache)
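
Under the reworked registration path a single backend wins the cmpxchg() on cleancache_ops, and iterate_supers() then initialises every filesystem that mounted before the backend loaded. A sketch of how a backend module (such as the tcache backend this series adds) would register; the my_* callbacks are hypothetical, while the cleancache_ops field names match the ones invoked in this file:

static struct cleancache_ops my_cleancache_ops = {
	.init_fs		= my_init_fs,
	.init_shared_fs		= my_init_shared_fs,
	.get_page		= my_get_page,
	.put_page		= my_put_page,
	.invalidate_page	= my_invalidate_page,
	.invalidate_inode	= my_invalidate_inode,
	.invalidate_fs		= my_invalidate_fs,
};

static int __init my_backend_init(void)
{
	/* Fails with -EBUSY if another backend registered first. */
	return cleancache_register_ops(&my_cleancache_ops);
}
module_init(my_backend_init);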
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
+#include <linux/kasan.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -59,6 +60,7 @@ static void map_pages(struct list_head *list)
 	list_for_each_entry(page, list, lru) {
 		arch_alloc_page(page, 0);
 		kernel_map_pages(page, 1, 1);
+		kasan_alloc_pages(page, 0);
 	}
 }
 
@@ -431,6 +433,10 @@ static bool too_many_isolated(struct zone *zone)
 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 					zone_page_state(zone, NR_ISOLATED_ANON);
 
+	if (isolated > (inactive + active) / 2)
+		isolated = zone_page_state_snapshot(zone, NR_ISOLATED_FILE) +
+			   zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+
 	return isolated > (inactive + active) / 2;
 }
 
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -7,6 +7,7 @@
  *		Initial version.
  */
 
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -21,15 +22,51 @@
 
 #include <asm/unistd.h>
 
+static void fadvise_deactivate(struct address_space *mapping,
+		pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t index = start;
+	int i;
+
+	if (start > end)
+		return;
+
+	/*
+	 * Note: this function may get called on a shmem/tmpfs mapping:
+	 * pagevec_lookup() might then return 0 prematurely (because it
+	 * got a gangful of swap entries); but it's hardly worth worrying
+	 * about - it can rarely have anything to free from such a mapping
+	 * (most pages are dirty), and already skips over any difficulties.
+	 */
+
+	pagevec_init(&pvec, 0);
+	while (index <= end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			/* We rely upon deletion not changing page->index */
+			index = page->index;
+			if (index > end)
+				break;
+
+			deactivate_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+		index++;
+	}
+}
+
 /*
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 {
-	struct fd f = fdget(fd);
 	struct inode *inode;
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
@@ -37,20 +74,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	unsigned long nrpages;
 	int ret = 0;
 
-	if (!f.file)
-		return -EBADF;
-
-	inode = file_inode(f.file);
-	if (S_ISFIFO(inode->i_mode)) {
-		ret = -ESPIPE;
-		goto out;
-	}
-
-	mapping = f.file->f_mapping;
-	if (!mapping || len < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
+	inode = file_inode(file);
 
 	if (IS_DAX(inode)) {
 		switch (advice) {
@@ -60,6 +84,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		case POSIX_FADV_WILLNEED:
 		case POSIX_FADV_NOREUSE:
 		case POSIX_FADV_DONTNEED:
+		case FADV_DEACTIVATE:
 			/* no bad return value, but ignore advice */
 			break;
 		default:
@@ -79,21 +104,21 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
-		f.file->f_ra.ra_pages = bdi->ra_pages;
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode |= FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		spin_lock(&file->f_lock);
+		file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		/* First and last PARTIAL page! */
@@ -109,7 +134,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		 * Ignore return value because fadvise() shall return
 		 * success even if filesystem can't retrieve a hint,
 		 */
-		force_page_cache_readahead(mapping, f.file, start_index,
+		force_page_cache_readahead(mapping, file, start_index,
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
@@ -140,11 +165,43 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 			}
 		}
 		break;
+	case FADV_DEACTIVATE:
+		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		end_index = (endbyte >> PAGE_CACHE_SHIFT);
+		fadvise_deactivate(mapping, start_index, end_index);
+		break;
 	default:
 		ret = -EINVAL;
 	}
 out:
-	fdput(f);
+	return ret;
+}
+EXPORT_SYMBOL(generic_fadvise);
+
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+{
+	struct file *file = fget(fd);
+	int (*fadvise)(struct file *, loff_t, loff_t, int) = generic_fadvise;
+	int ret = 0;
+
+	if (!file)
+		return -EBADF;
+
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
+	if (!file->f_mapping || len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (file->f_op && file->f_op->fadvise)
+		fadvise = file->f_op->fadvise;
+
+	ret = fadvise(file, offset, len, advice);
+out:
+	fput(file);
 	return ret;
 }
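
FADV_DEACTIVATE is a non-POSIX advice value added by this series: fadvise_deactivate() walks the requested range with a pagevec and calls deactivate_page() on each page, moving cached pages to the inactive list so they become the first reclaim candidates without being dropped outright. A userspace sketch of issuing it over a whole file (the numeric value of FADV_DEACTIVATE comes from a uapi header elsewhere in the series, so the fallback define below is a placeholder only):

#include <fcntl.h>
#include <stdio.h>

#ifndef FADV_DEACTIVATE
#define FADV_DEACTIVATE	10	/* placeholder, not the real value */
#endif

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* len == 0 means "to end of file"; glibc passes the advice value
	 * straight through to the fadvise64 syscall. */
	if (posix_fadvise(fd, 0, 0, FADV_DEACTIVATE))
		perror("posix_fadvise");
	return 0;
}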
 
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
 #include "internal.h"
@@ -46,6 +47,9 @@
 
 #include <asm/mman.h>
 
+#include <linux/virtinfo.h>
+#include <bc/io_acct.h>
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -135,7 +139,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
 
 	if (!node) {
 		/* Clear direct pointer tags in root node */
-		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		__radix_tree_root_tag_move_all_to_prev(&mapping->page_tree);
 		radix_tree_replace_slot(slot, shadow);
 		return;
 	}
@@ -146,6 +150,8 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		if (test_bit(offset, node->tags[tag]))
 			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+		else
+			__radix_tree_prev_tag_clear(&mapping->page_tree, tag);
 	}
 
 	/* Delete page, swap shadow entry */
@@ -169,7 +175,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
 	    list_empty(&node->private_list)) {
 		node->private_data = mapping;
-		workingset_remember_node(node);
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
 	}
 }
 
@@ -194,6 +200,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 		cleancache_invalidate_page(mapping, page);
 
 	page_cache_tree_delete(mapping, page, shadow);
+	if (mapping_cap_account_dirty(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY))
+		ub_io_account_cancel(mapping);
+
+	if (mapping_cap_account_writeback(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK))
+		ub_io_writeback_dec(mapping);
 
 	page->mapping = NULL;
 	/* Leave page->index set: truncation lookup relies upon it */
@@ -235,7 +250,6 @@ void delete_from_page_cache(struct page *page)
 	spin_lock_irq(&mapping->tree_lock);
 	__delete_from_page_cache(page, NULL);
 	spin_unlock_irq(&mapping->tree_lock);
-	mem_cgroup_uncharge_cache_page(page);
 
 	if (freepage)
 		freepage(page);
@@ -533,8 +547,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
-		/* mem_cgroup codes must not be called under tree_lock */
-		mem_cgroup_replace_page_cache(old, new);
+		mem_cgroup_migrate(old, new, true);
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
@@ -595,7 +608,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		 * mapping->tree_lock.
 		 */
 		if (!list_empty(&node->private_list))
-			workingset_forget_node(node);
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
 	}
 	return 0;
 }
@@ -605,15 +619,19 @@ static int __add_to_page_cache_locked(struct page *page,
 				      pgoff_t offset, gfp_t gfp_mask,
 				      void **shadowp)
 {
+	int huge = PageHuge(page);
+	struct mem_cgroup *memcg;
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageSwapBacked(page));
 
-	error = mem_cgroup_cache_charge(page, current->mm,
-					gfp_mask & GFP_RECLAIM_MASK);
-	if (error)
-		goto out;
+	if (!huge) {
+		error = mem_cgroup_try_charge(page, current->mm, gfp_mask,
+					&memcg);
+		if (error)
+			return error;
+	}
 
 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
@@ -626,18 +644,20 @@ static int __add_to_page_cache_locked(struct page *page,
 		if (likely(!error)) {
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			spin_unlock_irq(&mapping->tree_lock);
+			if (!huge)
+				mem_cgroup_commit_charge(page, memcg, false);
 			trace_mm_filemap_add_to_page_cache(page);
 		} else {
 			page->mapping = NULL;
 			/* Leave page->index set: truncation relies upon it */
 			spin_unlock_irq(&mapping->tree_lock);
-			mem_cgroup_uncharge_cache_page(page);
+			if (!huge)
+				mem_cgroup_cancel_charge(page, memcg);
 			page_cache_release(page);
 		}
 		radix_tree_preload_end();
-	} else
-		mem_cgroup_uncharge_cache_page(page);
-out:
+	} else if (!huge)
+		mem_cgroup_cancel_charge(page, memcg);
 	return error;
 }
 
@@ -1575,162 +1595,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 	ra->ra_pages /= 4;
 }
 
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-	                 struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *from;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		from = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_to_user_inatomic(buf, from, copy);
-		copy -= left;
-		skip += copy;
-		from += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_to_user_inatomic(buf, from, copy);
-			copy -= left;
-			skip = copy;
-			from += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = from - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	from = kaddr + offset;
-	left = __copy_to_user(buf, from, copy);
-	copy -= left;
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_to_user(buf, from, copy);
-		copy -= left;
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-			   struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *to;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		to = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_from_user_inatomic(to, buf, copy);
-		copy -= left;
-		skip += copy;
-		to += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_from_user_inatomic(to, buf, copy);
-			copy -= left;
-			skip = copy;
-			to += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = to - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	to = kaddr + offset;
-	left = __copy_from_user(to, buf, copy);
-	copy -= left;
-	skip += copy;
-	to += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_from_user(to, buf, copy);
-		copy -= left;
-		skip = copy;
-		to += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
 /**
  * do_generic_file_read - generic file read routine
  * @filp:	the file to read
@@ -1772,6 +1636,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 		cond_resched();
 find_page:
 		page = find_get_page(mapping, index);
+		if (!page && mapping->i_peer_file) {
+			page = pick_peer_page(mapping, index, ra,
+					      last_index - index);
+			if (page)
+				goto page_ok;
+		}
 		if (!page) {
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -1864,6 +1734,8 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 		goto out;
 
 page_not_up_to_date:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/* Get exclusive access to the page ... */
 		error = lock_page_killable(page);
 		if (unlikely(error))
@@ -1931,6 +1803,8 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 		goto out;
 
 no_cached_page:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/*
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
@@ -2038,31 +1912,60 @@ int generic_segment_checks(const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_segment_checks);
 
+static ssize_t mapping_direct_IO(struct address_space *mapping, int rw,
+			         struct kiocb *iocb, struct iov_iter *iter,
+			         loff_t pos)
+{
+	if (iov_iter_has_iovec(iter))
+		return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter),
+						 pos, iter->nr_segs);
+	else if (iov_iter_has_bvec(iter))
+		return mapping->a_ops->direct_IO_bvec(rw, iocb,
+						      iov_iter_bvec(iter), pos,
+						      iter->nr_segs);
+	else if (iov_iter_has_page(iter))
+		return mapping->a_ops->direct_IO_page(rw, iocb,
+						      iov_iter_page(iter), pos);
+	else
+		BUG();
+}
+
+static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
+				unsigned long offset, unsigned long size)
+{
+	struct iov_iter *iter = desc->arg.data;
+	unsigned long copied = 0;
+
+	if (size > desc->count)
+		size = desc->count;
+
+	copied = iov_iter_copy_to_user(page, iter, offset, size);
+	if (copied < size)
+		desc->error = -EFAULT;
+
+	iov_iter_advance(iter, copied);
+	desc->count -= copied;
+	desc->written += copied;
+
+	return copied;
+}
+
 /**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
  * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
+ * @iov_iter:	memory vector
  * @pos:	current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
  */
 ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg = 0;
-	size_t count;
+	read_descriptor_t desc;
+	ssize_t retval = 0;
+	size_t count = iov_iter_count(iter);
 	loff_t *ppos = &iocb->ki_pos;
 
-	count = 0;
-	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-	if (retval)
-		return retval;
-
 	if (io_is_direct(filp)) {
 		loff_t size;
 		struct address_space *mapping;
@@ -2073,30 +1976,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
-		if (pos < size) {
-			retval = filemap_write_and_wait_range(mapping, pos,
-					pos + iov_length(iov, nr_segs) - 1);
-			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
-			}
-			if (retval > 0) {
-				*ppos = pos + retval;
-				count -= retval;
-			}
+		retval = filemap_write_and_wait_range(mapping, pos,
+				pos + count - 1);
+		if (!retval) {
+			retval = mapping_direct_IO(mapping, READ,
+						   iocb, iter, pos);
+		}
+		if (retval > 0) {
+			*ppos = pos + retval;
+			count -= retval;
+		}
 
-			/*
-			 * Btrfs can have a short DIO read if we encounter
-			 * compressed extents, so if there was an error, or if
-			 * we've already read everything we wanted to, or if
-			 * there was a short read because we hit EOF, go ahead
-			 * and return.  Otherwise fallthrough to buffered io for
-			 * the rest of the read.
-			 */
-			if (retval < 0 || !count || *ppos >= size) {
-				file_accessed(filp);
-				goto out;
-			}
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return.  Otherwise fallthrough to buffered io for
+		 * the rest of the read.
+		 */
+		if (retval < 0 || !count || *ppos >= size) {
+			file_accessed(filp);
+			goto out;
 		}
 
 		/*
@@ -2109,42 +2010,49 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	count = retval;
-	for (seg = 0; seg < nr_segs; seg++) {
-		read_descriptor_t desc;
-		loff_t offset = 0;
+	iov_iter_advance(iter, retval);
 
-		/*
-		 * If we did a short DIO read we need to skip the section of the
-		 * iov that we've already read data into.
-		 */
-		if (count) {
-			if (count > iov[seg].iov_len) {
-				count -= iov[seg].iov_len;
-				continue;
-			}
-			offset = count;
-			count = 0;
-		}
+	desc.written = 0;
+	desc.arg.data = iter;
+	desc.count = count;
+	desc.error = 0;
+	do_generic_file_read(filp, ppos, &desc, file_read_iter_actor);
 
-		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base + offset;
-		desc.count = iov[seg].iov_len - offset;
-		if (desc.count == 0)
-			continue;
-		desc.error = 0;
-		do_generic_file_read(filp, ppos, &desc, file_read_actor);
-		retval += desc.written;
-		if (desc.error) {
-			retval = retval ?: desc.error;
-			break;
-		}
-		if (desc.count > 0)
-			break;
-	}
+	retval += desc.written;
+	if (desc.error && !retval)
+		retval = desc.error;
 out:
 	return retval;
 }
+EXPORT_SYMBOL(generic_file_read_iter);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @pos:	current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct iov_iter iter;
+	int ret;
+	size_t count;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (ret)
+		return ret;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	return generic_file_read_iter(iocb, &iter, pos);
+}
 EXPORT_SYMBOL(generic_file_aio_read);
 
 #ifdef CONFIG_MMU
@@ -2162,6 +2070,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 	struct page *page; 
 	int ret;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	do {
 		page = page_cache_alloc_cold(mapping);
 		if (!page)
@@ -2279,12 +2189,24 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
+	if (!page && mapping->i_peer_file) {
+		page = pick_peer_page(mapping, offset, ra, ra->ra_pages);
+		if (page) {
+			vmf->page = page;
+			return 0; /* unlocked page */
+		}
+	}
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+
+		if (unlikely(!PageUptodate(page)))
+			virtinfo_notifier_call(VITYPE_IO,
+					VIRTINFO_IO_PREPARE, NULL);
+
 	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
@@ -2410,7 +2332,6 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -2595,150 +2516,6 @@ struct page *read_cache_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page);
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	BUG_ON(!in_atomic());
-	kaddr = kmap_atomic(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap_atomic(kaddr);
-
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct iovec *iov = i->iov;
-		size_t base = i->iov_offset;
-		unsigned long nr_segs = i->nr_segs;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !iov->iov_len)) {
-			int copy;
-
-			copy = min(bytes, iov->iov_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				nr_segs--;
-				base = 0;
-			}
-		}
-		i->iov = iov;
-		i->iov_offset = base;
-		i->nr_segs = nr_segs;
-	}
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-	char __user *buf = i->iov->iov_base + i->iov_offset;
-	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-	return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-	const struct iovec *iov = i->iov;
-	if (i->nr_segs == 1)
-		return i->count;
-	else
-		return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
 /*
  * Performs necessary checks before doing a write
  *
@@ -2844,9 +2621,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, size_t ocount)
+generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, size_t count)
 {
 	struct file	*file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -2855,10 +2631,13 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	size_t		write_len;
 	pgoff_t		end;
 
-	if (count != ocount)
-		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+	if (count != iov_iter_count(iter)) {
+		written = iov_iter_shorten(iter, count);
+		if (written)
+			goto out;
+	}
 
-	write_len = iov_length(iov, *nr_segs);
+	write_len = count;
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
 
 	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2871,21 +2650,19 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * about to write.  We do this *before* the write so that we can return
 	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
-	if (mapping->nrpages) {
-		written = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
-		/*
-		 * If a page can not be invalidated, return 0 to fall back
-		 * to buffered write.
-		 */
-		if (written) {
-			if (written == -EBUSY)
-				return 0;
-			goto out;
-		}
+	written = invalidate_inode_pages2_range(mapping,
+						pos >> PAGE_CACHE_SHIFT, end);
+	/*
+	 * If a page can not be invalidated, return 0 to fall back
+	 * to buffered write.
+	 */
+	if (written) {
+		if (written == -EBUSY)
+			return 0;
+		goto out;
 	}
 
-	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
@@ -2895,10 +2672,8 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * so we don't support it 100%.  If this invalidation
 	 * fails, tough, the write still worked...
 	 */
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
-	}
+	invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
 
 	if (written > 0) {
 		pos += written;
@@ -2911,6 +2686,23 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 out:
 	return written;
 }
+EXPORT_SYMBOL(generic_file_direct_write_iter);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, size_t ocount)
+{
+	struct iov_iter iter;
+	ssize_t ret;
+
+	iov_iter_init(&iter, iov, *nr_segs, ocount, 0);
+	ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count);
+	/* generic_file_direct_write_iter() might have shortened the vec */
+	if (*nr_segs != iter.nr_segs)
+		*nr_segs = iter.nr_segs;
+	return ret;
+}
 EXPORT_SYMBOL(generic_file_direct_write);
 
 /*
@@ -3044,16 +2836,15 @@ static ssize_t generic_perform_write(struct file *file,
 }
 
 ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t status;
-	struct iov_iter i;
 
-	iov_iter_init(&i, iov, nr_segs, count, written);
-	status = generic_perform_write(file, &i, pos);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	status = generic_perform_write(file, iter, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
@@ -3062,13 +2853,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	
 	return written ? written : status;
 }
+EXPORT_SYMBOL(generic_file_buffered_write_iter);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct iov_iter iter;
+	iov_iter_init(&iter, iov, nr_segs, count, written);
+	return generic_file_buffered_write_iter(iocb, &iter, pos, ppos,
+						written);
+}
 EXPORT_SYMBOL(generic_file_buffered_write);
 
 /**
  * __generic_file_aio_write - write data to a file
  * @iocb:	IO state structure (file, offset, etc.)
- * @iov:	vector with data to write
- * @nr_segs:	number of segments in the vector
+ * @iter:	iov_iter specifying memory to write
  * @ppos:	position where to write
  *
  * This function does all the work needed for actually writing data to a
@@ -3083,24 +2885,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
  * A caller has to handle it. This is mainly due to the fact that we want to
  * avoid syncing under i_mutex.
  */
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-				 unsigned long nr_segs, loff_t *ppos)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
-	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
 	loff_t		pos;
 	ssize_t		written;
 	ssize_t		err;
 
-	ocount = 0;
-	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-	if (err)
-		return err;
-
-	count = ocount;
+	count = iov_iter_count(iter);
 	pos = *ppos;
 
 	/* We can write back this queue in page reclaim */
@@ -3126,8 +2922,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		loff_t endbyte;
 		ssize_t written_buffered;
 
-		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-							ppos, count, ocount);
+		written = generic_file_direct_write_iter(iocb, iter, pos,
+							 ppos, count);
 		/*
 		 * If the write stopped short of completing, fall back to
 		 * buffered writes.  Some filesystems do this for writes to
@@ -3140,9 +2936,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 		pos += written;
 		count -= written;
-		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, ppos, count,
-						written);
+		iov_iter_advance(iter, written);
+		written_buffered = generic_file_buffered_write_iter(iocb, iter,
+						pos, ppos, written);
 		/*
 		 * If generic_file_buffered_write() retuned a synchronous error
 		 * then we want to return the number of bytes which were
@@ -3174,13 +2970,57 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 */
 		}
 	} else {
-		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, ppos, count, written);
+		iter->count = count;
+		written = generic_file_buffered_write_iter(iocb, iter,
+				pos, ppos, written);
 	}
 out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_write_iter);
+
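+/*
+ * Locked wrapper around __generic_file_write_iter(): i_mutex is held
+ * for the write itself, and generic_write_sync() then syncs the written
+ * range when the file requires it (O_SYNC and friends).
+ */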
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+			        loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
+ssize_t
+__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t *ppos)
+{
+	struct iov_iter iter;
+	size_t count;
+	int ret;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	if (ret)
+		goto out;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	ret = __generic_file_write_iter(iocb, &iter, ppos);
+out:
+	return ret;
+}
 EXPORT_SYMBOL(__generic_file_aio_write);
 
 /**
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,483 @@
+/*
+ *	linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <bc/vmpages.h>
+
+/*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
+static struct page *__xip_sparse_page;
+
+/* called under xip_sparse_mutex */
+static struct page *xip_sparse_page(void)
+{
+	if (!__xip_sparse_page) {
+		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+		if (page)
+			__xip_sparse_page = page;
+	}
+	return __xip_sparse_page;
+}
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is not used at all.  It may be NULL.
+ */
+static ssize_t
+do_xip_mapping_read(struct address_space *mapping,
+		    struct file_ra_state *_ra,
+		    struct file *filp,
+		    char __user *buf,
+		    size_t len,
+		    loff_t *ppos)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index, end_index;
+	unsigned long offset;
+	loff_t isize, pos;
+	size_t copied = 0, error = 0;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	pos = *ppos;
+	index = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+	do {
+		unsigned long nr, left;
+		void *xip_mem;
+		unsigned long xip_pfn;
+		int zero = 0;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = PAGE_CACHE_SIZE;
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (nr <= offset) {
+				goto out;
+			}
+		}
+		nr = nr - offset;
+		if (nr > len - copied)
+			nr = len - copied;
+
+		error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(error)) {
+			if (error == -ENODATA) {
+				/* sparse */
+				zero = 1;
+			} else
+				goto out;
+		}
+
+		/* If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(mapping))
+			/* address based flush */ ;
+
+		/*
+		 * Ok, we have the mem, so now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		if (!zero)
+			left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+		else
+			left = __clear_user(buf + copied, nr);
+
+		if (left) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		copied += (nr - left);
+		offset += (nr - left);
+		index += offset >> PAGE_CACHE_SHIFT;
+		offset &= ~PAGE_CACHE_MASK;
+	} while (copied < len);
+
+out:
+	*ppos = pos + copied;
+	if (filp)
+		file_accessed(filp);
+
+	return (copied ? copied : error);
+}
+
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+			    buf, len, ppos);
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+
+/*
+ * __xip_unmap is invoked from xip_file_fault and
+ * __xip_file_write.
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * __xip_sparse_page when found at pgoff.
+ */
+static void
+__xip_unmap(struct address_space *mapping, unsigned long pgoff)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	unsigned long address;
+	pte_t *pte;
+	pte_t pteval;
+	spinlock_t *ptl;
+	struct page *page;
+	unsigned count;
+	int locked = 0;
+
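+	/*
+	 * Walk the vmas unlocked first, guarded only by the seqcount; if a
+	 * writer raced with us, redo the walk while holding
+	 * xip_sparse_mutex so it cannot change underneath us again.
+	 */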
+	count = read_seqcount_begin(&xip_sparse_seq);
+
+	page = __xip_sparse_page;
+	if (!page)
+		return;
+
+retry:
+	mutex_lock(&mapping->i_mmap_mutex);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+		mm = vma->vm_mm;
+		address = vma->vm_start +
+			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+		pte = page_check_address(page, mm, address, &ptl, 1);
+		if (pte) {
+			/* Nuke the page table entry. */
+			flush_cache_page(vma, address, pte_pfn(*pte));
+			pteval = ptep_clear_flush(vma, address, pte);
+			page_remove_rmap(page);
+			dec_mm_counter(mm, MM_FILEPAGES);
+			BUG_ON(pte_dirty(pteval));
+			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
+			page_cache_release(page);
+		}
+	}
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	if (locked) {
+		mutex_unlock(&xip_sparse_mutex);
+	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+		mutex_lock(&xip_sparse_mutex);
+		locked = 1;
+		goto retry;
+	}
+}
+
+/*
+ * xip_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_fault, but used for execute in place
+ */
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	pgoff_t size;
+	void *xip_mem;
+	unsigned long xip_pfn;
+	struct page *page;
+	int error;
+
+	/* XXX: are VM_FAULT_ codes OK? */
+again:
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
+
+	error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+						&xip_mem, &xip_pfn);
+	if (likely(!error))
+		goto found;
+	if (error != -ENODATA)
+		return VM_FAULT_OOM;
+
+	/* sparse block */
+	if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+	    (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
+	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+		int err;
+
+		/* maybe shared writable, allocate new block */
+		mutex_lock(&xip_sparse_mutex);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+							&xip_mem, &xip_pfn);
+		mutex_unlock(&xip_sparse_mutex);
+		if (error)
+			return VM_FAULT_SIGBUS;
+		/* unmap sparse mappings at pgoff from all other vmas */
+		__xip_unmap(mapping, vmf->pgoff);
+
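+		/*
+		 * Common landing point for a freshly allocated block and a
+		 * block that was already present: insert the pfn straight
+		 * into the faulting vma.
+		 */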
+found:
+		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+							xip_pfn);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		/*
+		 * err == -EBUSY is fine, we've raced against another thread
+		 * that faulted-in the same page
+		 */
+		if (err != -EBUSY)
+			BUG_ON(err);
+		return VM_FAULT_NOPAGE;
+	} else {
+		int err, ret = VM_FAULT_OOM;
+
+		mutex_lock(&xip_sparse_mutex);
+		write_seqcount_begin(&xip_sparse_seq);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(!error)) {
+			write_seqcount_end(&xip_sparse_seq);
+			mutex_unlock(&xip_sparse_mutex);
+			goto again;
+		}
+		if (error != -ENODATA)
+			goto out;
+		/* not shared and writable, use xip_sparse_page() */
+		page = xip_sparse_page();
+		if (!page)
+			goto out;
+		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+							page);
+		if (err == -ENOMEM)
+			goto out;
+
+		ret = VM_FAULT_NOPAGE;
+out:
+		write_seqcount_end(&xip_sparse_seq);
+		mutex_unlock(&xip_sparse_mutex);
+
+		return ret;
+	}
+}
+
+static const struct vm_operations_struct xip_file_vm_ops = {
+	.fault	= xip_file_fault,
+	.page_mkwrite	= filemap_page_mkwrite,
+};
+
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+	file_accessed(file);
+	vma->vm_ops = &xip_file_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+		  size_t count, loff_t pos, loff_t *ppos)
+{
+	struct address_space * mapping = filp->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	size_t		bytes;
+	ssize_t		written = 0;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+		void *xip_mem;
+		unsigned long xip_pfn;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = a_ops->get_xip_mem(mapping, index, 0,
+						&xip_mem, &xip_pfn);
+		if (status == -ENODATA) {
+			/* allocate a new block and unmap the sparse page at this offset */
+			mutex_lock(&xip_sparse_mutex);
+			status = a_ops->get_xip_mem(mapping, index, 1,
+							&xip_mem, &xip_pfn);
+			mutex_unlock(&xip_sparse_mutex);
+			if (!status)
+				/* unmap page at pgoff from all other vmas */
+				__xip_unmap(mapping, index);
+		}
+
+		if (status)
+			break;
+
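+		/* copy the user data straight into the xip backing memory */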
+		copied = bytes -
+			__copy_from_user_nocache(xip_mem + offset, buf, bytes);
+
+		if (likely(copied > 0)) {
+			status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		if (status < 0)
+			break;
+	} while (count);
+	*ppos = pos;
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+
+	return written ? written : status;
+}
+
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+	       loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count;
+	loff_t pos;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret = -EFAULT;
+		goto out_up;
+	}
+
+	pos = *ppos;
+	count = len;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out_backing;
+	if (count == 0)
+		goto out_backing;
+
+	ret = file_remove_suid(filp);
+	if (ret)
+		goto out_backing;
+
+	ret = file_update_time(filp);
+	if (ret)
+		goto out_backing;
+
+	ret = __xip_file_write(filp, buf, count, pos, ppos);
+
+ out_backing:
+	current->backing_dev_info = NULL;
+ out_up:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analogous to block_truncate_page but uses get_xip_mem
+ * to get the page instead of page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize;
+	unsigned length;
+	void *xip_mem;
+	unsigned long xip_pfn;
+	int err;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	blocksize = 1 << mapping->host->i_blkbits;
+	length = offset & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	length = blocksize - length;
+
+	err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+						&xip_mem, &xip_pfn);
+	if (unlikely(err)) {
+		if (err == -ENODATA)
+			/* Hole? No need to truncate */
+			return 0;
+		else
+			return err;
+	}
+	memset(xip_mem + offset, 0, length);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
--- a/mm/fremap.c
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- *   linux/mm/fremap.c
- * 
- * Explicit pagetable population and nonlinear (random) mappings support.
- *
- * started by Ingo Molnar, Copyright (C) 2002, 2003
- */
-#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/swapops.h>
-#include <linux/rmap.h>
-#include <linux/syscalls.h>
-#include <linux/mmu_notifier.h>
-#include <linux/userfaultfd_k.h>
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#include "internal.h"
-
-static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	struct page *page;
-	swp_entry_t entry;
-
-	if (pte_present(pte)) {
-		flush_cache_page(vma, addr, pte_pfn(pte));
-		pte = ptep_clear_flush_notify(vma, addr, ptep);
-		page = vm_normal_page(vma, addr, pte);
-		if (page) {
-			if (pte_dirty(pte))
-				set_page_dirty(page);
-			update_hiwater_rss(mm);
-			dec_mm_counter(mm, mm_counter(page));
-			page_remove_rmap(page);
-			page_cache_release(page);
-		}
-	} else {	/* zap_pte() is not called when pte_none() */
-		if (!pte_file(pte)) {
-			update_hiwater_rss(mm);
-			entry = pte_to_swp_entry(pte);
-			if (non_swap_entry(entry)) {
-				if (is_migration_entry(entry)) {
-					page = migration_entry_to_page(entry);
-					dec_mm_counter(mm, mm_counter(page));
-				}
-			} else {
-				free_swap_and_cache(entry);
-				dec_mm_counter(mm, MM_SWAPENTS);
-			}
-		}
-		pte_clear_not_present_full(mm, addr, ptep, 0);
-	}
-}
-
-/*
- * Install a file pte to a given virtual memory address, release any
- * previously existing mapping.
- */
-static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pgoff, pgprot_t prot)
-{
-	int err = -ENOMEM;
-	pte_t *pte, ptfile;
-	spinlock_t *ptl;
-
-	pte = get_locked_pte(mm, addr, &ptl);
-	if (!pte)
-		goto out;
-
-	ptfile = pgoff_to_pte(pgoff);
-
-	if (!pte_none(*pte))
-		zap_pte(mm, vma, addr, pte);
-
-	set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
-	/*
-	 * We don't need to run update_mmu_cache() here because the "file pte"
-	 * being installed by install_file_pte() is not a real pte - it's a
-	 * non-present entry (like a swap entry), noting what file offset should
-	 * be mapped there when there's a fault (in a non-linear vma where
-	 * that's not obvious).
-	 */
-	pte_unmap_unlock(pte, ptl);
-	err = 0;
-out:
-	return err;
-}
-
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	int err;
-
-	do {
-		err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
-		if (err)
-			return err;
-
-		size -= PAGE_SIZE;
-		addr += PAGE_SIZE;
-		pgoff++;
-	} while (size);
-
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
-/**
- * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
- * @start: start of the remapped virtual memory range
- * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range (see NOTE)
- * @pgoff: to-be-mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
- *
- * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
- * (shared backing store file).
- *
- * This syscall works purely via pagetables, so it's the most efficient
- * way to map the same (large) file into a given virtual window. Unlike
- * mmap()/mremap() it does not create any new vmas. The new mappings are
- * also safe across swapout.
- *
- * NOTE: the @prot parameter right now is ignored (but must be zero),
- * and the vma's default protection is used. Arbitrary protections
- * might be implemented in the future.
- */
-SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
-		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
-{
-	struct mm_struct *mm = current->mm;
-	struct address_space *mapping;
-	struct vm_area_struct *vma;
-	int err = -EINVAL;
-	int has_write_lock = 0;
-	vm_flags_t vm_flags = 0;
-	LIST_HEAD(uf);
-
-	if (prot)
-		return err;
-	/*
-	 * Sanitize the syscall parameters:
-	 */
-	start = start & PAGE_MASK;
-	size = size & PAGE_MASK;
-
-	/* Does the address range wrap, or is the span zero-sized? */
-	if (start + size <= start)
-		return err;
-
-	/* Does pgoff wrap? */
-	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
-		return err;
-
-	/* Can we represent this offset inside this architecture's pte's? */
-#if PTE_FILE_MAX_BITS < BITS_PER_LONG
-	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
-		return err;
-#endif
-
-	/* We need down_write() to change vma->vm_flags. */
-	down_read(&mm->mmap_sem);
- retry:
-	vma = find_vma(mm, start);
-
-	/*
-	 * Make sure the vma is shared, that it supports prefaulting,
-	 * and that the remapped range is valid and fully within
-	 * the single existing vma.
-	 */
-	if (!vma || !(vma->vm_flags & VM_SHARED))
-		goto out;
-
-	if (!vma->vm_ops || !vma->vm_ops->remap_pages)
-		goto out;
-
-	if (start < vma->vm_start || start + size > vma->vm_end)
-		goto out;
-
-	/* Must set VM_NONLINEAR before any pages are populated. */
-	if (!(vma->vm_flags & VM_NONLINEAR)) {
-		/*
-		 * vm_private_data is used as a swapout cursor
-		 * in a VM_NONLINEAR vma.
-		 */
-		if (vma->vm_private_data)
-			goto out;
-
-		/* Don't need a nonlinear mapping, exit success */
-		if (pgoff == linear_page_index(vma, start)) {
-			err = 0;
-			goto out;
-		}
-
-		if (!has_write_lock) {
-get_write_lock:
-			up_read(&mm->mmap_sem);
-			down_write(&mm->mmap_sem);
-			has_write_lock = 1;
-			goto retry;
-		}
-		mapping = vma->vm_file->f_mapping;
-		/*
-		 * page_mkclean doesn't work on nonlinear vmas, so if
-		 * dirty pages need to be accounted, emulate with linear
-		 * vmas.
-		 */
-		if (mapping_cap_account_dirty(mapping)) {
-			unsigned long addr;
-			struct file *file = get_file(vma->vm_file);
-			/* mmap_region may free vma; grab the info now */
-			vm_flags = vma->vm_flags;
-
-			addr = mmap_region(file, start, size, vm_flags, pgoff,
-					   &uf);
-			fput(file);
-			if (IS_ERR_VALUE(addr)) {
-				err = addr;
-			} else {
-				BUG_ON(addr != start);
-				err = 0;
-			}
-			goto out_freed;
-		}
-		mutex_lock(&mapping->i_mmap_mutex);
-		flush_dcache_mmap_lock(mapping);
-		vma->vm_flags |= VM_NONLINEAR;
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
-		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
-	}
-
-	if (vma->vm_flags & VM_LOCKED) {
-		/*
-		 * drop PG_Mlocked flag for over-mapped range
-		 */
-		if (!has_write_lock)
-			goto get_write_lock;
-		vm_flags = vma->vm_flags;
-		munlock_vma_pages_range(vma, start, start + size);
-		vma->vm_flags = vm_flags;
-	}
-
-	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
-	mmu_notifier_invalidate_range_end(mm, start, start + size);
-
-	/*
-	 * We can't clear VM_NONLINEAR because we'd have to do
-	 * it after ->populate completes, and that would prevent
-	 * downgrading the lock.  (Locks can't be upgraded).
-	 */
-
-out:
-	if (vma)
-		vm_flags = vma->vm_flags;
-out_freed:
-	if (likely(!has_write_lock))
-		up_read(&mm->mmap_sem);
-	else
-		up_write(&mm->mmap_sem);
-	userfaultfd_unmap_complete(mm, &uf);
-	if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
-		mm_populate(start, size);
-
-	return err;
-}
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -244,8 +244,10 @@ int __frontswap_store(struct page *page)
 		  the (older) page from frontswap
 		 */
 		inc_frontswap_failed_stores();
-		if (dup)
+		if (dup) {
 			__frontswap_clear(sis, offset);
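+			/* the backend still holds the stale copy, drop it too */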
+			frontswap_ops->invalidate_page(type, offset);
+		}
 	}
 	if (frontswap_writethrough_enabled)
 		/* report failure so swap also writes to swap device */
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -158,7 +158,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 		 */
 		if (likely(!(flags & FOLL_MIGRATION)))
 			goto no_page;
-		if (pte_none(pte) || pte_file(pte))
+		if (pte_none(pte))
 			goto no_page;
 		entry = pte_to_swp_entry(pte);
 		if (!is_migration_entry(entry))
@@ -216,7 +216,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
+			set_page_dirty_mm(page, mm);
 		/*
 		 * pte_mkyoung() would be more correct here, but atomic care
 		 * is needed to avoid losing the dirty bit: it is easier to use
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -59,11 +60,10 @@ static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 /*
- * default collapse hugepages if there is at least one pte mapped like
- * it would have happened if the vma was large enough during page
- * fault.
+ * default collapse hugepages if there is at least 1/4th ptes mapped
+ * to avoid memory footprint growth due to fragmentation
  */
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR*3/4;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
@@ -204,24 +204,29 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static int shrink_huge_zero_page(struct shrinker *shrink,
-		struct shrink_control *sc)
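+/* split into ->count_objects / ->scan_objects for the current shrinker API */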
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+					struct shrink_control *sc)
 {
-	if (!sc->nr_to_scan)
-		/* we can free zero page only if last reference remains */
-		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+	/* we can free zero page only if last reference remains */
+	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
 
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 		struct page *zero_page = xchg(&huge_zero_page, NULL);
 		BUG_ON(zero_page == NULL);
 		__free_pages(zero_page, compound_order(zero_page));
+		return HPAGE_PMD_NR;
 	}
 
 	return 0;
 }
 
 static struct shrinker huge_zero_page_shrinker = {
-	.shrink = shrink_huge_zero_page,
+	.count_objects = shrink_huge_zero_page_count,
+	.scan_objects = shrink_huge_zero_page_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -697,13 +702,14 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					unsigned long address, pmd_t *pmd,
 					struct page *page, unsigned int flags)
 {
+	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	spinlock_t *ptl;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -711,7 +717,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	pgtable = pte_alloc_one(mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		put_page(page);
 		return VM_FAULT_OOM;
 	}
@@ -727,7 +733,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	ptl = pmd_lock(mm, pmd);
 	if (unlikely(!pmd_none(*pmd))) {
 		spin_unlock(ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		put_page(page);
 		pte_free(mm, pgtable);
 	} else {
@@ -738,7 +744,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 			int ret;
 
 			spin_unlock(ptl);
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			put_page(page);
 			pte_free(mm, pgtable);
 			ret = handle_userfault(vma, address, flags,
@@ -750,6 +756,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		init_trans_huge_mmu_gather_count(page);
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -998,6 +1006,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
 {
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pgtable_t pgtable;
 	pmd_t _pmd;
@@ -1012,7 +1021,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
@@ -1041,6 +1050,8 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 			entry = mk_pte(page, vma->vm_page_prot);
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 			page_add_new_anon_rmap(page, vma, haddr);
+			mem_cgroup_commit_charge(page, memcg, false);
+			lru_cache_add_active_or_unevictable(page, vma);
 		} else {
 			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
 			entry = pte_mkspecial(entry);
@@ -1064,7 +1075,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 out_free_page:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 	put_page(page);
 	goto out;
 }
@@ -1076,6 +1087,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct page *page,
 					unsigned long haddr)
 {
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pgtable_t pgtable;
 	pmd_t _pmd = {0};
@@ -1096,20 +1108,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					       __GFP_OTHER_NODE,
 					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
-			     mem_cgroup_newpage_charge(pages[i], mm,
-						       GFP_KERNEL))) {
+			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
+						   &memcg))) {
 			if (pages[i])
 				put_page(pages[i]);
-			mem_cgroup_uncharge_start();
 			while (--i >= 0) {
-				mem_cgroup_uncharge_page(pages[i]);
+				memcg = (void *)page_private(pages[i]);
+				set_page_private(pages[i], 0);
+				mem_cgroup_cancel_charge(pages[i], memcg);
 				put_page(pages[i]);
 			}
-			mem_cgroup_uncharge_end();
 			kfree(pages);
 			ret |= VM_FAULT_OOM;
 			goto out;
 		}
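+		/*
+		 * Stash the memcg in page_private until the charge is
+		 * committed (or cancelled on error).
+		 */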
+		set_page_private(pages[i], (unsigned long)memcg);
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1138,7 +1151,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		pte_t *pte, entry;
 		entry = mk_pte(pages[i], vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
 		page_add_new_anon_rmap(pages[i], vma, haddr);
+		mem_cgroup_commit_charge(pages[i], memcg, false);
+		lru_cache_add_active_or_unevictable(pages[i], vma);
 		pte = pte_offset_map(&_pmd, haddr);
 		VM_BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);
@@ -1162,12 +1179,12 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 out_free_pages:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	mem_cgroup_uncharge_start();
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		mem_cgroup_uncharge_page(pages[i]);
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
+		mem_cgroup_cancel_charge(pages[i], memcg);
 		put_page(pages[i]);
 	}
-	mem_cgroup_uncharge_end();
 	kfree(pages);
 	goto out;
 }
@@ -1178,6 +1195,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	int ret = 0;
 	struct page *page = NULL, *new_page;
+	struct mem_cgroup *memcg;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
@@ -1231,7 +1249,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_try_charge(new_page, mm,
+					   GFP_TRANSHUGE, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -1260,7 +1279,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(ptl);
-		mem_cgroup_uncharge_page(new_page);
+		mem_cgroup_cancel_charge(new_page, memcg);
 		put_page(new_page);
 		goto out_mn;
 	} else {
@@ -1269,6 +1288,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
+		mem_cgroup_commit_charge(new_page, memcg, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
 		if (!page) {
@@ -1291,6 +1312,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * foll_force can write to even unwritable pmd's, but only
+ * after we've gone through a cow cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+					unsigned int flags)
+{
+	return pmd_write(pmd) ||
+		((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+		 page && PageAnon(page));
+}
+
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr,
 				   pmd_t *pmd,
@@ -1301,9 +1334,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
 
-	if (flags & FOLL_WRITE && !pmd_write(*pmd))
-		goto out;
-
 	/* Avoid dumping huge zero page */
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
@@ -1314,19 +1344,18 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, page, flags))
+		return NULL;
+
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+
+		_pmd = pmd_mkyoung(*pmd);
+		if (flags & FOLL_WRITE)
+			_pmd = pmd_mkdirty(_pmd);
 		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd,  1))
+					  pmd, _pmd,  flags & FOLL_WRITE))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
@@ -1822,6 +1851,11 @@ static void __split_huge_page_refcount(struct page *page,
 		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
+		if (page_is_young(page))
+			set_page_young(page_tail);
+		if (page_is_idle(page))
+			set_page_idle(page_tail);
+
 		/*
 		 * __split_huge_page_splitting() already set the
 		 * splitting bit in all pmd that could map this
@@ -2296,7 +2330,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
 		/* If there is no mapped pte young don't collapse the page */
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
@@ -2503,6 +2538,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int isolated;
 	unsigned long hstart, hend;
+	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 
@@ -2513,7 +2549,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!new_page)
 		return;
 
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+	if (unlikely(mem_cgroup_try_charge(new_page, mm,
+					   GFP_TRANSHUGE, &memcg)))
 		return;
 
 	/*
@@ -2601,6 +2638,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	mem_cgroup_commit_charge(new_page, memcg, false);
+	lru_cache_add_active_or_unevictable(new_page, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
@@ -2614,7 +2653,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	return;
 
 out:
-	mem_cgroup_uncharge_page(new_page);
+	mem_cgroup_cancel_charge(new_page, memcg);
 	goto out_up_write;
 }
 
@@ -2668,7 +2707,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		/* cannot use mapcount: can't collapse if there's a gup pin */
 		if (page_count(page) != 1)
 			goto out_unmap;
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
+#include <linux/module.h>
 
 #include <linux/atomic.h>
 #include <asm/pgtable.h>
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -86,6 +86,7 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
+extern bool zone_reclaimable(struct zone *zone);
 
 /*
  * in mm/rmap.c:
@@ -166,8 +167,13 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 #ifdef CONFIG_MMU
 extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, int *nonblocking);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end);
+extern void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int acct);
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__munlock_vma_pages_range(vma, start, end, 1);
+}
 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
 	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
 }
 
-INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
-		     unsigned long, shared.linear.rb_subtree_last,
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+		     unsigned long, shared.rb_subtree_last,
 		     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
 
 /* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
 
 	VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
 
-	if (!prev->shared.linear.rb.rb_right) {
+	if (!prev->shared.rb.rb_right) {
 		parent = prev;
-		link = &prev->shared.linear.rb.rb_right;
+		link = &prev->shared.rb.rb_right;
 	} else {
-		parent = rb_entry(prev->shared.linear.rb.rb_right,
-				  struct vm_area_struct, shared.linear.rb);
-		if (parent->shared.linear.rb_subtree_last < last)
-			parent->shared.linear.rb_subtree_last = last;
-		while (parent->shared.linear.rb.rb_left) {
-			parent = rb_entry(parent->shared.linear.rb.rb_left,
-				struct vm_area_struct, shared.linear.rb);
-			if (parent->shared.linear.rb_subtree_last < last)
-				parent->shared.linear.rb_subtree_last = last;
+		parent = rb_entry(prev->shared.rb.rb_right,
+				  struct vm_area_struct, shared.rb);
+		if (parent->shared.rb_subtree_last < last)
+			parent->shared.rb_subtree_last = last;
+		while (parent->shared.rb.rb_left) {
+			parent = rb_entry(parent->shared.rb.rb_left,
+				struct vm_area_struct, shared.rb);
+			if (parent->shared.rb_subtree_last < last)
+				parent->shared.rb_subtree_last = last;
 		}
-		link = &parent->shared.linear.rb.rb_left;
+		link = &parent->shared.rb.rb_left;
 	}
 
-	node->shared.linear.rb_subtree_last = last;
-	rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
-	rb_insert_augmented(&node->shared.linear.rb, root,
+	node->shared.rb_subtree_last = last;
+	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+	rb_insert_augmented(&node->shared.rb, root,
 			    &vma_interval_tree_augment);
 }
 
--- /dev/null
+++ b/mm/iov-iter.c
@@ -0,0 +1,481 @@
+/*
+ *  mm/iov-iter.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/hardirq.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+static size_t __iovec_copy_to_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
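+	/*
+	 * Copy segment by segment, stopping at the first fault; the bytes
+	 * the faulting segment failed to copy are subtracted at the end.
+	 */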
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_to_user_inatomic(buf, vaddr, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_to_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_to_user_inatomic(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr);
+
+	return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_to_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = copy_to_user(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr);
+
+	return copied;
+}
+
+/*
+ * This has the same sideeffects and return value as
+ * ii_iovec_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static void ii_iovec_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		struct iovec *iov = (struct iovec *)i->data;
+		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overrunning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			int copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				nr_segs--;
+				base = 0;
+			}
+		}
+		i->data = (unsigned long)iov;
+		i->iov_offset = base;
+		i->nr_segs = nr_segs;
+	}
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (i.e. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char __user *buf = iov->iov_base + i->iov_offset;
+	bytes = min(bytes, iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+static size_t ii_iovec_single_seg_count(const struct iov_iter *i)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static int ii_iovec_shorten(struct iov_iter *i, size_t count)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	i->nr_segs = iov_shorten(iov, i->nr_segs, count);
+	return 0;
+}
+
+struct iov_iter_ops ii_iovec_ops = {
+	.ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_iovec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_iovec_copy_from_user,
+	.ii_advance = ii_iovec_advance,
+	.ii_fault_in_readable = ii_iovec_fault_in_readable,
+	.ii_single_seg_count = ii_iovec_single_seg_count,
+	.ii_shorten = ii_iovec_shorten,
+};
+EXPORT_SYMBOL(ii_iovec_ops);
+
+/*
+ * As an easily verifiable first pass, we implement all the methods that
+ * copy data to and from bvec pages with one function.  We implement it
+ * all with kmap_atomic().
+ */
+static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct bio_vec *bvec = (struct bio_vec *)iter->data;
+	size_t bvec_offset = iter->iov_offset;
+	size_t remaining = bytes;
+	void *bvec_map;
+	void *page_map;
+	size_t copy;
+
+	page_map = kmap_atomic(page);
+
+	BUG_ON(bytes > iter->count);
+	while (remaining) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec_offset >= bvec->bv_len);
+		copy = min(remaining, bvec->bv_len - bvec_offset);
+		bvec_map = kmap_atomic(bvec->bv_page);
+		if (topage)
+			memcpy(page_map + page_offset,
+			       bvec_map + bvec->bv_offset + bvec_offset,
+			       copy);
+		else
+			memcpy(bvec_map + bvec->bv_offset + bvec_offset,
+			       page_map + page_offset,
+			       copy);
+		kunmap_atomic(bvec_map);
+		remaining -= copy;
+		bvec_offset += copy;
+		page_offset += copy;
+		if (bvec_offset == bvec->bv_len) {
+			bvec_offset = 0;
+			bvec++;
+		}
+	}
+
+	kunmap_atomic(page_map);
+
+	return bytes;
+}
+
+size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+/*
+ * bio_vecs have a stricter structure than iovecs that might have
+ * come from userspace.  There are no zero length bio_vec elements.
+ */
+void ii_bvec_advance(struct iov_iter *i, size_t bytes)
+{
+	struct bio_vec *bvec = (struct bio_vec *)i->data;
+	size_t offset = i->iov_offset;
+	size_t delta;
+
+	BUG_ON(i->count < bytes);
+	while (bytes) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec->bv_len <= offset);
+		delta = min(bytes, bvec->bv_len - offset);
+		offset += delta;
+		i->count -= delta;
+		bytes -= delta;
+		if (offset == bvec->bv_len) {
+			bvec++;
+			offset = 0;
+		}
+	}
+
+	i->data = (unsigned long)bvec;
+	i->iov_offset = offset;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_bvec_single_seg_count(const struct iov_iter *i)
+{
+	const struct bio_vec *bvec = (struct bio_vec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, bvec->bv_len - i->iov_offset);
+}
+
+static int ii_bvec_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_bvec_ops = {
+	.ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_bvec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_bvec_copy_from_user,
+	.ii_advance = ii_bvec_advance,
+	.ii_fault_in_readable = ii_bvec_fault_in_readable,
+	.ii_single_seg_count = ii_bvec_single_seg_count,
+	.ii_shorten = ii_bvec_shorten,
+};
+EXPORT_SYMBOL(ii_bvec_ops);
+
+/* Functions to get on with single page */
+
+static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct page *ipage = (struct page *)iter->data;
+	size_t ipage_offset = iter->iov_offset;
+	void *ipage_map;
+	void *page_map;
+
+	BUG_ON(bytes > iter->count);
+	BUG_ON(bytes > PAGE_SIZE - ipage_offset);
+	BUG_ON(ipage_offset >= PAGE_SIZE);
+
+	page_map = kmap_atomic(page);
+	ipage_map = kmap_atomic(ipage);
+
+	if (topage)
+		memcpy(page_map + page_offset,
+		       ipage_map + ipage_offset,
+		       bytes);
+	else
+		memcpy(ipage_map + ipage_offset,
+		       page_map + page_offset,
+		       bytes);
+
+	kunmap_atomic(ipage_map);
+	kunmap_atomic(page_map);
+
+	return bytes;
+}
+
+size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+void ii_page_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+	BUG_ON(i->iov_offset >= PAGE_SIZE);
+	BUG_ON(bytes > PAGE_SIZE - i->iov_offset);
+
+	i->iov_offset += bytes;
+	i->count      -= bytes;
+}
+
+/*
+ * The single page behind the iter is always pinned.
+ */
+int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_page_single_seg_count(const struct iov_iter *i)
+{
+	BUG_ON(i->nr_segs != 1);
+
+	return i->count;
+}
+
+static int ii_page_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_page_ops = {
+	.ii_copy_to_user_atomic = ii_page_copy_to_user_atomic,
+	.ii_copy_to_user = ii_page_copy_to_user,
+	.ii_copy_from_user_atomic = ii_page_copy_from_user_atomic,
+	.ii_copy_from_user = ii_page_copy_from_user,
+	.ii_advance = ii_page_advance,
+	.ii_fault_in_readable = ii_page_fault_in_readable,
+	.ii_single_seg_count = ii_page_single_seg_count,
+	.ii_shorten = ii_page_shorten,
+};
+EXPORT_SYMBOL(ii_page_ops);
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,10 @@
+KASAN_SANITIZE := n
+
+KCOV_INSTRUMENT := n
+
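+# Drop -pg so the shadow-check code is not itself ftrace-instrumented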
+CFLAGS_REMOVE_kasan.o = -pg
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-y := kasan.o report.o quarantine.o
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,730 @@
+/*
+ * This file contains shadow memory manipulation code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of the code is borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+	void *shadow_start, *shadow_end;
+
+	shadow_start = kasan_mem_to_shadow(address);
+	shadow_end = kasan_mem_to_shadow(address + size);
+
+	memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+	kasan_poison_shadow(address, size, 0);
+
+	if (size & KASAN_SHADOW_MASK) {
+		u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+		*shadow = size & KASAN_SHADOW_MASK;
+	}
+}
+
+
+/*
+ * All functions below are always inlined so the compiler can
+ * perform better optimizations in each of __asan_loadX/__asan_storeX
+ * depending on memory access size X.
+ */
+
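+/*
+ * Shadow byte encoding: 0 means the whole 8-byte granule is accessible,
+ * 1..7 means only that many leading bytes are, and a negative value
+ * marks the granule as fully poisoned.
+ */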
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+	s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(shadow_value)) {
+		s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+		return unlikely(last_accessible_byte >= shadow_value);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
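+	/*
+	 * The access may straddle two shadow granules; the u16 load checks
+	 * both shadow bytes at once before the slower per-byte checks.
+	 */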
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 1))
+			return true;
+
+		if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_4(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 3))
+			return true;
+
+		if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_8(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 7))
+			return true;
+
+		if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+	u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		u16 shadow_first_bytes = *(u16 *)shadow_addr;
+		s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
+
+		if (unlikely(shadow_first_bytes))
+			return true;
+
+		if (likely(!last_byte))
+			return false;
+
+		return memory_is_poisoned_1(addr + 15);
+	}
+
+	return false;
+}
+
+static __always_inline unsigned long bytes_is_zero(const u8 *start,
+					size_t size)
+{
+	while (size) {
+		if (unlikely(*start))
+			return (unsigned long)start;
+		start++;
+		size--;
+	}
+
+	return 0;
+}
+
+static __always_inline unsigned long memory_is_zero(const void *start,
+						const void *end)
+{
+	unsigned int words;
+	unsigned long ret;
+	unsigned int prefix = (unsigned long)start % 8;
+
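+	/* Check the unaligned head byte-wise, then whole words, then the tail. */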
+	if (end - start <= 16)
+		return bytes_is_zero(start, end - start);
+
+	if (prefix) {
+		prefix = 8 - prefix;
+		ret = bytes_is_zero(start, prefix);
+		if (unlikely(ret))
+			return ret;
+		start += prefix;
+	}
+
+	words = (end - start) / 8;
+	while (words) {
+		if (unlikely(*(u64 *)start))
+			return bytes_is_zero(start, 8);
+		start += 8;
+		words--;
+	}
+
+	return bytes_is_zero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+						size_t size)
+{
+	unsigned long ret;
+
+	ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+			kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
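+	/*
+	 * A non-zero return is the address of the first dirty shadow byte.
+	 * Only the very last shadow byte may legitimately be non-zero, and
+	 * only if it still covers the end of the access.
+	 */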
+	if (unlikely(ret)) {
+		unsigned long last_byte = addr + size - 1;
+		s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+		if (unlikely(ret != (unsigned long)last_shadow ||
+			((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+			return true;
+	}
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+	if (__builtin_constant_p(size)) {
+		switch (size) {
+		case 1:
+			return memory_is_poisoned_1(addr);
+		case 2:
+			return memory_is_poisoned_2(addr);
+		case 4:
+			return memory_is_poisoned_4(addr);
+		case 8:
+			return memory_is_poisoned_8(addr);
+		case 16:
+			return memory_is_poisoned_16(addr);
+		default:
+			BUILD_BUG();
+		}
+	}
+
+	return memory_is_poisoned_n(addr, size);
+}
+
+static __always_inline void check_memory_region_inline(unsigned long addr,
+						size_t size, bool write,
+						unsigned long ret_ip)
+{
+	if (unlikely(size == 0))
+		return;
+
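+	/*
+	 * Addresses below the shadow-covered range (NULL, user or otherwise
+	 * wild pointers) have no shadow and are reported right away.
+	 */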
+	if (unlikely((void *)addr <
+		kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+		kasan_report(addr, size, write, ret_ip);
+		return;
+	}
+
+	if (likely(!memory_is_poisoned(addr, size)))
+		return;
+
+	kasan_report(addr, size, write, ret_ip);
+}
+
+static void check_memory_region(unsigned long addr,
+				size_t size, bool write,
+				unsigned long ret_ip)
+{
+	check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_check_read(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+	check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+
+	return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+	return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+	return __memcpy(dest, src, len);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_poison_shadow(page_address(page),
+				PAGE_SIZE << order,
+				KASAN_FREE_PAGE);
+}
+
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations, larger redzones are used.
+ */
+static size_t optimal_redzone(size_t object_size)
+{
+	int rz =
+		object_size <= 64        - 16   ? 16 :
+		object_size <= 128       - 32   ? 32 :
+		object_size <= 512       - 64   ? 64 :
+		object_size <= 4096      - 128  ? 128 :
+		object_size <= (1 << 14) - 256  ? 256 :
+		object_size <= (1 << 15) - 512  ? 512 :
+		object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+	return rz;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+			unsigned long *flags)
+{
+	int redzone_adjust;
+	int orig_size = *size;
+
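+	/* Alloc/free metadata is appended after the object, inside the redzone. */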
+	/* Add alloc meta. */
+	cache->kasan_info.alloc_meta_offset = *size;
+	*size += sizeof(struct kasan_alloc_meta);
+
+	/* Add free meta. */
+	if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
+	    cache->object_size < sizeof(struct kasan_free_meta)) {
+		cache->kasan_info.free_meta_offset = *size;
+		*size += sizeof(struct kasan_free_meta);
+	}
+	redzone_adjust = optimal_redzone(cache->object_size) -
+		(*size - cache->object_size);
+
+	if (redzone_adjust > 0)
+		*size += redzone_adjust;
+
+	*size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
+					optimal_redzone(cache->object_size)));
+
+	/*
+	 * If the metadata doesn't fit, don't enable KASAN at all.
+	 */
+	if (*size <= cache->kasan_info.alloc_meta_offset ||
+			*size <= cache->kasan_info.free_meta_offset) {
+		cache->kasan_info.alloc_meta_offset = 0;
+		cache->kasan_info.free_meta_offset = 0;
+		*size = orig_size;
+		return;
+	}
+
+	*flags |= SLAB_KASAN;
+}
+
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
+void kasan_cache_shutdown(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+	return (cache->kasan_info.alloc_meta_offset ?
+		sizeof(struct kasan_alloc_meta) : 0) +
+		(cache->kasan_info.free_meta_offset ?
+		sizeof(struct kasan_free_meta) : 0);
+}
+
+void kasan_poison_slab(struct page *page)
+{
+	kasan_poison_shadow(page_address(page),
+			PAGE_SIZE << compound_order(page),
+			KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_poison_shadow(object,
+			round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+			KASAN_KMALLOC_REDZONE);
+}
+
+static inline int in_irqentry_text(unsigned long ptr)
+{
+	return (ptr >= (unsigned long)&__irqentry_text_start &&
+		ptr < (unsigned long)&__irqentry_text_end) ||
+		(ptr >= (unsigned long)&__softirqentry_text_start &&
+		 ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline void filter_irq_stacks(struct stack_trace *trace)
+{
+	int i;
+
+	if (!trace->nr_entries)
+		return;
+	for (i = 0; i < trace->nr_entries; i++)
+		if (in_irqentry_text(trace->entries[i])) {
+			/* Include the irqentry function in the stack trace. */
+			trace->nr_entries = i + 1;
+			break;
+		}
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+	unsigned long entries[KASAN_STACK_DEPTH];
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.entries = entries,
+		.max_entries = KASAN_STACK_DEPTH,
+		.skip = 0
+	};
+
+	save_stack_trace(&trace);
+	filter_irq_stacks(&trace);
+	if (trace.nr_entries != 0 &&
+	    trace.entries[trace.nr_entries-1] == ULONG_MAX)
+		trace.nr_entries--;
+
+	return depot_save_stack(&trace, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+	track->pid = current->pid;
+	track->stack = save_stack(flags);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+					const void *object)
+{
+	BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
+	return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+				      const void *object)
+{
+	BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+	return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+void kasan_init_slab_obj(struct kmem_cache *cache, const void *object)
+{
+	struct kasan_alloc_meta *alloc_info;
+
+	if (!(cache->flags & SLAB_KASAN))
+		return;
+
+	alloc_info = get_alloc_info(cache, object);
+	__memset(alloc_info, 0, sizeof(*alloc_info));
+}
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+	kasan_kmalloc(cache, object, cache->object_size, flags);
+}
+
+static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
+{
+	unsigned long size = cache->object_size;
+	unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return;
+
+	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object)
+{
+	s8 shadow_byte;
+
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return false;
+
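+	/*
+	 * A live object's first shadow byte is in [0, KASAN_SHADOW_SCALE_SIZE);
+	 * a poison value here means a double-free or an invalid free.
+	 */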
+	shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
+	if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
+		kasan_report_double_free(cache, object, shadow_byte);
+		return true;
+	}
+
+	kasan_poison_slab_free(cache, object);
+
+	if (unlikely(!(cache->flags & SLAB_KASAN)))
+		return false;
+
+	set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
+	quarantine_put(get_free_info(cache, object), cache);
+	return true;
+}
+
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
+		   gfp_t flags)
+{
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (flags & __GFP_WAIT)
+		quarantine_reduce();
+
+	if (unlikely(object == NULL))
+		return;
+
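+	/* Unpoison the requested size; the rest of the object becomes redzone. */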
+	redzone_start = round_up((unsigned long)(object + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = round_up((unsigned long)object + cache->object_size,
+				KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(object, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_KMALLOC_REDZONE);
+
+	if (cache->flags & SLAB_KASAN)
+		set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+{
+	struct page *page;
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (flags & __GFP_WAIT)
+		quarantine_reduce();
+
+	if (unlikely(ptr == NULL))
+		return;
+
+	page = virt_to_page(ptr);
+	redzone_start = round_up((unsigned long)(ptr + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+
+	kasan_unpoison_shadow(ptr, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_PAGE_REDZONE);
+}
+
+void kasan_krealloc(const void *object, size_t size, gfp_t flags)
+{
+	struct page *page;
+
+	if (unlikely(object == ZERO_SIZE_PTR))
+		return;
+
+	page = virt_to_head_page(object);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_kmalloc_large(object, size, flags);
+	else
+		kasan_kmalloc(page->slab_cache, object, size, flags);
+}
+
+void kasan_poison_kfree(void *ptr)
+{
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+				KASAN_FREE_PAGE);
+	else
+		kasan_poison_slab_free(page->slab_cache, ptr);
+}
+
+void kasan_kfree_large(const void *ptr)
+{
+	struct page *page = virt_to_page(ptr);
+
+	kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+			KASAN_FREE_PAGE);
+}
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+	void *ret;
+	size_t shadow_size;
+	unsigned long shadow_start;
+
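+	/* One shadow byte covers KASAN_SHADOW_SCALE_SIZE bytes of module memory. */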
+	shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+	shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
+			PAGE_SIZE);
+
+	if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+		return -EINVAL;
+
+	ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+			shadow_start + shadow_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+			__builtin_return_address(0));
+
+	if (ret) {
+		find_vm_area(addr)->flags |= VM_KASAN;
+		kmemleak_ignore(ret);
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+	if (vm->flags & VM_KASAN)
+		vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+static void register_global(struct kasan_global *global)
+{
+	size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(global->beg, global->size);
+
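+	/* Poison the compiler-inserted redzone that follows the global itself. */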
+	kasan_poison_shadow(global->beg + aligned_size,
+		global->size_with_redzone - aligned_size,
+		KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size)					\
+	void __asan_load##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, false, _RET_IP_);\
+	}								\
+	EXPORT_SYMBOL(__asan_load##size);				\
+	__alias(__asan_load##size)					\
+	void __asan_load##size##_noabort(unsigned long);		\
+	EXPORT_SYMBOL(__asan_load##size##_noabort);			\
+	void __asan_store##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, true, _RET_IP_);	\
+	}								\
+	EXPORT_SYMBOL(__asan_store##size);				\
+	__alias(__asan_store##size)					\
+	void __asan_store##size##_noabort(unsigned long);		\
+	EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int kasan_mem_notifier(struct notifier_block *nb,
+			unsigned long action, void *data)
+{
+	return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+	pr_err("WARNING: KASan doesn't support memory hot-add\n");
+	pr_err("Memory hot-add will be disabled\n");
+
+	hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+	return 0;
+}
+
+module_init(kasan_memhotplug_init);
+#endif
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,120 @@
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+#include <linux/stackdepot.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK       (KASAN_SHADOW_SCALE_SIZE - 1)
+
+#define KASAN_FREE_PAGE         0xFF  /* page was freed */
+#define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_GLOBAL_REDZONE    0xFA  /* redzone for global variable */
+
+/*
+ * Stack redzone shadow values
+ * (Those are compiler's ABI, don't change them)
+ */
+#define KASAN_STACK_LEFT        0xF1
+#define KASAN_STACK_MID         0xF2
+#define KASAN_STACK_RIGHT       0xF3
+#define KASAN_STACK_PARTIAL     0xF4
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+
+struct kasan_access_info {
+	const void *access_addr;
+	const void *first_bad_addr;
+	size_t access_size;
+	bool is_write;
+	unsigned long ip;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_source_location {
+	const char *filename;
+	int line_no;
+	int column_no;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_global {
+	const void *beg;		/* Address of the beginning of the global variable. */
+	size_t size;			/* Size of the global variable. */
+	size_t size_with_redzone;	/* Size of the variable + size of the red zone. 32 bytes aligned */
+	const void *name;
+	const void *module_name;	/* Name of the module where the global variable is declared. */
+	unsigned long has_dynamic_init;	/* This is needed for C++. */
+#if KASAN_ABI_VERSION >= 4
+	struct kasan_source_location *location;
+#endif
+#if KASAN_ABI_VERSION >= 5
+	char *odr_indicator;
+#endif
+};
+
+/*
+ * Structures to keep alloc and free tracks.
+ */
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+	u32 pid;
+	depot_stack_handle_t stack;
+};
+
+struct kasan_alloc_meta {
+	struct kasan_track alloc_track;
+	struct kasan_track free_track;
+};
+
+struct qlist_node {
+	struct qlist_node *next;
+};
+struct kasan_free_meta {
+	/* This field is used while the object is in the quarantine.
+	 * Otherwise it might be used for the allocator freelist.
+	 */
+	struct qlist_node quarantine_link;
+};
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+					const void *object);
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+					const void *object);
+
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+		<< KASAN_SHADOW_SCALE_SHIFT);
+}
+
+static inline bool kasan_enabled(void)
+{
+	return !current->kasan_depth;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip);
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+			s8 shadow);
+
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+				struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
+#endif
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,321 @@
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a singly-linked list, which also stores the total size of
+ * objects inside of it.
+ */
+struct qlist_head {
+	struct qlist_node *head;
+	struct qlist_node *tail;
+	size_t bytes;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+	return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+	q->head = q->tail = NULL;
+	q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+		size_t size)
+{
+	if (unlikely(qlist_empty(q)))
+		q->head = qlink;
+	else
+		q->tail->next = qlink;
+	q->tail = qlink;
+	qlink->next = NULL;
+	q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	if (qlist_empty(to)) {
+		*to = *from;
+		qlist_init(from);
+		return;
+	}
+
+	to->tail->next = from->head;
+	to->tail = from->tail;
+	to->bytes += from->bytes;
+
+	qlist_init(from);
+}
+
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+	(1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
+static DEFINE_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * The quarantine doesn't support the memory shrinker with the SLAB allocator,
+ * so we keep the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+	return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	struct kasan_free_meta *free_info =
+		container_of(qlink, struct kasan_free_meta,
+			     quarantine_link);
+
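+	/*
+	 * The free meta lives at free_meta_offset inside the object,
+	 * so step back to the start of the object.
+	 */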
+	return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	void *object = qlink_to_object(qlink, cache);
+	unsigned long flags;
+
+	if (IS_ENABLED(CONFIG_SLAB))
+		local_irq_save(flags);
+
+	___cache_free(cache, object, _THIS_IP_);
+
+	if (IS_ENABLED(CONFIG_SLAB))
+		local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+	struct qlist_node *qlink;
+
+	if (unlikely(qlist_empty(q)))
+		return;
+
+	qlink = q->head;
+	while (qlink) {
+		struct kmem_cache *obj_cache =
+			cache ? cache :	qlink_to_cache(qlink);
+		struct qlist_node *next = qlink->next;
+
+		qlink_free(qlink, obj_cache);
+		qlink = next;
+	}
+	qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+	unsigned long flags;
+	struct qlist_head *q;
+	struct qlist_head temp = QLIST_INIT;
+
+	/*
+	 * Note: irq must be disabled until after we move the batch to the
+	 * global quarantine. Otherwise quarantine_remove_cache() can miss
+	 * some objects belonging to the cache if they are in our local temp
+	 * list. quarantine_remove_cache() executes on_each_cpu() at the
+	 * beginning which ensures that it either sees the objects in per-cpu
+	 * lists or in the global quarantine.
+	 */
+	local_irq_save(flags);
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_put(q, &info->quarantine_link, cache->size);
+	if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
+		qlist_move_all(q, &temp);
+
+		spin_lock(&quarantine_lock);
+		WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+		qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+		if (global_quarantine[quarantine_tail].bytes >=
+				READ_ONCE(quarantine_batch_size)) {
+			int new_tail;
+
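+			/*
+			 * The tail batch is full: advance it, unless the ring
+			 * would catch up with the head (all batches in use).
+			 */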
+			new_tail = quarantine_tail + 1;
+			if (new_tail == QUARANTINE_BATCHES)
+				new_tail = 0;
+			if (new_tail != quarantine_head)
+				quarantine_tail = new_tail;
+		}
+		spin_unlock(&quarantine_lock);
+	}
+
+	local_irq_restore(flags);
+}
+
+void quarantine_reduce(void)
+{
+	size_t total_size, new_quarantine_size, percpu_quarantines;
+	unsigned long flags;
+	int srcu_idx;
+	struct qlist_head to_free = QLIST_INIT;
+
+	if (likely(READ_ONCE(quarantine_size) <=
+		   READ_ONCE(quarantine_max_size)))
+		return;
+
+	/*
+	 * srcu critical section ensures that quarantine_remove_cache()
+	 * will not miss objects belonging to the cache while they are in our
+	 * local to_free list. srcu is chosen because (1) it gives us private
+	 * grace period domain that does not interfere with anything else,
+	 * and (2) it allows synchronize_srcu() to return without waiting
+	 * if there are no pending read critical sections (which is the
+	 * expected case).
+	 */
+	srcu_idx = srcu_read_lock(&remove_cache_srcu);
+	spin_lock_irqsave(&quarantine_lock, flags);
+
+	/*
+	 * Update quarantine size in case of hotplug. Allocate a fraction of
+	 * the installed memory to quarantine minus per-cpu queue limits.
+	 */
+	total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+		QUARANTINE_FRACTION;
+	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
+	new_quarantine_size = (total_size < percpu_quarantines) ?
+		0 : total_size - percpu_quarantines;
+	WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+	/* Aim at consuming at most 1/2 of slots in quarantine. */
+	WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+		2 * total_size / QUARANTINE_BATCHES));
+
+	if (likely(quarantine_size > quarantine_max_size)) {
+		qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+		WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+		quarantine_head++;
+		if (quarantine_head == QUARANTINE_BATCHES)
+			quarantine_head = 0;
+	}
+
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, NULL);
+	srcu_read_unlock(&remove_cache_srcu, srcu_idx);
+}
+
+static void qlist_move_cache(struct qlist_head *from,
+				   struct qlist_head *to,
+				   struct kmem_cache *cache)
+{
+	struct qlist_node *curr;
+
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	curr = from->head;
+	qlist_init(from);
+	while (curr) {
+		struct qlist_node *next = curr->next;
+		struct kmem_cache *obj_cache = qlink_to_cache(curr);
+
+		if (obj_cache == cache)
+			qlist_put(to, curr, obj_cache->size);
+		else
+			qlist_put(from, curr, obj_cache->size);
+
+		curr = next;
+	}
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+	struct kmem_cache *cache = arg;
+	struct qlist_head to_free = QLIST_INIT;
+	struct qlist_head *q;
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_move_cache(q, &to_free, cache);
+	qlist_free_all(&to_free, cache);
+}
+
+/* Free all quarantined objects belonging to cache. */
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+	unsigned long flags, i;
+	struct qlist_head to_free = QLIST_INIT;
+
+	/*
+	 * Must be careful to not miss any objects that are being moved from
+	 * per-cpu list to the global quarantine in quarantine_put(),
+	 * nor objects being freed in quarantine_reduce(). on_each_cpu()
+	 * achieves the first goal, while synchronize_srcu() achieves the
+	 * second.
+	 */
+	on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+	spin_lock_irqsave(&quarantine_lock, flags);
+	for (i = 0; i < QUARANTINE_BATCHES; i++)
+		qlist_move_cache(&global_quarantine[i], &to_free, cache);
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, cache);
+
+	synchronize_srcu(&remove_cache_srcu);
+}
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,336 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
+static const void *find_first_bad_addr(const void *addr, size_t size)
+{
+	u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
+	const void *first_bad_addr = addr;
+
+	while (!shadow_val && first_bad_addr < addr + size) {
+		first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
+		shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
+	}
+	return first_bad_addr;
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *bug_type = "unknown crash";
+	u8 shadow_val;
+
+	info->first_bad_addr = find_first_bad_addr(info->access_addr,
+						info->access_size);
+
+	shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+	switch (shadow_val) {
+	case KASAN_FREE_PAGE:
+	case KASAN_KMALLOC_FREE:
+		bug_type = "use after free";
+		break;
+	case KASAN_PAGE_REDZONE:
+	case KASAN_KMALLOC_REDZONE:
+	case KASAN_GLOBAL_REDZONE:
+	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+		bug_type = "out of bounds access";
+		break;
+	case KASAN_STACK_LEFT:
+	case KASAN_STACK_MID:
+	case KASAN_STACK_RIGHT:
+	case KASAN_STACK_PARTIAL:
+		bug_type = "out of bounds on stack";
+		break;
+	}
+
+	pr_err("BUG: KASan: %s in %pS at addr %p\n",
+		bug_type, (void *)info->ip,
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		info->is_write ? "Write" : "Read",
+		info->access_size, current->comm, task_pid_nr(current));
+}
+
+static inline bool kernel_or_module_addr(const void *addr)
+{
+	return (addr >= (void *)_stext && addr < (void *)_end)
+		|| (addr >= (void *)MODULES_VADDR
+			&& addr < (void *)MODULES_END);
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+	return addr >= (void *)&init_thread_union.stack &&
+		(addr <= (void *)&init_thread_union.stack +
+			sizeof(init_thread_union.stack));
+}
+
+static DEFINE_SPINLOCK(report_lock);
+
+static void kasan_start_report(unsigned long *flags)
+{
+	/*
+	 * Make sure we don't end up in a loop while printing the report.
+	 */
+	kasan_disable_current();
+	spin_lock_irqsave(&report_lock, *flags);
+	pr_err("==================================================================\n");
+}
+
+static void kasan_end_report(unsigned long *flags)
+{
+	pr_err("==================================================================\n");
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+	spin_unlock_irqrestore(&report_lock, *flags);
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
+	kasan_enable_current();
+}
+
+static void print_track(struct kasan_track *track)
+{
+	pr_err("PID = %u\n", track->pid);
+	if (track->stack) {
+		struct stack_trace trace;
+
+		depot_fetch_stack(track->stack, &trace);
+		print_stack_trace(&trace, 0);
+	} else {
+		pr_err("(stack is not available)\n");
+	}
+}
+
+static void kasan_object_err(struct kmem_cache *cache, void *object)
+{
+	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+
+	dump_stack();
+	pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
+		cache->object_size);
+
+	if (!(cache->flags & SLAB_KASAN))
+		return;
+
+	pr_err("Allocated:\n");
+	print_track(&alloc_info->alloc_track);
+	pr_err("Freed:\n");
+	print_track(&alloc_info->free_track);
+}
+
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+			s8 shadow)
+{
+	unsigned long flags;
+
+	kasan_start_report(&flags);
+	pr_err("BUG: Double free or freeing an invalid pointer\n");
+	pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
+	kasan_object_err(cache, object);
+	kasan_end_report(&flags);
+}
+
+static void print_address_description(struct kasan_access_info *info)
+{
+	const void *addr = info->access_addr;
+
+	if ((addr >= (void *)PAGE_OFFSET) &&
+		(addr < high_memory)) {
+		struct page *page = virt_to_head_page(addr);
+
+		if (PageSlab(page)) {
+			void *object;
+			struct kmem_cache *cache = page->slab_cache;
+			object = nearest_obj(cache, page,
+						(void *)info->access_addr);
+			kasan_object_err(cache, object);
+			return;
+		}
+		dump_page(page, "kasan: bad access detected");
+	}
+
+	if (kernel_or_module_addr(addr)) {
+		if (!init_task_stack_addr(addr))
+			pr_err("Address belongs to variable %pS\n", addr);
+	}
+	dump_stack();
+}
+
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+	return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+}
+
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+	/* The length of ">ff00ff00ff00ff00: " is
+	 *    3 + (BITS_PER_LONG/8)*2 chars.
+	 */
+	return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
+		(shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+static void print_shadow_for_address(const void *addr)
+{
+	int i;
+	const void *shadow = kasan_mem_to_shadow(addr);
+	const void *shadow_row;
+
+	shadow_row = (void *)round_down((unsigned long)shadow,
+					SHADOW_BYTES_PER_ROW)
+		- SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+	pr_err("Memory state around the buggy address:\n");
+
+	for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
+		const void *kaddr = kasan_shadow_to_mem(shadow_row);
+		char buffer[4 + (BITS_PER_LONG/8)*2];
+
+		snprintf(buffer, sizeof(buffer),
+			(i == 0) ? ">%p: " : " %p: ", kaddr);
+
+		kasan_disable_current();
+		print_hex_dump(KERN_ERR, buffer,
+			DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+			shadow_row, SHADOW_BYTES_PER_ROW, 0);
+		kasan_enable_current();
+
+		if (row_is_guilty(shadow_row, shadow))
+			pr_err("%*c\n",
+				shadow_pointer_offset(shadow_row, shadow),
+				'^');
+
+		shadow_row += SHADOW_BYTES_PER_ROW;
+	}
+}
+
+static void kasan_report_error(struct kasan_access_info *info)
+{
+	unsigned long flags;
+	const char *bug_type;
+
+	kasan_start_report(&flags);
+
+	if (info->access_addr <
+			kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
+		if ((unsigned long)info->access_addr < PAGE_SIZE)
+			bug_type = "null-ptr-deref";
+		else if ((unsigned long)info->access_addr < TASK_SIZE)
+			bug_type = "user-memory-access";
+		else
+			bug_type = "wild-memory-access";
+		pr_err("BUG: KASan: %s on address %p\n",
+			bug_type, info->access_addr);
+		pr_err("%s of size %zu by task %s/%d\n",
+			info->is_write ? "Write" : "Read",
+			info->access_size, current->comm,
+			task_pid_nr(current));
+		dump_stack();
+	} else {
+		print_error_description(info);
+		print_address_description(info);
+		print_shadow_for_address(info->first_bad_addr);
+	}
+
+	kasan_end_report(&flags);
+}
+
+static bool print_till_death;
+static int __init kasan_setup(char *arg)
+{
+	print_till_death = true;
+	return 1;	/* __setup() handlers return 1 when the option was handled */
+}
+__setup("kasan_print_till_death", kasan_setup);
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip)
+{
+	struct kasan_access_info info;
+	static bool reported = false;
+
+	if (likely(!kasan_enabled()))
+		return;
+
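+	/* Unless kasan_print_till_death is set, only the first report is printed. */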
+	if (likely(!print_till_death)) {
+		if (reported)
+			return;
+		reported = true;
+	}
+	info.access_addr = (void *)addr;
+	info.access_size = size;
+	info.is_write = is_write;
+	info.ip = ip;
+
+	kasan_report_error(&info);
+}
+
+
+#define DEFINE_ASAN_REPORT_LOAD(size)                     \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{                                                         \
+	kasan_report(addr, size, false, _RET_IP_);	  \
+}                                                         \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size)                     \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{                                                          \
+	kasan_report(addr, size, true, _RET_IP_);	   \
+}                                                          \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
 #include <linux/mm_types.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include "slab.h"
 #include <linux/kmemcheck.h>
 
 void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -98,6 +98,7 @@
 #include <asm/processor.h>
 #include <linux/atomic.h>
 
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/memory_hotplug.h>
@@ -298,6 +299,8 @@ static void hex_dump_object(struct seq_file *seq,
 		min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
 
 	seq_printf(seq, "  hex dump (first %d bytes):\n", len);
+	kasan_disable_current();
+
 	for (i = 0; i < len; i += HEX_ROW_SIZE) {
 		int linelen = min(remaining, HEX_ROW_SIZE);
 
@@ -307,6 +310,7 @@ static void hex_dump_object(struct seq_file *seq,
 				   HEX_ASCII);
 		seq_printf(seq, "    %s\n", linebuf);
 	}
+	kasan_enable_current();
 }
 
 /*
@@ -1077,7 +1081,10 @@ static bool update_checksum(struct kmemleak_object *object)
 	if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
 		return false;
 
+	kasan_disable_current();
 	object->checksum = crc32(0, (void *)object->pointer, object->size);
+	kasan_enable_current();
+
 	return object->checksum != old_csum;
 }
 
@@ -1128,7 +1135,9 @@ static void scan_block(void *_start, void *_end,
 						  BYTES_PER_POINTER))
 			continue;
 
+		kasan_disable_current();
 		pointer = *ptr;
+		kasan_enable_current();
 
 		object = find_and_get_object(pointer, 1);
 		if (!object)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2346,7 +2346,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		 */
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+				 VM_HUGETLB | VM_MIXEDMAP))
 			return 0;		/* just ignore the advice */
 
 #ifdef VM_SAO
@@ -2477,6 +2477,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 		return page;		/* let do_swap_page report the error */
 
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
 
@@ -2601,8 +2602,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 }
 
 #ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
-		  struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct stable_node *stable_node;
 	struct rmap_item *rmap_item;
@@ -2637,11 +2637,19 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
 			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
 				continue;
 
-			ret = rmap_one(page, vma, rmap_item->address, arg);
+			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+				continue;
+
+			ret = rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg);
 			if (ret != SWAP_AGAIN) {
 				anon_vma_unlock_read(anon_vma);
 				goto out;
 			}
+			if (rwc->done && rwc->done(page)) {
+				anon_vma_unlock_read(anon_vma);
+				goto out;
+			}
 		}
 		anon_vma_unlock_read(anon_vma);
 	}
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,108 @@
 #include <linux/mm.h>
 #include <linux/list_lru.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+static void list_lru_register(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_add(&lru->list, &list_lrus);
+	mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_del(&lru->list);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	struct list_lru_memcg *memcg_lrus;
+	/* We only check that the pointer is not NULL, so the RCU lock isn't needed */
+	memcg_lrus = rcu_dereference_check(lru->node[0].memcg_lrus, true);
+	return !!memcg_lrus;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	struct list_lru_memcg *memcg_lrus;
+	/*
+	 * Either lock and RCU protects the array of per cgroup lists
+	 * Either the lock or RCU protects the array of per-cgroup lists
+	 */
+	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+					   lockdep_is_held(&nlru->lock));
+	if (memcg_lrus && idx >= 0)
+		return memcg_lrus->lru[idx];
+
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	struct list_lru_memcg *memcg_lrus;
+	struct mem_cgroup *memcg;
+
+	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+					   lockdep_is_held(&nlru->lock));
+	if (!memcg_lrus)
+		return &nlru->lru;
+
+	memcg = mem_cgroup_from_kmem(ptr);
+	if (!memcg)
+		return &nlru->lru;
+
+	return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 bool list_lru_add(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
+	l = list_lru_from_kmem(nlru, item);
 	if (list_empty(item)) {
-		list_add_tail(item, &nlru->list);
-		if (nlru->nr_items++ == 0)
-			node_set(nid, lru->active_nodes);
+		list_add_tail(item, &l->list);
+		l->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -33,13 +123,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_kmem(nlru, item);
 	if (!list_empty(item)) {
 		list_del_init(item);
-		if (--nlru->nr_items == 0)
-			node_clear(nid, lru->active_nodes);
-		WARN_ON_ONCE(nlru->nr_items < 0);
+		l->nr_items--;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -48,33 +138,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+	list_del_init(item);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head)
+{
+	list_move(item, head);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+					  int nid, int memcg_idx)
 {
-	unsigned long count = 0;
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
+	unsigned long count;
 
-	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
-	count += nlru->nr_items;
-	spin_unlock(&nlru->lock);
+	rcu_read_lock();
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
+	count = l->nr_items;
+	rcu_read_unlock();
+
+	return count;
+}
 
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg)
+{
+	return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+	long count = 0;
+	int memcg_idx;
+
+	count += __list_lru_count_one(lru, nid, -1);
+	if (list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx)
+			count += __list_lru_count_one(lru, nid, memcg_idx);
+	}
 	return count;
 }
 EXPORT_SYMBOL_GPL(list_lru_count_node);
 
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
-		   void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+		    list_lru_walk_cb isolate, void *cb_arg,
+		    unsigned long *nr_to_walk)
 {
 
-	struct list_lru_node	*nlru = &lru->node[nid];
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 	struct list_head *item, *n;
 	unsigned long isolated = 0;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 restart:
-	list_for_each_safe(item, n, &nlru->list) {
+	list_for_each_safe(item, n, &l->list) {
 		enum lru_status ret;
 
 		/*
@@ -85,14 +214,11 @@ list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
 			break;
 		--*nr_to_walk;
 
-		ret = isolate(item, &nlru->lock, cb_arg);
+		ret = isolate(item, l, &nlru->lock, cb_arg);
 		switch (ret) {
 		case LRU_REMOVED_RETRY:
 			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
-			if (--nlru->nr_items == 0)
-				node_clear(nid, lru->active_nodes);
-			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
 			/*
 			 * If the lru lock has been dropped, our list
@@ -103,7 +229,7 @@ list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
 				goto restart;
 			break;
 		case LRU_ROTATE:
-			list_move_tail(item, &nlru->list);
+			list_move_tail(item, &l->list);
 			break;
 		case LRU_SKIP:
 			break;
@@ -122,31 +248,341 @@ list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
 	spin_unlock(&nlru->lock);
 	return isolated;
 }
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		  list_lru_walk_cb isolate, void *cb_arg,
+		  unsigned long *nr_to_walk)
+{
+	return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+				   isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+				 list_lru_walk_cb isolate, void *cb_arg,
+				 unsigned long *nr_to_walk)
+{
+	long isolated = 0;
+	int memcg_idx;
+
+	isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+					nr_to_walk);
+	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx) {
+			isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+						isolate, cb_arg, nr_to_walk);
+			if (*nr_to_walk <= 0)
+				break;
+		}
+	}
+	return isolated;
+}
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+	INIT_LIST_HEAD(&l->list);
+	l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+					  int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++)
+		kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+				      int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++) {
+		struct list_lru_one *l;
+
+		l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+		if (!l)
+			goto fail;
+
+		init_one_lru(l);
+		memcg_lrus->lru[i] = l;
+	}
+	return 0;
+fail:
+	/* Entries [begin, i) were allocated before the failure. */
+	__memcg_destroy_list_lru_node(memcg_lrus, begin, i);
+	return -ENOMEM;
+}
+
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+	struct list_lru_memcg *memcg_lrus;
+	int size = memcg_nr_cache_ids;
+
+	memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
+			     size * sizeof(void *), GFP_KERNEL);
+	if (!memcg_lrus)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
+		kvfree(memcg_lrus);
+		return -ENOMEM;
+	}
+	rcu_assign_pointer(nlru->memcg_lrus, memcg_lrus);
+
+	return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+	struct list_lru_memcg *memcg_lrus;
+
+	/*
+	 * This is called when the shrinker has already been unregistered,
+	 * so nobody else can use it.
+	 */
+	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, true);
+	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+	kvfree(memcg_lrus);
+}
+
+static void free_list_lru_memcg(struct rcu_head *head)
+{
+	kvfree(container_of(head, struct list_lru_memcg, rcu));
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+				      int old_size, int new_size)
+{
+	struct list_lru_memcg *old, *new;
+
+	BUG_ON(old_size > new_size);
+	lockdep_assert_held(&list_lrus_mutex);
+
+	/* list_lrus_mutex is held, nobody can change memcg_lrus. Silence RCU */
+	old = rcu_dereference_check(nlru->memcg_lrus, true);
+	new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+		kvfree(new);
+		return -ENOMEM;
+	}
+
+	memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
+
+	/*
+	 * The locking below allows the readers, that already take nlru->lock,
+	 * not to use additional rcu_read_lock()/rcu_read_unlock() pair.
+	 *
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+	rcu_assign_pointer(nlru->memcg_lrus, new);
+	spin_unlock_irq(&nlru->lock);
+
+	call_rcu(&old->rcu, free_list_lru_memcg);
+	return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+					      int old_size, int new_size)
+{
+	/*
+	 * Do not bother shrinking the array back to the old size, because we
+	 * cannot handle allocation failures here.
+	 */
+	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (!memcg_aware)
+			rcu_assign_pointer(lru->node[i].memcg_lrus, NULL);
+		else if (memcg_init_list_lru_node(&lru->node[i]))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+	return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+				 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return 0;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (memcg_update_list_lru_node(&lru->node[i],
+					       old_size, new_size))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+	return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+					 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+}
+
+int memcg_update_all_list_lrus(int new_size)
+{
+	int ret = 0;
+	struct list_lru *lru;
+	int old_size = memcg_nr_cache_ids;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list) {
+		ret = memcg_update_list_lru(lru, old_size, new_size);
+		if (ret)
+			goto fail;
+	}
+out:
+	mutex_unlock(&list_lrus_mutex);
+	return ret;
+fail:
+	list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+		memcg_cancel_update_list_lru(lru, old_size, new_size);
+	goto out;
+}
+
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+				      int src_idx, int dst_idx)
+{
+	struct list_lru_one *src, *dst;
+
+	/*
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+
+	src = list_lru_from_memcg_idx(nlru, src_idx);
+	dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+	list_splice_init(&src->list, &dst->list);
+	dst->nr_items += src->nr_items;
+	src->nr_items = 0;
+
+	spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+				 int src_idx, int dst_idx)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+}
+
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+	struct list_lru *lru;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list)
+		memcg_drain_list_lru(lru, src_idx, dst_idx);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
+	int err = -ENOMEM;
+
+	memcg_get_cache_ids();
 
 	lru->node = kzalloc(size, GFP_KERNEL);
 	if (!lru->node)
-		return -ENOMEM;
+		goto out;
 
-	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
 		if (key)
 			lockdep_set_class(&lru->node[i].lock, key);
-		INIT_LIST_HEAD(&lru->node[i].list);
-		lru->node[i].nr_items = 0;
+		init_one_lru(&lru->node[i].lru);
 	}
-	return 0;
+
+	err = memcg_init_list_lru(lru, memcg_aware);
+	if (err) {
+		kfree(lru->node);
+		/* Do this so a list_lru_destroy() doesn't crash: */
+		lru->node = NULL;
+		goto out;
+	}
+
+	list_lru_register(lru);
+out:
+	memcg_put_cache_ids();
+	return err;
 }
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
 
 void list_lru_destroy(struct list_lru *lru)
 {
+	/* Already destroyed or not yet initialized? */
+	if (!lru->node)
+		return;
+
+	memcg_get_cache_ids();
+
+	list_lru_unregister(lru);
+
+	memcg_destroy_list_lru(lru);
 	kfree(lru->node);
+	lru->node = NULL;
+
+	memcg_put_cache_ids();
 }
 EXPORT_SYMBOL_GPL(list_lru_destroy);
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -157,7 +157,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 		pte_unmap_unlock(orig_pte, ptl);
 
-		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+		if (pte_present(pte) || pte_none(pte))
 			continue;
 		entry = pte_to_swp_entry(pte);
 		if (unlikely(non_swap_entry(entry)))
@@ -317,14 +317,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 		}
 		VM_WARN_ON(start >= end);
 	}
-	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		struct zap_details details = {
-			.nonlinear_vma = vma,
-			.last_index = ULONG_MAX,
-		};
-		zap_page_range(vma, start, end - start, &details);
-	} else
-		zap_page_range(vma, start, end - start, NULL);
+	zap_page_range(vma, start, end - start, NULL);
 	return 0;
 }
 
@@ -345,7 +338,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
-	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
 		return -EINVAL;
 
 	f = vma->vm_file;
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -48,16 +48,19 @@
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
-#include <linux/vmalloc.h>
 #include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/virtinfo.h>
+#include <linux/migrate.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/tcp_memcontrol.h>
+#include <net/udp_memcontrol.h>
+#include "slab.h"
 
 #include <asm/uaccess.h>
 
@@ -92,25 +95,43 @@ enum mem_cgroup_stat_index {
 	/*
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
-	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_SHMEM,		/* # of charged shmem pages */
+	MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE, /* # of unreclaimable slab pages */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+enum mem_cgroup_stat2_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
+	MEM_CGROUP_STAT2_NSTATS,
+};
+
 static const char * const mem_cgroup_stat_names[] = {
-	"cache",
-	"rss",
 	"rss_huge",
 	"mapped_file",
+	"shmem",
+	"slab_unreclaimable",
 	"swap",
 };
 
+static const char * const mem_cgroup_stat2_names[] = {
+	"cache",
+	"rss",
+	"slab_reclaimable",
+};
+
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
+	MEM_CGROUP_EVENTS_PSWPIN,	/* # of pages swapped in */
+	MEM_CGROUP_EVENTS_PSWPOUT,	/* # of pages swapped out */
 	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
 	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
 	MEM_CGROUP_EVENTS_NSTATS,
@@ -119,6 +140,8 @@ enum mem_cgroup_events_index {
 static const char * const mem_cgroup_events_names[] = {
 	"pgpgin",
 	"pgpgout",
+	"pswpin",
+	"pswpout",
 	"pgfault",
 	"pgmajfault",
 };
@@ -154,6 +177,10 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
+struct mem_cgroup_stat2_cpu {
+	struct percpu_counter counters[MEM_CGROUP_STAT2_NSTATS];
+};
+
 struct mem_cgroup_reclaim_iter {
 	/*
 	 * last scanned hierarchy member. Valid only if last_dead_count
@@ -267,46 +294,46 @@ struct mem_cgroup {
 
 	unsigned long soft_limit;
 
+	/* Normal memory consumption range */
+	unsigned long low;
+	unsigned long high;
+
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct page_counter memsw;
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field.  It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
 	/*
 	 * the counter to account for kernel memory usage.
 	 */
 	struct page_counter kmem;
+	struct page_counter memsw;
+	/*
+	 * the counter to account for dcache usage.
+	 *
+	 * Never limited, only needed for showing stats. We could use a per cpu
+	 * counter if we did not have to report max usage.
+	 */
+	struct page_counter dcache;
+
+	/* beancounter-related stats */
+	unsigned long long swap_max;
+	atomic_long_t mem_failcnt;
+	atomic_long_t swap_failcnt;
+	atomic_long_t oom_kill_cnt;
+
+	struct oom_context oom_ctx;
+	unsigned long oom_guarantee;
+
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
+	bool is_offline;
 	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
 	bool		oom_lock;
 	atomic_t	under_oom;
 	atomic_t	oom_wakeups;
 
-	atomic_t	refcnt;
-
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -314,6 +341,20 @@ struct mem_cgroup {
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
+#ifdef CONFIG_CLEANCACHE
+	/*
+	 * cleancache_disabled_toggle: toggled by writing to
+	 * memory.disable_cleancache
+	 *
+	 * cleancache_disabled: set iff cleancache_disabled_toggle is
+	 * set in this cgroup or any of its ancestors; controls whether the
+	 * cleancache callback is called when a page is evicted from
+	 * this cgroup
+	 */
+	bool cleancache_disabled_toggle;
+	bool cleancache_disabled;
+#endif
+
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 
@@ -341,24 +382,19 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
-	/*
-	 * used when a cpu is offlined or other synchronizations
-	 * See mem_cgroup_read_stat().
-	 */
-	struct mem_cgroup_stat_cpu nocpu_base;
+	struct mem_cgroup_stat2_cpu stat2;
 	spinlock_t pcp_counter_lock;
 
 	atomic_t	dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct tcp_memcontrol tcp_mem;
+	struct udp_memcontrol udp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-	/* analogous to slab_common's slab_caches list. per-memcg */
-	struct list_head memcg_slab_caches;
-	/* Not a spinlock, we can take a lot of time walking the list */
-	struct mutex slab_caches_mutex;
-        /* Index in the kmem_cache->memcg_params->memcg_caches array */
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
+	/* List of memcgs sharing the same kmemcg_id */
+	struct list_head kmemcg_sharers;
 #endif
 
 	int last_scanned_node;
@@ -378,48 +414,27 @@ struct mem_cgroup {
 	struct mem_cgroup_lru_info info;
 };
 
-static size_t memcg_size(void)
-{
-	return sizeof(struct mem_cgroup) +
-		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-}
-
 /* internal only representation about the status of kmem accounting. */
 enum {
-	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
-	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
+	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled */
 	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 };
 
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
-		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
 	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
-	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
-	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
-		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
+	set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
 
 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
@@ -495,14 +510,6 @@ enum res_type {
 /* Used for OOM nofiier */
 #define OOM_CONTROL		(0)
 
-/*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
-#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
 /*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
@@ -510,9 +517,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -563,15 +567,15 @@ void sock_update_memcg(struct sock *sk)
 		 */
 		if (sk->sk_cgrp) {
 			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-			mem_cgroup_get(sk->sk_cgrp->memcg);
+			css_get(&sk->sk_cgrp->memcg->css);
 			return;
 		}
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-			mem_cgroup_get(memcg);
+		if (!mem_cgroup_is_root(memcg) &&
+		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
@@ -585,7 +589,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
@@ -598,11 +602,21 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
+struct cg_proto *udp_proto_cgroup(struct mem_cgroup *memcg)
+{
+	if (!memcg || mem_cgroup_is_root(memcg))
+		return NULL;
+
+	return &memcg->udp_mem.cg_proto;
+}
+EXPORT_SYMBOL(udp_proto_cgroup);
+
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
-	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
-		return;
-	static_key_slow_dec(&memcg_socket_limit_enabled);
+	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+		static_key_slow_dec(&memcg_socket_limit_enabled);
+	if (memcg_proto_activated(&memcg->udp_mem.cg_proto))
+		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 #else
 static void disarm_sock_keys(struct mem_cgroup *memcg)
@@ -612,7 +626,7 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * There are two main reasons for not using the css_id for this:
  *  1) this works better in sparse environments, where we have a lot of memcgs,
  *     but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -624,12 +638,24 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
  *     css_id. Having a separate index prevents us from messing with the cgroup
  *     core for this
  *
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size.  It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
  */
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+	down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+	up_read(&memcg_cache_ids_sem);
+}
 
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -657,10 +683,8 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg)) {
+	if (test_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags))
 		static_key_slow_dec(&memcg_kmem_enabled_key);
-		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
-	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -693,6 +717,32 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 	return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the memory cgroup @page is charged to and return its inode number or
+ * 0 if @page is not charged to any cgroup. It is safe to call this function
+ * without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it should only be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+	struct page_cgroup *pc;
+	unsigned long ino = 0;
+
+	pc = lookup_page_cgroup(page);
+	if (!PageCgroupUsed(pc))
+		return 0;
+	if (likely(PageCgroupUsed(pc)))
+		ino = pc->mem_cgroup->css.cgroup->dentry->d_inode->i_ino;
+	return ino;
+}
+
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -767,9 +817,11 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
-	spin_lock(&mctz->lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mctz->lock, flags);
 	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-	spin_unlock(&mctz->lock);
+	spin_unlock_irqrestore(&mctz->lock, flags);
 }
 
 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
@@ -805,7 +857,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 		 * mem is over its softlimit.
 		 */
 		if (excess || mz->on_tree) {
-			spin_lock(&mctz->lock);
+			unsigned long flags;
+
+			spin_lock_irqsave(&mctz->lock, flags);
 			/* if on-tree, remove it */
 			if (mz->on_tree)
 				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
@@ -814,7 +868,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 			 * If excess is 0, no tree ops.
 			 */
 			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-			spin_unlock(&mctz->lock);
+			spin_unlock_irqrestore(&mctz->lock, flags);
 		}
 	}
 }
@@ -865,19 +919,21 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	struct mem_cgroup_per_zone *mz;
 
-	spin_lock(&mctz->lock);
+	spin_lock_irq(&mctz->lock);
 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
-	spin_unlock(&mctz->lock);
+	spin_unlock_irq(&mctz->lock);
 	return mz;
 }
 
 /*
+ * Return the page count for a single (non-recursive) @memcg.
+ *
  * Implementation Note: reading percpu statistics for memcg.
  *
  * Both of vmstat[] and percpu_counter has threshold and do periodic
  * synchronization to implement "quick" read. There are trade-off between
  * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronizion of counter in memcg's counter.
+ * a periodic synchronization of counter in memcg's counter.
  *
  * But this _read() function is used for user interface now. The user accounts
  * memory usage by memory cgroup and he _always_ requires exact value because
@@ -887,32 +943,71 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  *
  * If there are kernel internal actions which can make use of some not-exact
  * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threashold and synchonization as vmstat[] should be
+ * common workload, a vmstat[]-like threshold and synchronization should be
  * implemented.
  */
-static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx)
+static unsigned long
+mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 {
 	long val = 0;
 	int cpu;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu)
+	/* Per-cpu values can be negative, use a signed accumulator */
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.count[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
-	put_online_cpus();
+	/*
+	 * Summing races with updates, so val may be negative.  Avoid exposing
+	 * transient negative values.
+	 */
+	if (val < 0)
+		val = 0;
 	return val;
 }
 
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
-					 bool charge)
+static inline unsigned long
+mem_cgroup_read_stat2_fast(struct mem_cgroup *memcg, enum mem_cgroup_stat2_index idx)
 {
-	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+	return percpu_counter_read_positive(&memcg->stat2.counters[idx]);
+}
+
+static inline unsigned long
+mem_cgroup_read_stat2(struct mem_cgroup *memcg, enum mem_cgroup_stat2_index idx)
+{
+	return percpu_counter_sum_positive(&memcg->stat2.counters[idx]);
+}
+
+static void mem_cgroup_update_swap_max(struct mem_cgroup *memcg)
+{
+	long long swap;
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		swap = page_counter_read(&memcg->memsw) -
+			page_counter_read(&memcg->memory);
+
+		/* This is racy, but we don't have to be absolutely precise */
+		if (swap > (long long)memcg->swap_max)
+			memcg->swap_max = swap;
+	}
+}
+
+static void mem_cgroup_inc_failcnt(struct mem_cgroup *memcg,
+                                  gfp_t gfp_mask, unsigned int nr_pages)
+{
+	unsigned long margin = 0;
+	unsigned long count;
+	unsigned long limit;
+
+	if (gfp_mask & __GFP_NOWARN)
+		return;
+
+	atomic_long_inc(&memcg->mem_failcnt);
+	count = page_counter_read(&memcg->memsw);
+	limit = ACCESS_ONCE(memcg->memsw.limit);
+	if (count < limit)
+		margin = limit - count;
+
+	if (do_swap_account && margin < nr_pages)
+		atomic_long_inc(&memcg->swap_failcnt);
 }
 
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
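
[Editor's sketch, not part of the patch: the stat2 counters introduced above are plain percpu_counters, read either via percpu_counter_read_positive() (mem_cgroup_read_stat2_fast) or percpu_counter_sum_positive() (mem_cgroup_read_stat2). The helper below is hypothetical and only illustrates that trade-off.]

#include <linux/percpu_counter.h>

static void stat2_read_example(struct percpu_counter *fbc)
{
	/*
	 * Cheap: reads only the central count, so it may lag by roughly
	 * the per-cpu batch times the number of CPUs; good enough for
	 * heuristics such as mem_cgroup_dcache_is_low().
	 */
	s64 approx = percpu_counter_read_positive(fbc);

	/*
	 * Exact but expensive: folds in every per-cpu delta under the
	 * counter lock; used where the value is reported to userspace.
	 */
	s64 exact = percpu_counter_sum_positive(fbc);

	(void)approx;
	(void)exact;
}
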
@@ -921,19 +1016,14 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.events[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
 	return val;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool anon, int nr_pages)
+					 int nr_pages)
 {
 	preempt_disable();
 
@@ -941,12 +1031,16 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
-	if (anon)
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+	if (PageAnon(page))
+		percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS],
 				nr_pages);
-	else
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+	else {
+		percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE],
 				nr_pages);
+		if (PageSwapBacked(page))
+			__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
+				       nr_pages);
+	}
 
 	if (PageTransHuge(page))
 		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
@@ -1050,7 +1144,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
-	preempt_disable();
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
@@ -1063,8 +1156,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_NUMAINFO);
 #endif
-		preempt_enable();
-
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
 			mem_cgroup_update_tree(memcg, page);
@@ -1072,8 +1163,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
 #endif
-	} else
-		preempt_enable();
+	}
 }
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -1095,22 +1185,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *memcg = NULL;
 
-	if (!mm)
-		return NULL;
-	/*
-	 * Because we have no locks, mm->owner's may be being moved to other
-	 * cgroup. We use css_tryget() here even if this looks
-	 * pessimistic (rather than adding locks here).
-	 */
 	rcu_read_lock();
 	do {
-		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-		if (unlikely(!memcg))
-			break;
+		/*
+		 * Page cache insertions can happen without an
+		 * actual mm context, e.g. during disk probing
+		 * on boot, loopback IO, acct() writes etc.
+		 */
+		if (unlikely(!mm))
+			memcg = root_mem_cgroup;
+		else {
+			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+			if (unlikely(!memcg))
+				memcg = root_mem_cgroup;
+		}
 	} while (!css_tryget(&memcg->css));
 	rcu_read_unlock();
 	return memcg;
@@ -1296,6 +1388,19 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
+void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+			     unsigned long *pages)
+{
+	struct mem_cgroup *iter;
+	int i;
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid,
+								 BIT(i));
+	}
+}
+
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *memcg;
@@ -1353,20 +1458,6 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 	return lruvec;
 }
 
-/*
- * Following LRU functions are allowed to be used without PCG_LOCK.
- * Operations are called by routine of global LRU independently from memcg.
- * What we have to take care of here is validness of pc->mem_cgroup.
- *
- * Changes to pc->mem_cgroup happens when
- * 1. charge
- * 2. moving account
- * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
- * It is added to LRU before charge.
- * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
- * When moving account, the page is not on LRU. It's isolated.
- */
-
 /**
  * mem_cgroup_page_lruvec - return lruvec for adding an lru page
  * @page: the page
@@ -1469,7 +1560,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 
 	p = find_lock_task_mm(task);
 	if (p) {
-		curr = try_get_mem_cgroup_from_mm(p->mm);
+		curr = get_mem_cgroup_from_mm(p->mm);
 		task_unlock(p);
 	} else {
 		/*
@@ -1483,8 +1574,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 			css_get(&curr->css);
 		task_unlock(task);
 	}
-	if (!curr)
-		return 0;
 	/*
 	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1515,6 +1604,156 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return inactive * inactive_ratio < active;
 }
 
+bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg, int vfs_cache_min_ratio)
+{
+	unsigned long anon, file, dcache;
+
+	anon = mem_cgroup_read_stat2_fast(memcg, MEM_CGROUP_STAT_RSS);
+	file = mem_cgroup_read_stat2_fast(memcg, MEM_CGROUP_STAT_CACHE);
+	dcache = mem_cgroup_read_stat2_fast(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+
+	return dcache / vfs_cache_min_ratio <
+			(anon + file + dcache) / 100;
+}
+
+/**
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
+ *
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
+ */
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return false;
+
+	/*
+	 * The toplevel group doesn't have a configurable range, so
+	 * it's never low when looked at directly, and it is not
+	 * considered an ancestor when assessing the hierarchy.
+	 */
+
+	if (memcg == root_mem_cgroup)
+		return false;
+
+	if (page_counter_read(&memcg->memory) >= memcg->low)
+		return false;
+
+	/*
+	 * XXX: It's OK to set memory.low for a cgroup to infinity. This might
+	 * be useful if no tasks are supposed to run inside the cgroup itself,
+	 * but only in its sub-cgroups (e.g. /machine.slice). In this case
+	 * protection against memory pressure originating on upper levels will
+	 * be guarded solely by memory.low configuration in sub-cgroups.
+	 *
+	 * However, in the current implementation, in contrast to mainstream,
+	 * charges can appear in a cgroup even if there's no tasks in it - they
+	 * can be reparented from a dead sub-cgroup. If the cgroup has
+	 * memory.low set to inf, such reparented charges will not get
+	 * reclaimed normally on memory pressure, resulting in performance
+	 * degradation in other cgroups. To avoid that, let's ignore memory.low
+	 * for cgroups w/o tasks.
+	 */
+	if (cgroup_task_count(memcg->css.cgroup) == 0)
+		return false;
+
+	while (memcg != root) {
+		memcg = parent_mem_cgroup(memcg);
+		if (!memcg)
+			break;
+
+		if (memcg == root_mem_cgroup)
+			break;
+
+		if (page_counter_read(&memcg->memory) >= memcg->low)
+			return false;
+	}
+	return true;
+}
+
+#ifdef CONFIG_CLEANCACHE
+bool mem_cgroup_cleancache_disabled(struct page *page)
+{
+	struct page_cgroup *pc;
+	bool ret = false;
+
+	if (mem_cgroup_disabled())
+		return false;
+
+	pc = lookup_page_cgroup(page);
+	if (!PageCgroupUsed(pc))
+		return false;
+
+	if (likely(PageCgroupUsed(pc)))
+		ret = pc->mem_cgroup->cleancache_disabled;
+	return ret;
+}
+#endif
+
+void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
+			      struct task_struct *task)
+{
+	struct mem_cgroup *memcg, *memcg_to_put;
+	struct task_struct *p;
+
+	if (!root_memcg)
+		root_memcg = root_mem_cgroup;
+
+	p = find_lock_task_mm(task);
+	if (p) {
+		memcg = get_mem_cgroup_from_mm(p->mm);
+		task_unlock(p);
+	} else {
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(task);
+		css_get(&memcg->css);
+		rcu_read_unlock();
+	}
+	memcg_to_put = memcg;
+	if (!memcg || !mem_cgroup_same_or_subtree(root_memcg, memcg))
+		memcg = root_memcg;
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		atomic_long_inc(&memcg->oom_kill_cnt);
+		if (memcg == root_memcg)
+			break;
+	}
+
+	if (memcg_to_put)
+		css_put(&memcg_to_put->css);
+}
+
+struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return &global_oom_ctx;
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	return &memcg->oom_ctx;
+}
+
+unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	unsigned long long guarantee, usage;
+
+	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
+		return 0;
+
+	guarantee = ACCESS_ONCE(memcg->oom_guarantee);
+	usage = page_counter_read(&memcg->memsw);
+	return div64_u64(1000 * usage, guarantee + 1);
+}
+
+unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
+{
+	unsigned long limit;
+
+	limit = swap ? memcg->memsw.limit : memcg->memory.limit;
+	return min_t(unsigned long, PAGE_COUNTER_MAX, limit);
+}
+
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
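
[Editor's sketch, not part of the patch: mem_cgroup_low() above only implements the check; its caller sits in the reclaim code, which is not shown in this hunk. The loop and the may_skip_low flag below are invented for illustration; only mem_cgroup_low() and mem_cgroup_iter() are real interfaces.]

static void shrink_memcgs_example(struct mem_cgroup *root, bool may_skip_low)
{
	struct mem_cgroup *memcg;

	for (memcg = mem_cgroup_iter(root, NULL, NULL);
	     memcg;
	     memcg = mem_cgroup_iter(root, memcg, NULL)) {
		/*
		 * Groups that are below memory.low, along with all their
		 * ancestors up to root, are left alone unless reclaim
		 * has no other choice.
		 */
		if (may_skip_low && mem_cgroup_low(root, memcg))
			continue;

		/* ... scan this memcg's LRU lists here ... */
	}
}
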
 
@@ -1525,7 +1764,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
  */
-static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg, bool kmem)
 {
 	unsigned long margin = 0;
 	unsigned long count;
@@ -1541,6 +1780,15 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 		limit = ACCESS_ONCE(memcg->memsw.limit);
 		if (count <= limit)
 			margin = min(margin, limit - count);
+		else
+			margin = 0;
+	}
+
+	if (kmem && margin) {
+		count = page_counter_read(&memcg->kmem);
+		limit = ACCESS_ONCE(memcg->kmem.limit);
+		if (count <= limit)
+			margin = min(margin, limit - count);
 	}
 
 	return margin;
@@ -1679,13 +1927,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
-	struct cgroup *task_cgrp;
-	struct cgroup *mem_cgrp;
 	/*
-	 * Need a buffer in BSS, can't rely on allocations. The code relies
-	 * on the assumption that OOM is serialized for memory controller.
-	 * If this assumption is broken, revisit this code.
+	 * protects memcg_name and makes sure that parallel ooms do not
+	 * interleave
 	 */
+	static DEFINE_MUTEX(oom_info_lock);
+	struct cgroup *task_cgrp;
+	struct cgroup *mem_cgrp;
 	static char memcg_name[PATH_MAX];
 	int ret;
 	struct mem_cgroup *iter;
@@ -1694,6 +1942,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	if (!p)
 		return;
 
+	mutex_lock(&oom_info_lock);
 	rcu_read_lock();
 
 	mem_cgrp = memcg->css.cgroup;
@@ -1749,9 +1998,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 				continue;
-			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
 				K(mem_cgroup_read_stat(iter, i)));
 		}
+		for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+			pr_cont(" %s:%luKB", mem_cgroup_stat2_names[i],
+				K(mem_cgroup_read_stat2(iter, i)));
+		}
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
 			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
@@ -1759,6 +2012,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 
 		pr_cont("\n");
 	}
+	mutex_unlock(&oom_info_lock);
 }
 
 /*
@@ -1796,9 +2050,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
 	struct mem_cgroup *iter;
+	unsigned long max_overdraft = 0;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
-	unsigned int points = 0;
+	unsigned long overdraft;
+	unsigned long points = 0;
 	struct task_struct *chosen = NULL;
 
 	/*
@@ -1807,7 +2063,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+		mark_oom_victim(current);
 		return;
 	}
 
@@ -1820,32 +2076,27 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
-			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false)) {
+			switch (oom_scan_process_thread(task, NULL)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = ULONG_MAX;
+				max_overdraft = ULONG_MAX;
 				get_task_struct(chosen);
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
 				continue;
-			case OOM_SCAN_ABORT:
-				cgroup_iter_end(cgroup, &it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				return;
 			case OOM_SCAN_OK:
 				break;
 			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (points > chosen_points) {
+			points = oom_badness(task, memcg, NULL, totalpages,
+					     &overdraft);
+			if (oom_worse(points, overdraft, &chosen_points,
+				      &max_overdraft)) {
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
-				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
@@ -1854,8 +2105,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (!chosen)
 		return;
-	points = chosen_points * 1000 / totalpages;
-	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+	oom_kill_process(chosen, gfp_mask, order, chosen_points, max_overdraft,
+			 totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
 }
 
@@ -1875,7 +2126,11 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
 		if (loop)
 			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+						      gfp_mask, flags);
+		if (test_thread_flag(TIF_MEMDIE) ||
+		    fatal_signal_pending(current))
+			return 1;
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
@@ -1883,8 +2138,18 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 		 */
 		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
 			break;
-		if (mem_cgroup_margin(memcg))
+		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
 			break;
+
+		/*
+		 * Try harder to reclaim dcache. dcache reclaim may
+		 * temporarily fail due to dcache->dlock being held
+		 * by someone else; keep retrying to avoid premature
+		 * slab allocation failures.
+		 */
+		if (flags & MEM_CGROUP_RECLAIM_KMEM &&
+		    page_counter_read(&memcg->dcache))
+			continue;
 		/*
 		 * If nothing was reclaimed after two attempts, there
 		 * may be no reclaimable pages in this hierarchy.
@@ -2085,60 +2350,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	return total;
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
-
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- */
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	spin_lock(&memcg_oom_lock);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (failed) {
-		/*
-		 * OK, we failed to lock the whole subtree so we have
-		 * to clean up what we set up to the failing subtree
-		 */
-		for_each_mem_cgroup_tree(iter, memcg) {
-			if (iter == failed) {
-				mem_cgroup_iter_break(memcg, iter);
-				break;
-			}
-			iter->oom_lock = false;
-		}
-	}
-
-	spin_unlock(&memcg_oom_lock);
-
-	return !failed;
-}
-
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	spin_unlock(&memcg_oom_lock);
-}
-
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
@@ -2200,6 +2411,23 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
+static void memcg_wait_oom_recover(struct mem_cgroup *memcg)
+{
+	struct oom_wait_info owait;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+
+	memcg_wakeup_oom(memcg);
+}
+
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
 	if (!current->memcg_oom.may_oom)
@@ -2244,8 +2472,6 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 bool mem_cgroup_oom_synchronize(bool handle)
 {
 	struct mem_cgroup *memcg = current->memcg_oom.memcg;
-	struct oom_wait_info owait;
-	bool locked;
 
 	/* OOM is global, do not handle */
 	if (!memcg)
@@ -2254,40 +2480,19 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!handle)
 		goto cleanup;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
+	if (oom_trylock(memcg)) {
 		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-					 current->memcg_oom.order);
-	} else {
-		schedule();
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		if (memcg->oom_kill_disable)
+			memcg_wait_oom_recover(memcg);
+		else
+			mem_cgroup_out_of_memory(memcg,
+						 current->memcg_oom.gfp_mask,
+						 current->memcg_oom.order);
+		oom_unlock(memcg);
 	}
+	mem_cgroup_unmark_under_oom(memcg);
 
-	if (locked) {
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	}
 cleanup:
 	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
@@ -2300,22 +2505,14 @@ bool mem_cgroup_oom_synchronize(bool handle)
  *
  * Notes: Race condition
  *
- * We usually use page_cgroup_lock() for accessing page_cgroup member but
- * it tends to be costly. But considering some conditions, we doesn't need
- * to do so _always_.
- *
- * Considering "charge", lock_page_cgroup() is not required because all
- * file-stat operations happen after a page is attached to radix-tree. There
- * are no race with "charge".
+ * Charging occurs during page instantiation, while the page is
+ * unmapped and locked in page migration, or while the page table is
+ * locked in THP migration.  No race is possible.
  *
- * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
- * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
- * if there are race with "uncharge". Statistics itself is properly handled
- * by flags.
+ * Uncharge happens to pages with zero references, no race possible.
  *
- * Considering "move", this is an only case we see a race. To make the race
- * small, we check mm->moving_account and detect there are possibility of race
- * If there is, we take a lock.
+ * Charge moving between groups is protected by checking mm->moving
+ * account and taking the move_lock in the slowpath.
  */
 
 void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2547,37 +2744,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
 	mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-	int i;
-
-	spin_lock(&memcg->pcp_counter_lock);
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		long x = per_cpu(memcg->stat->count[i], cpu);
-
-		per_cpu(memcg->stat->count[i], cpu) = 0;
-		memcg->nocpu_base.count[i] += x;
-	}
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-		per_cpu(memcg->stat->events[i], cpu) = 0;
-		memcg->nocpu_base.events[i] += x;
-	}
-	spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
-	struct mem_cgroup *iter;
 
 	if (action == CPU_ONLINE)
 		return NOTIFY_OK;
@@ -2585,62 +2757,103 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
-	for_each_mem_cgroup(iter)
-		mem_cgroup_drain_pcp_counter(iter, cpu);
-
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
 }
 
-
-/* See __mem_cgroup_try_charge() for details */
-enum {
-	CHARGE_OK,		/* success */
-	CHARGE_RETRY,		/* need to retry but retry is not bad */
-	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
-	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-				unsigned int nr_pages, unsigned int min_pages,
-				bool invoke_oom)
+/**
+ * try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @gfp_mask: allocation context of the caller
+ * @kmem_charge: whether to also charge the kmem counter
+ * @nr_pages: number of pages to charge
+ *
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
+ */
+static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge,
+		      unsigned int nr_pages)
 {
+	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
-	unsigned long flags = 0;
-	int ret;
+	unsigned long nr_reclaimed;
+	unsigned long flags;
 
-	ret = page_counter_try_charge(&memcg->memory, nr_pages, &counter);
+	if (mem_cgroup_is_root(memcg))
+		goto done;
+retry:
+	flags = 0;
 
-	if (likely(!ret)) {
-		if (!do_swap_account)
-			return CHARGE_OK;
-		ret = page_counter_try_charge(&memcg->memsw, nr_pages, &counter);
-		if (likely(!ret))
-			return CHARGE_OK;
+	if (consume_stock(memcg, nr_pages)) {
+		if (!kmem_charge)
+			goto done;
+		if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+			goto done;
+	}
 
-		page_counter_uncharge(&memcg->memory, nr_pages);
-		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	mem_over_limit = NULL;
+	if (!page_counter_try_charge(&memcg->memory, batch, &counter)) {
+		if (do_swap_account && page_counter_try_charge(
+				&memcg->memsw, batch, &counter)) {
+			page_counter_uncharge(&memcg->memory, batch);
+			mem_over_limit = mem_cgroup_from_counter(counter, memsw);
+			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+		}
 	} else
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
+
+	if (!mem_over_limit && kmem_charge) {
+		if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+			goto done_restock;
+
+		flags |= MEM_CGROUP_RECLAIM_KMEM;
+		mem_over_limit = mem_cgroup_from_counter(counter, kmem);
+		page_counter_uncharge(&memcg->memory, batch);
+		if (do_swap_account)
+			page_counter_uncharge(&memcg->memsw, batch);
+	} else if (!mem_over_limit)
+		goto done_restock;
+
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
+
+	/*
+	 * Unlike in global OOM situations, memcg is not in a physical
+	 * memory shortage.  Allow dying and OOM-killed tasks to
+	 * bypass the last charges so that they can exit quickly and
+	 * free their memory.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+		     fatal_signal_pending(current) ||
+		     current->flags & PF_EXITING))
+		goto bypass;
+
 	/*
-	 * Never reclaim on behalf of optional batching, retry with a
-	 * single page instead.
+	 * Prevent unbounded recursion when reclaim operations need to
+	 * allocate memory. This might exceed the limits temporarily,
+	 * but we prefer facilitating memory reclaim and getting back
+	 * under the limit over triggering OOM kills in these cases.
 	 */
-	if (nr_pages > min_pages)
-		return CHARGE_RETRY;
+	if (unlikely(current->flags & PF_MEMALLOC))
+		goto bypass;
+
+	if (unlikely(task_in_memcg_oom(current)))
+		goto nomem;
 
 	if (!(gfp_mask & __GFP_WAIT))
-		return CHARGE_WOULDBLOCK;
+		goto nomem;
 
-	if (gfp_mask & __GFP_NORETRY)
-		return CHARGE_NOMEM;
+	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
 
-	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-		return CHARGE_RETRY;
+	if (mem_cgroup_margin(mem_over_limit,
+				flags & MEM_CGROUP_RECLAIM_KMEM) >= batch)
+		goto retry;
+
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
 	/*
 	 * Even though the limit is exceeded at this point, reclaim
 	 * may have been able to free some pages.  Retry the charge
@@ -2650,192 +2863,42 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * unlikely to succeed so close to the limit, and we fall back
 	 * to regular pages anyway in case of failure.
 	 */
-	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-		return CHARGE_RETRY;
-
+	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
 	/*
 	 * At task move, charge accounts can be doubly counted. So, it's
 	 * better to wait until the end of task_move if something is going on.
 	 */
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
-		return CHARGE_RETRY;
-
-	if (invoke_oom)
-		mem_cgroup_oom(mem_over_limit, gfp_mask,
-			       get_order(nr_pages * PAGE_SIZE));
-
-	return CHARGE_NOMEM;
-}
+		goto retry;
 
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update page_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- *  0       ...  on success, filling *ptr with a valid memcg pointer.
- *  -ENOMEM ...  charge failure because of resource limits.
- *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
- *
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
- */
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
-				   gfp_t gfp_mask,
-				   unsigned int nr_pages,
-				   struct mem_cgroup **ptr,
-				   bool oom)
-{
-	unsigned int batch = max(CHARGE_BATCH, nr_pages);
-	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct mem_cgroup *memcg = NULL;
-	int ret;
+	if (nr_retries--)
+		goto retry;
 
-	/*
-	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
-	 * in system level. So, allow to go ahead dying process in addition to
-	 * MEMDIE process.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)
-		     || fatal_signal_pending(current)))
+	if (gfp_mask & __GFP_NOFAIL)
 		goto bypass;
 
-	/*
-	 * Prevent unbounded recursion when reclaim operations need to
-	 * allocate memory. This might exceed the limits temporarily,
-	 * but we prefer facilitating memory reclaim and getting back
-	 * under the limit over triggering OOM kills in these cases.
-	 */
-	if (unlikely(current->flags & PF_MEMALLOC))
+	if (fatal_signal_pending(current))
 		goto bypass;
 
-	if (unlikely(task_in_memcg_oom(current)))
-		goto nomem;
-
-	if (gfp_mask & __GFP_NOFAIL)
-		oom = false;
-
-	/*
-	 * We always charge the cgroup the mm_struct belongs to.
-	 * The mm_struct's mem_cgroup changes on task migration if the
-	 * thread group leader migrates. It's possible that mm is not
-	 * set, if so charge the root memcg (happens for pagecache usage).
-	 */
-	if (!*ptr && !mm)
-		*ptr = root_mem_cgroup;
-again:
-	if (*ptr) { /* css should be a valid one */
-		memcg = *ptr;
-		if (mem_cgroup_is_root(memcg))
-			goto done;
-		if (consume_stock(memcg, nr_pages))
-			goto done;
-		css_get(&memcg->css);
-	} else {
-		struct task_struct *p;
-
-		rcu_read_lock();
-		p = rcu_dereference(mm->owner);
-		/*
-		 * Because we don't have task_lock(), "p" can exit.
-		 * In that case, "memcg" can point to root or p can be NULL with
-		 * race with swapoff. Then, we have small risk of mis-accouning.
-		 * But such kind of mis-account by race always happens because
-		 * we don't have cgroup_mutex(). It's overkill and we allo that
-		 * small race, here.
-		 * (*) swapoff at el will charge against mm-struct not against
-		 * task-struct. So, mm->owner can be NULL.
-		 */
-		memcg = mem_cgroup_from_task(p);
-		if (!memcg)
-			memcg = root_mem_cgroup;
-		if (mem_cgroup_is_root(memcg)) {
-			rcu_read_unlock();
-			goto done;
-		}
-		if (consume_stock(memcg, nr_pages)) {
-			/*
-			 * It seems dagerous to access memcg without css_get().
-			 * But considering how consume_stok works, it's not
-			 * necessary. If consume_stock success, some charges
-			 * from this memcg are cached on this cpu. So, we
-			 * don't need to call css_get()/css_tryget() before
-			 * calling consume_stock().
-			 */
-			rcu_read_unlock();
-			goto done;
-		}
-		/* after here, we may be blocked. we need to get refcnt */
-		if (!css_tryget(&memcg->css)) {
-			rcu_read_unlock();
-			goto again;
-		}
-		rcu_read_unlock();
-	}
-
-	do {
-		bool invoke_oom = oom && !nr_oom_retries;
+	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch * PAGE_SIZE));
 
-		/* If killed, bypass charge */
-		if (fatal_signal_pending(current)) {
-			css_put(&memcg->css);
-			goto bypass;
-		}
+nomem:
+	mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
 
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-					   nr_pages, invoke_oom);
-		switch (ret) {
-		case CHARGE_OK:
-			break;
-		case CHARGE_RETRY: /* not in OOM situation but retry */
-			batch = nr_pages;
-			css_put(&memcg->css);
-			memcg = NULL;
-			goto again;
-		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			css_put(&memcg->css);
-			goto nomem;
-		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom || invoke_oom) {
-				css_put(&memcg->css);
-				goto nomem;
-			}
-			nr_oom_retries--;
-			break;
-		}
-	} while (ret != CHARGE_OK);
+	if (!(gfp_mask & __GFP_NOFAIL))
+		return -ENOMEM;
+bypass:
+	return -EINTR;
 
+done_restock:
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
-	css_put(&memcg->css);
 done:
-	*ptr = memcg;
 	return 0;
-nomem:
-	if (!(gfp_mask & __GFP_NOFAIL)) {
-		*ptr = NULL;
-		return -ENOMEM;
-	}
-bypass:
-	*ptr = root_mem_cgroup;
-	return -EINTR;
 }
 
-/*
- * Somemtimes we have to undo a charge we got by try_charge().
- * This function is for that and do uncharge, put css's refcnt.
- * gotten by try_charge().
- */
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
-				       unsigned int nr_pages)
+static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (!mem_cgroup_is_root(memcg)) {
 		page_counter_uncharge(&memcg->memory, nr_pages);
@@ -2863,6 +2926,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 	return mem_cgroup_from_css(css);
 }
 
+/*
+ * try_get_mem_cgroup_from_page - look up page's memcg association
+ * @page: the page
+ *
+ * Look up, get a css reference, and return the memcg that owns @page.
+ *
+ * The page must be locked to prevent racing with swap-in and page
+ * cache charges.  If coming from an unlocked page table, the caller
+ * must ensure the page is on the LRU or this can race with charging.
+ */
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
 	struct mem_cgroup *memcg = NULL;
@@ -2873,7 +2946,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		if (memcg && !css_tryget(&memcg->css))
@@ -2887,23 +2959,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 			memcg = NULL;
 		rcu_read_unlock();
 	}
-	unlock_page_cgroup(pc);
 	return memcg;
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
-				       struct page *page,
-				       unsigned int nr_pages,
-				       enum charge_type ctype,
-				       bool lrucare)
+static void lock_page_lru(struct page *page, int *isolated)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	if (PageLRU(page)) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		ClearPageLRU(page);
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		*isolated = 1;
+	} else
+		*isolated = 0;
+}
+
+static void unlock_page_lru(struct page *page, int isolated)
+{
+	struct zone *zone = page_zone(page);
+
+	if (isolated) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+		SetPageLRU(page);
+		add_page_to_lru_list(page, lruvec, page_lru(page));
+	}
+	spin_unlock_irq(&zone->lru_lock);
+}
+
+static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+			  unsigned int nr_pages, bool lrucare)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
-	struct zone *uninitialized_var(zone);
-	struct lruvec *lruvec;
-	bool was_on_lru = false;
-	bool anon;
+	int isolated;
 
-	lock_page_cgroup(pc);
 	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
 	/*
 	 * we don't need page_cgroup_lock about tail pages, becase they are not
@@ -2914,348 +3009,226 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
 	 * may already be on some other mem_cgroup's LRU.  Take care of it.
 	 */
-	if (lrucare) {
-		zone = page_zone(page);
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page)) {
-			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
-			ClearPageLRU(page);
-			del_page_from_lru_list(page, lruvec, page_lru(page));
-			was_on_lru = true;
-		}
-	}
+	if (lrucare)
+		lock_page_lru(page, &isolated);
 
-	pc->mem_cgroup = memcg;
 	/*
-	 * We access a page_cgroup asynchronously without lock_page_cgroup().
-	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
-	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
-	 * before USED bit, we need memory barrier here.
-	 * See mem_cgroup_add_lru_list(), etc.
- 	 */
-	smp_wmb();
-	SetPageCgroupUsed(pc);
-
-	if (lrucare) {
-		if (was_on_lru) {
-			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
-			VM_BUG_ON_PAGE(PageLRU(page), page);
-			SetPageLRU(page);
-			add_page_to_lru_list(page, lruvec, page_lru(page));
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
-		anon = true;
-	else
-		anon = false;
+	 * Nobody should be changing or seriously looking at
+	 * pc->mem_cgroup and pc->flags at this point:
+	 *
+	 * - the page is uncharged
+	 *
+	 * - the page is off-LRU
+	 *
+	 * - an anonymous fault has exclusive page access, except for
+	 *   a locked page table
+	 *
+	 * - a page cache insertion, a swapin fault, or a migration
+	 *   have the page locked
+	 */
+	pc->mem_cgroup = memcg;
+	pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
 
-	mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
-	unlock_page_cgroup(pc);
+	if (lrucare)
+		unlock_page_lru(page, isolated);
 
+	local_irq_disable();
+	mem_cgroup_charge_statistics(memcg, page, nr_pages);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
+	local_irq_enable();
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
-{
-	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
-		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
-}
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
-	struct kmem_cache *cachep;
 
-	VM_BUG_ON(p->is_root_cache);
-	cachep = p->root_cache;
-	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
-}
+static DEFINE_MUTEX(activate_kmem_mutex);
 
 #ifdef CONFIG_SLABINFO
 static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
 					struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-	struct memcg_cache_params *params;
-
-	if (!memcg_can_account_kmem(memcg))
-		return -EIO;
+	loff_t pos = 0;
+	void *p;
 
-	print_slabinfo_header(m);
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
-		cache_show(memcg_params_to_cache(params), m);
-	mutex_unlock(&memcg->slab_caches_mutex);
+	for (p = slab_start(m, &pos); p; p = slab_next(m, p, &pos))
+		memcg_slab_show(memcg, m, p);
+	slab_stop(m, p);
 
 	return 0;
 }
 #endif
 
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 			     unsigned long nr_pages)
 {
-	struct page_counter *counter;
-	struct mem_cgroup *_memcg;
 	int ret = 0;
-	bool may_oom;
-
-	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * Conditions under which we can wait for the oom_killer. Those are
-	 * the same conditions tested by the core page allocator
-	 */
-	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
-	_memcg = memcg;
-	ret = __mem_cgroup_try_charge(NULL, gfp, nr_pages, &_memcg, may_oom);
 
+	ret = try_charge(memcg, gfp, true, nr_pages);
 	if (ret == -EINTR)  {
 		/*
-		 * __mem_cgroup_try_charge() chosed to bypass to root due to
-		 * OOM kill or fatal signal.  Since our only options are to
-		 * either fail the allocation or charge it to this cgroup, do
-		 * it as a temporary condition. But we can't fail. From a
-		 * kmem/slab perspective, the cache has already been selected,
-		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
+		 * try_charge() chose to bypass to root due to OOM kill or
+		 * fatal signal.  Since our only options are to either fail
+		 * the allocation or charge it to this cgroup, do it as a
+		 * temporary condition. But we can't fail. From a kmem/slab
+		 * perspective, the cache has already been selected, by
+		 * mem_cgroup_kmem_get_cache(), so it is too late to change
 		 * our minds.
 		 *
 		 * This condition will only trigger if the task entered
-		 * memcg_charge_kmem in a sane state, but was OOM-killed during
-		 * __mem_cgroup_try_charge() above. Tasks that were already
-		 * dying when the allocation triggers should have been already
+		 * memcg_charge_kmem in a sane state, but was OOM-killed
+		 * during try_charge() above. Tasks that were already dying
+		 * when the allocation triggers should have been already
 		 * directed to the root cgroup in memcontrol.h
 		 */
 		page_counter_charge(&memcg->memory, nr_pages);
 		if (do_swap_account)
 			page_counter_charge(&memcg->memsw, nr_pages);
+		page_counter_charge(&memcg->kmem, nr_pages);
+
 		ret = 0;
-	} else if (ret)
-		page_counter_uncharge(&memcg->kmem, nr_pages);
+	}
 
 	return ret;
 }
 
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
-				unsigned long nr_pages)
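+/*
+ * Force-charge @nr_pages to @memcg's memory, memsw and kmem counters,
+ * bypassing the limit checks; the counters may go over their limits.
+ */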
+void memcg_charge_kmem_nofail(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
-	page_counter_uncharge(&memcg->memory, nr_pages);
+	page_counter_charge(&memcg->memory, nr_pages);
 	if (do_swap_account)
-		page_counter_uncharge(&memcg->memsw, nr_pages);
+		page_counter_charge(&memcg->memsw, nr_pages);
 
-	/* Not down to 0 */
-	if (page_counter_uncharge(&memcg->kmem, nr_pages))
-		return;
+	/* kmem must be charged after res - see memcg_charge_kmem() */
+	page_counter_charge(&memcg->kmem, nr_pages);
+}
 
-	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
-}
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
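+/*
+ * Uncharge @nr_pages from @memcg's kmem, memory and memsw counters and,
+ * once kmem usage drops to zero, release the reference that keeps a
+ * dead memcg alive.
+ */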
+void memcg_uncharge_kmem(struct mem_cgroup *memcg,
+				unsigned long nr_pages)
 {
-	if (!memcg)
-		return;
+	u64 kmem;
 
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-	mutex_unlock(&memcg->slab_caches_mutex);
-}
+	kmem = page_counter_uncharge(&memcg->kmem, nr_pages);
 
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-	return memcg ? memcg->kmemcg_id : -1;
-}
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_swap_account)
+		page_counter_uncharge(&memcg->memsw, nr_pages);
 
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
-	int num, ret;
+	/* Not down to 0 */
+	if (kmem)
+		return;
 
-	num = ida_simple_get(&kmem_limited_groups,
-				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-	if (num < 0)
-		return num;
 	/*
-	 * After this point, kmem_accounted (that we test atomically in
-	 * the beginning of this conditional), is no longer 0. This
-	 * guarantees only one process will set the following boolean
-	 * to true. We don't need test_and_set because we're protected
-	 * by the set_limit_mutex anyway.
+	 * Releases a reference taken in memcg_deactivate_kmem in case
+	 * this last uncharge is racing with the offlining code or it is
+	 * outliving the memcg existence.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
 	 */
-	memcg_kmem_set_activated(memcg);
+	if (memcg_kmem_test_and_clear_dead(memcg))
+		css_put(&memcg->css);
+}
 
-	ret = memcg_update_all_caches(num+1);
-	if (ret) {
-		ida_simple_remove(&kmem_limited_groups, num);
-		memcg_kmem_clear_activated(memcg);
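+/*
+ * Charge @nr_pages of slab memory to the memcg that owns cache @s and
+ * account it as reclaimable or unreclaimable slab, depending on whether
+ * the cache has SLAB_RECLAIM_ACCOUNT set.
+ */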
+int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages)
+{
+	struct mem_cgroup *memcg;
+	int idx;
+	int ret;
+
+	VM_BUG_ON(is_root_cache(s));
+	memcg = s->memcg_params.memcg;
+
+	ret = memcg_charge_kmem(memcg, gfp, nr_pages);
+	if (ret)
 		return ret;
+	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
+		page_counter_charge(&memcg->dcache, nr_pages);
+		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
+		percpu_counter_add(&memcg->stat2.counters[idx], nr_pages);
+	} else {
+		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
+		this_cpu_add(memcg->stat->count[idx], nr_pages);
 	}
-
-	memcg->kmemcg_id = num;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-	mutex_init(&memcg->slab_caches_mutex);
 	return 0;
 }
 
-static size_t memcg_caches_array_size(int num_groups)
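+/*
+ * Undo __memcg_charge_slab(): uncharge @nr_pages from the owning memcg
+ * and drop the corresponding slab statistics.
+ */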
+void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages)
 {
-	ssize_t size;
-	if (num_groups <= 0)
-		return 0;
+	struct mem_cgroup *memcg;
+	int idx;
 
-	size = 2 * num_groups;
-	if (size < MEMCG_CACHES_MIN_SIZE)
-		size = MEMCG_CACHES_MIN_SIZE;
-	else if (size > MEMCG_CACHES_MAX_SIZE)
-		size = MEMCG_CACHES_MAX_SIZE;
+	VM_BUG_ON(is_root_cache(s));
+	memcg = s->memcg_params.memcg;
 
-	return size;
+	memcg_uncharge_kmem(memcg, nr_pages);
+	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
+		page_counter_uncharge(&memcg->dcache, nr_pages);
+		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
+		percpu_counter_sub(&memcg->stat2.counters[idx], nr_pages);
+	} else {
+		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
+		this_cpu_sub(memcg->stat->count[idx], nr_pages);
+	}
 }
 
 /*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
+ * helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
  */
-void memcg_update_array_size(int num)
+int memcg_cache_id(struct mem_cgroup *memcg)
 {
-	if (num > memcg_limited_groups_array_size)
-		memcg_limited_groups_array_size = memcg_caches_array_size(num);
+	return memcg ? memcg->kmemcg_id : -1;
 }
 
-static void kmem_cache_destroy_work_func(struct work_struct *w);
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
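+/*
+ * Allocate an id for a new kmem-active memcg.  If the id does not fit
+ * into the existing memcg_caches arrays, grow them (and the per-memcg
+ * list_lru arrays) first.
+ */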
+static int memcg_alloc_cache_id(void)
 {
-	struct memcg_cache_params *cur_params = s->memcg_params;
+	int id, size;
+	int err;
 
-	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+	id = ida_simple_get(&memcg_cache_ida,
+			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (id < 0)
+		return id;
 
-	if (num_groups > memcg_limited_groups_array_size) {
-		int i;
-		ssize_t size = memcg_caches_array_size(num_groups);
+	if (id < memcg_nr_cache_ids)
+		return id;
 
-		size *= sizeof(void *);
-		size += sizeof(struct memcg_cache_params);
+	/*
+	 * There's no space for the new id in memcg_caches arrays,
+	 * so we have to grow them.
+	 */
+	down_write(&memcg_cache_ids_sem);
 
-		s->memcg_params = kzalloc(size, GFP_KERNEL);
-		if (!s->memcg_params) {
-			s->memcg_params = cur_params;
-			return -ENOMEM;
-		}
+	size = 2 * (id + 1);
+	if (size < MEMCG_CACHES_MIN_SIZE)
+		size = MEMCG_CACHES_MIN_SIZE;
+	else if (size > MEMCG_CACHES_MAX_SIZE)
+		size = MEMCG_CACHES_MAX_SIZE;
 
-		s->memcg_params->is_root_cache = true;
+	err = memcg_update_all_caches(size);
+	if (!err)
+		err = memcg_update_all_list_lrus(size);
+	if (!err)
+		memcg_nr_cache_ids = size;
 
-		/*
-		 * There is the chance it will be bigger than
-		 * memcg_limited_groups_array_size, if we failed an allocation
-		 * in a cache, in which case all caches updated before it, will
-		 * have a bigger array.
-		 *
-		 * But if that is the case, the data after
-		 * memcg_limited_groups_array_size is certainly unused
-		 */
-		for (i = 0; i < memcg_limited_groups_array_size; i++) {
-			if (!cur_params->memcg_caches[i])
-				continue;
-			s->memcg_params->memcg_caches[i] =
-						cur_params->memcg_caches[i];
-		}
+	up_write(&memcg_cache_ids_sem);
 
-		/*
-		 * Ideally, we would wait until all caches succeed, and only
-		 * then free the old one. But this is not worth the extra
-		 * pointer per-cache we'd have to have for this.
-		 *
-		 * It is not a big deal if some caches are left with a size
-		 * bigger than the others. And all updates will reset this
-		 * anyway.
-		 */
-		kfree(cur_params);
+	if (err) {
+		ida_simple_remove(&memcg_cache_ida, id);
+		return err;
 	}
-	return 0;
-}
-
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache)
-{
-	size_t size = sizeof(struct memcg_cache_params);
-
-	if (!memcg_kmem_enabled())
-		return 0;
-
-	if (!memcg)
-		size += memcg_limited_groups_array_size * sizeof(void *);
-
-	s->memcg_params = kzalloc(size, GFP_KERNEL);
-	if (!s->memcg_params)
-		return -ENOMEM;
-
-	if (memcg) {
-		s->memcg_params->memcg = memcg;
-		s->memcg_params->root_cache = root_cache;
-		INIT_WORK(&s->memcg_params->destroy,
-				kmem_cache_destroy_work_func);
-	} else
-		s->memcg_params->is_root_cache = true;
-
-	return 0;
+	return id;
 }
 
-void memcg_release_cache(struct kmem_cache *s)
+static void memcg_free_cache_id(int id)
 {
-	struct kmem_cache *root;
-	struct mem_cgroup *memcg;
-	int id;
-
-	/*
-	 * This happens, for instance, when a root cache goes away before we
-	 * add any memcg.
-	 */
-	if (!s->memcg_params)
-		return;
-
-	if (s->memcg_params->is_root_cache)
-		goto out;
-
-	memcg = s->memcg_params->memcg;
-	id  = memcg_cache_id(memcg);
-
-	root = s->memcg_params->root_cache;
-	root->memcg_params->memcg_caches[id] = NULL;
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_del(&s->memcg_params->list);
-	mutex_unlock(&memcg->slab_caches_mutex);
-
-	mem_cgroup_put(memcg);
-out:
-	kfree(s->memcg_params);
+	ida_simple_remove(&memcg_cache_ida, id);
 }
 
 /*
@@ -3277,274 +3250,56 @@ void memcg_release_cache(struct kmem_cache *s)
  * memcg_kmem_skip_account. So we enclose anything that might allocate memory
  * inside the following two functions.
  */
-static inline void memcg_stop_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account--;
-}
-
-static void kmem_cache_destroy_work_func(struct work_struct *w)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_params *p;
-
-	p = container_of(w, struct memcg_cache_params, destroy);
-
-	cachep = memcg_params_to_cache(p);
-
-	/*
-	 * If we get down to 0 after shrink, we could delete right away.
-	 * However, memcg_release_pages() already puts us back in the workqueue
-	 * in that case. If we proceed deleting, we'll get a dangling
-	 * reference, and removing the object from the workqueue in that case
-	 * is unnecessary complication. We are not a fast path.
-	 *
-	 * Note that this case is fundamentally different from racing with
-	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
-	 * kmem_cache_shrink, not only we would be reinserting a dead cache
-	 * into the queue, but doing so from inside the worker racing to
-	 * destroy it.
-	 *
-	 * So if we aren't down to zero, we'll just schedule a worker and try
-	 * again
-	 */
-	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			return;
-	} else
-		kmem_cache_destroy(cachep);
-}
-
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
-{
-	if (!cachep->memcg_params->dead)
-		return;
-
-	/*
-	 * There are many ways in which we can get here.
-	 *
-	 * We can get to a memory-pressure situation while the delayed work is
-	 * still pending to run. The vmscan shrinkers can then release all
-	 * cache memory and get us to destruction. If this is the case, we'll
-	 * be executed twice, which is a bug (the second time will execute over
-	 * bogus data). In this case, cancelling the work should be fine.
-	 *
-	 * But we can also get here from the worker itself, if
-	 * kmem_cache_shrink is enough to shake all the remaining objects and
-	 * get the page count to 0. In this case, we'll deadlock if we try to
-	 * cancel the work (the worker runs with an internal lock held, which
-	 * is the same lock we would hold for cancel_work_sync().)
-	 *
-	 * Since we can't possibly know who got us here, just refrain from
-	 * running if there is already work pending
-	 */
-	if (work_pending(&cachep->memcg_params->destroy))
-		return;
-	/*
-	 * We have to defer the actual destroying to a workqueue, because
-	 * we might currently be in a context that cannot sleep.
-	 */
-	schedule_work(&cachep->memcg_params->destroy);
-}
-
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-					 struct kmem_cache *s)
-{
-	struct kmem_cache *new;
-	static char *tmp_name = NULL;
-
-	lockdep_assert_held(&memcg_cache_mutex);
-
-	/*
-	 * kmem_cache_create_memcg duplicates the given name and
-	 * cgroup_name for this name requires RCU context.
-	 * This static temporary buffer is used to prevent from
-	 * pointless shortliving allocation.
-	 */
-	if (!tmp_name) {
-		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
-		if (!tmp_name)
-			return NULL;
-	}
-
-	rcu_read_lock();
-	snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
-			 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
-	rcu_read_unlock();
-
-	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
-				      (s->flags & ~SLAB_PANIC), s->ctor, s);
-
-	if (new)
-		new->allocflags |= __GFP_KMEMCG;
-
-	return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
-						  struct kmem_cache *cachep)
-{
-	struct kmem_cache *new_cachep;
-	int idx;
-
-	BUG_ON(!memcg_can_account_kmem(memcg));
-
-	idx = memcg_cache_id(memcg);
-
-	mutex_lock(&memcg_cache_mutex);
-	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
-		goto out;
-
-	new_cachep = kmem_cache_dup(memcg, cachep);
-	if (new_cachep == NULL) {
-		new_cachep = cachep;
-		goto out;
-	}
-
-	mem_cgroup_get(memcg);
-	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
-	cachep->memcg_params->memcg_caches[idx] = new_cachep;
-	/*
-	 * the readers won't lock, make sure everybody sees the updated value,
-	 * so they won't put stuff in the queue again for no reason
-	 */
-	wmb();
-out:
-	mutex_unlock(&memcg_cache_mutex);
-	return new_cachep;
-}
 
 static DEFINE_MUTEX(memcg_limit_mutex);
 
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
-{
-	struct kmem_cache *c;
-	int i;
-
-	if (!s->memcg_params)
-		return;
-	if (!s->memcg_params->is_root_cache)
-		return;
-
-	/*
-	 * If the cache is being destroyed, we trust that there is no one else
-	 * requesting objects from it. Even if there are, the sanity checks in
-	 * kmem_cache_destroy should caught this ill-case.
-	 *
-	 * Still, we don't want anyone else freeing memcg_caches under our
-	 * noses, which can happen if a new memcg comes to life. As usual,
-	 * we'll take the memcg_limit_mutex to protect ourselves against this.
-	 */
-	mutex_lock(&memcg_limit_mutex);
-	for (i = 0; i < memcg_limited_groups_array_size; i++) {
-		c = s->memcg_params->memcg_caches[i];
-		if (!c)
-			continue;
-
-		/*
-		 * We will now manually delete the caches, so to avoid races
-		 * we need to cancel all pending destruction workers and
-		 * proceed with destruction ourselves.
-		 *
-		 * kmem_cache_destroy() will call kmem_cache_shrink internally,
-		 * and that could spawn the workers again: it is likely that
-		 * the cache still have active pages until this very moment.
-		 * This would lead us back to mem_cgroup_destroy_cache.
-		 *
-		 * But that will not execute at all if the "dead" flag is not
-		 * set, so flip it down to guarantee we are in control.
-		 */
-		c->memcg_params->dead = false;
-		cancel_work_sync(&c->memcg_params->destroy);
-		kmem_cache_destroy(c);
-	}
-	mutex_unlock(&memcg_limit_mutex);
-}
-
-struct create_work {
+struct memcg_kmem_cache_create_work {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *cachep;
 	struct work_struct work;
 };
 
-static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
-	struct kmem_cache *cachep;
-	struct memcg_cache_params *params;
+	struct memcg_kmem_cache_create_work *cw =
+		container_of(w, struct memcg_kmem_cache_create_work, work);
+	struct mem_cgroup *memcg = cw->memcg;
+	struct kmem_cache *cachep = cw->cachep;
 
-	if (!memcg_kmem_is_active(memcg))
-		return;
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
-		cachep = memcg_params_to_cache(params);
-		cachep->memcg_params->dead = true;
-		schedule_work(&cachep->memcg_params->destroy);
-	}
-	mutex_unlock(&memcg->slab_caches_mutex);
-}
-
-static void memcg_create_cache_work_func(struct work_struct *w)
-{
-	struct create_work *cw;
+	memcg_create_kmem_cache(memcg, cachep);
 
-	cw = container_of(w, struct create_work, work);
-	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
+	css_put(&memcg->css);
 	kfree(cw);
 }
 
 /*
  * Enqueue the creation of a per-memcg kmem_cache.
  */
-static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
-					 struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+					       struct kmem_cache *cachep)
 {
-	struct create_work *cw;
+	struct memcg_kmem_cache_create_work *cw;
 
-	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
+	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
-	INIT_WORK(&cw->work, memcg_create_cache_work_func);
 	schedule_work(&cw->work);
 }
 
-static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
-				       struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+					     struct kmem_cache *cachep)
 {
 	/*
 	 * We need to stop accounting when we kmalloc, because if the
 	 * corresponding kmalloc cache is not yet created, the first allocation
-	 * in __memcg_create_cache_enqueue will recurse.
+	 * in __memcg_schedule_kmem_cache_create will recurse.
 	 *
 	 * However, it is better to enclose the whole function. Depending on
 	 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -3553,9 +3308,10 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
 	memcg_stop_kmem_account();
-	__memcg_create_cache_enqueue(memcg, cachep);
+	__memcg_schedule_kmem_cache_create(memcg, cachep);
 	memcg_resume_kmem_account();
 }
+
 /*
  * Return the kmem_cache we're supposed to use for a slab allocation.
  * We try to use the current memcg's version of the cache.
@@ -3573,36 +3329,27 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 					  gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
-	int idx;
+	struct kmem_cache *memcg_cachep;
 
-	VM_BUG_ON(!cachep->memcg_params);
-	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+	VM_BUG_ON(!is_root_cache(cachep));
 
-	if (!current->mm || current->memcg_kmem_skip_account)
-		return cachep;
+	if (cachep->flags & SLAB_ACCOUNT)
+		gfp |= __GFP_ACCOUNT;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+	if (!(gfp & __GFP_ACCOUNT))
+		return cachep;
 
-	if (!memcg_can_account_kmem(memcg))
-		goto out;
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return cachep;
 
-	idx = memcg_cache_id(memcg);
+	memcg = get_mem_cgroup_from_mm(current->mm);
 
-	/*
-	 * barrier to mare sure we're always seeing the up to date value.  The
-	 * code updating memcg_caches will issue a write barrier to match this.
-	 */
-	read_barrier_depends();
-	if (likely(cachep->memcg_params->memcg_caches[idx])) {
-		cachep = cachep->memcg_params->memcg_caches[idx];
+	if (!memcg_kmem_is_active(memcg))
 		goto out;
-	}
 
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -3612,23 +3359,23 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 *
 	 * However, there are some clashes that can arrive from locking.
 	 * For instance, because we acquire the slab_mutex while doing
-	 * kmem_cache_dup, this means no further allocation could happen
-	 * with the slab_mutex held.
-	 *
-	 * Also, because cache creation issue get_online_cpus(), this
-	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
-	 * that ends up reversed during cpu hotplug. (cpuset allocates
-	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
-	 * better to defer everything.
+	 * memcg_create_kmem_cache, this means no further allocation
+	 * could happen with the slab_mutex held. So it's better to
+	 * defer everything.
 	 */
-	memcg_create_cache_enqueue(memcg, cachep);
-	return cachep;
+	memcg_schedule_kmem_cache_create(memcg, cachep);
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 EXPORT_SYMBOL(__memcg_kmem_get_cache);
 
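+/*
+ * Drop the memcg css reference taken by __memcg_kmem_get_cache() when it
+ * returned a per-memcg cache.
+ */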
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params.memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -3644,53 +3391,61 @@ EXPORT_SYMBOL(__memcg_kmem_get_cache);
  * Returning true means the allocation is possible.
  */
 bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+__memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
+	struct page_cgroup *pc;
 	struct mem_cgroup *memcg;
 	int ret;
 
-	*_memcg = NULL;
-	memcg = try_get_mem_cgroup_from_mm(current->mm);
-
 	/*
-	 * very rare case described in mem_cgroup_from_task. Unfortunately there
-	 * isn't much we can do without complicating this too much, and it would
-	 * be gfp-dependent anyway. Just let it go
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * check here, since direct calls to the page allocator that are
+	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+	 * outside memcg core. We are mostly concerned with cache allocations,
+	 * and by having this test at memcg_kmem_get_cache, we are already able
+	 * to relay the allocation to the root cache and bypass the memcg cache
+	 * altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather services large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * 	memcg_stop_kmem_account();
+	 * 	kmalloc(<large_number>)
+	 * 	memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
 	 */
-	if (unlikely(!memcg))
+	if (!current->mm || current->memcg_kmem_skip_account)
 		return true;
 
-	if (!memcg_can_account_kmem(memcg)) {
+	memcg = get_mem_cgroup_from_mm(current->mm);
+
+	if (!memcg_kmem_is_active(memcg)) {
 		css_put(&memcg->css);
 		return true;
 	}
 
 	ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-	if (!ret)
-		*_memcg = memcg;
-
-	css_put(&memcg->css);
-	return (ret == 0);
-}
-
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-			      int order)
-{
-	struct page_cgroup *pc;
-
-	VM_BUG_ON(mem_cgroup_is_root(memcg));
-
-	/* The page allocation failed. Revert */
-	if (!page) {
-		memcg_uncharge_kmem(memcg, 1 << order);
-		return;
+	if (ret) {
+		css_put(&memcg->css);
+		return false;
 	}
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
 	pc->mem_cgroup = memcg;
-	SetPageCgroupUsed(pc);
-	unlock_page_cgroup(pc);
+	pc->flags = PCG_USED;
+
+	__SetPageKmemcg(page);
+
+	css_put(&memcg->css);
+	return true;
 }
 
 void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -3707,12 +3462,10 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	if (!PageCgroupUsed(pc))
 		return;
 
-	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
-		ClearPageCgroupUsed(pc);
+		pc->flags = 0;
 	}
-	unlock_page_cgroup(pc);
 
 	/*
 	 * We trust that only if there is a memcg associated with the page, it
@@ -3723,16 +3476,34 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 	memcg_uncharge_kmem(memcg, 1 << order);
+
+	__ClearPageKmemcg(page);
 }
-#else
-static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+
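+/*
+ * Find the memcg a kmem-accounted object is charged to: through its slab
+ * cache for slab objects, through the page_cgroup for objects that came
+ * straight from the page allocator.  Returns NULL if it is not charged.
+ */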
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
 {
+	struct mem_cgroup *memcg = NULL;
+	struct page_cgroup *pc;
+	struct kmem_cache *cachep;
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+	if (PageSlab(page)) {
+		cachep = page->slab_cache;
+		if (!is_root_cache(cachep))
+			memcg = cachep->memcg_params.memcg;
+	} else {
+		pc = lookup_page_cgroup(page);
+		if (PageCgroupUsed(pc))
+			memcg = pc->mem_cgroup;
+	}
+
+	return memcg;
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -3753,8 +3524,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		pc = head_pc + i;
 		pc->mem_cgroup = memcg;
-		smp_wmb();/* see __commit_charge() */
-		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+		pc->flags = head_pc->flags;
 	}
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 		       HPAGE_PMD_NR);
@@ -3784,7 +3554,6 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
-	bool anon = PageAnon(page);
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -3798,35 +3567,46 @@ static int mem_cgroup_move_account(struct page *page,
 	if (nr_pages > 1 && !PageTransHuge(page))
 		goto out;
 
-	lock_page_cgroup(pc);
-
+	/*
+	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+	 * of its source page while we change it: page migration takes
+	 * both pages off the LRU, but page cache replacement doesn't.
+	 */
+	if (!trylock_page(page))
+		goto out;
+
 	ret = -EINVAL;
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
-		goto unlock;
+		goto out_unlock;
 
 	move_lock_mem_cgroup(from, &flags);
 
-	if (!anon && page_mapped(page)) {
+	if (!PageAnon(page) && page_mapped(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+
+	/*
+	 * It is safe to change pc->mem_cgroup here because the page
+	 * is referenced, charged, and isolated - we can't race with
+	 * uncharging, charging, migration, or LRU putback.
+	 */
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, page, anon, nr_pages);
-	move_unlock_mem_cgroup(from, &flags);
+	spin_unlock(&from->move_lock);
 	ret = 0;
-unlock:
-	unlock_page_cgroup(pc);
-	/*
-	 * check events
-	 */
+
+	mem_cgroup_charge_statistics(to, page, nr_pages);
 	memcg_check_events(to, page);
+	mem_cgroup_charge_statistics(from, page, -nr_pages);
 	memcg_check_events(from, page);
+	local_irq_restore(flags);
+out_unlock:
+	unlock_page(page);
 out:
 	return ret;
 }
@@ -3897,681 +3677,66 @@ static int mem_cgroup_move_parent(struct page *page,
 	putback_lru_page(page);
 put:
 	put_page(page);
-out:
-	return ret;
-}
-
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	bool oom = true;
-	int ret;
-
-	if (PageTransHuge(page)) {
-		nr_pages <<= compound_order(page);
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		/*
-		 * Never OOM-kill a process for a huge page.  The
-		 * fault handler will fall back to regular pages.
-		 */
-		oom = false;
-	}
-
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
-	if (ret == -ENOMEM)
-		return ret;
-	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
-	return 0;
-}
-
-int mem_cgroup_newpage_charge(struct page *page,
-			      struct mm_struct *mm, gfp_t gfp_mask)
-{
-	if (mem_cgroup_disabled())
-		return 0;
-	VM_BUG_ON_PAGE(page_mapped(page), page);
-	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-	VM_BUG_ON(!mm);
-	return mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-/*
- * While swap-in, try_charge -> commit or cancel, the page is locked.
- * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is acquired. This refcnt will be consumed by
- * "commit()" or removed by "cancel()"
- */
-static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-					  struct page *page,
-					  gfp_t mask,
-					  struct mem_cgroup **memcgp)
-{
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
-	int ret;
-
-	pc = lookup_page_cgroup(page);
-	/*
-	 * Every swap fault against a single page tries to charge the
-	 * page, bail as early as possible.  shmem_unuse() encounters
-	 * already charged pages, too.  The USED bit is protected by
-	 * the page lock, which serializes swap cache removal, which
-	 * in turn serializes uncharging.
-	 */
-	if (PageCgroupUsed(pc))
-		return 0;
-	if (!do_swap_account)
-		goto charge_cur_mm;
-	memcg = try_get_mem_cgroup_from_page(page);
-	if (!memcg)
-		goto charge_cur_mm;
-	*memcgp = memcg;
-	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
-	css_put(&memcg->css);
-	if (ret == -EINTR)
-		ret = 0;
-	return ret;
-charge_cur_mm:
-	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
-	if (ret == -EINTR)
-		ret = 0;
-	return ret;
-}
-
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
-				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
-{
-	*memcgp = NULL;
-	if (mem_cgroup_disabled())
-		return 0;
-	/*
-	 * A racing thread's fault, or swapoff, may have already
-	 * updated the pte, and even removed page from swap cache: in
-	 * those cases unuse_pte()'s pte_same() test will fail; but
-	 * there's also a KSM case which does need to charge the page.
-	 */
-	if (!PageSwapCache(page)) {
-		int ret;
-
-		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
-		if (ret == -EINTR)
-			ret = 0;
-		return ret;
-	}
-	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
-}
-
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
-{
-	if (mem_cgroup_disabled())
-		return;
-	if (!memcg)
-		return;
-	__mem_cgroup_cancel_charge(memcg, 1);
-}
-
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
-					enum charge_type ctype)
-{
-	if (mem_cgroup_disabled())
-		return;
-	if (!memcg)
-		return;
-
-	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
-	/*
-	 * Now swap is on-memory. This means this page may be
-	 * counted both as mem and swap....double count.
-	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
-	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
-	 * may call delete_from_swap_cache() before reach here.
-	 */
-	if (do_swap_account && PageSwapCache(page)) {
-		swp_entry_t ent = {.val = page_private(page)};
-		mem_cgroup_uncharge_swap(ent);
-	}
-}
-
-void mem_cgroup_commit_charge_swapin(struct page *page,
-				     struct mem_cgroup *memcg)
-{
-	__mem_cgroup_commit_charge_swapin(page, memcg,
-					  MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask)
-{
-	struct mem_cgroup *memcg = NULL;
-	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-	int ret;
-
-	if (mem_cgroup_disabled())
-		return 0;
-	if (PageCompound(page))
-		return 0;
-
-	if (!PageSwapCache(page))
-		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
-	else { /* page is swapcache/shmem */
-		ret = __mem_cgroup_try_charge_swapin(mm, page,
-						     gfp_mask, &memcg);
-		if (!ret)
-			__mem_cgroup_commit_charge_swapin(page, memcg, type);
-	}
-	return ret;
-}
-
-static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
-				   unsigned int nr_pages,
-				   const enum charge_type ctype)
-{
-	struct memcg_batch_info *batch = NULL;
-	bool uncharge_memsw = true;
-
-	/* If swapout, usage of swap doesn't decrease */
-	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		uncharge_memsw = false;
-
-	batch = &current->memcg_batch;
-	/*
-	 * In usual, we do css_get() when we remember memcg pointer.
-	 * But in this case, we keep res->usage until end of a series of
-	 * uncharges. Then, it's ok to ignore memcg's refcnt.
-	 */
-	if (!batch->memcg)
-		batch->memcg = memcg;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continuously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-
-	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
-
-	if (nr_pages > 1)
-		goto direct_uncharge;
-
-	/*
-	 * In typical case, batch->memcg == mem. This means we can
-	 * merge a series of uncharges to an uncharge of page_counter.
-	 * If not, we uncharge page_counter ony by one.
-	 */
-	if (batch->memcg != memcg)
-		goto direct_uncharge;
-	/* remember freed charge and uncharge it later */
-	batch->nr_pages++;
-	if (uncharge_memsw)
-		batch->memsw_nr_pages++;
-	return;
-direct_uncharge:
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (uncharge_memsw)
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-	if (unlikely(batch->memcg != memcg))
-		memcg_oom_recover(memcg);
-}
-
-/*
- * uncharge if !page_mapped(page)
- */
-static struct mem_cgroup *
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
-			     bool end_migration)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	struct page_cgroup *pc;
-	bool anon;
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	if (PageTransHuge(page)) {
-		nr_pages <<= compound_order(page);
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-	}
-	/*
-	 * Check if our page_cgroup is valid
-	 */
-	pc = lookup_page_cgroup(page);
-	if (unlikely(!PageCgroupUsed(pc)))
-		return NULL;
-
-	lock_page_cgroup(pc);
-
-	memcg = pc->mem_cgroup;
-
-	if (!PageCgroupUsed(pc))
-		goto unlock_out;
-
-	anon = PageAnon(page);
-
-	switch (ctype) {
-	case MEM_CGROUP_CHARGE_TYPE_ANON:
-		/*
-		 * Generally PageAnon tells if it's the anon statistics to be
-		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
-		 * used before page reached the stage of being marked PageAnon.
-		 */
-		anon = true;
-		/* fallthrough */
-	case MEM_CGROUP_CHARGE_TYPE_DROP:
-		/* See mem_cgroup_prepare_migration() */
-		if (page_mapped(page))
-			goto unlock_out;
-		/*
-		 * Pages under migration may not be uncharged.  But
-		 * end_migration() /must/ be the one uncharging the
-		 * unused post-migration page and so it has to call
-		 * here with the migration bit still set.  See the
-		 * page_counter handling below.
-		 */
-		if (!end_migration && PageCgroupMigration(pc))
-			goto unlock_out;
-		break;
-	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
-		if (!PageAnon(page)) {	/* Shared memory */
-			if (page->mapping && !page_is_file_cache(page))
-				goto unlock_out;
-		} else if (page_mapped(page)) /* Anon */
-				goto unlock_out;
-		break;
-	default:
-		break;
-	}
-
-	mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
-
-	ClearPageCgroupUsed(pc);
-	/*
-	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
-	 * freed from LRU. This is safe because uncharged page is expected not
-	 * to be reused (freed soon). Exception is SwapCache, it's handled by
-	 * special functions.
-	 */
-
-	unlock_page_cgroup(pc);
-	/*
-	 * even after unlock, we have memcg->memory.usage here and this memcg
-	 * will never be freed.
-	 */
-	memcg_check_events(memcg, page);
-	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
-		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
-	}
-	/*
-	 * Migration does not charge the page_counter for the
-	 * replacement page, so leave it alone when phasing out the
-	 * page that is unused after the migration.
-	 */
-	if (!end_migration && !mem_cgroup_is_root(memcg))
-		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
-
-	return memcg;
-
-unlock_out:
-	unlock_page_cgroup(pc);
-	return NULL;
-}
-
-void mem_cgroup_uncharge_page(struct page *page)
-{
-	/* early check. */
-	if (page_mapped(page))
-		return;
-	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-	/*
-	 * If the page is in swap cache, uncharge should be deferred
-	 * to the swap path, which also properly accounts swap usage
-	 * and handles memcg lifetime.
-	 *
-	 * Note that this check is not stable and reclaim may add the
-	 * page to swap cache at any time after this.  However, if the
-	 * page is not in swap cache by the time page->mapcount hits
-	 * 0, there won't be any page table references to the swap
-	 * slot, and reclaim will free it and not actually write the
-	 * page to disk.
-	 */
-	if (PageSwapCache(page))
-		return;
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
-}
-
-void mem_cgroup_uncharge_cache_page(struct page *page)
-{
-	VM_BUG_ON_PAGE(page_mapped(page), page);
-	VM_BUG_ON_PAGE(page->mapping, page);
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
-}
-
-/*
- * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
- * In that cases, pages are freed continuously and we can expect pages
- * are in the same memcg. All these calls itself limits the number of
- * pages freed at once, then uncharge_start/end() is called properly.
- * This may be called prural(2) times in a context,
- */
-
-void mem_cgroup_uncharge_start(void)
-{
-	current->memcg_batch.do_batch++;
-	/* We can do nest. */
-	if (current->memcg_batch.do_batch == 1) {
-		current->memcg_batch.memcg = NULL;
-		current->memcg_batch.nr_pages = 0;
-		current->memcg_batch.memsw_nr_pages = 0;
-	}
-}
-
-void mem_cgroup_uncharge_end(void)
-{
-	struct memcg_batch_info *batch = &current->memcg_batch;
-
-	if (!batch->do_batch)
-		return;
-
-	batch->do_batch--;
-	if (batch->do_batch) /* If stacked, do nothing. */
-		return;
-
-	if (!batch->memcg)
-		return;
-	/*
-	 * This "batch->memcg" is valid without any css_get/put etc...
-	 * bacause we hide charges behind us.
-	 */
-	if (batch->nr_pages)
-		page_counter_uncharge(&batch->memcg->memory, batch->nr_pages);
-	if (batch->memsw_nr_pages)
-		page_counter_uncharge(&batch->memcg->memsw, batch->memsw_nr_pages);
-	memcg_oom_recover(batch->memcg);
-	/* forget this pointer (for sanity check) */
-	batch->memcg = NULL;
-}
-
-#ifdef CONFIG_SWAP
-/*
- * called after __delete_from_swap_cache() and drop "page" account.
- * memcg information is recorded to swap_cgroup of "ent"
- */
-void
-mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
-{
-	struct mem_cgroup *memcg;
-	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
-
-	if (!swapout) /* this was a swap cache but the swap is unused ! */
-		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
-
-	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
-
-	/*
-	 * record memcg information,  if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
-	 */
-	if (do_swap_account && swapout && memcg)
-		swap_cgroup_record(ent, css_id(&memcg->css));
-}
-#endif
-
-#ifdef CONFIG_MEMCG_SWAP
-/*
- * called from swap_entry_free(). remove record in swap_cgroup and
- * uncharge "memsw" account.
- */
-void mem_cgroup_uncharge_swap(swp_entry_t ent)
-{
-	struct mem_cgroup *memcg;
-	unsigned short id;
-
-	if (!do_swap_account)
-		return;
-
-	id = swap_cgroup_record(ent, 0);
-	rcu_read_lock();
-	memcg = mem_cgroup_lookup(id);
-	if (memcg) {
-		/*
-		 * We uncharge this because swap is freed.
-		 * This memcg can be obsolete one. We avoid calling css_tryget
-		 */
-		if (!mem_cgroup_is_root(memcg))
-			page_counter_uncharge(&memcg->memsw, 1);
-		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
-	}
-	rcu_read_unlock();
-}
-
-/**
- * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
- * @entry: swap entry to be moved
- * @from:  mem_cgroup which the entry is moved from
- * @to:  mem_cgroup which the entry is moved to
- *
- * It succeeds only when the swap_cgroup's record for this entry is the same
- * as the mem_cgroup's id of @from.
- *
- * Returns 0 on success, -EINVAL on failure.
- *
- * The caller must have charged to @to, IOW, called page_counter_charge() about
- * both res and memsw, and called css_get().
- */
-static int mem_cgroup_move_swap_account(swp_entry_t entry,
-				struct mem_cgroup *from, struct mem_cgroup *to)
-{
-	unsigned short old_id, new_id;
-
-	old_id = css_id(&from->css);
-	new_id = css_id(&to->css);
-
-	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
-		mem_cgroup_swap_statistics(from, false);
-		mem_cgroup_swap_statistics(to, true);
-		/*
-		 * This function is only called from task migration context now.
-		 * It postpones page_counter and refcount handling till the end
-		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
-		 */
-		mem_cgroup_get(to);
-		return 0;
-	}
-	return -EINVAL;
-}
-#else
-static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
-				struct mem_cgroup *from, struct mem_cgroup *to)
-{
-	return -EINVAL;
-}
-#endif
-
-/*
- * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
- * page belongs to.
- */
-void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-				  struct mem_cgroup **memcgp)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	struct page_cgroup *pc;
-	enum charge_type ctype;
-
-	*memcgp = NULL;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	if (PageTransHuge(page))
-		nr_pages <<= compound_order(page);
-
-	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
-	if (PageCgroupUsed(pc)) {
-		memcg = pc->mem_cgroup;
-		css_get(&memcg->css);
-		/*
-		 * At migrating an anonymous page, its mapcount goes down
-		 * to 0 and uncharge() will be called. But, even if it's fully
-		 * unmapped, migration may fail and this page has to be
-		 * charged again. We set MIGRATION flag here and delay uncharge
-		 * until end_migration() is called
-		 *
-		 * Corner Case Thinking
-		 * A)
-		 * When the old page was mapped as Anon and it's unmap-and-freed
-		 * while migration was ongoing.
-		 * If unmap finds the old page, uncharge() of it will be delayed
-		 * until end_migration(). If unmap finds a new page, it's
-		 * uncharged when it make mapcount to be 1->0. If unmap code
-		 * finds swap_migration_entry, the new page will not be mapped
-		 * and end_migration() will find it(mapcount==0).
-		 *
-		 * B)
-		 * When the old page was mapped but migraion fails, the kernel
-		 * remaps it. A charge for it is kept by MIGRATION flag even
-		 * if mapcount goes down to 0. We can do remap successfully
-		 * without charging it again.
-		 *
-		 * C)
-		 * The "old" page is under lock_page() until the end of
-		 * migration, so, the old page itself will not be swapped-out.
-		 * If the new page is swapped out before end_migraton, our
-		 * hook to usual swap-out path will catch the event.
-		 */
-		if (PageAnon(page))
-			SetPageCgroupMigration(pc);
-	}
-	unlock_page_cgroup(pc);
-	/*
-	 * If the page is not charged at this point,
-	 * we return here.
-	 */
-	if (!memcg)
-		return;
-
-	*memcgp = memcg;
-	/*
-	 * We charge new page before it's used/mapped. So, even if unlock_page()
-	 * is called before end_migration, we can catch all events on this new
-	 * page. In the case new page is migrated but not remapped, new page's
-	 * mapcount will be finally 0 and we call uncharge in end_migration().
-	 */
-	if (PageAnon(page))
-		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
-	else
-		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
-	/*
-	 * The page is committed to the memcg, but it's not actually
-	 * charged to the page_counter since we plan on replacing the
-	 * old one and only one page is going to be left afterwards.
-	 */
-	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
-}
-
-/* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-	struct page *oldpage, struct page *newpage, bool migration_ok)
-{
-	struct page *used, *unused;
-	struct page_cgroup *pc;
-	bool anon;
-
-	if (!memcg)
-		return;
-
-	if (!migration_ok) {
-		used = oldpage;
-		unused = newpage;
-	} else {
-		used = newpage;
-		unused = oldpage;
-	}
-	anon = PageAnon(used);
-	__mem_cgroup_uncharge_common(unused,
-				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
-				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
-				     true);
-	css_put(&memcg->css);
-	/*
-	 * We disallowed uncharge of pages under migration because mapcount
-	 * of the page goes down to zero, temporarly.
-	 * Clear the flag and check the page should be charged.
-	 */
-	pc = lookup_page_cgroup(oldpage);
-	lock_page_cgroup(pc);
-	ClearPageCgroupMigration(pc);
-	unlock_page_cgroup(pc);
+out:
+	return ret;
+}
 
-	/*
-	 * If a page is a file cache, radix-tree replacement is very atomic
-	 * and we can skip this check. When it was an Anon page, its mapcount
-	 * goes down to 0. But because we added MIGRATION flage, it's not
-	 * uncharged yet. There are several case but page->mapcount check
-	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
-	 * check. (see prepare_charge() also)
-	 */
-	if (anon)
-		mem_cgroup_uncharge_page(used);
+#ifdef CONFIG_MEMCG_SWAP
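+/* Record one page of swap being charged (+1) or uncharged (-1) to @memcg. */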
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+					 bool charge)
+{
+	int val = (charge) ? 1 : -1;
+	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 }
 
-/*
- * At replace page cache, newpage is not under any memcg but it's on
- * LRU. So, this function doesn't touch page_counter but handles LRU
- * in correct way. Both pages are locked so we cannot race with uncharge.
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from:  mem_cgroup which the entry is moved from
+ * @to:  mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
+ * both res and memsw, and called css_get().
  */
-void mem_cgroup_replace_page_cache(struct page *oldpage,
-				  struct page *newpage)
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+				struct mem_cgroup *from, struct mem_cgroup *to)
 {
-	struct mem_cgroup *memcg = NULL;
-	struct page_cgroup *pc;
-	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+	unsigned short old_id, new_id;
 
-	if (mem_cgroup_disabled())
-		return;
+	old_id = css_id(&from->css);
+	new_id = css_id(&to->css);
 
-	pc = lookup_page_cgroup(oldpage);
-	/* fix accounting on old pages */
-	lock_page_cgroup(pc);
-	if (PageCgroupUsed(pc)) {
-		memcg = pc->mem_cgroup;
-		mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
-		ClearPageCgroupUsed(pc);
+	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+		mem_cgroup_swap_statistics(from, false);
+		mem_cgroup_swap_statistics(to, true);
+		/*
+		 * This function is only called from task migration context now.
+		 * It postpones page_counter and refcount handling till the end
+		 * of task migration(mem_cgroup_clear_mc()) for performance
+		 * improvement. But we cannot postpone css_get(to) because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in attach() phase, so the cgroup is guaranteed to be
+		 * alive, so we can just call css_get().
+		 */
+		css_get(&to->css);
+		return 0;
 	}
-	unlock_page_cgroup(pc);
-
-	/*
-	 * When called from shmem_replace_page(), in some cases the
-	 * oldpage has already been charged, and in some cases not.
-	 */
-	if (!memcg)
-		return;
-	/*
-	 * Even if newpage->mapping was NULL before starting replacement,
-	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
-	 * LRU while we overwrite pc->mem_cgroup.
-	 */
-	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
+	return -EINVAL;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+				struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	return -EINVAL;
 }
+#endif
 
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
@@ -4766,7 +3931,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
-		spin_lock(&mctz->lock);
+		spin_lock_irq(&mctz->lock);
 
 		/*
 		 * If we failed to reclaim anything from this memory cgroup
@@ -4806,7 +3971,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 		 */
 		/* If excess == 0, no tree ops */
 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-		spin_unlock(&mctz->lock);
+		spin_unlock_irq(&mctz->lock);
 		css_put(&mz->memcg->css);
 		loop++;
 		/*
@@ -4872,9 +4037,11 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 		if (mem_cgroup_move_parent(page, pc, memcg)) {
 			/* found lock contention or "pc" is obsolete. */
 			busy = page;
-			cond_resched();
-		} else
+			schedule_timeout_uninterruptible(1);
+		} else {
 			busy = NULL;
+			cond_resched();
+		}
 	} while (!list_empty(list));
 }
 
@@ -4974,8 +4141,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+							GFP_KERNEL, 0);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -5049,18 +4216,26 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 }
 
 
-static unsigned long tree_stat(struct mem_cgroup *memcg,
-			       enum mem_cgroup_stat_index idx)
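+/*
+ * Sum a statistics counter over @memcg and all its descendants.
+ */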
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
-	long val = 0;
+	unsigned long val = 0;
 
-	/* Per-cpu values can be negative, use a signed accumulator */
 	for_each_mem_cgroup_tree(iter, memcg)
 		val += mem_cgroup_read_stat(iter, idx);
 
-	if (val < 0) /* race ? */
-		val = 0;
+	return val;
+}
+
+static unsigned long mem_cgroup_recursive_stat2(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat2_index idx)
+{
+	struct mem_cgroup *iter;
+	unsigned long val = 0;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		val += mem_cgroup_read_stat2(iter, idx);
+
 	return val;
 }
 
@@ -5068,18 +4243,60 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 val;
 
-	if (mem_cgroup_is_root(memcg)) {
-		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
-		if (swap)
-			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
-	} else {
+	if (!mem_cgroup_is_root(memcg)) {
 		if (!swap)
-			val = page_counter_read(&memcg->memory);
+			return page_counter_read(&memcg->memory);
 		else
-			val = page_counter_read(&memcg->memsw);
+			return page_counter_read(&memcg->memsw);
 	}
-	return val << PAGE_SHIFT;
+
+	/*
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_RSS);
+
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val;
+}
+
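+/*
+ * Fill @mi with the per-node page counts and the recursive slab, page
+ * cache and shmem statistics of @memcg.
+ */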
+void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
+{
+	int nid;
+
+	memset(&mi->pages, 0, sizeof(mi->pages));
+	for_each_online_node(nid)
+		mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
+
+	mi->slab_reclaimable = mem_cgroup_recursive_stat2(memcg,
+					MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+	mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
+					MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
+	mi->cached = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+}
+
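+/*
+ * Check whether @memcg has room for @pages more pages, counting unused
+ * memsw space, reclaimable slab and page cache (minus shmem, which can
+ * only be swapped out) as available.  Returns 0 if so, -ENOMEM otherwise.
+ */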
+int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = memcg->memsw.limit - page_counter_read(&memcg->memsw);
+
+	/* reclaimable slabs */
+	free += page_counter_read(&memcg->dcache);
+
+	/* assume file cache is reclaimable */
+	free += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+
+	/*
+	 * But do not count shmem pages as they can't be purged,
+	 * only swapped out.
+	 */
+	free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+
+	return free < pages ? -ENOMEM : 0;
 }
 
 enum {
@@ -5117,9 +4334,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	switch (MEMFILE_ATTR(cft->private)) {
 	case RES_USAGE:
 		if (counter == &memcg->memory)
-			val = mem_cgroup_usage(memcg, false);
+			val = mem_cgroup_usage(memcg, false) * PAGE_SIZE;
 		else if (counter == &memcg->memsw)
-			val = mem_cgroup_usage(memcg, true);
+			val = mem_cgroup_usage(memcg, true) * PAGE_SIZE;
 		else
 			val = (u64)page_counter_read(counter) * PAGE_SIZE;
 		break;
@@ -5143,11 +4360,17 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
-static int memcg_update_kmem_limit(struct cgroup *cont, unsigned long limit)
-{
-	int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+				 unsigned long nr_pages)
+{
+	int err = 0;
+	int memcg_id;
+
+	if (memcg_kmem_is_active(memcg))
+		return 0;
+
 	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
 	 * be changed if the cgroup has children already, or if tasks had
@@ -5161,80 +4384,99 @@ static int memcg_update_kmem_limit(struct cgroup *cont, unsigned long limit)
 	 * of course permitted.
 	 */
 	mutex_lock(&memcg_create_mutex);
-	mutex_lock(&memcg_limit_mutex);
-	if (!memcg->kmem_account_flags && limit != PAGE_COUNTER_MAX) {
-		if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
-			ret = -EBUSY;
-			goto out;
-		}
-		ret = page_counter_limit(&memcg->kmem, limit);
-		VM_BUG_ON(ret);
+	if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+		err = -EBUSY;
+	mutex_unlock(&memcg_create_mutex);
+	if (err)
+		goto out;
 
-		ret = memcg_update_cache_sizes(memcg);
-		if (ret) {
-			page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
-			goto out;
-		}
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-		/*
-		 * setting the active bit after the inc will guarantee no one
-		 * starts accounting before all call sites are patched
-		 */
-		memcg_kmem_set_active(memcg);
+	memcg_id = memcg_alloc_cache_id();
+	if (memcg_id < 0) {
+		err = memcg_id;
+		goto out;
+	}
 
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
-	} else
-		ret = page_counter_limit(&memcg->kmem, limit);
+	/*
+	 * We couldn't have accounted to this cgroup, because it hasn't got
+	 * activated yet, so this should succeed.
+	 */
+	err = page_counter_limit(&memcg->kmem, nr_pages);
+	VM_BUG_ON(err);
+
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+	/*
+	 * A memory cgroup is considered kmem-active as soon as it gets
+	 * kmemcg_id. Setting the id after enabling static branching will
+	 * guarantee no one starts accounting before all call sites are
+	 * patched.
+	 */
+	memcg->kmemcg_id = memcg_id;
+	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 out:
+	return err;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+			       unsigned long nr_pages)
+{
+	int ret;
+
+	mutex_lock(&activate_kmem_mutex);
+	ret = __memcg_activate_kmem(memcg, nr_pages);
+	mutex_unlock(&activate_kmem_mutex);
+	return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+				   unsigned long nr_pages)
+{
+	int ret;
+
+	mutex_lock(&memcg_limit_mutex);
+	if (!memcg_kmem_is_active(memcg))
+		ret = memcg_activate_kmem(memcg, nr_pages);
+	else
+		ret = page_counter_limit(&memcg->kmem, nr_pages);
 	mutex_unlock(&memcg_limit_mutex);
-	mutex_unlock(&memcg_create_mutex);
-#endif
 	return ret;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
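+/*
+ * Kernel memory accounting of new cgroups is enabled by default and can be
+ * toggled at boot time with kmemaccount=0 or kmemaccount=1; see
+ * memcg_propagate_kmem() below.
+ */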
+static bool do_kmem_account = true;
+
+static int __init enable_kmem_account(char *s)
+{
+	if (!strcmp(s, "1"))
+		do_kmem_account = true;
+	else if (!strcmp(s, "0"))
+		do_kmem_account = false;
+	return 1;
+}
+__setup("kmemaccount=", enable_kmem_account);
+
 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
 	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	if (!parent)
-		goto out;
 
-	memcg->kmem_account_flags = parent->kmem_account_flags;
-	/*
-	 * When that happen, we need to disable the static branch only on those
-	 * memcgs that enabled it. To achieve this, we would be forced to
-	 * complicate the code by keeping track of which memcgs were the ones
-	 * that actually enabled limits, and which ones got it from its
-	 * parents.
-	 *
-	 * It is a lot simpler just to do static_key_slow_inc() on every child
-	 * that is accounted.
-	 */
-	if (!memcg_kmem_is_active(memcg))
-		goto out;
+	if (!parent)
+		return 0;
 
+	mutex_lock(&activate_kmem_mutex);
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * If the parent cgroup is not kmem-active now, it cannot be activated
+	 * after this point, because it has at least one child already.
 	 */
-	mem_cgroup_get(memcg);
-	static_key_slow_inc(&memcg_kmem_enabled_key);
-
-	mutex_lock(&memcg_limit_mutex);
-	ret = memcg_update_cache_sizes(memcg);
-	mutex_unlock(&memcg_limit_mutex);
-out:
+	if (do_kmem_account || memcg_kmem_is_active(parent))
+		ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
+	mutex_unlock(&activate_kmem_mutex);
 	return ret;
 }
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+				   unsigned long nr_pages)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 /*
@@ -5252,31 +4494,146 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	if (ret)
 		return ret;
 
-	switch (MEMFILE_ATTR(cft->private)) {
-	case RES_LIMIT:
-		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
-			ret = -EINVAL;
-			break;
-		}
-		switch (MEMFILE_TYPE(cft->private)) {
-		case _MEM:
-			ret = mem_cgroup_resize_limit(memcg, nr_pages);
-			break;
-		case _MEMSWAP:
-			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
-			break;
-		case _KMEM:
-			ret = memcg_update_kmem_limit(cont, nr_pages);
-			break;
-		}
-		break;
-	case RES_SOFT_LIMIT:
-		memcg->soft_limit = nr_pages;
-		ret = 0;
-		break;
+	switch (MEMFILE_ATTR(cft->private)) {
+	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
+		switch (MEMFILE_TYPE(cft->private)) {
+		case _MEM:
+			ret = mem_cgroup_resize_limit(memcg, nr_pages);
+			break;
+		case _MEMSWAP:
+			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+			break;
+		case _KMEM:
+			ret = memcg_update_kmem_limit(memcg, nr_pages);
+			break;
+		}
+		break;
+	case RES_SOFT_LIMIT:
+		memcg->soft_limit = nr_pages;
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+static ssize_t mem_cgroup_low_read(struct cgroup *cont, struct cftype *cft,
+				   struct file *file, char __user *buf,
+				   size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->low) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
+				const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->low = nr_pages;
+	return 0;
+}
+
+static ssize_t mem_cgroup_high_read(struct cgroup *cont, struct cftype *cft,
+				    struct file *file, char __user *buf,
+				    size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->high) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
+				 const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages, usage;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->high = nr_pages;
+
+	usage = page_counter_read(&memcg->memory);
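+	/* Reclaim the excess right away if usage is above the new boundary. */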
+	if (usage > nr_pages)
+		try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
+					     GFP_KERNEL, 0);
+	return 0;
+}
+
+static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
+		struct cftype *cft, struct file *file, char __user *buf,
+		size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->oom_guarantee) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_oom_guarantee_write(struct cgroup *cont,
+		struct cftype *cft, const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->oom_guarantee = nr_pages;
+	return 0;
+}
+
+#ifdef CONFIG_CLEANCACHE
+static u64 mem_cgroup_disable_cleancache_read(struct cgroup *cgrp,
+					      struct cftype *cft)
+{
+	return mem_cgroup_from_cont(cgrp)->cleancache_disabled_toggle;
+}
+
+static int mem_cgroup_disable_cleancache_write(struct cgroup *cgrp,
+					       struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *iter, *parent;
+
+	mutex_lock(&memcg_create_mutex);
+	memcg->cleancache_disabled_toggle = !!val;
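+	/*
+	 * Recompute the effective flag for the whole subtree: cleancache is
+	 * disabled for a cgroup if its own toggle or any ancestor's is set.
+	 */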
+	for_each_mem_cgroup_tree(iter, memcg) {
+		parent = parent_mem_cgroup(iter);
+		iter->cleancache_disabled = iter->cleancache_disabled_toggle;
+		if (parent)
+			iter->cleancache_disabled |= parent->cleancache_disabled;
 	}
-	return ret;
+	mutex_unlock(&memcg_create_mutex);
+	return 0;
 }
+#endif
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
@@ -5343,6 +4700,148 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 }
 #endif
 
+#ifdef CONFIG_BEANCOUNTERS
+
+#include <bc/beancounter.h>
+
+void mem_cgroup_sync_beancounter(struct mem_cgroup *memcg,
+				 struct user_beancounter *ub)
+{
+	struct mem_cgroup *mi;
+	unsigned long lim, held, maxheld;
+	volatile struct ubparm *k, *d, *p, *s, *o;
+
+	k = &ub->ub_parms[UB_KMEMSIZE];
+	d = &ub->ub_parms[UB_DCACHESIZE];
+	p = &ub->ub_parms[UB_PHYSPAGES];
+	s = &ub->ub_parms[UB_SWAPPAGES];
+	o = &ub->ub_parms[UB_OOMGUARPAGES];
+
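+	/*
+	 * Translate memcg counters into beancounter resources. Note that
+	 * physpages and swappages are reported in pages, while kmemsize and
+	 * dcachesize are reported in bytes.
+	 */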
+	p->held	= page_counter_read(&memcg->memory);
+	p->maxheld = memcg->memory.watermark;
+	p->failcnt = atomic_long_read(&memcg->mem_failcnt);
+	lim = memcg->memory.limit;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long, lim, UB_MAXVALUE);
+	p->barrier = p->limit = lim;
+
+	/* TODO: check why these values are kept in bytes instead of pages */
+	k->held = page_counter_read(&memcg->kmem) << PAGE_SHIFT;
+	k->maxheld = memcg->kmem.watermark << PAGE_SHIFT;
+	k->failcnt = memcg->kmem.failcnt << PAGE_SHIFT;
+	lim = memcg->kmem.limit << PAGE_SHIFT;
+	lim = lim >= (PAGE_COUNTER_MAX << PAGE_SHIFT) ? UB_MAXVALUE :
+		min_t(unsigned long long, lim, UB_MAXVALUE);
+	k->barrier = k->limit = lim;
+
+	d->held = page_counter_read(&memcg->dcache) << PAGE_SHIFT;
+	d->maxheld = memcg->dcache.watermark << PAGE_SHIFT;
+	d->failcnt = 0;
+	d->barrier = d->limit = UB_MAXVALUE;
+
+	held = page_counter_read(&memcg->memsw) -
+		page_counter_read(&memcg->memory);
+	maxheld = memcg->swap_max;
+	s->failcnt = atomic_long_read(&memcg->swap_failcnt);
+	lim = memcg->memsw.limit;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long long, lim, UB_MAXVALUE);
+	if (lim != UB_MAXVALUE)
+		lim -= p->limit;
+	s->barrier = s->limit = lim;
+
+	/*
+	 * Due to global reclaim, memory.memsw.usage can be greater than
+	 * (memory.memsw.limit - memory.limit).
+	 */
+	s->held = min(held, lim);
+	s->maxheld = min(maxheld, lim);
+
+	o->held = page_counter_read(&memcg->memsw);
+	o->maxheld = memcg->memsw.watermark;
+	o->failcnt = atomic_long_read(&memcg->oom_kill_cnt);
+	lim = memcg->oom_guarantee;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long long, lim >> PAGE_SHIFT, UB_MAXVALUE);
+	o->barrier = o->limit = lim;
+
+	ub->swapin = 0;
+	ub->swapout = 0;
+	for_each_mem_cgroup_tree(mi, memcg) {
+		ub->swapin += mem_cgroup_read_events(mi, MEM_CGROUP_EVENTS_PSWPIN);
+		ub->swapout += mem_cgroup_read_events(mi, MEM_CGROUP_EVENTS_PSWPOUT);
+	}
+}
+
+int mem_cgroup_apply_beancounter(struct mem_cgroup *memcg,
+				 struct user_beancounter *ub)
+{
+	unsigned long long mem, memsw, mem_old, memsw_old, oomguar;
+	int ret = 0;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EPERM;
+
+	mem = ub->ub_parms[UB_PHYSPAGES].limit;
+	memsw = ub->ub_parms[UB_SWAPPAGES].limit;
+
+	if (mem > PAGE_COUNTER_MAX)
+		mem = PAGE_COUNTER_MAX;
+
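+	/*
+	 * UB_SWAPPAGES limits swap only, whereas memsw accounts memory+swap,
+	 * so add the memory limit and clamp on overflow.
+	 */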
+	if (memsw + mem < mem || memsw + mem > PAGE_COUNTER_MAX)
+		memsw = PAGE_COUNTER_MAX;
+	else
+		memsw += mem;
+
+	oomguar = ub->ub_parms[UB_OOMGUARPAGES].barrier;
+
+	if (ub->ub_parms[UB_KMEMSIZE].limit != UB_MAXVALUE)
+		pr_warn_once("ub: kmemsize limit is deprecated\n");
+	if (ub->ub_parms[UB_DCACHESIZE].limit != UB_MAXVALUE)
+		pr_warn_once("ub: dcachesize limit is deprecated\n");
+
+	/* activate kmem accounting */
+	ret = memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+	if (ret)
+		goto out;
+
+	/* try change mem+swap before changing mem limit */
+	if (memcg->memsw.limit != memsw)
+		(void)mem_cgroup_resize_memsw_limit(memcg, memsw);
+
+	if (memcg->memory.limit != mem) {
+		ret = mem_cgroup_resize_limit(memcg, mem);
+		if (ret)
+			goto out;
+	}
+
+	mem_old = memcg->memory.limit;
+	memsw_old = memcg->memsw.limit;
+
+	if (mem != mem_old) {
+		/* first, reset memsw limit since it cannot be < mem limit */
+		if (memsw_old < PAGE_COUNTER_MAX) {
+			memsw_old = PAGE_COUNTER_MAX;
+			ret = mem_cgroup_resize_memsw_limit(memcg, memsw_old);
+			if (ret)
+				goto out;
+		}
+		ret = mem_cgroup_resize_limit(memcg, mem);
+		if (ret)
+			goto out;
+	}
+
+	if (memsw != memsw_old) {
+		ret = mem_cgroup_resize_memsw_limit(memcg, memsw);
+		if (ret)
+			goto out;
+	}
+
+	memcg->oom_guarantee = oomguar;
+out:
+	return ret;
+}
+
+#endif /* CONFIG_BEANCOUNTERS */
+
 #ifdef CONFIG_NUMA
 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 				      struct seq_file *m)
@@ -5388,6 +4887,236 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 	seq_putc(m, '\n');
 	return 0;
 }
+
+/*
+ * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
+ * set of nodes to allocate pages from. @current_node is the current preferable
+ * node, it gets rotated after each allocation.
+ */
+struct memcg_numa_migrate_struct {
+	nodemask_t *target_nodes;
+	int current_node;
+};
+
+/*
+ * Used as an argument for migrate_pages(). Allocated pages are spread evenly
+ * among destination nodes.
+ */
+static struct page *memcg_numa_migrate_new_page(struct page *page,
+				unsigned long private, int **result)
+{
+	struct memcg_numa_migrate_struct *ms = (void *)private;
+	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
+
+	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
+	if (ms->current_node >= MAX_NUMNODES) {
+		ms->current_node = first_node(*ms->target_nodes);
+		VM_BUG_ON(ms->current_node >= MAX_NUMNODES);
+	}
+
+	return __alloc_pages_nodemask(gfp_mask, 0,
+			node_zonelist(ms->current_node, gfp_mask),
+			ms->target_nodes);
+}
+
+/*
+ * Isolate at most @nr_to_scan pages from @lruvec for further migration and
+ * store them in @dst. Returns the number of pages scanned. Return value of 0
+ * means that @lruved is empty.
+ */
+static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
+				     long nr_to_scan, struct list_head *dst)
+{
+	struct list_head *src = &lruvec->lists[lru];
+	struct zone *zone = lruvec_zone(lruvec);
+	struct page *page, *tmp;
+	long scanned = 0, taken = 0;
+
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
+		int nr_pages;
+		page = list_last_entry(src, struct page, lru);
+
+		scanned++;
+
+		switch (__isolate_lru_page(page, ISOLATE_ASYNC_MIGRATE)) {
+		case 0:
+			nr_pages = hpage_nr_pages(page);
+			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+			list_move(&page->lru, dst);
+			taken += nr_pages;
+			break;
+
+		case -EBUSY:
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + is_file_lru(lru), taken);
+	spin_unlock_irq(&zone->lru_lock);
+
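+	/*
+	 * Split isolated huge pages into base pages before migration. Pages
+	 * that cannot be split are dropped from the list and returned to
+	 * the LRU.
+	 */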
+	list_for_each_entry_safe(page, tmp, dst, lru) {
+		if (PageTransHuge(page) && split_huge_page_to_list(page, dst)) {
+			list_del(&page->lru);
+			mod_zone_page_state(zone, NR_ISOLATED_ANON,
+					-HPAGE_PMD_NR);
+			putback_lru_page(page);
+		}
+	}
+
+	return scanned;
+}
+
+static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
+				       nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct memcg_numa_migrate_struct ms = {
+		.target_nodes = target_nodes,
+		.current_node = -1,
+	};
+	LIST_HEAD(pages);
+	long total_scanned = 0;
+
+	/*
+	 * If no limit on the maximal number of migrated pages is specified,
+	 * assume the caller wants to migrate them all.
+	 */
+	if (nr_to_scan < 0)
+		nr_to_scan = mem_cgroup_get_lru_size(lruvec, lru);
+
+	while (total_scanned < nr_to_scan) {
+		int ret;
+		long scanned;
+
+		scanned = memcg_numa_isolate_pages(lruvec, lru,
+						   SWAP_CLUSTER_MAX, &pages);
+		if (!scanned)
+			break;
+
+		ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
+				    (unsigned long)&ms, MIGRATE_ASYNC,
+				    MR_SYSCALL);
+		putback_lru_pages(&pages);
+		if (ret < 0)
+			return ret;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		total_scanned += scanned;
+	}
+
+	return total_scanned;
+}
+
+/*
+ * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
+ * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
+ * then the function will attempt to migrate all pages accounted to @memcg.
+ */
+static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
+				    nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct mem_cgroup *mi;
+	long total_scanned = 0, scanned;
+
+again:
+	scanned = 0;
+	for_each_mem_cgroup_tree(mi, memcg) {
+		struct zone *zone;
+
+		for_each_populated_zone(zone) {
+			struct lruvec *lruvec;
+			enum lru_list lru;
+
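+			/* Skip zones already residing on a target node. */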
+			if (node_isset(zone_to_nid(zone), *target_nodes))
+				continue;
+
+			lruvec = mem_cgroup_zone_lruvec(zone, mi);
+			/*
+			 * For the sake of simplicity, do not attempt to migrate
+			 * unevictable pages. It should be fine as long as there
+			 * aren't too many of them, which is usually true.
+			 */
+			for_each_evictable_lru(lru) {
+				long ret = __memcg_numa_migrate_pages(lruvec,
+						lru, target_nodes,
+						nr_to_scan > 0 ?
+						SWAP_CLUSTER_MAX : -1);
+				if (ret < 0) {
+					mem_cgroup_iter_break(memcg, mi);
+					return ret;
+				}
+				scanned += ret;
+			}
+		}
+	}
+
+	total_scanned += scanned;
+
+	/*
+	 * Retry only if we made progress in the previous iteration.
+	 */
+	if (nr_to_scan > 0 && scanned > 0 && total_scanned < nr_to_scan)
+		goto again;
+
+	return 0;
+}
+
+/*
+ * The format of memory.numa_migrate is
+ *
+ *   NODELIST[ MAX_SCAN]
+ *
+ * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
+ * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
+ * imposes a limit on the number of pages that can be migrated in one go.
+ *
+ * The call may be interrupted by a signal, in which case -EINTR is returned.
+ */
+static int memcg_numa_migrate_write(struct cgroup *cont,
+		struct cftype *cft, const char *buf)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
+	const char *nodes_str = buf, *nr_str;
+	long nr_to_scan = -1;
+	int ret = -ENOMEM;
+
+	if (!target_nodes)
+		goto out;
+
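+	/* An optional " MAX_SCAN" argument may follow the node list. */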
+	nr_str = strchr(buf, ' ');
+	if (nr_str) {
+		nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
+		if (!nodes_str)
+			goto out;
+		nr_str += 1;
+	}
+
+	ret = nodelist_parse(nodes_str, *target_nodes);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
+		goto out;
+
+	if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
+		goto out;
+
+	ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
+out:
+	if (nodes_str != buf)
+		kfree(nodes_str);
+	NODEMASK_FREE(target_nodes);
+	return ret;
+}
+
 #endif /* CONFIG_NUMA */
 
 static inline void mem_cgroup_lru_names_not_uptodate(void)
@@ -5406,9 +5135,13 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
-		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
 			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		seq_printf(m, "%s %lu\n", mem_cgroup_stat2_names[i],
+			   mem_cgroup_read_stat2(memcg, i) * PAGE_SIZE);
+	}
 
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
@@ -5431,13 +5164,20 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 			   (u64)memsw * PAGE_SIZE);
 
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		long long val = 0;
+		unsigned long long val = 0;
 
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+	}
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		unsigned long long val = 0;
+
+		for_each_mem_cgroup_tree(mi, memcg)
+			val += mem_cgroup_read_stat2(mi, i) * PAGE_SIZE;
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat2_names[i], val);
 	}
 
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@ -5584,16 +5324,18 @@ static int compare_thresholds(const void *a, const void *b)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(memcg_oom_notify_lock);
+
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_eventfd_list *ev;
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	list_for_each_entry(ev, &memcg->oom_notify, list)
 		eventfd_signal(ev->eventfd, 1);
 
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 	return 0;
 }
 
@@ -5780,7 +5522,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	if (!event)
 		return -ENOMEM;
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	event->eventfd = eventfd;
 	list_add(&event->list, &memcg->oom_notify);
@@ -5788,7 +5530,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	/* already in OOM ? */
 	if (atomic_read(&memcg->under_oom))
 		eventfd_signal(eventfd, 1);
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 
 	return 0;
 }
@@ -5802,7 +5544,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
 	BUG_ON(type != _OOM_TYPE);
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
 		if (ev->eventfd == eventfd) {
@@ -5811,7 +5553,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 		}
 	}
 
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 }
 
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -5837,6 +5579,9 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	if (!cgrp->parent || !((val == 0) || (val == 1)))
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env()) && val != 0)
+		return -EACCES;
+
 	memcg->oom_kill_disable = val;
 	if (!val)
 		memcg_oom_recover(memcg);
@@ -5849,7 +5594,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	int ret;
 
-	memcg->kmemcg_id = -1;
 	ret = memcg_propagate_kmem(memcg);
 	if (ret)
 		return ret;
@@ -5857,9 +5601,79 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	if (test_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags)) {
+		list_del(&memcg->kmemcg_sharers);
+		memcg_destroy_kmem_caches(memcg);
+	}
 	mem_cgroup_sockets_destroy(memcg);
+}
+
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent, *sharer;
+	int kmemcg_id;
+
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	/*
+	 * Clear the 'active' flag before clearing memcg_caches arrays entries.
+	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+	 * guarantees no cache will be created for this cgroup after we are
+	 * done (see memcg_create_kmem_cache()).
+	 */
+	clear_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+
+	memcg_deactivate_kmem_caches(memcg);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
+	parent = parent_mem_cgroup(memcg);
+	if (!parent)
+		parent = root_mem_cgroup;
+
+	/*
+	 * Change kmemcg_id of this cgroup and all its descendants to the
+	 * parent's id, and then move all entries from this cgroup's list_lrus
+	 * to ones of the parent. After we have finished, all list_lrus
+	 * corresponding to this cgroup are guaranteed to remain empty. The
+	 * ordering is imposed by list_lru_node->lock taken by
+	 * memcg_drain_all_list_lrus().
+	 */
+	list_for_each_entry(sharer, &memcg->kmemcg_sharers, kmemcg_sharers) {
+		BUG_ON(sharer->kmemcg_id != kmemcg_id);
+		sharer->kmemcg_id = parent->kmemcg_id;
+	}
+	memcg->kmemcg_id = parent->kmemcg_id;
+	list_splice(&memcg->kmemcg_sharers, &parent->kmemcg_sharers);
+	list_add(&memcg->kmemcg_sharers, &parent->kmemcg_sharers);
+
+	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+	memcg_free_cache_id(kmemcg_id);
+
+	/*
+	 * kmem charges can outlive the cgroup. In the case of slab
+	 * pages, for instance, a page may contain objects from various
+	 * processes. As we do not take a reference for every such
+	 * allocation, we have to be careful when doing uncharge
+	 * (see memcg_uncharge_kmem) and here during offlining.
+	 *
+	 * The idea is that only the _last_ uncharge which sees
+	 * the dead memcg will drop the last reference. An additional
+	 * reference is taken here before the group is marked dead
+	 * which is then paired with css_put during uncharge resp. here.
+	 *
+	 * Although this might sound strange as this path is called from
+	 * css_offline() when the reference might have dropped down to 0
+	 * and shouldn't be incremented anymore (css_tryget would fail)
+	 * we do not have other options because of the kmem allocations
+	 * lifetime.
+	 */
+	css_get(&memcg->css);
 
 	memcg_kmem_mark_dead(memcg);
 
@@ -5873,7 +5687,7 @@ static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 	 * page_counter read, so in that case, we don't need the put
 	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5881,7 +5695,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -5912,6 +5730,18 @@ static struct cftype mem_cgroup_files[] = {
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
+	{
+		.name = "low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_low_write,
+		.read = mem_cgroup_low_read,
+	},
+	{
+		.name = "high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_high_write,
+		.read = mem_cgroup_high_read,
+	},
 	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
@@ -5928,7 +5758,7 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "use_hierarchy",
-		.flags = CFTYPE_INSANE,
+		.flags = CFTYPE_INSANE | CFTYPE_VE_WRITABLE,
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
@@ -5950,6 +5780,12 @@ static struct cftype mem_cgroup_files[] = {
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
+	{
+		.name = "oom_guarantee",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_oom_guarantee_write,
+		.read = mem_cgroup_oom_guarantee_read,
+	},
 	{
 		.name = "pressure_level",
 		.register_event = vmpressure_register_event,
@@ -5960,6 +5796,19 @@ static struct cftype mem_cgroup_files[] = {
 		.name = "numa_stat",
 		.read_seq_string = memcg_numa_stat_show,
 	},
+	{
+		.name = "numa_migrate",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = memcg_numa_migrate_write,
+	},
+#endif
+#ifdef CONFIG_CLEANCACHE
+	{
+		.name = "disable_cleancache",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = mem_cgroup_disable_cleancache_read,
+		.write_u64 = mem_cgroup_disable_cleancache_write,
+	},
 #endif
 #ifdef CONFIG_MEMCG_KMEM
 	{
@@ -6063,28 +5912,35 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
-	size_t size = memcg_size();
+	size_t size;
+	int i, ret;
 
-	/* Can be very big if nr_node_ids is very big */
-	if (size < PAGE_SIZE)
-		memcg = kzalloc(size, GFP_KERNEL);
-	else
-		memcg = vzalloc(size);
+	size = sizeof(struct mem_cgroup);
+	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 
+	memcg = kzalloc(size, GFP_KERNEL);
 	if (!memcg)
 		return NULL;
 
 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!memcg->stat)
 		goto out_free;
+
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		ret = percpu_counter_init(&memcg->stat2.counters[i], 0, GFP_KERNEL);
+		if (ret)
+			goto out_pcpu_free;
+	}
 	spin_lock_init(&memcg->pcp_counter_lock);
 	return memcg;
 
+out_pcpu_free:
+	while (--i >= 0)
+		percpu_counter_destroy(&memcg->stat2.counters[i]);
+
+	free_percpu(memcg->stat);
 out_free:
-	if (size < PAGE_SIZE)
-		kfree(memcg);
-	else
-		vfree(memcg);
+	kfree(memcg);
 	return NULL;
 }
 
@@ -6102,7 +5958,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
-	size_t size = memcg_size();
+	int i;
 
 	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6110,6 +5966,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++)
+		percpu_counter_destroy(&memcg->stat2.counters[i]);
+
 	free_percpu(memcg->stat);
 
 	/*
@@ -6124,53 +5983,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	 * the cgroup_lock.
 	 */
 	disarm_static_keys(memcg);
-	if (size < PAGE_SIZE)
-		kfree(memcg);
-	else
-		vfree(memcg);
-}
-
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context.  The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	__mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, free_work);
-	schedule_work(&memcg->work_freeing);
-}
-
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
-	atomic_inc(&memcg->refcnt);
-}
-
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
-	if (atomic_sub_and_test(count, &memcg->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-		call_rcu(&memcg->rcu_freeing, free_rcu);
-		if (parent)
-			mem_cgroup_put(parent);
-	}
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
-	__mem_cgroup_put(memcg, 1);
+	kfree(memcg);
 }
 
 /*
@@ -6227,17 +6040,23 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		root_mem_cgroup = memcg;
 		page_counter_init(&memcg->memory, NULL);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
+		page_counter_init(&memcg->dcache, NULL);
 	}
 
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	init_oom_context(&memcg->oom_ctx);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+	INIT_LIST_HEAD(&memcg->kmemcg_sharers);
+#endif
 
 	return &memcg->css;
 
@@ -6250,7 +6069,6 @@ static int
 mem_cgroup_css_online(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg, *parent;
-	int error = 0;
 
 	if (!cont->parent)
 		return 0;
@@ -6262,25 +6080,29 @@ mem_cgroup_css_online(struct cgroup *cont)
 	memcg->use_hierarchy = parent->use_hierarchy;
 	memcg->oom_kill_disable = parent->oom_kill_disable;
 	memcg->swappiness = mem_cgroup_swappiness(parent);
+#ifdef CONFIG_CLEANCACHE
+	memcg->cleancache_disabled = parent->cleancache_disabled;
+#endif
 
 	if (parent->use_hierarchy) {
 		page_counter_init(&memcg->memory, &parent->memory);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, &parent->memsw);
 		page_counter_init(&memcg->kmem, &parent->kmem);
+		page_counter_init(&memcg->dcache, &parent->dcache);
 
 		/*
-		 * We increment refcnt of the parent to ensure that we can
-		 * safely access it on page_counter_charge/uncharge.
-		 * This refcnt will be decremented when freeing this
-		 * mem_cgroup(see mem_cgroup_put).
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
 		 */
-		mem_cgroup_get(parent);
 	} else {
 		page_counter_init(&memcg->memory, NULL);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
+		page_counter_init(&memcg->dcache, NULL);
 		/*
 		 * Deeper hierachy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -6289,10 +6111,9 @@ mem_cgroup_css_online(struct cgroup *cont)
 		if (parent != root_mem_cgroup)
 			mem_cgroup_subsys.broken_hierarchy = true;
 	}
-
-	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	return error;
+
+	return memcg_init_kmem(memcg, &mem_cgroup_subsys);
 }
 
 /*
@@ -6318,6 +6139,17 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct cgroup *iter;
 
+	/*
+	 * Mark memory cgroup as offline before going to reparent charges.
+	 * This guarantees that __mem_cgroup_try_charge() either charges before
+	 * reparenting starts or doesn't charge at all, hence we won't have
+	 * pending user memory charges after reparenting is done.
+	 */
+	memcg->is_offline = true;
+	smp_mb();
+
+	memcg_deactivate_kmem(memcg);
+
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 
 	/*
@@ -6333,70 +6165,101 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 	rcu_read_unlock();
 	mem_cgroup_reparent_charges(memcg);
 
-	mem_cgroup_destroy_all_caches(memcg);
+	/*
+	 * A cgroup can be destroyed while somebody is waiting for its
+	 * oom context, in which case the context will never be unlocked
+	 * from oom_unlock, because the latter only iterates over live
+	 * cgroups. So we need to release the context now, when one can
+	 * no longer iterate over it.
+	 */
+	release_oom_context(&memcg->oom_ctx);
 }
 
 static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(memcg);
+	/*
+	 * XXX: css_offline() would be where we should reparent all
+	 * memory to prepare the cgroup for destruction.  However,
+	 * memcg does not do css_tryget() and res_counter charging
+	 * under the same RCU lock region, which means that charging
+	 * could race with offlining.  Offlining only happens to
+	 * cgroups with no tasks in them but charges can show up
+	 * without any tasks from the swapin path when the target
+	 * memcg is looked up from the swapout record and not from the
+	 * current task as it usually is.  A race like this can leak
+	 * charges and put pages with stale cgroup pointers into
+	 * circulation:
+	 *
+	 * #0                        #1
+	 *                           lookup_swap_cgroup_id()
+	 *                           rcu_read_lock()
+	 *                           mem_cgroup_lookup()
+	 *                           css_tryget()
+	 *                           rcu_read_unlock()
+	 * disable css_tryget()
+	 * call_rcu()
+	 *   offline_css()
+	 *     reparent_charges()
+	 *                           res_counter_charge()
+	 *                           css_put()
+	 *                             css_free()
+	 *                           pc->mem_cgroup = dead memcg
+	 *                           add page to lru
+	 *
+	 * The bulk of the charges are still moved in offline_css() to
+	 * avoid pinning a lot of pages in case a long-term reference
+	 * like a swapout record is deferring the css_free() to long
+	 * after offlining.  But this makes sure we catch any charges
+	 * made after offlining:
+	 */
+	mem_cgroup_reparent_charges(memcg);
 
-	mem_cgroup_put(memcg);
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
-#define PRECHARGE_COUNT_AT_ONCE	256
 static int mem_cgroup_do_precharge(unsigned long count)
 {
 	int ret = 0;
-	int batch_count = PRECHARGE_COUNT_AT_ONCE;
-	struct mem_cgroup *memcg = mc.to;
 
-	if (mem_cgroup_is_root(memcg)) {
+	if (mem_cgroup_is_root(mc.to)) {
 		mc.precharge += count;
 		/* we don't need css_get for root */
 		return ret;
 	}
-	/* try to charge at once */
-	if (count > 1) {
-		struct page_counter *dummy;
-		/*
-		 * "memcg" cannot be under rmdir() because we've already checked
-		 * by cgroup_lock_live_cgroup() that it is not removed and we
-		 * are still under the same cgroup_mutex. So we can postpone
-		 * css_get().
-		 */
-		if (page_counter_try_charge(&memcg->memory, count, &dummy))
-			goto one_by_one;
-		if (do_swap_account &&
-		    page_counter_try_charge(&memcg->memsw, count, &dummy)) {
-			page_counter_uncharge(&memcg->memory, count);
-			goto one_by_one;
-		}
+
+	/* Try a single bulk charge without reclaim first */
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, false, count);
+	if (!ret) {
 		mc.precharge += count;
 		return ret;
 	}
-one_by_one:
-	/* fall back to one by one charge */
+	if (ret == -EINTR) {
+		cancel_charge(root_mem_cgroup, count);
+		return ret;
+	}
+
+	/* Try charges one by one with reclaim */
 	while (count--) {
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-		if (!batch_count--) {
-			batch_count = PRECHARGE_COUNT_AT_ONCE;
-			cond_resched();
-		}
-		ret = __mem_cgroup_try_charge(NULL,
-					GFP_KERNEL, 1, &memcg, false);
+		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, false, 1);
+		/*
+		 * In case of failure, any residual charges against
+		 * mc.to will be dropped by mem_cgroup_clear_mc()
+		 * later on.  However, cancel any charges that are
+		 * bypassed to root right away or they'll be lost.
+		 */
+		if (ret == -EINTR)
+			cancel_charge(root_mem_cgroup, 1);
 		if (ret)
-			/* mem_cgroup_clear_mc() will do uncharge later */
 			return ret;
 		mc.precharge++;
+		cond_resched();
 	}
-	return ret;
+	return 0;
 }
 
 /**
@@ -6488,10 +6351,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		return NULL;
 
 	mapping = vma->vm_file->f_mapping;
-	if (pte_none(ptent))
-		pgoff = linear_page_index(vma, addr);
-	else /* pte_file(ptent) is true */
-		pgoff = pte_to_pgoff(ptent);
+	pgoff = linear_page_index(vma, addr);
 
 	/* page is moved even if it's not RSS of this task(page-faulted). */
 #ifdef CONFIG_SWAP
@@ -6524,7 +6384,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		page = mc_handle_present_pte(vma, addr, ptent);
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
-	else if (pte_none(ptent) || pte_file(ptent))
+	else if (pte_none(ptent))
 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
 
 	if (!page && !ent.val)
@@ -6532,9 +6392,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	if (page) {
 		pc = lookup_page_cgroup(page);
 		/*
-		 * Do only loose check w/o page_cgroup lock.
-		 * mem_cgroup_move_account() checks the pc is valid or not under
-		 * the lock.
+		 * Do only loose check w/o serialization.
+		 * mem_cgroup_move_account() checks the pc is valid or
+		 * not under LRU exclusion.
 		 */
 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
@@ -6655,10 +6515,11 @@ static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
+	int i;
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
-		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
+		cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
 	}
 	/*
@@ -6666,7 +6527,7 @@ static void __mem_cgroup_clear_mc(void)
 	 * we must uncharge here.
 	 */
 	if (mc.moved_charge) {
-		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+		cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
 	}
 	/* we must fixup refcnts and charges */
@@ -6675,6 +6536,9 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.from))
 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
 
+		for (i = 0; i < mc.moved_swap; i++)
+			css_put(&mc.from->css);
+
 		if (!mem_cgroup_is_root(mc.to)) {
 			/*
 			 * we charged both to->memory and to->memsw, so we
@@ -6682,11 +6546,13 @@ static void __mem_cgroup_clear_mc(void)
 			 */
 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 		}
-		__mem_cgroup_put(mc.from, mc.moved_swap);
-
-		/* we've already done mem_cgroup_get(mc.to) */
+		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
+	if (do_swap_account) {
+		mem_cgroup_update_swap_max(from);
+		mem_cgroup_update_swap_max(to);
+	}
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -6996,6 +6862,421 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+	struct page_cgroup *pc;
+	unsigned short oldid;
+
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(page_count(page), page);
+
+	if (!do_swap_account)
+		return;
+
+	pc = lookup_page_cgroup(page);
+
+	/* Readahead page, never charged */
+	if (!PageCgroupUsed(pc))
+		return;
+
+	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
+
+	oldid = swap_cgroup_record(entry, css_id(&pc->mem_cgroup->css));
+	VM_BUG_ON_PAGE(oldid, page);
+
+	pc->flags &= ~PCG_MEMSW;
+	css_get(&pc->mem_cgroup->css);
+	mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+	struct mem_cgroup *memcg;
+	unsigned short id;
+
+	if (!do_swap_account)
+		return;
+
+	id = swap_cgroup_record(entry, 0);
+	rcu_read_lock();
+	memcg = mem_cgroup_lookup(id);
+	if (memcg) {
+		if (!mem_cgroup_is_root(memcg))
+			page_counter_uncharge(&memcg->memsw, 1);
+		mem_cgroup_swap_statistics(memcg, false);
+		css_put(&memcg->css);
+	}
+	rcu_read_unlock();
+}
+#endif
+
+/**
+ * mem_cgroup_try_charge - try charging a page
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ * @memcgp: charged memcg return
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success, with *@memcgp pointing to the charged memcg.
+ * Otherwise, an error code is returned.
+ *
+ * After page->mapping has been set up, the caller must finalize the
+ * charge with mem_cgroup_commit_charge().  Or abort the transaction
+ * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ */
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned int nr_pages = 1;
+	int ret = 0;
+
+	if (mem_cgroup_disabled())
+		goto out;
+
+	if (PageSwapCache(page)) {
+		struct page_cgroup *pc = lookup_page_cgroup(page);
+		/*
+		 * Every swap fault against a single page tries to charge the
+		 * page, bail as early as possible.  shmem_unuse() encounters
+		 * already charged pages, too.  The USED bit is protected by
+		 * the page lock, which serializes swap cache removal, which
+		 * in turn serializes uncharging.
+		 */
+		if (PageCgroupUsed(pc))
+			goto out;
+	}
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	if (do_swap_account && PageSwapCache(page))
+		memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
+		memcg = get_mem_cgroup_from_mm(mm);
+
+	ret = try_charge(memcg, gfp_mask, false, nr_pages);
+
+	css_put(&memcg->css);
+
+	if (ret == -EINTR) {
+		memcg = root_mem_cgroup;
+		ret = 0;
+	}
+out:
+	*memcgp = memcg;
+	return ret;
+}
+
+/**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up.  This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration.  If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+			      bool lrucare)
+{
+	unsigned int nr_pages = 1;
+
+	VM_BUG_ON_PAGE(!page->mapping, page);
+	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	commit_charge(page, memcg, nr_pages, lrucare);
+
+	if (do_swap_account && PageSwapCache(page)) {
+		swp_entry_t entry = { .val = page_private(page) };
+		/*
+		 * The swap entry might not get freed for a long time,
+		 * let's not wait for it.  The page already received a
+		 * memory+swap charge, drop the swap entry duplicate.
+		 */
+		mem_cgroup_uncharge_swap(entry);
+	}
+}
+
+/**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+{
+	unsigned int nr_pages = 1;
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	cancel_charge(memcg, nr_pages);
+}
+
+static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+			   unsigned long nr_mem, unsigned long nr_memsw,
+			   unsigned long nr_anon, unsigned long nr_file,
+			   unsigned long nr_huge, unsigned long nr_kmem,
+			   unsigned long nr_shmem, struct page *dummy_page)
+{
+	unsigned long flags;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem + nr_kmem)
+			page_counter_uncharge(&memcg->memory, nr_mem + nr_kmem);
+		if (nr_memsw + nr_kmem)
+			page_counter_uncharge(&memcg->memsw, nr_memsw + nr_kmem);
+		if (nr_kmem)
+			page_counter_uncharge(&memcg->kmem, nr_kmem);
+
+		memcg_oom_recover(memcg);
+	}
+
+	local_irq_save(flags);
+	percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS], nr_anon);
+	percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE], nr_file);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
+	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+	memcg_check_events(memcg, dummy_page);
+	local_irq_restore(flags);
+}
+
+static void uncharge_list(struct list_head *page_list)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned long nr_memsw = 0;
+	unsigned long nr_anon = 0;
+	unsigned long nr_file = 0;
+	unsigned long nr_huge = 0;
+	unsigned long nr_kmem = 0;
+	unsigned long pgpgout = 0;
+	unsigned long nr_mem = 0;
+	unsigned long nr_shmem = 0;
+	struct list_head *next;
+	struct page *page;
+
+	next = page_list->next;
+	do {
+		unsigned int nr_pages = 1;
+		struct page_cgroup *pc;
+
+		page = list_entry(next, struct page, lru);
+		next = page->lru.next;
+
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+		VM_BUG_ON_PAGE(page_count(page), page);
+
+		pc = lookup_page_cgroup(page);
+		if (!PageCgroupUsed(pc))
+			continue;
+
+		/*
+		 * Nobody should be changing or seriously looking at
+		 * pc->mem_cgroup and pc->flags at this point, we have
+		 * fully exclusive access to the page.
+		 */
+
+		if (memcg != pc->mem_cgroup) {
+			if (memcg) {
+				uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+					nr_anon, nr_file, nr_huge, nr_kmem,
+					nr_shmem, page);
+				pgpgout = nr_mem = nr_memsw = nr_kmem = 0;
+				nr_anon = nr_file = nr_huge = nr_shmem = 0;
+			}
+			memcg = pc->mem_cgroup;
+		}
+
+		if (!PageKmemcg(page)) {
+			if (PageTransHuge(page)) {
+				nr_pages <<= compound_order(page);
+				VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+				nr_huge += nr_pages;
+			}
+			if (PageAnon(page))
+				nr_anon += nr_pages;
+			else {
+				if (PageSwapBacked(page))
+					nr_shmem += nr_pages;
+				nr_file += nr_pages;
+			}
+			pgpgout++;
+		} else {
+			nr_kmem += 1 << compound_order(page);
+			__ClearPageKmemcg(page);
+		}
+
+		if (pc->flags & PCG_MEM)
+			nr_mem += nr_pages;
+		if (pc->flags & PCG_MEMSW)
+			nr_memsw += nr_pages;
+		pc->flags = 0;
+	} while (next != page_list);
+
+	if (memcg)
+		uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, nr_anon,
+				nr_file, nr_huge, nr_kmem, nr_shmem, page);
+}
+
+/**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_try_charge() and
+ * mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge(struct page *page)
+{
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Don't touch page->lru of any random page, pre-check: */
+	pc = lookup_page_cgroup(page);
+	if (!PageCgroupUsed(pc))
+		return;
+
+	INIT_LIST_HEAD(&page->lru);
+	uncharge_list(&page->lru);
+}
+
+/**
+ * mem_cgroup_uncharge_list - uncharge a list of page
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!list_empty(page_list))
+		uncharge_list(page_list);
+}
+
+/**
+ * mem_cgroup_migrate - migrate a charge to another page
+ * @oldpage: currently charged page
+ * @newpage: page to transfer the charge to
+ * @lrucare: both pages might be on the LRU already
+ *
+ * Migrate the charge from @oldpage to @newpage.
+ *
+ * Both pages must be locked, @newpage->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+			bool lrucare)
+{
+	unsigned int nr_pages = 1;
+	struct page_cgroup *pc;
+	int isolated;
+
+	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
+	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Page cache replacement: new page already charged? */
+	pc = lookup_page_cgroup(newpage);
+	if (PageCgroupUsed(pc))
+		return;
+
+	/* Re-entrant migration: old page already uncharged? */
+	pc = lookup_page_cgroup(oldpage);
+	if (!PageCgroupUsed(pc))
+		return;
+
+	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
+	VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
+
+	if (PageTransHuge(oldpage)) {
+		nr_pages <<= compound_order(oldpage);
+		VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
+		VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
+	}
+
+	if (lrucare)
+		lock_page_lru(oldpage, &isolated);
+
+	pc->flags = 0;
+
+	if (lrucare)
+		unlock_page_lru(oldpage, isolated);
+
+	local_irq_disable();
+	mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
+	memcg_check_events(pc->mem_cgroup, oldpage);
+	local_irq_enable();
+
+	commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
+}
+
 /*
  * subsys_initcall() for memory controller.
  *
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -243,21 +243,11 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
-	if (access) {
-		int nr;
-		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-
-			nr = shrink_slab(&shrink, 1000, 1000);
-			if (page_count(p) == 1)
-				break;
-		} while (nr > 10);
-	}
+	if (access)
+		drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,6 +42,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
+#include <linux/virtinfo.h>
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -63,6 +64,11 @@
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/vmpages.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -105,7 +111,7 @@ EXPORT_SYMBOL(high_memory);
  * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
  *   as ancient (libc5 based) binaries can segfault. )
  */
-int randomize_va_space __read_mostly =
+int _randomize_va_space __read_mostly =
 #ifdef CONFIG_COMPAT_BRK
 					1;
 #else
@@ -727,6 +733,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 		       vma->vm_file->f_op->mmap);
 	dump_stack();
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
 }
 
 /*
@@ -841,61 +849,64 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
-		if (!pte_file(pte)) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
-
-			if (swap_duplicate(entry) < 0)
-				return entry.val;
-
-			/* make sure dst_mm is on swapoff's mmlist. */
-			if (unlikely(list_empty(&dst_mm->mmlist))) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&dst_mm->mmlist))
-					list_add(&dst_mm->mmlist,
-						 &src_mm->mmlist);
-				spin_unlock(&mmlist_lock);
-			}
-			if (likely(!non_swap_entry(entry)))
-				rss[MM_SWAPENTS]++;
-			else if (is_migration_entry(entry)) {
-				page = migration_entry_to_page(entry);
-
-				rss[mm_counter(page)]++;
-
-				if (is_write_migration_entry(entry) &&
-				    is_cow_mapping(vm_flags)) {
-					/*
-					 * COW mappings require pages in both
-					 * parent and child to be set to read.
-					 */
-					make_migration_entry_read(&entry);
-					pte = swp_entry_to_pte(entry);
-					if (pte_swp_soft_dirty(*src_pte))
-						pte = pte_swp_mksoft_dirty(pte);
-					set_pte_at(src_mm, addr, src_pte, pte);
-				}
-			} else if (is_hmm_entry(entry)) {
-				page = hmm_entry_to_page(entry);
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if (swap_duplicate(entry) < 0)
+			return entry.val;
+
+		/* make sure dst_mm is on swapoff's mmlist. */
+		if (unlikely(list_empty(&dst_mm->mmlist))) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&dst_mm->mmlist))
+				list_add(&dst_mm->mmlist,
+					 &src_mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		if (likely(!non_swap_entry(entry)))
+			rss[MM_SWAPENTS]++;
+		else if (is_migration_entry(entry)) {
+			page = migration_entry_to_page(entry);
+
+			rss[mm_counter(page)]++;
 
+			if (is_write_migration_entry(entry) &&
+			    is_cow_mapping(vm_flags)) {
 				/*
-				 * Update rss count even for un-addressable
-				 * page as they should be consider just like
-				 * any other page.
+				 * COW mappings require pages in both
+				 * parent and child to be set to read.
 				 */
-				get_page(page);
-				rss[mm_counter(page)]++;
-				page_dup_rmap(page);
-
-				if (is_write_hmm_entry(entry) &&
-				    is_cow_mapping(vm_flags)) {
-					make_hmm_entry_read(&entry);
-					pte = swp_entry_to_pte(entry);
-					if (pte_swp_soft_dirty(*src_pte))
-						pte = pte_swp_mksoft_dirty(pte);
-					set_pte_at(src_mm, addr, src_pte, pte);
-				}
+				make_migration_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(*src_pte))
+					pte = pte_swp_mksoft_dirty(pte);
+				set_pte_at(src_mm, addr, src_pte, pte);
 			}
-		}
+		} else if (is_hmm_entry(entry)) {
+			page = hmm_entry_to_page(entry);
+
+			/*
+			 * Update rss count even for un-addressable
+			 * pages as they should be considered just like
+			 * any other page.
+			 */
+			get_page(page);
+			rss[mm_counter(page)]++;
+			page_dup_rmap(page);
+
+			if (is_write_hmm_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
+				make_hmm_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(*src_pte))
+					pte = pte_swp_mksoft_dirty(pte);
+				set_pte_at(src_mm, addr, src_pte, pte);
+			}
+		} else
+			/*
+			 * This cannot happen because HMM migration holds
+			 * mmap_sem in read mode.
+			 */
+			VM_BUG_ON(is_hmm_entry(entry));
 		goto out_set_pte;
 	}
 
@@ -1064,11 +1075,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
-			       VM_PFNMAP | VM_MIXEDMAP))) {
-		if (!vma->anon_vma)
-			return 0;
-	}
+	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+			!vma->anon_vma)
+		return 0;
 
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1126,6 +1135,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	swp_entry_t entry;
 
 again:
 	init_rss_vec(rss);
@@ -1133,7 +1143,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
-		swp_entry_t entry;
 		pte_t ptent = *pte;
 		if (pte_none(ptent)) {
 			continue;
@@ -1152,31 +1161,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				if (details->check_mapping &&
 				    details->check_mapping != page->mapping)
 					continue;
-				/*
-				 * Each page->index must be checked when
-				 * invalidating or truncating nonlinear.
-				 */
-				if (details->nonlinear_vma &&
-				    (page->index < details->first_index ||
-				     page->index > details->last_index))
-					continue;
 			}
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
-			if (unlikely(details) && details->nonlinear_vma
-			    && linear_page_index(details->nonlinear_vma,
-						addr) != page->index) {
-				pte_t ptfile = pgoff_to_pte(page->index);
-				if (pte_soft_dirty(ptent))
-					pte_file_mksoft_dirty(ptfile);
-				set_pte_at(mm, addr, pte, ptfile);
-			}
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent))
-					set_page_dirty(page);
+					set_page_dirty_mm(page, mm);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
@@ -1193,42 +1186,21 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 
 		/*
 		 * If details->check_mapping, we leave swap entries;
-		 * if details->nonlinear_vma, we leave file entries.
 		 */
 		if (unlikely(details))
 			continue;
-		if (pte_file(ptent)) {
-			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
-				print_bad_pte(vma, addr, ptent, NULL);
-		} else {
-			entry = pte_to_swp_entry(ptent);
-
-			if (!non_swap_entry(entry))
-				rss[MM_SWAPENTS]--;
-			else if (is_hmm_entry(entry)) {
-				/*
-				 * Un-addressable page must always be check
-				 * that are not like other swap entries and
-				 * thus should be check no matter what
-				 * details value is.
-				 */
-				struct page *page = hmm_entry_to_page(entry);
-
-				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-				rss[mm_counter(page)]--;
-				page_remove_rmap(page);
-				put_page(page);
-				continue;
-			} else if (is_migration_entry(entry)) {
-				struct page *page;
 
-				page = migration_entry_to_page(entry);
+		entry = pte_to_swp_entry(ptent);
+		if (!non_swap_entry(entry))
+			rss[MM_SWAPENTS]--;
+		else if (is_migration_entry(entry)) {
+			struct page *page;
 
-				rss[mm_counter(page)]--;
-			}
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
+			page = migration_entry_to_page(entry);
+			rss[mm_counter(page)]--;
 		}
+		if (unlikely(!free_swap_and_cache(entry)))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
@@ -1337,11 +1309,10 @@ static void unmap_page_range(struct mmu_gather *tlb,
 	pgd_t *pgd;
 	unsigned long next;
 
-	if (details && !details->check_mapping && !details->nonlinear_vma)
+	if (details && !details->check_mapping)
 		details = NULL;
 
 	BUG_ON(addr >= end);
-	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -1351,7 +1322,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
 		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
-	mem_cgroup_uncharge_end();
 }
 
 
@@ -1433,7 +1403,7 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
@@ -1459,7 +1429,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * The range must fit into one VMA.
  */
@@ -1547,7 +1517,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
 	inc_mm_counter_fast(mm, mm_counter_file(page));
-	page_add_file_rmap(page);
+	page_add_file_rmap(page, mm);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
@@ -1992,12 +1962,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically.  Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -2160,6 +2129,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 	int page_copied = 0;
 	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
 	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
+	struct mem_cgroup *memcg;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2176,7 +2146,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	__SetPageUptodate(new_page);
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_new;
 
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
@@ -2206,6 +2176,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		ptep_clear_flush_notify(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
+		mem_cgroup_commit_charge(new_page, memcg, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2242,9 +2214,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Free the old page.. */
 		new_page = old_page;
 		page_copied = 1;
-	} else {
-		mem_cgroup_uncharge_page(new_page);
-	}
+	} else
+		mem_cgroup_cancel_charge(new_page, memcg);
 
 	if (new_page)
 		page_cache_release(new_page);
@@ -2478,25 +2449,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 	}
 }
 
-static inline void unmap_mapping_range_list(struct list_head *head,
-					    struct zap_details *details)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * In nonlinear VMAs there is no correspondence between virtual address
-	 * offset and file offset.  So we must perform an exhaustive search
-	 * across *all* the pages in each nonlinear VMA, not just the pages
-	 * whose virtual address lies outside the file truncation point.
-	 */
-	list_for_each_entry(vma, head, shared.nonlinear) {
-		details->nonlinear_vma = vma;
-		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-	}
-}
-
 /**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
@@ -2525,7 +2482,6 @@ void unmap_mapping_range(struct address_space *mapping,
 	}
 
 	details.check_mapping = even_cows? NULL: mapping;
-	details.nonlinear_vma = NULL;
 	details.first_index = hba;
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
@@ -2534,8 +2490,6 @@ void unmap_mapping_range(struct address_space *mapping,
 	mutex_lock(&mapping->i_mmap_mutex);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	mutex_unlock(&mapping->i_mmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
@@ -2551,13 +2505,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	spinlock_t *ptl;
 	struct page *page, *swapcache;
+	struct mem_cgroup *memcg;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
-	struct mem_cgroup *ptr;
 	int exclusive = 0;
 	int ret = 0;
+	cycles_t start;
 
+	start = get_cycles();
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
@@ -2638,7 +2594,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_page;
 	}
 
-	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
 		ret = VM_FAULT_OOM;
 		goto out_page;
 	}
@@ -2663,10 +2619,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * while the page is counted on swap but not yet in mapcount i.e.
 	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
 	 * must be called after the swap_free(), or it will never succeed.
-	 * Because delete_from_swap_page() may be called by reuse_swap_page(),
-	 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
-	 * in page->private. In this case, a record in swap_cgroup  is silently
-	 * discarded at swap_free().
 	 */
 
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
@@ -2682,12 +2634,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (pte_swp_soft_dirty(orig_pte))
 		pte = pte_mksoft_dirty(pte);
 	set_pte_at(mm, address, page_table, pte);
-	if (page == swapcache)
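+	/*
+	 * A swapcache page may already be on the LRU, so commit the charge
+	 * with lrucare; a fresh ksm copy is put on the LRU here instead.
+	 */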
+	if (page == swapcache) {
 		do_page_add_anon_rmap(page, vma, address, exclusive);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, address);
-	/* It's better to call commit-charge after rmap is established */
-	mem_cgroup_commit_charge_swapin(page, ptr);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2718,9 +2672,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
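+	/* Account the time spent handling this swap-in fault. */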
+	spin_lock_irq(&kstat_glb_lock);
+	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
+	spin_unlock_irq(&kstat_glb_lock);
+
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
+	mem_cgroup_cancel_charge(page, memcg);
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
@@ -2742,6 +2700,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		unsigned int flags)
 {
+	struct mem_cgroup *memcg;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -2768,7 +2727,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto setpte;
 	}
 
-	/* Allocate our own private page. */
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 	page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2781,7 +2739,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	__SetPageUptodate(page);
 
-	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -2795,7 +2753,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(page_table, ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		page_cache_release(page);
 		return handle_userfault(vma, address, flags,
 					VM_UFFD_MISSING);
@@ -2803,6 +2761,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
+	mem_cgroup_commit_charge(page, memcg, false);
+	lru_cache_add_active_or_unevictable(page, vma);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2812,7 +2772,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
 release:
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 	page_cache_release(page);
 	goto unlock;
 oom_free_page:
@@ -2828,6 +2788,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 {
 	struct vm_fault vmf;
 	int ret;
+	cycles_t start;
 
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
@@ -2835,6 +2796,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	vmf.page = NULL;
 	vmf.cow_page = cow_page;
 
+	start = get_cycles();
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
@@ -2855,6 +2817,11 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
+	local_irq_disable();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.page_in, smp_processor_id(),
+			get_cycles() - start);
+	local_irq_enable();
+
 	*page = vmf.page;
 	return ret;
 }
@@ -2868,14 +2835,12 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
-		pte_mksoft_dirty(entry);
 	if (anon) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page);
+		page_add_file_rmap(page, vma->vm_mm);
 	}
 	set_pte_at(vma->vm_mm, address, pte, entry);
 
@@ -2915,6 +2880,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *fault_page, *new_page;
 	void *fault_entry;
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret;
@@ -2926,7 +2892,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!new_page)
 		return VM_FAULT_OOM;
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
 		page_cache_release(new_page);
 		return VM_FAULT_OOM;
 	}
@@ -2953,6 +2919,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto uncharge_out;
 	}
 	do_set_pte(vma, address, new_page, pte, true, true);
+	mem_cgroup_commit_charge(new_page, memcg, false);
+	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
 	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
@@ -2962,7 +2930,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	return ret;
 uncharge_out:
-	mem_cgroup_uncharge_page(new_page);
+	mem_cgroup_cancel_charge(new_page, memcg);
 	page_cache_release(new_page);
 	return ret;
 }
@@ -3025,7 +2993,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
@@ -3045,44 +3013,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
-{
-	pgoff_t pgoff;
-
-	flags |= FAULT_FLAG_NONLINEAR;
-
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 */
-		print_bad_pte(vma, address, orig_pte, NULL);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pgoff = pte_to_pgoff(orig_pte);
-	if (!(flags & FAULT_FLAG_WRITE))
-		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 				unsigned long addr, int page_nid,
 				int *flags)
@@ -3218,14 +3148,11 @@ static int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (!vma_is_anonymous(vma))
-				return do_linear_fault(mm, vma, address,
+				return do_fault(mm, vma, address,
 						pte, pmd, flags, entry);
 			return do_anonymous_page(mm, vma, address,
 						 pte, pmd, flags);
 		}
-		if (pte_file(entry))
-			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
 					pte, pmd, flags, entry);
 	}
@@ -3712,7 +3639,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
 /*
  * Print the name of a VMA.
  */
-void print_vma_addr(char *prefix, unsigned long ip)
+void ve_print_vma_addr(int dst, char *prefix, unsigned long ip)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -3735,7 +3662,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 			p = d_path(&f->f_path, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
-			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
+			ve_printk(dst, "%s%s[%lx+%lx]", prefix, kbasename(p),
 					vma->vm_start,
 					vma->vm_end - vma->vm_start);
 			free_page((unsigned long)buf);
@@ -3916,3 +3843,181 @@ int add_pages(int nid, unsigned long start,
 	return __add_pages(nid, zone, start >> PAGE_SHIFT, size >> PAGE_SHIFT);
 }
 #endif /* ARCH_HAS_ADD_PAGES */
+
+#include <linux/file.h>
+
+int open_mapping_peer(struct address_space *mapping,
+		struct path *path, const struct cred *cred)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct address_space *peer = inode->i_mapping;
+	struct file *file = NULL;
+	struct user_beancounter *ub;
+
+restart:
+	if (!peer->i_peer_file) {
+		ub = set_exec_ub(&ub0);
+		file = dentry_open(path, O_RDONLY | O_LARGEFILE, cred);
+		set_exec_ub(ub);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+
+		spin_lock(&inode->i_lock);
+		if (atomic_read(&inode->i_writecount) > 0) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			return -ETXTBSY;
+		}
+		if (inode->i_size != mapping->host->i_size) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			return -EINVAL;
+		}
+		if (peer->i_peer_file) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			file = NULL;
+			goto restart;
+		}
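+		/*
+		 * Deny further writes to the peer inode while it backs other
+		 * mappings: a negative i_writecount blocks writers, like
+		 * deny_write_access() does.
+		 */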
+		atomic_dec(&inode->i_writecount);
+		rcu_assign_pointer(peer->i_peer_file, get_file(file));
+		spin_unlock(&inode->i_lock);
+	}
+
+	mutex_lock_nested(&peer->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+	if (!peer->i_peer_file) {
+		mutex_unlock(&peer->i_mmap_mutex);
+		goto restart;
+	}
+	mutex_lock(&mapping->i_mmap_mutex);
+	rcu_assign_pointer(mapping->i_peer_file, peer->i_peer_file);
+	list_add(&mapping->i_peer_list, &peer->i_peer_list);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	mutex_unlock(&peer->i_mmap_mutex);
+
+	invalidate_mapping_pages(mapping, 0, -1);
+
+	if (file) {
+		file_accessed(file);
+		fput(file);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(open_mapping_peer);
+
+static bool synchronize_mapping_faults_vma(struct address_space *mapping,
+		struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (vma->vm_private_data2 == vma)
+		return false;
+	BUG_ON(vma->vm_private_data2);
+	vma->vm_private_data2 = vma;
+
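+	/*
+	 * Pin the mm and cycle its mmap_sem for write: this waits for page
+	 * faults already running under the read lock to finish.  i_mmap_mutex
+	 * is dropped meanwhile, so the caller must restart its tree walk.
+	 */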
+	atomic_inc(&mm->mm_count);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	down_write(&mm->mmap_sem);
+	up_write(&mm->mmap_sem);
+	mmdrop(mm);
+	mutex_lock(&mapping->i_mmap_mutex);
+
+	return true;
+}
+
+static void synchronize_mapping_faults(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+
+restart:
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		if (synchronize_mapping_faults_vma(mapping, vma))
+			goto restart;
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		vma->vm_private_data2 = NULL;
+}
+
+void close_mapping_peer(struct address_space *mapping)
+{
+	struct file *file = mapping->i_peer_file;
+	struct address_space *peer;
+
+	if (!file)
+		return;
+
+	mutex_lock(&mapping->i_mmap_mutex);
+
+	rcu_assign_pointer(mapping->i_peer_file, NULL);
+
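+	/*
+	 * Clear the peer pointer first so that new faults stop picking peer
+	 * pages, then wait for faults already in flight and zap any ptes
+	 * they installed from the peer mapping.
+	 */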
+	if (mapping_mapped(mapping)) {
+		struct zap_details details = {
+			.check_mapping = file->f_mapping,
+			.first_index = 0,
+			.last_index = -1,
+		};
+
+		synchronize_mapping_faults(mapping);
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	}
+
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	peer = file->f_mapping;
+
+	mutex_lock(&peer->i_mmap_mutex);
+	list_del_init(&mapping->i_peer_list);
+	if (list_empty(&peer->i_peer_list))
+		rcu_assign_pointer(peer->i_peer_file, NULL);
+	else
+		file = NULL;
+	mutex_unlock(&peer->i_mmap_mutex);
+
+	if (file) {
+		atomic_inc(&file->f_inode->i_writecount);
+		file_accessed(file);
+		fput(file);
+	}
+}
+EXPORT_SYMBOL(close_mapping_peer);
+
+struct page *pick_peer_page(struct address_space *mapping, pgoff_t index,
+		struct file_ra_state *ra, unsigned ra_size)
+{
+	struct address_space *peer;
+	struct page *page;
+	struct file *file;
+
+	rcu_read_lock();
+	file = rcu_dereference(mapping->i_peer_file);
+	if (!file || !atomic_long_inc_not_zero(&file->f_count)) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	rcu_read_unlock();
+
+	peer = file->f_mapping;
+
+	page = find_get_page(peer, index);
+	if (!page) {
+		page_cache_sync_readahead(peer, ra, file, index, ra_size);
+		page = find_get_page(peer, index);
+		if (!page)
+			goto out;
+	}
+	if (PageReadahead(page))
+		page_cache_async_readahead(peer, ra, file,
+				page, index, ra->ra_pages);
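+	/*
+	 * The peer copy may still be under read-in: wait for the page lock
+	 * (dropped on read completion) and re-check, giving up on a pending
+	 * signal or if the page never became up to date.
+	 */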
+	if (!PageUptodate(page)) {
+		if (!lock_page_killable(page)) {
+			unlock_page(page);
+			if (PageUptodate(page))
+				goto out;
+		}
+		put_page(page);
+		page = NULL;
+	}
+out:
+	fput(file);
+	return page;
+}
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -513,6 +513,7 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
+	cond_resched();
 	return addr != end;
 }
 
@@ -651,19 +652,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  * @nodes and @flags,) it's isolated and queued to the pagelist which is
  * passed via @private.)
  */
-static struct vm_area_struct *
+static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags, void *private)
 {
-	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
+	int err = 0;
+	struct vm_area_struct *vma, *prev;
 
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
+	vma = find_vma(mm, start);
+	if (!vma)
+		return -EFAULT;
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unsigned long endvma = vma->vm_end;
 
 		if (endvma > end)
@@ -673,9 +673,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 		}
 
 		if (flags & MPOL_MF_LAZY) {
@@ -689,15 +689,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
 						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
+			if (err)
 				break;
-			}
 		}
 next:
 		prev = vma;
 	}
-	return first;
+	return err;
 }
 
 /*
@@ -1169,16 +1167,17 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 /*
  * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma that contains @start.
  * Search forward from there, if not.  N.B., this assumes that the
  * list of pages handed to migrate_pages()--which is how we get here--
  * is in virtual address order.
  */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+	struct vm_area_struct *vma;
 	unsigned long uninitialized_var(address);
 
+	vma = find_vma(current->mm, start);
 	while (vma) {
 		address = page_address_in_vma(page, vma);
 		if (address != -EFAULT)
@@ -1208,7 +1207,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
 	return NULL;
 }
@@ -1218,7 +1217,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 		     unsigned short mode, unsigned short mode_flags,
 		     nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
@@ -1284,11 +1282,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = queue_pages_range(mm, start, end, nmask,
+	err = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
+	if (!err)
 		err = mbind_range(mm, start, end, new);
 
 	if (!err) {
@@ -1296,9 +1292,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
-					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+			nr_failed = migrate_pages(&pagelist, new_page,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
 		}
@@ -2393,6 +2388,23 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	return ret;
 }
 
+/*
+ * Drop the (possibly final) reference to task->mempolicy.  It needs to be
+ * dropped after task->mempolicy is set to NULL so that any allocation done as
+ * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
+ * policy.
+ */
+void mpol_put_task_policy(struct task_struct *task)
+{
+	struct mempolicy *pol;
+
+	task_lock(task);
+	pol = task->mempolicy;
+	task->mempolicy = NULL;
+	task_unlock(task);
+	mpol_put(pol);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,25 +6,134 @@
  *  extreme VM load.
  *
  *  started by Ingo Molnar, Copyright (C) 2001
+ *  debugging by David Rientjes, Copyright (C) 2015
  */
 
 #include <linux/mm.h>
 #include <linux/slab.h>
+
+#include <linux/highmem.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
 
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
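+/*
+ * Elements parked in the reserved pool are filled with POISON_FREE (and
+ * POISON_END in the last byte); a mismatch when an element is handed out
+ * again indicates a use-after-free of a pooled object.
+ */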
+static void poison_error(mempool_t *pool, void *element, size_t size,
+			 size_t byte)
+{
+	const int nr = pool->curr_nr;
+	const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+	const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+	int i;
+
+	pr_err("BUG: mempool element poison mismatch\n");
+	pr_err("Mempool %p size %zu\n", pool, size);
+	pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+	for (i = start; i < end; i++)
+		pr_cont("%x ", *(u8 *)(element + i));
+	pr_cont("%s\n", end < size ? "..." : "");
+	dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+	u8 *obj = element;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+		if (obj[i] != exp) {
+			poison_error(pool, element, size, i);
+			return;
+		}
+	}
+	memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+		__check_element(pool, element, ksize(element));
+
+	/* Mempools backed by page allocator */
+	if (pool->free == mempool_free_pages) {
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+
+static void __poison_element(void *element, size_t size)
+{
+	u8 *obj = element;
+
+	memset(obj, POISON_FREE, size - 1);
+	obj[size - 1] = POISON_END;
+}
+
+static void poison_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		__poison_element(element, ksize(element));
+
+	/* Mempools backed by page allocator */
+	if (pool->alloc == mempool_alloc_pages) {
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__poison_element(addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
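+/*
+ * KASAN is told that elements sitting in the pool are effectively freed,
+ * and that they are live again once handed back out.
+ */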
+static void kasan_poison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		kasan_poison_kfree(element);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
+{
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		kasan_unpoison_slab(element);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
+
 static void add_element(mempool_t *pool, void *element)
 {
 	BUG_ON(pool->curr_nr >= pool->min_nr);
+	poison_element(pool, element);
+	kasan_poison_element(pool, element);
 	pool->elements[pool->curr_nr++] = element;
 }
 
-static void *remove_element(mempool_t *pool)
+static void *remove_element(mempool_t *pool, gfp_t flags)
 {
-	BUG_ON(pool->curr_nr <= 0);
-	return pool->elements[--pool->curr_nr];
+	void *element = pool->elements[--pool->curr_nr];
+
+	BUG_ON(pool->curr_nr < 0);
+	kasan_unpoison_element(pool, element, flags);
+	check_element(pool, element);
+	return element;
 }
 
 /**
@@ -41,7 +150,7 @@ void mempool_destroy(mempool_t *pool)
 		return;
 
 	while (pool->curr_nr) {
-		void *element = remove_element(pool);
+		void *element = remove_element(pool, GFP_KERNEL);
 		pool->free(element, pool->pool_data);
 	}
 	kfree(pool->elements);
@@ -136,7 +245,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
 	spin_lock_irqsave(&pool->lock, flags);
 	if (new_min_nr <= pool->min_nr) {
 		while (new_min_nr < pool->curr_nr) {
-			element = remove_element(pool);
+			element = remove_element(pool, GFP_KERNEL);
 			spin_unlock_irqrestore(&pool->lock, flags);
 			pool->free(element, pool->pool_data);
 			spin_lock_irqsave(&pool->lock, flags);
@@ -219,7 +328,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 
 	spin_lock_irqsave(&pool->lock, flags);
 	if (likely(pool->curr_nr)) {
-		element = remove_element(pool);
+		element = remove_element(pool, gfp_temp);
 		spin_unlock_irqrestore(&pool->lock, flags);
 		/* paired with rmb in mempool_free(), read comment there */
 		smp_wmb();
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -188,7 +189,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
-		page_add_file_rmap(new);
+		page_add_file_rmap(new, mm);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, ptep);
@@ -204,7 +205,12 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
  */
 static void remove_migration_ptes(struct page *old, struct page *new)
 {
-	rmap_walk(new, remove_migration_pte, old);
+	struct rmap_walk_control rwc = {
+		.rmap_one = remove_migration_pte,
+		.arg = old,
+	};
+
+	rmap_walk(new, &rwc);
 }
 
 /*
@@ -544,6 +550,11 @@ void migrate_page_states(struct page *newpage, struct page *page)
 			__set_page_dirty_nobuffers(newpage);
  	}
 
+	if (page_is_young(page))
+		set_page_young(newpage);
+	if (page_is_idle(page))
+		set_page_idle(newpage);
+
 	/*
 	 * Copy NUMA information to the new page, to prevent over-eager
 	 * future migrations of this same page.
@@ -800,6 +811,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		newpage->mapping = NULL;
 	} else {
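+		/*
+		 * Migration succeeded: move the memcg charge to the new page
+		 * before the migration ptes are removed and ->mapping is reset.
+		 */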
+		mem_cgroup_migrate(page, newpage, false);
 		if (page_was_mapped)
 			remove_migration_ptes(page, newpage);
 		page->mapping = NULL;
@@ -815,7 +827,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 {
 	int rc = -EAGAIN;
 	int page_was_mapped = 0;
-	struct mem_cgroup *mem;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
@@ -841,9 +852,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		lock_page(page);
 	}
 
-	/* charge against new page */
-	mem_cgroup_prepare_migration(page, newpage, &mem);
-
 	if (PageWriteback(page)) {
 		/*
 		 * Only in the case of a full synchronous migration is it
@@ -857,10 +865,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			break;
 		default:
 			rc = -EBUSY;
-			goto uncharge;
+			goto out_unlock;
 		}
 		if (!force)
-			goto uncharge;
+			goto out_unlock;
 		wait_on_page_writeback(page);
 	}
 	/*
@@ -895,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			 * completes
 			 */
 		} else {
-			goto uncharge;
+			goto out_unlock;
 		}
 	}
 
@@ -908,7 +916,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		 * the page migration right away (proteced by page lock).
 		 */
 		rc = balloon_page_migrate(newpage, page, mode);
-		goto uncharge;
+		goto out_unlock;
 	}
 
 	/*
@@ -927,7 +935,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		VM_BUG_ON_PAGE(PageAnon(page), page);
 		if (page_has_private(page)) {
 			try_to_free_buffers(page);
-			goto uncharge;
+			goto out_unlock;
 		}
 		goto skip_unmap;
 	}
@@ -950,9 +958,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-uncharge:
-	mem_cgroup_end_migration(mem, page, newpage,
-				 rc == MIGRATEPAGE_SUCCESS);
+out_unlock:
 	unlock_page(page);
 out:
 	return rc;
@@ -1583,7 +1589,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 		if (!populated_zone(zone))
 			continue;
 
-		if (zone->all_unreclaimable)
+		if (!zone_reclaimable(zone))
 			continue;
 
 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -1784,7 +1790,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
-	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
@@ -1850,17 +1855,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto out_unlock;
 	}
 
-	/*
-	 * Traditional migration needs to prepare the memcg charge
-	 * transaction early to prevent the old page from being
-	 * uncharged when installing migration entries.  Here we can
-	 * save the potential rollback and start the charge transfer
-	 * only when migration is already known to end successfully.
-	 */
-	mem_cgroup_prepare_migration(page, new_page, &memcg);
-
-	init_trans_huge_mmu_gather_count(new_page);
-
 	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
@@ -1889,14 +1883,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto fail_putback;
 	}
 
+	mem_cgroup_migrate(page, new_page, false);
+
 	page_remove_rmap(page);
 
-	/*
-	 * Finish the charge transaction under the page table lock to
-	 * prevent split_huge_page() from dividing up the charge
-	 * before it's fully transferred to the new page.
-	 */
-	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
@@ -1997,9 +1987,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		if (!pte_present(pte)) {
 			mpfn = pfn = 0;
 
-			if (pte_file(pte))
-				goto next;
-
 			/*
 			 * Only care about unaddressable device page special
 			 * page table entry. Other special swap entries are not
@@ -2312,6 +2299,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long *src,
 				    unsigned long *dst)
 {
+	struct mem_cgroup *memcg;
 	struct vm_area_struct *vma = migrate->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
@@ -2355,7 +2343,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto abort;
-	if (mem_cgroup_newpage_charge(page, vma->vm_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg))
 		goto abort;
 
 	/*
@@ -2379,7 +2367,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	if (!pte_none(*ptep)) {
 		pte_unmap_unlock(ptep, ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		goto abort;
 	}
 
@@ -2389,12 +2377,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	 */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(ptep, ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		goto abort;
 	}
 
 	inc_mm_counter(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, addr);
+	mem_cgroup_commit_charge(page, memcg, false);
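+	/* Unaddressable device pages are not kept on the LRU, so skip the LRU add. */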
+	if (!is_zone_device_page(page))
+		lru_cache_add_active_or_unevictable(page, vma);
 	set_pte_at(mm, addr, ptep, entry);
 
 	/* Take a reference on the page */
@@ -2428,7 +2419,6 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
 		struct address_space *mapping;
-		struct mem_cgroup *memcg;
 		int r;
 
 		if (!newpage) {
@@ -2469,12 +2459,11 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 			}
 		}
 
-		mem_cgroup_prepare_migration(page, newpage, &memcg);
 		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
-		mem_cgroup_end_migration(memcg, page, newpage,
-					 r == MIGRATEPAGE_SUCCESS);
 		if (r != MIGRATEPAGE_SUCCESS)
 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+		else
+			mem_cgroup_migrate(page, newpage, false);
 	}
 }
 
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		pte_t pte = *ptep;
-		pgoff_t pgoff;
 
 		next = addr + PAGE_SIZE;
 		if (pte_none(pte))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else if (pte_present(pte))
 			*vec = 1;
-		else if (pte_file(pte)) {
-			pgoff = pte_to_pgoff(pte);
-			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
-		} else { /* pte is a swap entry */
+		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
 			if (is_migration_entry(entry)) {
@@ -142,9 +138,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				*vec = 1;
 			} else {
 #ifdef CONFIG_SWAP
-				pgoff = entry.val;
 				*vec = mincore_page(swap_address_space(entry),
-					pgoff);
+					entry.val);
 #else
 				WARN_ON(1);
 				*vec = 1;
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -19,11 +19,13 @@
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
 
+#include <bc/vmpages.h>
+
 #include "internal.h"
 
 int can_do_mlock(void)
 {
-	if (capable(CAP_IPC_LOCK))
+	if (ve_capable(CAP_IPC_LOCK))
 		return 1;
 	if (rlimit(RLIMIT_MEMLOCK) != 0)
 		return 1;
@@ -229,11 +231,14 @@ static int __mlock_posix_error_return(long retval)
  * and re-mlocked by try_to_{munlock|unmap} before we unmap and
  * free them.  This will result in freeing mlocked pages.
  */
-void munlock_vma_pages_range(struct vm_area_struct *vma,
-			     unsigned long start, unsigned long end)
+void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			       unsigned long start, unsigned long end, int acct)
 {
 	vma->vm_flags &= ~VM_LOCKED;
 
+	if (acct)
+		ub_locked_uncharge(vma->vm_mm, end - start);
+
 	while (start < end) {
 		struct page *page;
 		unsigned int page_mask, page_increm;
@@ -287,6 +292,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
 		goto out;	/* don't set VM_LOCKED,  don't count */
 
+	if (newflags & VM_LOCKED) {
+		ret = ub_locked_charge(mm, end - start);
+		if (ret < 0)
+			goto out;
+	}
+
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma),
@@ -299,13 +310,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 	if (end != vma->vm_end) {
 		ret = split_vma(mm, vma, end, 0);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 success:
@@ -331,6 +342,11 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 out:
 	*prev = vma;
 	return ret;
+
+out_uncharge:
+	if (newflags & VM_LOCKED)
+		ub_locked_uncharge(mm, end - start);
+	goto out;
 }
 
 static int do_mlock(unsigned long start, size_t len, int on)
@@ -469,7 +485,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 	lock_limit >>= PAGE_SHIFT;
 
 	/* check against resource limits */
-	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
+	if ((locked <= lock_limit) || ve_capable(CAP_IPC_LOCK))
 		error = do_mlock(start, len, 1);
 	up_write(&current->mm->mmap_sem);
 	if (!error)
@@ -536,7 +552,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 
 	ret = -ENOMEM;
 	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
-	    capable(CAP_IPC_LOCK))
+	    ve_capable(CAP_IPC_LOCK))
 		ret = do_mlockall(flags);
 	up_write(&current->mm->mmap_sem);
 	if (!ret && (flags & MCL_CURRENT))
@@ -573,7 +589,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
 	if (!allowed &&
-	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	    locked + user->locked_shm > lock_limit && !ve_capable(CAP_IPC_LOCK))
 		goto out;
 	get_uid(user);
 	user->locked_shm += locked;
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -148,5 +148,4 @@ static int __init mm_sysfs_init(void)
 
 	return 0;
 }
-
-__initcall(mm_sysfs_init);
+postcore_initcall(mm_sysfs_init);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
+#include <linux/virtinfo.h>
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
 #include <linux/uprobes.h>
@@ -38,6 +39,9 @@
 #include <linux/memory.h>
 #include <linux/userfaultfd_k.h>
 
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
@@ -162,6 +166,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	long free, allowed, reserve;
 
+	if (mm && ub_enough_memory(mm, pages) != 0)
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*
@@ -247,10 +254,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	if (unlikely(vma->vm_flags & VM_NONLINEAR))
-		list_del_init(&vma->shared.nonlinear);
-	else
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
+	vma_interval_tree_remove(vma, &mapping->i_mmap);
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -278,17 +282,20 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	struct vm_area_struct *next = vma->vm_next;
 
 	might_sleep();
+
+	ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
+			vma->vm_flags, vma->vm_file);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma->vm_mm, vma);
 	return next;
 }
 
 static unsigned long do_brk(unsigned long addr, unsigned long len,
-			    struct list_head *uf);
+			    struct list_head *uf, int soft);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
@@ -347,7 +354,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk, &uf) != oldbrk)
+	if (do_brk(oldbrk, newbrk-oldbrk, &uf, UB_HARD) != oldbrk)
 		goto out;
 
 set_brk:
@@ -685,10 +692,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 			atomic_inc(&mapping->i_mmap_writable);
 
 		flush_dcache_mmap_lock(mapping);
-		if (unlikely(vma->vm_flags & VM_NONLINEAR))
-			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		else
-			vma_interval_tree_insert(vma, &mapping->i_mmap);
+		vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
@@ -876,14 +880,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 	if (file) {
 		mapping = file->f_mapping;
-		if (!(vma->vm_flags & VM_NONLINEAR)) {
-			root = &mapping->i_mmap;
-			uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+		root = &mapping->i_mmap;
+		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
 
-			if (adjust_next)
-				uprobe_munmap(next, next->vm_start,
-							next->vm_end);
-		}
+		if (adjust_next)
+			uprobe_munmap(next, next->vm_start, next->vm_end);
 
 		mutex_lock(&mapping->i_mmap_mutex);
 		if (insert) {
@@ -1000,7 +1001,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(mm, next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1035,7 +1036,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		else if (next)
 			vma_gap_update(next);
 		else
-			mm->highest_vm_end = end;
+			WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
 	}
 	if (insert && file)
 		uprobe_mmap(insert);
@@ -1463,7 +1464,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			return -EAGAIN;
 	}
 
@@ -1686,6 +1687,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	int error;
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
+	unsigned long ub_charged = 0;
 
 	/* Check against address space limit. */
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1723,6 +1725,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		vm_flags |= VM_ACCOUNT;
 	}
 
+	if (ub_memory_charge(mm, len, vm_flags, file, UB_HARD))
+		goto charge_error;
+	ub_charged = 1;
+
 	/*
 	 * Can we just expand an old mapping?
 	 */
@@ -1736,7 +1742,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
@@ -1775,6 +1781,18 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if (vm_flags != vma->vm_flags) {
+			/*
+			 * ->vm_flags has been changed by the f_op->mmap method.
+			 * We have to recharge ub memory.
+			 */
+			ub_memory_uncharge(mm, len, vm_flags, file);
+			if (ub_memory_charge(mm, len, vma->vm_flags, file, UB_HARD)) {
+				ub_charged = 0;
+				error = -ENOMEM;
+				goto unmap_and_free_vma;
+			}
+		}
 
 		/* Can addr have changed??
 		 *
@@ -1813,8 +1831,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
 					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-		else
+		else {
 			vma->vm_flags &= ~VM_LOCKED;
+			ub_locked_uncharge(mm, len);
+		}
 	}
 
 	if (file)
@@ -1846,8 +1866,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 unacct_error:
+	if (ub_charged)
+		ub_memory_uncharge(mm, len, vm_flags, file);
+charge_error:
 	if (charged)
 		vm_unacct_memory(charged);
 	return error;
@@ -2297,7 +2320,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 		locked = mm->locked_vm + grow;
 		limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
 		limit >>= PAGE_SHIFT;
-		if (locked > limit && !capable(CAP_IPC_LOCK))
+		if (locked > limit && !ve_capable(CAP_IPC_LOCK))
 			return -ENOMEM;
 	}
 
@@ -2307,18 +2330,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
 		return -EFAULT;
 
+	if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
+				vma->vm_file, UB_SOFT))
+		goto fail_charge;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
 	 */
 	if (security_vm_enough_memory_mm(mm, grow))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Ok, everything looks good - let it rip */
 	if (vma->vm_flags & VM_LOCKED)
 		mm->locked_vm += grow;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
+
+fail_sec:
+	ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
+fail_charge:
+	return -ENOMEM;
 }
 
 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
@@ -2330,48 +2362,39 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct vm_area_struct *next;
 	unsigned long gap_addr;
-	int error;
+	int error = 0;
 
 	if (!(vma->vm_flags & VM_GROWSUP))
 		return -EFAULT;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
-	if (unlikely(anon_vma_prepare(vma)))
-		return -ENOMEM;
-	vma_lock_anon_vma(vma);
-
-	/*
-	 * vma->vm_start/vm_end cannot change under us because the caller
-	 * is required to hold the mmap_sem in read mode.  We need the
-	 * anon_vma lock to serialize against concurrent expand_stacks.
-	 * Also guard against wrapping around to address 0.
-	 */
+	/* Guard against wrapping around to address 0. */
 	address &= PAGE_MASK;
 	address += PAGE_SIZE;
-	if (!address) {
-		vma_unlock_anon_vma(vma);
+	if (!address)
 		return -ENOMEM;
-	}
-	error = 0;
 
 	/* Enforce stack_guard_gap */
 	gap_addr = address + stack_guard_gap;
-	if (gap_addr < address) {
-		vma_unlock_anon_vma(vma);
+	if (gap_addr < address)
 		return -ENOMEM;
-	}
 	next = vma->vm_next;
 	if (next && next->vm_start < gap_addr) {
-		if (!(next->vm_flags & VM_GROWSUP)) {
-			vma_unlock_anon_vma(vma);
+		if (!(next->vm_flags & VM_GROWSUP))
 			return -ENOMEM;
-		}
 		/* Check that both stack segments have the same anon_vma? */
 	}
 
+	/* We must make sure the anon_vma is allocated. */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	/*
+	 * vma->vm_start/vm_end cannot change under us because the caller
+	 * is required to hold the mmap_sem in read mode.  We need the
+	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 */
+	vma_lock_anon_vma(vma);
+
 	/* Somebody else might have raced and expanded it already */
 	if (address > vma->vm_end) {
 		unsigned long size, grow;
@@ -2425,13 +2448,6 @@ int expand_downwards(struct vm_area_struct *vma,
 	unsigned long gap_addr;
 	int error;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
-	if (unlikely(anon_vma_prepare(vma)))
-		return -ENOMEM;
-
 	address &= PAGE_MASK;
 	error = security_mmap_addr(address);
 	if (error)
@@ -2439,22 +2455,26 @@ int expand_downwards(struct vm_area_struct *vma,
 
 	/* Enforce stack_guard_gap */
 	gap_addr = address - stack_guard_gap;
-	if (gap_addr > address) 
+	if (gap_addr > address)
 		return -ENOMEM;
-	
+
 	prev = vma->vm_prev;
 	if (prev && prev->vm_end > gap_addr) {
-		if (!(prev->vm_flags & VM_GROWSDOWN)) 
+		if (!(prev->vm_flags & VM_GROWSDOWN))
 			return -ENOMEM;
 		/* Check that both stack segments have the same anon_vma? */
 	}
 
-	vma_lock_anon_vma(vma);
+	/* We must make sure the anon_vma is allocated. */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode.  We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
 	 */
+	vma_lock_anon_vma(vma);
 
 	/* Somebody else might have raced and expanded it already */
 	if (address < vma->vm_start) {
@@ -2659,7 +2679,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = allocate_vma(mm, GFP_KERNEL);
 	if (!new)
 		goto out_err;
 
@@ -2710,7 +2730,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
  out_free_mpol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new);
+	free_vma(mm, new);
  out_err:
 	return err;
 }
@@ -2845,6 +2865,99 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 	return vm_munmap(addr, len);
 }
 
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long populate = 0;
+	unsigned long ret = -EINVAL;
+	struct file *file;
+
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+			"See Documentation/vm/remap_file_pages.txt.\n",
+			current->comm, current->pid);
+
+	if (prot)
+		return ret;
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	if (start + size <= start)
+		return ret;
+
+	/* Does pgoff wrap? */
+	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+		return ret;
+
+	down_write(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (start < vma->vm_start)
+		goto out;
+
+	if (start + size > vma->vm_end) {
+		struct vm_area_struct *next;
+
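+		/*
+		 * The range spans several vmas: they must form a contiguous
+		 * chain over the same file with identical flags.
+		 */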
+		for (next = vma->vm_next; next; next = next->vm_next) {
+			/* hole between vmas ? */
+			/* hole between vmas? */
+				goto out;
+
+			if (next->vm_file != vma->vm_file)
+				goto out;
+
+			if (next->vm_flags != vma->vm_flags)
+				goto out;
+
+			if (start + size <= next->vm_end)
+				break;
+		}
+
+		if (!next)
+			goto out;
+	}
+
+	prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+	prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+	prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+	flags &= MAP_NONBLOCK;
+	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+	if (vma->vm_flags & VM_LOCKED) {
+		struct vm_area_struct *tmp;
+		flags |= MAP_LOCKED;
+
+		/* drop PG_Mlocked flag for over-mapped range */
+		for (tmp = vma; tmp->vm_start >= start + size;
+				tmp = tmp->vm_next) {
+			munlock_vma_pages_range(tmp,
+					max(tmp->vm_start, start),
+					min(tmp->vm_end, start + size));
+		}
+	}
+
+	file = get_file(vma->vm_file);
+	ret = do_mmap_pgoff(vma->vm_file, start, size,
+			prot, flags, pgoff, &populate, NULL);
+	fput(file);
+out:
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(ret, populate);
+	if (!IS_ERR_VALUE(ret))
+		ret = 0;
+	return ret;
+}
+
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
 #ifdef CONFIG_DEBUG_VM
@@ -2860,7 +2973,8 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static unsigned long do_brk_flags(unsigned long addr, unsigned long len, struct list_head *uf, unsigned long flags)
+static unsigned long do_brk_flags(unsigned long addr, unsigned long len,
+	struct list_head *uf, unsigned long flags, int soft)
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma, * prev;
@@ -2890,7 +3004,7 @@ static unsigned long do_brk_flags(unsigned long addr, unsigned long len, struct
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			return -EAGAIN;
 	}
 
@@ -2917,8 +3031,11 @@ static unsigned long do_brk_flags(unsigned long addr, unsigned long len, struct
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	if (ub_memory_charge(mm, len, flags, NULL, soft))
+		goto fail_charge;
+
 	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
@@ -2929,11 +3046,9 @@ static unsigned long do_brk_flags(unsigned long addr, unsigned long len, struct
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!vma) {
-		vm_unacct_memory(len >> PAGE_SHIFT);
-		return -ENOMEM;
-	}
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
+	if (!vma)
+		goto fail_alloc;
 
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
@@ -2950,11 +3065,19 @@ static unsigned long do_brk_flags(unsigned long addr, unsigned long len, struct
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
+
+fail_alloc:
+	vm_unacct_memory(len >> PAGE_SHIFT);
+fail_sec:
+	ub_memory_uncharge(mm, len, flags, NULL);
+fail_charge:
+	return -ENOMEM;
 }
 
-static unsigned long do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
+static unsigned long do_brk(unsigned long addr, unsigned long len,
+			    struct list_head *uf, int soft)
 {
-	return do_brk_flags(addr, len, uf, 0);
+	return do_brk_flags(addr, len, uf, 0, soft);
 }
 
 unsigned long vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
@@ -2965,7 +3088,7 @@ unsigned long vm_brk_flags(unsigned long addr, unsigned long len, unsigned long
 	LIST_HEAD(uf);
 
 	down_write(&mm->mmap_sem);
-	ret = do_brk_flags(addr, len, &uf, flags);
+	ret = do_brk_flags(addr, len, &uf, flags, UB_SOFT);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
 	userfaultfd_unmap_complete(mm, &uf);
@@ -3120,7 +3243,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		}
 		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		new_vma = allocate_vma(mm, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			new_vma->vm_start = addr;
@@ -3146,7 +3269,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
  out_free_mempol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new_vma);
+	free_vma(mm, new_vma);
 	return NULL;
 }
 
@@ -3222,7 +3345,7 @@ int install_special_mapping(struct mm_struct *mm,
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (unlikely(vma == NULL))
 		return -ENOMEM;
 
@@ -3248,10 +3371,30 @@ int install_special_mapping(struct mm_struct *mm,
 	return 0;
 
 out:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 	return ret;
 }
 
+bool vma_is_vdso_or_vvar(const struct vm_area_struct *vma,
+		   const struct mm_struct *mm)
+{
+	/*
+	 * Because of uts name virtualization we can't tell whether an area
+	 * is VVAR/VDSO the same way as in mainline: vma->vm_private_data
+	 * is different, allocated in uts_prep_vdso_pages_locked().
+	 * Since install_special_mapping() is currently used only by
+	 * uprobes (besides vdso and vvar), check whether the special
+	 * mapping belongs to uprobes; if not, it is vdso/vvar.
+	 */
+	struct page *xol_page = NULL;
+
+	if (mm->uprobes_state.xol_area)
+		xol_page = mm->uprobes_state.xol_area->page;
+
+	return (vma->vm_ops == &special_mapping_vmops) &&
+		(vma->vm_private_data != xol_page);
+}
+
 static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
@@ -3306,8 +3449,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  *
  * mmap_sem in write mode is required in order to block all operations
  * that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
  * anon_vmas to be associated with existing vmas.
  *
  * A single task can't take more than one mm_take_all_locks() in a row
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -168,6 +168,22 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+			       unsigned long address)
+{
+	struct mmu_notifier *mn;
+	int young = 0, id;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, address);
+	}
+	srcu_read_unlock(&srcu, id);
+
+	return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
 			      unsigned long address)
 {
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,11 +8,13 @@
 #include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/module.h>
 
 struct pglist_data *first_online_pgdat(void)
 {
 	return NODE_DATA(first_online_node);
 }
+EXPORT_SYMBOL(first_online_pgdat);
 
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
@@ -22,6 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
+EXPORT_SYMBOL(next_online_pgdat);
 
 /*
  * next_zone - helper magic for for_each_zone()
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,11 +24,14 @@
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
 #include <linux/ksm.h>
+#include <linux/module.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/vmpages.h>
+
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
 		int dirty_accountable, int prot_numa)
@@ -91,7 +94,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			}
 			if (updated)
 				pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
+		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
@@ -267,6 +270,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		return 0;
 	}
 
+	error = -ENOMEM;
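+	/*
+	 * Switching to a UB-private mapping charges UB_PRIVVMPAGES up
+	 * front; the opposite transition is uncharged once the protection
+	 * change has succeeded (see below).
+	 */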
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file) &&
+	    charge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages, UB_SOFT))
+		goto fail_ch;
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
@@ -278,7 +287,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 						VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory_mm(mm, charged))
-				return -ENOMEM;
+				goto fail_sec;
 			newflags |= VM_ACCOUNT;
 		}
 	}
@@ -324,11 +333,21 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+
+	if (VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    !VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages);
+
 	perf_event_mmap(vma);
 	return 0;
 
 fail:
 	vm_unacct_memory(charged);
+fail_sec:
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages);
+fail_ch:
 	return error;
 }
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,8 @@
 #include <linux/mm-arch-hooks.h>
 #include <linux/userfaultfd_k.h>
 
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -84,8 +86,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 		pte = pte_mksoft_dirty(pte);
 	else if (is_swap_pte(pte))
 		pte = pte_swp_mksoft_dirty(pte);
-	else if (pte_file(pte))
-		pte = pte_file_mksoft_dirty(pte);
 #endif
 	return pte;
 }
@@ -253,12 +253,16 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	int err;
 	bool need_rmap_locks;
 
+	if (ub_memory_charge(mm, new_len, vm_flags,
+			     vma->vm_file, UB_HARD))
+		goto err;
+
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
 	 * which may split one vma into three before unmapping.
 	 */
 	if (mm->map_count >= sysctl_max_map_count - 3)
-		return -ENOMEM;
+		goto err_nomem;
 
 	/*
 	 * Advise KSM to break any KSM pages in the area to be moved:
@@ -270,13 +274,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
 						MADV_UNMERGEABLE, &vm_flags);
 	if (err)
-		return err;
+		goto err_nomem;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
 			   &need_rmap_locks);
 	if (!new_vma)
-		return -ENOMEM;
+		goto err_nomem;
 
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
@@ -348,7 +352,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		*locked = true;
 	}
 
-	return new_addr;
+	if (new_addr != -ENOMEM)
+		return new_addr;
+
+err_nomem:
+	ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
+err:
+	return -ENOMEM;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
@@ -384,7 +394,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 		locked = mm->locked_vm << PAGE_SHIFT;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		locked += new_len - old_len;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			goto Eagain;
 	}
 
@@ -559,10 +569,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	if (old_len == vma->vm_end - addr) {
 		/* can we just expand the current mapping? */
 		if (vma_expandable(vma, new_len - old_len)) {
-			int pages = (new_len - old_len) >> PAGE_SHIFT;
+			unsigned long len = (new_len - old_len);
+			int pages = len >> PAGE_SHIFT;
+
+			ret = -ENOMEM;
+			if (ub_memory_charge(mm, len, vma->vm_flags,
+						vma->vm_file, UB_HARD))
+				goto out;
 
 			if (vma_adjust(vma, vma->vm_start, addr + new_len,
 				       vma->vm_pgoff, NULL)) {
+				ub_memory_uncharge(mm, len,
+						vma->vm_flags, vma->vm_file);
 				ret = -ENOMEM;
 				goto out;
 			}
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -48,6 +48,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	if (end < start)
 		goto out;
 	error = 0;
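+	/* fsync may be disabled for this VE; report success in that case. */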
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	if (end == start)
 		goto out;
 	/*
@@ -86,10 +88,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 				(vma->vm_flags & VM_SHARED)) {
 			get_file(file);
 			up_read(&mm->mmap_sem);
-			if (vma->vm_flags & VM_NONLINEAR)
-				error = vfs_fsync(file, 1);
-			else
-				error = vfs_fsync_range(file, fstart, fend, 1);
+			error = vfs_fsync_range(file, fstart, fend, 1);
 			fput(file);
 			if (error || start >= end)
 				goto out;
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -282,6 +282,11 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+{
+	return __vmalloc(size, flags, PAGE_KERNEL);
+}
+
 void *vmalloc_user(unsigned long size)
 {
 	void *ret;
@@ -567,7 +572,7 @@ void __init mmap_init(void)
 
 	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
 	VM_BUG_ON(ret);
-	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
+	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
 }
 
 /*
@@ -2021,14 +2026,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	BUG();
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long addr, void *buf, int len, int write)
 {
--- /dev/null
+++ b/mm/oom_group.c
@@ -0,0 +1,226 @@
+/*
+ *  mm/oom_group.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/ctype.h>
+#include <linux/oom.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
+
+static LIST_HEAD(oom_group_list_head);
+static DEFINE_RWLOCK(oom_group_lock);
+
+struct oom_group_pattern {
+	char comm[TASK_COMM_LEN], pcomm[TASK_COMM_LEN];
+	int oom_uid;
+	int oom_score_adj;
+	struct list_head group_list;
+};
+
+static void oom_groups_append(struct list_head *list)
+{
+	write_lock_irq(&oom_group_lock);
+	list_splice_tail(list, &oom_group_list_head);
+	write_unlock_irq(&oom_group_lock);
+}
+
+static void oom_groups_reset(void)
+{
+	struct list_head list;
+	struct oom_group_pattern *gp, *tmp;
+
+	write_lock_irq(&oom_group_lock);
+	list_replace_init(&oom_group_list_head, &list);
+	write_unlock_irq(&oom_group_lock);
+
+	list_for_each_entry_safe(gp, tmp, &list, group_list)
+		kfree(gp);
+}
+
+/*
+ * If the mask ends with an asterisk, it matches any comm suffix:
+ * "foo" matches only "foo"; "foo*" matches "foo" and "foobar";
+ * "*" matches any string.
+ */
+static bool oom_match_comm(const char *comm, const char *mask)
+{
+	while (*comm && *mask != '*' && *comm == *mask) {
+		comm++;
+		mask++;
+	}
+	return (!*mask && !*comm) || (*mask == '*');
+}
+
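+/*
+ * Return the effective oom_score_adj for @t: if the user has not set an
+ * explicit adjustment, apply the first matching grouping pattern.
+ */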
+int get_task_oom_score_adj(struct task_struct *t)
+{
+	struct oom_group_pattern *gp;
+	unsigned long flags;
+	const struct cred *cred;
+	uid_t task_uid;
+	int adj = t->signal->oom_score_adj;
+
+	/* Do not impose grouping rules if the score is adjusted by the user */
+	if (adj != 0)
+		return adj;
+
+	rcu_read_lock();
+	cred = __task_cred(t);
+	task_uid = from_kuid_munged(cred->user_ns, cred->uid);
+	rcu_read_unlock();
+
+	read_lock_irqsave(&oom_group_lock, flags);
+	list_for_each_entry(gp, &oom_group_list_head, group_list) {
+		if (gp->oom_uid >= 0 && task_uid != gp->oom_uid)
+			continue;
+		if (gp->oom_uid < -1 && task_uid >= -gp->oom_uid)
+			continue;
+		if (!oom_match_comm(t->comm, gp->comm))
+			continue;
+		if (!oom_match_comm(t->parent->comm, gp->pcomm))
+			continue;
+		adj = gp->oom_score_adj;
+		break;
+	}
+	read_unlock_irqrestore(&oom_group_lock, flags);
+	return adj;
+}
+
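+/*
+ * Parse one pattern line of the form "<comm> <pcomm> <uid> <oom_score_adj>";
+ * comm and pcomm may end with '*' (see oom_match_comm above).
+ */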
+static int oom_group_parse_line(struct list_head *list, char *line)
+{
+	struct oom_group_pattern *gp;
+	char dummy;
+	int ret;
+
+	gp = kmalloc(sizeof(struct oom_group_pattern), GFP_KERNEL);
+	if (gp == NULL)
+		return -ENOMEM;
+
+	BUILD_BUG_ON(TASK_COMM_LEN != 16);
+	ret = sscanf(line, "%15s %15s %d %d %c",
+			gp->comm, gp->pcomm, &gp->oom_uid,
+			&gp->oom_score_adj, &dummy);
+
+	if (ret != 4 || gp->oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			gp->oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		kfree(gp);
+		return -EINVAL;
+	}
+
+	list_add_tail(&gp->group_list, list);
+
+	return 0;
+}
+
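+/*
+ * Append pattern lines written to the proc file to the global list.
+ * An incomplete trailing line is not consumed, so the caller can
+ * resubmit it together with the rest of its data.
+ */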
+static ssize_t oom_group_write(struct file * file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char *line, *next, *page;
+	int ret, len;
+	LIST_HEAD(groups);
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	len = min(count, PAGE_SIZE - 1);
+	ret = copy_from_user(page, buf, len);
+	if (ret)
+		goto err;
+
+	page[len] = '\0';
+
+	next = page;
+	while (1) {
+		line = skip_spaces(next);
+		next = strchr(line, '\n');
+		if (next) {
+			*next++ = '\0';
+		} else if (len < count) {
+			ret = line != page ? line - page : -EINVAL;
+			break;
+		}
+		if (*line && *line != '#') {
+			ret = oom_group_parse_line(&groups, line);
+			if (ret)
+				break;
+		}
+		if (!next) {
+			ret = len;
+			break;
+		}
+	}
+
+	oom_groups_append(&groups);
+err:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static void *oom_group_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_irq(&oom_group_lock);
+	return seq_list_start(&oom_group_list_head, *pos);
+}
+
+static void oom_group_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_irq(&oom_group_lock);
+}
+
+static int oom_group_seq_show(struct seq_file *s, void *v)
+{
+	struct list_head *entry = v;
+	struct oom_group_pattern *p;
+
+	p = list_entry(entry, struct oom_group_pattern, group_list);
+	seq_printf(s, "%s %s %d %d\n", p->comm, p->pcomm,
+			p->oom_uid, p->oom_score_adj);
+	return 0;
+}
+
+static void *oom_group_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &oom_group_list_head, pos);
+}
+
+static struct seq_operations oom_group_seq_ops = {
+	.start = oom_group_seq_start,
+	.next  = oom_group_seq_next,
+	.stop  = oom_group_seq_stop,
+	.show  = oom_group_seq_show,
+};
+
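+/* Opening the proc file with O_TRUNC drops all existing patterns. */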
+static int oom_group_seq_open(struct inode *inode, struct file *file)
+{
+	if (file->f_flags & O_TRUNC)
+		oom_groups_reset();
+	return seq_open(file, &oom_group_seq_ops);
+}
+
+static struct file_operations proc_oom_group_ops = {
+	.owner   = THIS_MODULE,
+	.open    = oom_group_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = oom_group_write,
+};
+
+static int __init oom_group_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	proc = proc_create("oom_score_adj", 0660,
+			   proc_vz_dir, &proc_oom_group_ops);
+	if (!proc)
+		return -ENOMEM;
+	return 0;
+}
+
+module_init(oom_group_init);
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,8 +41,47 @@
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
+
+static DEFINE_SPINLOCK(oom_context_lock);
+
+#define OOM_TIMEOUT	(5 * HZ)
+
+#define OOM_BASE_RAGE	-10
+#define OOM_MAX_RAGE	20
+
+struct oom_context global_oom_ctx = {
+	.rage		= OOM_BASE_RAGE,
+	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(global_oom_ctx.waitq),
+};
+
+void init_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	ctx->marked = false;
+	ctx->oom_start = 0;
+	ctx->oom_end = 0;
+	ctx->rage = OOM_BASE_RAGE;
+	init_waitqueue_head(&ctx->waitq);
+}
+
+static void __release_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	ctx->marked = false;
+	ctx->oom_end = jiffies;
+	wake_up_all(&ctx->waitq);
+}
+
+void release_oom_context(struct oom_context *ctx)
+{
+	spin_lock(&oom_context_lock);
+	__release_oom_context(ctx);
+	spin_unlock(&oom_context_lock);
+}
 
 #ifdef CONFIG_NUMA
 /**
@@ -137,6 +176,21 @@ static bool oom_unkillable_task(struct task_struct *p,
 	return false;
 }
 
+static unsigned long mm_overdraft(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+	unsigned long overdraft;
+
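+	/*
+	 * The overdraft is cached in the owning memcg's oom context by
+	 * oom_trylock() and used for victim selection in oom_badness().
+	 */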
+	memcg = get_mem_cgroup_from_mm(mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	overdraft = ctx->overdraft;
+	if (memcg)
+		mem_cgroup_put(memcg);
+
+	return overdraft;
+}
+
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
@@ -147,11 +201,15 @@ static bool oom_unkillable_task(struct task_struct *p,
  * task consuming the most memory to avoid subsequent oom failures.
  */
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+			  const nodemask_t *nodemask, unsigned long totalpages,
+			  unsigned long *overdraft)
 {
 	long points;
 	long adj;
 
+	if (overdraft)
+		*overdraft = 0;
+
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
 
@@ -159,7 +217,10 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
-	adj = (long)p->signal->oom_score_adj;
+	if (overdraft)
+		*overdraft = mm_overdraft(p->mm);
+
+	adj = get_task_oom_score_adj(p);
 	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
@@ -255,24 +316,21 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 #endif
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+					const nodemask_t *nodemask)
 {
-	if (task->exit_state)
-		return OOM_SCAN_CONTINUE;
 	if (oom_unkillable_task(task, NULL, nodemask))
 		return OOM_SCAN_CONTINUE;
 
 	/*
 	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves.
+	 * Try to select another one.
+	 *
+	 * This can only happen if oom_trylock() timed out, which most likely
+	 * means that the victim has deadlocked.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (unlikely(frozen(task)))
-			__thaw_task(task);
-		if (!force_kill)
-			return OOM_SCAN_ABORT;
-	}
+	if (test_tsk_thread_flag(task, TIF_MEMDIE))
+		return OOM_SCAN_CONTINUE;
+
 	if (!task->mm)
 		return OOM_SCAN_CONTINUE;
 
@@ -283,14 +341,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task->flags & PF_EXITING && !force_kill) {
-		/*
-		 * If this task is not being ptraced on exit, then wait for it
-		 * to finish before killing some other task unnecessarily.
-		 */
-		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
-			return OOM_SCAN_ABORT;
-	}
 	return OOM_SCAN_OK;
 }
 
@@ -300,43 +350,43 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+		unsigned long *poverdraft,
+		unsigned long totalpages, const nodemask_t *nodemask)
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
+	unsigned long max_overdraft = 0;
 
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
 		unsigned int points;
+		unsigned long overdraft;
 
-		switch (oom_scan_process_thread(p, totalpages, nodemask,
-						force_kill)) {
+		switch (oom_scan_process_thread(p, nodemask)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
+			max_overdraft = ULONG_MAX;
 			/* fall through */
 		case OOM_SCAN_CONTINUE:
 			continue;
-		case OOM_SCAN_ABORT:
-			rcu_read_unlock();
-			return ERR_PTR(-1UL);
 		case OOM_SCAN_OK:
 			break;
 		};
-		points = oom_badness(p, NULL, nodemask, totalpages);
-		if (points > chosen_points) {
+		points = oom_badness(p, NULL, nodemask, totalpages,
+				     &overdraft);
+		if (oom_worse(points, overdraft, &chosen_points,
+			      &max_overdraft))
 			chosen = p;
-			chosen_points = points;
-		}
 	}
 	if (chosen)
 		get_task_struct(chosen);
 	rcu_read_unlock();
 
-	*ppoints = chosen_points * 1000 / totalpages;
+	*ppoints = chosen_points;
+	*poverdraft = max_overdraft;
 	return chosen;
 }
 
@@ -402,13 +452,341 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 		dump_tasks(memcg, nodemask);
 }
 
+/**
+ * mark_oom_victim - mark the given task as OOM victim
+ * @tsk: task to mark
+ */
+void mark_oom_victim(struct task_struct *tsk)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+
+	set_tsk_thread_flag(tsk, TIF_MEMDIE);
+
+	/*
+	 * Make sure that the task is woken up from uninterruptible sleep
+	 * if it is frozen because OOM killer wouldn't be able to free
+	 * any memory and livelock. freezing_slow_path will tell the freezer
+	 * that TIF_MEMDIE tasks should be ignored.
+	 */
+	__thaw_task(tsk);
+
+	/*
+	 * Record the pointer to the victim in the oom context of the
+	 * owner memcg so that others can wait for it to exit. It will
+	 * be cleared in exit_oom_victim.
+	 */
+	memcg = get_mem_cgroup_from_mm(tsk->mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	spin_lock(&oom_context_lock);
+	if (!ctx->victim) {
+		ctx->victim = tsk;
+		ctx->marked = true;
+	}
+	spin_unlock(&oom_context_lock);
+	if (memcg)
+		mem_cgroup_put(memcg);
+}
+
+/**
+ * exit_oom_victim - note the exit of an OOM victim
+ */
+void exit_oom_victim(void)
+{
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+
+	clear_thread_flag(TIF_MEMDIE);
+
+	/*
+	 * Wake up every process waiting for this oom victim to exit.
+	 */
+	spin_lock(&oom_context_lock);
+	iter = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->victim != current)
+			continue;
+		if (!ctx->owner)
+			__release_oom_context(ctx);
+		else
+			/* To be released by owner (see oom_unlock) */
+			ctx->victim = NULL;
+	} while ((iter = mem_cgroup_iter(NULL, iter, NULL)));
+	spin_unlock(&oom_context_lock);
+}
+
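+/* Called with oom_context_lock held; drops the lock before sleeping. */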
+static void __wait_oom_context(struct oom_context *ctx)
+{
+	unsigned long now = jiffies;
+	unsigned long timeout;
+	DEFINE_WAIT(wait);
+
+	if (ctx->victim == current ||
+	    time_after_eq(now, ctx->oom_start + OOM_TIMEOUT)) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+	timeout = ctx->oom_start + OOM_TIMEOUT - now;
+	spin_unlock(&oom_context_lock);
+	schedule_timeout(timeout);
+	finish_wait(&ctx->waitq, &wait);
+}
+
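+/*
+ * Try to take ownership of the oom contexts of @memcg and all its
+ * descendants.  Returns false if another task is already selecting a
+ * victim or a victim is still exiting; in that case we wait for it
+ * (or for OOM_TIMEOUT) before giving up.
+ */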
+bool oom_trylock(struct mem_cgroup *memcg)
+{
+	unsigned long now = jiffies;
+	struct mem_cgroup *iter, *parent;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Check if oom context of memcg or any of its descendants is
+	 * active, i.e. if there is a process selecting a victim or a
+	 * victim dying. If there is, wait for it to finish, otherwise
+	 * proceed to oom.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if ((ctx->owner || ctx->victim) &&
+		    time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
+			__wait_oom_context(ctx);
+			mem_cgroup_iter_break(memcg, iter);
+			return false;
+		} else if (ctx->owner || ctx->victim) {
+			/*
+			 * Timeout. Release the context and dump stack
+			 * trace of the stuck process.
+			 *
+			 * To avoid dumping stack trace of the same task
+			 * more than once, we mark the context that
+			 * contained the victim when it was killed (see
+			 * mark_oom_victim).
+			 */
+			struct task_struct *p = ctx->victim;
+
+			if (p && ctx->marked) {
+				pr_err("OOM kill timeout: %d (%s)\n",
+				       task_pid_nr(p), p->comm);
+				show_stack(p, NULL);
+			}
+
+			__release_oom_context(ctx);
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Acquire oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner);
+		BUG_ON(ctx->victim);
+		ctx->owner = current;
+		ctx->oom_start = now;
+		/*
+		 * Update overdraft of each cgroup under us. This
+		 * information will be used in oom_badness.
+		 */
+		ctx->overdraft = mem_cgroup_overdraft(iter);
+		parent = iter ? parent_mem_cgroup(iter) : NULL;
+		if (parent && iter != memcg)
+			ctx->overdraft = max(ctx->overdraft,
+				mem_cgroup_oom_context(parent)->overdraft);
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	spin_unlock(&oom_context_lock);
+
+	return true;
+}
+
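+/*
+ * Release the oom contexts taken by oom_trylock().  If a victim was
+ * selected, propagate it up to the context that initiated oom and wait
+ * for it to exit before returning.
+ */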
+void oom_unlock(struct mem_cgroup *memcg)
+{
+	struct task_struct *victim = NULL;
+	struct mem_cgroup *iter, *victim_memcg = NULL;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Find oom victim if any.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner != current) {
+			/* Lost ownership on timeout */
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+		if (ctx->victim) {
+			victim = ctx->victim;
+			/*
+			 * Remember the victim memcg so that we can wait
+			 * on it for the victim to exit below.
+			 */
+			victim_memcg = iter;
+			if (iter)
+				mem_cgroup_get(iter);
+
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Propagate victim up to the context that initiated oom.
+	 */
+	for (iter = victim_memcg; iter; iter = parent_mem_cgroup(iter)) {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner != current);
+		if (!ctx->victim)
+			ctx->victim = victim;
+		if (iter == memcg)
+			break;
+	}
+
+	/*
+	 * Release oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner != current)
+			/* Lost ownership on timeout */
+			continue;
+		if (!ctx->victim)
+			/*
+			 * Victim already exited or nobody was killed in
+			 * this cgroup? It's our responsibility to wake
+			 * up blocked processes then.
+			 */
+			__release_oom_context(ctx);
+		else
+			/* To be released by victim (see exit_oom_victim) */
+			ctx->owner = NULL;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	if (!victim) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	/*
+	 * Wait for the victim to exit.
+	 */
+	ctx = mem_cgroup_oom_context(victim_memcg);
+	__wait_oom_context(ctx);
+	if (victim_memcg)
+		mem_cgroup_put(victim_memcg);
+}
+
+/*
+ * Kill more processes if oom happens too often in this context.
+ */
+static void oom_berserker(unsigned long points, unsigned long overdraft,
+			  unsigned long totalpages, struct mem_cgroup *memcg,
+			  nodemask_t *nodemask)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct oom_context *ctx;
+	struct task_struct *p;
+	int rage;
+	int killed = 0;
+
+	spin_lock(&oom_context_lock);
+	ctx = mem_cgroup_oom_context(memcg);
+	if (ctx->owner != current) {
+		/* Lost ownership on timeout */
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+	/*
+	 * Increase rage if oom happened recently in this context, reset
+	 * rage otherwise.
+	 *
+	 * previous oom                            this oom (unfinished)
+	 * ++++++++++++----------------------------++++++++
+	 *            ^                            ^
+	 *         oom_end  <<oom_relaxation>>  oom_start
+	 */
+	if (time_after(ctx->oom_start, ctx->oom_end + sysctl_oom_relaxation))
+		ctx->rage = OOM_BASE_RAGE;
+	else if (ctx->rage < OOM_MAX_RAGE)
+		ctx->rage++;
+	rage = ctx->rage;
+	spin_unlock(&oom_context_lock);
+
+	if (rage < 0)
+		return;
+
+	/*
+	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
+	 * as bad as the victim.
+	 */
+	qread_lock(&tasklist_lock);
+	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+		unsigned long tsk_points;
+		unsigned long tsk_overdraft;
+
+		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+		    fatal_signal_pending(p) || p->flags & PF_EXITING ||
+		    oom_unkillable_task(p, memcg, nodemask))
+			continue;
+
+		tsk_points = oom_badness(p, memcg, nodemask, totalpages,
+					 &tsk_overdraft);
+		if (tsk_overdraft < overdraft)
+			continue;
+
+		/*
+		 * oom_badness never returns a negative value; even if
+		 * oom_score_adj would make the badness negative, it
+		 * returns 1 instead.  So do not kill a task with badness 1
+		 * when the victim has badness > 1, to avoid killing
+		 * protected tasks.
+		 */
+		if (tsk_points <= 1 && points > 1)
+			continue;
+
+		/*
+		 * Consider tasks as equally bad if they occupy equal
+		 * percentage of available memory.
+		 */
+		if (tsk_points * 100 / totalpages <
+		    points * 100 / totalpages)
+			continue;
+
+		if (__ratelimit(&berserker_rs))
+			pr_err("Rage kill process %d (%s)\n",
+			       task_pid_nr(p), p->comm);
+
+		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		mem_cgroup_note_oom_kill(memcg, p);
+
+		if (++killed >= 1 << rage)
+			break;
+	}
+	qread_unlock(&tasklist_lock);
+
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
 void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-		      unsigned int points, unsigned long totalpages,
+		      unsigned long points, unsigned long overdraft,
+		      unsigned long totalpages,
 		      struct mem_cgroup *memcg, nodemask_t *nodemask,
 		      const char *message)
 {
@@ -424,19 +802,19 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
-	if (p->flags & PF_EXITING) {
-		set_tsk_thread_flag(p, TIF_MEMDIE);
-		put_task_struct(p);
-		return;
+	task_lock(p);
+	if (p->mm && p->flags & PF_EXITING) {
+		mark_oom_victim(p);
+		task_unlock(p);
+		goto out;
 	}
+	task_unlock(p);
 
 	if (__ratelimit(&oom_rs))
 		dump_header(p, gfp_mask, order, memcg, nodemask);
 
-	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-	task_unlock(p);
+	pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
+		message, task_pid_nr(p), p->comm, points * 1000 / totalpages);
 
 	/*
 	 * If any of p's children has a different mm and is eligible for kill,
@@ -451,11 +829,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 			if (child->mm == p->mm)
 				continue;
+			if (!child->mm ||
+			    test_tsk_thread_flag(child, TIF_MEMDIE))
+				continue;
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
 			child_points = oom_badness(child, memcg, nodemask,
-								totalpages);
+						   totalpages, NULL);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
 				victim = child;
@@ -468,8 +849,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
-		put_task_struct(victim);
-		return;
+		goto out;
 	} else if (victim != p) {
 		get_task_struct(p);
 		put_task_struct(victim);
@@ -478,11 +858,15 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	/* mm cannot safely be dereferenced after task_unlock(victim) */
 	mm = victim->mm;
-	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+	mark_oom_victim(victim);
+	rcu_read_lock();
+	pr_err("Killed process %d (%s) in VE \"%s\" total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(victim), victim->comm, task_ve_name(victim),
+		K(victim->mm->total_vm),
 		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
 		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
 		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+	rcu_read_unlock();
 	task_unlock(victim);
 
 	/*
@@ -501,17 +885,18 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
 				continue;
 
-			task_lock(p);	/* Protect ->comm from prctl() */
-			pr_err("Kill process %d (%s) sharing same memory\n",
-				task_pid_nr(p), p->comm);
-			task_unlock(p);
+			pr_err("Kill process %d (%s) in VE \"%s\" sharing same memory\n",
+				task_pid_nr(p), p->comm, task_ve_name(p));
 			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+			mem_cgroup_note_oom_kill(memcg, p);
 		}
 	rcu_read_unlock();
 
-	set_tsk_thread_flag(victim, TIF_MEMDIE);
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+	mem_cgroup_note_oom_kill(memcg, victim);
+out:
 	put_task_struct(victim);
+	oom_berserker(points, overdraft, totalpages, memcg, nodemask);
 }
 #undef K
 
@@ -551,63 +936,12 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-	int ret = 1;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		if (zone_is_oom_locked(zone)) {
-			ret = 0;
-			goto out;
-		}
-	}
-
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		/*
-		 * Lock each zone in the zonelist under zone_scan_lock so a
-		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
-		 * when it shouldn't.
-		 */
-		zone_set_flag(zone, ZONE_OOM_LOCKED);
-	}
-
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		zone_clear_flag(zone, ZONE_OOM_LOCKED);
-	}
-	spin_unlock(&zone_scan_lock);
-}
-
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
  * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
@@ -615,15 +949,15 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  * don't have to be perfect here, we just have to be good.
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *nodemask, bool force_kill)
+		   int order, nodemask_t *nodemask)
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned int uninitialized_var(points);
+	unsigned long uninitialized_var(points);
+	unsigned long uninitialized_var(overdraft);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	int killed = 0;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -634,9 +968,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
+	 *
+	 * But don't select if current has already released its mm and cleared
+	 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
 	 */
-	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+	if (current->mm &&
+	    (fatal_signal_pending(current) || current->flags & PF_EXITING)) {
+		mark_oom_victim(current);
 		return;
 	}
 
@@ -653,30 +991,21 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	    !oom_unkillable_task(current, NULL, nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
-				 nodemask,
+		oom_kill_process(current, gfp_mask, order, 0, 0, totalpages,
+				 NULL, nodemask,
 				 "Out of memory (oom_kill_allocating_task)");
-		goto out;
+		return;
 	}
 
-	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+	p = select_bad_process(&points, &overdraft, totalpages, mpol_mask);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
 		panic("Out of memory and no killable processes...\n");
-	}
-	if (PTR_ERR(p) != -1UL) {
-		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+	} else
+		oom_kill_process(p, gfp_mask, order, points, overdraft,
+				 totalpages, NULL,
 				 nodemask, "Out of memory");
-		killed = 1;
-	}
-out:
-	/*
-	 * Give the killed threads a good chance of exiting before trying to
-	 * allocate memory again.
-	 */
-	if (killed)
-		schedule_timeout_killable(1);
 }
 
 /*
@@ -686,14 +1015,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist;
-
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
-	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
-		out_of_memory(NULL, 0, 0, NULL, false);
-		clear_zonelist_oom(zonelist, GFP_KERNEL);
+	if (oom_trylock(NULL)) {
+		out_of_memory(NULL, 0, 0, NULL);
+		oom_unlock(NULL);
 	}
 }
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/virtinfo.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -525,6 +526,41 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned min_dirty)
+{
+	int ret = 0;
+
+	spin_lock_bh(&bdi_lock);
+	if (min_dirty > bdi->max_dirty_pages) {
+		ret = -EINVAL;
+	} else {
+		bdi->min_dirty_pages = min_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_min_dirty);
+
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned max_dirty)
+{
+	int ret = 0;
+
+	if (max_dirty > num_physpages)
+		return -EINVAL;
+
+	spin_lock_bh(&bdi_lock);
+	if (bdi->min_dirty_pages > max_dirty) {
+		ret = -EINVAL;
+	} else {
+		bdi->max_dirty_pages = max_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_dirty);
+
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 					   unsigned long bg_thresh)
 {
@@ -576,6 +612,12 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
 		bdi_dirty = dirty * bdi->max_ratio / 100;
 
+	if (bdi->min_dirty_pages && bdi_dirty < bdi->min_dirty_pages)
+		bdi_dirty = min((unsigned long)bdi->min_dirty_pages, dirty);
+
+	if (bdi->max_dirty_pages && bdi_dirty > bdi->max_dirty_pages)
+		bdi_dirty = bdi->max_dirty_pages;
+
 	return bdi_dirty;
 }
 
@@ -749,7 +791,8 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 		if (bdi_dirty >= bdi_thresh)
 			return 0;
 
-		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh,
+					thresh + 1);
 		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
 						     bdi_bg_thresh);
 
@@ -1087,12 +1130,15 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * of backing device (see the implementation of bdi_dirty_limit()).
 	 */
 	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		unsigned long bdi_bg_thresh;
+
+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+
 		dirty = bdi_dirty;
 		if (bdi_dirty < 8)
 			setpoint = bdi_dirty + 1;
 		else
-			setpoint = (bdi_thresh +
-				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+			setpoint = (bdi_thresh + bdi_bg_thresh) / 2;
 	}
 
 	if (dirty < setpoint) {
@@ -1324,9 +1370,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 	if (bdi_bg_thresh)
-		*bdi_bg_thresh = div_u64((u64)*bdi_thresh *
-					 background_thresh,
-					 dirty_thresh);
+		*bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
+							background_thresh,
+							dirty_thresh) : 0;
 
 	/*
 	 * In order to avoid the stacked BDI deadlock we need
@@ -1349,6 +1395,102 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	}
 }
 
+static void balance_dirty_pages_ub(struct address_space *mapping,
+				unsigned long write_chunk)
+{
+	long ub_dirty, ub_writeback;
+	long ub_thresh, ub_background_thresh;
+	unsigned long pages_written = 0;
+	unsigned long pause = 1;
+	struct user_beancounter *ub = get_io_ub();
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+
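+	/* The host beancounter (ub0) is not throttled by per-UB limits. */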
+	if (ub == get_ub0())
+		return;
+
+	for (;;) {
+		unsigned long nr_to_write = write_chunk - pages_written;
+
+		ub_dirty = ub_stat_get(ub, dirty_pages);
+		ub_writeback = ub_stat_get(ub, writeback_pages);
+
+		if (!ub_dirty_limits(&ub_background_thresh, &ub_thresh, ub))
+			break;
+
+		/*
+		 * Check thresholds and start background writeback
+		 * before throttling.
+		 */
+		if (ub_dirty + ub_writeback <= ub_thresh)
+			break;
+		if (!writeback_in_progress(bdi))
+			bdi_start_background_writeback(bdi);
+
+		/*
+		 * Throttle it only when the background writeback cannot
+		 * catch up. This avoids (excessively) small writeouts
+		 * when the bdi limits are ramping up.
+		 */
+		if (ub_dirty + ub_writeback <
+			(ub_background_thresh + ub_thresh) / 2)
+			break;
+
+		if (ub_dirty > ub_thresh) {
+			pages_written += writeback_inodes_wb(&bdi->wb,
+						nr_to_write,
+						WB_REASON_BACKGROUND, ub);
+			ub_dirty = ub_stat_get(ub, dirty_pages);
+			ub_writeback = ub_stat_get(ub, writeback_pages);
+		}
+
+		/* fixup ub-stat per-cpu drift to avoid false-positive */
+		if (ub_dirty + ub_writeback > ub_thresh &&
+		    ub_dirty + ub_writeback - ub_thresh <
+				    UB_STAT_BATCH * num_possible_cpus()) {
+			ub_dirty = ub_stat_get_exact(ub, dirty_pages);
+			ub_writeback = ub_stat_get_exact(ub, writeback_pages);
+		}
+
+		if (ub_dirty + ub_writeback <= ub_thresh)
+			break;
+
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
+		__set_current_state(TASK_KILLABLE);
+		io_schedule_timeout(pause);
+
+		/*
+		 * Increase the delay for each loop, up to our previous
+		 * default of taking a 100ms nap.
+		 */
+		pause <<= 1;
+		if (pause > HZ / 10)
+			pause = HZ / 10;
+
+		if (fatal_signal_pending(current))
+			break;
+	}
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+			       (void*)write_chunk);
+
+	/*
+	 * Even if this writeback is being filtered for another ub, it will
+	 * still write inodes belonging to this ub, because inode ub limits
+	 * are checked (via __ub_over_bground_thresh(ub)) during writeback.
+	 */
+	if (writeback_in_progress(bdi))
+		return;
+
+	/*
+	 * We start background writeout at the lower ub_background_thresh,
+	 * to keep the amount of dirty memory low.
+	 */
+	if (ub_dirty > ub_background_thresh)
+		bdi_start_background_writeback(bdi);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1544,6 +1686,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+			       (void*)pages_dirtied);
+
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -1644,8 +1789,10 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	}
 	preempt_enable();
 
-	if (unlikely(current->nr_dirtied >= ratelimit))
+	if (unlikely(current->nr_dirtied >= ratelimit)) {
+		balance_dirty_pages_ub(mapping, current->nr_dirtied);
 		balance_dirty_pages(mapping, current->nr_dirtied);
+	}
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
@@ -1928,6 +2075,8 @@ int write_cache_pages(struct address_space *mapping,
 
 			done_index = page->index;
 
+			virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 			lock_page(page);
 
 			/*
@@ -2129,7 +2278,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
-		task_io_account_write(PAGE_CACHE_SIZE);
+		task_io_account_dirty(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
 	}
@@ -2180,6 +2329,11 @@ int __set_page_dirty_nobuffers(struct page *page)
 			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					!radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_dirty(mapping);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		if (mapping->host) {
@@ -2268,6 +2422,18 @@ int set_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty);
 
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm)
+{
+	struct user_beancounter *old_ub;
+	int ret;
+
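+	/* Charge the dirtying to the beancounter of @mm, not the caller's. */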
+	old_ub = set_exec_ub(mm_ub(mm));
+	ret = set_page_dirty(page);
+	(void)set_exec_ub(old_ub);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_mm);
+
 /*
  * set_page_dirty() is racy if the caller has no reference against
  * page->mapping->host, and if the page is unlocked.  This is because another
@@ -2375,6 +2541,9 @@ int test_clear_page_writeback(struct page *page)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi)) {
+				if (radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_dec(mapping);
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
@@ -2405,13 +2574,23 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_account_writeback(bdi))
+			if (bdi_cap_account_writeback(bdi)) {
+				if (!radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_inc(mapping);
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+			}
 		}
-		if (!PageDirty(page))
+		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_clean(mapping);
+		}
 		if (!keep_write)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
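
The per-beancounter I/O accounting added above (ub_io_account_dirty/clean and ub_io_writeback_inc/dec are OpenVZ-specific helpers, and radix_tree_prev_tag_get() is assumed to return the tag's state prior to the most recent set or clear) charges and uncharges only when a page's DIRTY or WRITEBACK tag actually changes state, so the counters track the number of tagged pages rather than the number of tagging operations. A minimal sketch of that transition-based pattern, with a plain flag standing in for the radix-tree tag:

#include <stdbool.h>

struct io_stats {
	long dirty_pages;	/* pages currently tagged dirty */
};

/*
 * Adjust the counter only on 0 -> 1 and 1 -> 0 transitions of the tag,
 * mirroring the radix_tree_prev_tag_get() checks above: repeated
 * set_page_dirty() calls on an already-dirty page are not counted twice,
 * and clearing a tag that was never set does not underflow the counter.
 */
static void account_dirty_tag(struct io_stats *stats,
			      bool was_tagged, bool now_tagged)
{
	if (!was_tagged && now_tagged)
		stats->dirty_pages++;
	else if (was_tagged && !now_tagged)
		stats->dirty_pages--;
}
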
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
+#include <linux/kasan.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/pagevec.h>
@@ -62,6 +63,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/kthread.h>
+#include <linux/ve.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -124,6 +126,24 @@ unsigned long dirty_balance_reserve __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+static int zero_data_pages_enabled;
+struct static_key zero_free_pages __read_mostly = STATIC_KEY_INIT_FALSE;
+
+static int __init enable_zero_free_pages(char *__unused)
+{
+	zero_data_pages_enabled = 1;
+	return 1;
+}
+__setup("zero-free-pages", enable_zero_free_pages);
+
+static int __init setup_zero_free_pages(void)
+{
+	if (zero_data_pages_enabled)
+		static_key_slow_inc(&zero_free_pages);
+	return 0;
+}
+early_initcall(setup_zero_free_pages);
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
@@ -443,6 +463,8 @@ static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
 	/* Leave bad fields for debug, except PageBuddy could make trouble */
 	page_mapcount_reset(page); /* remove PageBuddy */
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
 }
 
 /*
@@ -770,7 +792,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int to_free = count;
 
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	while (to_free) {
@@ -819,7 +840,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 				int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
@@ -904,11 +924,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 
 	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
+	kasan_free_pages(page, order);
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	for (i = 0; i < (1 << order); i++)
+	memcg_kmem_uncharge_pages(page, order);
+	for (i = 0; i < (1 << order); i++) {
 		bad += free_pages_check(page + i);
+		if (static_key_false(&zero_free_pages))
+			clear_highpage(page + i);
+	}
 	if (bad)
 		return false;
 
@@ -1297,6 +1322,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	set_page_private(page, 0);
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
+	kasan_alloc_pages(page, order);
 
 	if (gfp_flags & __GFP_ZERO)
 		prep_zero_page(page, order, gfp_flags);
@@ -2580,10 +2606,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
-		schedule_timeout_uninterruptible(1);
+	if (!oom_trylock(NULL))
 		return NULL;
-	}
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -2615,10 +2639,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_unlock(NULL);
 	return page;
 }
 
@@ -2877,6 +2901,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+int alloc_fail_warn;
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -3082,6 +3108,36 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
+static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
+		struct page *page, u64 time)
+{
+#ifdef CONFIG_VE
+	unsigned long flags;
+	int ind, cpu;
+
+	time = jiffies_to_usecs(jiffies - time) * 1000;
+	if (!(gfp_mask & __GFP_WAIT))
+		ind = KSTAT_ALLOCSTAT_ATOMIC;
+	else if (!(gfp_mask & __GFP_HIGHMEM))
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_LOW_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_LOW;
+	else
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_HIGH_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_HIGH;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.alloc_lat[ind], cpu, time);
+	if (!page)
+		kstat_glob.alloc_fails[cpu][ind]++;
+	local_irq_restore(flags);
+#endif
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -3095,13 +3151,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
-	struct mem_cgroup *memcg = NULL;
+	cycles_t start;
 
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
+	WARN_ON_ONCE((gfp_mask & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
@@ -3114,13 +3172,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
-	/*
-	 * Will only have any effect when __GFP_KMEMCG is set.  This is
-	 * verified in the (always inline) callee
-	 */
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
@@ -3136,6 +3187,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		alloc_flags |= ALLOC_CMA;
 #endif
 retry:
+	start = jiffies;
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
@@ -3168,6 +3220,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 				preferred_zone, migratetype);
 	}
 
+	__alloc_collect_stats(gfp_mask, order, page, start);
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
 out:
@@ -3180,7 +3233,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 
-	memcg_kmem_commit_charge(page, memcg, order);
+	if (page && !memcg_kmem_newpage_charge(page, gfp_mask, order)) {
+		__free_pages(page, order);
+		return NULL;
+	}
 
 	return page;
 }
@@ -3332,62 +3388,6 @@ void __free_page_frag(void *addr)
 }
 EXPORT_SYMBOL(__free_page_frag);
 
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-	struct mem_cgroup *memcg = NULL;
-
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-	page = alloc_pages(gfp_mask, order);
-	memcg_kmem_commit_charge(page, memcg, order);
-	return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-	struct mem_cgroup *memcg = NULL;
-
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-	page = alloc_pages_node(nid, gfp_mask, order);
-	memcg_kmem_commit_charge(page, memcg, order);
-	return page;
-}
-
-/*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
- *
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
- *
- * The caller knows better which flags it relies on.
- */
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
-{
-	memcg_kmem_uncharge_pages(page, order);
-	__free_pages(page, order);
-}
-
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
-{
-	if (addr != 0) {
-		VM_BUG_ON(!virt_addr_valid((void *)addr));
-		__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
-	}
-}
-
 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
 {
 	if (addr) {
@@ -3525,6 +3525,10 @@ static inline void show_node(struct zone *zone)
 		printk("Node %d ", zone_to_nid(zone));
 }
 
+#ifdef CONFIG_TCACHE
+extern unsigned long get_nr_tcache_pages(void);
+#endif
+
 long si_mem_available(void)
 {
 	long available;
@@ -3562,6 +3566,10 @@ long si_mem_available(void)
 	available += global_page_state(NR_SLAB_RECLAIMABLE) -
 		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
 
+#ifdef CONFIG_TCACHE
+	available += get_nr_tcache_pages();
+#endif
+
 	if (available < 0)
 		available = 0;
 	return available;
@@ -3674,7 +3682,7 @@ void show_free_areas(unsigned int filter)
 
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
-		" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+		" unevictable:%lu dirty:%lu writeback:%lu wbtmp:%lu unstable:%lu\n"
 		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
 		" free:%lu free_pcp:%lu free_cma:%lu\n",
@@ -3687,6 +3695,7 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
+		global_page_state(NR_WRITEBACK_TEMP),
 		global_page_state(NR_UNSTABLE_NFS),
 		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
@@ -3772,7 +3781,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
-			(zone->all_unreclaimable ? "yes" : "no")
+			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6388,49 +6397,6 @@ void setup_per_zone_wmarks(void)
 	mutex_unlock(&zonelists_mutex);
 }
 
-/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total     target    max
- * memory    ratio     inactive anon
- * -------------------------------------
- *   10MB       1         5MB
- *  100MB       1        50MB
- *    1GB       3       250MB
- *   10GB      10       0.9GB
- *  100GB      31         3GB
- *    1TB     101        10GB
- *   10TB     320        32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
-	unsigned int gb, ratio;
-
-	/* Zone size in gigabytes */
-	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
-	if (gb)
-		ratio = int_sqrt(10 * gb);
-	else
-		ratio = 1;
-
-	zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
-	struct zone *zone;
-
-	for_each_zone(zone)
-		calculate_zone_inactive_ratio(zone);
-}
-
 /*
  * Initialise min_free_kbytes.
  *
@@ -6476,7 +6442,6 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
-	setup_per_zone_inactive_ratio();
 	return 0;
 }
 module_init(init_per_zone_wmark_min)
@@ -6849,9 +6814,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		if (!PageLRU(page))
 			found++;
 		/*
-		 * If there are RECLAIMABLE pages, we need to check it.
-		 * But now, memory offline itself doesn't call shrink_slab()
-		 * and it still to be fixed.
+		 * If there are RECLAIMABLE pages, we need to check
+		 * them.  But memory offline itself doesn't call
+		 * shrink_node_slabs() yet, and that still needs to be fixed.
 		 */
 		/*
 		 * If the page is not RAM, page_count()should be 0.
@@ -7240,6 +7205,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{1UL << PG_compound_lock,	"compound_lock"	},
 #endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+	{1UL << PG_young,		"young"		},
+	{1UL << PG_idle,		"idle"		},
+#endif
 };
 
 static void dump_page_flags(unsigned long flags)
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,229 @@
+/*
+ *  mm/page_idle.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/ksm.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE	sizeof(u64)
+#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+	struct page *page;
+	struct zone *zone;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+	if (!page || !PageLRU(page) ||
+	    !get_page_unless_zero(page))
+		return NULL;
+
+	zone = page_zone(page);
+	spin_lock_irq(&zone->lru_lock);
+	if (unlikely(!PageLRU(page))) {
+		put_page(page);
+		page = NULL;
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+					struct vm_area_struct *vma,
+					unsigned long addr, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool referenced = false;
+
+	if (unlikely(PageTransHuge(page))) {
+		pmd = page_check_address_pmd(page, mm, addr,
+					     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+		if (pmd) {
+			referenced = pmdp_clear_young_notify(vma, addr, pmd);
+			spin_unlock(ptl);
+		}
+	} else {
+		pte = page_check_address(page, mm, addr, &ptl, 0);
+		if (pte) {
+			referenced = ptep_clear_young_notify(vma, addr, pte);
+			pte_unmap_unlock(pte, ptl);
+		}
+	}
+	if (referenced) {
+		clear_page_idle(page);
+		/*
+		 * We cleared the referenced bit in a mapping to this page. To
+		 * avoid interference with page reclaim, mark it young so that
+		 * page_referenced() will return > 0.
+		 */
+		set_page_young(page);
+	}
+	return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+	/*
+	 * Since rwc.arg is unused, rwc is effectively immutable, so we
+	 * can make it static const to save some cycles and stack.
+	 */
+	static const struct rmap_walk_control rwc = {
+		.rmap_one = page_idle_clear_pte_refs_one,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+	bool need_lock;
+
+	if (!page_mapped(page) ||
+	    !page_rmapping(page))
+		return;
+
+	need_lock = !PageAnon(page) || PageKsm(page);
+	if (need_lock && !trylock_page(page))
+		return;
+
+	rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+	if (need_lock)
+		unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+				     struct bin_attribute *attr, char *buf,
+				     loff_t pos, size_t count)
+{
+	u64 *out = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return 0;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if (!bit)
+			*out = 0ULL;
+		page = page_idle_get_page(pfn);
+		if (page) {
+			if (page_is_idle(page)) {
+				/*
+				 * The page might have been referenced via a
+				 * pte, in which case it is not idle. Clear
+				 * refs and recheck.
+				 */
+				page_idle_clear_pte_refs(page);
+				if (page_is_idle(page))
+					*out |= 1ULL << bit;
+			}
+			put_page(page);
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			out++;
+		cond_resched();
+	}
+	return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+				      struct bin_attribute *attr, char *buf,
+				      loff_t pos, size_t count)
+{
+	const u64 *in = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return -ENXIO;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if ((*in >> bit) & 1) {
+			page = page_idle_get_page(pfn);
+			if (page) {
+				page_idle_clear_pte_refs(page);
+				set_page_idle(page);
+				put_page(page);
+			}
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			in++;
+		cond_resched();
+	}
+	return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+		__BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+			   page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+	&page_idle_bitmap_attr,
+	NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+	.bin_attrs = page_idle_bin_attrs,
+	.name = "page_idle",
+};
+
+static int __init page_idle_init(void)
+{
+	int err;
+
+	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+	if (err) {
+		pr_err("page_idle: register sysfs failed\n");
+		return err;
+	}
+	return 0;
+}
+subsys_initcall(page_idle_init);
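
The bin attribute registered above appears as /sys/kernel/mm/page_idle/bitmap (root only, given S_IRUSR | S_IWUSR). Each bit maps to one page frame, and every read and write must be a multiple of 8 bytes. A minimal user-space sketch of the intended usage; the frame range is an arbitrary example, and frames that are not LRU user pages are simply skipped by the kernel:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define BITMAP_PATH "/sys/kernel/mm/page_idle/bitmap"

int main(void)
{
	unsigned long start_pfn = 0x10000;	/* arbitrary example range */
	uint64_t buf[4];			/* 4 chunks = 256 frames */
	off_t off = start_pfn / 64 * sizeof(uint64_t);
	unsigned int i;
	int fd;

	fd = open(BITMAP_PATH, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Mark every frame in the range idle: write all-ones chunks. */
	for (i = 0; i < 4; i++)
		buf[i] = ~0ULL;
	if (pwrite(fd, buf, sizeof(buf), off) != (ssize_t)sizeof(buf)) {
		perror("pwrite");
		return 1;
	}

	sleep(10);	/* let the workload run for a while */

	/* Re-read: a bit still set means the frame was not referenced. */
	if (pread(fd, buf, sizeof(buf), off) <= 0) {
		perror("pread");
		return 1;
	}
	for (i = 0; i < 4; i++)
		printf("pfn %#lx..%#lx idle mask %016llx\n",
		       start_pfn + i * 64, start_pfn + i * 64 + 63,
		       (unsigned long long)buf[i]);

	close(fd);
	return 0;
}

A bit that is still set after the second read means no mapping of that frame was referenced in between, which is exactly what the page_idle_clear_pte_refs() recheck in page_idle_bitmap_read() guarantees.
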
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -24,12 +24,14 @@
 #include <linux/blkdev.h>
 #include <asm/pgtable.h>
 
+static struct bio_set *swap_bio_set;
+
 static struct bio *get_swap_bio(gfp_t gfp_flags,
 				struct page *page, bio_end_io_t end_io)
 {
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, 1);
+	bio = bio_alloc_bioset(gfp_flags, 1, swap_bio_set);
 	if (bio) {
 		bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
 		bio->bi_sector <<= PAGE_SHIFT - 9;
@@ -336,3 +338,12 @@ int swap_set_page_dirty(struct page *page)
 		return __set_page_dirty_no_writeback(page);
 	}
 }
+
+static int __init swap_init(void)
+{
+	swap_bio_set = bioset_create(SWAP_CLUSTER_MAX, 0);
+	if (!swap_bio_set)
+		panic("can't allocate swap_bio_set\n");
+	return 0;
+}
+late_initcall(swap_init);
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -20,6 +20,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/virtinfo.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -118,6 +119,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 
 	blk_start_plug(&plug);
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
@@ -514,6 +517,10 @@ void page_cache_sync_readahead(struct address_space *mapping,
 		return;
 	}
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
 }
@@ -558,6 +565,10 @@ page_cache_async_readahead(struct address_space *mapping,
 	if (bdi_read_congested(mapping->backing_dev_info))
 		return;
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -61,6 +61,7 @@
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
 #include <linux/memremap.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -113,6 +114,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	 * LOCK should suffice since the actual taking of the lock must
 	 * happen _before_ what follows.
 	 */
+	might_sleep();
 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
 		anon_vma_lock_write(anon_vma);
 		anon_vma_unlock_write(anon_vma);
@@ -428,8 +430,10 @@ static void anon_vma_ctor(void *data)
 void __init anon_vma_init(void)
 {
 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
-	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+			anon_vma_ctor);
+	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+			SLAB_PANIC|SLAB_ACCOUNT);
 }
 
 /*
@@ -481,8 +485,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
 	 * above cannot corrupt).
 	 */
 	if (!page_mapped(page)) {
+		rcu_read_unlock();
 		put_anon_vma(anon_vma);
-		anon_vma = NULL;
+		return NULL;
 	}
 out:
 	rcu_read_unlock();
@@ -532,9 +537,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
 	}
 
 	if (!page_mapped(page)) {
+		rcu_read_unlock();
 		put_anon_vma(anon_vma);
-		anon_vma = NULL;
-		goto out;
+		return NULL;
 	}
 
 	/* we pinned the anon_vma, its safe to sleep */
@@ -706,9 +711,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 		if (!vma->anon_vma || !page__anon_vma ||
 		    vma->anon_vma->root != page__anon_vma->root)
 			return -EFAULT;
-	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-		if (!vma->vm_file ||
-		    vma->vm_file->f_mapping != page->mapping)
+	} else if (page->mapping) {
+		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
 	} else
 		return -EFAULT;
@@ -887,6 +891,11 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
+	if (referenced)
+		clear_page_idle(page);
+	if (test_and_clear_page_young(page))
+		referenced++;
+
 	(*mapcount)--;
 
 	if (referenced)
@@ -949,7 +958,7 @@ static int page_referenced_file(struct page *page,
 				unsigned long *vm_flags)
 {
 	unsigned int mapcount;
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page->mapping, *peer;
 	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int referenced = 0;
@@ -969,7 +978,7 @@ static int page_referenced_file(struct page *page,
 	 */
 	BUG_ON(!PageLocked(page));
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
 
 	/*
 	 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
@@ -989,9 +998,36 @@ static int page_referenced_file(struct page *page,
 		referenced += page_referenced_one(page, vma, address,
 						  &mapcount, vm_flags);
 		if (!mapcount)
-			break;
+			goto out;
 	}
 
+	/* Does page belong to pfcache mapping? */
+	if (!mapping->i_peer_file ||
+	    mapping->i_peer_file->f_mapping != mapping)
+		goto out;
+
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		if (!mapping_mapped(peer))
+			continue;
+
+		mutex_lock(&peer->i_mmap_mutex);
+
+		vma_interval_tree_foreach(vma, &peer->i_mmap, pgoff, pgoff) {
+			unsigned long address = vma_address(page, vma);
+			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+				continue;
+			referenced += page_referenced_one(page, vma, address,
+							  &mapcount, vm_flags);
+			if (!mapcount)
+				break;
+		}
+
+		mutex_unlock(&peer->i_mmap_mutex);
+
+		if (!mapcount)
+			goto out;
+	}
+out:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return referenced;
 }
@@ -1251,12 +1287,6 @@ void page_add_new_anon_rmap(struct page *page,
 	else
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__page_set_anon_rmap(page, vma, address, 1);
-	if (!mlocked_vma_newpage(vma, page)) {
-		SetPageActive(page);
-		if (!is_zone_device_page(page))
-			lru_cache_add(page);
-	} else if (!is_zone_device_page(page))
-		add_page_to_unevictable_list(page);
 }
 
 /**
@@ -1265,7 +1295,7 @@ void page_add_new_anon_rmap(struct page *page,
  *
  * The caller needs to hold the pte lock.
  */
-void page_add_file_rmap(struct page *page)
+void page_add_file_rmap(struct page *page, struct mm_struct *mm)
 {
 	bool locked;
 	unsigned long flags;
@@ -1309,7 +1339,6 @@ void page_remove_rmap(struct page *page)
 	if (unlikely(PageHuge(page)))
 		goto out;
 	if (anon) {
-		mem_cgroup_uncharge_page(page);
 		if (!PageTransHuge(page))
 			__dec_zone_page_state(page, NR_ANON_PAGES);
 		else
@@ -1367,7 +1396,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			goto out;
 
 		pteval = ptep_get_and_clear(mm, address, pte);
-		if (pte_present(pteval) || pte_none(pteval) || pte_file(pteval)) {
+		if (pte_present(pteval) || pte_none(pteval)) {
 			set_pte_at(mm, address, pte, pteval);
 			goto out_unmap;
 		}
@@ -1438,7 +1467,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
-		set_page_dirty(page);
+		set_page_dirty_mm(page, mm);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
@@ -1483,7 +1512,6 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		if (pte_soft_dirty(pteval))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
 		set_pte_at(mm, address, pte, swp_pte);
-		BUG_ON(pte_file(*pte));
 	} else if (IS_ENABLED(CONFIG_MIGRATION) &&
 		   (TTU_ACTION(flags) == TTU_MIGRATION)) {
 		/* Establish migration entry for a file page */
@@ -1525,133 +1553,6 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	return ret;
 }
 
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs.  The ->vm_private_data field
- * holds the current cursor into that scan.  Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well.   Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster.  In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
- * rather than unmapping them.  If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
-		struct vm_area_struct *vma, struct page *check_page)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
-	struct page *page;
-	unsigned long address;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	unsigned long end;
-	int ret = SWAP_AGAIN;
-	int locked_vma = 0;
-
-	address = (vma->vm_start + cursor) & CLUSTER_MASK;
-	end = address + CLUSTER_SIZE;
-	if (address < vma->vm_start)
-		address = vma->vm_start;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
-		return ret;
-
-	mmun_start = address;
-	mmun_end   = end;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
-	/*
-	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
-	 * keep the sem while scanning the cluster for mlocking pages.
-	 */
-	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-		locked_vma = (vma->vm_flags & VM_LOCKED);
-		if (!locked_vma)
-			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
-	}
-
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
-	for (; address < end; pte++, address += PAGE_SIZE) {
-		if (!pte_present(*pte))
-			continue;
-		page = vm_normal_page(vma, address, *pte);
-		BUG_ON(!page || PageAnon(page));
-
-		if (locked_vma) {
-			if (page == check_page) {
-				/* we know we have check_page locked */
-				mlock_vma_page(page);
-				ret = SWAP_MLOCK;
-			} else if (trylock_page(page)) {
-				/*
-				 * If we can lock the page, perform mlock.
-				 * Otherwise leave the page alone, it will be
-				 * eventually encountered again later.
-				 */
-				mlock_vma_page(page);
-				unlock_page(page);
-			}
-			continue;	/* don't unmap */
-		}
-
-		if (ptep_clear_flush_young_notify(vma, address, pte))
-			continue;
-
-		/* Nuke the page table entry. */
-		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush_notify(vma, address, pte);
-
-		/* If nonlinear, store the file page offset in the pte. */
-		if (page->index != linear_page_index(vma, address)) {
-			pte_t ptfile = pgoff_to_pte(page->index);
-			if (pte_soft_dirty(pteval))
-				ptfile = pte_file_mksoft_dirty(ptfile);
-			set_pte_at(mm, address, pte, ptfile);
-		}
-
-		/* Move the dirty bit to the physical page now the pte is gone. */
-		if (pte_dirty(pteval))
-			set_page_dirty(page);
-
-		page_remove_rmap(page);
-		page_cache_release(page);
-		dec_mm_counter(mm, mm_counter_file(page));
-		(*mapcount)--;
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	if (locked_vma)
-		up_read(&vma->vm_mm->mmap_sem);
-	return ret;
-}
-
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
 {
 	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1735,18 +1636,13 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_mapping(struct page *page,
+		struct address_space *mapping, enum ttu_flags flags)
 {
-	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
-	unsigned long cursor;
-	unsigned long max_nl_cursor = 0;
-	unsigned long max_nl_size = 0;
-	unsigned int mapcount;
 
-	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		ret = try_to_unmap_one(page, vma, address, flags);
@@ -1754,75 +1650,39 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 			goto out;
 	}
 
-	if (list_empty(&mapping->i_mmap_nonlinear))
-		goto out;
+out:
+	return ret;
+}
 
-	/*
-	 * We don't bother to try to find the munlocked page in nonlinears.
-	 * It's costly. Instead, later, page reclaim logic may call
-	 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
-	 */
-	if (TTU_ACTION(flags) == TTU_MUNLOCK)
-		goto out;
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+{
+	struct address_space *mapping = page->mapping, *peer;
+	int ret;
 
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-							shared.nonlinear) {
-		cursor = (unsigned long) vma->vm_private_data;
-		if (cursor > max_nl_cursor)
-			max_nl_cursor = cursor;
-		cursor = vma->vm_end - vma->vm_start;
-		if (cursor > max_nl_size)
-			max_nl_size = cursor;
-	}
+	mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
 
-	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
-		ret = SWAP_FAIL;
+	ret = try_to_unmap_mapping(page, mapping, flags);
+	if (ret != SWAP_AGAIN || !page_mapped(page))
 		goto out;
-	}
 
-	/*
-	 * We don't try to search for this page in the nonlinear vmas,
-	 * and page_referenced wouldn't have found it anyway.  Instead
-	 * just walk the nonlinear vmas trying to age and unmap some.
-	 * The mapcount of the page we came in with is irrelevant,
-	 * but even so use it as a guide to how hard we should try?
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
+	/* Does page belong to pfcache mapping? */
+	if (!mapping->i_peer_file ||
+	    mapping->i_peer_file->f_mapping != mapping)
 		goto out;
-	cond_resched();
-
-	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
-	if (max_nl_cursor == 0)
-		max_nl_cursor = CLUSTER_SIZE;
-
-	do {
-		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-							shared.nonlinear) {
-			cursor = (unsigned long) vma->vm_private_data;
-			while ( cursor < max_nl_cursor &&
-				cursor < vma->vm_end - vma->vm_start) {
-				if (try_to_unmap_cluster(cursor, &mapcount,
-						vma, page) == SWAP_MLOCK)
-					ret = SWAP_MLOCK;
-				cursor += CLUSTER_SIZE;
-				vma->vm_private_data = (void *) cursor;
-				if ((int)mapcount <= 0)
-					goto out;
-			}
-			vma->vm_private_data = (void *) max_nl_cursor;
-		}
-		cond_resched();
-		max_nl_cursor += CLUSTER_SIZE;
-	} while (max_nl_cursor <= max_nl_size);
 
 	/*
-	 * Don't loop forever (perhaps all the remaining pages are
-	 * in locked vmas).  Reset cursor on all unreserved nonlinear
-	 * vmas, now forgetting on which ones it had fallen behind.
+	 * Handle TTU_MIGRATION like TTU_UNMAP, without migration ptes.
 	 */
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
-		vma->vm_private_data = NULL;
+	if (TTU_ACTION(flags) != TTU_MUNLOCK)
+		flags = TTU_UNMAP | (flags & ~TTU_ACTION_MASK);
+
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		mutex_lock(&peer->i_mmap_mutex);
+		ret = try_to_unmap_mapping(page, peer, flags);
+		mutex_unlock(&peer->i_mmap_mutex);
+		if (ret != SWAP_AGAIN || !page_mapped(page))
+			break;
+	}
 out:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return ret;
@@ -1897,17 +1757,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
 }
 
 #ifdef CONFIG_MIGRATION
-/*
- * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
- * Called by migrate.c to remove migration ptes, but might be used more later.
- */
-static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+					struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff;
-	struct anon_vma_chain *avc;
-	int ret = SWAP_AGAIN;
+
+	if (rwc->anon_lock)
+		return rwc->anon_lock(page);
 
 	/*
 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1917,58 +1773,91 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	 */
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
-		return ret;
+		return NULL;
+
 	anon_vma_lock_read(anon_vma);
+	return anon_vma;
+}
+
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct anon_vma *anon_vma;
+	pgoff_t pgoff;
+	struct anon_vma_chain *avc;
+	int ret = SWAP_AGAIN;
+
+	anon_vma = rmap_walk_anon_lock(page, rwc);
+	if (!anon_vma)
+		return ret;
+
 	pgoff = page_to_pgoff(page);
+
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
-		ret = rmap_one(page, vma, address, arg);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
 		if (ret != SWAP_AGAIN)
 			break;
+		if (rwc->done && rwc->done(page))
+			break;
 	}
 	anon_vma_unlock_read(anon_vma);
 	return ret;
 }
 
-static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_mutex.
+	 */
+	VM_BUG_ON(!PageLocked(page));
+
 	if (!mapping)
 		return ret;
 	pgoff = page_to_pgoff(page);
 	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
-		ret = rmap_one(page, vma, address, arg);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
 		if (ret != SWAP_AGAIN)
-			break;
+			goto done;
+		if (rwc->done && rwc->done(page))
+			goto done;
 	}
-	/*
-	 * No nonlinear handling: being always shared, nonlinear vmas
-	 * never contain migration ptes.  Decide what to do about this
-	 * limitation to linear when we need rmap_walk() on nonlinear.
-	 */
+
+done:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return ret;
 }
 
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
 {
-	VM_BUG_ON(!PageLocked(page));
-
 	if (unlikely(PageKsm(page)))
-		return rmap_walk_ksm(page, rmap_one, arg);
+		return rmap_walk_ksm(page, rwc);
 	else if (PageAnon(page))
-		return rmap_walk_anon(page, rmap_one, arg);
+		return rmap_walk_anon(page, rwc);
 	else
-		return rmap_walk_file(page, rmap_one, arg);
+		return rmap_walk_file(page, rwc);
 }
 #endif /* CONFIG_MIGRATION */
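
With this change rmap_walk() takes a struct rmap_walk_control instead of a bare callback, which is what allows mm/page_idle.c above to reuse it outside of migration. A minimal sketch of a walker built on the new interface; the helper names are illustrative only, and for file pages the page must be locked, as the VM_BUG_ON in rmap_walk_file() enforces:

/* Illustrative only: count how many VMAs map a given page. */
static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	int *nr = arg;

	(*nr)++;
	return SWAP_AGAIN;	/* keep walking the remaining mappings */
}

static int count_page_mappings(struct page *page)
{
	int nr = 0;
	struct rmap_walk_control rwc = {
		.rmap_one	= count_one_mapping,
		.arg		= &nr,
		/* .done, .invalid_vma and .anon_lock are optional */
	};

	rmap_walk(page, &rwc);
	return nr;
}
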
 
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -68,9 +68,14 @@ static struct vfsmount *shm_mnt;
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/magic.h>
+#include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/rmap.h>
+#include <uapi/linux/memfd.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -107,14 +112,24 @@ enum sgp_type {
 };
 
 #ifdef CONFIG_TMPFS
+static unsigned long tmpfs_ram_pages(void)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	if (ub == get_ub0())
+		return totalram_pages;
+
+	return min(totalram_pages, ub_total_pages(ub, false));
+}
+
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return tmpfs_ram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2);
 }
 #endif
 
@@ -146,16 +161,67 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
  * consistent with the pre-accounting of private mappings ...
  */
-static inline int shmem_acct_size(unsigned long flags, loff_t size)
+static inline int shmem_acct_size(unsigned long flags, loff_t size,
+				  struct user_beancounter *ub)
 {
-	return (flags & VM_NORESERVE) ?
-		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
+	long pages = VM_ACCT(size);
+	int ret;
+
+	if (flags & VM_NORESERVE)
+		return 0;
+
+	ret = charge_beancounter(ub, UB_SHMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_shm;
+
+	ret = charge_beancounter_fast(ub, UB_PRIVVMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_privvm;
+
+	ret = security_vm_enough_memory_mm(current->mm, pages);
+	if (ret)
+		goto no_vm;
+#ifdef CONFIG_VE
+	/*
+	 * Inside a container the maximum number of shared pages
+	 * available is limited by @max_blocks, so make sure there is
+	 * space left; otherwise users would wonder why their
+	 * applications get VM_FAULT_SIGBUS once the pool is exceeded.
+	 */
+	if (!ve_is_super(get_exec_env())) {
+		struct shmem_sb_info *sbinfo = SHMEM_SB(shm_mnt->mnt_sb);
+
+		if (sbinfo->max_blocks) {
+			if (sbinfo->max_blocks < pages ||
+			    percpu_counter_compare(&sbinfo->used_blocks,
+						   sbinfo->max_blocks - pages) > 0) {
+				ret = -ENOSPC;
+				goto no_vm;
+			}
+		}
+	}
+#endif
+
+	return 0;
+
+no_vm:
+	uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+no_privvm:
+	uncharge_beancounter(ub, UB_SHMPAGES, pages);
+no_shm:
+	return ret;
 }
 
-static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+static inline void shmem_unacct_size(unsigned long flags, loff_t size,
+				     struct user_beancounter *ub)
 {
-	if (!(flags & VM_NORESERVE))
-		vm_unacct_memory(VM_ACCT(size));
+	long pages = VM_ACCT(size);
+
+	if (!(flags & VM_NORESERVE)) {
+		vm_unacct_memory(pages);
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+		uncharge_beancounter(ub, UB_SHMPAGES, pages);
+	}
 }
 
 /*
@@ -164,16 +230,16 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
-static inline int shmem_acct_block(unsigned long flags)
+static inline int shmem_acct_block(struct shmem_inode_info *info)
 {
-	return (flags & VM_NORESERVE) ?
-		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+	return shmem_acct_size(info->flags ^ VM_NORESERVE,
+			       PAGE_CACHE_SIZE, info->shmi_ub);
 }
 
-static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+static inline void shmem_unacct_blocks(struct shmem_inode_info *info, long pages)
 {
-	if (flags & VM_NORESERVE)
-		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+	shmem_unacct_size(info->flags ^ VM_NORESERVE,
+			  pages << PAGE_SHIFT, info->shmi_ub);
 }
 
 static const struct super_operations shmem_ops;
@@ -246,7 +312,7 @@ static void shmem_recalc_inode(struct inode *inode)
 			percpu_counter_add(&sbinfo->used_blocks, -freed);
 		info->alloced -= freed;
 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
-		shmem_unacct_blocks(info->flags, freed);
+		shmem_unacct_blocks(info, freed);
 	}
 }
 
@@ -501,7 +567,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			pvec.pages, indices);
 		if (!pvec.nr)
 			break;
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -529,7 +594,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -577,7 +641,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			index = start;
 			continue;
 		}
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -613,7 +676,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		index++;
 	}
 
@@ -675,7 +737,7 @@ static void shmem_evict_inode(struct inode *inode)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
-		shmem_unacct_size(info->flags, inode->i_size);
+		shmem_unacct_size(info->flags, inode->i_size, info->shmi_ub);
 		inode->i_size = 0;
 		shmem_truncate_range(inode, 0, (loff_t)-1);
 		if (!list_empty(&info->swaplist)) {
@@ -707,7 +769,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
 	if (index == -1)
-		return 0;
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
 
 	/*
 	 * Move _head_ to start search for next from here.
@@ -766,7 +828,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 			spin_unlock(&info->lock);
 			swap_free(swap);
 		}
-		error = 1;	/* not an error, but entry was found */
 	}
 	return error;
 }
@@ -778,7 +839,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 {
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
-	int found = 0;
+	struct mem_cgroup *memcg;
 	int error = 0;
 
 	/*
@@ -793,26 +854,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
 
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, &page);
+			error = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
-		if (found)
+		if (error != -EAGAIN)
 			break;
+		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (found < 0)
-		error = found;
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
 out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -916,7 +983,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	mutex_unlock(&shmem_swaplist_mutex);
-	swapcache_free(swap, NULL);
+	swapcache_free(swap);
 redirty:
 	set_page_dirty(page);
 	if (wbc->for_reclaim)
@@ -1089,7 +1156,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 		 */
 		oldpage = newpage;
 	} else {
-		mem_cgroup_replace_page_cache(oldpage, newpage);
+		mem_cgroup_migrate(oldpage, newpage, false);
 		lru_cache_add_anon(newpage);
 		*pagep = newpage;
 	}
@@ -1118,6 +1185,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1193,8 +1261,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 				goto failed;
 		}
 
-		error = mem_cgroup_cache_charge(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
@@ -1210,12 +1277,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 			 * Reset swap.val? No, leave it so "failed" goes back to
 			 * "repeat": reading a hole and writing should succeed.
 			 */
-			if (error)
+			if (error) {
+				mem_cgroup_cancel_charge(page, memcg);
 				delete_from_swap_cache(page);
+			}
 		}
 		if (error)
 			goto failed;
 
+		mem_cgroup_commit_charge(page, memcg, true);
+
 		spin_lock(&info->lock);
 		info->swapped--;
 		shmem_recalc_inode(inode);
@@ -1234,9 +1305,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 						       VM_UFFD_MISSING);
 			return 0;
 		}
-		if (shmem_acct_block(info->flags)) {
+		if (shmem_acct_block(info)) {
 			error = -ENOSPC;
-
 			goto failed;
 		}
 		if (sbinfo->max_blocks) {
@@ -1256,8 +1326,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 
 		SetPageSwapBacked(page);
 		__set_page_locked(page);
-		error = mem_cgroup_cache_charge(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (error)
 			goto decused;
 		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1267,9 +1337,10 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 			radix_tree_preload_end();
 		}
 		if (error) {
-			mem_cgroup_uncharge_cache_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			goto decused;
 		}
+		mem_cgroup_commit_charge(page, memcg, false);
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -1327,7 +1398,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
-	shmem_unacct_blocks(info->flags, 1);
+	shmem_unacct_blocks(info, 1);
 failed:
 	if (swap.val && error != -EINVAL &&
 	    !shmem_confirm_swap(mapping, index, swap))
@@ -1464,19 +1535,25 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 
 	spin_lock(&info->lock);
 	if (lock && !(info->flags & VM_LOCKED)) {
+		if (ub_lockedshm_charge(info, inode->i_size) < 0)
+			goto out_ch;
 		if (!user_shm_lock(inode->i_size, user))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
 		mapping_set_unevictable(file->f_mapping);
 	}
 	if (!lock && (info->flags & VM_LOCKED) && user) {
+		ub_lockedshm_uncharge(info, inode->i_size);
 		user_shm_unlock(inode->i_size, user);
 		info->flags &= ~VM_LOCKED;
 		mapping_clear_unevictable(file->f_mapping);
 	}
-	retval = 0;
+	spin_unlock(&info->lock);
+	return 0;
 
 out_nomem:
+	ub_lockedshm_uncharge(info, inode->i_size);
+out_ch:
 	spin_unlock(&info->lock);
 	return retval;
 }
@@ -1566,6 +1643,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	struct address_space *mapping = inode->i_mapping;
 	gfp_t gfp = mapping_gfp_mask(mapping);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	void *page_kaddr;
 	struct page *page;
@@ -1573,7 +1651,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	int ret;
 
 	ret = -ENOMEM;
-	if (shmem_acct_block(info->flags))
+	if (shmem_acct_block(info))
 		goto out;
 	if (sbinfo->max_blocks) {
 		if (percpu_counter_compare(&sbinfo->used_blocks,
@@ -1597,7 +1675,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			*pagep = page;
 			if (sbinfo->max_blocks)
 				percpu_counter_add(&sbinfo->used_blocks, -1);
-			shmem_unacct_blocks(info->flags, 1);
+			shmem_unacct_blocks(info, 1);
 			/* don't free the page */
 			return -EFAULT;
 		}
@@ -1611,8 +1689,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageSwapBacked(page);
 	__SetPageUptodate(page);
 
-	ret = mem_cgroup_cache_charge(page, dst_mm,
-				      gfp & GFP_RECLAIM_MASK);
+	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg);
 	if (ret)
 		goto out_release;
 
@@ -1624,6 +1701,8 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (ret)
 		goto out_release_uncharge;
 
+	mem_cgroup_commit_charge(page, memcg, false);
+
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
 	if (dst_vma->vm_flags & VM_WRITE)
 		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
@@ -1642,7 +1721,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	spin_unlock(&info->lock);
 
 	inc_mm_counter(dst_mm, mm_counter_file(page));
-	page_add_file_rmap(page);
+	page_add_file_rmap(page, dst_mm);
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
 	/* No need to invalidate - it was non-present before */
@@ -1655,7 +1734,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 out_release_uncharge_unlock:
 	pte_unmap_unlock(dst_pte, ptl);
 out_release_uncharge:
-	mem_cgroup_uncharge_cache_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 out_release:
 	unlock_page(page);
 	put_page(page);
@@ -1663,7 +1742,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 out_unacct_blocks:
-	shmem_unacct_blocks(info->flags, 1);
+	shmem_unacct_blocks(info, 1);
 	goto out;
 }
 
@@ -2363,11 +2442,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		struct page *page;
 
 		/*
-		 * Good, the fallocate(2) manpage permits EINTR: we may have
-		 * been interrupted because we are using up too much memory.
+		 * Although the fallocate(2) manpage permits EINTR, the
+		 * more places that use ERESTARTSYS the better.  If we were
+		 * interrupted because we are using up too much memory, the
+		 * OOM killer sent a fatal signal and we will die anyway.
 		 */
 		if (signal_pending(current))
-			error = -EINTR;
+			error = -ERESTARTSYS;
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
@@ -2984,6 +3065,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 		if (!*this_char)
 			continue;
+		if (!strcmp(this_char, "relatime"))
+			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
@@ -2998,7 +3081,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			size = memparse(value,&rest);
 			if (*rest == '%') {
 				size <<= PAGE_SHIFT;
-				size *= totalram_pages;
+				size *= tmpfs_ram_pages();
 				do_div(size, 100);
 				rest++;
 			}
@@ -3286,12 +3369,14 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!info)
 		return NULL;
+	info->shmi_ub = get_beancounter(get_exec_ub());
 	return &info->vfs_inode;
 }
 
 static void shmem_destroy_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
+	put_beancounter(SHMEM_I(inode)->shmi_ub);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
@@ -3312,7 +3397,7 @@ static int shmem_init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
 				sizeof(struct shmem_inode_info),
-				0, SLAB_PANIC, shmem_init_inode);
+				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
 	return 0;
 }
 
@@ -3416,7 +3501,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -3430,7 +3514,7 @@ static struct file_system_type shmem_fs_type = {
 	.name		= "tmpfs",
 	.mount		= shmem_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int __init shmem_init(void)
@@ -3526,7 +3610,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 #define shmem_vm_ops				generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
 #define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
-#define shmem_acct_size(flags, size)		0
+#define shmem_acct_size(flags, size, ub)	0
-#define shmem_unacct_size(flags, size)		do {} while (0)
+#define shmem_unacct_size(flags, size, ub)	do {} while (0)
 
 #endif /* CONFIG_SHMEM */
@@ -3552,7 +3636,7 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	if (size < 0 || size > MAX_LFS_FILESIZE)
 		return ERR_PTR(-EINVAL);
 
-	if (shmem_acct_size(flags, size))
+	if (shmem_acct_size(flags, size, get_exec_ub()))
 		return ERR_PTR(-ENOMEM);
 
 	res = ERR_PTR(-ENOMEM);
@@ -3560,16 +3644,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	this.len = strlen(name);
 	this.hash = 0; /* will go */
 	sb = shm_mnt->mnt_sb;
+	path.mnt = mntget(shm_mnt);
 	path.dentry = d_alloc_pseudo(sb, &this);
 	if (!path.dentry)
 		goto put_memory;
 	d_set_d_op(path.dentry, &anon_ops);
-	path.mnt = mntget(shm_mnt);
 
 	res = ERR_PTR(-ENOSPC);
 	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
 	if (!inode)
-		goto put_dentry;
+		goto put_memory;
 
 	inode->i_flags |= i_flags;
 	d_instantiate(path.dentry, inode);
@@ -3586,10 +3670,10 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 
 	return res;
 
+put_memory:
+	shmem_unacct_size(flags, size, get_exec_ub());
 put_dentry:
 	path_put(&path);
-put_memory:
-	shmem_unacct_size(flags, size);
 	return res;
 }
 
@@ -3635,6 +3719,9 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 
 	if (vma->vm_file)
 		fput(vma->vm_file);
+	else if (vma->vm_flags & VM_WRITE)
+		uncharge_beancounter_fast(mm_ub(vma->vm_mm), UB_PRIVVMPAGES,
+					  size >> PAGE_SHIFT);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	return 0;
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -116,6 +116,7 @@
 #include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
 #include	<linux/prefetch.h>
+#include	<linux/vzstat.h>
 
 #include	<net/sock.h>
 
@@ -1748,8 +1749,12 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		flags |= __GFP_RECLAIMABLE;
 
+	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+		return NULL;
+
 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
 	if (!page) {
+		memcg_uncharge_slab(cachep, cachep->gfporder);
 		if (!(flags & __GFP_NOWARN) && printk_ratelimit())
 			slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
@@ -1772,7 +1777,6 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		if (page_is_pfmemalloc(page))
 			SetPageSlabPfmemalloc(page + i);
 	}
-	memcg_bind_pages(cachep, cachep->gfporder);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
 		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1810,10 +1814,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 		page++;
 	}
 
-	memcg_release_pages(cachep, cachep->gfporder);
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += nr_freed;
-	free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
+	free_pages((unsigned long)addr, cachep->gfporder);
+	memcg_uncharge_slab(cachep, cachep->gfporder);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
@@ -2094,7 +2098,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			offslab_limit = size - sizeof(struct slab);
 			offslab_limit /= sizeof(kmem_bufctl_t);
 
- 			if (num > offslab_limit)
+			if (num > offslab_limit)
 				break;
 		}
 
@@ -2312,6 +2316,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 #endif
 #endif
 
+	kasan_cache_create(cachep, &size, &flags);
+
 	/*
 	 * Determine if the slab management is 'on' or 'off' slab.
 	 * (bootstrapping cannot cope with offslab caches so don't do
@@ -2387,7 +2393,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
 	err = setup_cpu_cache(cachep, gfp);
 	if (err) {
-		__kmem_cache_shutdown(cachep);
+		__kmem_cache_release(cachep);
 		return err;
 	}
 
@@ -2518,8 +2524,7 @@ static int drain_freelist(struct kmem_cache *cache,
 	return nr_freed;
 }
 
-/* Called with slab_mutex held to protect against cpu hotplug */
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 {
 	int ret = 0, i = 0;
 	struct kmem_cache_node *n;
@@ -2540,35 +2545,15 @@ static int __cache_shrink(struct kmem_cache *cachep)
 	return (ret ? 1 : 0);
 }
 
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	int ret;
-	BUG_ON(!cachep || in_interrupt());
-
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
-	ret = __cache_shrink(cachep);
-	mutex_unlock(&slab_mutex);
-	put_online_cpus();
-	return ret;
+	return __kmem_cache_shrink(cachep, false);
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
 
-int __kmem_cache_shutdown(struct kmem_cache *cachep)
+void __kmem_cache_release(struct kmem_cache *cachep)
 {
 	int i;
 	struct kmem_cache_node *n;
-	int rc = __cache_shrink(cachep);
-
-	if (rc)
-		return rc;
 
 	for_each_online_cpu(i)
 	    kfree(cachep->array[i]);
@@ -2582,7 +2567,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
 			kfree(n);
 		}
 	}
-	return 0;
 }
 
 /*
@@ -2640,6 +2624,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 	for (i = 0; i < cachep->num; i++) {
 		void *objp = index_to_obj(cachep, slabp, i);
+		kasan_init_slab_obj(cachep, objp);
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -2656,8 +2641,13 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		 * cache which they are a constructor for.  Otherwise, deadlock.
 		 * They must also be threaded.
 		 */
-		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+			kasan_unpoison_object_data(cachep,
+						   objp + obj_offset(cachep));
 			cachep->ctor(objp + obj_offset(cachep));
+			kasan_poison_object_data(
+				cachep, objp + obj_offset(cachep));
+		}
 
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2672,8 +2662,11 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->size / PAGE_SIZE, 0);
 #else
-		if (cachep->ctor)
+		if (cachep->ctor) {
+			kasan_unpoison_object_data(cachep, objp);
 			cachep->ctor(objp);
+			kasan_poison_object_data(cachep, objp);
+		}
 #endif
 		slab_bufctl(slabp)[i] = i + 1;
 	}
@@ -2812,6 +2805,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
 	slab_map_pages(cachep, slabp, objp);
 
+	kasan_poison_slab(slabp);
 	cache_init_objs(cachep, slabp);
 
 	if (local_flags & __GFP_WAIT)
@@ -3361,6 +3355,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
@@ -3404,6 +3400,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely((flags & __GFP_ZERO) && ptr))
 		memset(ptr, 0, cachep->object_size);
 
+	memcg_kmem_put_cache(cachep);
 	return ptr;
 }
 
@@ -3449,6 +3446,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
@@ -3470,6 +3469,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely((flags & __GFP_ZERO) && objp))
 		memset(objp, 0, cachep->object_size);
 
+	memcg_kmem_put_cache(cachep);
 	return objp;
 }
 
@@ -3580,6 +3580,16 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
  */
 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 				unsigned long caller)
+{
+	/* Put the object into the quarantine, don't touch it for now. */
+	if (kasan_slab_free(cachep, objp))
+		return;
+
+	___cache_free(cachep, objp, caller);
+}
+
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+		unsigned long caller)
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
@@ -3621,6 +3631,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
 
@@ -3649,6 +3660,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 
 	ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(_RET_IP_, ret,
 		      size, cachep->size, flags);
 	return ret;
@@ -3661,6 +3673,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
 				    flags, nodeid);
@@ -3679,6 +3692,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 
 	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
 			   flags, nodeid);
@@ -3691,11 +3705,15 @@ static __always_inline void *
 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 {
 	struct kmem_cache *cachep;
+	void *ret;
 
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	kasan_kmalloc(cachep, ret, size, flags);
+
+	return ret;
 }
 
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -3742,6 +3760,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 		return cachep;
 	ret = slab_alloc(cachep, flags, caller);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(caller, ret,
 		      size, cachep->size, flags);
 
@@ -3978,8 +3997,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared, gfp_t gfp)
 {
 	int ret;
-	struct kmem_cache *c = NULL;
-	int i = 0;
+	struct kmem_cache *c;
 
 	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
 
@@ -3989,12 +4007,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	if ((ret < 0) || !is_root_cache(cachep))
 		return ret;
 
-	VM_BUG_ON(!mutex_is_locked(&slab_mutex));
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg(cachep, i);
-		if (c)
-			/* return value determined by the parent cache only */
-			__do_tune_cpucache(c, limit, batchcount, shared, gfp);
+	lockdep_assert_held(&slab_mutex);
+	for_each_memcg_cache(c, cachep) {
+		/* return value determined by the root cache only */
+		__do_tune_cpucache(c, limit, batchcount, shared, gfp);
 	}
 
 	return ret;
@@ -4119,6 +4135,7 @@ static void cache_reap(struct work_struct *w)
 		/* Give up. Setup the next iteration. */
 		goto out;
 
+	KSTAT_PERF_ENTER(cache_reap)
 	list_for_each_entry(searchp, &slab_caches, list) {
 		check_irq_on();
 
@@ -4159,11 +4176,80 @@ static void cache_reap(struct work_struct *w)
 	check_irq_on();
 	mutex_unlock(&slab_mutex);
 	next_reap_node();
+	KSTAT_PERF_LEAVE(cache_reap);
 out:
 	/* Set up the next iteration */
 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
 }
 
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_cache_node *n;
+	struct list_head *lh;
+	int node;
+
+	slabs = 0;
+
+	for_each_online_node(node) {
+		n = cachep->node[node];
+		if (n == NULL)
+			continue;
+
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each(lh, &n->slabs_full)
+			slabs++;
+		list_for_each(lh, &n->slabs_partial)
+			slabs++;
+		list_for_each(lh, &n->slabs_free)
+			slabs++;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << cachep->gfporder) +
+		(OFF_SLAB(cachep) ?
+		 cachep->slabp_cache->size * slabs : 0);
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+	/* spin_lock(&cache_chain_lock); */
+	list_for_each_entry (ptr, &slab_caches, list) {
+		size = get_cache_size(ptr);
+
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++)
+			if (sizes[i] < sizes[j])
+				j = i;
+
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++)
+		if (top[i])
+			printk("%-21s: size %10lu objsize %10u\n",
+					top[i]->name, sizes[i],
+					top[i]->size);
+	/* spin_unlock(&cache_chain_lock); */
+}
+
 #ifdef CONFIG_SLABINFO
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
@@ -4319,12 +4405,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 
-static void *leaks_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&slab_mutex);
-	return seq_list_start(&slab_caches, *pos);
-}
-
 static inline int add_caller(unsigned long *n, unsigned long v)
 {
 	unsigned long *p;
@@ -4445,20 +4525,10 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
-	mutex_unlock(&slab_mutex);
-}
-
 static const struct seq_operations slabstats_op = {
-	.start = leaks_start,
-	.next = s_next,
-	.stop = s_stop,
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
 	.show = leaks_show,
 };
 
@@ -4511,10 +4581,18 @@ module_init(slab_proc_init);
  */
 size_t ksize(const void *objp)
 {
+	size_t size;
+
 	BUG_ON(!objp);
 	if (unlikely(objp == ZERO_SIZE_PTR))
 		return 0;
 
-	return virt_to_cache(objp)->object_size;
+	size = virt_to_cache(objp)->object_size;
+	/* We assume that ksize callers could use the whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_unpoison_shadow(objp, size);
+
+	return size;
 }
 EXPORT_SYMBOL(ksize);
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
  * Internal slab definitions
  */
 
+#ifdef CONFIG_SLOB
+/*
+ * Common fields provided in kmem_cache by all slab allocators.
+ * This struct is either used directly by the allocator (SLOB)
+ * or the allocator must include definitions for all fields
+ * provided in kmem_cache_common in their definition of kmem_cache.
+ *
+ * Once we can do anonymous structs (C11 standard) we could put an
+ * anonymous struct definition in these allocators so that the
+ * separate allocations in the kmem_cache structure of SLAB and
+ * SLUB are no longer needed.
+ */
+struct kmem_cache {
+	unsigned int object_size;/* The original size of the object */
+	unsigned int size;	/* The aligned/padded/added on size  */
+	unsigned int align;	/* Alignment as calculated */
+	unsigned long flags;	/* Active flags on the slab */
+	const char *name;	/* Slab name for sysfs */
+	int refcount;		/* Use counter */
+	void (*ctor)(void *);	/* Called on object slot creation */
+	struct list_head list;	/* List of all slab caches on the system */
+};
+
+#endif /* CONFIG_SLOB */
+
+#ifdef CONFIG_SLAB
+#include <linux/slab_def.h>
+#endif
+
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#endif
+
+#include <linux/memcontrol.h>
+
 /*
  * State of the slab allocator.
  *
@@ -52,15 +87,14 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
 extern void create_boot_cache(struct kmem_cache *, const char *name,
 			size_t size, unsigned long flags);
 
-struct mem_cgroup;
 #ifdef CONFIG_SLUB
 struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *));
 #else
 static inline struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *))
 { return NULL; }
 #endif
 
@@ -80,10 +114,11 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
-			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+			  SLAB_NOTRACK | SLAB_ACCOUNT)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_NOTRACK)
+			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -91,6 +126,9 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
+void slab_kmem_cache_release(struct kmem_cache *);
 
 struct seq_file;
 struct file;
@@ -123,38 +161,27 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
 #ifdef CONFIG_MEMCG_KMEM
-static inline bool is_root_cache(struct kmem_cache *s)
-{
-	return !s->memcg_params || s->memcg_params->is_root_cache;
-}
-
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
-				     struct mem_cgroup *memcg)
-{
-	return (is_root_cache(cachep) && !memcg) ||
-				(cachep->memcg_params->memcg == memcg);
-}
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+	list_for_each_entry(iter, &(root)->memcg_params.list, \
+			    memcg_params.list)
 
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
-{
-	if (!is_root_cache(s))
-		atomic_add(1 << order, &s->memcg_params->nr_pages);
-}
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+				 memcg_params.list)
 
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
+static inline bool is_root_cache(struct kmem_cache *s)
 {
-	if (is_root_cache(s))
-		return;
-
-	if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
-		mem_cgroup_destroy_cache(s);
+	return s->memcg_params.is_root_cache;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
-					struct kmem_cache *p)
+				      struct kmem_cache *p)
 {
-	return (p == s) ||
-		(s->memcg_params && (p == s->memcg_params->root_cache));
+	return p == s || p == s->memcg_params.root_cache;
 }
 
 /*
@@ -165,41 +192,76 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
 static inline const char *cache_name(struct kmem_cache *s)
 {
 	if (!is_root_cache(s))
-		return s->memcg_params->root_cache->name;
+		s = s->memcg_params.root_cache;
 	return s->name;
 }
 
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said, the caller must ensure the memcg's cache won't go away by either
+ * taking a css reference to the owner cgroup, or holding the slab_mutex.
+ */
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
-	if (!s->memcg_params)
-		return NULL;
-	return s->memcg_params->memcg_caches[idx];
+	struct kmem_cache *cachep;
+	struct memcg_cache_array *arr;
+
+	rcu_read_lock();
+	arr = rcu_dereference(s->memcg_params.memcg_caches);
+	cachep = arr->entries[idx];
+	rcu_read_unlock();
+
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match this (see
+	 * memcg_create_kmem_cache()).
+	 */
+	smp_read_barrier_depends();
+	return cachep;
 }
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		return s;
-	return s->memcg_params->root_cache;
-}
-#else
-static inline bool is_root_cache(struct kmem_cache *s)
-{
-	return true;
+	return s->memcg_params.root_cache;
 }
 
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
-				     struct mem_cgroup *memcg)
+extern int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages);
+extern void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages);
+
+static __always_inline int memcg_charge_slab(struct kmem_cache *s,
+					     gfp_t gfp, int order)
 {
-	return true;
+	if (!memcg_kmem_enabled())
+		return 0;
+	if (is_root_cache(s))
+		return 0;
+	return __memcg_charge_slab(s, gfp, 1 << order);
 }
 
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 {
+	if (!memcg_kmem_enabled())
+		return;
+	if (is_root_cache(s))
+		return;
+	__memcg_uncharge_slab(s, 1 << order);
 }
 
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
+#define for_each_memcg_cache(iter, root) \
+	for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
+static inline bool is_root_cache(struct kmem_cache *s)
 {
+	return true;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -213,7 +275,8 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
 	return NULL;
 }
@@ -222,7 +285,20 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;
 }
-#endif
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+	return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 {
@@ -282,3 +358,9 @@ struct kmem_cache_node {
 #endif
 
 };
+
+void *slab_start(struct seq_file *m, loff_t *pos);
+void *slab_next(struct seq_file *m, void *p, loff_t *pos);
+void slab_stop(struct seq_file *m, void *p);
+int memcg_slab_show(struct mem_cgroup *memcg, struct seq_file *m, void *p);
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -20,6 +20,9 @@
 #include <asm/page.h>
 #include <linux/memcontrol.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
 #include "slab.h"
 
 enum slab_state slab_state;
@@ -27,9 +30,17 @@ LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
 struct kmem_cache *kmem_cache;
 
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+	return s->object_size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
 #ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
-				   size_t size)
+static int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	struct kmem_cache *s = NULL;
 
@@ -55,13 +66,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
 			continue;
 		}
 
-		/*
-		 * For simplicity, we won't check this in the list of memcg
-		 * caches. We have control over memcg naming, and if there
-		 * aren't duplicates in the global list, there won't be any
-		 * duplicates in the memcg lists as well.
-		 */
-		if (!memcg && !strcmp(s->name, name)) {
+		if (!strcmp(s->name, name)) {
 			pr_err("%s (%s): Cache name already exists.\n",
 			       __func__, name);
 			dump_stack();
@@ -74,8 +79,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
 	return 0;
 }
 #else
-static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
-					  const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	return 0;
 }
@@ -105,32 +109,107 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+void slab_init_memcg_params(struct kmem_cache *s)
+{
+	s->memcg_params.is_root_cache = true;
+	INIT_LIST_HEAD(&s->memcg_params.list);
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct memcg_cache_array *arr;
+
+	if (memcg) {
+		s->memcg_params.is_root_cache = false;
+		s->memcg_params.memcg = memcg;
+		s->memcg_params.root_cache = root_cache;
+		return 0;
+	}
+
+	slab_init_memcg_params(s);
+
+	if (!memcg_nr_cache_ids)
+		return 0;
+
+	arr = kvzalloc(sizeof(struct memcg_cache_array) +
+		       memcg_nr_cache_ids * sizeof(void *),
+		       GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
+
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
+	return 0;
+}
+
+static void destroy_memcg_params(struct kmem_cache *s)
+{
+	if (is_root_cache(s))
+		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static void free_memcg_params(struct rcu_head *rcu)
+{
+	struct memcg_cache_array *old;
+
+	old = container_of(rcu, struct memcg_cache_array, rcu);
+	kvfree(old);
+}
+
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
+{
+	struct memcg_cache_array *old, *new;
+
+	if (!is_root_cache(s))
+		return 0;
+
+	new = kvzalloc(sizeof(struct memcg_cache_array) +
+		      new_array_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+	if (old)
+		memcpy(new->entries, old->entries,
+		       memcg_nr_cache_ids * sizeof(void *));
+
+	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+	if (old)
+		call_rcu(&old->rcu, free_memcg_params);
+	return 0;
+}
+
 int memcg_update_all_caches(int num_memcgs)
 {
 	struct kmem_cache *s;
 	int ret = 0;
-	mutex_lock(&slab_mutex);
 
+	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
-		ret = memcg_update_cache_size(s, num_memcgs);
+		ret = update_memcg_params(s, num_memcgs);
 		/*
-		 * See comment in memcontrol.c, memcg_update_cache_size:
 		 * Instead of freeing the memory, we'll just leave the caches
 		 * up to this point in an updated state.
 		 */
 		if (ret)
-			goto out;
+			break;
 	}
-
-	memcg_update_array_size(num_memcgs);
-out:
 	mutex_unlock(&slab_mutex);
 	return ret;
 }
-#endif
+#else
+static inline int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	return 0;
+}
+
+static inline void destroy_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 /*
  * Figure out what the alignment of the objects will be given a set of
@@ -159,6 +238,45 @@ unsigned long calculate_alignment(unsigned long flags,
 	return ALIGN(align, sizeof(void *));
 }
 
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+		     unsigned long flags, void (*ctor)(void *),
+		     struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct kmem_cache *s;
+	int err;
+
+	err = -ENOMEM;
+	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+	if (!s)
+		goto out;
+
+	s->name = name;
+	s->object_size = object_size;
+	s->size = size;
+	s->align = align;
+	s->ctor = ctor;
+
+	err = init_memcg_params(s, memcg, root_cache);
+	if (err)
+		goto out_free_cache;
+
+	err = __kmem_cache_create(s, flags);
+	if (err)
+		goto out_free_cache;
+
+	s->refcount = 1;
+	list_add(&s->list, &slab_caches);
+out:
+	if (err)
+		return ERR_PTR(err);
+	return s;
+
+out_free_cache:
+	destroy_memcg_params(s);
+	kfree(s);
+	goto out;
+}
 
 /*
  * kmem_cache_create - Create a cache.
@@ -184,20 +302,21 @@ unsigned long calculate_alignment(unsigned long flags,
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
  */
-
 struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
-			size_t align, unsigned long flags, void (*ctor)(void *),
-			struct kmem_cache *parent_cache)
+kmem_cache_create(const char *name, size_t size, size_t align,
+		  unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s = NULL;
-	int err = 0;
+	struct kmem_cache *s;
+	char *cache_name;
+	int err;
 
 	get_online_cpus();
+	memcg_get_cache_ids();
 	mutex_lock(&slab_mutex);
 
-	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-		goto out_locked;
+	err = kmem_cache_sanity_check(name, size);
+	if (err)
+		goto out_unlock;
 
 	/*
 	 * Some allocators will constraint the set of valid flags to a subset
@@ -207,47 +326,30 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 	 */
 	flags &= CACHE_CREATE_MASK;
 
-	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+	s = __kmem_cache_alias(name, size, align, flags, ctor);
 	if (s)
-		goto out_locked;
-
-	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-	if (s) {
-		s->object_size = s->size = size;
-		s->align = calculate_alignment(flags, align, size);
-		s->ctor = ctor;
-
-		if (memcg_register_cache(memcg, s, parent_cache)) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
+		goto out_unlock;
 
-		s->name = kstrdup(name, GFP_KERNEL);
-		if (!s->name) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
-
-		err = __kmem_cache_create(s, flags);
-		if (!err) {
-			s->refcount = 1;
-			list_add(&s->list, &slab_caches);
-			memcg_cache_list_add(memcg, s);
-		} else {
-			kfree(s->name);
-			kmem_cache_free(kmem_cache, s);
-		}
-	} else
+	cache_name = kstrdup(name, GFP_KERNEL);
+	if (!cache_name) {
 		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	s = do_kmem_cache_create(cache_name, size, size,
+				 calculate_alignment(flags, align, size),
+				 flags, ctor, NULL, NULL);
+	if (IS_ERR(s)) {
+		err = PTR_ERR(s);
+		kfree(cache_name);
+	}
 
-out_locked:
+out_unlock:
 	mutex_unlock(&slab_mutex);
+	memcg_put_cache_ids();
 	put_online_cpus();
 
 	if (err) {
-
 		if (flags & SLAB_PANIC)
 			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
 				name, err);
@@ -256,57 +358,248 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 				name, err);
 			dump_stack();
 		}
-
 		return NULL;
 	}
-
 	return s;
 }
+EXPORT_SYMBOL(kmem_cache_create);
 
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
-		  unsigned long flags, void (*ctor)(void *))
+static int do_kmem_cache_shutdown(struct kmem_cache *s,
+		struct list_head *release, bool *need_rcu_barrier)
 {
-	return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+
+	/* free asan quarantined objects */
+	kasan_cache_shutdown(s);
+
+	if (__kmem_cache_shutdown(s) != 0) {
+		printk(KERN_ERR "kmem_cache_destroy %s: "
+		       "Slab cache still has objects\n", s->name);
+		dump_stack();
+		return -EBUSY;
+	}
+
+	if (s->flags & SLAB_DESTROY_BY_RCU)
+		*need_rcu_barrier = true;
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s))
+		list_del(&s->memcg_params.list);
+#endif
+	list_move(&s->list, release);
+	return 0;
+}
+
+static void do_kmem_cache_release(struct list_head *release,
+				  bool need_rcu_barrier)
+{
+	struct kmem_cache *s, *s2;
+
+	if (need_rcu_barrier)
+		rcu_barrier();
+
+	list_for_each_entry_safe(s, s2, release, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+		sysfs_slab_remove(s);
+#else
+		slab_kmem_cache_release(s);
+#endif
+	}
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * memcg_create_kmem_cache - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ *
+ * This function attempts to create a kmem cache that will serve allocation
+ * requests going from @memcg to @root_cache. The new cache inherits properties
+ * from its parent.
+ */
+void memcg_create_kmem_cache(struct mem_cgroup *memcg,
+			     struct kmem_cache *root_cache)
+{
+	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
+	struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+	struct memcg_cache_array *arr;
+	struct kmem_cache *s = NULL;
+	char *cache_name;
+	int idx;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+
+	/*
+	 * The memory cgroup could have been deactivated while the cache
+	 * creation work was pending.
+	 */
+	if (!memcg_kmem_is_active(memcg))
+		goto out_unlock;
+
+	idx = memcg_cache_id(memcg);
+	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+
+	/*
+	 * Since per-memcg caches are created asynchronously on first
+	 * allocation (see memcg_kmem_get_cache()), several threads can try to
+	 * create the same cache, but only one of them may succeed.
+	 */
+	if (arr->entries[idx])
+		goto out_unlock;
+
+	rcu_read_lock();
+	strlcpy(memcg_name_buf, cgroup_name(css->cgroup), NAME_MAX + 1);
+	rcu_read_unlock();
+	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+			       css_id(css), memcg_name_buf);
+	if (!cache_name)
+		goto out_unlock;
+
+	s = do_kmem_cache_create(cache_name, root_cache->object_size,
+				 root_cache->size, root_cache->align,
+				 root_cache->flags, root_cache->ctor,
+				 memcg, root_cache);
+	/*
+	 * If we could not create a memcg cache, do not complain, because
+	 * that's not critical at all as we can always proceed with the root
+	 * cache.
+	 */
+	if (IS_ERR(s)) {
+		kfree(cache_name);
+		goto out_unlock;
+	}
+
+	list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
+	/*
+	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * barrier here to ensure nobody will see the kmem_cache partially
+	 * initialized.
+	 */
+	smp_wmb();
+	arr->entries[idx] = s;
+
+out_unlock:
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+}
+
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+{
+	int idx;
+	struct memcg_cache_array *arr;
+	struct kmem_cache *s, *c;
+
+	idx = memcg_cache_id(memcg);
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+						lockdep_is_held(&slab_mutex));
+		c = arr->entries[idx];
+		if (!c)
+			continue;
+
+		__kmem_cache_shrink(c, true);
+		arr->entries[idx] = NULL;
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+}
+
+void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
+{
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	struct kmem_cache *s, *s2;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	list_for_each_entry_safe(s, s2, &slab_caches, list) {
+		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
+			continue;
+		/*
+		 * The cgroup is about to be freed and therefore has no charges
+		 * left. Hence, all its caches must be empty by now.
+		 */
+		BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+void slab_kmem_cache_release(struct kmem_cache *s)
+{
+	__kmem_cache_release(s);
+	destroy_memcg_params(s);
+	kfree(s->name);
+	kmem_cache_free(kmem_cache, s);
 }
-EXPORT_SYMBOL(kmem_cache_create);
 
 void kmem_cache_destroy(struct kmem_cache *s)
 {
+	struct kmem_cache *c, *c2;
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	bool busy = false;
+
 	if (unlikely(!s))
 		return;
 
-	/* Destroy all the children caches if we aren't a memcg cache */
-	kmem_cache_destroy_memcg_children(s);
+	BUG_ON(!is_root_cache(s));
 
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
+
 	s->refcount--;
-	if (!s->refcount) {
-		list_del(&s->list);
-
-		if (!__kmem_cache_shutdown(s)) {
-			mutex_unlock(&slab_mutex);
-			if (s->flags & SLAB_DESTROY_BY_RCU)
-				rcu_barrier();
-
-			memcg_release_cache(s);
-			kfree(s->name);
-			kmem_cache_free(kmem_cache, s);
-		} else {
-			list_add(&s->list, &slab_caches);
-			mutex_unlock(&slab_mutex);
-			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
-				s->name);
-			dump_stack();
-		}
-	} else {
-		mutex_unlock(&slab_mutex);
+	if (s->refcount)
+		goto out_unlock;
+
+	for_each_memcg_cache_safe(c, c2, s) {
+		if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+			busy = true;
 	}
+
+	if (!busy)
+		do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
+
+out_unlock:
+	mutex_unlock(&slab_mutex);
 	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+	int ret;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	kasan_cache_shrink(cachep);
+	ret = __kmem_cache_shrink(cachep, false);
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
 int slab_is_available(void)
 {
 	return slab_state >= UP;
@@ -322,6 +615,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
 	s->name = name;
 	s->size = s->object_size = size;
 	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+	slab_init_memcg_params(s);
+
 	err = __kmem_cache_create(s, flags);
 
 	if (err)
@@ -537,9 +833,32 @@ void __init create_kmalloc_caches(unsigned long flags)
 }
 #endif /* !CONFIG_SLOB */
 
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret;
+	struct page *page;
+
+	flags |= __GFP_COMP;
+	page = alloc_pages(flags, order);
+	ret = page ? page_address(page) : NULL;
+	kmemleak_alloc(ret, size, 1, flags);
+	kasan_kmalloc_large(ret, size, flags);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
+#ifdef CONFIG_TRACING
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret = kmalloc_order(size, flags, order);
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
+#endif
 
 #ifdef CONFIG_SLABINFO
-void print_slabinfo_header(struct seq_file *m)
+static void print_slabinfo_header(struct seq_file *m)
 {
 	/*
 	 * Output format version, so at least we can change it
@@ -562,23 +881,18 @@ void print_slabinfo_header(struct seq_file *m)
 	seq_putc(m, '\n');
 }
 
-static void *s_start(struct seq_file *m, loff_t *pos)
+void *slab_start(struct seq_file *m, loff_t *pos)
 {
-	loff_t n = *pos;
-
 	mutex_lock(&slab_mutex);
-	if (!n)
-		print_slabinfo_header(m);
-
 	return seq_list_start(&slab_caches, *pos);
 }
 
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+void *slab_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	return seq_list_next(p, &slab_caches, pos);
 }
 
-static void s_stop(struct seq_file *m, void *p)
+void slab_stop(struct seq_file *m, void *p)
 {
 	mutex_unlock(&slab_mutex);
 }
@@ -588,16 +902,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
 {
 	struct kmem_cache *c;
 	struct slabinfo sinfo;
-	int i;
 
 	if (!is_root_cache(s))
 		return;
 
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg(s, i);
-		if (!c)
-			continue;
-
+	for_each_memcg_cache(c, s) {
 		memset(&sinfo, 0, sizeof(sinfo));
 		get_slabinfo(c, &sinfo);
 
@@ -609,7 +918,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
 	}
 }
 
-int cache_show(struct kmem_cache *s, struct seq_file *m)
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
 {
 	struct slabinfo sinfo;
 
@@ -628,17 +937,31 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
 		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
 	slabinfo_show_stats(m, s);
 	seq_putc(m, '\n');
+}
+
+static int slab_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (is_root_cache(s))
+		cache_show(s, m);
 	return 0;
 }
 
-static int s_show(struct seq_file *m, void *p)
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_slab_show(struct mem_cgroup *memcg, struct seq_file *m, void *p)
 {
 	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
 
-	if (!is_root_cache(s))
-		return 0;
-	return cache_show(s, m);
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
+		cache_show(s, m);
+	return 0;
 }
+#endif
 
 /*
  * slabinfo_op - iterator that generates /proc/slabinfo
@@ -654,10 +977,10 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 static const struct seq_operations slabinfo_op = {
-	.start = s_start,
-	.next = s_next,
-	.stop = s_stop,
-	.show = s_show,
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
+	.show = slab_show,
 };
 
 static int slabinfo_open(struct inode *inode, struct file *file)
@@ -680,3 +1003,104 @@ static int __init slab_proc_init(void)
 }
 module_init(slab_proc_init);
 #endif /* CONFIG_SLABINFO */
+
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+					   gfp_t flags)
+{
+	void *ret;
+	size_t ks = 0;
+
+	if (p)
+		ks = ksize(p);
+
+	if (ks >= new_size) {
+		kasan_krealloc((void *)p, new_size, flags);
+		return (void *)p;
+	}
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p)
+		memcpy(ret, p, ks);
+
+	return ret;
+}
+
+/**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately,
+ * for example, with RCU.
+ */
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	if (unlikely(!new_size))
+		return ZERO_SIZE_PTR;
+
+	return __do_krealloc(p, new_size, flags);
+
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.  If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *ret;
+
+	if (unlikely(!new_size)) {
+		kfree(p);
+		return ZERO_SIZE_PTR;
+	}
+
+	ret = __do_krealloc(p, new_size, flags);
+	if (ret && p != ret)
+		kfree(p);
+
+	return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kzfree() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer, which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance sensitive code.
+ */
+void kzfree(const void *p)
+{
+	size_t ks;
+	void *mem = (void *)p;
+
+	if (unlikely(ZERO_OR_NULL_PTR(mem)))
+		return;
+	ks = ksize(mem);
+	memset(mem, 0, ks);
+	kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
+
+/* Tracepoints definitions. */
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -462,11 +462,11 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 	return ret;
 }
 
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+void *__kmalloc(size_t size, gfp_t gfp)
 {
-	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
 }
-EXPORT_SYMBOL(__kmalloc_node);
+EXPORT_SYMBOL(__kmalloc);
 
 #ifdef CONFIG_TRACING
 void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
@@ -534,7 +534,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
 	return 0;
 }
 
-void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
@@ -560,7 +560,27 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
 	return b;
 }
+EXPORT_SYMBOL(slob_alloc_node);
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
+{
+	return slob_alloc_node(cachep, gfp, node);
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
 
 static void __kmem_cache_free(void *b, int size)
 {
@@ -613,11 +633,14 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
 	return 0;
 }
 
-int kmem_cache_shrink(struct kmem_cache *d)
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
 	return 0;
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
 
 struct kmem_cache kmem_cache_boot = {
 	.name = "kmem_cache",
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -165,10 +166,10 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
  */
 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB)
+		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
-		SLAB_CACHE_DMA | SLAB_NOTRACK)
+		SLAB_CACHE_DMA | SLAB_NOTRACK | SLAB_ACCOUNT)
 
 #define OO_SHIFT	16
 #define OO_MASK		((1 << OO_SHIFT) - 1)
@@ -201,14 +202,11 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
 #ifdef CONFIG_SYSFS
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void sysfs_slab_remove(struct kmem_cache *);
 static void memcg_propagate_slab_attrs(struct kmem_cache *s);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
-
 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
 #endif
 
@@ -295,6 +293,9 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 		return s->object_size;
 
 #endif
+	if (s->flags & SLAB_KASAN)
+		return s->object_size;
+
 	/*
 	 * If we have the need to store the freelist pointer
 	 * back there or track user information then we can
@@ -466,13 +467,31 @@ static int slub_debug;
 static char *slub_debug_slabs;
 static int disable_higher_order_debug;
 
+/*
+ * slub is about to manipulate internal object metadata.  This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error.  metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+	kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+	kasan_enable_current();
+}
+
 /*
  * Object debugging
  */
 static void print_section(char *text, u8 *addr, unsigned int length)
 {
+	metadata_access_enable();
 	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
 			length, 1);
+	metadata_access_disable();
 }
 
 static struct track *get_track(struct kmem_cache *s, void *object,
@@ -502,7 +521,9 @@ static void set_track(struct kmem_cache *s, void *object,
 		trace.max_entries = TRACK_ADDRS_COUNT;
 		trace.entries = p->addrs;
 		trace.skip = 3;
+		metadata_access_enable();
 		save_stack_trace(&trace);
+		metadata_access_disable();
 
 		/* See rant in lockdep.c */
 		if (trace.nr_entries != 0 &&
@@ -621,6 +642,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 	if (s->flags & SLAB_STORE_USER)
 		off += 2 * sizeof(struct track);
 
+	off += kasan_metadata_size(s);
+
 	if (off != s->size)
 		/* Beginning of the filler is the free pointer */
 		print_section("Padding ", p + off, s->size - off);
@@ -628,7 +651,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 	dump_stack();
 }
 
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
 			u8 *object, char *reason)
 {
 	slab_bug(s, "%s", reason);
@@ -675,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 	u8 *fault;
 	u8 *end;
 
+	metadata_access_enable();
 	fault = memchr_inv(start, value, bytes);
+	metadata_access_disable();
 	if (!fault)
 		return 1;
 
@@ -742,6 +767,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 		/* We also have user information there */
 		off += 2 * sizeof(struct track);
 
+	off += kasan_metadata_size(s);
+
 	if (s->size == off)
 		return 1;
 
@@ -768,7 +795,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 	if (!remainder)
 		return 1;
 
+	metadata_access_enable();
 	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+	metadata_access_disable();
 	if (!fault)
 		return 1;
 	while (end > fault && end[-1] == POISON_INUSE)
@@ -930,61 +959,8 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
 	}
 }
 
-/*
- * Hooks for other subsystems that check memory allocations. In a typical
- * production configuration these hooks all should produce no code at all.
- */
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
-{
-	flags &= gfp_allowed_mask;
-	lockdep_trace_alloc(flags);
-	might_sleep_if(flags & __GFP_WAIT);
-
-	return should_failslab(s->object_size, flags, s->flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-					size_t size, void **p)
-{
-	size_t i;
-
-	flags &= gfp_allowed_mask;
-	for (i = 0; i < size; i++) {
-		void *object = p[i];
-
-		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
-		kmemleak_alloc_recursive(object, s->object_size, 1,
-					 s->flags, flags);
-	}
-}
-
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
-{
-	kmemleak_free_recursive(x, s->flags);
-
-	/*
-	 * Trouble is that we may no longer disable interupts in the fast path
-	 * So in order to make the debug calls that expect irqs to be
-	 * disabled we need to disable interrupts temporarily.
-	 */
-#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
-	{
-		unsigned long flags;
-
-		local_irq_save(flags);
-		kmemcheck_slab_free(s, x, s->object_size);
-		debug_check_no_locks_freed(x, s->object_size);
-		local_irq_restore(flags);
-	}
-#endif
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(x, s->object_size);
-}
-
 /*
  * Tracking of fully allocated slabs for debugging purposes.
- *
- * list_lock must be held.
  */
 static void add_full(struct kmem_cache *s,
 	struct kmem_cache_node *n, struct page *page)
@@ -992,17 +968,16 @@ static void add_full(struct kmem_cache *s,
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
+	lockdep_assert_held(&n->list_lock);
 	list_add(&page->lru, &n->full);
 }
 
-/*
- * list_lock must be held.
- */
-static void remove_full(struct kmem_cache *s, struct page *page)
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
 {
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
+	lockdep_assert_held(&n->list_lock);
 	list_del(&page->lru);
 }
 
@@ -1266,7 +1241,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 			void *object, u8 val) { return 1; }
 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
 					struct page *page) {}
-static inline void remove_full(struct kmem_cache *s, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long object_size,
 	unsigned long flags, const char *name,
 	void (*ctor)(void *))
@@ -1285,16 +1261,70 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
 							int objects) {}
 static inline void dec_slabs_node(struct kmem_cache *s, int node,
 							int objects) {}
-
+#endif /* CONFIG_SLUB_DEBUG */
+/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
-							{ return 0; }
+{
+	flags &= gfp_allowed_mask;
+	lockdep_trace_alloc(flags);
+	might_sleep_if(flags & __GFP_WAIT);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
+
+	return should_failslab(s->object_size, flags, s->flags);
+}
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-		void *object) {}
+		size_t size, void **p)
+{
+	size_t i;
 
-static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+	flags &= gfp_allowed_mask;
+	for (i = 0; i < size; i++) {
+		void *object = p[i];
 
-#endif /* CONFIG_SLUB_DEBUG */
+		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+		kmemleak_alloc_recursive(object, s->object_size, 1,
+				s->flags, flags);
+		kasan_slab_alloc(s, object, flags);
+	}
+}
+
+static inline void *slab_free_hook(struct kmem_cache *s, void *x)
+{
+	void *freeptr;
+
+	kmemleak_free_recursive(x, s->flags);
+
+	/*
+	 * Trouble is that we may no longer disable interrupts in the fast path.
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->object_size);
+		debug_check_no_locks_freed(x, s->object_size);
+		local_irq_restore(flags);
+	}
+#endif
+	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(x, s->object_size);
+
+	freeptr = get_freepointer(s, x);
+	/*
+	 * kasan_slab_free() may put x into memory quarantine, delaying its
+	 * reuse. In this case the object's freelist pointer is changed.
+	 */
+	kasan_slab_free(s, x);
+	return freeptr;
+}
 
 static inline void slab_free_freelist_hook(struct kmem_cache *s,
 					   void *head, void *tail)
@@ -1311,28 +1341,37 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
 
 	void *object = head;
 	void *tail_obj = tail ? : head;
+	void *freeptr;
 
 	do {
-		slab_free_hook(s, object);
-	} while ((object != tail_obj) &&
-		 (object = get_freepointer(s, object)));
+		freeptr = slab_free_hook(s, object);
+	} while ((object != tail_obj) && (object = freeptr));
 #endif
 }
 
 /*
  * Slab allocation and freeing
  */
-static inline struct page *alloc_slab_page(gfp_t flags, int node,
-					struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+		gfp_t flags, int node, struct kmem_cache_order_objects oo)
 {
+	struct page *page;
 	int order = oo_order(oo);
 
 	flags |= __GFP_NOTRACK;
 
+	if (memcg_charge_slab(s, flags, order))
+		return NULL;
+
 	if (node == NUMA_NO_NODE)
-		return alloc_pages(flags, order);
+		page = alloc_pages(flags, order);
 	else
-		return alloc_pages_exact_node(node, flags, order);
+		page = alloc_pages_exact_node(node, flags, order);
+
+	if (!page)
+		memcg_uncharge_slab(s, order);
+
+	return page;
 }
 
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1354,14 +1393,14 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	 */
 	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
 
-	page = alloc_slab_page(alloc_gfp, node, oo);
+	page = alloc_slab_page(s, alloc_gfp, node, oo);
 	if (unlikely(!page)) {
 		oo = s->min;
 		/*
 		 * Allocation may have failed due to fragmentation.
 		 * Try a lower order alloc if possible
 		 */
-		page = alloc_slab_page(flags, node, oo);
+		page = alloc_slab_page(s, flags, node, oo);
 
 		if (page)
 			stat(s, ORDER_FALLBACK);
@@ -1401,8 +1440,13 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 				void *object)
 {
 	setup_object_debug(s, page, object);
-	if (unlikely(s->ctor))
+	kasan_init_slab_obj(s, object);
+
+	if (unlikely(s->ctor)) {
+		kasan_unpoison_object_data(s, object);
 		s->ctor(object);
+		kasan_poison_object_data(s, object);
+	}
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1422,7 +1466,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 
 	order = compound_order(page);
 	inc_slabs_node(s, page_to_nid(page), page->objects);
-	memcg_bind_pages(s, order);
 	page->slab_cache = s;
 	__SetPageSlab(page);
 	if (page_is_pfmemalloc(page))
@@ -1433,6 +1476,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (unlikely(s->flags & SLAB_POISON))
 		memset(start, POISON_INUSE, PAGE_SIZE << order);
 
+	kasan_poison_slab(page);
+
 	last = start;
 	for_each_object(p, s, start, page->objects) {
 		setup_object(s, page, last);
@@ -1473,11 +1518,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__ClearPageSlabPfmemalloc(page);
 	__ClearPageSlab(page);
 
-	memcg_release_pages(s, order);
 	page_mapcount_reset(page);
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += pages;
-	__free_memcg_kmem_pages(page, order);
+	__free_pages(page, order);
+	memcg_uncharge_slab(s, order);
 }
 
 #define need_reserve_slab_rcu						\
@@ -1526,11 +1571,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 
 /*
  * Management of partially allocated slabs.
- *
- * list_lock must be held.
  */
-static inline void add_partial(struct kmem_cache_node *n,
-				struct page *page, int tail)
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
 	n->nr_partial++;
 	if (tail == DEACTIVATE_TO_TAIL)
@@ -1539,12 +1582,17 @@ static inline void add_partial(struct kmem_cache_node *n,
 		list_add(&page->lru, &n->partial);
 }
 
-/*
- * list_lock must be held.
- */
+static inline void add_partial(struct kmem_cache_node *n,
+				struct page *page, int tail)
+{
+	lockdep_assert_held(&n->list_lock);
+	__add_partial(n, page, tail);
+}
+
 static inline void remove_partial(struct kmem_cache_node *n,
 					struct page *page)
 {
+	lockdep_assert_held(&n->list_lock);
 	list_del(&page->lru);
 	n->nr_partial--;
 }
@@ -1554,8 +1602,6 @@ static inline void remove_partial(struct kmem_cache_node *n,
  * return the pointer to the freelist.
  *
  * Returns a list of objects or NULL if it fails.
- *
- * Must hold list_lock since we modify the partial list.
  */
 static inline void *acquire_slab(struct kmem_cache *s,
 		struct kmem_cache_node *n, struct page *page,
@@ -1565,6 +1611,8 @@ static inline void *acquire_slab(struct kmem_cache *s,
 	unsigned long counters;
 	struct page new;
 
+	lockdep_assert_held(&n->list_lock);
+
 	/*
 	 * Zap the freelist and set the frozen bit.
 	 * The old freelist is the list of objects for the
@@ -1876,7 +1924,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freel
 
 	new.frozen = 0;
 
-	if (!new.inuse && n->nr_partial > s->min_partial)
+	if (!new.inuse && n->nr_partial >= s->min_partial)
 		m = M_FREE;
 	else if (new.freelist) {
 		m = M_PARTIAL;
@@ -1910,7 +1958,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freel
 
 		else if (l == M_FULL)
 
-			remove_full(s, page);
+			remove_full(s, n, page);
 
 		if (m == M_PARTIAL) {
 
@@ -1986,7 +2034,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 				new.freelist, new.counters,
 				"unfreezing slab"));
 
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
 			page->next = discard_page;
 			discard_page = page;
 		} else {
@@ -2023,6 +2071,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages;
 	int pobjects;
 
+	preempt_disable();
 	do {
 		pages = 0;
 		pobjects = 0;
@@ -2055,6 +2104,15 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		page->next = oldpage;
 
 	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+
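+	/*
+	 * s->cpu_partial == 0 means per-cpu partial caching is disabled
+	 * (see __kmem_cache_shrink() with deactivate set), so flush the
+	 * per-cpu partial list right away instead of letting it grow.
+	 */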
+	if (unlikely(!s->cpu_partial)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+		local_irq_restore(flags);
+	}
+	preempt_enable();
 }
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2480,6 +2538,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 		memset(object, 0, s->object_size);
 
 	slab_post_alloc_hook(s, gfpflags, 1, &object);
+	memcg_kmem_put_cache(s);
 
 	return object;
 }
@@ -2505,17 +2564,10 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
-
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
-{
-	void *ret = kmalloc_order(size, flags, order);
-	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
-	return ret;
-}
-EXPORT_SYMBOL(kmalloc_order_trace);
 #endif
 
 #ifdef CONFIG_NUMA
@@ -2539,6 +2591,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2585,7 +2639,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		new.inuse -= cnt;
 		if ((!new.inuse || !prior) && !was_frozen) {
 
-			if (!kmem_cache_debug(s) && !prior)
+			if (!kmem_cache_debug(s) && !prior) {
 
 				/*
 				 * Slab was on no list before and will be partially empty
@@ -2593,7 +2647,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 				 */
 				new.frozen = 1;
 
-			else { /* Needs to be taken off a list */
+			} else { /* Needs to be taken off a list */
 
 	                        n = get_node(s, page_to_nid(page));
 				/*
@@ -2633,7 +2687,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                 return;
         }
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
 		goto slab_empty;
 
 	/*
@@ -2641,7 +2695,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	 * then add it.
 	 */
 	if (kmem_cache_debug(s) && unlikely(!prior)) {
-		remove_full(s, page);
+		remove_full(s, n, page);
 		add_partial(n, page, DEACTIVATE_TO_TAIL);
 		stat(s, FREE_ADD_PARTIAL);
 	}
@@ -2655,9 +2709,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		 */
 		remove_partial(n, page);
 		stat(s, FREE_REMOVE_PARTIAL);
-	} else
+	} else {
 		/* Slab must be on the full list */
-		remove_full(s, page);
+		remove_full(s, n, page);
+	}
 
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	stat(s, FREE_SLAB);
@@ -2679,16 +2734,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
  * same page) possible by specifying head and tail ptr, plus objects
  * count (cnt). Bulk free indicated by tail pointer being set.
  */
-static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
-				      void *head, void *tail, int cnt,
-				      unsigned long addr)
+static __always_inline void do_slab_free(struct kmem_cache *s,
+				struct page *page, void *head, void *tail,
+				int cnt, unsigned long addr)
 {
 	void *tail_obj = tail ? : head;
 	struct kmem_cache_cpu *c;
 	unsigned long tid;
-
-	slab_free_freelist_hook(s, head, tail);
-
 redo:
 	/*
 	 * Determine the currently cpus per cpu slab.
@@ -2719,6 +2771,27 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
 
 }
 
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+				      void *head, void *tail, int cnt,
+				      unsigned long addr)
+{
+	slab_free_freelist_hook(s, head, tail);
+	/*
+	 * slab_free_freelist_hook() could have put the items into quarantine.
+	 * If so, there is no need to free them here.  (Objects from
+	 * SLAB_DESTROY_BY_RCU caches are never quarantined, hence the
+	 * extra check.)
+	 */
+	if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
+		return;
+	do_slab_free(s, page, head, tail, cnt, addr);
+}
+
+#ifdef CONFIG_KASAN
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
+{
+	do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+}
+#endif
+
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
 	s = cache_from_obj(s, x);
@@ -3087,10 +3160,17 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
+	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+		      GFP_KERNEL);
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
-	add_partial(n, page, DEACTIVATE_TO_HEAD);
+	/*
+	 * No locks need to be taken here as it has just been
+	 * initialized and there is no concurrent access.
+	 */
+
+	__add_partial(n, page, DEACTIVATE_TO_HEAD);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3107,6 +3187,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 	}
 }
 
+void __kmem_cache_release(struct kmem_cache *s)
+{
+	free_percpu(s->cpu_slab);
+	free_kmem_cache_nodes(s);
+}
+
 static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;
@@ -3148,7 +3234,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
 static int calculate_sizes(struct kmem_cache *s, int forced_order)
 {
 	unsigned long flags = s->flags;
-	unsigned long size = s->object_size;
+	size_t size = s->object_size;
 	int order;
 
 	/*
@@ -3207,7 +3293,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 		 * the object.
 		 */
 		size += 2 * sizeof(struct track);
+#endif
 
+	kasan_cache_create(s, &size, &s->flags);
+#ifdef CONFIG_SLUB_DEBUG
 	if (flags & SLAB_RED_ZONE)
 		/*
 		 * Add some empty padding so that we can catch
@@ -3367,28 +3456,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 
 /*
  * Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because a sysfs file might still access the partial list after shutdown.
  */
 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
 	struct page *page, *h;
 
+	BUG_ON(irqs_disabled());
+	spin_lock_irq(&n->list_lock);
 	list_for_each_entry_safe(page, h, &n->partial, lru) {
 		if (!page->inuse) {
 			remove_partial(n, page);
 			discard_slab(s, page);
 		} else {
 			list_slab_objects(s, page,
-			"Objects remaining in %s on kmem_cache_close()");
+			"Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
+	spin_unlock_irq(&n->list_lock);
 }
 
 /*
  * Release all resources used by a slab cache.
  */
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
 {
 	int node;
 
@@ -3401,33 +3493,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		if (n->nr_partial || slabs_node(s, node))
 			return 1;
 	}
-	free_percpu(s->cpu_slab);
-	free_kmem_cache_nodes(s);
 	return 0;
 }
 
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
-	int rc = kmem_cache_close(s);
-
-	if (!rc) {
-		/*
-		 * Since slab_attr_store may take the slab_mutex, we should
-		 * release the lock while removing the sysfs entry in order to
-		 * avoid a deadlock. Because this is pretty much the last
-		 * operation we do and the lock will be released shortly after
-		 * that in slab_common.c, we could just move sysfs_slab_remove
-		 * to a later point in common code. We should do that when we
-		 * have a common sysfs framework for all allocators.
-		 */
-		mutex_unlock(&slab_mutex);
-		sysfs_slab_remove(s);
-		mutex_lock(&slab_mutex);
-	}
-
-	return rc;
-}
-
 /********************************************************************
  *		Kmalloc subsystem
  *******************************************************************/
@@ -3485,6 +3553,8 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
+	kasan_kmalloc(s, ret, size, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc);
@@ -3495,12 +3565,13 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
+	flags |= __GFP_COMP | __GFP_NOTRACK;
 	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
 
 	kmemleak_alloc(ptr, size, 1, flags);
+	kasan_kmalloc_large(ptr, size, flags);
 	return ptr;
 }
 
@@ -3528,12 +3599,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
+	kasan_kmalloc(s, ret, size, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
-size_t ksize(const void *object)
+static size_t __ksize(const void *object)
 {
 	struct page *page;
 
@@ -3549,6 +3622,16 @@ size_t ksize(const void *object)
 
 	return slab_ksize(page->slab_cache);
 }
+
+size_t ksize(const void *object)
+{
+	size_t size = __ksize(object);
+	/* We assume that ksize callers could use the whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_unpoison_shadow(object, size);
+	return size;
+}
 EXPORT_SYMBOL(ksize);
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -3601,37 +3684,53 @@ void kfree(const void *x)
 	if (unlikely(!PageSlab(page))) {
 		BUG_ON(!PageCompound(page));
 		kmemleak_free(x);
-		__free_memcg_kmem_pages(page, compound_order(page));
+		kasan_kfree_large(x);
+		__free_pages(page, compound_order(page));
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
 }
 EXPORT_SYMBOL(kfree);
 
+#define SHRINK_PROMOTE_MAX 32
+
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
  *
  * The slabs with the least items are placed last. This results in them
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
 	int node;
 	int i;
 	struct kmem_cache_node *n;
 	struct page *page;
 	struct page *t;
-	int objects = oo_objects(s->max);
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+	LIST_HEAD(discard);
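+	/* promote[i] collects partial slabs that have i + 1 free objects */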
+	struct list_head promote[SHRINK_PROMOTE_MAX];
 	unsigned long flags;
 
-	if (!slabs_by_inuse)
-		return -ENOMEM;
+	if (deactivate) {
+		/*
+		 * Disable empty slabs caching. Used to avoid pinning offline
+		 * memory cgroups by kmem pages that can be freed.
+		 */
+		s->cpu_partial = 0;
+		s->min_partial = 0;
+
+		/*
+		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
+		 * so we have to make sure the change is visible.
+		 */
+		kick_all_cpus_sync();
+	}
+
+	for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+		INIT_LIST_HEAD(promote + i);
 
 	flush_all(s);
 	for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -3640,41 +3739,48 @@ int kmem_cache_shrink(struct kmem_cache *s)
 		if (!n->nr_partial)
 			continue;
 
-		for (i = 0; i < objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
-
 		spin_lock_irqsave(&n->list_lock, flags);
 
 		/*
-		 * Build lists indexed by the items in use in each slab.
+		 * Build lists of slabs to discard or promote.
 		 *
 		 * Note that concurrent frees may occur while we hold the
 		 * list_lock. page->inuse here is the upper limit.
 		 */
 		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			list_move(&page->lru, slabs_by_inuse + page->inuse);
-			if (!page->inuse)
+			int free = page->objects - page->inuse;
+
+			/* Do not reread page->inuse */
+			barrier();
+
+			/* We do not keep full slabs on the list */
+			BUG_ON(free <= 0);
+
+			if (free == page->objects) {
+				list_move(&page->lru, &discard);
 				n->nr_partial--;
+			} else if (free <= SHRINK_PROMOTE_MAX)
+				list_move(&page->lru, promote + free - 1);
 		}
 
 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * Promote the slabs filled up most to the head of the
+		 * partial list.
 		 */
-		for (i = objects - 1; i > 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+			list_splice_init(promote + i, &n->partial);
 
 		spin_unlock_irqrestore(&n->list_lock, flags);
 
 		/* Release empty slabs */
-		list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+		list_for_each_entry_safe(page, t, &discard, lru)
 			discard_slab(s, page);
+
+		INIT_LIST_HEAD(&discard);
 	}
 
-	kfree(slabs_by_inuse);
 	return 0;
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
 
 static int slab_mem_going_offline_callback(void *arg)
 {
@@ -3682,7 +3788,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list)
-		kmem_cache_shrink(s);
+		__kmem_cache_shrink(s, false);
 	mutex_unlock(&slab_mutex);
 
 	return 0;
@@ -3832,6 +3938,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 #endif
 		}
 	}
+	slab_init_memcg_params(s);
 	list_add(&s->list, &slab_caches);
 	return s;
 }
@@ -3896,6 +4003,9 @@ static int slab_unmergeable(struct kmem_cache *s)
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
+	if (!is_root_cache(s))
+		return 1;
+
 	if (s->ctor)
 		return 1;
 
@@ -3908,9 +4018,8 @@ static int slab_unmergeable(struct kmem_cache *s)
 	return 0;
 }
 
-static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
-		size_t align, unsigned long flags, const char *name,
-		void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+		unsigned long flags, const char *name, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
@@ -3933,7 +4042,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
 			continue;
 
 		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
-				continue;
+			continue;
 		/*
 		 * Check if alignment is compatible.
 		 * Courtesy of Adrian Drzewiecki
@@ -3944,23 +4053,21 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
 		if (s->size - size >= sizeof(void *))
 			continue;
 
-		if (!cache_match_memcg(s, memcg))
-			continue;
-
 		return s;
 	}
 	return NULL;
 }
 
 struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s;
+	struct kmem_cache *s, *c;
 
-	s = find_mergeable(memcg, size, align, flags, name, ctor);
+	s = find_mergeable(size, align, flags, name, ctor);
 	if (s) {
 		s->refcount++;
+
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
@@ -3968,6 +4075,12 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 		s->object_size = max(s->object_size, (int)size);
 		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
 
+		for_each_memcg_cache(c, s) {
+			c->object_size = s->object_size;
+			c->inuse = max_t(int, c->inuse,
+					 ALIGN(size, sizeof(void *)));
+		}
+
 		if (sysfs_slab_alias(s, name)) {
 			s->refcount--;
 			s = NULL;
@@ -3992,7 +4105,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
 	memcg_propagate_slab_attrs(s);
 	err = sysfs_slab_add(s);
 	if (err)
-		kmem_cache_close(s);
+		__kmem_cache_release(s);
 
 	return err;
 }
@@ -4999,12 +5112,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 static ssize_t shrink_store(struct kmem_cache *s,
 			const char *buf, size_t length)
 {
-	if (buf[0] == '1') {
-		int rc = kmem_cache_shrink(s);
-
-		if (rc)
-			return rc;
-	} else
+	if (buf[0] == '1')
+		kmem_cache_shrink(s);
+	else
 		return -EINVAL;
 	return length;
 }
@@ -5228,7 +5338,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 	err = attribute->store(s, buf, len);
 #ifdef CONFIG_MEMCG_KMEM
 	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-		int i;
+		struct kmem_cache *c;
 
 		mutex_lock(&slab_mutex);
 		if (s->max_attr_size < len)
@@ -5251,11 +5361,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 		 * directly either failed or succeeded, in which case we loop
 		 * through the descendants with best-effort propagation.
 		 */
-		for_each_memcg_cache_index(i) {
-			struct kmem_cache *c = cache_from_memcg(s, i);
-			if (c)
-				attribute->store(c, buf, len);
-		}
+		for_each_memcg_cache(c, s)
+			attribute->store(c, buf, len);
 		mutex_unlock(&slab_mutex);
 	}
 #endif
@@ -5267,15 +5374,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 #ifdef CONFIG_MEMCG_KMEM
 	int i;
 	char *buffer = NULL;
+	struct kmem_cache *root_cache;
 
-	if (!is_root_cache(s))
+	if (is_root_cache(s))
 		return;
 
+	root_cache = s->memcg_params.root_cache;
+
 	/*
 	 * This mean this cache had no attribute written. Therefore, no point
 	 * in copying default values around
 	 */
-	if (!s->max_attr_size)
+	if (!root_cache->max_attr_size)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
@@ -5297,7 +5407,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 		 */
 		if (buffer)
 			buf = buffer;
-		else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
 			buf = mbuf;
 		else {
 			buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5306,7 +5416,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 			buf = buffer;
 		}
 
-		attr->show(s->memcg_params->root_cache, buf);
+		attr->show(root_cache, buf);
 		attr->store(s, buf, strlen(buf));
 	}
 
@@ -5315,6 +5425,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 #endif
 }
 
+static void kmem_cache_release(struct kobject *k)
+{
+	slab_kmem_cache_release(to_slab(k));
+}
+
 static const struct sysfs_ops slab_sysfs_ops = {
 	.show = slab_attr_show,
 	.store = slab_attr_store,
@@ -5322,6 +5437,7 @@ static const struct sysfs_ops slab_sysfs_ops = {
 
 static struct kobj_type slab_ktype = {
 	.sysfs_ops = &slab_sysfs_ops,
+	.release = kmem_cache_release,
 };
 
 static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5339,6 +5455,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
 
 static struct kset *slab_kset;
 
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s))
+		return s->memcg_params.root_cache->memcg_kset;
+#endif
+	return slab_kset;
+}
+
 #define ID_STR_LENGTH 64
 
 /* Create a unique string id for a slab cache:
@@ -5368,13 +5493,15 @@ static char *create_unique_id(struct kmem_cache *s)
 		*p++ = 'F';
 	if (!(s->flags & SLAB_NOTRACK))
 		*p++ = 't';
+	if (s->flags & SLAB_ACCOUNT)
+		*p++ = 'A';
 	if (p != name + 1)
 		*p++ = '-';
 	p += sprintf(p, "%07d", s->size);
 
 #ifdef CONFIG_MEMCG_KMEM
 	if (!is_root_cache(s))
-		p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
+		p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params.memcg));
 #endif
 
 	BUG_ON(p > name + ID_STR_LENGTH - 1);
@@ -5403,7 +5530,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
 		name = create_unique_id(s);
 	}
 
-	s->kobj.kset = slab_kset;
+	s->kobj.kset = cache_kset(s);
 	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
 	if (err) {
 		kobject_put(&s->kobj);
@@ -5416,6 +5543,18 @@ static int sysfs_slab_add(struct kmem_cache *s)
 		kobject_put(&s->kobj);
 		return err;
 	}
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (is_root_cache(s)) {
+		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+		if (!s->memcg_kset) {
+			kobject_del(&s->kobj);
+			kobject_put(&s->kobj);
+			return -ENOMEM;
+		}
+	}
+#endif
+
 	kobject_uevent(&s->kobj, KOBJ_ADD);
 	if (!unmergeable) {
 		/* Setup first alias */
@@ -5425,7 +5564,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
 	return 0;
 }
 
-static void sysfs_slab_remove(struct kmem_cache *s)
+void sysfs_slab_remove(struct kmem_cache *s)
 {
 	if (slab_state < FULL)
 		/*
@@ -5434,6 +5573,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
 		 */
 		return;
 
+#ifdef CONFIG_MEMCG_KMEM
+	kset_unregister(s->memcg_kset);
+#endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
 	kobject_put(&s->kobj);
@@ -5520,6 +5662,77 @@ __initcall(slab_sysfs_init);
  * The /proc/slabinfo ABI
  */
 #ifdef CONFIG_SLABINFO
+
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cache)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_cache_node *n;
+	struct list_head *lh;
+	int cpu, node;
+
+	slabs = 0;
+
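+	/* rough estimate: each online CPU may hold one active cpu slab */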
+	for_each_online_cpu(cpu)
+		slabs++;
+
+	for_each_online_node(node) {
+		n = get_node(cache, node);
+		if (!n)
+			continue;
+		spin_lock_irqsave(&n->list_lock, flags);
+#ifdef CONFIG_SLUB_DEBUG
+		list_for_each(lh, &n->full)
+			slabs++;
+#endif
+		list_for_each(lh, &n->partial)
+			slabs++;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << oo_order(cache->oo));
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+//	spin_lock(&cache_chain_lock);
+	list_for_each_entry(ptr, &slab_caches, list) {
+		size = get_cache_size(ptr);
+
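+		/* find the smallest entry currently held in the top list */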
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++) {
+			if (sizes[i] < sizes[j])
+				j = i;
+		}
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++) {
+		if (top[i])
+			printk("%-21s: size %10lu objsize %10u\n",
+				top[i]->name, sizes[i],
+				top[i]->size);
+	}
+
+//	spin_unlock(&cache_chain_lock);
+}
+
 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 {
 	unsigned long nr_partials = 0;
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
 #include <linux/uio.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -63,6 +64,7 @@ static void __page_cache_release(struct page *page)
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
+	mem_cgroup_uncharge(page);
 }
 
 static void __put_single_page(struct page *page)
@@ -623,6 +625,8 @@ void mark_page_accessed(struct page *page)
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
+	if (page_is_idle(page))
+		clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
@@ -678,6 +682,40 @@ void add_page_to_unevictable_list(struct page *page)
 	spin_unlock_irq(&zone->lru_lock);
 }
 
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma:   vma in which page is mapped for determining reclaimability
+ *
+ * Place @page on the active or unevictable LRU list, depending on its
+ * evictability.  Note that if the page is not evictable, it goes
+ * directly back onto its zone's unevictable list; it does NOT use a
+ * per cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+					 struct vm_area_struct *vma)
+{
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+		SetPageActive(page);
+		lru_cache_add(page);
+		return;
+	}
+
+	if (!TestSetPageMlocked(page)) {
+		/*
+		 * We use the irq-unsafe __mod_zone_page_state because this
+		 * counter is not modified from interrupt context, and the pte
+		 * lock is held (a spinlock), which implies preemption is disabled.
+		 */
+		__mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	add_page_to_unevictable_list(page);
+}
+
 /*
  * If the page can not be invalidated, it is moved to the
  * inactive list to speed up its reclaim.  It is moved to the
@@ -1008,11 +1046,15 @@ void release_pages(struct page **pages, int nr, bool cold)
 	if (zone)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-	if (!list_empty(&pages_to_free))
+	if (!list_empty(&pages_to_free)) {
+		mem_cgroup_uncharge_list(&pages_to_free);
 		free_hot_cold_page_list(&pages_to_free, cold);
+	}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (!list_empty(&trans_huge_pages_to_free))
+	if (!list_empty(&trans_huge_pages_to_free)) {
+		mem_cgroup_uncharge_list(&trans_huge_pages_to_free);
 		free_trans_huge_page_list(&trans_huge_pages_to_free);
+	}
 #endif
 }
 EXPORT_SYMBOL(release_pages);
@@ -1192,10 +1234,8 @@ void __init swap_setup(void)
 	int i;
 
 	bdi_init(swapper_spaces[0].backing_dev_info);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
+	for (i = 0; i < MAX_SWAPFILES; i++)
 		spin_lock_init(&swapper_spaces[i].tree_lock);
-		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
-	}
 #endif
 
 	/* Use a smaller cluster for small-memory machines */
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -47,12 +47,13 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
 
-static struct {
+struct {
 	unsigned long add_total;
 	unsigned long del_total;
 	unsigned long find_success;
 	unsigned long find_total;
 } swap_cache_info;
+EXPORT_SYMBOL(swap_cache_info);
 
 unsigned long total_swapcache_pages(void)
 {
@@ -175,7 +176,7 @@ int add_to_swap(struct page *page, struct list_head *list)
 
 	if (unlikely(PageTransHuge(page)))
 		if (unlikely(split_huge_page_to_list(page, list))) {
-			swapcache_free(entry, NULL);
+			swapcache_free(entry);
 			return 0;
 		}
 
@@ -201,7 +202,7 @@ int add_to_swap(struct page *page, struct list_head *list)
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 		return 0;
 	}
 }
@@ -224,7 +225,7 @@ void delete_from_swap_cache(struct page *page)
 	__delete_from_swap_cache(page);
 	spin_unlock_irq(&address_space->tree_lock);
 
-	swapcache_free(entry, page);
+	swapcache_free(entry);
 	page_cache_release(page);
 }
 
@@ -394,7 +395,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -641,16 +641,13 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-void swapcache_free(swp_entry_t entry, struct page *page)
+void swapcache_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
-	unsigned char count;
 
 	p = swap_info_get(entry);
 	if (p) {
-		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
-		if (page)
-			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
+		swap_entry_free(p, entry, SWAP_HAS_CACHE);
 		spin_unlock(&p->lock);
 	}
 }
@@ -692,7 +689,20 @@ int reuse_swap_page(struct page *page)
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
 		if (count == 1 && !PageWriteback(page)) {
-			delete_from_swap_cache(page);
+			swp_entry_t entry;
+			struct address_space *address_space;
+
+			entry.val = page_private(page);
+
+			address_space = swap_address_space(entry);
+			spin_lock_irq(&address_space->tree_lock);
+			__delete_from_swap_cache(page);
+			spin_unlock_irq(&address_space->tree_lock);
+
+			/* the page is still in use, do not uncharge */
+			swapcache_free(entry);
+			page_cache_release(page);
+
 			SetPageDirty(page);
 		}
 	}
@@ -898,35 +908,38 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
+	struct mm_struct *mm = vma->vm_mm;
 
 	swapcache = page;
 	page = ksm_might_need_to_copy(page, vma, addr);
 	if (unlikely(!page))
 		return -ENOMEM;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
-					 GFP_KERNEL, &memcg)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
 		ret = -ENOMEM;
 		goto out_nolock;
 	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-		mem_cgroup_cancel_charge_swapin(memcg);
+		mem_cgroup_cancel_charge(page, memcg);
 		ret = 0;
 		goto out;
 	}
 
 	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	inc_mm_counter(mm, MM_ANONPAGES);
 	get_page(page);
-	set_pte_at(vma->vm_mm, addr, pte,
+	set_pte_at(mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	if (page == swapcache)
+	if (page == swapcache) {
 		page_add_anon_rmap(page, vma, addr);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr);
-	mem_cgroup_commit_charge_swapin(page, memcg);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not
@@ -1266,6 +1279,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1594,6 +1608,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct filename *pathname;
 	int err, found = 0;
 
+	/* The VE admin check is just to be on the safe side: the admin may
+	 * affect swaps only if he has access to the special file, i.e. if he
+	 * has been granted access to the block device or if the swap file is
+	 * in an area visible to him. */
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -1815,11 +1833,42 @@ static const struct seq_operations swaps_op = {
 	.show =		swap_show
 };
 
+#include <linux/virtinfo.h>
+
+static int swap_show_ve(struct seq_file *swap, void *v)
+{
+	struct user_beancounter *old_ub;
+	struct sysinfo si;
+	int ret;
+
+	si_swapinfo(&si);
+	old_ub = set_exec_ub(current->mm->mm_ub);
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, &si);
+	(void)set_exec_ub(old_ub);
+	if (ret & NOTIFY_FAIL)
+		goto out;
+
+	seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (!si.totalswap)
+		goto out;
+	seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n",
+			"/dev/null",
+			"partition",
+			si.totalswap  << (PAGE_SHIFT - 10),
+			(si.totalswap - si.freeswap) << (PAGE_SHIFT - 10),
+			-1);
+out:
+	return 0;
+}
+
 static int swaps_open(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq;
 	int ret;
 
+	if (!ve_is_super(get_exec_env()))
+		return single_open(file, &swap_show_ve, NULL);
+
 	ret = seq_open(file, &swaps_op);
 	if (ret)
 		return ret;
@@ -1829,17 +1878,26 @@ static int swaps_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int swaps_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *f = file->private_data;
+
+	if (f->op != &swaps_op)
+		return single_release(inode, file);
+	return seq_release(inode, file);
+}
+
 static const struct file_operations proc_swaps_operations = {
 	.open		= swaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= swaps_release,
 	.poll		= swaps_poll,
 };
 
 static int __init procswaps_init(void)
 {
-	proc_create("swaps", 0, NULL, &proc_swaps_operations);
+	proc_create("swaps", S_ISVTX, NULL, &proc_swaps_operations);
 	return 0;
 }
 __initcall(procswaps_init);
--- /dev/null
+++ b/mm/tcache.c
@@ -0,0 +1,1454 @@
+/*
+ *  mm/tcache.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/idr.h>
+#include <linux/atomic.h>
+#include <linux/kref.h>
+#include <linux/jhash.h>
+#include <linux/completion.h>
+#include <linux/shrinker.h>
+#include <linux/vmstat.h>
+#include <linux/swap.h>
+#include <linux/cleancache.h>
+
+/* cleancache_put_page is called from atomic context */
+#define TCACHE_GFP_MASK			(__GFP_NORETRY | __GFP_NOWARN)
+
+struct tcache_node_tree {
+	struct rb_root			root;
+	spinlock_t			lock;
+};
+
+/*
+ * Per NUMA node data of a tcache_pool. Protected by tcache_nodeinfo->lock.
+ */
+struct tcache_pool_nodeinfo {
+	struct tcache_pool		*pool;
+
+	/* node in tcache_nodeinfo->reclaim_tree */
+	struct rb_node			reclaim_node;
+
+	/* LRU list of pages, linked through page->lru */
+	struct list_head		lru;
+
+	/* number of pages on the LRU list */
+	unsigned long			nr_pages;
+
+	/* recent number of successful gets and puts from the pool;
+	 * used in calculating reclaim prio */
+	unsigned long			recent_gets;
+	unsigned long			recent_puts;
+
+	/* reuse_ratio is basically recent_gets / recent_puts;
+	 * it shows the efficiency of the pool */
+	unsigned long			reuse_ratio;
+
+	/* timestamp of the eldest page on the LRU list */
+	unsigned long			timestamp;
+
+	/* increased on every LRU add/del, reset once it gets big enough;
+	 * used for rate limiting rebalancing of reclaim_tree */
+	unsigned long			events;
+	spinlock_t			lock;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Tcache pools correspond to super blocks. A pool is created on FS mount
+ * (cleancache_init_fs) and destroyed on unmount (cleancache_invalidate_fs).
+ */
+struct tcache_pool {
+	/*
+	 * Reference counter. Pool destruction (triggered by unmount) will
+	 * actually start only after it reaches zero.
+	 *
+	 * Initialized to 1 on creation, decremented on destruction. May be
+	 * held temporarily by active users.
+	 */
+	struct kref			kref;
+
+	/*
+	 * Binary search trees of tcache_node structs that belong to this pool.
+	 * Linked by tcache_node->tree_node.
+	 */
+	struct tcache_node_tree		*node_tree;
+
+	/* track total number of nodes in each pool for debugging */
+	atomic_long_t			nr_nodes;
+
+	/* used to synchronize destruction */
+	struct completion		completion;
+	struct rcu_head			rcu;
+
+	/* Per NUMA node data. This must be the last element of the struct. */
+	struct tcache_pool_nodeinfo	nodeinfo[0];
+};
+
+static atomic_long_t nr_tcache_nodes;
+
+/*
+ * Tcache nodes correspond to inodes. A node is created automatically when a
+ * new page is added to the cache (cleancache_put_page) and destroyed either
+ * when the corresponding inode is invalidated (cleancache_invalidate_inode) or
+ * when the last page is removed from it (by the shrinker, cleancache_get_page,
+ * or cleancache_invalidate_page).
+ */
+struct tcache_node {
+	/*
+	 * Reference counter. Node is freed when it reaches zero.
+	 *
+	 * Incremented when the first page is attached to the node (node
+	 * becomes non-empty) and decremented when the last page is detached
+	 * (node becomes empty). May also be held temporarily by active users.
+	 *
+	 * Note that a node with a non-zero reference count is not guaranteed
+	 * to be present on the tcache_pool->node_tree - it could have been
+	 * removed by cleancache_invalidate_inode. However, if a node is found
+	 * on the tree with the tree_lock held, it must have a positive
+	 * reference count.
+	 */
+	struct kref			kref;
+
+	struct tcache_pool		*pool;
+	struct cleancache_filekey	key;
+	struct rb_node			tree_node;
+
+	/*
+	 * Radix tree of pages attached to this node. Protected by tree_lock.
+	 */
+	struct radix_tree_root		page_tree;
+	spinlock_t			tree_lock;
+
+	unsigned long			nr_pages;
+	bool				invalidated;
+};
+
+/*
+ * To reduce contention on tcache_node_tree->lock, we maintain several trees
+ * per pool and distribute nodes among them according to their hash.
+ */
+static int num_node_trees __read_mostly = 1;
+
+/*
+ * tcache_pool_idr provides id -> tcache_pool map. Lookups are lock free (RCU).
+ * Updates are protected by tcache_pool_lock.
+ */
+static DEFINE_IDR(tcache_pool_idr);
+static DEFINE_SPINLOCK(tcache_pool_lock);
+
+struct tcache_nodeinfo {
+	spinlock_t lock;
+
+	/* tree of pools, sorted by reclaim prio */
+	struct rb_root reclaim_tree;
+	struct rb_node __rcu *rb_first;
+
+	/* total number of pages on all LRU lists corresponding to this node */
+	atomic_long_t nr_pages;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Global per NUMA node data.
+ */
+static struct tcache_nodeinfo *tcache_nodeinfo;
+
+/*
+ * Locking rules:
+ *
+ *  tcache_node->tree_lock
+ *       tcache_node_tree->lock
+ *       tcache_nodeinfo->lock
+ */
+
+/* Enable/disable tcache backend (set at boot time) */
+static bool tcache_enabled __read_mostly = true;
+module_param_named(enabled, tcache_enabled, bool, 0444);
+
+/* Enable/disable populating the cache */
+static bool tcache_active __read_mostly = true;
+module_param_named(active, tcache_active, bool, 0644);
+
+/*
+ * How long a tcache page is considered active, i.e. likely to be reused.
+ * A pool that contains only active pages will be given a boost over other
+ * pools while selecting a reclaim target.
+ */
+static unsigned long tcache_active_interval __read_mostly = 60 * HZ;
+
+/* Total number of pages cached */
+static DEFINE_PER_CPU(long, nr_tcache_pages);
+
+static inline u32 key_hash(const struct cleancache_filekey *key)
+{
+	return jhash2(key->u.key, CLEANCACHE_KEY_MAX, 0);
+}
+
+static inline struct tcache_node_tree *
+node_tree_from_key(struct tcache_pool *pool,
+		   const struct cleancache_filekey *key)
+{
+	return &pool->node_tree[key_hash(key) & (num_node_trees - 1)];
+}
+
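+/*
+ * Cache the leftmost node of the reclaim tree so that it can be looked up
+ * locklessly (under RCU).
+ */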
+static struct rb_node *update_ni_rb_first(struct tcache_nodeinfo *ni)
+{
+	struct rb_node *first = rb_first(&ni->reclaim_tree);
+	rcu_assign_pointer(ni->rb_first, first);
+	return first;
+}
+
+static void __tcache_insert_reclaim_node(struct tcache_nodeinfo *ni,
+					 struct tcache_pool_nodeinfo *pni);
+
+static inline bool tcache_check_events(struct tcache_pool_nodeinfo *pni)
+{
+	/*
+	 * We don't want to rebalance reclaim_tree on each get/put, because it
+	 * would be way too costly.  Instead we count get/put events per
+	 * pool and update a pool's reclaim prio only once the counter gets big
+	 * enough. This should yield satisfactory reclaim fairness while still
+	 * keeping the cost of get/put low.
+	 */
+	pni->events++;
+	if (likely(pni->events < 1024))
+		return false;
+
+	pni->events = 0;
+
+	/*
+	 * If the pool is empty, there is no point in adding it to the
+	 * reclaim_tree. Neither do we need to remove it from the tree -
+	 * it will be done by the shrinker once it tries to scan it.
+	 */
+	if (unlikely(list_empty(&pni->lru)))
+		return false;
+
+	/*
+	 * This can only happen if the node was removed from the tree on pool
+	 * destruction (see tcache_remove_from_reclaim_trees()). Nothing to do
+	 * then.
+	 */
+	if (unlikely(RB_EMPTY_NODE(&pni->reclaim_node)))
+		return false;
+
+	return true;
+}
+
+/*
+ * Add a page to the LRU list. This effectively makes the page visible to the
+ * shrinker, so it must only be called after the page was properly initialized
+ * and added to the corresponding page tree.
+ */
+static void tcache_lru_add(struct tcache_pool *pool, struct page *page)
+{
+	int nid = page_to_nid(page);
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+
+	atomic_long_inc(&ni->nr_pages);
+
+	spin_lock(&pni->lock);
+	pni->nr_pages++;
+	list_add_tail(&page->lru, &pni->lru);
+
+	pni->recent_puts++;
+	if (unlikely(pni->recent_puts > pni->nr_pages / 2)) {
+		pni->recent_gets /= 2;
+		pni->recent_puts /= 2;
+	}
+
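+	/*
+	 * (Re)insert the pool into the reclaim tree if it is not linked yet
+	 * or if enough events have accumulated to warrant rebalancing.
+	 */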
+	if (tcache_check_events(pni) || RB_EMPTY_NODE(&pni->reclaim_node)) {
+		spin_lock(&ni->lock);
+		if (!RB_EMPTY_NODE(&pni->reclaim_node))
+			rb_erase(&pni->reclaim_node, &ni->reclaim_tree);
+		__tcache_insert_reclaim_node(ni, pni);
+		update_ni_rb_first(ni);
+		spin_unlock(&ni->lock);
+	}
+	spin_unlock(&pni->lock);
+}
+
+static void __tcache_lru_del(struct tcache_pool_nodeinfo *pni,
+			     struct page *page)
+{
+	pni->nr_pages--;
+	list_del_init(&page->lru);
+}
+
+/*
+ * Remove a page from the LRU list. This function is safe to call on the same
+ * page from concurrent threads - the page will be removed only once.
+ */
+static void tcache_lru_del(struct tcache_pool *pool, struct page *page,
+			   bool reused)
+{
+	int nid = page_to_nid(page);
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+	bool deleted = false;
+
+	spin_lock(&pni->lock);
+
+	/* Raced with reclaimer? */
+	if (unlikely(list_empty(&page->lru)))
+		goto out;
+
+	__tcache_lru_del(pni, page);
+	deleted = true;
+
+	if (reused)
+		pni->recent_gets++;
+
+	if (tcache_check_events(pni)) {
+		spin_lock(&ni->lock);
+		if (!RB_EMPTY_NODE(&pni->reclaim_node))
+			rb_erase(&pni->reclaim_node, &ni->reclaim_tree);
+		__tcache_insert_reclaim_node(ni, pni);
+		update_ni_rb_first(ni);
+		spin_unlock(&ni->lock);
+	}
+out:
+	spin_unlock(&pni->lock);
+	if (deleted)
+		atomic_long_dec(&ni->nr_pages);
+}
+
+static int tcache_create_pool(void)
+{
+	size_t size;
+	struct tcache_pool *pool;
+	struct tcache_pool_nodeinfo *pni;
+	int id;
+	int i;
+
+	size = sizeof(struct tcache_pool);
+	size += nr_node_ids * sizeof(struct tcache_pool_nodeinfo);
+
+	pool = kzalloc(size, GFP_KERNEL);
+	if (!pool)
+		goto fail;
+
+	pool->node_tree = kcalloc(num_node_trees, sizeof(*pool->node_tree),
+				  GFP_KERNEL);
+	if (!pool->node_tree)
+		goto free_pool;
+
+	kref_init(&pool->kref);
+	init_completion(&pool->completion);
+
+	for (i = 0; i < num_node_trees; i++) {
+		pool->node_tree[i].root = RB_ROOT;
+		spin_lock_init(&pool->node_tree[i].lock);
+	}
+
+	for (i = 0; i < nr_node_ids; i++) {
+		pni = &pool->nodeinfo[i];
+		pni->pool = pool;
+		RB_CLEAR_NODE(&pni->reclaim_node);
+		INIT_LIST_HEAD(&pni->lru);
+		spin_lock_init(&pni->lock);
+	}
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&tcache_pool_lock);
+
+	id = idr_alloc(&tcache_pool_idr, pool, 0, 0, GFP_NOWAIT);
+
+	spin_unlock(&tcache_pool_lock);
+	idr_preload_end();
+
+	if (id < 0)
+		goto free_trees;
+	return id;
+
+free_trees:
+	kfree(pool->node_tree);
+free_pool:
+	kfree(pool);
+fail:
+	return -1;
+}
+
+/*
+ * Take a reference to a pool unless it is being destroyed. Returns true on
+ * success, false on failure. The caller must guarantee that the pool can be
+ * safely dereferenced.
+ */
+static bool tcache_grab_pool(struct tcache_pool *pool)
+{
+	return kref_get_unless_zero(&pool->kref);
+}
+
+static void tcache_hold_pool(struct tcache_pool *pool)
+{
+	kref_get(&pool->kref);
+}
+
+/*
+ * Return the pool corresponding to an id (or NULL if there is no such). The
+ * reference counter of the returned pool is incremented.
+ */
+static struct tcache_pool *tcache_get_pool(int id)
+{
+	struct tcache_pool *pool;
+
+	if (id < 0)
+		return NULL;
+
+	rcu_read_lock();
+	pool = idr_find(&tcache_pool_idr, id);
+	if (pool && !tcache_grab_pool(pool))
+		pool = NULL;
+	rcu_read_unlock();
+
+	return pool;
+}
+
+static void tcache_pool_release_fn(struct kref *kref)
+{
+	struct tcache_pool *pool = container_of(kref, struct tcache_pool, kref);
+
+	/*
+	 * Notify tcache_destroy_pool that it is now safe to proceed to
+	 * destruction.
+	 */
+	complete(&pool->completion);
+}
+
+/*
+ * Release reference to a pool taken by tcache_grab_pool or tcache_get_pool.
+ */
+static inline void tcache_put_pool(struct tcache_pool *pool)
+{
+	kref_put(&pool->kref, tcache_pool_release_fn);
+}
+
+static void tcache_remove_from_reclaim_trees(struct tcache_pool *pool);
+static void tcache_invalidate_node_tree(struct tcache_node_tree *tree);
+
+static void tcache_destroy_pool(int id)
+{
+	int i;
+	struct tcache_pool *pool;
+	unsigned long nr_nodes;
+
+	spin_lock(&tcache_pool_lock);
+	pool = idr_find(&tcache_pool_idr, id);
+	if (pool)
+		idr_remove(&tcache_pool_idr, id);
+	spin_unlock(&tcache_pool_lock);
+
+	if (!pool)
+		return;
+
+	tcache_put_pool(pool);
+
+	/*
+	 * Wait until all references to this pool are released.
+	 *
+	 * We removed the pool from id -> pool map, so now new references can
+	 * only be taken by the shrinker. The latter takes a reference to this
+	 * pool only in order to remove a page from it. Since no new pages can
+	 * be added to the pool, we are guaranteed to make progress.
+	 */
+	wait_for_completion(&pool->completion);
+
+	tcache_remove_from_reclaim_trees(pool);
+
+	for (i = 0; i < num_node_trees; i++)
+		tcache_invalidate_node_tree(&pool->node_tree[i]);
+
+	nr_nodes = atomic_long_read(&pool->nr_nodes);
+	if (WARN(nr_nodes != 0, "pool->nr_nodes %ld", nr_nodes))
+		return;
+
+	kfree(pool->node_tree);
+	kfree_rcu(pool, rcu);
+}
+
+static struct tcache_node *tcache_alloc_node(void)
+{
+	struct tcache_node *node;
+
+	node = kzalloc(sizeof(*node), TCACHE_GFP_MASK);
+	if (!node)
+		return NULL;
+
+	kref_init(&node->kref);
+	INIT_RADIX_TREE(&node->page_tree, TCACHE_GFP_MASK);
+	spin_lock_init(&node->tree_lock);
+
+	return node;
+}
+
+static struct tcache_node *__tcache_lookup_node(struct rb_root *rb_root,
+		const struct cleancache_filekey *key,
+		struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+	struct rb_node **__rb_link = &rb_root->rb_node;
+	struct rb_node *__rb_parent = NULL;
+	struct tcache_node *node;
+	int ret;
+
+	*rb_link = NULL;
+	*rb_parent = NULL;
+
+	while (*__rb_link) {
+		__rb_parent = *__rb_link;
+		node = rb_entry(__rb_parent, struct tcache_node, tree_node);
+
+		ret = memcmp(&node->key, key, sizeof(*key));
+		if (ret > 0)
+			__rb_link = &__rb_parent->rb_left;
+		else if (ret < 0)
+			__rb_link = &__rb_parent->rb_right;
+		else
+			return node;
+	}
+
+	*rb_parent = __rb_parent;
+	*rb_link = __rb_link;
+
+	return NULL;
+}
+
+static void __tcache_insert_node(struct rb_root *rb_root,
+		struct tcache_node *node,
+		struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+	rb_link_node(&node->tree_node, rb_parent, rb_link);
+	rb_insert_color(&node->tree_node, rb_root);
+}
+
+static void __tcache_delete_node(struct rb_root *rb_root,
+				 struct tcache_node *node)
+{
+	/*
+	 * A node is deleted from the tree automatically by the node release
+	 * function as soon as the last reference to it has been dropped (all
+	 * pages and users have gone), but it can also be deleted explicitly by
+	 * tcache_invalidate_node, in which case the release function will
+	 * receive a node which is already not on the tree.
+	 */
+	if (!RB_EMPTY_NODE(&node->tree_node)) {
+		rb_erase(&node->tree_node, rb_root);
+		RB_CLEAR_NODE(&node->tree_node);
+	}
+}
+
+/*
+ * Take a reference to a node. The caller must guarantee that the node has a
+ * positive reference count. In particular, the function is safe to call if the
+ * node is known to be on the tree.
+ */
+static inline void tcache_hold_node(struct tcache_node *node)
+{
+	kref_get(&node->kref);
+}
+
+/*
+ * Find and get a reference to the node corresponding to a key in a pool. If
+ * the requested node does not exist and may_create is true, try to create a
+ * new one.
+ */
+static noinline_for_stack struct tcache_node *
+tcache_get_node(struct tcache_pool *pool, const struct cleancache_filekey *key,
+		bool may_create)
+{
+	struct tcache_node_tree *tree;
+	struct tcache_node *new_node = NULL, *node;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long flags;
+
+	tree = node_tree_from_key(pool, key);
+retry:
+	spin_lock_irqsave(&tree->lock, flags);
+	node = __tcache_lookup_node(&tree->root, key, &rb_link, &rb_parent);
+	if (node)
+		tcache_hold_node(node);
+	else if (new_node) {
+		node = new_node;
+		node->pool = pool;
+		node->key = *key;
+		atomic_long_inc(&pool->nr_nodes);
+		atomic_long_inc(&nr_tcache_nodes);
+		__tcache_insert_node(&tree->root, node, rb_link, rb_parent);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+
+	if (node) {
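+		/* the node already existed; free the one we may have preallocated */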
+		if (node != new_node)
+			kfree(new_node);
+		if (WARN_ON(node->pool != pool))
+			node = NULL;
+		return node;
+	}
+
+	if (may_create) {
+		new_node = tcache_alloc_node();
+		if (new_node)
+			goto retry;
+	}
+	return NULL;
+}
+
+static void tcache_node_release_fn(struct kref *kref)
+{
+	struct tcache_node *node = container_of(kref, struct tcache_node, kref);
+	struct tcache_node_tree *tree;
+
+	tree = node_tree_from_key(node->pool, &node->key);
+
+	__tcache_delete_node(&tree->root, node);
+	spin_unlock(&tree->lock);
+
+	atomic_long_dec(&nr_tcache_nodes);
+	atomic_long_dec(&node->pool->nr_nodes);
+	kfree(node);
+}
+
+/*
+ * Release a reference to a node taken by tcache_hold_node or tcache_get_node.
+ */
+static inline void tcache_put_node(struct tcache_node *node)
+{
+	struct tcache_node_tree *tree;
+
+	tree = node_tree_from_key(node->pool, &node->key);
+	kref_put_spinlock_irqsave(&node->kref, tcache_node_release_fn,
+				  &tree->lock);
+}
+
+static struct tcache_node *tcache_get_node_and_pool(int pool_id,
+		const struct cleancache_filekey *key, bool may_create)
+{
+	struct tcache_pool *pool;
+	struct tcache_node *node;
+
+	pool = tcache_get_pool(pool_id);
+	if (!pool)
+		return NULL;
+	node = tcache_get_node(pool, key, may_create);
+	if (!node)
+		tcache_put_pool(pool);
+	return node;
+}
+
+static void tcache_put_node_and_pool(struct tcache_node *node)
+{
+	struct tcache_pool *pool = node->pool;
+
+	tcache_put_node(node);
+	tcache_put_pool(pool);
+}
+
+static void tcache_invalidate_node_pages(struct tcache_node *node);
+
+/*
+ * Remove a node from the tree and invalidate its pages.
+ */
+static void tcache_invalidate_node(struct tcache_pool *pool,
+				   const struct cleancache_filekey *key)
+{
+	struct tcache_node_tree *tree;
+	struct tcache_node *node;
+	struct rb_node **rb_link, *rb_parent;
+
+	tree = node_tree_from_key(pool, key);
+
+	spin_lock_irq(&tree->lock);
+	node = __tcache_lookup_node(&tree->root, key, &rb_link, &rb_parent);
+	if (node) {
+		tcache_hold_node(node);
+		__tcache_delete_node(&tree->root, node);
+	}
+	spin_unlock_irq(&tree->lock);
+
+	if (node) {
+		tcache_invalidate_node_pages(node);
+		tcache_put_node(node);
+	}
+}
+
+static noinline_for_stack void
+tcache_invalidate_node_tree(struct tcache_node_tree *tree)
+{
+	struct tcache_node *node;
+
+	/*
+	 * There is no need to take tree->lock, because this function is only
+	 * called when the pool is about to be destroyed.
+	 */
+	while (!RB_EMPTY_ROOT(&tree->root)) {
+		node = rb_entry(rb_first(&tree->root),
+				struct tcache_node, tree_node);
+
+		/* Remaining nodes must be held solely by their pages */
+		WARN_ON(atomic_read(&node->kref.refcount) != 1);
+		WARN_ON(node->nr_pages == 0);
+		WARN_ON(node->invalidated);
+
+		tcache_hold_node(node);
+		tcache_invalidate_node_pages(node);
+		tcache_put_node(node);
+	}
+}
+
+static inline struct tcache_node *tcache_page_node(struct page *page)
+{
+	return (struct tcache_node *)page->mapping;
+}
+
+static inline unsigned long tcache_page_timestamp(struct page *page)
+{
+	return page->private;
+}
+
+static inline void tcache_init_page(struct page *page,
+				    struct tcache_node *node, pgoff_t index)
+{
+	page->mapping = (struct address_space *)node;
+	page->private = jiffies;
+	page->index = index;
+}
+
+static inline void tcache_put_page(struct page *page)
+{
+	page->mapping = NULL;
+	free_hot_cold_page(page, false);
+}
+
+static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
+				    struct page *page)
+{
+	int err = 0;
+
+	/*
+	 * If the node was invalidated after we looked it up, abort in order to
+	 * avoid clashes with tcache_invalidate_node_pages.
+	 */
+	if (unlikely(node->invalidated)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	err = radix_tree_insert(&node->page_tree, index, page);
+	WARN_ON(err == -EEXIST);
+	if (!err) {
+		if (!node->nr_pages++)
+			tcache_hold_node(node);
+		__this_cpu_inc(nr_tcache_pages);
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+	}
+out:
+	return err;
+}
+
+static struct page *__tcache_page_tree_delete(struct tcache_node *node,
+					      pgoff_t index, struct page *page)
+{
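+	/*
+	 * We expect exactly two references: one held by the page tree and
+	 * one held by our caller.  If somebody else (e.g. the shrinker) has
+	 * taken an extra reference, back off and let the caller retry.
+	 */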
+	if (!page_ref_freeze(page, 2)) {
+		put_page(page);
+		return NULL;
+	}
+
+	page = radix_tree_delete_item(&node->page_tree, index, page);
+	if (page) {
+		if (!--node->nr_pages)
+			tcache_put_node(node);
+		__this_cpu_dec(nr_tcache_pages);
+		__dec_zone_page_state(page, NR_FILE_PAGES);
+	}
+	return page;
+}
+
+static struct page *tcache_page_tree_delete(struct tcache_node *node,
+					    pgoff_t index, struct page *page)
+{
+	spin_lock(&node->tree_lock);
+	page = __tcache_page_tree_delete(node, index, page);
+	spin_unlock(&node->tree_lock);
+	return page;
+}
+
+/*
+ * Attempt to attach a page to a node at a given offset. If there is already a
+ * page at the given offset, it will be replaced. Returns 0 on success. The
+ * caller must put the page whether the function succeeds or fails.
+ */
+static noinline_for_stack int
+tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
+{
+	unsigned long flags;
+	int err = 0;
+
+	tcache_init_page(page, node, index);
+
+	spin_lock_irqsave(&node->tree_lock, flags);
+	err = tcache_page_tree_insert(node, index, page);
+	spin_unlock(&node->tree_lock);
+	if (!err)
+		tcache_lru_add(node->pool, page);
+	local_irq_restore(flags);
+	return err;
+}
+
+/*
+ * Detach and return the page at a given offset of a node. The caller must put
+ * the page when it is done with it.
+ */
+static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
+				       bool reused)
+{
+	void **pagep;
+	unsigned long flags;
+	struct page *page;
+
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&node->page_tree, index);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page))
+			goto out;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			WARN_ON(1);
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			put_page(page);
+			goto repeat;
+		}
+	}
+out:
+	rcu_read_unlock();
+
+	if (page) {
+		local_irq_save(flags);
+		page = tcache_page_tree_delete(node, index, page);
+		if (page)
+			tcache_lru_del(node->pool, page, reused);
+		local_irq_restore(flags);
+		/*
+		 * The shrinker could have isolated the page in parallel
+		 * with us. In that case page_ref_freeze(page, 2) in
+		 * __tcache_page_tree_delete() fails, and we have to
+		 * repeat the cycle.
+		 */
+		if (!page)
+			goto repeat;
+	}
+
+	return page;
+}
+
+static unsigned tcache_lookup(struct page **pages, struct tcache_node *node,
+			pgoff_t start, unsigned int nr_pages, pgoff_t *indices)
+{
+	struct radix_tree_iter iter;
+	unsigned int ret = 0;
+	void **slot;
+
+	if (!nr_pages)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_slot(slot, &node->page_tree, &iter, start) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+
+		if (radix_tree_exception(page) && radix_tree_deref_retry(page))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		indices[ret] = iter.index;
+		pages[ret] = page;
+		if (++ret == nr_pages)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+#define TCACHE_PAGEVEC_SIZE 16
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+	pgoff_t indices[TCACHE_PAGEVEC_SIZE];
+	struct page *pages[TCACHE_PAGEVEC_SIZE];
+	pgoff_t index = 0;
+	unsigned nr_pages;
+	bool repeat;
+	int i;
+
+	/*
+	 * First forbid new page insertions - see tcache_page_tree_insert().
+	 */
+	node->invalidated = true;
+again:
+	repeat = false;
+	while ((nr_pages = tcache_lookup(pages, node, index,
+						TCACHE_PAGEVEC_SIZE, indices))) {
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pages[i];
+
+			index = indices[i];
+
+			spin_lock_irq(&node->tree_lock);
+			page = __tcache_page_tree_delete(node, page->index, page);
+			spin_unlock(&node->tree_lock);
+
+			if (page) {
+				tcache_lru_del(node->pool, page, false);
+				local_irq_enable();
+				tcache_put_page(page);
+			} else {
+				local_irq_enable();
+				repeat = true;
+			}
+		}
+		cond_resched();
+		index++;
+	}
+
+	if (repeat) {
+		index = 0;
+		goto again;
+	}
+
+	WARN_ON(node->nr_pages != 0);
+}
+
+static noinline_for_stack void
+tcache_remove_from_reclaim_trees(struct tcache_pool *pool)
+{
+	int i;
+	struct tcache_nodeinfo *ni;
+	struct tcache_pool_nodeinfo *pni;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		ni = &tcache_nodeinfo[i];
+		pni = &pool->nodeinfo[i];
+
+		spin_lock_irq(&ni->lock);
+		if (!RB_EMPTY_NODE(&pni->reclaim_node)) {
+			rb_erase(&pni->reclaim_node, &ni->reclaim_tree);
+			update_ni_rb_first(ni);
+			/*
+			 * Clear the node so that tcache_check_events() does
+			 * not reinsert the pool back into the tree.
+			 */
+			RB_CLEAR_NODE(&pni->reclaim_node);
+		}
+		spin_unlock_irq(&ni->lock);
+	}
+}
+
+static inline bool tcache_reclaim_node_before(struct tcache_pool_nodeinfo *a,
+					      struct tcache_pool_nodeinfo *b,
+					      unsigned long now)
+{
+	bool a_active = now - a->timestamp < tcache_active_interval;
+	bool b_active = now - b->timestamp < tcache_active_interval;
+
+	/*
+	 * Always favor active pools over inactive. If the two pools are both
+	 * active or both inactive, the order in the reclaim_tree is determined
+	 * by the reuse ratio.
+	 */
+	if (a_active && !b_active)
+		return false;
+	if (!a_active && b_active)
+		return true;
+	return a->reuse_ratio < b->reuse_ratio;
+}
+
+static noinline_for_stack void
+__tcache_insert_reclaim_node(struct tcache_nodeinfo *ni,
+			     struct tcache_pool_nodeinfo *pni)
+{
+	struct rb_node **link = &ni->reclaim_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct tcache_pool_nodeinfo *pni2;
+	unsigned long now = jiffies;
+
+	BUG_ON(list_empty(&pni->lru));
+
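+	/* reuse ratio in percent; the +1 guards against division by zero */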
+	pni->reuse_ratio = pni->recent_gets * 100 / (pni->recent_puts + 1);
+	pni->timestamp = tcache_page_timestamp(list_first_entry(&pni->lru,
+							struct page, lru));
+
+	while (*link) {
+		parent = *link;
+		pni2 = rb_entry(parent, struct tcache_pool_nodeinfo,
+				reclaim_node);
+		if (tcache_reclaim_node_before(pni, pni2, now))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&pni->reclaim_node, parent, link);
+	rb_insert_color(&pni->reclaim_node, &ni->reclaim_tree);
+}
+
+static noinline_for_stack int
+__tcache_lru_isolate(struct tcache_pool_nodeinfo *pni,
+		     struct page **pages, int nr_to_scan)
+{
+	struct tcache_node *node;
+	struct page *page;
+	int nr_isolated = 0;
+
+	while (nr_to_scan-- > 0 && !list_empty(&pni->lru)) {
+		page = list_first_entry(&pni->lru, struct page, lru);
+
+		if (unlikely(!page_cache_get_speculative(page)))
+			continue;
+
+		__tcache_lru_del(pni, page);
+
+		/*
+		 * A node can be destroyed only if all its pages have been
+		 * removed both from the tree and the LRU list. Since we are
+		 * holding the LRU lock here and hence preventing the page
+		 * from being removed from the LRU list, it is therefore safe
+		 * to access the node which the page is attached to.
+		 */
+		node = tcache_page_node(page);
+		tcache_hold_node(node);
+		tcache_hold_pool(node->pool);
+
+		pages[nr_isolated++] = page;
+	}
+	return nr_isolated;
+}
+
+static noinline_for_stack int
+tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
+{
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni;
+	int nr_isolated = 0;
+	struct rb_node *rbn;
+
+	rcu_read_lock();
+again:
+	rbn = rcu_dereference(ni->rb_first);
+	if (!rbn) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	pni = rb_entry(rbn, struct tcache_pool_nodeinfo, reclaim_node);
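+	/*
+	 * If the pool is going away, drop its node from the reclaim tree
+	 * (once its LRU has drained) and retry with the next pool.
+	 */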
+	if (!tcache_grab_pool(pni->pool)) {
+		spin_lock_irq(&ni->lock);
+		if (!RB_EMPTY_NODE(rbn) && list_empty(&pni->lru)) {
+			rb_erase(rbn, &ni->reclaim_tree);
+			RB_CLEAR_NODE(rbn);
+			update_ni_rb_first(ni);
+		}
+		spin_unlock_irq(&ni->lock);
+		goto again;
+	}
+	rcu_read_unlock();
+
+	spin_lock_irq(&pni->lock);
+	nr_isolated = __tcache_lru_isolate(pni, pages, nr_to_isolate);
+
+	if (!nr_isolated)
+		goto unlock;
+
+	if (!RB_EMPTY_NODE(rbn) || !list_empty(&pni->lru)) {
+		spin_lock(&ni->lock);
+		if (!RB_EMPTY_NODE(rbn))
+			rb_erase(rbn, &ni->reclaim_tree);
+		if (!list_empty(&pni->lru))
+			__tcache_insert_reclaim_node(ni, pni);
+		else
+			RB_CLEAR_NODE(rbn);
+		update_ni_rb_first(ni);
+		spin_unlock(&ni->lock);
+	}
+unlock:
+	spin_unlock_irq(&pni->lock);
+	tcache_put_pool(pni->pool);
+out:
+	if (nr_isolated)
+		atomic_long_sub(nr_isolated, &ni->nr_pages);
+	return nr_isolated;
+}
+
+static bool __tcache_reclaim_page(struct page *page)
+{
+	struct tcache_node *node;
+
+	node = tcache_page_node(page);
+	page = tcache_page_tree_delete(node, page->index, page);
+	tcache_put_node_and_pool(node);
+	return (page != NULL);
+}
+
+static int tcache_reclaim_pages(struct page **pages, int nr)
+{
+	int i;
+	int nr_reclaimed = 0;
+
+	local_irq_disable();
+	for (i = 0; i < nr; i++) {
+		if (__tcache_reclaim_page(pages[i])) {
+			nr_reclaimed++;
+			tcache_put_page(pages[i]);
+		}
+		pages[i] = NULL;
+	}
+	local_irq_enable();
+	return nr_reclaimed;
+}
+
+static noinline_for_stack struct page *
+tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
+{
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+	struct page *page = NULL;
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+
+	spin_lock(&pni->lock);
+	ret = __tcache_lru_isolate(pni, &page, 1);
+	spin_unlock(&pni->lock);
+
+	if (!ret)
+		goto out;
+
+	atomic_long_dec(&ni->nr_pages);
+
+	if (!__tcache_reclaim_page(page))
+		page = NULL;
+	else
+		page_ref_unfreeze(page, 1);
+out:
+	local_irq_restore(flags);
+	return page;
+}
+
+static struct page *tcache_alloc_page(struct tcache_pool *pool)
+{
+	struct page *page;
+
+	page = alloc_page(TCACHE_GFP_MASK | __GFP_HIGHMEM);
+	if (!page)
+		page = tcache_try_to_reclaim_page(pool, numa_node_id());
+
+	return page;
+}
+
+static unsigned long tcache_shrink_count(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	atomic_long_t *nr_pages = &tcache_nodeinfo[sc->nid].nr_pages;
+	long ret;
+
+	ret = atomic_long_read(nr_pages);
+	WARN_ON(ret < 0);
+	return ret >= 0 ? ret : 0;
+}
+
+#define TCACHE_SCAN_BATCH 128UL
+static DEFINE_PER_CPU(struct page * [TCACHE_SCAN_BATCH], tcache_page_vec);
+
+static unsigned long tcache_shrink_scan(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
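+	/* get_cpu_var() disables preemption, pinning the per-cpu page vector */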
+	struct page **pages = get_cpu_var(tcache_page_vec);
+	int nr_isolated, nr_reclaimed;
+
+	if (WARN_ON(sc->nr_to_scan > TCACHE_SCAN_BATCH))
+		sc->nr_to_scan = TCACHE_SCAN_BATCH;
+
+	nr_isolated = tcache_lru_isolate(sc->nid, pages, sc->nr_to_scan);
+	if (!nr_isolated) {
+		put_cpu_var(tcache_page_vec);
+		return SHRINK_STOP;
+	}
+
+	nr_reclaimed = tcache_reclaim_pages(pages, nr_isolated);
+	put_cpu_var(tcache_page_vec);
+
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_reclaimed;
+
+	return nr_reclaimed;
+}
+
+struct shrinker tcache_shrinker = {
+	.count_objects		= tcache_shrink_count,
+	.scan_objects		= tcache_shrink_scan,
+	.seeks			= 8,
+	.batch			= TCACHE_SCAN_BATCH,
+	.flags			= SHRINKER_NUMA_AWARE,
+};
+
+static int tcache_cleancache_init_fs(size_t pagesize)
+{
+	return tcache_create_pool();
+}
+
+static int tcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	return -1;
+}
+
+static int tcache_cleancache_put_page(int pool_id,
+				       struct cleancache_filekey key,
+				       pgoff_t index, struct page *page)
+{
+	int ret = 0;
+	struct tcache_node *node;
+	struct page *cache_page = NULL;
+
+	/* It makes no sense to populate tcache when we are short on memory */
+	if (!READ_ONCE(tcache_active) || !(current->flags & PF_MEMCG_RECLAIM))
+		return 0;
+
+	node = tcache_get_node_and_pool(pool_id, &key, true);
+	if (node) {
+		cache_page = tcache_alloc_page(node->pool);
+		if (cache_page) {
+			copy_highpage(cache_page, page);
+			if (tcache_attach_page(node, index, cache_page)) {
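+				/*
+				 * Attach failed - drop our reference and
+				 * free the page if it was the last one.
+				 */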
+				if (put_page_testzero(cache_page))
+					tcache_put_page(cache_page);
+			} else
+				ret = 1;
+		}
+		tcache_put_node_and_pool(node);
+	}
+
+	return ret;
+}
+
+static int tcache_cleancache_get_page(int pool_id,
+				      struct cleancache_filekey key,
+				      pgoff_t index, struct page *page)
+{
+	struct tcache_node *node;
+	struct page *cache_page = NULL;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return -1;
+
+	node = tcache_get_node_and_pool(pool_id, &key, false);
+	if (node) {
+		cache_page = tcache_detach_page(node, index, true);
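+		/*
+		 * The node may have been invalidated while we were detaching
+		 * the page; drop the stale copy instead of returning it.
+		 */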
+		if (unlikely(cache_page && node->invalidated)) {
+			tcache_put_page(cache_page);
+			cache_page = NULL;
+		}
+		tcache_put_node_and_pool(node);
+	}
+
+	if (cache_page) {
+		copy_highpage(page, cache_page);
+		tcache_put_page(cache_page);
+		return 0;
+	}
+	return -1;
+}
+
+static void tcache_cleancache_invalidate_page(int pool_id,
+		struct cleancache_filekey key, pgoff_t index)
+{
+	struct tcache_node *node;
+	struct page *page;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return;
+
+	node = tcache_get_node_and_pool(pool_id, &key, false);
+	if (node) {
+		page = tcache_detach_page(node, index, false);
+		if (page)
+			tcache_put_page(page);
+		tcache_put_node_and_pool(node);
+	}
+}
+
+static void tcache_cleancache_invalidate_inode(int pool_id,
+					       struct cleancache_filekey key)
+{
+	struct tcache_pool *pool;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return;
+
+	pool = tcache_get_pool(pool_id);
+	if (pool) {
+		tcache_invalidate_node(pool, &key);
+		tcache_put_pool(pool);
+	}
+}
+
+static void tcache_cleancache_invalidate_fs(int pool_id)
+{
+	tcache_destroy_pool(pool_id);
+}
+
+static struct cleancache_ops tcache_cleancache_ops = {
+	.init_fs		= tcache_cleancache_init_fs,
+	.init_shared_fs		= tcache_cleancache_init_shared_fs,
+	.put_page		= tcache_cleancache_put_page,
+	.get_page		= tcache_cleancache_get_page,
+	.invalidate_page	= tcache_cleancache_invalidate_page,
+	.invalidate_inode	= tcache_cleancache_invalidate_inode,
+	.invalidate_fs		= tcache_cleancache_invalidate_fs,
+};
+
+unsigned long get_nr_tcache_pages(void)
+{
+	int cpu;
+	long val = 0;
+
+	for_each_possible_cpu(cpu)
+		val += per_cpu(nr_tcache_pages, cpu);
+	if (val < 0)
+		val = 0;
+	return val;
+}
+
+static int param_get_nr_pages(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%lu", get_nr_tcache_pages());
+}
+
+static struct kernel_param_ops param_ops_nr_pages = {
+	.get = param_get_nr_pages,
+};
+module_param_cb(nr_pages, &param_ops_nr_pages, NULL, 0444);
+
+static int param_set_active_interval(const char *val,
+				     const struct kernel_param *kp)
+{
+	int ret;
+	unsigned int msecs;
+
+	ret = kstrtouint(val, 10, &msecs);
+	if (ret)
+		return ret;
+
+	tcache_active_interval = msecs_to_jiffies(msecs);
+	return 0;
+}
+
+static int param_get_active_interval(char *buffer,
+				     const struct kernel_param *kp)
+{
+	unsigned int msecs;
+
+	msecs = jiffies_to_msecs(tcache_active_interval);
+	return sprintf(buffer, "%u", msecs);
+}
+
+static struct kernel_param_ops param_ops_active_interval = {
+	.set = param_set_active_interval,
+	.get = param_get_active_interval,
+};
+module_param_cb(active_interval_msecs, &param_ops_active_interval, NULL, 0644);
+
+static int __init tcache_nodeinfo_init(void)
+{
+	int i;
+	struct tcache_nodeinfo *ni;
+
+	tcache_nodeinfo = kcalloc(nr_node_ids, sizeof(*tcache_nodeinfo),
+				  GFP_KERNEL);
+	if (!tcache_nodeinfo)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		ni = &tcache_nodeinfo[i];
+		spin_lock_init(&ni->lock);
+		atomic_long_set(&ni->nr_pages, 0);
+		ni->reclaim_tree = RB_ROOT;
+		update_ni_rb_first(ni);
+	}
+	return 0;
+}
+
+static int __init tcache_init(void)
+{
+	int err;
+
+	if (!tcache_enabled)
+		return 0;
+
+	err = tcache_nodeinfo_init();
+	if (err)
+		goto out_fail;
+
+	err = register_shrinker(&tcache_shrinker);
+	if (err)
+		goto out_free_lru;
+
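+	/*
+	 * Spread nodes over multiple trees to reduce tree lock contention on
+	 * large SMP systems.
+	 */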
+#ifdef CONFIG_SMP
+	num_node_trees = roundup_pow_of_two(2 * num_possible_cpus());
+#endif
+
+	err = cleancache_register_ops(&tcache_cleancache_ops);
+	if (err)
+		goto out_unregister_shrinker;
+
+	pr_info("tcache loaded\n");
+	return 0;
+
+out_unregister_shrinker:
+	unregister_shrinker(&tcache_shrinker);
+out_free_lru:
+	kfree(tcache_nodeinfo);
+out_fail:
+	return err;
+}
+module_init(tcache_init);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Transcendent file cache");
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -63,7 +63,8 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	 */
 	if (!workingset_node_shadows(node) &&
 	    !list_empty(&node->private_list))
-		workingset_forget_node(node);
+		list_lru_del(&workingset_shadow_nodes,
+			     &node->private_list);
 	__radix_tree_delete_node(&mapping->page_tree, node);
 
 unlock:
@@ -281,16 +282,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t		indices[PAGEVEC_SIZE];
 	pgoff_t		index;
 	int		i;
+	int		bug_if_page_has_bh = 0;
 
-	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-		return;
+		goto out;
 
 	/* Offsets within partial pages */
 	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
 	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	if (!inode_has_invalidate_range(mapping->host))
-		BUG_ON(partial_end);
+		bug_if_page_has_bh = 1;
 
 	/*
 	 * 'start' and 'end' always covers the range of pages to be fully
@@ -314,7 +315,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	while (index < end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE),
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -340,7 +340,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -375,9 +374,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			wait_on_page_writeback(page);
 			zero_user_segment(page, 0, partial_end);
 			cleancache_invalidate_page(mapping, page);
-			if (page_has_private(page))
+			if (page_has_private(page)) {
+				BUG_ON(bug_if_page_has_bh);
 				do_invalidatepage_range(page, 0,
 							partial_end);
+			}
 			unlock_page(page);
 			page_cache_release(page);
 		}
@@ -387,7 +388,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	 * will be released, just zeroed, so we can bail out now.
 	 */
 	if (start >= end)
-		return;
+		goto out;
 
 	index = start;
 	for ( ; ; ) {
@@ -405,7 +406,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -427,9 +427,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		index++;
 	}
+
+out:
 	cleancache_invalidate_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -526,7 +527,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -555,7 +555,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -586,7 +585,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page, NULL);
 	spin_unlock_irq(&mapping->tree_lock);
-	mem_cgroup_uncharge_cache_page(page);
 
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
@@ -629,13 +627,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int ret2 = 0;
 	int did_range_unmap = 0;
 
-	cleancache_invalidate_inode(mapping);
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+		goto out;
+
 	pagevec_init(&pvec, 0);
 	index = start;
 	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -688,10 +687,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
+
+out:
 	cleancache_invalidate_inode(mapping);
 	return ret;
 }
--- /dev/null
+++ b/mm/tswap.c
@@ -0,0 +1,444 @@
+/*
+ *  mm/tswap.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/atomic.h>
+#include <linux/spinlock.h>
+#include <linux/radix-tree.h>
+#include <linux/list.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/shrinker.h>
+#include <linux/frontswap.h>
+
+#define TSWAP_GFP_MASK		(GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN)
+
+static RADIX_TREE(tswap_page_tree, GFP_ATOMIC | __GFP_NOWARN);
+static DEFINE_SPINLOCK(tswap_lock);
+
+struct tswap_lru {
+	struct list_head list;
+	unsigned long nr_items;
+} ____cacheline_aligned_in_smp;
+
+static struct tswap_lru *tswap_lru_node;
+
+/* Enable/disable tswap backend (set at boot time) */
+static bool tswap_enabled __read_mostly = true;
+module_param_named(enabled, tswap_enabled, bool, 0444);
+
+/* Enable/disable populating the cache */
+static bool tswap_active __read_mostly = true;
+module_param_named(active, tswap_active, bool, 0644);
+
+/* Total number of pages cached */
+static unsigned long tswap_nr_pages;
+module_param_named(nr_pages, tswap_nr_pages, ulong, 0444);
+
+/* Enable/disable detection and deduplication of zero-filled pages */
+static bool tswap_check_zero __read_mostly = true;
+module_param_named(check_zero, tswap_check_zero, bool, 0644);
+
+unsigned long get_nr_tswap_pages(void)
+{
+	return tswap_nr_pages;
+}
+
+static void tswap_lru_add(struct page *page)
+{
+	struct tswap_lru *lru = &tswap_lru_node[page_to_nid(page)];
+
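+	/* the shared zero page is never reclaimed, so keep it off the LRU */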
+	if (page != ZERO_PAGE(0)) {
+		list_add_tail(&page->lru, &lru->list);
+		lru->nr_items++;
+	}
+}
+
+static void tswap_lru_del(struct page *page)
+{
+	struct tswap_lru *lru = &tswap_lru_node[page_to_nid(page)];
+
+	if (page != ZERO_PAGE(0)) {
+		list_del(&page->lru);
+		lru->nr_items--;
+	}
+}
+
+static struct page *tswap_lookup_page(swp_entry_t entry)
+{
+	struct page *page;
+
+	spin_lock(&tswap_lock);
+	page = radix_tree_lookup(&tswap_page_tree, entry.val);
+	spin_unlock(&tswap_lock);
+	BUG_ON(page && page != ZERO_PAGE(0) && page_private(page) != entry.val);
+	return page;
+}
+
+static int tswap_insert_page(swp_entry_t entry, struct page *page)
+{
+	int err;
+
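+	/*
+	 * Preallocate radix tree nodes outside the lock so that the
+	 * insertion under tswap_lock does not need to allocate memory.
+	 */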
+	err = radix_tree_preload(TSWAP_GFP_MASK);
+	if (err)
+		return err;
+
+	if (page != ZERO_PAGE(0))
+		set_page_private(page, entry.val);
+	spin_lock(&tswap_lock);
+	err = radix_tree_insert(&tswap_page_tree, entry.val, page);
+	if (!err) {
+		tswap_lru_add(page);
+		tswap_nr_pages++;
+	}
+	spin_unlock(&tswap_lock);
+
+	radix_tree_preload_end();
+	return err;
+}
+
+static struct page *tswap_delete_page(swp_entry_t entry, struct page *expected)
+{
+	struct page *page;
+
+	spin_lock(&tswap_lock);
+	page = radix_tree_delete_item(&tswap_page_tree, entry.val, expected);
+	if (page) {
+		tswap_lru_del(page);
+		tswap_nr_pages--;
+	}
+	spin_unlock(&tswap_lock);
+	if (page) {
+		BUG_ON(expected && page != expected);
+		BUG_ON(page_private(page) != entry.val && page != ZERO_PAGE(0));
+	}
+	return page;
+}
+
+static unsigned long tswap_shrink_count(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	return tswap_lru_node[sc->nid].nr_items;
+}
+
+static int tswap_evict_page(struct page *page)
+{
+	struct address_space *swapper_space;
+	struct page *found_page;
+	swp_entry_t entry;
+	int err;
+
+	BUG_ON(!PageLocked(page));
+
+	entry.val = page_private(page);
+	swapper_space = swap_address_space(entry);
+retry:
+	err = -EEXIST;
+	found_page = find_get_page(swapper_space, entry.val);
+	if (found_page) {
+		/*
+		 * There is already a swap cache page at the given offset. If
+		 * the page is uptodate, we can safely free the frontswap page,
+		 * marking the swapcache page dirty. Otherwise, the frontswap
+		 * page is about to be loaded and cannot be released.
+		 */
+		err = -EBUSY;
+		if (!trylock_page(found_page)) {
+			put_page(found_page);
+			goto out;
+		}
+		/* recheck that the page is still in the swap cache */
+		if (!PageSwapCache(found_page) ||
+		    page_private(found_page) != entry.val) {
+			unlock_page(found_page);
+			put_page(found_page);
+			goto retry;
+		}
+		if (PageUptodate(found_page)) {
+			/*
+			 * Since we are holding the swap cache page lock, no
+			 * frontswap callbacks are allowed now. However, the
+			 * frontswap page could have been invalidated before we
+			 * took the lock, in which case we have nothing to do.
+			 */
+			err = -ENOENT;
+			if (tswap_delete_page(entry, page)) {
+				SetPageDirty(found_page);
+				put_page(page);
+				err = 0;
+			}
+		}
+		unlock_page(found_page);
+		put_page(found_page);
+		goto out;
+	}
+
+	err = swapcache_prepare(entry);
+	if (err == -EEXIST) {
+		cond_resched();
+		goto retry;
+	}
+	if (err)
+		/* the swap entry has been freed, and therefore the page must
+		 * have been invalidated */
+		goto out;
+
+	/*
+	 * From now on, no frontswap callbacks can be called on the swap entry,
+	 * because we hold its swap cache reference.
+	 */
+
+	err = -ENOENT;
+	if (tswap_lookup_page(entry) != page)
+		/* the page could have been removed from tswap before we
+		 * prepared swap cache */
+		goto out_free_swapcache;
+
+	SetPageSwapBacked(page);
+	err = __add_to_swap_cache(page, entry);
+	if (err) {
+		ClearPageSwapBacked(page);
+		/* __add_to_swap_cache clears page->private on failure */
+		set_page_private(page, entry.val);
+		/* __add_to_swap_cache does not return -EEXIST, so we can
+		 * safely clear SWAP_HAS_CACHE flag */
+		goto out_free_swapcache;
+	}
+
+	/* the page is now in the swap cache, remove it from tswap */
+	BUG_ON(!tswap_delete_page(entry, page));
+	put_page(page);
+
+	lru_cache_add_anon(page);
+	SetPageUptodate(page);
+	SetPageDirty(page);
+	return 0;
+
+out_free_swapcache:
+	swapcache_free(entry);
+out:
+	return err;
+}
+
+static unsigned long tswap_shrink_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct tswap_lru *lru = &tswap_lru_node[sc->nid];
+	unsigned long nr_reclaimed = 0;
+
+	spin_lock(&tswap_lock);
+	while (sc->nr_to_scan-- > 0) {
+		struct page *page;
+
+		if (!lru->nr_items)
+			break;
+
+		page = list_first_entry(&lru->list, struct page, lru);
+		/* lock the page to avoid interference with
+		 * other reclaiming threads */
+		if (!trylock_page(page)) {
+			list_move_tail(&page->lru, &lru->list);
+			cond_resched_lock(&tswap_lock);
+			continue;
+		}
+		get_page(page);
+		spin_unlock(&tswap_lock);
+
+		if (tswap_evict_page(page) == 0)
+			nr_reclaimed++;
+
+		unlock_page(page);
+		put_page(page);
+
+		cond_resched();
+		spin_lock(&tswap_lock);
+	}
+	spin_unlock(&tswap_lock);
+
+	return nr_reclaimed;
+}
+
+static struct shrinker tswap_shrinker = {
+	.count_objects = tswap_shrink_count,
+	.scan_objects = tswap_shrink_scan,
+	.seeks = 4,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+static void tswap_frontswap_init(unsigned type)
+{
+	/*
+	 * We maintain a single page tree for all swap types, so nothing to
+	 * do here.
+	 */
+}
+
+static bool is_zero_filled_page(struct page *page)
+{
+	bool zero_filled = true;
+	unsigned long *v;
+	int i;
+
+	if (!tswap_check_zero)
+		return false;
+
+	v = kmap_atomic(page);
+	for (i = 0; i < PAGE_SIZE / sizeof(*v); i++) {
+		if (v[i] != 0) {
+			zero_filled = false;
+			break;
+		}
+	}
+	kunmap_atomic(v);
+	return zero_filled;
+}
+
+static int tswap_frontswap_store(unsigned type, pgoff_t offset,
+				 struct page *page)
+{
+	swp_entry_t entry = swp_entry(type, offset);
+	int zero_filled = -1, err = 0;
+	struct page *cache_page;
+
+	if (!tswap_active)
+		return -1;
+
+	cache_page = tswap_lookup_page(entry);
+	if (cache_page) {
+		zero_filled = is_zero_filled_page(page);
+		/* If the page kind (zero vs non-zero) has not changed, just reuse it */
+		if (zero_filled == (cache_page == ZERO_PAGE(0)))
+			goto copy;
+		tswap_delete_page(entry, NULL);
+		put_page(cache_page);
+	}
+
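+	/* like tcache, only cache pages pushed out by memcg reclaim */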
+	if (!(current->flags & PF_MEMCG_RECLAIM))
+		return -1;
+
+	if (zero_filled == -1)
+		zero_filled = is_zero_filled_page(page);
+
+	if (!zero_filled) {
+		cache_page = alloc_page(TSWAP_GFP_MASK | __GFP_HIGHMEM);
+		if (!cache_page)
+			return -1;
+	} else {
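+		/*
+		 * Zero-filled pages are deduplicated: store a reference to
+		 * the shared zero page instead of allocating a copy.
+		 */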
+		cache_page = ZERO_PAGE(0);
+		get_page(cache_page);
+	}
+
+	err = tswap_insert_page(entry, cache_page);
+	if (err) {
+		/*
+		 * Frontswap stores proceed under the page lock, so this can
+		 * only fail with ENOMEM.
+		 */
+		BUG_ON(err == -EEXIST);
+		put_page(cache_page);
+		return -1;
+	}
+copy:
+	if (cache_page != ZERO_PAGE(0))
+		copy_highpage(cache_page, page);
+	return 0;
+}
+
+static int tswap_frontswap_load(unsigned type, pgoff_t offset,
+				struct page *page)
+{
+	struct page *cache_page;
+
+	cache_page = tswap_delete_page(swp_entry(type, offset), NULL);
+	if (!cache_page)
+		return -1;
+
+	copy_highpage(page, cache_page);
+	put_page(cache_page);
+	return 0;
+}
+
+static void tswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct page *cache_page;
+
+	cache_page = tswap_delete_page(swp_entry(type, offset), NULL);
+	if (cache_page)
+		put_page(cache_page);
+}
+
+static void tswap_frontswap_invalidate_area(unsigned type)
+{
+	/*
+	 * This function is called on swapoff after all swap entries of the
+	 * given type have been freed and therefore all frontswap pages have
+	 * been invalidated, so nothing to do here.
+	 */
+}
+
+static struct frontswap_ops tswap_frontswap_ops = {
+	.init = tswap_frontswap_init,
+	.store = tswap_frontswap_store,
+	.load = tswap_frontswap_load,
+	.invalidate_page = tswap_frontswap_invalidate_page,
+	.invalidate_area = tswap_frontswap_invalidate_area,
+};
+
+static int __init tswap_lru_init(void)
+{
+	int i;
+
+	tswap_lru_node = kcalloc(nr_node_ids, sizeof(*tswap_lru_node),
+				 GFP_KERNEL);
+	if (!tswap_lru_node)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++)
+		INIT_LIST_HEAD(&tswap_lru_node[i].list);
+	return 0;
+}
+
+static int __init tswap_init(void)
+{
+	int err;
+	struct frontswap_ops *old_ops;
+
+	if (!tswap_enabled)
+		return 0;
+
+	err = tswap_lru_init();
+	if (err)
+		goto out_fail;
+
+	err = register_shrinker(&tswap_shrinker);
+	if (err)
+		goto out_free_lru;
+
+	frontswap_tmem_exclusive_gets(true);
+
+	old_ops = frontswap_register_ops(&tswap_frontswap_ops);
+	pr_info("tswap loaded\n");
+	if (old_ops)
+		pr_warn("tswap: frontswap_ops %p overridden\n", old_ops);
+
+	return 0;
+
+out_free_lru:
+	kfree(tswap_lru_node);
+out_fail:
+	return err;
+}
+module_init(tswap_init);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Transcendent swap cache");
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -27,6 +27,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long src_addr,
 			    struct page **pagep)
 {
+	struct mem_cgroup *memcg;
 	pte_t _dst_pte, *dst_pte;
 	spinlock_t *ptl;
 	void *page_kaddr;
@@ -65,7 +66,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 
 	ret = -ENOMEM;
-	if (mem_cgroup_newpage_charge(page, dst_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
 		goto out_release;
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -79,6 +80,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr);
+	mem_cgroup_commit_charge(page, memcg, false);
+	lru_cache_add_active_or_unevictable(page, dst_vma);
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
@@ -91,7 +94,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	return ret;
 out_release_uncharge_unlock:
 	pte_unmap_unlock(dst_pte, ptl);
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 out_release:
 	page_cache_release(page);
 	goto out;
--- a/mm/util.c
+++ b/mm/util.c
@@ -17,9 +17,6 @@
 
 #include "internal.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
-
 static inline int is_kernel_rodata(unsigned long addr)
 {
 	return addr >= (unsigned long)__start_rodata &&
@@ -150,97 +147,6 @@ void *memdup_user(const void __user *src, size_t len)
 }
 EXPORT_SYMBOL(memdup_user);
 
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
-					   gfp_t flags)
-{
-	void *ret;
-	size_t ks = 0;
-
-	if (p)
-		ks = ksize(p);
-
-	if (ks >= new_size)
-		return (void *)p;
-
-	ret = kmalloc_track_caller(new_size, flags);
-	if (ret && p)
-		memcpy(ret, p, ks);
-
-	return ret;
-}
-
-/**
- * __krealloc - like krealloc() but don't free @p.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * This function is like krealloc() except it never frees the originally
- * allocated buffer. Use this if you don't want to free the buffer immediately
- * like, for example, with RCU.
- */
-void *__krealloc(const void *p, size_t new_size, gfp_t flags)
-{
-	if (unlikely(!new_size))
-		return ZERO_SIZE_PTR;
-
-	return __do_krealloc(p, new_size, flags);
-
-}
-EXPORT_SYMBOL(__krealloc);
-
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.  If @p is %NULL, krealloc()
- * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
-	void *ret;
-
-	if (unlikely(!new_size)) {
-		kfree(p);
-		return ZERO_SIZE_PTR;
-	}
-
-	ret = __do_krealloc(p, new_size, flags);
-	if (ret && p != ret)
-		kfree(p);
-
-	return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
-/**
- * kzfree - like kfree but zero memory
- * @p: object to free memory of
- *
- * The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
- *
- * Note: this function zeroes the whole allocated buffer which can be a good
- * deal bigger than the requested buffer size passed to kmalloc(). So be
- * careful when using this function in performance sensitive code.
- */
-void kzfree(const void *p)
-{
-	size_t ks;
-	void *mem = (void *)p;
-
-	if (unlikely(ZERO_OR_NULL_PTR(mem)))
-		return;
-	ks = ksize(mem);
-	memset(mem, 0, ks);
-	kfree(mem);
-}
-EXPORT_SYMBOL(kzfree);
-
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
@@ -395,6 +301,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
+		/* Ugly fix for PSBM-23133 vdavydov@ */
+		if (file && file->f_op && (flag & MAP_TYPE) == MAP_SHARED &&
+		    S_ISREG(file_inode(file)->i_mode) &&
+		    (file_inode(file)->i_sb->s_type->fs_flags & FS_HAS_MMAP_PREP))
+			file->f_op->mmap(file, NULL);
 		down_write(&mm->mmap_sem);
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate, &uf);
@@ -419,6 +330,51 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported
+ *
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+	gfp_t kmalloc_flags = flags;
+	void *ret;
+
+	/*
+	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
+	 * so the given set of flags has to be compatible.
+	 */
+	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+
+	/*
+	 * Make sure that larger requests are not too disruptive - no OOM
+	 * killer and no allocation failure warnings as we have a fallback
+	 */
+	if (size > PAGE_SIZE)
+		kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN;
+
+	ret = kmalloc_node(size, kmalloc_flags, node);
+
+	/*
+	 * It doesn't really make sense to fallback to vmalloc for sub page
+	 * requests
+	 */
+	if (ret || size <= PAGE_SIZE)
+		return ret;
+
+	return __vmalloc_node_flags(size, node, flags | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
 void kvfree(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
@@ -535,11 +491,3 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
 out:
 	return res;
 }
-
-/* Tracepoints definitions. */
-EXPORT_TRACEPOINT_SYMBOL(kmalloc);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kfree);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1290,7 +1290,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 {
 	unsigned long addr = (unsigned long)area->addr;
-	unsigned long end = addr + area->size - PAGE_SIZE;
+	unsigned long end = addr + get_vm_area_size(area);
 	int err;
 
 	err = vmap_page_range(addr, end, prot, *pages);
@@ -1361,10 +1361,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (unlikely(!area))
 		return NULL;
 
-	/*
-	 * We always allocate a guard page.
-	 */
-	size += PAGE_SIZE;
+	if (!(flags & VM_NO_GUARD))
+		size += PAGE_SIZE;
 
 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
 	if (IS_ERR(va)) {
@@ -1466,6 +1464,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 		spin_unlock(&vmap_area_lock);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
+		kasan_free_shadow(vm);
 		free_unmap_vmap_area(va);
 		vm->size -= PAGE_SIZE;
 
@@ -1610,7 +1609,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	unsigned int nr_pages, array_size, i;
 	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 
-	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
 
 	area->nr_pages = nr_pages;
@@ -1635,7 +1634,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
 
 		if (node < 0)
-			page = alloc_page(tmp_mask);
+			page = alloc_pages(tmp_mask, order);
 		else
 			page = alloc_pages_node(node, tmp_mask, order);
 
@@ -1667,6 +1666,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
  *	@end:		vm area range end
  *	@gfp_mask:	flags for the page level allocator
  *	@prot:		protection mask for the allocated pages
+ *	@vm_flags:	additional vm area flags (e.g. %VM_NO_GUARD)
  *	@node:		node to use for allocation or NUMA_NO_NODE
  *	@caller:	caller's return address
  *
@@ -1676,7 +1676,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
  */
 void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller)
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller)
 {
 	struct vm_struct *area;
 	void *addr;
@@ -1686,8 +1687,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		goto fail;
 
-	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
-				  start, end, node, gfp_mask, caller);
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST |
+				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
@@ -1736,7 +1737,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    int node, const void *caller)
 {
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-				gfp_mask, prot, node, caller);
+				gfp_mask, prot, 0, node, caller);
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
@@ -1746,7 +1747,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-static inline void *__vmalloc_node_flags(unsigned long size,
+void *__vmalloc_node_flags(unsigned long size,
 					int node, gfp_t flags)
 {
 	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
@@ -1786,6 +1787,20 @@ void *vzalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vzalloc);
 
+void *vmalloc_account(unsigned long size)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE,
+			GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(vmalloc_account);
+
+void *vzalloc_account(unsigned long size)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE,
+			GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_account);
+
 /**
  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
  * @size: allocation size
@@ -2042,7 +2057,7 @@ long vread(char *buf, char *addr, unsigned long count)
 
 		vm = va->vm;
 		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + vm->size - PAGE_SIZE)
+		if (addr >= vaddr + get_vm_area_size(vm))
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -2052,7 +2067,7 @@ long vread(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + vm->size - PAGE_SIZE - addr;
+		n = vaddr + get_vm_area_size(vm) - addr;
 		if (n > count)
 			n = count;
 		if (!(vm->flags & VM_IOREMAP))
@@ -2124,7 +2139,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 
 		vm = va->vm;
 		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + vm->size - PAGE_SIZE)
+		if (addr >= vaddr + get_vm_area_size(vm))
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -2133,7 +2148,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + vm->size - PAGE_SIZE - addr;
+		n = vaddr + get_vm_area_size(vm) - addr;
 		if (n > count)
 			n = count;
 		if (!(vm->flags & VM_IOREMAP)) {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 
 #include <linux/swapops.h>
 #include <linux/balloon_compaction.h>
+#include <linux/vzstat.h>
 
 #include "internal.h"
 
@@ -79,11 +80,17 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;
 
+	/* Can cgroups be reclaimed below their normal consumption range? */
+	int may_thrash;
+
 	int order;
 
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* Reclaim only slab */
+	bool slab_only;
+
 	/*
 	 * The memory cgroup that hit its limit and as a result is the
 	 * primary target of this reclaim invocation.
@@ -193,7 +200,7 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
 	return nr;
 }
 
-static bool zone_reclaimable(struct zone *zone)
+bool zone_reclaimable(struct zone *zone)
 {
 	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
 }
@@ -207,14 +214,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -226,159 +250,198 @@ void unregister_shrinker(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
+	kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
-				     struct shrink_control *sc,
-				     unsigned long nr_to_scan)
-{
-	int objects;
-	sc->nr_to_scan = nr_to_scan;
-	objects = (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+				    struct shrinker *shrinker, int priority)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = max_pass >> priority;
+	delta = (4 * delta) / shrinker->seeks;
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->scan_objects, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
 	/*
-	 * A shrinker can legitimately return -1 meaning that it cannot do
-	 * much work without a risk of deadlock.
-	 * However, in some extreme cases, specially when there is abusive
-	 * usage of vm.vfs_cache_pressure, a shrinker might return a negative
-	 * value indicating that its integer return value has overflown.
-	 * In such cases, we just go ahead and cap the return val to INT_MAX.
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimated number of
+	 * freeable entries.
 	 */
-	if (objects < -1)
-		return INT_MAX;
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				   max_pass, delta, total_scan, priority);
+
+	while (total_scan >= batch_size) {
+		unsigned long ret;
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			break;
 
-	return objects;
+		shrinkctl->nr_to_scan = batch_size;
+		ret = shrinker->scan_objects(shrinker, shrinkctl);
+		if (ret == SHRINK_STOP)
+			break;
+		freed += ret;
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+						&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
 }
 
-#define SHRINK_BATCH 128
-/*
- * Call the shrink functions to age shrinkable caches
+/**
+ * shrink_slab - shrink slab caches
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
+ * @priority: the reclaim priority
  *
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object.  With this in mind we age equal
- * percentages of the lru and ageable caches.  This should balance the seeks
- * generated by these structures.
+ * Call the shrink functions to age shrinkable caches.
  *
- * If the vm encountered mapped pages on the LRU it increase the pressure on
- * slab to avoid swapping.
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
+ * unaware shrinkers will receive a node id of 0 instead.
  *
- * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
  *
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt.  It is used for balancing
- * slab reclaim versus page reclaim.
+ * @priority is sc->priority; we take the number of objects and shift it
+ * right by @priority in order to get the scan target.
  *
- * Returns the number of slab objects which we shrunk.
+ * Returns the number of reclaimed slab objects.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
-			  unsigned long nr_pages_scanned,
-			  unsigned long lru_pages)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+				 struct mem_cgroup *memcg,
+				 int priority,
+				 bool for_drop_caches)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	unsigned long freed = 0;
 
-	if (nr_pages_scanned == 0)
-		nr_pages_scanned = SWAP_CLUSTER_MAX;
+	if (memcg && !memcg_kmem_is_active(memcg))
+		return 0;
+
+	if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+		return 0;
 
 	if (!down_read_trylock(&shrinker_rwsem)) {
-		/* Assume we'll be able to shrink next time */
-		ret = 1;
+		/*
+		 * If we returned 0, our callers would understand that we
+		 * have nothing else to shrink and give up trying. By returning
+		 * 1 we keep it going and assume we'll be able to shrink next
+		 * time.
+		 */
+		freed = 1;
 		goto out;
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		int shrink_ret = 0;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-		if (max_pass <= 0)
+		struct shrink_control sc = {
+			.gfp_mask = gfp_mask,
+			.nid = nid,
+			.memcg = memcg,
+			.for_drop_caches = for_drop_caches,
+		};
+
+		if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
 			continue;
 
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR "shrink_slab: %pF negative objects to "
-			       "delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
+		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+			sc.nid = 0;
 
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
+		freed += do_shrink_slab(&sc, shrinker, priority);
+	}
 
-		trace_mm_shrink_slab_start(shrinker, shrink, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
+	up_read(&shrinker_rwsem);
+out:
+	cond_resched();
+	return freed;
+}
 
-		while (total_scan >= batch_size) {
-			int nr_before;
+void drop_slab_node(int nid)
+{
+	unsigned long freed;
 
-			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							batch_size);
-			if (shrink_ret == -1)
-				break;
-			if (shrink_ret < nr_before)
-				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+	do {
+		struct mem_cgroup *memcg = NULL;
 
-			cond_resched();
-		}
+		freed = 0;
+		do {
+			freed += shrink_slab(GFP_KERNEL, nid, memcg,
+					     0, true);
+		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+	} while (freed > 10);
+}
 
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
+void drop_slab(void)
+{
+	int nid;
 
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
-	}
-	up_read(&shrinker_rwsem);
-out:
-	cond_resched();
-	return ret;
+	for_each_online_node(nid)
+		drop_slab_node(nid);
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -557,9 +620,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
+		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
-		swapcache_free(swap, page);
+		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
 		void *shadow = NULL;
@@ -586,7 +650,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow);
 		spin_unlock_irq(&mapping->tree_lock);
-		mem_cgroup_uncharge_cache_page(page);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -808,7 +871,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	cond_resched();
 
-	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
 		struct page *page;
@@ -893,11 +955,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 *    __GFP_IO|__GFP_FS for this reason); but more thought
 		 *    would probably show more reasons.
 		 *
-		 * 3) Legacy memcg encounters a page that is not already marked
+		 * 3) memcg encounters a page that is already marked
 		 *    PageReclaim. memcg does not have any dirty pages
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
-		 *    reclaim. Wait for the writeback to complete.
+		 *    reclaim. Stall memcg reclaim then.
 		 */
 		if (PageWriteback(page)) {
 			/* Case 1 above */
@@ -918,7 +980,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 * enough to care.  What we do want is for this
 				 * page to have PageReclaim set next time memcg
 				 * reclaim reaches the tests above, so it will
-				 * then wait_on_page_writeback() to avoid OOM;
+				 * then stall to avoid OOM;
 				 * and it's also appropriate in global reclaim.
 				 */
 				SetPageReclaim(page);
@@ -928,7 +990,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 			/* Case 3 above */
 			} else {
-				wait_on_page_writeback(page);
+				nr_immediate++;
+				goto keep_locked;
 			}
 		}
 
@@ -1104,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageSwapCache(page))
 			try_to_free_swap(page);
 		unlock_page(page);
-		putback_lru_page(page);
+		list_add(&page->lru, &ret_pages);
 		continue;
 
 activate_locked:
@@ -1121,12 +1184,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
+	mem_cgroup_uncharge_list(&free_pages);
 	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
-	mem_cgroup_uncharge_end();
+
 	*ret_nr_dirty += nr_dirty;
 	*ret_nr_congested += nr_congested;
 	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1266,7 +1330,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long nr_taken = 0;
 	unsigned long scan;
 
-	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+					!list_empty(src); scan++) {
 		struct page *page;
 		int nr_pages;
 
@@ -1348,6 +1413,32 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+static int __too_many_isolated(struct zone *zone, int file,
+			       struct scan_control *sc, int safe)
+{
+	unsigned long inactive, isolated;
+
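+	/*
+	 * With @safe set, use zone_page_state_snapshot() so the per-cpu
+	 * stat deltas are folded in; otherwise use the cheaper but racy
+	 * zone_page_state() counters.
+	 */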
+	if (safe) {
+		inactive = zone_page_state_snapshot(zone,
+				NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state_snapshot(zone,
+				NR_ISOLATED_ANON + file);
+	} else {
+		inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+	}
+
+	/*
+	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+	 * won't get blocked by normal direct-reclaimers, forming a circular
+	 * deadlock.
+	 */
+	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+		inactive >>= 3;
+
+	return isolated > inactive;
+}
+
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
  * then get resheduled. When there are massive number of tasks doing page
@@ -1356,33 +1447,24 @@ int isolate_lru_page(struct page *page)
  * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
-		struct scan_control *sc)
+			     struct scan_control *sc)
 {
-	unsigned long inactive, isolated;
-
 	if (current_is_kswapd())
 		return 0;
 
 	if (!sane_reclaim(sc))
 		return 0;
 
-	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
-	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
-	}
-
 	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
+	 * __too_many_isolated(safe=0) is fast but inaccurate, because it
+	 * doesn't account for the vm_stat_diff[] counters.  So if it looks
+	 * like too_many_isolated() is about to return true, fall back to the
+	 * slower, more accurate zone_page_state_snapshot().
 	 */
-	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
-		inactive >>= 3;
+	if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+		return __too_many_isolated(zone, file, sc, 1);
 
-	return isolated > inactive;
+	return 0;
 }
 
 static noinline_for_stack void
@@ -1426,6 +1508,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
+				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
 				spin_lock_irq(&zone->lru_lock);
 			} else
@@ -1520,6 +1603,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	spin_unlock_irq(&zone->lru_lock);
 
+	mem_cgroup_uncharge_list(&page_list);
 	free_hot_cold_page_list(&page_list, true);
 
 	/*
@@ -1539,10 +1623,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_writeback && nr_writeback == nr_taken)
 		zone_set_flag(zone, ZONE_WRITEBACK);
 
-	/*
-	 * Legacy memcg will stall in page writeback so avoid forcibly
-	 * stalling here.
-	 */
+	if (!global_reclaim(sc) && nr_immediate)
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
 	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
@@ -1555,19 +1638,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		 * If dirty pages are scanned that are not queued for IO, it
 		 * implies that flushers are not keeping up. In this case, flag
 		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-		 * pages from reclaim context. It will forcibly stall in the
-		 * next check.
+		 * pages from reclaim context.
 		 */
 		if (nr_unqueued_dirty == nr_taken)
 			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
 
 		/*
-		 * In addition, if kswapd scans pages marked marked for
-		 * immediate reclaim and under writeback (nr_immediate), it
-		 * implies that pages are cycling through the LRU faster than
+		 * If kswapd scans pages marked for immediate
+		 * reclaim and under writeback (nr_immediate), it implies
+		 * that pages are cycling through the LRU faster than
 		 * they are written so also forcibly stall.
 		 */
-		if (nr_unqueued_dirty == nr_taken || nr_immediate)
+		if (nr_immediate)
 			congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
 
@@ -1634,6 +1716,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
+				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
 				spin_lock_irq(&zone->lru_lock);
 			} else
@@ -1663,6 +1746,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	int file = is_file_lru(lru);
 	struct zone *zone = lruvec_zone(lruvec);
 
+	KSTAT_PERF_ENTER(refill_inact);
+
 	lru_add_drain();
 
 	if (!sc->may_unmap)
@@ -1741,89 +1826,84 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
+	mem_cgroup_uncharge_list(&l_hold);
 	free_hot_cold_page_list(&l_hold, true);
-}
 
-#ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
-
-	active = zone_page_state(zone, NR_ACTIVE_ANON);
-	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
-	if (inactive * zone->inactive_ratio < active)
-		return 1;
-
-	return 0;
+	KSTAT_PERF_LEAVE(refill_inact);
 }
 
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
- *
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static int inactive_anon_is_low(struct lruvec *lruvec)
-{
-	/*
-	 * If we don't have swap space, anonymous page deactivation
-	 * is pointless.
-	 */
-	if (!total_swap_pages)
-		return 0;
-
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_inactive_anon_is_low(lruvec);
 
-	return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
-{
-	return 0;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
+ *
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
  *
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
  *
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
+ * If that fails and refaulting is observed, the inactive list grows.
  *
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
  */
-static int inactive_file_is_low(struct lruvec *lruvec)
+static int inactive_list_is_low(struct lruvec *lruvec, bool file,
+				struct mem_cgroup *memcg, bool actual_reclaim)
 {
+	struct zone *zone = lruvec_zone(lruvec);
+	unsigned long inactive_ratio;
 	unsigned long inactive;
 	unsigned long active;
+	unsigned long gb;
+	unsigned long refaults;
+
+	/*
+	 * If we don't have swap space, anonymous page deactivation
+	 * is pointless.
+	 */
+	if (!file && !total_swap_pages)
+		return false;
 
-	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+	inactive = get_lru_size(lruvec, file * LRU_FILE);
+	active = get_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
-	return active > inactive;
-}
+	if (memcg)
+		refaults = zone->refaults; /* we don't support per-cgroup workingset */
+	else
+		refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
 
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
-	if (is_file_lru(lru))
-		return inactive_file_is_low(lruvec);
-	else
-		return inactive_anon_is_low(lruvec);
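+	/*
+	 * If file refaults have been observed since the last zone->refaults
+	 * snapshot, consider the inactive file list too small regardless of
+	 * the size ratio, so active pages keep getting deactivated and the
+	 * refaulting pages get a chance to compete for the active list.
+	 */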
+	if (file && actual_reclaim && zone->refaults != refaults) {
+		inactive_ratio = 0;
+	} else {
+		gb = (inactive + active) >> (30 - PAGE_SHIFT);
+		if (gb)
+			inactive_ratio = int_sqrt(10 * gb);
+		else
+			inactive_ratio = 1;
+	}
+	return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, lru))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru),
+					sc->target_mem_cgroup, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -1838,6 +1918,51 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+#ifdef CONFIG_MEMCG
+int sysctl_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return zone->force_scan;
+}
+
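+/*
+ * Check how many memcgs on this zone have LRU lists so small that
+ * proportional scanning at DEF_PRIORITY would never scan them, and flag
+ * the zone for forced scanning once their share exceeds
+ * sysctl_force_scan_thresh percent.
+ */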
+static void zone_update_force_scan(struct zone *zone)
+{
+	struct mem_cgroup *memcg;
+	int tiny, total;
+
+	tiny = total = 0;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		unsigned long size;
+
+		size = max(get_lru_size(lruvec, LRU_ACTIVE_FILE),
+			   get_lru_size(lruvec, LRU_INACTIVE_FILE));
+		if (get_nr_swap_pages() > 0)
+			size = max3(size,
+				    get_lru_size(lruvec, LRU_ACTIVE_ANON),
+				    get_lru_size(lruvec, LRU_INACTIVE_ANON));
+
+		if (size && size >> DEF_PRIORITY == 0)
+			tiny++;
+		total++;
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	zone->force_scan = tiny * 100 > total * sysctl_force_scan_thresh;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return false;
+}
+
+static inline void zone_update_force_scan(struct zone *zone)
+{
+}
+#endif
+
 enum scan_balance {
 	SCAN_EQUAL,
 	SCAN_FRACT,
@@ -1855,7 +1980,7 @@ enum scan_balance {
  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
-			   unsigned long *nr)
+			   unsigned long *nr, unsigned long *lru_pages)
 {
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	u64 fraction[2];
@@ -1878,10 +2003,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && zone->all_unreclaimable)
+	if (current_is_kswapd() && !zone_reclaimable(zone))
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
+	if (zone_force_scan(zone))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
@@ -1941,7 +2068,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * There is enough inactive page cache, do not reclaim
 	 * anything from the anonymous working set right now.
 	 */
-	if (!inactive_file_is_low(lruvec)) {
+	if (!inactive_list_is_low(lruvec, true, sc->target_mem_cgroup, false) &&
+	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -1993,6 +2121,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	fraction[1] = fp;
 	denominator = ap + fp + 1;
 out:
+	*lru_pages = 0;
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
 		unsigned long size;
@@ -2018,13 +2147,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		case SCAN_FILE:
 		case SCAN_ANON:
 			/* Scan one type exclusively */
-			if ((scan_balance == SCAN_FILE) != file)
+			if ((scan_balance == SCAN_FILE) != file) {
+				size = 0;
 				scan = 0;
+			}
 			break;
 		default:
 			/* Look ma, no brain */
 			BUG();
 		}
+
+		*lru_pages += size;
 		nr[lru] = scan;
 	}
 }
@@ -2049,7 +2182,8 @@ static inline void init_tlb_ubc(void)
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
+			  unsigned long *lru_pages)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long targets[NR_LRU_LISTS];
@@ -2060,7 +2194,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	struct blk_plug plug;
 	bool scan_adjusted;
 
-	get_scan_count(lruvec, sc, nr);
+	get_scan_count(lruvec, sc, nr, lru_pages);
 
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
@@ -2158,7 +2292,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(lruvec))
+	if (inactive_list_is_low(lruvec, false, sc->target_mem_cgroup, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 
@@ -2240,9 +2374,18 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc,
+			bool is_classzone)
 {
+	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
+	gfp_t slab_gfp = sc->gfp_mask;
+	bool slab_only = sc->slab_only;
+
+	/* Disable fs-related IO for direct reclaim */
+	if (!sc->target_mem_cgroup &&
+	    (current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+		slab_gfp &= ~__GFP_FS;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2250,6 +2393,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 			.zone = zone,
 			.priority = sc->priority,
 		};
+		unsigned long zone_lru_pages = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2257,11 +2401,30 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
+			unsigned long lru_pages, scanned;
 			struct lruvec *lruvec;
 
-			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
+				continue;
+
+			scanned = sc->nr_scanned;
+
+			if (!slab_only) {
+				lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+				shrink_lruvec(lruvec, sc, &lru_pages);
+				zone_lru_pages += lru_pages;
+			}
+
+			if (memcg && is_classzone) {
+				shrink_slab(slab_gfp, zone_to_nid(zone),
+					    memcg, sc->priority, false);
+				if (reclaim_state) {
+					sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+					sc->nr_scanned += reclaim_state->reclaimed_slab;
+					reclaim_state->reclaimed_slab = 0;
+				}
 
-			shrink_lruvec(lruvec, sc);
+			}
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2278,8 +2441,36 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-		} while (memcg);
+		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+
+		if (global_reclaim(sc) && is_classzone) {
+			unsigned long scanned, eligible;
+
+			scanned = sc->nr_scanned - nr_scanned;
+			eligible = zone_lru_pages;
+
+			/*
+			 * If most processes reside in memory cgroups protected
+			 * by memory.low, there won't be many user pages in the
+			 * root lruvec, so the lru scanned/eligible ratio can
+			 * get high even at the default scan priority. In order
+			 * not to subject memcg-unaware slab caches to
+			 * disproportionately high pressure, we forge the ratio
+			 * in this case.
+			 */
+			if (eligible >> sc->priority == 0) {
+				scanned = 1000;
+				eligible = 1000 << sc->priority;
+			}
+
+			shrink_slab(slab_gfp, zone_to_nid(zone), NULL,
+				    sc->priority, false);
+		}
+
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
@@ -2352,6 +2543,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
 	bool aborted_reclaim = false;
 
 	/*
@@ -2363,9 +2555,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		sc->gfp_mask |= __GFP_HIGHMEM;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask), sc->nodemask) {
+					requested_highidx, sc->nodemask) {
+		enum zone_type classzone_idx;
+
 		if (!populated_zone(zone))
 			continue;
+
+		classzone_idx = requested_highidx;
+		while (!populated_zone(zone->zone_pgdat->node_zones +
+							classzone_idx))
+			classzone_idx--;
+
 		/*
 		 * Take care memory controller reclaiming has small influence
 		 * to global LRU.
@@ -2373,8 +2573,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable &&
-					sc->priority != DEF_PRIORITY)
+			if (sc->priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;	/* Let kswapd poll it */
 			if (IS_ENABLED(CONFIG_COMPACTION)) {
 				/*
@@ -2406,7 +2606,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		shrink_zone(zone, sc);
+		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			break;
 	}
 
 	return aborted_reclaim;
@@ -2425,7 +2628,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
 			continue;
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
-		if (!zone->all_unreclaimable)
+		if (zone_reclaimable(zone))
 			return false;
 	}
 
@@ -2449,16 +2652,17 @@ static bool all_unreclaimable(struct zonelist *zonelist,
  * 		else, the number of pages reclaimed
  */
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
-					struct scan_control *sc,
-					struct shrink_control *shrink)
+					  struct scan_control *sc)
 {
+	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct zoneref *z;
-	struct zone *zone;
 	unsigned long writeback_threshold;
 	bool aborted_reclaim;
+	struct zone *zone;
+	struct zoneref *z;
 
+retry:
+	{KSTAT_PERF_ENTER(ttfp);
 	delayacct_freepages_start();
 
 	if (global_reclaim(sc))
@@ -2470,28 +2674,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		aborted_reclaim = shrink_zones(zonelist, sc);
 
-		/*
-		 * Don't shrink slabs when reclaiming memory from over limit
-		 * cgroups but do shrink slab at least once when aborting
-		 * reclaim for compaction to avoid unevenly scanning file/anon
-		 * LRU pages over slab pages.
-		 */
-		if (global_reclaim(sc)) {
-			unsigned long lru_pages = 0;
-			for_each_zone_zonelist(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask)) {
-				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-					continue;
-
-				lru_pages += zone_reclaimable_pages(zone);
-			}
-
-			shrink_slab(shrink, sc->nr_scanned, lru_pages);
-			if (reclaim_state) {
-				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-				reclaim_state->reclaimed_slab = 0;
-			}
-		}
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
 			goto out;
@@ -2516,10 +2698,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			aborted_reclaim = 1;
 	} while (--sc->priority >= 0 && !aborted_reclaim);
 
 out:
+	if (!sc->target_mem_cgroup)
+		for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask), sc->nodemask)
+			zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+
 	delayacct_freepages_end();
+	KSTAT_PERF_LEAVE(ttfp);}
 
 	if (sc->nr_reclaimed)
 		return sc->nr_reclaimed;
@@ -2536,6 +2727,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	if (aborted_reclaim)
 		return 1;
 
+	/* Untapped cgroup reserves?  Don't OOM, retry. */
+	if (!sc->may_thrash) {
+		sc->priority = initial_priority;
+		sc->may_thrash = 1;
+		goto retry;
+	}
+
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
 		return 1;
@@ -2684,9 +2882,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.target_mem_cgroup = NULL,
 		.nodemask = nodemask,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 
 	/*
 	 * Do not enter reclaim if fatal signal was delivered while throttled.
@@ -2700,7 +2895,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				sc.may_writepage,
 				gfp_mask);
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
 
@@ -2725,6 +2920,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.target_mem_cgroup = memcg,
 	};
 	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	unsigned long lru_pages;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2740,7 +2936,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_lruvec(lruvec, &sc);
+	shrink_lruvec(lruvec, &sc, &lru_pages);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2749,17 +2945,20 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool noswap)
+					   int flags)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	struct reclaim_state reclaim_state = { 0 };
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = !noswap,
-		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
+		.slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
@@ -2767,9 +2966,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2784,10 +2980,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.may_writepage,
 					    sc.gfp_mask);
 
-	current->flags |= PF_MEMALLOC;
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
-	current->flags &= ~PF_MEMALLOC;
-
+	current->reclaim_state = &reclaim_state;
+	current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+	current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
+	current->reclaim_state = NULL;
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
 	return nr_reclaimed;
@@ -2805,7 +3002,8 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(lruvec))
+		if (inactive_list_is_low(lruvec, false,
+					sc->target_mem_cgroup, true))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
@@ -2869,7 +3067,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
 		 * they must be considered balanced here as well!
 		 */
-		if (zone->all_unreclaimable) {
+		if (!zone_reclaimable(zone)) {
 			balanced_pages += zone->managed_pages;
 			continue;
 		}
@@ -2927,16 +3125,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
 			       struct scan_control *sc,
-			       unsigned long lru_pages,
 			       unsigned long *nr_attempted)
 {
-	unsigned long nr_slab;
 	int testorder = sc->order;
 	unsigned long balance_gap;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct shrink_control shrink = {
-		.gfp_mask = sc->gfp_mask,
-	};
 	bool lowmem_pressure;
 
 	/* Reclaim above the high watermark. */
@@ -2972,18 +3164,11 @@ static bool kswapd_shrink_zone(struct zone *zone,
 						balance_gap, classzone_idx))
 		return true;
 
-	shrink_zone(zone, sc);
-
-	reclaim_state->reclaimed_slab = 0;
-	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
-	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
 	/* Account for the number of pages attempted to reclaim */
 	*nr_attempted += sc->nr_to_reclaim;
 
-	if (nr_slab == 0 && !zone_reclaimable(zone))
-		zone->all_unreclaimable = 1;
-
 	zone_clear_flag(zone, ZONE_WRITEBACK);
 
 	/*
@@ -2992,7 +3177,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * BDIs but as pressure is relieved, speculatively avoid congestion
 	 * waits.
 	 */
-	if (!zone->all_unreclaimable &&
+	if (zone_reclaimable(zone) &&
 	    zone_balanced(zone, testorder, 0, classzone_idx)) {
 		zone_clear_flag(zone, ZONE_CONGESTED);
 		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -3041,7 +3226,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long lru_pages = 0;
 		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
 		bool pgdat_needs_compaction = (order > 0);
@@ -3058,8 +3242,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			zone_update_force_scan(zone);
+
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			/*
@@ -3101,8 +3287,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			lru_pages += zone_reclaimable_pages(zone);
-
 			/*
 			 * If any zone is currently balanced then kswapd will
 			 * not call compaction as it is expected that the
@@ -3137,8 +3321,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -3158,8 +3342,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			 * that that high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone, &sc,
-					lru_pages, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone,
+					       &sc, &nr_attempted))
 				raise_priority = false;
 		}
 
@@ -3204,6 +3388,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		 !pgdat_balanced(pgdat, order, *classzone_idx));
 
 out:
+
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+	}
+
 	/*
 	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
@@ -3426,9 +3620,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.order = 0,
 		.priority = DEF_PRIORITY,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
@@ -3438,7 +3629,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
@@ -3615,10 +3806,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.order = order,
 		.priority = ZONE_RECLAIM_PRIORITY,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
-	unsigned long nr_slab_pages0, nr_slab_pages1;
 
 	cond_resched();
 	/*
@@ -3637,45 +3824,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * priorities until we have enough memory freed.
 		 */
 		do {
-			shrink_zone(zone, &sc);
+			shrink_zone(zone, &sc, true);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
-	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-	if (nr_slab_pages0 > zone->min_slab_pages) {
-		/*
-		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we take the current
-		 * number of slab pages and shake the slab until it is reduced
-		 * by the same nr_pages that we used for reclaiming unmapped
-		 * pages.
-		 *
-		 * Note that shrink_slab will free memory on all zones and may
-		 * take a long time.
-		 */
-		for (;;) {
-			unsigned long lru_pages = zone_reclaimable_pages(zone);
-
-			/* No reclaimable slab or very low memory pressure */
-			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
-				break;
-
-			/* Freed enough memory */
-			nr_slab_pages1 = zone_page_state(zone,
-							NR_SLAB_RECLAIMABLE);
-			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
-				break;
-		}
-
-		/*
-		 * Update nr_reclaimed by the number of slab pages we
-		 * reclaimed from this zone.
-		 */
-		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-		if (nr_slab_pages1 < nr_slab_pages0)
-			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
-	}
-
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 	lockdep_clear_current_reclaim_state();
@@ -3701,7 +3853,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (zone->all_unreclaimable)
+	if (!zone_reclaimable(zone))
 		return ZONE_RECLAIM_FULL;
 
 	/*
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -18,10 +18,13 @@
 #include <linux/cpumask.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
+#include <linux/virtinfo.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
 #include <linux/compaction.h>
 
+#include "internal.h"
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -1115,10 +1118,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
-		   zone->all_unreclaimable,
+		   "\n  inactive_ratio:    %u"
+		   "\n  force_scan:        %d",
+		   !zone_reclaimable(zone),
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->inactive_ratio,
+		   zone->force_scan);
 	seq_putc(m, '\n');
 }
 
@@ -1176,19 +1181,32 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		v[i] = global_page_state(i);
-	v += NR_VM_ZONE_STAT_ITEMS;
 
-	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
-			    v + NR_DIRTY_THRESHOLD);
-	v += NR_VM_WRITEBACK_STAT_ITEMS;
+	if (ve_is_super(get_exec_env())) {
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			v[i] = global_page_state(i);
+
+		v += NR_VM_ZONE_STAT_ITEMS;
+
+		global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+				    v + NR_DIRTY_THRESHOLD);
+		v += NR_VM_WRITEBACK_STAT_ITEMS;
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
-	all_vm_events(v);
-	v[PGPGIN] /= 2;		/* sectors -> kbytes */
-	v[PGPGOUT] /= 2;
+		all_vm_events(v);
+		v[PGPGIN] /= 2;		/* sectors -> kbytes */
+		v[PGPGOUT] /= 2;
 #endif
+	} else
+		memset(v, 0, stat_items_size);
+
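+	/*
+	 * Let the virtinfo/VE layer adjust the reported values for this
+	 * container; a NOTIFY_FAIL result aborts the read.
+	 */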
+	if (virtinfo_notifier_call(VITYPE_GENERAL,
+				VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
+		kfree(v);
+		m->private = NULL;
+		return ERR_PTR(-ENOMSG);
+	}
+
 	return (unsigned long *)m->private + *pos;
 }
 
@@ -1427,7 +1445,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+	proc_create("vmstat", S_IRUGO|S_ISVTX, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #endif
 	return 0;
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -264,34 +264,21 @@ void workingset_activation(struct page *page)
  * point where they would still be useful.
  */
 
-static unsigned long nr_shadow_nodes;
-static LIST_HEAD(shadow_nodes);
-static DEFINE_SPINLOCK(shadow_node_lock);
+struct list_lru workingset_shadow_nodes;
 
-void workingset_remember_node(struct radix_tree_node *node)
-{
-	spin_lock(&shadow_node_lock);
-	list_add(&node->private_list, &shadow_nodes);
-	nr_shadow_nodes++;
-	spin_unlock(&shadow_node_lock);
-}
-
-void workingset_forget_node(struct radix_tree_node *node)
-{
-	spin_lock(&shadow_node_lock);
-	list_del_init(&node->private_list);
-	nr_shadow_nodes--;
-	spin_unlock(&shadow_node_lock);
-}
-
-static unsigned long nr_excessive_shadows(void)
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+					struct shrink_control *sc)
 {
 	unsigned long shadow_nodes;
 	unsigned long max_nodes;
 	unsigned long pages;
 
-	shadow_nodes = nr_shadow_nodes;
-	pages = totalram_pages;
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+	local_irq_enable();
+
+	pages = node_present_pages(sc->nid);
 	/*
 	 * Active cache pages are limited to 50% of memory, and shadow
 	 * entries that represent a refault distance bigger than that
@@ -314,12 +301,15 @@ static unsigned long nr_excessive_shadows(void)
 	return shadow_nodes - max_nodes;
 }
 
-static void shadow_lru_isolate(struct list_head *item,
-			       spinlock_t *lru_lock)
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  struct list_lru_one *lru,
+					  spinlock_t *lru_lock,
+					  void *arg)
 {
 	struct address_space *mapping;
 	struct radix_tree_node *node;
 	unsigned int i;
+	int ret;
 
 	/*
 	 * Page cache insertions and deletions synchroneously maintain
@@ -339,12 +329,11 @@ static void shadow_lru_isolate(struct list_head *item,
 	/* Coming from the list, invert the lock order */
 	if (!spin_trylock(&mapping->tree_lock)) {
 		spin_unlock(lru_lock);
+		ret = LRU_RETRY;
 		goto out;
 	}
 
-	list_del_init(item);
-	nr_shadow_nodes--;
-
+	list_lru_isolate(lru, item);
 	spin_unlock(lru_lock);
 
 	/*
@@ -372,37 +361,55 @@ static void shadow_lru_isolate(struct list_head *item,
 		BUG();
 
 	spin_unlock(&mapping->tree_lock);
+	ret = LRU_REMOVED_RETRY;
 out:
 	local_irq_enable();
 	cond_resched();
 	local_irq_disable();
 	spin_lock(lru_lock);
+	return ret;
 }
 
-static int shrink_shadow_nodes(struct shrinker *shrink,
-			       struct shrink_control *sc)
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+				       struct shrink_control *sc)
 {
-	unsigned long nr_to_scan = sc->nr_to_scan;
-
-	if (!nr_to_scan)
-		return nr_excessive_shadows();
+	unsigned long ret;
 
-	spin_lock_irq(&shadow_node_lock);
-	while (--nr_to_scan && !list_empty(&shadow_nodes))
-		shadow_lru_isolate(shadow_nodes.prev, &shadow_node_lock);
-	spin_unlock_irq(&shadow_node_lock);
-
-	return nr_excessive_shadows();
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+				    shadow_lru_isolate, NULL);
+	local_irq_enable();
+	return ret;
 }
 
 static struct shrinker workingset_shadow_shrinker = {
-	.shrink = shrink_shadow_nodes,
+	.count_objects = count_shadow_nodes,
+	.scan_objects = scan_shadow_nodes,
 	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
 };
 
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
 static int __init workingset_init(void)
 {
-	register_shrinker(&workingset_shadow_shrinker);
+	int ret;
+
+	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+	if (ret)
+		goto err;
+	ret = register_shrinker(&workingset_shadow_shrinker);
+	if (ret)
+		goto err_list_lru;
 	return 0;
+err_list_lru:
+	list_lru_destroy(&workingset_shadow_nodes);
+err:
+	return ret;
 }
 module_init(workingset_init);
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -507,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -67,7 +67,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
 		return 0;
 
 	size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
-	array = kzalloc(size, GFP_KERNEL);
+	array = kzalloc(size, GFP_KERNEL_ACCOUNT);
 	if (array == NULL)
 		return -ENOBUFS;
 
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -27,6 +27,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
 #include <net/switchdev.h>
@@ -156,6 +157,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 
 	skb->dev = vlan->real_dev;
 	len = skb->len;
+
 	if (unlikely(netpoll_tx_running(dev)))
 		return vlan_netpoll_send_skb(vlan, skb);
 
@@ -587,7 +589,7 @@ static int vlan_dev_init(struct net_device *dev)
 			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
 			   NETIF_F_ALL_FCOE;
 
-	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
+	dev->features |= dev->hw_features | NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
 	dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
@@ -827,4 +829,5 @@ void vlan_setup(struct net_device *dev)
 	dev->ethtool_ops	= &vlan_ethtool_ops;
 
 	memset(dev->broadcast, 0, ETH_ALEN);
+	dev->features |= NETIF_F_VIRTUAL;
 }
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -150,8 +150,8 @@ int __net_init vlan_proc_init(struct net *net)
 	if (!vn->proc_vlan_dir)
 		goto err;
 
-	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
-				     vn->proc_vlan_dir, &vlan_fops);
+	vn->proc_vlan_conf = proc_net_create_data(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
+				     vn->proc_vlan_dir, &vlan_fops, NULL);
 	if (!vn->proc_vlan_conf)
 		goto err;
 	return 0;
@@ -172,7 +172,7 @@ int vlan_proc_add_dev(struct net_device *vlandev)
 	struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
 
 	vlan->dent =
-		proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
+		proc_net_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
 				 vn->proc_vlan_dir, &vlandev_fops, vlandev);
 	if (!vlan->dent)
 		return -ENOBUFS;
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -681,7 +681,7 @@ int bt_procfs_init(struct net *net, const char *name,
 {
 	sk_list->custom_seq_show = seq_show;
 
-	if (!proc_create_data(name, 0, net->proc_net, &bt_fops, sk_list))
+	if (!proc_net_create_data(name, 0, net->proc_net, &bt_fops, sk_list))
 		return -ENOMEM;
 	return 0;
 }
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -97,8 +97,12 @@ static void br_set_lockdep_class(struct net_device *dev)
 static int br_dev_init(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	struct net *net = dev_net(dev);
 	int err;
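+	/* Creating a bridge inside a container requires the bridge VE feature. */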
 
+	if (!(net->owner_ve->features & VE_FEATURE_BRIDGE))
+		return -EACCES;
+
 	br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!br->stats)
 		return -ENOMEM;
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -35,7 +35,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
 
 int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb)
 {
-	if (!is_skb_forwardable(skb->dev, skb))
+	if (!(skb->dev->features & NETIF_F_VENET) &&
+	    !is_skb_forwardable(skb->dev, skb))
 		goto drop;
 
 	skb_push(skb, ETH_HLEN);
--- /dev/null
+++ b/net/bridge/br_netfilter.c
@@ -0,0 +1,1144 @@
+/*
+ *	Handle firewalling
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *	Bart De Schuymer		<bdschuym@pandora.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define skb_origaddr(skb)	 (((struct bridge_skb_cb *) \
+				 (skb->nf_bridge->data))->daddr.ipv4)
+#define store_orig_dstaddr(skb)	 (skb_origaddr(skb) = ip_hdr(skb)->daddr)
+#define dnat_took_place(skb)	 (skb_origaddr(skb) != ip_hdr(skb)->daddr)
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *brnf_sysctl_header;
+static int brnf_call_iptables __read_mostly = 1;
+static int brnf_call_ip6tables __read_mostly = 1;
+static int brnf_call_arptables __read_mostly = 1;
+static int brnf_filter_vlan_tagged __read_mostly = 0;
+static int brnf_filter_pppoe_tagged __read_mostly = 0;
+static int brnf_pass_vlan_indev __read_mostly = 0;
+#else
+#define brnf_call_iptables 1
+#define brnf_call_ip6tables 1
+#define brnf_call_arptables 1
+#define brnf_filter_vlan_tagged 0
+#define brnf_filter_pppoe_tagged 0
+#define brnf_pass_vlan_indev 0
+#endif
+
+#define IS_IP(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+
+#define IS_IPV6(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+
+#define IS_ARP(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+
+static inline __be16 vlan_proto(const struct sk_buff *skb)
+{
+	if (skb_vlan_tag_present(skb))
+		return skb->protocol;
+	else if (skb->protocol == htons(ETH_P_8021Q))
+		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+	else
+		return 0;
+}
+
+#define IS_VLAN_IP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IP) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_IPV6(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_ARP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
+	 brnf_filter_vlan_tagged)
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			    sizeof(struct pppoe_hdr)));
+}
+
+#define IS_PPPOE_IP(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IP) && \
+	 brnf_filter_pppoe_tagged)
+
+#define IS_PPPOE_IPV6(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IPV6) && \
+	 brnf_filter_pppoe_tagged)
+
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			     struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb)
+{
+}
+
+static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+	return NULL;
+}
+
+static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
+{
+	return NULL;
+}
+
+static unsigned int fake_mtu(const struct dst_entry *dst)
+{
+	return dst->dev->mtu;
+}
+
+static struct dst_ops fake_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		cpu_to_be16(ETH_P_IP),
+	.update_pmtu =		fake_update_pmtu,
+	.redirect =		fake_redirect,
+	.cow_metrics =		fake_cow_metrics,
+	.neigh_lookup =		fake_neigh_lookup,
+	.mtu =			fake_mtu,
+};
+
+/*
+ * Initialize bogus route table used to keep netfilter happy.
+ * Currently, we fill in the PMTU entry because netfilter
+ * refragmentation needs it, and the rt_flags entry because
+ * ipt_REJECT needs it.  Future netfilter modules might
+ * require us to fill additional fields.
+ */
+static const u32 br_dst_default_metrics[RTAX_MAX] = {
+	[RTAX_MTU - 1] = 1500,
+};
+
+void br_netfilter_rtable_init(struct net_bridge *br)
+{
+	struct rtable *rt = &br->fake_rtable;
+
+	atomic_set(&rt->dst.__refcnt, 1);
+	rt->dst.dev = br->dev;
+	rt->dst.path = &rt->dst;
+	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+	rt->dst.flags	= DST_NOXFRM | DST_NOPEER | DST_FAKE_RTABLE;
+	rt->dst.ops = &fake_dst_ops;
+}
+
+static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
+{
+	struct net_bridge_port *port;
+
+	port = br_port_get_rcu(dev);
+	return port ? &port->br->fake_rtable : NULL;
+}
+
+static inline struct net_device *bridge_parent(const struct net_device *dev)
+{
+	struct net_bridge_port *port;
+
+	port = br_port_get_rcu(dev);
+	return port ? port->br->dev : NULL;
+}
+
+static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
+{
+	skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
+	if (likely(skb->nf_bridge))
+		atomic_set(&(skb->nf_bridge->use), 1);
+
+	return skb->nf_bridge;
+}
+
+static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+	if (atomic_read(&nf_bridge->use) > 1) {
+		struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
+
+		if (tmp) {
+			memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
+			atomic_set(&tmp->use, 1);
+		}
+		nf_bridge_put(nf_bridge);
+		nf_bridge = tmp;
+	}
+	return nf_bridge;
+}
+
+static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_push(skb, len);
+	skb->network_header -= len;
+}
+
+static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_pull(skb, len);
+	skb->network_header += len;
+}
+
+static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_pull_rcsum(skb, len);
+	skb->network_header += len;
+}
+
+static inline void nf_bridge_save_header(struct sk_buff *skb)
+{
+	int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
+
+	skb_copy_from_linear_data_offset(skb, -header_size,
+					 skb->nf_bridge->data, header_size);
+}
+
+static inline void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+	if (skb->nf_bridge->mask & BRNF_8021Q)
+		skb->protocol = htons(ETH_P_8021Q);
+	else if (skb->nf_bridge->mask & BRNF_PPPoE)
+		skb->protocol = htons(ETH_P_PPP_SES);
+}
+
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	const struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+	u32 len;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+
+	/* Basic sanity checks */
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	if (iph->ihl == 5)
+		return 0;
+
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	if (ip_options_compile(dev_net(dev), opt, skb))
+		goto inhdr_error;
+
+	/* Check correct handling of SRR option */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+			goto drop;
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
+/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
+ * anyway
+ */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_validate_ipv4 for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
+/* Fill in the header for fragmented IP packets handled by
+ * the IPv4 connection tracking code.
+ */
+int nf_bridge_copy_header(struct sk_buff *skb)
+{
+	int err;
+	unsigned int header_size;
+
+	nf_bridge_update_protocol(skb);
+	header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
+	err = skb_cow_head(skb, header_size);
+	if (err)
+		return err;
+
+	skb_copy_to_linear_data_offset(skb, -header_size,
+				       skb->nf_bridge->data, header_size);
+	__skb_push(skb, nf_bridge_encap_header_len(skb));
+	return 0;
+}
+
+/* PF_BRIDGE/PRE_ROUTING *********************************************/
+/* Undo the changes made for ip6tables PREROUTING and continue the
+ * bridge PRE_ROUTING hook. */
+static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct rtable *rt;
+
+	nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+	if (nf_bridge->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		nf_bridge->mask ^= BRNF_PKT_TYPE;
+	}
+	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+
+	rt = bridge_parent_rtable(nf_bridge->physindev);
+	if (!rt) {
+		kfree_skb(skb);
+		return 0;
+	}
+	skb_dst_set_noref(skb, &rt->dst);
+
+	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+		       skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct neighbour *neigh;
+	struct dst_entry *dst;
+
+	skb->dev = bridge_parent(skb->dev);
+	if (!skb->dev)
+		goto free_skb;
+	dst = skb_dst(skb);
+	neigh = dst_neigh_lookup_skb(dst, skb);
+	if (neigh) {
+		int ret;
+
+		if (neigh->hh.hh_len) {
+			neigh_hh_bridge(&neigh->hh, skb);
+			skb->dev = nf_bridge->physindev;
+			ret = br_handle_frame_finish(sk, skb);
+		} else {
+			/* the neighbour function below overwrites the complete
+			 * MAC header, so we save the Ethernet source address and
+			 * protocol number.
+			 */
+			skb_copy_from_linear_data_offset(skb,
+							 -(ETH_HLEN-ETH_ALEN),
+							 skb->nf_bridge->data,
+							 ETH_HLEN-ETH_ALEN);
+			/* tell br_dev_xmit to continue with forwarding */
+			nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+			/* FIXME Need to refragment */
+			ret = neigh->output(neigh, skb);
+		}
+		neigh_release(neigh);
+		return ret;
+	}
+free_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* This requires some explaining. If DNAT has taken place,
+ * we will need to fix up the destination Ethernet address.
+ *
+ * There are two cases to consider:
+ * 1. The packet was DNAT'ed to a device in the same bridge
+ *    port group as it was received on. We can still bridge
+ *    the packet.
+ * 2. The packet was DNAT'ed to a different device, either
+ *    a non-bridged device or another bridge port group.
+ *    The packet will need to be routed.
+ *
+ * The correct way of distinguishing between these two cases is to
+ * call ip_route_input() and to look at skb->dst->dev, which is
+ * changed to the destination device if ip_route_input() succeeds.
+ *
+ * Let's first consider the case that ip_route_input() succeeds:
+ *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
+ * Otherwise, the packet is considered to be routed and we just
+ * change the destination MAC address so that the packet will
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
+ *
+ * Let's now consider the case that ip_route_input() fails:
+ *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
+ * thinks we're handling a locally generated packet and won't care
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct iphdr *iph = ip_hdr(skb);
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct rtable *rt;
+	int err;
+
+	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+	if (nf_bridge->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		nf_bridge->mask ^= BRNF_PKT_TYPE;
+	}
+	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+	if (dnat_took_place(skb)) {
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+			struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+			/* If err equals -EHOSTUNREACH the error is due to a
+			 * martian destination or due to the fact that
+			 * forwarding is disabled. For most martian packets,
+			 * ip_route_output_key() will fail. It won't fail for 2 types of
+			 * martian destinations: loopback destinations and destination
+			 * 0.0.0.0. In both cases the packet will be dropped because the
+			 * destination is the loopback device and not the bridge. */
+			if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
+				goto free_skb;
+
+			rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+					     RT_TOS(iph->tos), 0);
+			if (!IS_ERR(rt)) {
+				/* - Bridged-and-DNAT'ed traffic doesn't
+				 *   require ip_forwarding. */
+				if (rt->dst.dev == dev) {
+					skb_dst_set(skb, &rt->dst);
+					goto bridged_dnat;
+				}
+				ip_rt_put(rt);
+			}
+free_skb:
+			kfree_skb(skb);
+			return 0;
+		} else {
+			if (skb_dst(skb)->dev == dev) {
+bridged_dnat:
+				skb->dev = nf_bridge->physindev;
+				nf_bridge_update_protocol(skb);
+				nf_bridge_push_encap_header(skb);
+				NF_HOOK_THRESH(NFPROTO_BRIDGE,
+					       NF_BR_PRE_ROUTING,
+					       sk, skb, skb->dev, NULL,
+					       br_nf_pre_routing_finish_bridge,
+					       1);
+				return 0;
+			}
+			memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
+			skb->pkt_type = PACKET_HOST;
+		}
+	} else {
+		rt = bridge_parent_rtable(nf_bridge->physindev);
+		if (!rt) {
+			kfree_skb(skb);
+			return 0;
+		}
+		skb_dst_set_noref(skb, &rt->dst);
+	}
+
+	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+		       skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct net_device *vlan, *br;
+
+	br = bridge_parent(dev);
+	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+		return br;
+
+	vlan = __vlan_find_dev_deep(br, skb->vlan_proto,
+				    skb_vlan_tag_get(skb) & VLAN_VID_MASK);
+
+	return vlan ? vlan : br;
+}
+
+/* Some common code for IPv4/IPv6 */
+static struct net_device *setup_pre_routing(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->physindev = skb->dev;
+	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	if (skb->protocol == htons(ETH_P_8021Q))
+		nf_bridge->mask |= BRNF_8021Q;
+	else if (skb->protocol == htons(ETH_P_PPP_SES))
+		nf_bridge->mask |= BRNF_PPPoE;
+
+	return skb->dev;
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables.
+ */
+static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
+					   struct sk_buff *skb,
+					   const struct net_device *in,
+					   const struct net_device *out,
+					   const struct nf_hook_state *state)
+{
+	if (br_validate_ipv6(skb))
+		return NF_DROP;
+
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+
+	skb->protocol = htons(ETH_P_IPV6);
+	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
+		skb->dev, NULL,
+		br_nf_pre_routing_finish_ipv6);
+
+	return NF_STOLEN;
+}
+
+/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
+ * Replicate the checks that IPv4 does on packet reception.
+ * Set skb->dev to the bridge device (i.e. parent of the
+ * receiving device) to make netfilter happy, the REDIRECT
+ * target in particular.  Save the original destination IP
+ * address to be able to detect DNAT afterwards. */
+static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      const struct nf_hook_state *state)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	__u32 len = nf_bridge_encap_header_len(skb);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return NF_DROP;
+
+	p = br_port_get_rcu(state->in);
+	if (p == NULL)
+		return NF_DROP;
+	br = p->br;
+
+	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
+		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+			return NF_ACCEPT;
+
+		nf_bridge_pull_encap_header_rcsum(skb);
+		return br_nf_pre_routing_ipv6(ops, skb, state->in, state->out, state);
+	}
+
+	if (!brnf_call_iptables && !br->nf_call_iptables)
+		return NF_ACCEPT;
+
+	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header_rcsum(skb);
+
+	if (br_parse_ip_options(skb))
+		return NF_DROP;
+
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+	store_orig_dstaddr(skb);
+	skb->protocol = htons(ETH_P_IP);
+
+	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
+		skb->dev, NULL,
+		br_nf_pre_routing_finish);
+
+	return NF_STOLEN;
+}
+
+
+/* PF_BRIDGE/FORWARD *************************************************/
+static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct net_device *in;
+
+	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+
+		if (skb->protocol == htons(ETH_P_IP))
+			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+		if (skb->protocol == htons(ETH_P_IPV6))
+			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+		in = nf_bridge->physindev;
+		if (nf_bridge->mask & BRNF_PKT_TYPE) {
+			skb->pkt_type = PACKET_OTHERHOST;
+			nf_bridge->mask ^= BRNF_PKT_TYPE;
+		}
+		nf_bridge_update_protocol(skb);
+	} else {
+		in = *((struct net_device **)(skb->cb));
+	}
+	nf_bridge_push_encap_header(skb);
+
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb,
+		       in, skb->dev, br_forward_finish, 1);
+	return 0;
+}
+
+
+/* This is the 'purely bridged' case.  For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports. */
+static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+				     struct sk_buff *skb,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     const struct nf_hook_state *state)
+{
+	struct nf_bridge_info *nf_bridge;
+	struct net_device *parent;
+	u_int8_t pf;
+
+	if (!skb->nf_bridge)
+		return NF_ACCEPT;
+
+	/* Need exclusive nf_bridge_info since we might have multiple
+	 * different physoutdevs. */
+	if (!nf_bridge_unshare(skb))
+		return NF_DROP;
+
+	parent = bridge_parent(state->out);
+	if (!parent)
+		return NF_DROP;
+
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+		pf = NFPROTO_IPV4;
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+		pf = NFPROTO_IPV6;
+	else
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header(skb);
+
+	nf_bridge = skb->nf_bridge;
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	if (pf == NFPROTO_IPV4) {
+		if (br_parse_ip_options(skb))
+			return NF_DROP;
+		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+	}
+
+	if (pf == NFPROTO_IPV6) {
+		if (br_validate_ipv6(skb))
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+	}
+
+	/* The physdev module checks on this */
+	nf_bridge->mask |= BRNF_BRIDGED;
+	nf_bridge->physoutdev = skb->dev;
+	if (pf == NFPROTO_IPV4)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
+
+	NF_HOOK(pf, NF_INET_FORWARD, NULL, skb,
+		brnf_get_logical_dev(skb, state->in),
+		parent,	br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      const struct nf_hook_state *state)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	struct net_device **d = (struct net_device **)(skb->cb);
+
+	p = br_port_get_rcu(state->out);
+	if (p == NULL)
+		return NF_ACCEPT;
+	br = p->br;
+
+	if (!brnf_call_arptables && !br->nf_call_arptables)
+		return NF_ACCEPT;
+
+	if (!IS_ARP(skb)) {
+		if (!IS_VLAN_ARP(skb))
+			return NF_ACCEPT;
+		nf_bridge_pull_encap_header(skb);
+	}
+
+	if (arp_hdr(skb)->ar_pln != 4) {
+		if (IS_VLAN_ARP(skb))
+			nf_bridge_push_encap_header(skb);
+		return NF_ACCEPT;
+	}
+	*d = state->in;
+	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb,
+		state->in, state->out, br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb_is_gso(skb) || skb->len + nf_bridge_mtu_reduction(skb) <=
+	    skb->dev->mtu)
+		return br_dev_queue_push_xmit(sk, skb);
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	/* This is wrong! We should preserve the original fragment
+	 * boundaries by preserving frag_list rather than refragmenting.
+	 */
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    !(skb->dev->features & NETIF_F_VENET)) {
+		if (br_parse_ip_options(skb))
+			/* Drop invalid packet */
+			goto drop;
+
+		IPCB(skb)->frag_max_size = skb->nf_bridge->frag_max_size;
+		return ip_fragment(sk, skb, br_dev_queue_push_xmit);
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+
+		if (br_validate_ipv6(skb))
+			goto drop;
+
+		IP6CB(skb)->frag_max_size = skb->nf_bridge->frag_max_size;
+		if (v6ops)
+			return v6ops->fragment(sk, skb, br_dev_queue_push_xmit);
+
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+#endif
+	return br_dev_queue_push_xmit(sk, skb);
+ drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* PF_BRIDGE/POST_ROUTING ********************************************/
+static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       const struct nf_hook_state *state)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct net_device *realoutdev = bridge_parent(skb->dev);
+	u_int8_t pf;
+
+	if (!nf_bridge || !(nf_bridge->mask & BRNF_BRIDGED))
+		return NF_ACCEPT;
+
+	if (!realoutdev)
+		return NF_DROP;
+
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+		pf = NFPROTO_IPV4;
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+		pf = NFPROTO_IPV6;
+	else
+		return NF_ACCEPT;
+
+	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
+	 * about the value of skb->pkt_type. */
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	nf_bridge_pull_encap_header(skb);
+	nf_bridge_save_header(skb);
+	if (pf == NFPROTO_IPV4)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
+
+	NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb,
+		NULL, realoutdev,
+		br_nf_dev_queue_xmit);
+
+	return NF_STOLEN;
+}
+
+/* IP/SABOTAGE *****************************************************/
+/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
+ * a second time. */
+static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out,
+				   const struct nf_hook_state *state)
+{
+	if (skb->nf_bridge &&
+	    !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+		return NF_STOP;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
+static struct nf_hook_ops br_nf_ops[] __read_mostly = {
+	{
+		.hook = br_nf_pre_routing,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_PRE_ROUTING,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_forward_ip,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF - 1,
+	},
+	{
+		.hook = br_nf_forward_arp,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_post_routing,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_POST_ROUTING,
+		.priority = NF_BR_PRI_LAST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_IPV4,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP_PRI_FIRST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_IPV6,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP6_PRI_FIRST,
+	},
+};
+
+#ifdef CONFIG_SYSCTL
+static
+int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
+			    void __user * buffer, size_t * lenp, loff_t * ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write && *(int *)(ctl->data))
+		*(int *)(ctl->data) = 1;
+	return ret;
+}
+
+static struct ctl_table brnf_table[] = {
+	{
+		.procname	= "bridge-nf-call-arptables",
+		.data		= &brnf_call_arptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-iptables",
+		.data		= &brnf_call_iptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-ip6tables",
+		.data		= &brnf_call_ip6tables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-vlan-tagged",
+		.data		= &brnf_filter_vlan_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-pppoe-tagged",
+		.data		= &brnf_filter_pppoe_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-pass-vlan-input-dev",
+		.data		= &brnf_pass_vlan_indev,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{ }
+};
+#endif
+
+int __init br_netfilter_init(void)
+{
+	int ret;
+
+	ret = dst_entries_init(&fake_dst_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (ret < 0) {
+		dst_entries_destroy(&fake_dst_ops);
+		return ret;
+	}
+#ifdef CONFIG_SYSCTL
+	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
+	if (brnf_sysctl_header == NULL) {
+		printk(KERN_WARNING
+		       "br_netfilter: can't register to sysctl.\n");
+		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		dst_entries_destroy(&fake_dst_ops);
+		return -ENOMEM;
+	}
+#endif
+	printk(KERN_NOTICE "Bridge firewalling registered\n");
+	return 0;
+}
+
+void br_netfilter_fini(void)
+{
+	nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(brnf_sysctl_header);
+#endif
+	dst_entries_destroy(&fake_dst_ops);
+}
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -519,6 +519,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
+
 /* PF_BRIDGE/FORWARD *************************************************/
 static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
 {
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -22,6 +22,10 @@
 #include <linux/if_vlan.h>
 #include <linux/rhashtable.h>
 
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <uapi/linux/vzcalluser.h>
+
 #define BR_HASH_BITS 8
 #define BR_HASH_SIZE (1 << BR_HASH_BITS)
 
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -51,7 +51,7 @@ struct arppayload
 };
 
 static void
-print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
+print_ports(const struct sk_buff *skb, uint8_t protocol, int offset, struct ve_struct *ve)
 {
 	if (protocol == IPPROTO_TCP ||
 	    protocol == IPPROTO_UDP ||
@@ -64,10 +64,10 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
 		pptr = skb_header_pointer(skb, offset,
 					  sizeof(_ports), &_ports);
 		if (pptr == NULL) {
-			printk(" INCOMPLETE TCP/UDP header");
+			ve_log_printk(ve, " INCOMPLETE TCP/UDP header");
 			return;
 		}
-		printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+		ve_log_printk(ve, " SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
 	}
 }
 
@@ -78,13 +78,10 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	       const char *prefix)
 {
 	unsigned int bitmask;
-
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
+	struct ve_struct *ve = net->owner_ve;
 
 	spin_lock_bh(&ebt_log_lock);
-	printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+	ve_log_printk(ve, KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
 	       '0' + loginfo->u.log.level, prefix,
 	       in ? in->name : "", out ? out->name : "",
 	       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
@@ -102,12 +99,12 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 		if (ih == NULL) {
-			printk(" INCOMPLETE IP header");
+			ve_log_printk(ve, " INCOMPLETE IP header");
 			goto out;
 		}
-		printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+		ve_log_printk(ve, " IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
 		       &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
-		print_ports(skb, ih->protocol, ih->ihl*4);
+		print_ports(skb, ih->protocol, ih->ihl*4, ve);
 		goto out;
 	}
 
@@ -122,16 +119,16 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 		if (ih == NULL) {
-			printk(" INCOMPLETE IPv6 header");
+			ve_log_printk(ve, " INCOMPLETE IPv6 header");
 			goto out;
 		}
-		printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
+		ve_log_printk(ve, " IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
 		       &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
 		nexthdr = ih->nexthdr;
 		offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off);
 		if (offset_ph == -1)
 			goto out;
-		print_ports(skb, nexthdr, offset_ph);
+		print_ports(skb, nexthdr, offset_ph, ve);
 		goto out;
 	}
 #endif
@@ -144,10 +141,10 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
 		if (ah == NULL) {
-			printk(" INCOMPLETE ARP header");
+			ve_log_printk(ve, " INCOMPLETE ARP header");
 			goto out;
 		}
-		printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+		ve_log_printk(ve, " ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
 		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
 		       ntohs(ah->ar_op));
 
@@ -162,15 +159,15 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 			ap = skb_header_pointer(skb, sizeof(_arph),
 						sizeof(_arpp), &_arpp);
 			if (ap == NULL) {
-				printk(" INCOMPLETE ARP payload");
+				ve_log_printk(ve, " INCOMPLETE ARP payload");
 				goto out;
 			}
-			printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
+			ve_log_printk(ve, " ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
 					ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
 		}
 	}
 out:
-	printk("\n");
+	ve_log_printk(ve, "\n");
 	spin_unlock_bh(&ebt_log_lock);
 
 }
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -31,8 +31,9 @@
 /* needed for logical [in,out]-dev filtering */
 #include "../br_private.h"
 
-#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
-					 "report to author: "format, ## args)
+#define BUGPRINT(format, args...)					\
+	ve_printk(VE_LOG, "kernel msg: ebtables bug: please "		\
+			"report to author: "format, ## args)
 /* #define BUGPRINT(format, args...) */
 
 /*
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -137,9 +137,12 @@
 #include <linux/hrtimer.h>
 #include <linux/crash_dump.h>
 #include <linux/sctp.h>
+#include <linux/fence-watchdog.h>
 
 #include "net-sysfs.h"
 
+#include <linux/ve.h>
+
 /* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
@@ -189,18 +192,6 @@ static inline void dev_base_seq_inc(struct net *net)
 	while (++net->dev_base_seq == 0);
 }
 
-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
-{
-	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-
-	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
-}
-
-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
-{
-	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
-}
-
 static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
@@ -1170,11 +1161,14 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	}
 
 rollback:
-	ret = device_rename(&dev->dev, dev->name);
-	if (ret) {
-		memcpy(dev->name, oldname, IFNAMSIZ);
-		write_seqcount_end(&devnet_rename_seq);
-		return ret;
+	if (!dev_net(dev)->owner_ve->ve_netns ||
+	    dev_net(dev)->owner_ve->ve_netns == dev->nd_net) {
+		ret = device_rename(&dev->dev, dev->name);
+		if (ret) {
+			memcpy(dev->name, oldname, IFNAMSIZ);
+			write_seqcount_end(&devnet_rename_seq);
+			return ret;
+		}
 	}
 
 	write_seqcount_end(&devnet_rename_seq);
@@ -2885,6 +2879,14 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 	struct sk_buff *skb = first;
 	int rc = NETDEV_TX_OK;
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (unlikely(fence_wdog_check_timer())) {
+		kfree_skb(skb);
+		*ret = rc;
+		return NULL;
+	}
+#endif
+
 	while (skb) {
 		struct sk_buff *next = skb->next;
 
@@ -4325,6 +4327,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->recursion_counter = 0;
 		NAPI_GRO_CB(skb)->is_atomic = 1;
 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
+		NAPI_GRO_CB(skb)->recursion_counter = 0;
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
@@ -5048,6 +5051,10 @@ static void net_rx_action(struct softirq_action *h)
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	fence_wdog_check_timer();
+#endif
+
 	for (;;) {
 		struct napi_struct *n;
 
@@ -6244,8 +6251,13 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 			return -EOVERFLOW;
 		}
 	}
-	if (dev->flags != old_flags) {
-		pr_info("device %s %s promiscuous mode\n",
+	/*
+	 * Promiscuous mode on LOOPBACK/POINTOPOINT devices does
+	 * not mean anything.
+	 */
+	if ((dev->flags != old_flags) &&
+			!(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+		ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n",
 			dev->name,
 			dev->flags & IFF_PROMISC ? "entered" : "left");
 		if (audit_enabled) {
@@ -7109,9 +7121,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	if (count < 1 || count > 0xffff)
 		return -EINVAL;
 
-	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	tx = kzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_REPEAT);
 	if (!tx) {
-		tx = vzalloc(sz);
+		tx = vzalloc_account(sz);
 		if (!tx)
 			return -ENOMEM;
 	}
@@ -7154,6 +7166,14 @@ int register_netdevice(struct net_device *dev)
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 	BUG_ON(!net);
 
+	ret = -EPERM;
+	if (!ve_is_super(net->owner_ve) && ve_is_dev_movable(dev))
+		goto out;
+
+	ret = -ENOMEM;
+	if (atomic_dec_if_positive(&net->owner_ve->netif_avail_nr) < 0)
+		goto out;
+
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
 
@@ -7161,7 +7181,7 @@ int register_netdevice(struct net_device *dev)
 
 	ret = dev_get_valid_name(net, dev, dev->name);
 	if (ret < 0)
-		goto out;
+		goto err_avail;
 
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
@@ -7169,7 +7189,7 @@ int register_netdevice(struct net_device *dev)
 		if (ret) {
 			if (ret > 0)
 				ret = -EIO;
-			goto out;
+			goto err_avail;
 		}
 	}
 
@@ -7296,10 +7316,65 @@ int register_netdevice(struct net_device *dev)
 err_uninit:
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
+err_avail:
+	atomic_inc(&net->owner_ve->netif_avail_nr);
 	goto out;
 }
 EXPORT_SYMBOL(register_netdevice);
 
+/*
+ * We do horrible things -- we leave a netdevice
+ * in a "leaked" state, which means we release as many
+ * resources as possible but the device will remain
+ * present in the namespace because someone holds a reference.
+ *
+ * The idea is to be able to force-stop the VE.
+ */
+static void ve_netdev_leak(struct net_device *dev)
+{
+	struct napi_struct *p, *n;
+
+	dev->is_leaked = 1;
+	barrier();
+
+	/*
+	 * Make sure we're unable to tx/rx
+	 * network packets to outside.
+	 */
+	WARN_ON_ONCE(dev->flags & IFF_UP);
+	WARN_ON_ONCE(dev->qdisc != &noop_qdisc);
+
+	rtnl_lock();
+
+	/*
+	 * No device addresses and no NAPI instances after this point.
+	 */
+	dev_addr_flush(dev);
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
+	/*
+	 * No release_net() here since the device remains
+	 * present in the namespace.
+	 */
+
+	__rtnl_unlock();
+
+	/*
+	 * Since we've already screwed up the device and releasing
+	 * it the normal way is no longer possible, take a
+	 * reference to make sure the device remains here forever.
+	 */
+	dev_hold(dev);
+
+	synchronize_net();
+
+	pr_emerg("Device (%s:%d:%s:%p) marked as leaked\n",
+			dev->name, netdev_refcnt_read(dev) - 1,
+			ve_name(dev_net(dev)->owner_ve), dev);
+	dst_cache_dump();
+}
+
 /**
  *	init_dummy_netdev	- init a dummy network device for NAPI
  *	@dev: device to init
@@ -7387,10 +7462,11 @@ EXPORT_SYMBOL(netdev_refcnt_read);
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static int netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
 	int refcnt;
+	int i = 0;
 
 	linkwatch_forget_dev(dev);
 
@@ -7430,11 +7506,25 @@ static void netdev_wait_allrefs(struct net_device *dev)
 		refcnt = netdev_refcnt_read(dev);
 
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
-			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
-				 dev->name, refcnt);
+			pr_emerg("unregister_netdevice: waiting for %s=%p to "
+				"become free. Usage count = %d\n ve=%s",
+				 dev->name, dev, refcnt,
+				 ve_name(dev_net(dev)->owner_ve));
 			warning_time = jiffies;
 		}
+
+		/*
+		 * If a reference to the device has been leaked we might
+		 * get stuck in this loop forever, giving the VE no chance
+		 * to stop.
+		 */
+		if (++i > 200) { /* give 50 seconds to try */
+			ve_netdev_leak(dev);
+			return -EBUSY;
+		}
 	}
+
+	return 0;
 }
 
 /* The sequence is:
@@ -7470,7 +7560,6 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
-
 	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
@@ -7493,7 +7582,12 @@ void netdev_run_todo(void)
 
 		dev->reg_state = NETREG_UNREGISTERED;
 
-		netdev_wait_allrefs(dev);
+		/*
+		 * Even if a device gets stuck here we have
+		 * to proceed with the rest of the list.
+		 */
+		if (netdev_wait_allrefs(dev))
+			continue;
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
@@ -7501,6 +7595,11 @@ void netdev_run_todo(void)
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
 
+		atomic_inc(&dev_net(dev)->owner_ve->netif_avail_nr);
+
+		/* This must be the very last action;
+		 * after this point 'dev' may point to freed memory.
+		 */
 		if (dev->destructor)
 			dev->destructor(dev);
 
@@ -7659,11 +7758,13 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	/* ensure 32-byte alignment of whole construct */
 	alloc_size += NETDEV_ALIGN - 1;
 
-	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
-	if (!p)
-		p = vzalloc(alloc_size);
+	p = kzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_REPEAT);
 	if (!p)
+		p = vzalloc_account(alloc_size);
+	if (!p) {
+		pr_err("alloc_netdev: Unable to allocate device\n");
 		return NULL;
+	}
 
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
@@ -7744,6 +7845,13 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+	if (dev->is_leaked) {
+		pr_emerg("%s: device %s=%p is leaked\n",
+				__func__, dev->name, dev);
+		dump_stack();
+		return;
+	}
+
 	might_sleep();
 	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
@@ -7899,6 +8007,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 			goto out;
 	}
 
+	err = -ENOMEM;
+	if (atomic_dec_if_positive(&net->owner_ve->netif_avail_nr) < 0)
+		goto out;
+	atomic_inc(&dev_net(dev)->owner_ve->netif_avail_nr);
+
 	/*
 	 * And now a mini version of register_netdevice unregister_netdevice.
 	 */
@@ -8047,7 +8160,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 		mask |= NETIF_F_CSUM_MASK;
 	mask |= NETIF_F_VLAN_CHALLENGED;
 
-	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
+	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_CSUM_MASK|NETIF_F_VIRTUAL) & mask;
 	all &= one | ~NETIF_F_ALL_FOR_ALL;
 
 	/* If one device supports hw checksumming, set for all. */
@@ -8063,7 +8176,7 @@ static struct hlist_head *netdev_create_hash(void)
 	int i;
 	struct hlist_head *hash;
 
-	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_ACCOUNT);
 	if (hash != NULL)
 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
 			INIT_HLIST_HEAD(&hash[i]);
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -294,6 +294,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
 
 	case SIOCSIFTXQLEN:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
 		if (ifr->ifr_qlen < 0)
 			return -EINVAL;
 		dev->tx_queue_len = ifr->ifr_qlen;
@@ -489,27 +491,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		}
 		return ret;
 
-	/*
-	 *	These ioctl calls:
-	 *	- require superuser power.
-	 *	- require strict serialization.
-	 *	- do not return a value
-	 */
-	case SIOCSIFMAP:
-	case SIOCSIFTXQLEN:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		/* fall through */
 	/*
 	 *	These ioctl calls:
 	 *	- require local superuser power.
 	 *	- require strict serialization.
 	 *	- do not return a value
 	 */
-	case SIOCSIFFLAGS:
-	case SIOCSIFMETRIC:
+	case SIOCSIFMAP:
 	case SIOCSIFMTU:
 	case SIOCSIFHWADDR:
+	case SIOCSIFFLAGS:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ifsioc(net, &ifr, cmd);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCSIFMETRIC:
 	case SIOCSIFSLAVE:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,6 +265,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 
 	lwtstate_put(dst->lwtstate);
 
+	dst->flags |= DST_FREE;
 	if (dst->flags & DST_METADATA)
 		metadata_dst_free((struct metadata_dst *)dst);
 	else
@@ -298,6 +299,21 @@ static void dst_destroy_rcu(struct rcu_head *head)
 		__dst_free(dst);
 }
 
+void dst_dump_one(struct dst_entry *d)
+{
+	printk("\tdev %p err %d obs %d flags %x i/o %p/%p ref %d use %d\n",
+			d->dev, (int)d->error, (int)d->obsolete, d->flags,
+			d->input, d->output, atomic_read(&d->__refcnt), d->__use);
+}
+EXPORT_SYMBOL(dst_dump_one);
+
+void dst_cache_dump(void)
+{
+	ip_rt_dump_dsts();
+	if (ip6_rt_dump_dsts)
+		ip6_rt_dump_dsts();
+}
+
 void dst_release(struct dst_entry *dst)
 {
 	if (dst) {
@@ -435,6 +451,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_UNREGISTER_FINAL:
 	case NETDEV_DOWN:
+		dst_gc_task(NULL);
 		mutex_lock(&dst_gc_mutex);
 		for (dst = dst_busy_list; dst; dst = dst->next) {
 			last = dst;
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2421,8 +2421,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GEEE:
 	case ETHTOOL_GTUNABLE:
 		break;
+	case ETHTOOL_SEEPROM:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 	default:
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		if (!ve_capable(CAP_NET_ADMIN))
 			return -EPERM;
 	}
 
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -23,7 +23,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 {
 	struct fib_rule *r;
 
-	r = kzalloc(ops->rule_size, GFP_KERNEL);
+	r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
 	if (r == NULL)
 		return -ENOMEM;
 
@@ -285,7 +285,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (err < 0)
 		goto errout;
 
-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
+	rule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
 	if (rule == NULL) {
 		err = -ENOMEM;
 		goto errout;
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -736,7 +736,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	if (fprog->filter == NULL)
 		return -EINVAL;
 
-	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_ACCOUNT);
 	if (!fp)
 		return -ENOMEM;
 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -128,6 +128,10 @@ int memcpy_toiovecend_partial(const struct iovec *iov, unsigned char *kdata,
 int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 			int offset, int len)
 {
+	/* No data? Done! */
+	if (len == 0)
+		return 0;
+
 	/* Skip over the finished iovecs */
 	while (offset >= iov->iov_len) {
 		offset -= iov->iov_len;
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -24,6 +24,7 @@
 #include <linux/socket.h>
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
+#include <linux/ve.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -40,6 +41,7 @@
 #include <linux/log2.h>
 #include <linux/inetdevice.h>
 #include <net/addrconf.h>
+#include <bc/beancounter.h>
 
 #define DEBUG
 #define NEIGH_DEBUG 1
@@ -277,6 +279,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 	int entries;
 
 	entries = atomic_inc_return(&tbl->entries) - 1;
+	n = ERR_PTR(-ENOBUFS);
 	if (entries >= tbl->gc_thresh3 ||
 	    (entries >= tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
@@ -287,7 +290,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 
 	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
 	if (!n)
-		goto out_entries;
+		goto out_nomem;
 
 	__skb_queue_head_init(&n->arp_queue);
 	rwlock_init(&n->lock);
@@ -306,6 +309,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 out:
 	return n;
 
+out_nomem:
+	n = ERR_PTR(-ENOMEM);
 out_entries:
 	atomic_dec(&tbl->entries);
 	goto out;
@@ -467,13 +472,12 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 	u32 hash_val;
 	int key_len = tbl->key_len;
 	int error;
-	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
+	struct neighbour *n1, *rc, *n;
 	struct neigh_hash_table *nht;
 
-	if (!n) {
-		rc = ERR_PTR(-ENOBUFS);
+	rc = n = neigh_alloc(tbl, dev);
+	if (IS_ERR(n))
 		goto out;
-	}
 
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
@@ -704,6 +708,13 @@ void neigh_destroy(struct neighbour *neigh)
 
 	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
 
+	if (neigh->dev->is_leaked) {
+		printk(KERN_WARNING
+		       "Destroying neighbour %p on leaked device\n", neigh);
+		dump_stack();
+		return;
+	}
+
 	if (!neigh->dead) {
 		pr_warn("Destroying alive neighbour %p\n", neigh);
 		dump_stack();
@@ -810,7 +821,9 @@ static void neigh_periodic_work(struct work_struct *work)
 				*np = n->next;
 				n->dead = 1;
 				write_unlock(&n->lock);
+
 				neigh_cleanup_and_release(n);
+
 				continue;
 			}
 			write_unlock(&n->lock);
@@ -2120,6 +2133,12 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 	}
 
+	err = -ENOENT;
+	if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+	     tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+	    !net_eq(net, &init_net))
+		goto errout_tbl_lock;
+
 	if (tb[NDTA_THRESH1])
 		tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
 
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -304,12 +304,12 @@ static int __net_init dev_proc_net_init(struct net *net)
 {
 	int rc = -ENOMEM;
 
-	if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+	if (!proc_net_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
 		goto out;
-	if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+	if (!proc_net_create("softnet_stat", S_IRUGO, net->proc_net,
 			 &softnet_seq_fops))
 		goto out_dev;
-	if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+	if (!proc_net_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
 		goto out_softnet;
 
 	if (wext_proc_init(net))
@@ -387,7 +387,7 @@ static const struct file_operations dev_mc_seq_fops = {
 
 static int __net_init dev_mc_net_init(struct net *net)
 {
-	if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
+	if (!proc_net_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
 		return -ENOMEM;
 	return 0;
 }
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -3,10 +3,10 @@
 #include <linux/workqueue.h>
 #include <linux/rtnetlink.h>
 #include <linux/cache.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
@@ -16,12 +16,14 @@
 #include <linux/export.h>
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
+#include <linux/netdevice.h>
 #ifndef __GENKSYMS__
 #include <net/sock.h>
 #endif
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/ve.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -36,6 +38,12 @@ EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net = {
 	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+#ifdef CONFIG_VE
+	.owner_ve = &ve0,
+#ifdef CONFIG_VE_IPTABLES
+	._iptables_modules = VE_IP_NONE,
+#endif
+#endif
 };
 EXPORT_SYMBOL(init_net);
 
@@ -55,7 +63,7 @@ static struct net_generic *net_alloc_generic(void)
 	return ng;
 }
 
-static int net_assign_generic(struct net *net, int id, void *data)
+int net_assign_generic(struct net *net, int id, void *data)
 {
 	struct net_generic *ng, *old_ng;
 
@@ -91,6 +99,7 @@ static int net_assign_generic(struct net *net, int id, void *data)
 	ng->ptr[id - 1] = data;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(net_assign_generic);
 
 static int ops_init(const struct pernet_operations *ops, struct net *net)
 {
@@ -261,7 +270,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
 	spin_lock_irqsave(&net->nsid_lock, flags);
 	peer = idr_find(&net->netns_ids, id);
 	if (peer)
-		get_net(peer);
+		peer = maybe_get_net(peer);
 	spin_unlock_irqrestore(&net->nsid_lock, flags);
 	rcu_read_unlock();
 
@@ -288,6 +297,10 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	int error = 0;
 	LIST_HEAD(net_exit_list);
 
+#ifdef CONFIG_VE
+	net->owner_ve = get_ve(get_exec_env());
+#endif
+
 	atomic_set(&net->count, 1);
 	atomic_set(&net->passive, 1);
 	net->dev_base_seq = 1;
@@ -317,6 +330,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 		ops_free_list(ops, &net_exit_list);
 
 	rcu_barrier();
+#ifdef CONFIG_VE
+	put_ve(net->owner_ve);
+#endif
 	goto out;
 }
 
@@ -363,6 +379,7 @@ void net_drop_ns(void *p)
 struct net *copy_net_ns(unsigned long flags,
 			struct user_namespace *user_ns, struct net *old_net)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct ucounts *ucounts;
 	struct net *net;
 	int rv;
@@ -374,6 +391,9 @@ struct net *copy_net_ns(unsigned long flags,
 	if (!ucounts)
 		return ERR_PTR(-ENOSPC);
 
+	if (atomic_dec_if_positive(&ve->netns_avail_nr) < 0)
+		return ERR_PTR(-ENOMEM);
+
 	net = net_alloc();
 	if (!net) {
 		dec_net_namespaces(ucounts);
@@ -395,6 +415,7 @@ struct net *copy_net_ns(unsigned long flags,
 		dec_net_namespaces(ucounts);
 		put_user_ns(user_ns);
 		net_drop_ns(net);
+		atomic_inc(&ve->netns_avail_nr);
 		return ERR_PTR(rv);
 	}
 	return net;
@@ -409,12 +430,26 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
+	bool reload = false;
+	int i = 0;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	spin_lock_irq(&cleanup_list_lock);
-	list_replace_init(&cleanup_list, &net_kill_list);
+	list_for_each_entry_safe(net, tmp, &cleanup_list, cleanup_list)
+		if (++i == 16)
+			break;
+
+	if (i == 16) {
+		list_cut_position(&net_kill_list, &cleanup_list,
+						&net->cleanup_list);
+		reload = true;
+	} else
+		list_replace_init(&cleanup_list, &net_kill_list);
 	spin_unlock_irq(&cleanup_list_lock);
 
+	if (reload)
+		queue_work(netns_wq, work);
+
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
@@ -455,6 +490,15 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		struct ve_struct *ve = net->owner_ve;
+
+		atomic_inc(&ve->netns_avail_nr);
+		if (ve->ve_netns == net)
+			ve->ve_netns = NULL;
+		put_ve(ve);
+	}
+
 	mutex_unlock(&net_mutex);
 
 	/* Ensure there are no outstanding rcu callbacks using this
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3708,7 +3708,7 @@ static int __net_init pg_net_init(struct net *net)
 	pn->net = net;
 	INIT_LIST_HEAD(&pn->pktgen_threads);
 	pn->pktgen_exiting = false;
-	pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+	pn->proc_dir = proc_net_mkdir(pn->net, PG_PROC_DIR, pn->net->proc_net);
 	if (!pn->proc_dir) {
 		pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
 		return -ENODEV;
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -39,6 +39,7 @@
 #include <linux/if_vlan.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -2570,6 +2571,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx;
 	int s_idx = cb->family;
+	struct net *net = sock_net(skb->sk);
 
 	if (s_idx == 0)
 		s_idx = 1;
@@ -2580,6 +2582,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 		if (rtnl_msg_handlers[idx] == NULL ||
 		    rtnl_msg_handlers[idx][type].dumpit == NULL)
 			continue;
+		if (vz_security_family_check(net, idx, cb->nlh->nlmsg_type))
+			continue;
 		if (idx > s_idx) {
 			memset(&cb->args[0], 0, sizeof(cb->args));
 			cb->prev_seq = 0;
@@ -3866,6 +3870,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return 0;
 
 	family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+	if (vz_security_family_check(net, family, nlh->nlmsg_type))
+		return -EAFNOSUPPORT;
+
 	sz_idx = type>>2;
 	kind = type&3;
 
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -38,7 +38,6 @@
 #include <net/scm.h>
 #include <net/cls_cgroup.h>
 
-
 /*
  *	Only allow a user to send credentials, that they could set with
  *	setu(g)id.
@@ -54,6 +53,7 @@ static __inline__ int scm_check_creds(struct ucred *creds)
 		return -EINVAL;
 
 	if ((creds->pid == task_tgid_vnr(current) ||
+	     creds->pid == current->tgid ||
 	     ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
 	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) ||
 	      uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
@@ -81,7 +81,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 
 	if (!fpl)
 	{
-		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
@@ -338,7 +338,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 		return NULL;
 
 	new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
-			  GFP_KERNEL);
+			  GFP_KERNEL_ACCOUNT);
 	if (new_fpl) {
 		for (i = 0; i < fpl->count; i++)
 			get_file(fpl->fp[i]);
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4082,7 +4082,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 		return;
 
 	skb_orphan(skb);
-	skb->mark = 0;
+	if (!(skb->dev->features & NETIF_F_VENET))
+		skb->mark = 0;
 }
 EXPORT_SYMBOL_GPL(skb_scrub_packet);
 
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -131,6 +131,8 @@
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
 
+#include <bc/beancounter.h>
+
 #include <linux/filter.h>
 
 #include <trace/events/sock.h>
@@ -366,8 +368,8 @@ static void sock_warn_obsolete_bsdism(const char *name)
 	static char warncomm[TASK_COMM_LEN];
 	if (strcmp(warncomm, current->comm) && warned < 5) {
 		strcpy(warncomm,  current->comm);
-		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
-			warncomm, name);
+		ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
+			"%s SO_BSDCOMPAT\n", warncomm, name);
 		warned++;
 	}
 }
@@ -1554,6 +1556,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
+	extern int sysctl_tcp_use_sg;
+
 	sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
@@ -1568,6 +1572,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
 		}
 	}
+	if (!sysctl_tcp_use_sg)
+		sk->sk_route_caps &= ~NETIF_F_SG;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -2681,7 +2687,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 {
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags,
 					NULL);
 
 		if (prot->slab == NULL) {
@@ -2697,7 +2703,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
 								 prot->rsk_prot->obj_size, 0,
-								 SLAB_HWCACHE_ALIGN, NULL);
+								 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 
 			if (prot->rsk_prot->slab == NULL) {
 				pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -2716,7 +2722,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0,
-						  SLAB_HWCACHE_ALIGN |
+						  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
 							prot->slab_flags,
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
@@ -2883,7 +2889,7 @@ static const struct file_operations proto_seq_fops = {
 
 static __net_init int proto_init_net(struct net *net)
 {
-	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
+	if (!proc_net_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
 		return -ENOMEM;
 
 	return 0;
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -545,6 +545,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
+	if (!sysctl_tcp_use_sg)
+		newsk->sk_route_caps &= ~NETIF_F_SG;
 	newdp6 = (struct dccp6_sock *)newsk;
 	newinet = inet_sk(newsk);
 	newinet->pinet6 = &newdp6->inet6;
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -252,6 +252,7 @@ int dccp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
 	int err = 0;
 	const int old_state = sk->sk_state;
 
@@ -271,6 +272,10 @@ int dccp_disconnect(struct sock *sk, int flags)
 		sk->sk_err = ECONNRESET;
 
 	dccp_clear_xmit_timers(sk);
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_rx_ccid = NULL;
+	dp->dccps_hc_tx_ccid = NULL;
 
 	__skb_queue_purge(&sk->sk_receive_queue);
 	__skb_queue_purge(&sk->sk_write_queue);
@@ -1056,7 +1061,7 @@ void dccp_close(struct sock *sk, long timeout)
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -442,6 +442,15 @@ config INET_UDP_DIAG
 	  Support for UDP socket monitoring interface used by the ss tool.
 	  If unsure, say Y.
 
+config INET_RAW_DIAG
+	tristate "RAW: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	---help---
+	  Support for RAW socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
+
 menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_MEMCG_KMEM) += udp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -74,7 +74,6 @@
 #include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
@@ -310,6 +309,10 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(net, answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern &&
 	    !ns_capable(net->user_ns, CAP_NET_RAW))
@@ -1476,27 +1479,29 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long __snmp_fold_field(void __percpu *mib[], int offt,
+				const struct cpumask *mask)
 {
 	unsigned long res = 0;
 	int i, j;
 
-	for_each_possible_cpu(i) {
+	for_each_cpu(i, mask) {
 		for (j = 0; j < SNMP_ARRAY_SZ; j++)
 			res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
 	}
 	return res;
 }
-EXPORT_SYMBOL_GPL(snmp_fold_field);
+EXPORT_SYMBOL_GPL(__snmp_fold_field);
 
 #if BITS_PER_LONG==32
 
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 __snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset,
+			const struct cpumask *mask)
 {
 	u64 res = 0;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		void *bhptr;
 		struct u64_stats_sync *syncp;
 		u64 v;
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1447,7 +1447,7 @@ static const struct file_operations arp_seq_fops = {
 
 static int __net_init arp_net_init(struct net *net)
 {
-	if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
+	if (!proc_net_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
 		return -ENOMEM;
 	return 0;
 }
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -195,7 +195,7 @@ static void devinet_sysctl_unregister(struct in_device *idev)
 
 static struct in_ifaddr *inet_alloc_ifa(void)
 {
-	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT);
 }
 
 static void inet_rcu_free_ifa(struct rcu_head *head)
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -264,7 +264,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	if (fib_lookup(net, &fl4, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
-		if (res.type != RTN_LOCAL || !accept_local)
+		if (!(dev->features & NETIF_F_VENET) ||
+		    res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
 	fib_combine_itag(itag, &res);
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1935,11 +1935,11 @@ void __init fib_trie_init(void)
 {
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 					  sizeof(struct fib_alias),
-					  0, SLAB_PANIC, NULL);
+					  0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
 					   LEAF_SIZE,
-					   0, SLAB_PANIC, NULL);
+					   0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 }
 
 struct fib_table *fib_trie_table(u32 id)
@@ -2584,14 +2584,14 @@ static const struct file_operations fib_route_fops = {
 
 int __net_init fib_proc_init(struct net *net)
 {
-	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
+	if (!proc_net_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
 		goto out1;
 
-	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+	if (!proc_net_create("fib_triestat", S_IRUGO, net->proc_net,
 			 &fib_triestat_fops))
 		goto out2;
 
-	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
+	if (!proc_net_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
 		goto out3;
 
 	return 0;
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2331,11 +2331,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 	struct ip_sf_socklist *psl;
 	struct net *net = sock_net(sk);
 
+	ASSERT_RTNL();
+
 	if (!ipv4_is_multicast(addr))
 		return -EINVAL;
 
-	rtnl_lock();
-
 	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
 	imr.imr_address.s_addr = msf->imsf_interface;
 	imr.imr_ifindex = 0;
@@ -2356,7 +2356,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		goto done;
 	msf->imsf_fmode = pmc->sfmode;
 	psl = rtnl_dereference(pmc->sflist);
-	rtnl_unlock();
 	if (!psl) {
 		len = 0;
 		count = 0;
@@ -2375,7 +2374,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		return -EFAULT;
 	return 0;
 done:
-	rtnl_unlock();
 	return err;
 }
 
@@ -2389,6 +2387,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_sf_socklist *psl;
 
+	ASSERT_RTNL();
+
 	psin = (struct sockaddr_in *)&gsf->gf_group;
 	if (psin->sin_family != AF_INET)
 		return -EINVAL;
@@ -2396,8 +2396,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	if (!ipv4_is_multicast(addr))
 		return -EINVAL;
 
-	rtnl_lock();
-
 	err = -EADDRNOTAVAIL;
 
 	for_each_pmc_rtnl(inet, pmc) {
@@ -2409,7 +2407,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 		goto done;
 	gsf->gf_fmode = pmc->sfmode;
 	psl = rtnl_dereference(pmc->sflist);
-	rtnl_unlock();
 	count = psl ? psl->sl_count : 0;
 	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
 	gsf->gf_numsrc = count;
@@ -2429,7 +2426,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	}
 	return 0;
 done:
-	rtnl_unlock();
 	return err;
 }
 
@@ -2837,10 +2833,10 @@ static int __net_init igmp_net_init(struct net *net)
 	struct proc_dir_entry *pde;
 	int err;
 
-	pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
+	pde = proc_net_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
 	if (!pde)
 		goto out_igmp;
-	pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+	pde = proc_net_create("mcfilter", S_IRUGO, net->proc_net,
 			  &igmp_mcf_seq_fops);
 	if (!pde)
 		goto out_mcfilter;
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
+#include <net/tcp.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -733,7 +734,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	orphan_count_dec(sk);
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
@@ -750,7 +751,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
 
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 	inet_sk(sk)->inet_num = 0;
 }
 EXPORT_SYMBOL(inet_csk_prepare_forced_close);
@@ -829,7 +830,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		sock_orphan(child);
 
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		orphan_count_inc(sk);
 
 		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -193,6 +193,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
 			goto errout;
 
+	/*
+	 * RAW sockets might have user-defined protocols assigned,
+	 * so report the one supplied on socket creation.
+	 */
+	if (sk->sk_type == SOCK_RAW) {
+		if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+			goto errout;
+	}
+
 	if (!icsk) {
 		handler->idiag_get_info(sk, r, NULL);
 		goto out;
@@ -860,7 +869,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 
 				if (!net_eq(sock_net(sk), net))
 					continue;
-
 				if (num < s_num) {
 					num++;
 					continue;
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
@@ -295,6 +296,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 		return NULL;
 
 	q->net = nf;
+
 	f->constructor(q, arg);
 	add_frag_mem_limit(q, f->qsize);
 
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -75,6 +75,7 @@ int ip_forward(struct sk_buff *skb)
 	struct iphdr *iph;	/* Our header */
 	struct rtable *rt;	/* Route we use */
 	struct ip_options *opt	= &(IPCB(skb)->opt);
+	unsigned int hroom;
 
 	/* that should never happen */
 	if (skb->pkt_type != PACKET_HOST)
@@ -119,14 +120,37 @@ int ip_forward(struct sk_buff *skb)
 		goto drop;
 	}
 
+	/*
+	 * We try to optimize forwarding of VE packets:
+	 * do not decrement TTL (and so save skb_cow)
+	 * during forwarding of outgoing pkts from VE.
+	 * For incoming pkts we still do ttl decr,
+	 * since such skb is not cloned and does not require
+	 * actual cow. So, there is at least one place
+	 * in pkts path with mandatory ttl decr, that is
+	 * sufficient to prevent routing loops.
+	 */
+	hroom = LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len;
+	if (
+#ifdef CONFIG_IP_ROUTE_NAT
+	    (rt->rt_flags & RTCF_NAT) == 0 &&	  /* no NAT mangling expected */
+#endif						  /* and */
+	    (skb->dev->features & NETIF_F_VENET) && /* src is VENET device and */
+	    (skb_headroom(skb) >= hroom)) {	  /* skb has enough headroom */
+		iph = ip_hdr(skb);
+		goto no_ttl_decr;
+	}
+
 	/* We are about to mangle packet. Copy it! */
-	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
+	if (skb_cow(skb, hroom))
 		goto drop;
 	iph = ip_hdr(skb);
 
 	/* Decrease ttl after skb cow done */
 	ip_decrease_ttl(iph);
 
+no_ttl_decr:
+
 	/*
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -195,10 +195,11 @@ static void ip_evictor(struct net *net)
  */
 static void ip_expire(unsigned long arg)
 {
+	struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
 	struct ipq *qp;
 	struct net *net;
 
-	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+	qp = container_of(q, struct ipq, q);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
 
 	spin_lock(&qp->q.lock);
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -679,6 +679,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->type		= ARPHRD_IPGRE;
 	ip_tunnel_setup(dev, ipgre_net_id);
+	dev->features |= NETIF_F_VIRTUAL;
 }
 
 static void __gre_tunnel_init(struct net_device *dev)
@@ -935,6 +936,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
 	ip_tunnel_setup(dev, gre_tap_net_id);
+	dev->features |= NETIF_F_VIRTUAL;
 }
 
 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -192,6 +192,8 @@ static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 
+	if (skb->destructor)
+		skb_orphan(skb);
 	__skb_pull(skb, skb_network_header_len(skb));
 
 	rcu_read_lock();
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -286,6 +286,10 @@ int ip_options_compile(struct net *net,
 			optptr++;
 			continue;
 		}
+		if (unlikely(l < 2)) {
+			pp_ptr = optptr;
+			goto error;
+		}
 		optlen = optptr[1];
 		if (optlen<2 || optlen>l) {
 			pp_ptr = optptr;
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -377,6 +377,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
 	memcpy(&iph->saddr, &fl4->saddr,
 	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
 }
+EXPORT_SYMBOL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 {
@@ -563,6 +564,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 	hlen = iph->ihl * 4;
 	mtu = mtu - hlen;	/* Size of data space */
 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
@@ -578,14 +580,15 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
 		    ip_is_fragment(iph) ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) ||
+		    skb_headroom(skb) < ll_rs)
 			goto slow_path;
 
 		skb_walk_frags(skb, frag) {
 			/* Correct geometry. */
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
-			    skb_headroom(frag) < hlen)
+			    skb_headroom(frag) < hlen + ll_rs)
 				goto slow_path_clean;
 
 			/* Partially cloned skb? */
@@ -675,8 +678,6 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 	left = skb->len - hlen;		/* Space per frame */
 	ptr = hlen;		/* Where to start from */
 
-	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
-
 	/*
 	 *	Fragment the datagram.
 	 */
@@ -1564,6 +1565,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
 		return;
 
+	saddr = ip_hdr(skb)->daddr;
 	ipc.addr = daddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -503,6 +503,7 @@ static bool setsockopt_needs_rtnl(int optname)
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 	case MCAST_UNBLOCK_SOURCE:
+	case IP_ROUTER_ALERT:
 		return true;
 	}
 	return false;
@@ -1140,6 +1141,10 @@ int ip_setsockopt(struct sock *sk, int level,
 			optname != IP_IPSEC_POLICY &&
 			optname != IP_XFRM_POLICY &&
 			!ip_mroute_opt(optname)) {
+
+		if (!ve_ipt_permitted(net, VE_IP_FILTER))
+			return -ENOPROTOOPT;
+
 		lock_sock(sk);
 		err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
 		release_sock(sk);
@@ -1185,11 +1190,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
  *	the _received_ ones. The set sets the _sent_ ones.
  */
 
+static bool getsockopt_needs_rtnl(int optname)
+{
+	switch (optname) {
+	case IP_MSFILTER:
+	case MCAST_MSFILTER:
+		return true;
+	}
+	return false;
+}
+
 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			    char __user *optval, int __user *optlen, unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int val;
+	bool needs_rtnl = getsockopt_needs_rtnl(optname);
+	int val, err = 0;
 	int len;
 
 	if (level != SOL_IP)
@@ -1203,6 +1219,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	if (len < 0)
 		return -EINVAL;
 
+	if (needs_rtnl)
+		rtnl_lock();
 	lock_sock(sk);
 
 	switch (optname) {
@@ -1317,39 +1335,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MSFILTER:
 	{
 		struct ip_msfilter msf;
-		int err;
 
 		if (len < IP_MSFILTER_SIZE(0)) {
-			release_sock(sk);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
-			release_sock(sk);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		err = ip_mc_msfget(sk, &msf,
 				   (struct ip_msfilter __user *)optval, optlen);
-		release_sock(sk);
-		return err;
+		goto out;
 	}
 	case MCAST_MSFILTER:
 	{
 		struct group_filter gsf;
-		int err;
 
 		if (len < GROUP_FILTER_SIZE(0)) {
-			release_sock(sk);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
-			release_sock(sk);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		err = ip_mc_gsfget(sk, &gsf,
 				   (struct group_filter __user *)optval,
 				   optlen);
-		release_sock(sk);
-		return err;
+		goto out;
 	}
 	case IP_MULTICAST_ALL:
 		val = inet->mc_all;
@@ -1416,6 +1430,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 	}
 	return 0;
+
+out:
+	release_sock(sk);
+	if (needs_rtnl)
+		rtnl_unlock();
+	return err;
 }
 
 int ip_getsockopt(struct sock *sk, int level,
@@ -1433,6 +1453,9 @@ int ip_getsockopt(struct sock *sk, int level,
 		if (get_user(len, optlen))
 			return -EFAULT;
 
+		if (!ve_ipt_permitted(net, VE_IP_FILTER))
+			return -ENOENT;
+
 		lock_sock(sk);
 		err = nf_getsockopt(sk, PF_INET, optname, optval,
 				&len);
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -58,6 +58,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+	if (itn == NULL)
+		return -EINVAL;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
@@ -256,6 +259,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 	int protocol = iph->protocol;
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+	if (itn == NULL)
+		return -1;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->daddr, iph->saddr, 0);
 	if (!tunnel)
@@ -413,6 +419,9 @@ static int __net_init vti_init_net(struct net *net)
 	int err;
 	struct ip_tunnel_net *itn;
 
+	if (!ve_is_super(net->owner_ve))
+		return net_assign_generic(net, vti_net_id, NULL);
+
 	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
 	if (err)
 		return err;
@@ -424,6 +433,9 @@ static int __net_init vti_init_net(struct net *net)
 static void __net_exit vti_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	if (itn == NULL)
+		return;
 	ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
@@ -473,6 +485,9 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct ip_tunnel_parm parms;
 
+	if (net_generic(dev_net(dev), vti_net_id) == NULL)
+		return -EACCES;
+
 	vti_netlink_parms(data, &parms);
 	return ip_tunnel_newlink(dev, tb, &parms);
 }
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -107,6 +107,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <uapi/linux/vzcalluser.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -142,6 +143,9 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	const int code = icmp_hdr(skb)->code;
 
 	err = -ENOENT;
+	if (itn == NULL)
+		goto out;
+
 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			     iph->daddr, iph->saddr, 0);
 	if (t == NULL)
@@ -191,6 +195,10 @@ static int ipip_rcv(struct sk_buff *skb)
 	const struct iphdr *iph;
 
 	iph = ip_hdr(skb);
+
+	if (itn == NULL)
+		return -1;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			iph->saddr, iph->daddr, 0);
 	if (tunnel) {
@@ -292,6 +300,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	netif_keep_dst(dev);
 
 	dev->features		|= IPIP_FEATURES;
+	dev->features		|= NETIF_F_VIRTUAL;
 	dev->hw_features	|= IPIP_FEATURES;
 	ip_tunnel_setup(dev, ipip_net_id);
 }
@@ -347,6 +356,9 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct ip_tunnel_parm p;
 
+	if (net_generic(dev_net(dev), ipip_net_id) == NULL)
+		return -EACCES;
+
 	ipip_netlink_parms(data, &p);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
@@ -433,13 +445,21 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
 
 static int __net_init ipip_init_net(struct net *net)
 {
+	if (!(net->owner_ve->features & VE_FEATURE_IPIP))
+		return net_assign_generic(net, ipip_net_id, NULL);
+
 	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
 }
 
 static void __net_exit ipip_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+
+	if (itn == NULL) /* no VE_FEATURE_IPIP */
+		return;
+
 	ip_tunnel_delete_net(itn, &ipip_link_ops);
+	net_assign_generic(net, ipip_net_id, NULL);
 }
 
 static struct pernet_operations ipip_net_ops = {
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -265,7 +265,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 	return 0;
 
 err2:
-	kfree(mrt);
+	ipmr_free_table(mrt);
 err1:
 	fib_rules_unregister(ops);
 	return err;
@@ -1253,7 +1253,7 @@ static void mrtsock_destruct(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
 
-	rtnl_lock();
+	ASSERT_RTNL();
 	ipmr_for_each_table(mrt, net) {
 		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1264,7 +1264,6 @@ static void mrtsock_destruct(struct sock *sk)
 			mroute_clean_tables(mrt);
 		}
 	}
-	rtnl_unlock();
 }
 
 /*
@@ -1320,7 +1319,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	case MRT_DONE:
 		if (sk != rcu_access_pointer(mrt->mroute_sk))
 			return -EACCES;
-		return ip_ra_control(sk, 0, NULL);
+		ret = ip_ra_control(sk, 0, NULL);
+		rtnl_unlock();
+		return ret;
 	case MRT_ADD_VIF:
 	case MRT_DEL_VIF:
 		if (optlen != sizeof(vif))
@@ -2717,9 +2718,9 @@ static int __net_init ipmr_net_init(struct net *net)
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
-	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
+	if (!proc_net_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
 		goto proc_vif_fail;
-	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
+	if (!proc_net_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
 		goto proc_cache_fail;
 #endif
 	return 0;
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -428,5 +428,11 @@ config IP_NF_ARP_MANGLE
 
 endif # IP_NF_ARPTABLES
 
+config VE_IP_NF_VZPRIVNET
+	tristate "VE private network filtering"
+	default m
+	depends on IP_NF_IPTABLES && m
+	help
+	  This option builds the ip_vzprivnet module, which filters traffic
+	  between VE private subnets.
 endmenu
 
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+obj-$(CONFIG_VE_IP_NF_VZPRIVNET) += ip_vzprivnet.o
 
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -25,6 +25,7 @@
 #include <net/compat.h>
 #include <net/sock.h>
 #include <asm/uaccess.h>
+#include <linux/fence-watchdog.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
@@ -112,6 +113,14 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
 
 #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (FWINV((arpinfo->flags & ARPT_WDOGTMO) && !fence_wdog_tmo_match(),
+		  ARPT_INV_WDOGTMO)) {
+		dprintf("Watchdog timeout mismatch.\n");
+		return 0;
+	}
+#endif
+
 	if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
 		  ARPT_INV_ARPOP)) {
 		dprintf("ARP operation field mismatch.\n");
@@ -591,6 +600,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -1235,6 +1248,10 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
 	entry_offset = (void *)e - (void *)base;
 
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,6 +301,9 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_action_param acpar;
 	unsigned int addend;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	ip = ip_hdr(skb);
 	indev = state->in ? state->in->name : nulldevname;
@@ -465,8 +468,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			int visited = e->comefrom & (1 << hook);
 
 			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
-				pr_err("iptables: loop hook %u pos %u %08X.\n",
-				       hook, pos, e->comefrom);
+				ve_printk(VE_LOG, "iptables: loop hook %u pos "
+						  "%u %08X.\n",
+					  hook, pos, e->comefrom);
 				return 0;
 			}
 			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
@@ -744,6 +748,10 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -925,7 +933,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vzalloc(countersize);
+	counters = vzalloc_account(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1193,7 +1201,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc_account(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1472,6 +1480,10 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
@@ -1700,13 +1712,17 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len);
+
 static int
 compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 		      unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1719,8 +1735,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 		break;
 
 	default:
-		duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
-		ret = -EINVAL;
+		ret = do_ipt_set_ctl(sk, cmd, user, len);
 	}
 
 	return ret;
@@ -1814,9 +1829,10 @@ static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
 static int
 compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1836,9 +1852,10 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 static int
 do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1861,9 +1878,10 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 static int
 do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2061,12 +2079,19 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
 
 static int __net_init ip_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV4);
+	int res;
+
+	res = xt_proto_init(net, NFPROTO_IPV4);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES);
+	return res;
 }
 
 static void __net_exit ip_tables_net_exit(struct net *net)
 {
 	xt_proto_fini(net, NFPROTO_IPV4);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES);
 }
 
 static struct pernet_operations ip_tables_net_ops = {
--- /dev/null
+++ b/net/ipv4/netfilter/ip_vzprivnet.c
@@ -0,0 +1,1151 @@
+/*
+ *  net/ipv4/netfilter/ip_vzprivnet.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * This is the implementation of private network filtering.
+ * How it works:
+ *   _______      _______       _______
+ *  |  VE1  |    |  VE2  |     | VE-N  |
+ *  |_______|    |_______|     |_______|
+ *      | venet      | venet       | venet
+ *      |            |             |
+ *      |_______ip_forward__ ... __| VE0
+ *             vzfilter_hook
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/log2.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/inet.h>
+#include <asm/page.h>
+
+#include <linux/vzprivnet.h>
+#define VZPRIV_PROCNAME "ip_vzprivnet"
+
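+/* lookup_stat[0] counts routed-path lookups, lookup_stat[1] bridged ones */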
+static DEFINE_PER_CPU(unsigned long, lookup_stat[2]);
+
+struct vzprivnet {
+	u32 nmask;
+	int weak;
+};
+
+struct vzprivnet_sparse {
+	struct vzprivnet pn;
+
+	unsigned int netid;
+	struct list_head list;
+	struct list_head entries;
+};
+
+struct vzprivnet_range {
+	struct vzprivnet *pn;
+
+	/* In big-endian */
+	u32 netip;
+	u32 rmask;
+	struct rb_node node;
+};
+
+struct vzprivnet_entry {
+	struct vzprivnet_range range;
+	struct list_head list;
+};
+
+static DEFINE_RWLOCK(vzprivlock);
+static LIST_HEAD(vzpriv_sparse);
+static struct rb_root entries_root = RB_ROOT;
+
+/*
+ * Tree helpers
+ */
+
+static struct rb_root rbroot = RB_ROOT;
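+
+/*
+ * Both rb-trees keep non-overlapping [netip, netip | ~rmask] ranges ordered
+ * by host-order start address; tree_search() returns the range containing
+ * 'ip', or NULL when it falls outside every range.
+ */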
+/* ip: big-endian IP address */
+static struct vzprivnet_range *tree_search(struct rb_root *root, u32 ip)
+{
+	struct rb_node *node = root->rb_node;
+
+	ip = ntohl(ip);
+	while (node) {
+		struct vzprivnet_range *p = rb_entry(node, struct vzprivnet_range, node);
+		u32 start, end;
+
+		start = ntohl(p->netip);
+		end = start | ~ntohl(p->rmask);
+
+		if (ip <= end) {
+			if (start <= ip)
+				return p;
+
+			node = node->rb_left;
+		} else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct vzprivnet_range *legacy_search(u32 ip)
+{
+	return tree_search(&rbroot, ip);
+}
+
+static int tree_insert(struct rb_root *root, struct vzprivnet_range *data)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u32 ip;
+	u32 end_ip;
+
+	ip = ntohl(data->netip);
+	end_ip = ip | ~ntohl(data->rmask);
+
+	while (*link) {
+		struct vzprivnet_range *p = rb_entry(*link, struct vzprivnet_range, node);
+		u32 start, end;
+
+		start = ntohl(p->netip);
+		end = start | ~ntohl(p->rmask);
+
+		if (!(ip > end || start > end_ip))
+			return -EEXIST;
+
+		parent = *link;
+		if (ip < end)
+			link = &((*link)->rb_left);
+		else
+			link = &((*link)->rb_right);
+	}
+
+	/* Add link node and rebalance tree. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+
+	return 0;
+}
+
+static int legacy_insert(struct vzprivnet_range *data)
+{
+	return tree_insert(&rbroot, data);
+}
+
+static void legacy_delete(struct vzprivnet_range *p)
+{
+	rb_erase(&p->node, &rbroot);
+}
+
+static struct vzprivnet_range *legacy_first(void)
+{
+	struct rb_node *node;
+
+	node = rb_first(&rbroot);
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct vzprivnet_range, node);
+}
+
+static struct vzprivnet_range *legacy_next(struct vzprivnet_range *p)
+{
+	struct rb_node *node;
+
+	node = rb_next(&p->node);
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct vzprivnet_range, node);
+}
+
+/*
+ * Generic code
+ */
+
+static struct vzprivnet vzpriv_internet = {
+	.nmask = 0,
+	.weak = VZPRIVNET_INET
+};
+
+static struct vzprivnet *vzpriv_search(u32 ip)
+{
+	struct vzprivnet_range *pnr;
+
+	pnr = tree_search(&entries_root, ip);
+	if (pnr == NULL)
+		pnr = legacy_search(ip);
+
+	if (pnr != NULL)
+		return pnr->pn;
+	else
+		return &vzpriv_internet;
+}
+
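+/*
+ * Two addresses within the same private network may talk only when they
+ * fall into the same nmask subnet; addresses from different networks may
+ * talk only when the combined 'weak' level of both endpoints is high
+ * enough, otherwise the packet is dropped.
+ */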
+static noinline unsigned int vzprivnet_classify(struct sk_buff *skb, int type)
+{
+	int res;
+	u32 saddr, daddr;
+	struct vzprivnet *p1, *p2;
+
+	per_cpu(lookup_stat[type], smp_processor_id())++;
+
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
+	read_lock(&vzprivlock);
+	p1 = vzpriv_search(saddr);
+	p2 = vzpriv_search(daddr);
+
+	if (p1 == p2) {
+		if ((saddr & p1->nmask) == (daddr & p1->nmask))
+			res = NF_ACCEPT;
+		else
+			res = NF_DROP;
+	} else {
+		if (p1->weak + p2->weak >= 3)
+			res = NF_ACCEPT;
+		else
+			res = NF_DROP;
+	}
+
+	read_unlock(&vzprivlock);
+	return res;
+}
+
+int vzpn_handle_bridged = 0;
+EXPORT_SYMBOL(vzpn_handle_bridged);
+
+int vzpn_filter_host = 0;
+EXPORT_SYMBOL(vzpn_filter_host);
+
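+/*
+ * Common entry point for the netfilter hooks below.  A packet whose dst
+ * output method is not ip_output() is considered bridged and is classified
+ * only when vzpn_handle_bridged is enabled.
+ */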
+static unsigned int vzprivnet_hook(struct sk_buff *skb, int can_be_bridge)
+{
+	struct dst_entry *dst;
+	struct net *src_net;
+
+	if (WARN_ON_ONCE(!skb->dev && !skb->sk))
+		return NF_ACCEPT;
+
+	src_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	if (!ve_is_super(src_net->owner_ve))
+		return NF_ACCEPT;
+
+	dst = skb_dst(skb);
+	if (dst != NULL && can_be_bridge && dst->output != ip_output) { /* bridge */
+		if (vzpn_handle_bridged)
+			return vzprivnet_classify(skb, 1);
+		else
+			return NF_ACCEPT;
+	}
+
+	return vzprivnet_classify(skb, 0);
+}
+
+static unsigned int vzprivnet_fwd_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_hook(skb, 1);
+}
+
+static unsigned int vzprivnet_host_hook(struct sk_buff *skb,
+		const struct net_device *dev, int can_be_bridge)
+{
+	if (!vzpn_filter_host)
+		return NF_ACCEPT;
+
+	/*
+	 * Only packets coming from venet or going to one matter
+	 */
+	if (!(dev->features & NETIF_F_VENET))
+		return NF_ACCEPT;
+
+	return vzprivnet_hook(skb, can_be_bridge);
+}
+
+static unsigned int vzprivnet_in_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_host_hook(skb, in, 0); /* bridge doesn't call it */
+}
+
+static unsigned int vzprivnet_out_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_host_hook(skb, out, 1);
+}
+
+static struct nf_hook_ops vzprivnet_ops[] = {
+	{
+		.hook = vzprivnet_fwd_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_FORWARD,
+		.priority = NF_IP_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet_in_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_LOCAL_IN,
+		.priority = NF_IP_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet_out_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.priority = NF_IP_PRI_FIRST
+	},
+};
+
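+/* prefix <-> big-endian netmask helpers, e.g. 24 <-> 255.255.255.0 */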
+static inline u32 to_netmask(int prefix)
+{
+	return htonl((~0 << (32 - prefix)));
+}
+
+static inline unsigned int to_prefix(u32 netmask)
+{
+	netmask = ntohl(netmask);
+	return 32 - ilog2(~netmask + 1);
+}
+
+static char *nextline(char *s)
+{
+	while (*s && *s != '\n') s++;
+	while (*s && *s == '\n') s++;
+	return s;
+}
+
+static int vzprivnet_add(u32 net, u32 m1, u32 m2, int weak)
+{
+	struct vzprivnet_range *p;
+	struct vzprivnet *pn;
+	int err;
+
+	p = kmalloc(sizeof(struct vzprivnet_range), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	pn = kmalloc(sizeof(struct vzprivnet), GFP_KERNEL);
+	if (!pn) {
+		kfree(p);
+		return -ENOMEM;
+	}
+
+	p->pn = pn;
+	p->netip = net;
+	p->rmask = m1;
+	pn->nmask = m2;
+	pn->weak = weak;
+
+	write_lock_bh(&vzprivlock);
+	err = legacy_insert(p);
+	write_unlock_bh(&vzprivlock);
+	if (err) {
+		kfree(pn);
+		kfree(p);
+	}
+
+	return err;
+}
+
+static int vzprivnet_del(u32 net)
+{
+	struct vzprivnet_range *p;
+
+	write_lock_bh(&vzprivlock);
+	p = legacy_search(net);
+	if (p == NULL) {
+		write_unlock_bh(&vzprivlock);
+		return -ENOENT;
+	}
+
+	legacy_delete(p);
+	write_unlock_bh(&vzprivlock);
+	kfree(p->pn);
+	kfree(p);
+	return 0;
+}
+
+static void sparse_free_one(struct vzprivnet_sparse *pns);
+static void vzprivnet_cleanup(void)
+{
+	struct vzprivnet_range *p;
+	struct vzprivnet_sparse *pns;
+
+	write_lock_bh(&vzprivlock);
+	while (1) {
+		p = legacy_first();
+		if (!p)
+			break;
+		legacy_delete(p);
+		kfree(p->pn);
+		kfree(p);
+	}
+
+	while (!list_empty(&vzpriv_sparse)) {
+		pns = list_first_entry(&vzpriv_sparse,
+				struct vzprivnet_sparse, list);
+		sparse_free_one(pns);
+	}
+	write_unlock_bh(&vzprivlock);
+}
+
+/*     +a.b.c.d/M1/M2
+ * or
+ *     -a.b.c.d/M1/M2
+ *
+ * add: 0 - delete, 1 - add
+ * if delete, netmasks don't matter
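+ *
+ * M1 is the prefix length of the whole private range, M2 the prefix used
+ * to compare two addresses inside it (see vzprivnet_classify()); a trailing
+ * '*' marks the range as weak.  Example (assuming the /proc/vz/privnet/legacy
+ * file created in iptable_vzprivnet_init() below):
+ *   echo "+10.0.0.0/8/24" > /proc/vz/privnet/legacy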
+ */
+static int parse_param(const char *param, int *add, u32 *net,
+			u32 *netmask1, u32 *netmask2, int *weak)
+{
+	int err;
+	unsigned char ch, e;
+	unsigned int a,b,c,d;
+	unsigned int m1, m2;
+
+	if (!*param)
+		return -EINVAL;
+
+	ch = *param;
+	if (ch != '+' && ch != '-')
+		return -EINVAL;
+
+	param++;
+	err = sscanf(param, "%u.%u.%u.%u/%u/%u%c\n",
+				&a, &b, &c, &d, &m1, &m2, &e);
+	if (err < 4 || (a == 0 || a > 255 || b > 255 || c > 255 || d > 255))
+		return -EINVAL;
+
+	*weak = VZPRIVNET_STRONG;
+	if (err == 7) {
+		if (e == '*')
+			*weak = VZPRIVNET_WEAK;
+		else if (e != '\n' || !isspace(e))
+			return -EINVAL;
+	}
+
+	*net = htonl((a << 24) + (b << 16) + (c << 8) + d);
+	if (ch == '+') {
+		if (err < 6 || m1 == 0 || m1 > 32 || m2 == 0 || m2 > 32)
+			return -EINVAL;
+
+		*netmask1 = to_netmask(m1);
+		*netmask2 = to_netmask(m2);
+		*net &= *netmask1;
+	} else
+		*netmask1 = *netmask2 = 0;
+
+	*add = (ch == '+') ? 1 : 0;
+	return 0;
+}
+
+static ssize_t vzpriv_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	err = copy_from_user(page, buf, count);
+	if (err)
+		goto err;
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		u32 net, m1, m2;
+		int add, weak;
+
+		err = parse_param(s, &add, &net, &m1, &m2, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = vzprivnet_add(net, m1, m2, weak);
+		else
+			err = vzprivnet_del(net);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *vzprivnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	unsigned int n = *pos;
+
+	read_lock_bh(&vzprivlock);
+	if (n > 0) {
+		struct vzprivnet_range *p;
+
+		p = legacy_first();
+		while (n-- && p)
+			p = legacy_next(p);
+
+		return p;
+	}
+
+	return legacy_first();
+}
+
+static void *vzprivnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return legacy_next(v);
+}
+
+static void vzprivnet_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&vzprivlock);
+}
+
+static int vzprivnet_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet_range *p = v;
+
+	seq_printf(s, "%pI4/%u/%u", &p->netip,
+		   to_prefix(p->rmask), to_prefix(p->pn->nmask));
+	if (p->pn->weak == VZPRIVNET_WEAK)
+		seq_printf(s, "*\n");
+	else
+		seq_printf(s, "\n");
+	return 0;
+}
+
+static struct seq_operations vzprivnet_seq_ops = {
+	.start = vzprivnet_seq_start,
+	.next  = vzprivnet_seq_next,
+	.stop  = vzprivnet_seq_stop,
+	.show  = vzprivnet_seq_show,
+};
+
+static int vzprivnet_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vzprivnet_seq_ops);
+}
+
+static struct file_operations proc_vzprivnet_ops = {
+	.owner   = THIS_MODULE,
+	.open    = vzprivnet_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = vzpriv_write,
+};
+
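+/*
+ * Attach an IP or subnet entry to private network 'netid', creating the
+ * network on first use; ip == 0 with VZPRIVNET_WEAK only marks the network
+ * as weak.
+ */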
+static int sparse_add(unsigned int netid, u32 ip, u32 mask, int weak)
+{
+	int err;
+	struct vzprivnet_sparse *pns, *epns = NULL;
+	struct vzprivnet_entry *pne = NULL;
+
+	err = -ENOMEM;
+
+	pns = kmalloc(sizeof(struct vzprivnet_sparse), GFP_KERNEL);
+	if (pns == NULL)
+		goto out;
+
+	pne = kmalloc(sizeof(struct vzprivnet_entry), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzprivlock);
+	list_for_each_entry(epns, &vzpriv_sparse, list)
+		if (epns->netid == netid) {
+			kfree(pns);
+			pns = epns;
+			goto found_net;
+		}
+
+	pns->netid = netid;
+	pns->pn.nmask = 0;
+	pns->pn.weak = VZPRIVNET_STRONG;
+	INIT_LIST_HEAD(&pns->entries);
+
+found_net:
+	if (ip != 0) {
+		pne->range.netip = ip & mask;
+		pne->range.rmask = mask;
+		pne->range.pn = &pns->pn;
+		err = tree_insert(&entries_root, &pne->range);
+		if (err)
+			goto out_unlock;
+
+		list_add_tail(&pne->list, &pns->entries);
+		pne = NULL;
+	} else if (weak == VZPRIVNET_WEAK) {
+		pns->pn.weak = VZPRIVNET_WEAK;
+	} else if (pns == epns) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (pns != epns) {
+		list_add_tail(&pns->list, &vzpriv_sparse);
+		pns = NULL;
+	}
+
+	err = 0;
+
+out_unlock:
+	write_unlock_bh(&vzprivlock);
+out:
+	if (pns != epns)
+		kfree(pns);
+	kfree(pne);
+
+	return err;
+}
+
+static void sparse_free_entry(struct vzprivnet_entry *pne)
+{
+	list_del(&pne->list);
+	rb_erase(&pne->range.node, &entries_root);
+	kfree(pne);
+}
+
+static void sparse_free_one(struct vzprivnet_sparse *pns)
+{
+	struct vzprivnet_entry *pne;
+
+	list_del(&pns->list);
+
+	while (!list_empty(&pns->entries)) {
+		pne = list_first_entry(&pns->entries,
+				struct vzprivnet_entry, list);
+		sparse_free_entry(pne);
+	}
+
+	kfree(pns);
+}
+
+static int sparse_del_net(unsigned int netid, int weak)
+{
+	struct vzprivnet_sparse *pns;
+
+	list_for_each_entry(pns, &vzpriv_sparse, list)
+		if (pns->netid == netid) {
+			if (weak == VZPRIVNET_WEAK)
+				pns->pn.weak = VZPRIVNET_STRONG;
+			else
+				sparse_free_one(pns);
+			return 0;
+		}
+
+	return -ENOENT;
+}
+
+static int sparse_del_ip(u32 ip)
+{
+	struct vzprivnet_range *rng;
+	struct vzprivnet_entry *pne;
+
+	rng = tree_search(&entries_root, ip);
+	if (rng == NULL)
+		return -ENOENT;
+
+	pne = container_of(rng, struct vzprivnet_entry, range);
+	sparse_free_entry(pne);
+
+	return 0;
+}
+
+static int sparse_del(unsigned int netid, u32 ip, int weak)
+{
+	int err;
+
+	write_lock_bh(&vzprivlock);
+	if (ip != 0)
+		err = sparse_del_ip(ip);
+	else
+		err = sparse_del_net(netid, weak);
+	write_unlock_bh(&vzprivlock);
+
+	return err;
+}
+
+/*
+ * +ID			to add a network
+ * +ID:a.b.c.d		to add an IP to network
+ * +ID:a.b.c.d/m	to add a subnet to network
+ * +ID:*		to make a network weak
+ * -ID			to remove the whole network
+ * -a.b.c.d		to remove an IP or bounding subnet (from its network)
+ * -ID:*		to make a network "strong" ;)
+ *
+ *  No weak networks here!
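+ *
+ * Example (assuming the /proc/vz/privnet/sparse file created in
+ * iptable_vzprivnet_init() below):
+ *   echo "+1:192.168.0.0/24" > /proc/vz/privnet/sparse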
+ */
+
+static int parse_sparse_add(const char *str, unsigned int *netid, u32 *ip, u32 *mask, int *weak)
+{
+	unsigned int m;
+	char *end;
+
+	*netid = simple_strtol(str, &end, 10);
+	if (is_eol(*end)) {
+		*ip = 0;
+		return 0;
+	}
+
+	if (*end != ':')
+		return -EINVAL;
+
+	str = end + 1;
+	if (*str == '*') {
+		if (!is_eol(*(str + 1)))
+			return -EINVAL;
+
+		*weak = VZPRIVNET_WEAK;
+		return 0;
+	}
+
+	if (!in4_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (is_eol(*end)) {
+		*mask = -1; /* match only one IP */
+		return 0;
+	}
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	m = simple_strtol(str, &end, 10);
+	if (!is_eol(*end))
+		return -EINVAL;
+
+	*mask = to_netmask(m);
+	return 0;
+}
+
+static int parse_sparse_remove(const char *str, unsigned int *netid, u32 *ip, int *weak)
+{
+	char *end;
+
+	if (strchr(str, '.')) {
+		if (!in4_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+			return -EINVAL;
+	} else {
+		*netid = simple_strtol(str, &end, 10);
+		if (end[0] == ':' && end[1] == '*') {
+			end += 2;
+			*weak = VZPRIVNET_WEAK;
+		}
+	}
+
+	return (is_eol(*end) ? 0 : -EINVAL);
+}
+
+static int parse_sparse(const char *param, int *add,
+		unsigned int *netid, u32 *ip, u32 *mask, int *weak)
+{
+	if (param[0] == '+') {
+		*add = 1;
+		return parse_sparse_add(param + 1, netid, ip, mask, weak);
+	}
+
+	if (param[0] == '-') {
+		*add = 0;
+		return parse_sparse_remove(param + 1, netid, ip, weak);
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t sparse_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	err = copy_from_user(page, buf, count);
+	if (err)
+		goto err;
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add, weak = VZPRIVNET_STRONG;
+		unsigned int netid = 0;
+		u32 ip = 0, mask = 0;
+
+		err = parse_sparse(s, &add, &netid, &ip, &mask, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = sparse_add(netid, ip, mask, weak);
+		else
+			err = sparse_del(netid, ip, weak);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *sparse_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_bh(&vzprivlock);
+	return seq_list_start(&vzpriv_sparse, *pos);
+}
+
+static void *sparse_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &vzpriv_sparse, pos);
+}
+
+static void sparse_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&vzprivlock);
+}
+
+static int sparse_seq_show(struct seq_file *s, void *v)
+{
+	struct list_head *lh = v;
+	struct vzprivnet_sparse *pns;
+	struct vzprivnet_entry *pne;
+
+	pns = list_entry(lh, struct vzprivnet_sparse, list);
+	seq_printf(s, "%u: ", pns->netid);
+
+	if (pns->pn.weak == VZPRIVNET_WEAK)
+		seq_puts(s, "* ");
+
+	list_for_each_entry(pne, &pns->entries, list) {
+		seq_printf(s, "%pI4", &pne->range.netip);
+		if (~pne->range.rmask != 0) /* subnet */
+			seq_printf(s, "/%u", to_prefix(pne->range.rmask));
+		seq_putc(s, ' ');
+	}
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations sparse_seq_ops = {
+	.start = sparse_seq_start,
+	.next  = sparse_seq_next,
+	.stop  = sparse_seq_stop,
+	.show  = sparse_seq_show,
+};
+
+static int sparse_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sparse_seq_ops);
+}
+
+static struct file_operations proc_sparse_ops = {
+	.owner   = THIS_MODULE,
+	.open    = sparse_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = sparse_write,
+};
+
+static void (*show_more)(struct seq_file *s);
+static DEFINE_MUTEX(show_lock);
+
+static void vzprivnet_reg_swap(vzprivnet_show_fn old, vzprivnet_show_fn new)
+{
+	mutex_lock(&show_lock);
+	if (show_more == old)
+		show_more = new;
+	mutex_unlock(&show_lock);
+}
+
+static void vzprivnet_show_more(struct seq_file *f)
+{
+	mutex_lock(&show_lock);
+	if (show_more != NULL)
+		show_more(f);
+	mutex_unlock(&show_lock);
+}
+
+void vzprivnet_reg_show(vzprivnet_show_fn fn)
+{
+	vzprivnet_reg_swap(NULL, fn);
+}
+EXPORT_SYMBOL(vzprivnet_reg_show);
+
+void vzprivnet_unreg_show(vzprivnet_show_fn fn)
+{
+	vzprivnet_reg_swap(fn, NULL);
+}
+EXPORT_SYMBOL(vzprivnet_unreg_show);
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+	unsigned long sum[2];
+	int cpu;
+
+	sum[0] = sum[1] = 0;
+	for_each_possible_cpu(cpu) {
+		sum[0] += per_cpu(lookup_stat[0], cpu);
+		sum[1] += per_cpu(lookup_stat[1], cpu);
+	}
+
+	seq_printf(s, "Lookups: %lu\n", sum[0]);
+	seq_printf(s, "Br-lookups: %lu\n", sum[1]);
+	vzprivnet_show_more(s);
+
+	return 0;
+}
+
+static int stat_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &stat_seq_show, NULL);
+}
+
+static struct file_operations proc_stat_ops = {
+	.owner   = THIS_MODULE,
+	.open    = stat_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static char sample_ip[16];
+
+static ssize_t classify_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int len;
+	char *tmp;
+
+	len = count;
+	if (len >= sizeof(sample_ip))
+		len = sizeof(sample_ip) - 1;
+
+	if (copy_from_user(sample_ip, buf, len))
+		return -EFAULT;
+
+	sample_ip[len] = '\0';
+	tmp = strchr(sample_ip, '\n');
+	if (tmp)
+		*tmp = '\0';
+
+	return count;
+}
+
+static int classify_seq_show(struct seq_file *s, void *v)
+{
+	u32 ip;
+	struct vzprivnet_range *pnr;
+
+	seq_printf(s, "%s: ", sample_ip);
+
+	if (!in4_pton(sample_ip, sizeof(sample_ip), (u8 *)&ip, -1, NULL)) {
+		seq_puts(s, "invalid IP\n");
+		return 0;
+	}
+
+	read_lock(&vzprivlock);
+	pnr = tree_search(&entries_root, ip);
+	if (pnr != NULL) {
+		struct vzprivnet_sparse *pns;
+
+		pns = container_of(pnr->pn, struct vzprivnet_sparse, pn);
+		seq_printf(s, "net %u, ", pns->netid);
+		seq_printf(s, "rule %pI4", &pnr->netip);
+		if (~pnr->rmask != 0)
+			seq_printf(s, "/%u", to_prefix(pnr->rmask));
+		seq_putc(s, '\n');
+
+		goto out;
+	}
+
+	pnr = legacy_search(ip);
+	if (pnr != NULL) {
+		seq_printf(s, "legacy %pI4/%u/%u\n",
+				&pnr->netip,
+				to_prefix(pnr->rmask),
+				to_prefix(pnr->pn->nmask));
+
+		goto out;
+	}
+
+	seq_printf(s, "internet\n");
+out:
+	read_unlock(&vzprivlock);
+	return 0;
+}
+
+static int classify_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &classify_seq_show, NULL);
+}
+
+static struct file_operations proc_classify_ops = {
+	.owner   = THIS_MODULE,
+	.open    = classify_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write	 = classify_write,
+};
+
+struct proc_dir_entry *vzpriv_proc_dir;
+EXPORT_SYMBOL(vzpriv_proc_dir);
+
+static struct ctl_table vzprivnet_table[] = {
+	{
+		.procname = "net",
+		.child = vzprivnet_table + 2,
+	},
+	{ },
+	{
+		.procname = "vzpriv_handle_bridge",
+		.data = &vzpn_handle_bridged,
+		.maxlen = sizeof(vzpn_handle_bridged),
+		.mode = 0600,
+		.proc_handler = proc_dointvec,
+	},
+	{
+		.procname = "vzpriv_filter_host",
+		.data = &vzpn_filter_host,
+		.maxlen = sizeof(vzpn_filter_host),
+		.mode = 0600,
+		.proc_handler = proc_dointvec,
+	},
+	{ },
+};
+
+static struct ctl_table_header *ctl;
+
+static int __init iptable_vzprivnet_init(void)
+{
+	int err = -ENOMEM;
+	struct proc_dir_entry *proc;
+
+	vzpriv_proc_dir = proc_mkdir("privnet", proc_vz_dir);
+	if (vzpriv_proc_dir == NULL)
+		goto err_mkdir;
+
+	proc = proc_create("legacy", 0644,
+			vzpriv_proc_dir, &proc_vzprivnet_ops);
+	if (proc == NULL)
+		goto err_legacy;
+
+	proc = proc_create("sparse", 0644,
+			vzpriv_proc_dir, &proc_sparse_ops);
+	if (proc == NULL)
+		goto err_net;
+
+	proc = proc_create("stat", 0644,
+			vzpriv_proc_dir, &proc_stat_ops);
+	if (proc == NULL)
+		goto err_stat;
+
+	proc = proc_create("classify", 0644,
+			vzpriv_proc_dir, &proc_classify_ops);
+	if (proc == NULL)
+		goto err_classify;
+
+	proc = proc_symlink(VZPRIV_PROCNAME, init_net.proc_net, "/proc/vz/privnet/legacy");
+	if (proc == NULL)
+		goto err_link;
+
+	err = -ENOMEM;
+	ctl = register_sysctl_table(vzprivnet_table);
+	if (ctl == NULL)
+		goto err_ctl;
+
+	err = nf_register_hooks(vzprivnet_ops, 3);
+	if (err)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	unregister_sysctl_table(ctl);
+err_ctl:
+	remove_proc_entry(VZPRIV_PROCNAME, init_net.proc_net);
+err_link:
+	remove_proc_entry("classify", vzpriv_proc_dir);
+err_classify:
+	remove_proc_entry("stat", vzpriv_proc_dir);
+err_stat:
+	remove_proc_entry("sparse", vzpriv_proc_dir);
+err_net:
+	remove_proc_entry("legacy", vzpriv_proc_dir);
+err_legacy:
+	remove_proc_entry("privnet", proc_vz_dir);
+err_mkdir:
+	return err;
+}
+
+static void __exit iptable_vzprivnet_exit(void)
+{
+	nf_unregister_hooks(vzprivnet_ops, 3);
+	unregister_sysctl_table(ctl);
+	remove_proc_entry(VZPRIV_PROCNAME, init_net.proc_net);
+	remove_proc_entry("classify", vzpriv_proc_dir);
+	remove_proc_entry("stat", vzpriv_proc_dir);
+	remove_proc_entry("sparse", vzpriv_proc_dir);
+	remove_proc_entry("legacy", vzpriv_proc_dir);
+	remove_proc_entry("privnet", proc_vz_dir);
+	vzprivnet_cleanup();
+}
+
+module_init(iptable_vzprivnet_init)
+module_exit(iptable_vzprivnet_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -423,6 +423,8 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -725,7 +727,7 @@ static int clusterip_net_init(struct net *net)
 	spin_lock_init(&cn->lock);
 
 #ifdef CONFIG_PROC_FS
-	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+	cn->procdir = proc_net_mkdir(net, "ipt_CLUSTERIP", net->proc_net);
 	if (!cn->procdir) {
 		pr_err("Unable to proc dir entry\n");
 		return -ENOMEM;
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,6 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -74,13 +74,13 @@ static int reject_tg_check(const struct xt_tgchk_param *par)
 	const struct ipt_entry *e = par->entryinfo;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
-		pr_info("ECHOREPLY no longer supported.\n");
+		ve_printk(VE_LOG, "ECHOREPLY no longer supported.\n");
 		return -EINVAL;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ip.proto != IPPROTO_TCP ||
 		    (e->ip.invflags & XT_INV_PROTO)) {
-			pr_info("TCP_RESET invalid for non-tcp\n");
+			ve_printk(VE_LOG, "TCP_RESET invalid for non-tcp\n");
 			return -EINVAL;
 		}
 	}
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -408,12 +408,16 @@ static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
 static int synproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_entry *e = par->entryinfo;
+	int ret;
 
 	if (e->ip.proto != IPPROTO_TCP ||
 	    e->ip.invflags & XT_INV_PROTO)
 		return -EINVAL;
 
-	return nf_ct_l3proto_try_module_get(par->family);
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret == 0)
+		allow_conntrack_allocation(par->net);
+	return ret;
 }
 
 static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,12 +69,19 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	net->ipv4.iptable_filter =
 		ipt_register_table(net, &packet_filter, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.iptable_filter))
+		net_ipt_module_set(net, VE_IP_FILTER);
+
 	return PTR_RET(net->ipv4.iptable_filter);
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
 	ipt_unregister_table(net, net->ipv4.iptable_filter);
+	net->ipv4.iptable_filter = NULL;
+
+	net_ipt_module_clear(net, VE_IP_FILTER);
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -102,18 +102,31 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_MANGLE))
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv4.iptable_mangle =
 		ipt_register_table(net, &packet_mangler, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.iptable_mangle))
+		net_ipt_module_set(net, VE_IP_MANGLE);
+
 	return PTR_RET(net->ipv4.iptable_mangle);
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_mangle);
+	net->ipv4.iptable_mangle = NULL;
+
+	net_ipt_module_clear(net, VE_IP_MANGLE);
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -113,17 +113,30 @@ static int __net_init iptable_nat_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT))
+		return 0;
+
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.nat_table))
+		net_ipt_module_set(net, VE_IP_IPTABLE_NAT);
+
 	return PTR_RET(net->ipv4.nat_table);
 }
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLE_NAT))
+		return;
+
 	ipt_unregister_table(net, net->ipv4.nat_table);
+	net->ipv4.nat_table = NULL;
+
+	net_ipt_module_clear(net, VE_IP_IPTABLE_NAT);
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -41,6 +41,12 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init iptable_raw_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+		return 0;
+
+	BUG_ON(net->ipv4.iptable_raw);
 
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
@@ -48,12 +54,22 @@ static int __net_init iptable_raw_net_init(struct net *net)
 	net->ipv4.iptable_raw =
 		ipt_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	return PTR_RET(net->ipv4.iptable_raw);
+
+	ret = PTR_RET(net->ipv4.iptable_raw);
+	if (ret)
+		net->ipv4.iptable_raw = NULL;
+
+	return ret;
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_raw)
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_raw);
+
+	net->ipv4.iptable_raw = NULL;
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -372,7 +372,7 @@ static int ipv4_init_net(struct net *net)
 	if (!in->ctl_table)
 		return -ENOMEM;
 
-	in->ctl_table[0].data = &nf_conntrack_max;
+	in->ctl_table[0].data = &net->ct.max;
 	in->ctl_table[1].data = &net->ct.count;
 	in->ctl_table[2].data = &net->ct.htable_size;
 	in->ctl_table[3].data = &net->ct.sysctl_checksum;
@@ -427,6 +427,9 @@ static int ipv4_net_init(struct net *net)
 {
 	int ret = 0;
 
+	if (!net_ipt_permitted(net, VE_IP_CONNTRACK))
+		return 0;
+
 	ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
 					    ARRAY_SIZE(builtin_l4proto4));
 	if (ret < 0)
@@ -437,6 +440,9 @@ static int ipv4_net_init(struct net *net)
 		nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
 						ARRAY_SIZE(builtin_l4proto4));
 	}
+
+	net_ipt_module_set(net, VE_IP_CONNTRACK);
+
 	return ret;
 }
 
@@ -445,6 +451,9 @@ static void ipv4_net_exit(struct net *net)
 	nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
 	nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
 					ARRAY_SIZE(builtin_l4proto4));
+
+	if (net_is_ipt_module_set(net, VE_IP_CONNTRACK))
+		net_ipt_module_clear(net, VE_IP_CONNTRACK);
 }
 
 static struct pernet_operations ipv4_net_ops = {
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -418,16 +418,16 @@ static int __net_init ip_conntrack_net_init(struct net *net)
 {
 	struct proc_dir_entry *proc, *proc_exp, *proc_stat;
 
-	proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
+	proc = proc_net_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
 	if (!proc)
 		goto err1;
 
-	proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+	proc_exp = proc_net_create("ip_conntrack_expect", 0440, net->proc_net,
 			       &ip_exp_file_ops);
 	if (!proc_exp)
 		goto err2;
 
-	proc_stat = proc_create("ip_conntrack", S_IRUGO,
+	proc_stat = proc_net_create("ip_conntrack", S_IRUGO,
 				net->proc_net_stat, &ct_cpu_seq_fops);
 	if (!proc_stat)
 		goto err3;
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -316,10 +316,6 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 {
 	struct nf_log_buf *m;
 
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
 	m = nf_log_buf_open();
 
 	if (!loginfo)
@@ -333,7 +329,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 
 	dump_ipv4_packet(m, loginfo, skb, 0);
 
-	nf_log_buf_close(m);
+	nf_log_buf_close(m, net->owner_ve);
 }
 
 static struct nf_logger nf_ip_logger __read_mostly = {
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -746,7 +746,7 @@ struct proto ping_prot = {
 	.init =		ping_init_sock,
 	.close =	ping_close,
 	.connect =	ip4_datagram_connect,
-	.disconnect =	udp_disconnect,
+	.disconnect =	__udp_disconnect,
 	.setsockopt =	ip_setsockopt,
 	.getsockopt =	ip_getsockopt,
 	.sendmsg =	ping_sendmsg,
@@ -904,7 +904,7 @@ static int ping_proc_register(struct net *net)
 	struct proc_dir_entry *p;
 	int rc = 0;
 
-	p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops);
+	p = proc_net_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops);
 	if (!p)
 		rc = -ENOMEM;
 	return rc;
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -501,12 +501,12 @@ static const struct file_operations netstat_seq_fops = {
 
 static __net_init int ip_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+	if (!proc_net_create("sockstat", S_IRUGO, net->proc_net,
 			 &sockstat_seq_fops))
 		goto out_sockstat;
-	if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
+	if (!proc_net_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
 		goto out_netstat;
-	if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
+	if (!proc_net_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
 		goto out_snmp;
 
 	return 0;
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -79,9 +79,10 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
 
-static struct raw_hashinfo raw_v4_hashinfo = {
+struct raw_hashinfo raw_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
 };
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
 
 void raw_hash_sk(struct sock *sk)
 {
@@ -108,7 +109,7 @@ void raw_unhash_sk(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
-static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 		unsigned short num, __be32 raddr, __be32 laddr, int dif)
 {
 	sk_for_each_from(sk) {
@@ -124,6 +125,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 found:
 	return sk;
 }
+EXPORT_SYMBOL_GPL(__raw_v4_lookup);
 
 /*
  *	0 - deliver
@@ -646,7 +648,9 @@ static void raw_close(struct sock *sk, long timeout)
 	/*
 	 * Raw sockets may have direct kernel references. Kill them.
 	 */
-	ip_ra_control(sk, 0, NULL);
+	rtnl_lock();
+	ip_ra_control(sk, 0, NULL);
+	rtnl_unlock();
 
 	sk_common_release(sk);
 }
@@ -888,7 +892,7 @@ struct proto raw_prot = {
 	.close		   = raw_close,
 	.destroy	   = raw_destroy,
 	.connect	   = ip4_datagram_connect,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = __udp_disconnect,
 	.ioctl		   = raw_ioctl,
 	.init		   = raw_init,
 	.setsockopt	   = raw_setsockopt,
@@ -918,7 +922,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
 	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
 			++state->bucket) {
 		sk_for_each(sk, &state->h->ht[state->bucket])
-			if (sock_net(sk) == seq_file_net(seq))
+			if (net_access_allowed(sock_net(sk), seq_file_net(seq)))
 				goto found;
 	}
 	sk = NULL;
@@ -934,7 +938,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 		sk = sk_next(sk);
 try_again:
 		;
-	} while (sk && sock_net(sk) != seq_file_net(seq));
+	} while (sk && !net_access_allowed(sock_net(sk), seq_file_net(seq)));
 
 	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
 		sk = sk_head(&state->h->ht[state->bucket]);
@@ -1051,7 +1055,7 @@ static const struct file_operations raw_seq_fops = {
 
 static __net_init int raw_init_net(struct net *net)
 {
-	if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
+	if (!proc_net_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
 		return -ENOMEM;
 
 	return 0;
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,243 @@
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+	if (r->sdiag_family == AF_INET) {
+		return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (r->sdiag_family == AF_INET6) {
+		return &raw_v6_hashinfo;
+#endif
+	} else {
+		pr_warn_once("Unexpected inet family %d\n",
+			     r->sdiag_family);
+		WARN_ON_ONCE(1);
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+/*
+ * Since we must not break the user API, we can't simply rename the
+ * @pad field in struct inet_diag_req_v2; instead a helper structure
+ * (struct inet_diag_req_raw) is used to interpret it.
+ */
+
+static struct sock *raw_lookup(struct net *net, struct sock *from,
+			       const struct inet_diag_req_v2 *req)
+{
+	struct inet_diag_req_raw *r = (void *)req;
+	struct sock *sk = NULL;
+
+	if (r->sdiag_family == AF_INET)
+		sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
+				     r->id.idiag_dst[0],
+				     r->id.idiag_src[0],
+				     r->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
+				     (const struct in6_addr *)r->id.idiag_src,
+				     (const struct in6_addr *)r->id.idiag_dst,
+				     r->id.idiag_if);
+#endif
+	return sk;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct sock *sk = NULL, *s;
+	int slot;
+
+	if (IS_ERR(hashinfo))
+		return ERR_CAST(hashinfo);
+
+	read_lock(&hashinfo->lock);
+	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+		sk_for_each(s, &hashinfo->ht[slot]) {
+			sk = raw_lookup(net, s, r);
+			if (sk) {
+				/*
+				 * Grab a reference and keep it until the
+				 * diag message has been filled in, so the
+				 * caller must call sock_put() afterwards.
+				 * This is safe because we're holding
+				 * hashinfo->lock here.
+				 */
+				sock_hold(sk);
+				goto out_unlock;
+			}
+		}
+	}
+out_unlock:
+	read_unlock(&hashinfo->lock);
+
+	return sk ? sk : ERR_PTR(-ENOENT);
+}
+
+static int raw_diag_dump_one(struct sk_buff *in_skb,
+			     const struct nlmsghdr *nlh,
+			     const struct inet_diag_req_v2 *r)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *rep;
+	struct sock *sk;
+	int err;
+
+	sk = raw_sock_get(net, r);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+			sizeof(struct inet_diag_meminfo) + 64,
+			GFP_KERNEL);
+	if (!rep) {
+		sock_put(sk);
+		return -ENOMEM;
+	}
+
+	err = inet_sk_diag_fill(sk, NULL, rep, r,
+				sk_user_ns(NETLINK_CB(in_skb).sk),
+				NETLINK_CB(in_skb).portid,
+				nlh->nlmsg_seq, 0, nlh);
+	sock_put(sk);
+
+	if (err < 0) {
+		kfree_skb(rep);
+		return err;
+	}
+
+	err = netlink_unicast(net->diag_nlsk, rep,
+			      NETLINK_CB(in_skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+	return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *r,
+			struct nlattr *bc)
+{
+	if (!inet_diag_bc_sk(bc, sk))
+		return 0;
+
+	return inet_sk_diag_fill(sk, NULL, skb, r,
+				 sk_user_ns(NETLINK_CB(cb->skb).sk),
+				 NETLINK_CB(cb->skb).portid,
+				 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				 cb->nlh);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct net *net = sock_net(skb->sk);
+	int num, s_num, slot, s_slot;
+	struct sock *sk = NULL;
+
+	if (IS_ERR(hashinfo))
+		return;
+
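+	/*
+	 * cb->args[] carry the hash slot and the in-slot index where a
+	 * previous dump pass stopped, so the dump can be resumed.
+	 */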
+	s_slot = cb->args[0];
+	num = s_num = cb->args[1];
+
+	read_lock(&hashinfo->lock);
+	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+		num = 0;
+
+		sk_for_each(sk, &hashinfo->ht[slot]) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (num < s_num)
+				goto next;
+			if (sk->sk_family != r->sdiag_family)
+				goto next;
+			if (r->id.idiag_sport != inet->inet_sport &&
+			    r->id.idiag_sport)
+				goto next;
+			if (r->id.idiag_dport != inet->inet_dport &&
+			    r->id.idiag_dport)
+				goto next;
+			if (sk_diag_dump(sk, skb, cb, r, bc) < 0)
+				goto out_unlock;
+next:
+			num++;
+		}
+	}
+
+out_unlock:
+	read_unlock(&hashinfo->lock);
+
+	cb->args[0] = slot;
+	cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *info)
+{
+	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+static const struct inet_diag_handler raw_diag_handler = {
+	.dump			= raw_diag_dump,
+	.dump_one		= raw_diag_dump_one,
+	.idiag_get_info		= raw_diag_get_info,
+	.idiag_type		= IPPROTO_RAW,
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+	/*
+	 * Make sure the two structures are identical,
+	 * except for the @pad field.
+	 */
+#define __offset_mismatch(m1, m2)			\
+	(offsetof(struct inet_diag_req_v2, m1) !=	\
+	 offsetof(struct inet_diag_req_raw, m2))
+
+	BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+		     sizeof(struct inet_diag_req_raw));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+	BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+	BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+	BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+	BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{
+	return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+	inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,6 +70,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/nsproxy.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -118,6 +119,7 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
+int ip_rt_src_check		= 1;
 static int ip_rt_max_size;
 static int ip_rt_redirect_number __read_mostly	= 9;
 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
@@ -385,18 +387,19 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
 {
 	struct proc_dir_entry *pde;
 
-	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+	pde = proc_net_create("rt_cache", S_IRUGO, net->proc_net,
 			  &rt_cache_seq_fops);
 	if (!pde)
 		goto err1;
 
-	pde = proc_create("rt_cache", S_IRUGO,
+	pde = proc_net_create("rt_cache", S_IRUGO,
 			  net->proc_net_stat, &rt_cpu_seq_fops);
 	if (!pde)
 		goto err2;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+	pde = proc_net_create("rt_acct", 0,
+			net->proc_net, &rt_acct_proc_fops);
 	if (!pde)
 		goto err3;
 #endif
@@ -984,6 +987,7 @@ static int ip_error(struct sk_buff *skb)
 out:	kfree_skb(skb);
 	return 0;
 }
+EXPORT_SYMBOL(rt_cache_flush);
 
 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
@@ -2198,7 +2202,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 			goto make_route;
 		}
 
-		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC) && ip_rt_src_check) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 			if (!__ip_dev_find(net, fl4->saddr, false))
 				goto out;
@@ -2762,6 +2766,15 @@ static struct ctl_table ipv4_route_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname	= "src_check",
+		.data		= &ip_rt_src_check,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{ }
 };
 
@@ -2921,3 +2934,28 @@ void __init ip_static_sysctl_init(void)
 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
 }
 #endif
+
+#if 0
+static void ip_rt_dump_dst(void *o)
+{
+	struct rtable *rt = (struct rtable *)o;
+
+	if (rt->dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&rt->dst);
+	printk("\tgen %x flags %x type %d\n",
+			rt->rt_genid, rt->rt_flags, (int)rt->rt_type);
+}
+#endif
+
+void ip_rt_dump_dsts(void)
+{
+	printk("IPv4 dst cache:\n");
+	/* FIXME */
+	/* slab_obj_walk(ipv4_dst_ops.kmem_cachep, ip_rt_dump_dst); */
+}
+
+void (*ip6_rt_dump_dsts)(void);
+EXPORT_SYMBOL_GPL(ip6_rt_dump_dsts);
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -44,6 +44,9 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
 static int rhel_unused_sysctl __read_mostly;
 
+int sysctl_tcp_use_sg = 1;
+EXPORT_SYMBOL(sysctl_tcp_use_sg);
+
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
 {
@@ -428,8 +431,8 @@ static struct ctl_table ipv4_table[] = {
 		.procname	= "tcp_syncookies",
 		.data		= &sysctl_tcp_syncookies,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_immutable,
 	},
 #endif
 	{
@@ -825,10 +828,24 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+	{
+		.procname	= "tcp_use_sg",
+		.data		= &sysctl_tcp_use_sg,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
 static struct ctl_table ipv4_net_table[] = {
+	{	/* This must go first. See ipv4_sysctl_init_net() */
+		.procname	= "ip_nonlocal_bind",
+		.data		= &init_net.ipv4_sysctl_ip_nonlocal_bind,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "icmp_echo_ignore_all",
 		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_all,
@@ -912,13 +929,6 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "ip_nonlocal_bind",
-		.data		= &init_net.ipv4_sysctl_ip_nonlocal_bind,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{ }
 };
 
@@ -938,8 +948,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
 			table[i].data += (void *)net - (void *)&init_net;
 
+		/*
+		 * Check whether this is a VE being created or the VE's
+		 * initial user_ns, and allow only ip_nonlocal_bind:
+		 */
+		if (!ve_is_super(get_exec_env()) && !ve_net_hide_sysctl(net))
+			table[2].procname = NULL;
 		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
+		else if (net->user_ns != &init_user_ns)
 			table[0].procname = NULL;
 	}
 
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,6 +289,7 @@ int sysctl_tcp_autocorking __read_mostly = 1;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
+int sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
@@ -401,6 +402,8 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 	u64_stats_init(&tp->syncp);
 
+	tp->advmss = 65535; /* max value */
+
 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
@@ -1266,7 +1269,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 wait_for_sndbuf:
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-			if (copied)
+			if (copied && likely(!tp->repair))
 				tcp_push(sk, flags & ~MSG_MORE, mss_now,
 					 TCP_NAGLE_PUSH, size_goal);
 
@@ -1385,8 +1388,10 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
-	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
-	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+	     KERN_INFO "cleanup rbuf bug (%s/%s): copied %X seq %X/%X rcvnxt %X\n",
+	     ve_name(sock_net(sk)->owner_ve), current->comm,
+	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
+	     TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
 
 	if (inet_csk_ack_scheduled(sk)) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1655,7 +1660,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
-			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+			     "recvmsg bug 2 (%s/%s): "
+			     "copied %X seq %X rcvnxt %X fl %X\n",
+			     ve_name(sock_net(sk)->owner_ve), current->comm,
 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
 		}
 
@@ -1717,8 +1724,18 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 			tp->ucopy.len = len;
 
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
+			if (WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+				!(flags & (MSG_PEEK | MSG_TRUNC)))) {
+				printk("KERNEL: assertion: tp->copied_seq == "
+						"tp->rcv_nxt || ...\n");
+				printk("VE%s pid %d comm %.16s\n",
+						ve_name(sock_net(sk)->owner_ve),
+						current->pid, current->comm);
+				printk("flags=0x%x, len=%d, copied_seq=%d, "
+						"rcv_nxt=%d\n", flags,
+						(int)len, tp->copied_seq,
+						tp->rcv_nxt);
+			}
 
 			/* Ugly... If prequeue is not empty, we have to
 			 * process it before releasing socket, otherwise
@@ -2110,7 +2127,7 @@ void tcp_close(struct sock *sk, long timeout)
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2236,6 +2253,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;
+	tp->advmss = 65535;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
@@ -3107,7 +3125,7 @@ void __init tcp_init(void)
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
@@ -3152,6 +3170,11 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
+	if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096)
+		sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096;
+	if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096)
+		sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096;
+
 	tcp_init_mem(&init_net);
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4518,7 +4518,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 				if (skb_queue_len(&sk->sk_receive_queue) == 0)
 					sk_forced_mem_schedule(sk, skb->truesize);
 				else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
-					goto drop;
+					goto drop_part;
 			}
 			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 		}
@@ -4561,6 +4561,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 drop:
 		__kfree_skb(skb);
 		return;
+
+drop_part:
+		if (after(tp->copied_seq, tp->rcv_nxt))
+			tp->rcv_nxt = tp->copied_seq;
+		__kfree_skb(skb);
+		return;
 	}
 
 	/* Out of window. F.e. zero window probe. */
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -786,7 +786,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
-			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcptw->tw_rcv_wnd >>
+				(tw->tw_rcv_wscale & TW_WSCALE_MASK),
 			tcp_time_stamp + tcptw->tw_ts_offset,
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
@@ -1471,7 +1472,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 		}
 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1488,7 +1489,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				rsk = nsk;
 				goto reset;
 			}
-			return 0;
+			goto restore_context;
 		}
 	} else
 		sock_rps_save_rxhash(sk, skb);
@@ -1497,6 +1498,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		rsk = sk;
 		goto reset;
 	}
+
+restore_context:
 	return 0;
 
 reset:
@@ -1508,7 +1511,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
-	return 0;
+	goto restore_context;
 
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -2053,7 +2056,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+		if (sk->sk_family == st->family &&
+		    net_eq(sock_net(sk), net))
 			return sk;
 	}
 
@@ -2228,7 +2232,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= tcp_seq_next;
 	afinfo->seq_ops.stop		= tcp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_net_create_data(afinfo->name, S_IRUGO, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,6 +6,8 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
+#define RES_ORPHANS	1024
+
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -18,6 +20,67 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
+void cg_orphan_count_inc(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_inc(&tcp->tcp_orphan_count);
+	}
+}
+EXPORT_SYMBOL(cg_orphan_count_inc);
+
+void cg_orphan_count_dec(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_dec(&tcp->tcp_orphan_count);
+	}
+}
+
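+/*
+ * Walk the cgroup hierarchy upwards and return true if any level exceeds
+ * its tcp_max_orphans; the cheap approximate per-cpu read is tried first,
+ * with the exact sum taken only when close to the limit.
+ */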
+bool cg_too_many_orphans(struct sock *sk, int shift)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+		struct percpu_counter *ocp;
+		int orphans;
+
+		tcp = tcp_from_cgproto(cg);
+		ocp = &tcp->tcp_orphan_count;
+		orphans = percpu_counter_read_positive(ocp);
+
+		if (orphans << shift > tcp->tcp_max_orphans) {
+			orphans = percpu_counter_sum_positive(ocp);
+			if (orphans << shift > tcp->tcp_max_orphans)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+static u64 tcp_read_orphans(struct mem_cgroup *mem)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(mem);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return percpu_counter_sum_positive(&tcp->tcp_orphan_count);
+}
+
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
@@ -40,6 +103,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
 	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
 	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+	tcp->tcp_max_orphans = sysctl_tcp_max_orphans >> 2;
 	tcp->tcp_memory_pressure = 0;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
@@ -48,6 +112,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 	page_counter_init(&tcp->tcp_memory_allocated, counter_parent);
 	percpu_counter_init(&tcp->tcp_sockets_allocated, 0, GFP_KERNEL);
+	percpu_counter_init(&tcp->tcp_orphan_count, 0, GFP_KERNEL);
 
 	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
 	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
@@ -71,6 +136,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 
 	tcp = tcp_from_cgproto(cg_proto);
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+	percpu_counter_destroy(&tcp->tcp_orphan_count);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -191,6 +257,9 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		val = cg_proto->memory_allocated->watermark;
 		val *= PAGE_SIZE;
 		break;
+	case RES_ORPHANS:
+		val = tcp_read_orphans(memcg);
+		break;
 	default:
 		BUG();
 	}
@@ -259,6 +328,11 @@ static struct cftype tcp_files[] = {
 		.trigger = tcp_cgroup_reset,
 		.read_u64 = tcp_cgroup_read,
 	},
+	{
+		.name = "kmem.tcp.orphans",
+		.private = RES_ORPHANS,
+		.read_u64 = tcp_cgroup_read, /* XXX add configuration knob */
+	},
 	{ }	/* terminate */
 };
 
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -304,6 +304,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
+		if (sk->sk_user_data != NULL)
+			tw->tw_rcv_wscale |= TW_WSCALE_SPEC;
 
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
@@ -459,7 +461,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp = tcp_sk(newsk);
 
-		/* Now setup tcp_sock */
 		newtp->pred_flags = 0;
 
 		newtp->rcv_wup = newtp->copied_seq =
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -412,11 +412,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
 
-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
-{
-	return tp->snd_una != tp->snd_up;
-}
-
 #define OPTION_SACK_ADVERTISE	(1 << 0)
 #define OPTION_TS		(1 << 1)
 #define OPTION_MD5		(1 << 2)
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,6 +114,7 @@
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 #include "udp_impl.h"
+#include <net/udp_memcontrol.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -150,6 +151,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    sk2 != sk &&
 		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    sk->sk_reuse != SK_FORCE_REUSE &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
@@ -185,6 +187,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    sk2 != sk &&
 		    (udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    sk->sk_reuse != SK_FORCE_REUSE &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
@@ -1253,6 +1256,10 @@ EXPORT_SYMBOL_GPL(udp_destruct_sock);
 
 int udp_init_sock(struct sock *sk)
 {
+	local_bh_disable();
+	sock_update_memcg(sk);
+	local_bh_enable();
+
 	sk->sk_destruct = udp_destruct_sock;
 	return 0;
 }
@@ -1446,7 +1453,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 }
 
 
-int udp_disconnect(struct sock *sk, int flags)
+int __udp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	/*
@@ -1468,6 +1475,15 @@ int udp_disconnect(struct sock *sk, int flags)
 	sk_dst_reset(sk);
 	return 0;
 }
+EXPORT_SYMBOL(__udp_disconnect);
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+	lock_sock(sk);
+	__udp_disconnect(sk, flags);
+	release_sock(sk);
+	return 0;
+}
 EXPORT_SYMBOL(udp_disconnect);
 
 void udp_lib_unhash(struct sock *sk)
@@ -2104,6 +2120,7 @@ void udp_destroy_sock(struct sock *sk)
 		if (encap_destroy)
 			encap_destroy(sk);
 	}
+	sock_release_memcg(sk);
 }
 
 /*
@@ -2353,6 +2370,11 @@ struct proto udp_prot = {
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
 	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+#ifdef CONFIG_MEMCG_KMEM
+	.init_cgroup		= udp_init_cgroup,
+	.destroy_cgroup		= udp_destroy_cgroup,
+	.proto_cgroup		= udp_proto_cgroup,
+#endif
 };
 EXPORT_SYMBOL(udp_prot);
 
@@ -2375,7 +2397,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
-			if (!net_eq(sock_net(sk), net))
+			if (!net_access_allowed(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
 				goto found;
@@ -2394,7 +2416,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 
 	do {
 		sk = sk_nulls_next(sk);
-	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+	} while (sk && (!net_access_allowed(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
 		if (state->bucket <= state->udp_table->mask)
@@ -2471,7 +2493,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= udp_seq_next;
 	afinfo->seq_ops.stop		= udp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_net_create_data(afinfo->name, S_IRUGO, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
--- /dev/null
+++ b/net/ipv4/udp_memcontrol.c
@@ -0,0 +1,227 @@
+/*
+ *  net/ipv4/udp_memcontrol.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <net/udp.h>
+#include <net/udp_memcontrol.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+
+/*
+ * The code below is copied from tcp_memcontrol.c with s/tcp/udp/g,
+ * taking into account that UDP needs neither the memory pressure
+ * state nor the sockets_allocated counter.
+ */
+
+static inline struct udp_memcontrol *udp_from_cgproto(struct cg_proto *cg_proto)
+{
+	return container_of(cg_proto, struct udp_memcontrol, cg_proto);
+}
+
+int udp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+	/*
+	 * The root cgroup does not use page counters, but rather
+	 * relies on the data already collected by the network
+	 * subsystem.
+	 */
+	struct page_counter *counter_parent = NULL;
+	struct cg_proto *cg_proto, *parent_cg;
+	struct udp_memcontrol *udp;
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return 0;
+
+	udp = udp_from_cgproto(cg_proto);
+
+	udp->udp_prot_mem[0] = sysctl_udp_mem[0];
+	udp->udp_prot_mem[1] = sysctl_udp_mem[1];
+	udp->udp_prot_mem[2] = sysctl_udp_mem[2];
+
+	parent_cg = udp_prot.proto_cgroup(parent);
+	if (parent_cg)
+		counter_parent = parent_cg->memory_allocated;
+
+	page_counter_init(&udp->udp_memory_allocated, counter_parent);
+
+	cg_proto->sysctl_mem = udp->udp_prot_mem;
+	cg_proto->memory_allocated = &udp->udp_memory_allocated;
+	cg_proto->memcg = memcg;
+
+	return 0;
+}
+
+void udp_destroy_cgroup(struct mem_cgroup *memcg)
+{
+}
+
+static int udp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+	struct udp_memcontrol *udp;
+	struct cg_proto *cg_proto;
+	u64 old_lim;
+	int i;
+	int ret;
+
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return -EINVAL;
+
+	udp = udp_from_cgproto(cg_proto);
+
+	old_lim = udp->udp_memory_allocated.limit;
+	ret = page_counter_limit(&udp->udp_memory_allocated, nr_pages);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 3; i++)
+		udp->udp_prot_mem[i] = min_t(long, nr_pages, sysctl_udp_mem[i]);
+
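+	/*
+	 * PAGE_COUNTER_MAX means "unlimited": deactivate the socket limit.
+	 * Any finite limit activates it and enables the static key once.
+	 */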
+	if (nr_pages == PAGE_COUNTER_MAX)
+		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	else {
+		if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+		set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	}
+
+	return 0;
+}
+
+enum {
+	RES_USAGE,
+	RES_LIMIT,
+	RES_MAX_USAGE,
+	RES_FAILCNT,
+};
+
+static DEFINE_MUTEX(udp_limit_mutex);
+
+static int udp_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret = 0;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		/* see memcontrol.c */
+		ret = page_counter_memparse(buffer, &nr_pages);
+		if (ret)
+			break;
+
+		mutex_lock(&udp_limit_mutex);
+		ret = udp_update_limit(memcg, nr_pages);
+		mutex_unlock(&udp_limit_mutex);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static u64 udp_cgroup_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	struct cg_proto *cg_proto = udp_prot.proto_cgroup(memcg);
+
+	u64 val;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		if (!cg_proto)
+			return PAGE_COUNTER_MAX;
+		val = cg_proto->memory_allocated->limit;
+		val *= PAGE_SIZE;
+		break;
+	case RES_USAGE:
+		if (!cg_proto)
+			val = atomic_long_read(&udp_memory_allocated);
+		else
+			val = page_counter_read(cg_proto->memory_allocated);
+		val *= PAGE_SIZE;
+		break;
+	case RES_FAILCNT:
+		if (!cg_proto)
+			return 0;
+		val = cg_proto->memory_allocated->failcnt;
+		break;
+	case RES_MAX_USAGE:
+		if (!cg_proto)
+			return 0;
+		val = cg_proto->memory_allocated->watermark;
+		val *= PAGE_SIZE;
+		break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static int udp_cgroup_reset(struct cgroup *cont, unsigned int event)
+{
+	struct mem_cgroup *memcg;
+	struct udp_memcontrol *udp;
+	struct cg_proto *cg_proto;
+
+	memcg = mem_cgroup_from_cont(cont);
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return 0;
+	udp = udp_from_cgproto(cg_proto);
+
+	switch (event) {
+	case RES_MAX_USAGE:
+		page_counter_reset_watermark(&udp->udp_memory_allocated);
+		break;
+	case RES_FAILCNT:
+		cg_proto->memory_allocated->failcnt = 0;
+		break;
+	}
+
+	return 0;
+}
+
+static struct cftype udp_files[] = {
+	{
+		.name = "kmem.udp.limit_in_bytes",
+		.write_string = udp_cgroup_write,
+		.read_u64 = udp_cgroup_read,
+		.private = RES_LIMIT,
+	},
+	{
+		.name = "kmem.udp.usage_in_bytes",
+		.read_u64 = udp_cgroup_read,
+		.private = RES_USAGE,
+	},
+	{
+		.name = "kmem.udp.failcnt",
+		.private = RES_FAILCNT,
+		.trigger = udp_cgroup_reset,
+		.read_u64 = udp_cgroup_read,
+	},
+	{
+		.name = "kmem.udp.max_usage_in_bytes",
+		.private = RES_MAX_USAGE,
+		.trigger = udp_cgroup_reset,
+		.read_u64 = udp_cgroup_read,
+	},
+	{ }	/* terminate */
+};
+
+static int __init udp_memcontrol_init(void)
+{
+	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, udp_files));
+	return 0;
+}
+__initcall(udp_memcontrol_init);
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -91,6 +91,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/export.h>
+#include <linux/ve.h>
 
 /* Set to 3 to get tracing... */
 #define ACONF_DEBUG 2
@@ -837,7 +838,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC | __GFP_ACCOUNT);
 
 	if (ifa == NULL) {
 		ADBG("ipv6_add_addr: malloc failed\n");
@@ -3033,6 +3034,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct inet6_dev *idev = __in6_dev_get(dev);
+	struct net *net = dev_net(dev);
 	int run_pending = 0;
 	int err;
 
@@ -3048,7 +3050,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 	case NETDEV_CHANGEMTU:
 		/* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
 		if (dev->mtu < IPV6_MIN_MTU) {
-			addrconf_ifdown(dev, 1);
+			addrconf_ifdown(dev, dev != net->loopback_dev);
 			break;
 		}
 
@@ -3152,7 +3154,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			 * IPV6_MIN_MTU stop IPv6 on this interface.
 			 */
 			if (dev->mtu < IPV6_MIN_MTU)
-				addrconf_ifdown(dev, 1);
+				addrconf_ifdown(dev, dev != net->loopback_dev);
 		}
 		break;
 
@@ -3221,7 +3223,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	ASSERT_RTNL();
 
 	rt6_ifdown(net, dev);
-	neigh_ifdown(&nd_tbl, dev);
 
 	idev = __in6_dev_get(dev);
 	if (idev == NULL)
@@ -3409,6 +3410,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
 	    idev->cnf.accept_dad < 1 ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
+	    dev_net(dev)->owner_ve->disable_net ||
 	    ifp->flags & IFA_F_NODAD) {
 		bump_id = ifp->flags & IFA_F_TENTATIVE;
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
@@ -3754,7 +3756,7 @@ static const struct file_operations if6_fops = {
 
 static int __net_init if6_proc_net_init(struct net *net)
 {
-	if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops))
+	if (!proc_net_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops))
 		return -ENOMEM;
 	return 0;
 }
@@ -4608,7 +4610,7 @@ static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
 	/* Use put_unaligned() because stats may not be aligned for u64. */
 	put_unaligned(items, &stats[0]);
 	for (i = 1; i < items; i++)
-		put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+		put_unaligned(__snmp_fold_field64(mib, i, syncpoff, cpu_online_mask), &stats[i]);
 
 	memset(&stats[items], 0, pad);
 }
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -38,11 +38,13 @@
 #include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/ve.h>
 
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/cpu.h>
 
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -59,6 +61,9 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
 
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
@@ -164,6 +169,10 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(net, answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern &&
 	    !ns_capable(net->user_ns, CAP_NET_RAW))
@@ -710,6 +719,54 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
 
+static void move_ipv6_percpu_stats(int cpu, void **mib, int items)
+{
+	int this_cpu, i;
+
+	local_irq_disable();
+	this_cpu = smp_processor_id();
+
+	for (i = 1; i < items; i++) {
+		*(((u64 *) per_cpu_ptr(mib[0], this_cpu)) + i) +=
+		*(((u64 *) per_cpu_ptr(mib[0], cpu)) + i);
+
+		*(((u64 *) per_cpu_ptr(mib[0], cpu)) + i) = 0;
+	}
+	local_irq_enable();
+}
+
+
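+/*
+ * CPU hotplug callback: when a CPU goes offline, fold its per-cpu IPv6
+ * statistics for every inet6_dev into the current CPU's counters.
+ */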
+static int ipv6_cpu_notify(struct notifier_block *self,
+			   unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct net *net;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rtnl_lock();
+		for_each_net(net) {
+			for_each_netdev(net, dev) {
+				idev = __in6_dev_get(dev);
+				if (!idev)
+					continue;
+				move_ipv6_percpu_stats(cpu,
+						(void **)idev->stats.ipv6,
+						IPSTATS_MIB_MAX);
+			}
+		}
+		rtnl_unlock();
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipv6_cpu_notifier = {
+	.notifier_call  = ipv6_cpu_notify,
+};
+
 static struct packet_type ipv6_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
 	.func = ipv6_rcv,
@@ -967,6 +1024,7 @@ static int __init inet6_init(void)
 	if (err)
 		goto sysctl_fail;
 #endif
+	register_cpu_notifier(&ipv6_cpu_notifier);
 out:
 	return err;
 
@@ -1038,6 +1096,8 @@ static void __exit inet6_exit(void)
 	if (disable_ipv6_mod)
 		return;
 
+	unregister_cpu_notifier(&ipv6_cpu_notifier);
+
 	/* First of all disallow new sockets creation. */
 	sock_unregister(PF_INET6);
 	/* Disallow any further netlink messages */
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -500,7 +500,7 @@ static const struct file_operations ac6_seq_fops = {
 
 int __net_init ac6_proc_init(struct net *net)
 {
-	if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops))
+	if (!proc_net_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops))
 		return -ENOMEM;
 
 	return 0;
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -198,11 +198,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
 
 	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 
-	/*
-	 * No protection necessary, this is the only list mutatation
-	 * operation, tables never disappear once they exist.
-	 */
+	write_lock_bh(&tb->tb6_lock);
 	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+	write_unlock_bh(&tb->tb6_lock);
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -1665,6 +1663,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 		head = &net->ipv6.fib_table_hash[h];
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
 					func, false, arg);
@@ -1829,6 +1828,7 @@ static int __net_init fib6_net_init(struct net *net)
 					   GFP_KERNEL);
 	if (!net->ipv6.fib6_local_tbl)
 		goto out_fib6_main_tbl;
+
 	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
 	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
@@ -1877,7 +1877,7 @@ int __init fib6_init(void)
 
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
 					   sizeof(struct fib6_node),
-					   0, SLAB_HWCACHE_ALIGN,
+					   0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
 					   NULL);
 	if (!fib6_node_kmem)
 		goto out;
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -498,12 +498,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
 	case IPV6_FL_A_PUT:
 		spin_lock_bh(&ip6_sk_fl_lock);
 		for (sflp = &np->ipv6_fl_list;
-		     (sfl = rcu_dereference(*sflp))!=NULL;
+		     (sfl = rcu_dereference_protected(*sflp,
+						      lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
 		     sflp = &sfl->next) {
 			if (sfl->fl->label == freq.flr_label) {
 				if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
 					np->flow_label &= ~IPV6_FLOWLABEL_MASK;
-				*sflp = rcu_dereference(sfl->next);
+				*sflp = sfl->next;
 				spin_unlock_bh(&ip6_sk_fl_lock);
 				fl_release(sfl->fl);
 				kfree_rcu(sfl, rcu);
@@ -779,7 +780,7 @@ static const struct file_operations ip6fl_seq_fops = {
 
 static int __net_init ip6_flowlabel_proc_init(struct net *net)
 {
-	if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net,
+	if (!proc_net_create("ip6_flowlabel", S_IRUGO, net->proc_net,
 			 &ip6fl_seq_fops))
 		return -ENOMEM;
 	return 0;
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -149,6 +149,7 @@ int ip6_output(struct sock *sk, struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
+EXPORT_SYMBOL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
@@ -346,6 +347,7 @@ int ip6_forward(struct sk_buff *skb)
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct inet6_skb_parm *opt = IP6CB(skb);
 	struct net *net = dev_net(dst->dev);
+	unsigned int hroom;
 	u32 mtu;
 
 	if (net->ipv6.devconf_all->forwarding == 0)
@@ -477,7 +479,22 @@ int ip6_forward(struct sk_buff *skb)
 		return -EMSGSIZE;
 	}
 
-	if (skb_cow(skb, dst->dev->hard_header_len)) {
+	/*
+	 * We try to optimize forwarding of VE packets: do not
+	 * decrement the TTL (and thus save the skb_cow()) when
+	 * forwarding outgoing packets from a VE.  For incoming
+	 * packets we still decrement the TTL, since such skbs are
+	 * not cloned and do not require an actual copy.  So there
+	 * is at least one place on the packet path with a mandatory
+	 * TTL decrement, which is sufficient to prevent routing
+	 * loops.
+	 */
+	hroom = dst->dev->hard_header_len;
+	if ((skb->dev->features & NETIF_F_VENET) && /* src is VENET device */
+	    (skb_headroom(skb) >= hroom))  /* and skb has enough headroom */
+		goto no_ttl_decr;
+
+	if (skb_cow(skb, hroom)) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 		goto drop;
 	}
@@ -488,6 +505,7 @@ int ip6_forward(struct sk_buff *skb)
 
 	hdr->hop_limit--;
 
+no_ttl_decr:
 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
@@ -574,20 +592,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	    (err = skb_checksum_help(skb)))
 		goto fail;
 
+	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
 		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) ||
+		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 			goto slow_path;
 
 		skb_walk_frags(skb, frag) {
 			/* Correct geometry. */
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
-			    skb_headroom(frag) < hlen)
+			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 				goto slow_path_clean;
 
 			/* Partially cloned skb? */
@@ -604,8 +624,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 
 		err = 0;
 		offset = 0;
-		frag = skb_shinfo(skb)->frag_list;
-		skb_frag_list_init(skb);
 		/* BUILD HEADER */
 
 		*prevhdr = NEXTHDR_FRAGMENT;
@@ -613,8 +631,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 		if (!tmp_hdr) {
 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 				      IPSTATS_MIB_FRAGFAILS);
-			return -ENOMEM;
+			err = -ENOMEM;
+			goto fail;
 		}
+		frag = skb_shinfo(skb)->frag_list;
+		skb_frag_list_init(skb);
 
 		__skb_pull(skb, hlen);
 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
@@ -710,7 +731,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	 *	Fragment the datagram.
 	 */
 
-	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	troom = rt->dst.dev->needed_tailroom;
 
 	/*
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -249,7 +249,7 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return 0;
 
 err2:
-	kfree(mrt);
+	ip6mr_free_table(mrt);
 err1:
 	fib_rules_unregister(ops);
 	return err;
@@ -778,7 +778,8 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
  *	Delete a VIF entry
  */
 
-static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
+		       struct list_head *head)
 {
 	struct mif_device *v;
 	struct net_device *dev;
@@ -824,7 +825,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
 					     dev->ifindex, &in6_dev->cnf);
 	}
 
-	if (v->flags & MIFF_REGISTER)
+	if ((v->flags & MIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
@@ -1333,7 +1334,6 @@ static int ip6mr_device_event(struct notifier_block *this,
 	struct mr6_table *mrt;
 	struct mif_device *v;
 	int ct;
-	LIST_HEAD(list);
 
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
@@ -1342,10 +1342,9 @@ static int ip6mr_device_event(struct notifier_block *this,
 		v = &mrt->vif6_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
 			if (v->dev == dev)
-				mif6_delete(mrt, ct, &list);
+				mif6_delete(mrt, ct, 1, NULL);
 		}
 	}
-	unregister_netdevice_many(&list);
 
 	return NOTIFY_DONE;
 }
@@ -1368,9 +1367,9 @@ static int __net_init ip6mr_net_init(struct net *net)
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
-	if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops))
+	if (!proc_net_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops))
 		goto proc_vif_fail;
-	if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops))
+	if (!proc_net_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops))
 		goto proc_cache_fail;
 #endif
 
@@ -1549,7 +1548,7 @@ static void mroute_clean_tables(struct mr6_table *mrt)
 	 */
 	for (i = 0; i < mrt->maxvif; i++) {
 		if (!(mrt->vif6_table[i].flags & VIFF_STATIC))
-			mif6_delete(mrt, i, &list);
+			mif6_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
@@ -1662,6 +1661,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (mrt == NULL)
 		return -ENOENT;
@@ -1673,9 +1676,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
 	switch (optname) {
 	case MRT6_INIT:
-		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
-			return -EOPNOTSUPP;
 		if (optlen < sizeof(int))
 			return -EINVAL;
 
@@ -1702,7 +1702,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
 			return -EFAULT;
 		rtnl_lock();
-		ret = mif6_delete(mrt, mifi, NULL);
+		ret = mif6_delete(mrt, mifi, 0, NULL);
 		rtnl_unlock();
 		return ret;
 
@@ -1811,6 +1811,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (mrt == NULL)
 		return -ENOENT;
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -121,6 +121,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 static bool setsockopt_needs_rtnl(int optname)
 {
 	switch (optname) {
+	case IPV6_ADDRFORM:
 	case IPV6_ADD_MEMBERSHIP:
 	case IPV6_DROP_MEMBERSHIP:
 	case IPV6_JOIN_ANYCAST:
@@ -199,7 +200,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			}
 
 			fl6_free_socklist(sk);
-			ipv6_sock_mc_close(sk);
+			__ipv6_sock_mc_close(sk);
 
 			/*
 			 * Sock is moving from IPv6 to IPv4 (sk_prot), so
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -208,7 +208,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return 0;
 }
-EXPORT_SYMBOL(ipv6_sock_mc_join);
 
 /*
  *	socket leave on multicast group
@@ -285,16 +284,14 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
 	return idev;
 }
 
-void ipv6_sock_mc_close(struct sock *sk)
+void __ipv6_sock_mc_close(struct sock *sk)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct ipv6_mc_socklist *mc_lst;
 	struct net *net = sock_net(sk);
 
-	if (!rcu_access_pointer(np->ipv6_mc_list))
-		return;
+	ASSERT_RTNL();
 
-	rtnl_lock();
 	while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
 		struct net_device *dev;
 
@@ -312,8 +309,17 @@ void ipv6_sock_mc_close(struct sock *sk)
 
 		atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
 		kfree_rcu(mc_lst, rcu);
-
 	}
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	if (!rcu_access_pointer(np->ipv6_mc_list))
+		return;
+	rtnl_lock();
+	__ipv6_sock_mc_close(sk);
 	rtnl_unlock();
 }
 
@@ -2881,9 +2887,9 @@ static int __net_init igmp6_proc_init(struct net *net)
 	int err;
 
 	err = -ENOMEM;
-	if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops))
+	if (!proc_net_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops))
 		goto out;
-	if (!proc_create("mcfilter6", S_IRUGO, net->proc_net,
+	if (!proc_net_create("mcfilter6", S_IRUGO, net->proc_net,
 			 &igmp6_mcf_seq_fops))
 		goto out_proc_net_igmp6;
 
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -16,6 +16,8 @@ nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 # l3 independent conntrack
 obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
 
+obj-$(CONFIG_VE_IP_NF_VZPRIVNET) += ip6_vzprivnet.o
+
 nf_nat_ipv6-y		:= nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
 obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -326,6 +326,9 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_action_param acpar;
 	unsigned int addend;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
@@ -754,6 +757,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -1481,6 +1488,10 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
@@ -1704,9 +1715,10 @@ static int
 compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 		       unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1814,9 +1826,10 @@ static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
 static int
 compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1836,9 +1849,10 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 static int
 do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1861,9 +1875,10 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 static int
 do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2061,12 +2076,19 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
 
 static int __net_init ip6_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV6);
+	int res;
+
+	res = xt_proto_init(net, NFPROTO_IPV6);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES6);
+	return res;
 }
 
 static void __net_exit ip6_tables_net_exit(struct net *net)
 {
 	xt_proto_fini(net, NFPROTO_IPV6);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES6);
 }
 
 static struct pernet_operations ip6_tables_net_ops = {
--- /dev/null
+++ b/net/ipv6/netfilter/ip6_vzprivnet.c
@@ -0,0 +1,1134 @@
+/*
+ *  net/ipv6/netfilter/ip6_vzprivnet.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/vzprivnet.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/inet.h>
+#include <net/ipv6.h>
+
+static DEFINE_RWLOCK(vzpriv6lock);
+
+struct vzprivnet {
+	unsigned int netid;
+	int weak;
+	unsigned int subnet_preflen;
+	struct list_head list;
+	struct list_head entries;
+};
+
+static LIST_HEAD(sparse6_vzprivnets);
+
+struct vzprivnet_entry {
+	__u32 ip[4];
+	unsigned preflen;
+	struct vzprivnet *pn;
+	struct vzprivnet6_node *n;
+	struct list_head list;
+};
+
+struct vzprivnet6_node
+{
+	struct vzprivnet6_node	*parent;
+	struct vzprivnet6_node	*left;
+	struct vzprivnet6_node	*right;
+
+	struct vzprivnet_entry	*entry;
+
+	__u16			fn_bit;		/* bit key */
+	__u16			fn_flags;
+};
+
+struct vzprivnet internet = {
+	.weak = VZPRIVNET_INET,
+};
+
+#define RTN_RTINFO		1
+
+static struct vzprivnet_entry sparse6_null_entry = {
+	.preflen = 128,
+	.pn = &internet,
+};
+
+static struct vzprivnet6_node sparse6_root_node = {
+	.entry		= &sparse6_null_entry,
+	.fn_flags	= RTN_RTINFO,
+};
+
+static struct vzprivnet_entry legacy6_null_entry = {
+	.preflen = 128,
+	.pn = &internet,
+};
+
+static struct vzprivnet6_node legacy6_root_node = {
+	.entry		= &legacy6_null_entry,
+	.fn_flags	= RTN_RTINFO,
+};
+
+static LIST_HEAD(legacy6_vzprivnets);
+
+static inline int ip6_match(u32 *net, unsigned plen, u32 *ip)
+{
+	return ipv6_prefix_equal((const struct in6_addr *)net, (const struct in6_addr *)ip, plen);
+}
+
+static inline int ip6_intersect(u32 *ip1, unsigned len1, u32 *ip2, unsigned len2)
+{
+	return ip6_match(ip1, len1, ip2) || ip6_match(ip2, len2, ip1);
+}
+
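+/*
+ * Test bit @fn_bit of an IPv6 address, counting from the most
+ * significant bit.  The address is accessed as four big-endian
+ * 32-bit words, mirroring the fib6 tree helpers.
+ */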
+static __inline__ int addr_bit_set(void *ip, int fn_bit)
+{
+	__u32 *addr = ip;
+
+	return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
+}
+
+static __inline__ void vzprivnet6_node_free(struct vzprivnet6_node * fn)
+{
+	kfree(fn);
+}
+
+static __inline__ struct vzprivnet6_node * vzprivnet6_node_alloc(void)
+{
+	return kzalloc(sizeof(struct vzprivnet6_node), GFP_ATOMIC);
+}
+
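+/*
+ * Walk down the tree following the bits of @addr until there is no
+ * child in the required direction, then verify that the entry stored
+ * at that node actually covers @addr.  Returns NULL when no subnet
+ * matches.
+ */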
+static struct vzprivnet6_node * radix_tree_search(struct vzprivnet6_node *root,
+					struct in6_addr *addr)
+{
+	struct vzprivnet6_node *fn;
+	int dir;
+
+	fn = root;
+
+	for (;;) {
+		struct vzprivnet6_node *next;
+
+		dir = addr_bit_set(addr, fn->fn_bit);
+
+		next = dir ? fn->right : fn->left;
+		if (next) {
+			fn = next;
+			continue;
+		}
+
+		break;
+	}
+
+	if (ip6_match(fn->entry->ip, fn->entry->preflen, (u32 *)addr))
+		return fn;
+
+	return NULL;
+}
+
+static struct vzprivnet_entry *vzprivnet6_lookup(struct vzprivnet6_node *root,
+						u32 *ip)
+{
+	struct vzprivnet6_node *n;
+
+	n = radix_tree_search(root, (struct in6_addr *)ip);
+	return (n) ? n->entry : NULL;
+}
+
+static inline struct vzprivnet *vzprivnet6_lookup_net(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne == NULL)
+		pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+
+	if (pne != NULL)
+		return pne->pn;
+	else
+		return &internet;
+}
+
+static inline int noip(u32 *ip)
+{
+	return (ip[0] | ip[1] | ip[2] | ip[3]) == 0;
+}
+
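+/*
+ * Insert a new leaf for @addr/@plen.  Prefixes may not intersect an
+ * existing entry (-EEXIST).  Either a new leaf is hung off the bottom
+ * of the walk, or an intermediate node is created at the first bit
+ * where @addr diverges from the entry found on the path.
+ */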
+static struct vzprivnet6_node * radix_tree_add(void *addr, unsigned plen,
+						struct vzprivnet6_node *root)
+{
+	struct vzprivnet6_node *fn, *in, *ln;
+	struct vzprivnet6_node *pn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+	int	bit;
+	int	dir = 0;
+
+	/* insert node in tree */
+
+	fn = root;
+
+	do {
+		pne = fn->entry;
+		if (ip6_intersect(pne->ip, pne->preflen, (u32 *)addr, plen))
+			return ERR_PTR(-EEXIST);
+
+		/*
+		 *	Prefix match
+		 */
+		if (plen < fn->fn_bit ||
+		    !ipv6_prefix_equal((struct in6_addr *)pne->ip, addr, fn->fn_bit))
+			goto insert_intermediate_node;
+
+		dir = addr_bit_set(addr, fn->fn_bit);
+		pn = fn;
+		fn = dir ? fn->right : fn->left;
+	} while (fn);
+
+	/*
+	 *	We walked to the bottom of the tree.
+	 *	Create a new leaf node without children.
+	 */
+
+	ln = vzprivnet6_node_alloc();
+	if (ln == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ln->fn_bit = plen;
+	ln->parent = pn;
+
+	if (dir)
+		pn->right = ln;
+	else
+		pn->left  = ln;
+
+	return ln;
+
+insert_intermediate_node:
+
+	pn = fn->parent;
+
+	bit = ipv6_addr_diff(addr, (struct in6_addr *)pne->ip);
+
+	BUG_ON(plen <= bit);
+
+	/*
+	 *		(intermediate)[in]
+	 *	          /	   \
+	 *	(new leaf node)[ln] (old node)[fn]
+	 */
+	in = vzprivnet6_node_alloc();
+	ln = vzprivnet6_node_alloc();
+
+	if (in == NULL || ln == NULL) {
+		if (in)
+			vzprivnet6_node_free(in);
+		if (ln)
+			vzprivnet6_node_free(ln);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * new intermediate node.
+	 * RTN_RTINFO will be off
+	 */
+
+	in->fn_bit = bit;
+
+	in->parent = pn;
+	in->entry = fn->entry;
+
+	/* update parent pointer */
+	if (dir)
+		pn->right = in;
+	else
+		pn->left  = in;
+
+	ln->fn_bit = plen;
+
+	ln->parent = in;
+	fn->parent = in;
+
+	if (addr_bit_set(addr, bit)) {
+		in->right = ln;
+		in->left  = fn;
+	} else {
+		in->left  = ln;
+		in->right = fn;
+	}
+
+	return ln;
+}
+
+static struct vzprivnet6_node * sparse6_add_subnet(void *addr, unsigned plen)
+{
+	return radix_tree_add(addr, plen, &sparse6_root_node);
+}
+
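+/*
+ * Add to the "sparse" scheme: find or create network @netid, then
+ * either attach @ip/@preflen as a new subnet entry, mark the whole
+ * network weak, or fail with -EEXIST if an existing network is
+ * re-added without an address.
+ */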
+static int sparse6_add(unsigned netid, u32 *ip, unsigned preflen, int weak)
+{
+	int err;
+	struct vzprivnet *pn = NULL, *epn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+
+	err = -ENOMEM;
+	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
+	if (pn == NULL)
+		goto out;
+
+	pne = kzalloc(sizeof(*pne), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzpriv6lock);
+	list_for_each_entry(epn, &sparse6_vzprivnets, list)
+		if (epn->netid == netid) {
+			kfree(pn);
+			pn = epn;
+			goto found_net;
+		}
+
+	pn->netid = netid;
+	pn->weak = weak;
+	INIT_LIST_HEAD(&pn->entries);
+
+found_net:
+	if (!noip(ip)) {
+		struct vzprivnet6_node *n;
+
+		n = sparse6_add_subnet(ip, preflen);
+		if (IS_ERR(n)) {
+			err = PTR_ERR(n);
+			goto out_unlock;
+		}
+
+		n->entry = pne;
+		n->fn_flags |= RTN_RTINFO;
+
+		memcpy(pne->ip, ip, sizeof(pne->ip));
+		pne->preflen = preflen;
+		pne->pn = pn;
+		list_add_tail(&pne->list, &pn->entries);
+		pne->n = n;
+		pne = NULL;
+	} else if (weak == VZPRIVNET_WEAK) {
+		pn->weak = VZPRIVNET_WEAK;
+	} else if (pn == epn) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (pn != epn) {
+		list_add_tail(&pn->list, &sparse6_vzprivnets);
+		pn = NULL;
+	}
+
+	err = 0;
+
+out_unlock:
+	write_unlock_bh(&vzpriv6lock);
+out:
+	if (pn != epn)
+		kfree(pn);
+	kfree(pne);
+
+	return err;
+}
+
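+/*
+ * Unlink @fn, splicing its only child (if any) into the parent, then
+ * keep collapsing parent nodes upwards until one is reached that has
+ * two children or carries an RTN_RTINFO entry of its own.
+ */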
+static void radix_tree_del(struct vzprivnet6_node *fn)
+{
+	int children;
+	struct vzprivnet6_node *child, *pn;
+
+	BUG_ON(fn->parent == NULL);
+
+	for (;;) {
+		children = 0;
+		child = NULL;
+
+		if (fn->right) {
+			child = fn->right;
+			children |= 1;
+		}
+		if (fn->left) {
+			child = fn->left;
+			children |= 2;
+		}
+
+		if (children == 3)
+			return;
+
+		pn = fn->parent;
+		if (pn->right == fn)
+			pn->right = child;
+		else if (pn->left == fn)
+			pn->left = child;
+
+		if (child)
+			child->parent = pn;
+
+		vzprivnet6_node_free(fn);
+		if (pn->fn_flags & RTN_RTINFO)
+			return;
+
+		fn = pn;
+	}
+}
+
+
+static void vzprivnet6_del_entry(struct vzprivnet_entry *pne)
+{
+	radix_tree_del(pne->n);
+}
+
+
+static void sparse6_free_entry(struct vzprivnet_entry *pne)
+{
+	list_del(&pne->list);
+	vzprivnet6_del_entry(pne);
+	kfree(pne);
+}
+
+static void vzprivnet6_del_one(struct vzprivnet *pn)
+{
+	struct vzprivnet_entry *pne;
+
+	list_del(&pn->list);
+
+	while (!list_empty(&pn->entries)) {
+		pne = list_first_entry(&pn->entries,
+				struct vzprivnet_entry, list);
+		sparse6_free_entry(pne);
+	}
+
+	kfree(pn);
+}
+
+static void vzprivnet6_cleanup(void)
+{
+	struct vzprivnet *pn;
+
+	write_lock_bh(&vzpriv6lock);
+	while (!list_empty(&sparse6_vzprivnets)) {
+		pn = list_first_entry(&sparse6_vzprivnets,
+				struct vzprivnet, list);
+		vzprivnet6_del_one(pn);
+	}
+	while (!list_empty(&legacy6_vzprivnets)) {
+		pn = list_first_entry(&legacy6_vzprivnets,
+				struct vzprivnet, list);
+		vzprivnet6_del_one(pn);
+	}
+	write_unlock_bh(&vzpriv6lock);
+}
+
+static int sparse6_del_net(unsigned netid, int weak)
+{
+	struct vzprivnet *pn;
+
+	list_for_each_entry(pn, &sparse6_vzprivnets, list) {
+		if (pn->netid != netid)
+			continue;
+
+		if (weak == VZPRIVNET_WEAK)
+			pn->weak = VZPRIVNET_STRONG;
+		else
+			vzprivnet6_del_one(pn);
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int sparse6_del_ip(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne == NULL)
+		return -ENOENT;
+
+	sparse6_free_entry(pne);
+	return 0;
+}
+
+static int sparse6_del(unsigned netid, u32 *ip, int weak)
+{
+	int err;
+
+	write_lock_bh(&vzpriv6lock);
+	if (!noip(ip))
+		err = sparse6_del_ip(ip);
+	else
+		err = sparse6_del_net(netid, weak);
+	write_unlock_bh(&vzpriv6lock);
+
+	return err;
+}
+
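+/*
+ * Matches solicited-node multicast destinations (ff02::1:ffxx:xxxx);
+ * the hooks below let such neighbour-discovery traffic pass instead
+ * of classifying it.
+ */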
+static inline int is_ipv6_neighbour_solicit(const struct in6_addr *addr)
+{
+	/* see addrconf_addr_solict_mult */
+	return (addr->s6_addr32[0] == __constant_htonl(0xFF020000) &&
+		addr->s6_addr32[1] == 0 &&
+		addr->s6_addr32[2] == __constant_htonl(1) &&
+		(addr->s6_addr32[3] & __constant_htonl(0xFF000000)) == __constant_htonl(0xFF000000));
+}
+
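+/*
+ * Core verdict: both addresses are classified into private networks.
+ * Same network: accept when they share the network's subnet prefix
+ * (subnet_preflen is 0 for sparse networks, so any pair matches).
+ * Different networks: accept only when src->weak + dst->weak reaches
+ * the threshold (>= 3); everything else is dropped.  Packets whose
+ * source netns is not owned by the host (ve_is_super) are not
+ * filtered at all.
+ */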
+static unsigned int vzprivnet6_hook(struct sk_buff *skb, int can_be_bridge)
+{
+	int verdict = NF_DROP;
+	struct vzprivnet *dst, *src;
+	struct ipv6hdr *hdr;
+	struct net *src_net;
+
+	if (WARN_ON_ONCE(!skb->dev && !skb->sk))
+		return NF_ACCEPT;
+
+	src_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	if (!ve_is_super(src_net->owner_ve))
+		return NF_ACCEPT;
+
+	hdr = ipv6_hdr(skb);
+
+	if (can_be_bridge) {
+		if (!vzpn_handle_bridged &&
+				skb_dst(skb) != NULL &&
+				skb_dst(skb)->output != ip6_output)
+			return NF_ACCEPT;
+		if (is_ipv6_neighbour_solicit(&hdr->daddr))
+			return NF_ACCEPT;
+	}
+
+	read_lock(&vzpriv6lock);
+
+	src = vzprivnet6_lookup_net(hdr->saddr.in6_u.u6_addr32);
+	dst = vzprivnet6_lookup_net(hdr->daddr.in6_u.u6_addr32);
+
+	if (src == dst) {
+		if (ipv6_prefix_equal(&hdr->saddr, &hdr->daddr,
+				      src->subnet_preflen))
+			verdict = NF_ACCEPT;
+	} else if (src->weak + dst->weak >= 3)
+		verdict = NF_ACCEPT;
+
+	read_unlock(&vzpriv6lock);
+
+	return verdict;
+}
+
+static unsigned int vzprivnet6_fwd_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_hook(skb, 1);
+}
+
+static unsigned int vzprivnet6_host_hook(struct sk_buff *skb,
+		const struct net_device *dev, int can_be_bridge)
+{
+	if (!vzpn_filter_host)
+		return NF_ACCEPT;
+	if (!(dev->features & NETIF_F_VENET))
+		return NF_ACCEPT;
+
+	return vzprivnet6_hook(skb, can_be_bridge);
+}
+
+static unsigned int vzprivnet6_in_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_host_hook(skb, in, 0);
+}
+
+static unsigned int vzprivnet6_out_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_host_hook(skb, out, 1);
+}
+
+static struct nf_hook_ops vzprivnet6_ops[] = {
+	{
+		.hook = vzprivnet6_fwd_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_FORWARD,
+		.priority = NF_IP6_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet6_in_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_LOCAL_IN,
+		.priority = NF_IP6_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet6_out_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.priority = NF_IP6_PRI_FIRST
+	},
+};
+
+static char *nextline(char *s)
+{
+	while (*s && *s != '\n')
+		s++;
+	while (*s && *s == '\n')
+		s++;
+	return s;
+}
+
+static int parse_sparse6_add(const char *str, unsigned int *netid, u32 *ip, unsigned *preflen, int *weak)
+{
+	char *end;
+
+	*netid = simple_strtol(str, &end, 10);
+	if (is_eol(*end))
+		return 0;
+
+	if (*end != ':')
+		return -EINVAL;
+
+	str = end + 1;
+	if (*str == '*') {
+		if (!is_eol(*(str + 1)))
+			return -EINVAL;
+
+		*weak = VZPRIVNET_WEAK;
+		return 0;
+	}
+
+	if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*preflen = simple_strtol(str, &end, 10);
+	if (!is_eol(*end))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int parse_sparse6_remove(const char *str, unsigned int *netid, u32 *ip, int *weak)
+{
+	char *end;
+
+	if (strchr(str, ':') && !strchr(str, '*')) {
+		if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end)) {
+			printk(KERN_WARNING "Bad ip in %s\n", str);
+			return -EINVAL;
+		}
+
+		if (!is_eol(*end))
+			printk(KERN_WARNING "No EOL in %s\n", str);
+	} else {
+		*netid = simple_strtol(str, &end, 10);
+		if (end[0] == ':' && end[1] == '*') {
+			end += 2;
+			*weak = VZPRIVNET_WEAK;
+		}
+	}
+
+	return (is_eol(*end) ? 0 : -EINVAL);
+}
+
+static int parse_sparse6(const char *param, int *add,
+		unsigned int *netid, u32 *ip, unsigned *preflen, int *weak)
+{
+	if (param[0] == '+') {
+		*add = 1;
+		return parse_sparse6_add(param + 1, netid, ip, preflen, weak);
+	}
+
+	if (param[0] == '-') {
+		*add = 0;
+		return parse_sparse6_remove(param + 1, netid, ip, weak);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * +ID			to add a network
+ * +ID:<addr>/m		to add a subnet to network
+ * +ID:*		to make a network weak
+ * -ID			to remove the whole network
+ * -<addr>		to remove an IP or bounding subnet (from its network)
+ * -ID:*		to make a network "strong" ;)
+ *
+ *  No weak networks here!
+ */
+
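+/*
+ * Example of the syntax above (addresses are illustrative only),
+ * written to the "sparse6" proc file created in ip6_vzprivnet_init():
+ *
+ *	+1			create (empty) private network 1
+ *	+1:2001:db8:1::/48	attach subnet 2001:db8:1::/48 to network 1
+ *	+1:*			make network 1 weak
+ *	-2001:db8:1::1		drop the subnet covering this address
+ *	-1			remove network 1 and all its subnets
+ */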
+static ssize_t sparse6_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	if (copy_from_user(page, buf, count)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add, weak = VZPRIVNET_STRONG;
+		unsigned int netid = 0, preflen = 0;
+		u32 ip[4] = { 0, 0, 0, 0 };
+
+		err = parse_sparse6(s, &add, &netid, ip, &preflen, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = sparse6_add(netid, ip, preflen, weak);
+		else
+			err = sparse6_del(netid, ip, weak);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+
+}
+
+static void *sparse6_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	struct list_head *lh;
+	loff_t pos = *ppos;
+
+	read_lock(&vzpriv6lock);
+	list_for_each(lh, &sparse6_vzprivnets)
+		if (pos-- == 0)
+			return lh;
+
+	return NULL;
+}
+
+static void *sparse6_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	struct list_head *lh;
+
+	lh = ((struct list_head *)v)->next;
+	++*ppos;
+	return lh == &sparse6_vzprivnets ? NULL : lh;
+}
+
+static void sparse6_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock(&vzpriv6lock);
+}
+
+static int sparse6_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet *pn;
+	struct vzprivnet_entry *pne;
+
+	pn = list_entry(v, struct vzprivnet, list);
+	seq_printf(s, "%u: ", pn->netid);
+	if (pn->weak == VZPRIVNET_WEAK)
+		seq_puts(s, "* ");
+
+	list_for_each_entry(pne, &pn->entries, list)
+		seq_printf(s, "%pI6/%u ", pne->ip, pne->preflen);
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations sparse6_seq_ops = {
+	.start = sparse6_seq_start,
+	.next  = sparse6_seq_next,
+	.stop  = sparse6_seq_stop,
+	.show  = sparse6_seq_show,
+};
+
+static int sparse6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sparse6_seq_ops);
+}
+
+static struct file_operations proc_sparse6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = sparse6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = sparse6_write,
+};
+
+static char sample_ipv6[42];
+
+static ssize_t classify6_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int len;
+	char *tmp;
+
+	len = count;
+	if (len >= sizeof(sample_ipv6))
+		len = sizeof(sample_ipv6) - 1;
+
+	if (copy_from_user(sample_ipv6, buf, len))
+		return -EFAULT;
+
+	sample_ipv6[len] = '\0';
+	tmp = strchr(sample_ipv6, '\n');
+	if (tmp)
+		*tmp = '\0';
+
+	return count;
+}
+
+static int classify6_seq_show(struct seq_file *s, void *v)
+{
+	u32 ip[4];
+	struct vzprivnet_entry *pne;
+
+	seq_printf(s, "%s: ", sample_ipv6);
+
+	if (!in6_pton(sample_ipv6, sizeof(sample_ipv6), (u8 *)ip, -1, NULL)) {
+		seq_puts(s, "invalid IP\n");
+		return 0;
+	}
+
+	read_lock(&vzpriv6lock);
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne != NULL) {
+		seq_printf(s, "net %u, ", pne->pn->netid);
+		seq_printf(s, "rule %pI6/%u\n", pne->ip, pne->preflen);
+		goto out;
+	}
+
+	pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+	if (pne != NULL) {
+		seq_printf(s, "legacy %pI6/%u/%u\n",
+				pne->ip, pne->preflen, pne->pn->subnet_preflen);
+
+	} else
+		seq_printf(s, "internet\n");
+out:
+	read_unlock(&vzpriv6lock);
+	return 0;
+}
+
+static int classify6_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &classify6_seq_show, NULL);
+}
+
+static struct file_operations proc_classify6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = classify6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write	 = classify6_write,
+};
+
+static int legacy6_del(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	write_lock_bh(&vzpriv6lock);
+	pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+	if (pne == NULL) {
+		write_unlock_bh(&vzpriv6lock);
+		return -ENOENT;
+	}
+	vzprivnet6_del_one(pne->pn);
+	write_unlock_bh(&vzpriv6lock);
+
+	return 0;
+}
+
+static struct vzprivnet6_node * legacy6_add_subnet(void *addr, unsigned plen)
+{
+	return radix_tree_add(addr, plen, &legacy6_root_node);
+}
+
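+/*
+ * "Legacy" format entries carry two prefix lengths: @preflen bounds
+ * the network itself, while @subnet_preflen is the prefix hosts must
+ * additionally share to talk to each other (see vzprivnet6_hook()).
+ * Each entry gets its own single-entry vzprivnet (netid stays 0).
+ */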
+static int legacy6_add(u32 *ip, u32 preflen, u32 subnet_preflen)
+{
+	int err;
+	struct vzprivnet *pn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+	struct vzprivnet6_node *n;
+
+	err = -ENOMEM;
+	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
+	if (pn == NULL)
+		goto out;
+
+	pn->subnet_preflen = subnet_preflen;
+	INIT_LIST_HEAD(&pn->entries);
+
+	pne = kzalloc(sizeof(*pne), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzpriv6lock);
+	n = legacy6_add_subnet(ip, preflen);
+	if (IS_ERR(n)) {
+		err = PTR_ERR(n);
+		write_unlock_bh(&vzpriv6lock);
+		goto out;
+	}
+
+	n->entry = pne;
+	n->fn_flags |= RTN_RTINFO;
+
+	memcpy(pne->ip, ip, sizeof(struct in6_addr));
+	pne->preflen = preflen;
+	pne->pn = pn;
+	list_add_tail(&pne->list, &pn->entries);
+	pne->n = n;
+
+	list_add_tail(&pn->list, &legacy6_vzprivnets);
+	write_unlock_bh(&vzpriv6lock);
+
+	return 0;
+out:
+	kfree(pn);
+	kfree(pne);
+
+	return err;
+}
+
+static int parse_legacy6(char *param, int *add, u32 *ip,
+				unsigned *preflen, unsigned *subnet_preflen)
+{
+	char *str, *end;
+
+	if (param[0] == '+')
+		*add = 1;
+	else if (param[0] == '-')
+		*add = 0;
+	else
+		return -EINVAL;
+
+	str = param + 1;
+
+	if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*preflen = simple_strtol(str, &end, 10);
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*subnet_preflen = simple_strtol(str, &end, 10);
+	if (!is_eol(*end))
+		return -EINVAL;
+
+	if ((*preflen == 0) || (*preflen > 128) ||
+		(*subnet_preflen == 0) || (*subnet_preflen > 128))
+		return -EINVAL;
+
+	if (*subnet_preflen < *preflen)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t legacy6_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	if (copy_from_user(page, buf, count)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add;
+		unsigned int preflen = 0, subnet_preflen = 0;
+		u32 ip[4] = { 0, 0, 0, 0 };
+
+		err = parse_legacy6(s, &add, ip, &preflen, &subnet_preflen);
+		if (err)
+			goto out;
+
+		if (add)
+			err = legacy6_add(ip, preflen, subnet_preflen);
+		else
+			err = legacy6_del(ip);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *legacy6_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	struct list_head *lh;
+	loff_t pos = *ppos;
+
+	read_lock(&vzpriv6lock);
+	list_for_each(lh, &legacy6_vzprivnets)
+		if (pos-- == 0)
+			return lh;
+
+	return NULL;
+}
+
+static void *legacy6_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	struct list_head *lh;
+
+	lh = ((struct list_head *)v)->next;
+	++*ppos;
+	return lh == &legacy6_vzprivnets ? NULL : lh;
+}
+
+static void legacy6_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock(&vzpriv6lock);
+}
+
+static int legacy6_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet *pn;
+	struct vzprivnet_entry *pne;
+
+	pn = list_entry(v, struct vzprivnet, list);
+	list_for_each_entry(pne, &pn->entries, list)
+		seq_printf(s, "%pI6/%u/%u", pne->ip, pne->preflen,
+							pne->pn->subnet_preflen);
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations legacy6_seq_ops = {
+	.start = legacy6_seq_start,
+	.next  = legacy6_seq_next,
+	.stop  = legacy6_seq_stop,
+	.show  = legacy6_seq_show,
+};
+
+static int legacy6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &legacy6_seq_ops);
+}
+
+static struct file_operations proc_legacy6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = legacy6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = legacy6_write,
+};
+
+static int __init ip6_vzprivnet_init(void)
+{
+	int err = -ENOMEM;
+	struct proc_dir_entry *proc;
+
+	proc = proc_create("sparse6", 0644,
+			vzpriv_proc_dir, &proc_sparse6_ops);
+	if (proc == NULL)
+		goto err_sparse6;
+
+	proc = proc_create("classify6", 0644,
+			vzpriv_proc_dir, &proc_classify6_ops);
+	if (proc == NULL)
+		goto err_classify6;
+
+	proc = proc_create("legacy6", 0644,
+			vzpriv_proc_dir, &proc_legacy6_ops);
+	if (proc == NULL)
+		goto err_legacy6;
+
+	err = nf_register_hooks(vzprivnet6_ops, ARRAY_SIZE(vzprivnet6_ops));
+	if (err)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	remove_proc_entry("legacy6", vzpriv_proc_dir);
+err_legacy6:
+	remove_proc_entry("classify6", vzpriv_proc_dir);
+err_classify6:
+	remove_proc_entry("sparse6", vzpriv_proc_dir);
+err_sparse6:
+	return err;
+}
+
+static void __exit ip6_vzprivnet_exit(void)
+{
+	nf_unregister_hooks(vzprivnet6_ops, ARRAY_SIZE(vzprivnet6_ops));
+	remove_proc_entry("legacy6", vzpriv_proc_dir);
+	remove_proc_entry("classify6", vzpriv_proc_dir);
+	remove_proc_entry("sparse6", vzpriv_proc_dir);
+	vzprivnet6_cleanup();
+}
+
+module_init(ip6_vzprivnet_init)
+module_exit(ip6_vzprivnet_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -33,6 +33,7 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -432,13 +432,17 @@ static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
 static int synproxy_tg6_check(const struct xt_tgchk_param *par)
 {
 	const struct ip6t_entry *e = par->entryinfo;
+	int ret;
 
 	if (!(e->ipv6.flags & IP6T_F_PROTO) ||
 	    e->ipv6.proto != IPPROTO_TCP ||
 	    e->ipv6.invflags & XT_INV_PROTO)
 		return -EINVAL;
 
-	return nf_ct_l3proto_try_module_get(par->family);
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret == 0)
+		allow_conntrack_allocation(par->net);
+	return ret;
 }
 
 static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -61,12 +61,19 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	net->ipv6.ip6table_filter =
 		ip6t_register_table(net, &packet_filter, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv6.ip6table_filter))
+		net_ipt_module_set(net, VE_IP_FILTER6);
+
 	return PTR_RET(net->ipv6.ip6table_filter);
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
 	ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+	net->ipv6.ip6table_filter = NULL;
+
+	net_ipt_module_clear(net, VE_IP_FILTER6);
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -95,18 +95,31 @@ static int __net_init ip6table_mangle_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_MANGLE6))
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv6.ip6table_mangle =
 		ip6t_register_table(net, &packet_mangler, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv6.ip6table_mangle))
+		net_ipt_module_set(net, VE_IP_MANGLE6);
+
 	return PTR_RET(net->ipv6.ip6table_mangle);
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE6))
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+	net->ipv6.ip6table_mangle = NULL;
+
+	net_ipt_module_clear(net, VE_IP_MANGLE6);
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -115,6 +115,10 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT) ||
+	    !net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -125,7 +129,11 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_nat)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+	net->ipv6.ip6table_nat = NULL;
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -33,19 +33,34 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init ip6table_raw_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	struct xt_table *ip6table_raw;
+
+	if (WARN_ON(net->ipv6.ip6table_raw))
+		net->ipv6.ip6table_raw = NULL;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
 
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_raw =
-		ip6t_register_table(net, &packet_raw, repl);
+	ip6table_raw = ip6t_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	return PTR_RET(net->ipv6.ip6table_raw);
+
+	if (!IS_ERR(ip6table_raw))
+		net->ipv6.ip6table_raw = ip6table_raw;
+
+	return PTR_RET(ip6table_raw);
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_raw)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+
+	net->ipv6.ip6table_raw = NULL;
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -348,10 +348,6 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 {
 	struct nf_log_buf *m;
 
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
 	m = nf_log_buf_open();
 
 	if (!loginfo)
@@ -365,7 +361,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 
 	dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
 
-	nf_log_buf_close(m);
+	nf_log_buf_close(m, net->owner_ve);
 }
 
 static struct nf_logger nf_ip6_logger __read_mostly = {
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -278,7 +278,7 @@ int snmp6_register_dev(struct inet6_dev *idev)
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
-	p = proc_create_data(idev->dev->name, S_IRUGO,
+	p = proc_net_create_data(idev->dev->name, S_IRUGO,
 			     net->mib.proc_net_devsnmp6,
 			     &snmp6_dev_seq_fops, idev);
 	if (!p)
@@ -302,14 +302,14 @@ int snmp6_unregister_dev(struct inet6_dev *idev)
 
 static int __net_init ipv6_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat6", S_IRUGO, net->proc_net,
+	if (!proc_net_create("sockstat6", S_IRUGO, net->proc_net,
 			 &sockstat6_seq_fops))
 		return -ENOMEM;
 
-	if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops))
+	if (!proc_net_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops))
 		goto proc_snmp6_fail;
 
-	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+	net->mib.proc_net_devsnmp6 = proc_net_mkdir(net, "dev_snmp6", net->proc_net);
 	if (!net->mib.proc_net_devsnmp6)
 		goto proc_dev_snmp6_fail;
 	return 0;
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -63,11 +63,12 @@
 #include <linux/seq_file.h>
 #include <linux/export.h>
 
-static struct raw_hashinfo raw_v6_hashinfo = {
+struct raw_hashinfo raw_v6_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
 };
+EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
-static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 		unsigned short num, const struct in6_addr *loc_addr,
 		const struct in6_addr *rmt_addr, int dif)
 {
@@ -100,6 +101,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 found:
 	return sk;
 }
+EXPORT_SYMBOL_GPL(__raw_v6_lookup);
 
 /*
  *	0 - deliver
@@ -972,6 +974,11 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case IPV6_HDRINCL:
+		if (sk->sk_type != SOCK_RAW)
+			return -EINVAL;
+		inet_sk(sk)->hdrincl = !!val;
+		return 0;
 	case IPV6_CHECKSUM:
 		if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
 		    level == IPPROTO_IPV6) {
@@ -1016,7 +1023,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return ipv6_setsockopt(sk, level, optname, optval, optlen);
@@ -1037,7 +1045,8 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return compat_ipv6_setsockopt(sk, level, optname,
@@ -1057,6 +1066,9 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case IPV6_HDRINCL:
+		val = inet_sk(sk)->hdrincl;
+		break;
 	case IPV6_CHECKSUM:
 		/*
 		 * We allow getsockopt() for IPPROTO_IPV6-level
@@ -1094,7 +1106,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return ipv6_getsockopt(sk, level, optname, optval, optlen);
@@ -1115,7 +1128,8 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return compat_ipv6_getsockopt(sk, level, optname,
@@ -1214,7 +1228,7 @@ struct proto rawv6_prot = {
 	.close		   = rawv6_close,
 	.destroy	   = raw6_destroy,
 	.connect	   = ip6_datagram_connect,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = __udp_disconnect,
 	.ioctl		   = rawv6_ioctl,
 	.init		   = rawv6_init_sk,
 	.setsockopt	   = rawv6_setsockopt,
@@ -1270,7 +1284,7 @@ static const struct file_operations raw6_seq_fops = {
 
 static int __net_init raw6_init_net(struct net *net)
 {
-	if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops))
+	if (!proc_net_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops))
 		return -ENOMEM;
 
 	return 0;
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1184,7 +1184,7 @@ static struct dst_entry *ip6_route_input_lookup(struct net *net,
 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
 
-void ip6_route_input(struct sk_buff *skb)
+void __ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct net *net = dev_net(skb->dev);
@@ -1192,7 +1192,7 @@ void ip6_route_input(struct sk_buff *skb)
 	struct ip_tunnel_info *tun_info;
 	struct flowi6 fl6 = {
 		.flowi6_iif = skb->dev->ifindex,
-		.daddr = iph->daddr,
+		.daddr = *daddr,
 		.saddr = iph->saddr,
 		.flowlabel = ip6_flowinfo(iph),
 		.flowi6_mark = skb->mark,
@@ -1205,6 +1205,12 @@ void ip6_route_input(struct sk_buff *skb)
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
+EXPORT_SYMBOL(__ip6_route_input);
+
+void ip6_route_input(struct sk_buff *skb)
+{
+	__ip6_route_input(skb, &ipv6_hdr(skb)->daddr);
+}
 
 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 					     struct flowi6 *fl6, int flags)
@@ -3660,8 +3666,8 @@ static void __net_exit ip6_route_net_exit(struct net *net)
 static int __net_init ip6_route_net_init_late(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
-	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
+	proc_net_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
+	proc_net_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
 #endif
 	return 0;
 }
@@ -3729,6 +3735,29 @@ void __init ip6_route_init_special_entries(void)
   #endif
 }
 
+#if 0
+static void ip6_rt_dump_dst(void *o)
+{
+	struct rt6_info *r = (struct rt6_info *)o;
+
+	if (r->dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&r->dst);
+	printk("\tflags %x ref %d prot %d\n",
+			r->rt6i_flags, atomic_read(&r->rt6i_ref),
+			(int)r->rt6i_protocol);
+}
+#endif
+
+static void _ip6_rt_dump_dsts(void)
+{
+	printk("IPv6 dst cache:\n");
+	//FIXME
+	//slab_obj_walk(ip6_dst_ops_template.kmem_cachep, ip6_rt_dump_dst);
+}
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -3737,7 +3766,7 @@ int __init ip6_route_init(void)
 	ret = -ENOMEM;
 	ip6_dst_ops_template.kmem_cachep =
 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
-				  SLAB_HWCACHE_ALIGN, NULL);
+				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 	if (!ip6_dst_ops_template.kmem_cachep)
 		goto out;
 
@@ -3788,6 +3817,7 @@ int __init ip6_route_init(void)
 		spin_lock_init(&ul->lock);
 	}
 
+	ip6_rt_dump_dsts = _ip6_rt_dump_dsts;
 out:
 	return ret;
 
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -35,6 +35,8 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -98,6 +100,9 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
 	struct ip_tunnel *t;
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
+	if (sitn == NULL)
+		return NULL;
+
 	for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
@@ -303,8 +308,8 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
 	/* For simple GET or for root users,
 	 * we try harder to allocate.
 	 */
-	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
-		kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+	kp = (cmax <= 1 || ve_capable(CAP_NET_ADMIN)) ?
+		kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
 		NULL;
 
 	rcu_read_lock();
@@ -317,7 +322,8 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
 		 * For root users, retry allocating enough memory for
 		 * the answer.
 		 */
-		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT |
+				__GFP_NOWARN);
 		if (!kp) {
 			ret = -ENOMEM;
 			goto out;
@@ -1468,6 +1474,9 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
 #endif
 	int err;
 
+	if (net_generic(net, sit_net_id) == NULL)
+		return -EACCES;
+
 	nt = netdev_priv(dev);
 
 	if (ipip6_netlink_encap_parms(data, &ipencap)) {
@@ -1716,6 +1725,9 @@ static int __net_init sit_init_net(struct net *net)
 	struct ip_tunnel *t;
 	int err;
 
+	if (!(net->owner_ve->features & VE_FEATURE_SIT))
+		return net_assign_generic(net, sit_net_id, NULL);
+
 	sitn->tunnels[0] = sitn->tunnels_wc;
 	sitn->tunnels[1] = sitn->tunnels_l;
 	sitn->tunnels[2] = sitn->tunnels_r;
@@ -1753,12 +1765,17 @@ static int __net_init sit_init_net(struct net *net)
 
 static void __net_exit sit_exit_net(struct net *net)
 {
+	struct sit_net *sitn = net_generic(net, sit_net_id);
 	LIST_HEAD(list);
 
+	if (sitn == NULL) /* no VE_FEATURE_SIT */
+		return;
+
 	rtnl_lock();
 	sit_destroy_tunnels(net, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
+	net_assign_generic(net, sit_net_id, NULL);
 }
 
 static struct pernet_operations sit_net_ops = {
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1307,7 +1307,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
 		if (opt_skb)
 			goto ipv6_pktoptions;
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1329,7 +1329,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 				goto reset;
 			if (opt_skb)
 				__kfree_skb(opt_skb);
-			return 0;
+			goto restore_context;
 		}
 	} else
 		sock_rps_save_rxhash(sk, skb);
@@ -1338,6 +1338,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		goto reset;
 	if (opt_skb)
 		goto ipv6_pktoptions;
+
+restore_context:
 	return 0;
 
 reset:
@@ -1346,7 +1348,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (opt_skb)
 		__kfree_skb(opt_skb);
 	kfree_skb(skb);
-	return 0;
+	goto restore_context;
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
@@ -1380,7 +1382,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	kfree_skb(opt_skb);
-	return 0;
+	goto restore_context;
 }
 
 static int tcp_v6_rcv(struct sk_buff *skb)
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -52,6 +52,7 @@
 #include <linux/seq_file.h>
 #include <trace/events/skb.h>
 #include "udp_impl.h"
+#include <net/udp_memcontrol.h>
 
 static unsigned int udp6_ehashfn(struct net *net,
 				  const struct in6_addr *laddr,
@@ -1408,6 +1409,7 @@ void udpv6_destroy_sock(struct sock *sk)
 	}
 
 	inet6_destroy_sock(sk);
+	sock_release_memcg(sk);
 }
 
 /*
@@ -1544,6 +1546,9 @@ struct proto udpv6_prot = {
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
 	.clear_sk	   = udp_v6_clear_sk,
+#ifdef CONFIG_MEMCG_KMEM
+	.proto_cgroup		= udp_proto_cgroup,
+#endif
 };
 
 static struct inet_protosw udpv6_protosw = {
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3779,7 +3779,7 @@ static int __net_init pfkey_init_proc(struct net *net)
 {
 	struct proc_dir_entry *e;
 
-	e = proc_create("pfkey", 0, net->proc_net, &pfkey_proc_ops);
+	e = proc_net_create("pfkey", 0, net->proc_net, &pfkey_proc_ops);
 	if (e == NULL)
 		return -ENOMEM;
 
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -339,7 +339,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags)
 	if (sock_flag(sk, SOCK_ZAPPED))
 		return 0;
 
-	return udp_disconnect(sk, flags);
+	return __udp_disconnect(sk, flags);
 }
 
 static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -404,7 +404,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags)
 	if (sock_flag(sk, SOCK_ZAPPED))
 		return 0;
 
-	return udp_disconnect(sk, flags);
+	return __udp_disconnect(sk, flags);
 }
 
 static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -67,7 +67,6 @@
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/kthread.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/jiffies.h>
@@ -98,6 +97,8 @@
 #include <net/udp.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <asm/byteorder.h>
 #include <linux/atomic.h>
@@ -549,6 +550,9 @@ static int pppol2tp_create(struct net *net, struct socket *sock)
 	int error = -ENOMEM;
 	struct sock *sk;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
 	if (!sk)
 		goto out;
@@ -1746,7 +1750,7 @@ static __net_init int pppol2tp_init_net(struct net *net)
 	struct proc_dir_entry *pde;
 	int err = 0;
 
-	pde = proc_create("pppol2tp", S_IRUGO, net->proc_net,
+	pde = proc_net_create("pppol2tp", S_IRUGO, net->proc_net,
 			  &pppol2tp_proc_fops);
 	if (!pde) {
 		err = -ENOMEM;
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1419,6 +1419,12 @@ config NETFILTER_XT_MATCH_U32
 
 	  Details and examples are in the kernel module source.
 
+config NETFILTER_XT_MATCH_WDOG_TMO
+	tristate '"wdog_tmo" watchdog timer match'
+	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK && FENCE_WATCHDOG
+	help
+	  This option adds the "wdog_tmo" watchdog timer match module.
+
 endif # NETFILTER_XTABLES
 
 endmenu
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -165,6 +165,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_WDOG_TMO) += xt_wdog_tmo.o
 
 # ipset
 obj-$(CONFIG_IP_SET) += ipset/
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -74,6 +74,8 @@ int nf_register_hook(struct nf_hook_ops *reg)
 	struct nf_hook_ops *elem;
 	int err;
 
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	err = mutex_lock_interruptible(&nf_hook_mutex);
 	if (err < 0)
 		return err;
@@ -92,6 +94,8 @@ EXPORT_SYMBOL(nf_register_hook);
 
 void nf_unregister_hook(struct nf_hook_ops *reg)
 {
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	mutex_lock(&nf_hook_mutex);
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -261,7 +261,8 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
 }
 
 static int
-bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		 u32 flags)
 {
 	struct bitmap_ip *map;
 	u32 first_ip, last_ip, hosts, cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -352,7 +352,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
 }
 
 static int
-bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
+bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		    u32 flags)
 {
 	u32 first_ip, last_ip, cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -248,7 +248,8 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
 }
 
 static int
-bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		   u32 flags)
 {
 	struct bitmap_port *map;
 	u16 first_port, last_port;
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -17,6 +17,8 @@
 #include <linux/spinlock.h>
 #include <linux/rculist.h>
 #include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter/x_tables.h>
@@ -27,8 +29,17 @@ static LIST_HEAD(ip_set_type_list);		/* all registered set types */
 static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
 static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
 
-static struct ip_set * __rcu *ip_set_list;	/* all individual sets */
-static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
+struct ip_set_net {
+	struct ip_set * __rcu *ip_set_list;	/* all individual sets */
+	ip_set_id_t	ip_set_max;	/* max number of sets */
+	int		is_deleted;	/* deleted by ip_set_net_exit */
+};
+static int ip_set_net_id __read_mostly;
+
+static inline struct ip_set_net *ip_set_pernet(struct net *net)
+{
+	return net_generic(net, ip_set_net_id);
+}
 
 #define IP_SET_INC	64
 #define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
@@ -45,8 +56,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 /* When the nfnl mutex is held: */
 #define ip_set_dereference(p)		\
 	rcu_dereference_protected(p, 1)
-#define ip_set(id)		\
-	ip_set_dereference(ip_set_list)[id]
+#define ip_set(inst, id)			\
+	ip_set_dereference((inst)->ip_set_list)[id]
 
 /*
  * The set types are implemented in modules and registered set types
@@ -374,13 +385,14 @@ __ip_set_put(struct ip_set *set)
  */
 
 static inline struct ip_set *
-ip_set_rcu_get(ip_set_id_t index)
+ip_set_rcu_get(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
 	rcu_read_lock();
 	/* ip_set_list itself needs to be protected */
-	set = rcu_dereference(ip_set_list)[index];
+	set = rcu_dereference(inst->ip_set_list)[index];
 	rcu_read_unlock();
 
 	return set;
@@ -390,7 +402,8 @@ int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -428,7 +441,8 @@ int
 ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret;
 
 	BUG_ON(set == NULL);
@@ -450,7 +464,8 @@ int
 ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -474,14 +489,15 @@ EXPORT_SYMBOL_GPL(ip_set_del);
  *
  */
 ip_set_id_t
-ip_set_get_byname(const char *name, struct ip_set **set)
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
 {
 	ip_set_id_t i, index = IPSET_INVALID_ID;
 	struct ip_set *s;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
 	rcu_read_lock();
-	for (i = 0; i < ip_set_max; i++) {
-		s = rcu_dereference(ip_set_list)[i];
+	for (i = 0; i < inst->ip_set_max; i++) {
+		s = rcu_dereference(inst->ip_set_list)[i];
 		if (s != NULL && STREQ(s->name, name)) {
 			__ip_set_get(s);
 			index = i;
@@ -501,17 +517,26 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);
  * to be valid, after calling this function.
  *
  */
-void
-ip_set_put_byindex(ip_set_id_t index)
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
 {
 	struct ip_set *set;
 
 	rcu_read_lock();
-	set = rcu_dereference(ip_set_list)[index];
+	set = rcu_dereference(inst->ip_set_list)[index];
 	if (set != NULL)
 		__ip_set_put(set);
 	rcu_read_unlock();
 }
+
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	__ip_set_put_byindex(inst, index);
+}
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
 /*
@@ -522,9 +547,9 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex);
  *
  */
 const char *
-ip_set_name_byindex(ip_set_id_t index)
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
 {
-	const struct ip_set *set = ip_set_rcu_get(index);
+	const struct ip_set *set = ip_set_rcu_get(net, index);
 
 	BUG_ON(set == NULL);
 	BUG_ON(set->ref == 0);
@@ -546,15 +571,16 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex);
  * The nfnl mutex is used in the function.
  */
 ip_set_id_t
-ip_set_nfnl_get_byindex(ip_set_id_t index)
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
-	if (index > ip_set_max)
+	if (index > inst->ip_set_max)
 		return IPSET_INVALID_ID;
 
 	nfnl_lock(NFNL_SUBSYS_IPSET);
-	set = ip_set(index);
+	set = ip_set(inst, index);
 	if (set)
 		__ip_set_get(set);
 	else
@@ -573,13 +599,17 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
  * The nfnl mutex is used in the function.
  */
 void
-ip_set_nfnl_put(ip_set_id_t index)
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
 	nfnl_lock(NFNL_SUBSYS_IPSET);
-	set = ip_set(index);
-	if (set != NULL)
-		__ip_set_put(set);
+	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+		set = ip_set(inst, index);
+		if (set != NULL)
+			__ip_set_put(set);
+	}
 	nfnl_unlock(NFNL_SUBSYS_IPSET);
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
@@ -637,14 +667,14 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
 };
 
 static struct ip_set *
-find_set_and_id(const char *name, ip_set_id_t *id)
+find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
 {
 	struct ip_set *set = NULL;
 	ip_set_id_t i;
 
 	*id = IPSET_INVALID_ID;
-	for (i = 0; i < ip_set_max; i++) {
-		set = ip_set(i);
+	for (i = 0; i < inst->ip_set_max; i++) {
+		set = ip_set(inst, i);
 		if (set != NULL && STREQ(set->name, name)) {
 			*id = i;
 			break;
@@ -654,22 +684,23 @@ find_set_and_id(const char *name, ip_set_id_t *id)
 }
 
 static inline struct ip_set *
-find_set(const char *name)
+find_set(struct ip_set_net *inst, const char *name)
 {
 	ip_set_id_t id;
 
-	return find_set_and_id(name, &id);
+	return find_set_and_id(inst, name, &id);
 }
 
 static int
-find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
+find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
+	     struct ip_set **set)
 {
 	struct ip_set *s;
 	ip_set_id_t i;
 
 	*index = IPSET_INVALID_ID;
-	for (i = 0;  i < ip_set_max; i++) {
-		s = ip_set(i);
+	for (i = 0;  i < inst->ip_set_max; i++) {
+		s = ip_set(inst, i);
 		if (s == NULL) {
 			if (*index == IPSET_INVALID_ID)
 				*index = i;
@@ -698,6 +729,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct net *net = sock_net(ctnl);
+	struct ip_set_net *inst = ip_set_pernet(net);
 	struct ip_set *set, *clash = NULL;
 	ip_set_id_t index = IPSET_INVALID_ID;
 	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
@@ -756,7 +789,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		goto put_out;
 	}
 
-	ret = set->type->create(set, tb, flags);
+	ret = set->type->create(net, set, tb, flags);
 	if (ret != 0)
 		goto put_out;
 
@@ -767,7 +800,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * by the nfnl mutex. Find the first free index in ip_set_list
 	 * and check clashing.
 	 */
-	ret = find_free_id(set->name, &index, &clash);
+	ret = find_free_id(inst, set->name, &index, &clash);
 	if (ret == -EEXIST) {
 		/* If this is the same set and requested, ignore error */
 		if ((flags & IPSET_FLAG_EXIST) &&
@@ -780,9 +813,9 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		goto cleanup;
 	} else if (ret == -IPSET_ERR_MAX_SETS) {
 		struct ip_set **list, **tmp;
-		ip_set_id_t i = ip_set_max + IP_SET_INC;
+		ip_set_id_t i = inst->ip_set_max + IP_SET_INC;
 
-		if (i < ip_set_max || i == IPSET_INVALID_ID)
+		if (i < inst->ip_set_max || i == IPSET_INVALID_ID)
 			/* Wraparound */
 			goto cleanup;
 
@@ -790,14 +823,14 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		if (!list)
 			goto cleanup;
 		/* nfnl mutex is held, both lists are valid */
-		tmp = ip_set_dereference(ip_set_list);
-		memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max);
-		rcu_assign_pointer(ip_set_list, list);
+		tmp = ip_set_dereference(inst->ip_set_list);
+		memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
+		rcu_assign_pointer(inst->ip_set_list, list);
 		/* Make sure all current packets have passed through */
 		synchronize_net();
 		/* Use new list */
-		index = ip_set_max;
-		ip_set_max = i;
+		index = inst->ip_set_max;
+		inst->ip_set_max = i;
 		kfree(tmp);
 		ret = 0;
 	} else if (ret)
@@ -807,7 +840,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * Finally! Add our shiny new set to the list, and be done.
 	 */
 	pr_debug("create: '%s' created with index %u!\n", set->name, index);
-	ip_set(index) = set;
+	ip_set(inst, index) = set;
 
 	return ret;
 
@@ -830,12 +863,12 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
 };
 
 static void
-ip_set_destroy_set(ip_set_id_t index)
+ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
 {
-	struct ip_set *set = ip_set(index);
+	struct ip_set *set = ip_set(inst, index);
 
 	pr_debug("set: %s\n",  set->name);
-	ip_set(index) = NULL;
+	ip_set(inst, index) = NULL;
 
 	/* Must call it without holding any lock */
 	set->variant->destroy(set);
@@ -848,6 +881,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	       const struct nlmsghdr *nlh,
 	       const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *s;
 	ip_set_id_t i;
 	int ret = 0;
@@ -867,21 +901,22 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	 */
 	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL && s->ref) {
 				ret = -IPSET_ERR_BUSY;
 				goto out;
 			}
 		}
 		read_unlock_bh(&ip_set_ref_lock);
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL)
-				ip_set_destroy_set(i);
+				ip_set_destroy_set(inst, i);
 		}
 	} else {
-		s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i);
+		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+				    &i);
 		if (s == NULL) {
 			ret = -ENOENT;
 			goto out;
@@ -891,7 +926,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 		}
 		read_unlock_bh(&ip_set_ref_lock);
 
-		ip_set_destroy_set(i);
+		ip_set_destroy_set(inst, i);
 	}
 	return 0;
 out:
@@ -916,6 +951,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh,
 	     const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *s;
 	ip_set_id_t i;
 
@@ -923,13 +959,13 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 		return -IPSET_ERR_PROTOCOL;
 
 	if (!attr[IPSET_ATTR_SETNAME]) {
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL)
 				ip_set_flush_set(s);
 		}
 	} else {
-		s = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+		s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 		if (s == NULL)
 			return -ENOENT;
 
@@ -955,6 +991,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set, *s;
 	const char *name2;
 	ip_set_id_t i;
@@ -965,7 +1002,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -976,8 +1013,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	}
 
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
-	for (i = 0; i < ip_set_max; i++) {
-		s = ip_set(i);
+	for (i = 0; i < inst->ip_set_max; i++) {
+		s = ip_set(inst, i);
 		if (s != NULL && STREQ(s->name, name2)) {
 			ret = -IPSET_ERR_EXIST_SETNAME2;
 			goto out;
@@ -1004,6 +1041,7 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *from, *to;
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
@@ -1013,11 +1051,13 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id);
+	from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+			       &from_id);
 	if (from == NULL)
 		return -ENOENT;
 
-	to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id);
+	to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
+			     &to_id);
 	if (to == NULL)
 		return -IPSET_ERR_EXIST_SETNAME2;
 
@@ -1034,8 +1074,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
 	write_lock_bh(&ip_set_ref_lock);
 	swap(from->ref, to->ref);
-	ip_set(from_id) = to;
-	ip_set(to_id) = from;
+	ip_set(inst, from_id) = to;
+	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
 	return 0;
@@ -1054,9 +1094,10 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 static int
 ip_set_dump_done(struct netlink_callback *cb)
 {
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 	if (cb->args[2]) {
-		pr_debug("release set %s\n", ip_set(cb->args[1])->name);
-		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
+		pr_debug("release set %s\n", ip_set(inst, cb->args[1])->name);
+		__ip_set_put_byindex(inst, (ip_set_id_t) cb->args[1]);
 	}
 	return 0;
 }
@@ -1082,6 +1123,7 @@ dump_init(struct netlink_callback *cb)
 	struct nlattr *attr = (void *)nlh + min_len;
 	u32 dump_type;
 	ip_set_id_t index;
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 
 	/* Second pass, so parser can't fail */
 	nla_parse(cda, IPSET_ATTR_CMD_MAX,
@@ -1095,7 +1137,7 @@ dump_init(struct netlink_callback *cb)
 	if (cda[IPSET_ATTR_SETNAME]) {
 		struct ip_set *set;
 
-		set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]),
+		set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
 				      &index);
 		if (set == NULL)
 			return -ENOENT;
@@ -1123,6 +1165,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
 	u32 dump_type, dump_flags;
 	int ret = 0;
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 
 	if (!cb->args[0]) {
 		ret = dump_init(cb);
@@ -1136,18 +1179,18 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 		}
 	}
 
-	if (cb->args[1] >= ip_set_max)
+	if (cb->args[1] >= inst->ip_set_max)
 		goto out;
 
 	dump_type = DUMP_TYPE(cb->args[0]);
 	dump_flags = DUMP_FLAGS(cb->args[0]);
-	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : inst->ip_set_max;
 dump_last:
 	pr_debug("args[0]: %u %u args[1]: %ld\n",
 		 dump_type, dump_flags, cb->args[1]);
 	for (; cb->args[1] < max; cb->args[1]++) {
 		index = (ip_set_id_t) cb->args[1];
-		set = ip_set(index);
+		set = ip_set(inst, index);
 		if (set == NULL) {
 			if (dump_type == DUMP_ONE) {
 				ret = -ENOENT;
@@ -1225,8 +1268,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[2]) {
-		pr_debug("release set %s\n", ip_set(index)->name);
-		ip_set_put_byindex(index);
+		pr_debug("release set %s\n", ip_set(inst, index)->name);
+		__ip_set_put_byindex(inst, index);
 		cb->args[2] = 0;
 	}
 out:
@@ -1244,6 +1287,8 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -1251,6 +1296,7 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 		struct netlink_dump_control c = {
 			.dump = ip_set_dump_start,
 			.done = ip_set_dump_done,
+			.data = (void *)inst
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
@@ -1329,6 +1375,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	const struct nlattr *nla;
@@ -1347,7 +1394,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
 		       attr[IPSET_ATTR_LINENO] == NULL))))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1383,6 +1430,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	const struct nlattr *nla;
@@ -1401,7 +1449,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
 		       attr[IPSET_ATTR_LINENO] == NULL))))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1437,6 +1485,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh,
 	     const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	int ret = 0;
@@ -1447,7 +1496,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 		     !flag_nested(attr[IPSET_ATTR_DATA])))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1473,6 +1522,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	const struct ip_set *set;
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
@@ -1482,7 +1532,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1707,8 +1757,10 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	unsigned int *op;
 	void *data;
 	int copylen = *len, ret = 0;
+	struct net *net = sock_net(sk);
+	struct ip_set_net *inst = ip_set_pernet(net);
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 	if (optval != SO_IP_SET)
 		return -EBADF;
@@ -1757,7 +1809,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		}
 		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
 		nfnl_lock(NFNL_SUBSYS_IPSET);
-		find_set_and_id(req_get->set.name, &id);
+		find_set_and_id(inst, req_get->set.name, &id);
 		req_get->set.index = id;
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
 		goto copy;
@@ -1767,12 +1819,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		struct ip_set *set;
 
 		if (*len != sizeof(struct ip_set_req_get_set) ||
-		    req_get->set.index >= ip_set_max) {
+		    req_get->set.index >= inst->ip_set_max) {
 			ret = -EINVAL;
 			goto done;
 		}
 		nfnl_lock(NFNL_SUBSYS_IPSET);
-		set = ip_set(req_get->set.index);
+		set = ip_set(inst, req_get->set.index);
 		strncpy(req_get->set.name, set ? set->name : "",
 			IPSET_MAXNAMELEN);
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
@@ -1801,49 +1853,82 @@ static struct nf_sockopt_ops so_set __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init
-ip_set_init(void)
+static int __net_init
+ip_set_net_init(struct net *net)
 {
+	struct ip_set_net *inst = ip_set_pernet(net);
+
 	struct ip_set **list;
-	int ret;
 
-	if (max_sets)
-		ip_set_max = max_sets;
-	if (ip_set_max >= IPSET_INVALID_ID)
-		ip_set_max = IPSET_INVALID_ID - 1;
+	inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
+	if (inst->ip_set_max >= IPSET_INVALID_ID)
+		inst->ip_set_max = IPSET_INVALID_ID - 1;
 
-	list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL);
+	list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
 	if (!list)
 		return -ENOMEM;
+	inst->is_deleted = 0;
+	rcu_assign_pointer(inst->ip_set_list, list);
+	return 0;
+}
+
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	struct ip_set *set = NULL;
+	ip_set_id_t i;
 
-	rcu_assign_pointer(ip_set_list, list);
-	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+
+	for (i = 0; i < inst->ip_set_max; i++) {
+		set = ip_set(inst, i);
+		if (set != NULL)
+			ip_set_destroy_set(inst, i);
+	}
+	kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+}
+
+static struct pernet_operations ip_set_net_ops = {
+	.init	= ip_set_net_init,
+	.exit   = ip_set_net_exit,
+	.id	= &ip_set_net_id,
+	.size	= sizeof(struct ip_set_net)
+};
+
+
+static int __init
+ip_set_init(void)
+{
+	int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
 	if (ret != 0) {
 		pr_err("ip_set: cannot register with nfnetlink.\n");
-		kfree(list);
 		return ret;
 	}
 	ret = nf_register_sockopt(&so_set);
 	if (ret != 0) {
 		pr_err("SO_SET registry failed: %d\n", ret);
 		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-		kfree(list);
 		return ret;
 	}
-
-	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
+	ret = register_pernet_subsys(&ip_set_net_ops);
+	if (ret) {
+		pr_err("ip_set: cannot register pernet_subsys.\n");
+		nf_unregister_sockopt(&so_set);
+		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+		return ret;
+	}
+	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
 	return 0;
 }
 
 static void __exit
 ip_set_fini(void)
 {
-	struct ip_set **list = rcu_dereference_protected(ip_set_list, 1);
-
-	/* There can't be any existing set */
+	unregister_pernet_subsys(&ip_set_net_ops);
 	nf_unregister_sockopt(&so_set);
 	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-	kfree(list);
 	pr_debug("these are the famous last words\n");
 }
 
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -946,7 +946,8 @@ static const struct ip_set_type_variant mtype_variant = {
 
 #ifdef IP_SET_EMIT_CREATE
 static int
-TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
+TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
+			    struct nlattr *tb[], u32 flags)
 {
 	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
 	u32 cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -63,6 +63,7 @@ struct list_set {
 	u32 size;		/* size of set list array */
 	u32 timeout;		/* timeout value */
 	struct timer_list gc;	/* garbage collection */
+	struct net *net;	/* namespace */
 	struct set_elem members[0]; /* the set members */
 };
 
@@ -204,13 +205,13 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
 	if (e->id != IPSET_INVALID_ID) {
 		if (i == map->size - 1)
 			/* Last element replaced: e.g. add new,before,last */
-			ip_set_put_byindex(e->id);
+			ip_set_put_byindex(map->net, e->id);
 		else {
 			struct set_elem *x = list_set_elem(map, map->size - 1);
 
 			/* Last element pushed off */
 			if (x->id != IPSET_INVALID_ID)
-				ip_set_put_byindex(x->id);
+				ip_set_put_byindex(map->net, x->id);
 			memmove(list_set_elem(map, i + 1), e,
 				map->dsize * (map->size - (i + 1)));
 		}
@@ -230,7 +231,7 @@ list_set_del(struct ip_set *set, u32 i)
 	struct list_set *map = set->data;
 	struct set_elem *e = list_set_elem(map, i);
 
-	ip_set_put_byindex(e->id);
+	ip_set_put_byindex(map->net, e->id);
 
 	if (i < map->size - 1)
 		memmove(e, list_set_elem(map, i + 1),
@@ -324,7 +325,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (SET_WITH_COUNTER(set))
 			ip_set_init_counter(ext_counter(e, map), ext);
 		/* Set is already added to the list */
-		ip_set_put_byindex(d->id);
+		ip_set_put_byindex(map->net, d->id);
 		return 0;
 	}
 insert:
@@ -403,7 +404,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 	ret = ip_set_get_extensions(set, tb, &ext);
 	if (ret)
 		return ret;
-	e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+	e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);
 	if (e.id == IPSET_INVALID_ID)
 		return -IPSET_ERR_NAME;
 	/* "Loop detection" */
@@ -423,7 +424,8 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (tb[IPSET_ATTR_NAMEREF]) {
-		e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+		e.refid = ip_set_get_byname(map->net,
+					    nla_data(tb[IPSET_ATTR_NAMEREF]),
 					    &s);
 		if (e.refid == IPSET_INVALID_ID) {
 			ret = -IPSET_ERR_NAMEREF;
@@ -439,9 +441,9 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 
 finish:
 	if (e.refid != IPSET_INVALID_ID)
-		ip_set_put_byindex(e.refid);
+		ip_set_put_byindex(map->net, e.refid);
 	if (adt != IPSET_ADD || ret)
-		ip_set_put_byindex(e.id);
+		ip_set_put_byindex(map->net, e.id);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
@@ -456,7 +458,7 @@ list_set_flush(struct ip_set *set)
 	for (i = 0; i < map->size; i++) {
 		e = list_set_elem(map, i);
 		if (e->id != IPSET_INVALID_ID) {
-			ip_set_put_byindex(e->id);
+			ip_set_put_byindex(map->net, e->id);
 			e->id = IPSET_INVALID_ID;
 		}
 	}
@@ -530,7 +532,7 @@ list_set_list(const struct ip_set *set,
 				goto nla_put_failure;
 		}
 		if (nla_put_string(skb, IPSET_ATTR_NAME,
-				   ip_set_name_byindex(e->id)))
+				   ip_set_name_byindex(map->net, e->id)))
 			goto nla_put_failure;
 		if (SET_WITH_TIMEOUT(set) &&
 		    nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
@@ -613,7 +615,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 /* Create list:set type of sets */
 
 static struct list_set *
-init_list_set(struct ip_set *set, u32 size, size_t dsize,
+init_list_set(struct net *net, struct ip_set *set, u32 size, size_t dsize,
 	      unsigned long timeout)
 {
 	struct list_set *map;
@@ -625,6 +627,7 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
 		return NULL;
 
 	map->size = size;
+	map->net = net;
 	map->dsize = dsize;
 	map->timeout = timeout;
 	set->data = map;
@@ -638,7 +641,8 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
 }
 
 static int
-list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		u32 flags)
 {
 	struct list_set *map;
 	u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0;
@@ -662,7 +666,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
 		set->extensions |= IPSET_EXT_COUNTER;
 		if (tb[IPSET_ATTR_TIMEOUT]) {
-			map = init_list_set(set, size,
+			map = init_list_set(net, set, size,
 					sizeof(struct setct_elem), timeout);
 			if (!map)
 				return -ENOMEM;
@@ -673,7 +677,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				offsetof(struct setct_elem, counter);
 			list_set_gc_init(set, list_set_gc);
 		} else {
-			map = init_list_set(set, size,
+			map = init_list_set(net, set, size,
 					    sizeof(struct setc_elem), 0);
 			if (!map)
 				return -ENOMEM;
@@ -681,7 +685,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				offsetof(struct setc_elem, counter);
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
-		map = init_list_set(set, size,
+		map = init_list_set(net, set, size,
 				    sizeof(struct sett_elem), timeout);
 		if (!map)
 			return -ENOMEM;
@@ -690,7 +694,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			offsetof(struct sett_elem, timeout);
 		list_set_gc_init(set, list_set_gc);
 	} else {
-		map = init_list_set(set, size, sizeof(struct set_elem), 0);
+		map = init_list_set(net, set, size, sizeof(struct set_elem), 0);
 		if (!map)
 			return -ENOMEM;
 	}
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -616,7 +616,7 @@ int __net_init ip_vs_app_net_init(struct net *net)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	INIT_LIST_HEAD(&ipvs->app_list);
-	proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+	proc_net_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
 	return 0;
 }
 
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1305,8 +1305,8 @@ int __net_init ip_vs_conn_net_init(struct net *net)
 
 	atomic_set(&ipvs->conn_count, 0);
 
-	proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
-	proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+	proc_net_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+	proc_net_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
 	return 0;
 }
 
@@ -1336,7 +1336,7 @@ int __init ip_vs_conn_init(void)
 	/* Allocate ip_vs_conn slab cache */
 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
 					      sizeof(struct ip_vs_conn), 0,
-					      SLAB_HWCACHE_ALIGN, NULL);
+					      SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 	if (!ip_vs_conn_cachep) {
 		vfree(ip_vs_conn_tab);
 		return -ENOMEM;
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3603,95 +3603,95 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 static const struct genl_ops ip_vs_genl_ops[] __read_mostly = {
 	{
 		.cmd	= IPVS_CMD_NEW_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_SET_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_DEL_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_GET_SERVICE,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.doit	= ip_vs_genl_get_cmd,
 		.dumpit	= ip_vs_genl_dump_services,
 		.policy	= ip_vs_cmd_policy,
 	},
 	{
 		.cmd	= IPVS_CMD_NEW_DEST,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_SET_DEST,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_DEL_DEST,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_GET_DEST,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.dumpit	= ip_vs_genl_dump_dests,
 	},
 	{
 		.cmd	= IPVS_CMD_NEW_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_daemon,
 	},
 	{
 		.cmd	= IPVS_CMD_DEL_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_daemon,
 	},
 	{
 		.cmd	= IPVS_CMD_GET_DAEMON,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.dumpit	= ip_vs_genl_dump_daemons,
 	},
 	{
 		.cmd	= IPVS_CMD_SET_CONFIG,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_GET_CONFIG,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.doit	= ip_vs_genl_get_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_GET_INFO,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.doit	= ip_vs_genl_get_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_ZERO,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.policy	= ip_vs_cmd_policy,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 	{
 		.cmd	= IPVS_CMD_FLUSH,
-		.flags	= GENL_ADMIN_PERM,
+		.flags	= GENL_VE_ADMIN_PERM,
 		.doit	= ip_vs_genl_set_cmd,
 	},
 };
@@ -3730,7 +3730,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 			return -ENOMEM;
 
 		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
+		if (ve_net_hide_sysctl(net))
 			tbl[0].procname = NULL;
 	} else
 		tbl = vs_vars;
@@ -3797,6 +3797,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
 	cancel_delayed_work_sync(&ipvs->defense_work);
 	cancel_work_sync(&ipvs->defense_work.work);
 	unregister_net_sysctl_table(ipvs->sysctl_hdr);
+
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->sysctl_tbl);
 }
 
 #else
@@ -3833,9 +3836,9 @@ int __net_init ip_vs_control_net_init(struct net *net)
 
 	spin_lock_init(&ipvs->tot_stats.lock);
 
-	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
-	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
-	proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+	proc_net_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+	proc_net_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+	proc_net_create("ip_vs_stats_percpu", 0, net->proc_net,
 		    &ip_vs_stats_percpu_fops);
 
 	if (ip_vs_control_net_init_sysctl(net))
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -17,6 +17,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_core.h>
 
 static bool nf_ct_acct __read_mostly;
 
@@ -70,7 +71,7 @@ static int nf_conntrack_acct_init_sysctl(struct net *net)
 	table[0].data = &net->ct.sysctl_acct;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -53,6 +53,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 
+#include <net/sock.h>
+
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
@@ -123,9 +125,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
-
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
@@ -853,6 +852,7 @@ __nf_conntrack_alloc(struct net *net,
 		     const struct nf_conntrack_tuple *repl,
 		     gfp_t gfp, u32 hash)
 {
+	unsigned int ct_max = net->ct.max ? net->ct.max : init_net.ct.max;
 	struct nf_conn *ct;
 
 	if (unlikely(!nf_conntrack_hash_rnd)) {
@@ -864,11 +864,13 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	atomic_inc(&net->ct.count);
 
-	if (nf_conntrack_max &&
-	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+	if (ct_max &&
+	    unlikely(atomic_read(&net->ct.count) > ct_max)) {
 		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
-			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+			net_veboth_ratelimited(KERN_WARNING "VE%s: "
+						"nf_conntrack table full, dropping packet\n",
+						net->owner_ve->ve_name);
 			return ERR_PTR(-ENOMEM);
 		}
 	}
@@ -1072,6 +1074,15 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	struct nf_conn *ct;
 	u32 hash;
 
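+	/* Skip tracking if conntrack is not permitted for this net's container */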
+	if (!net_ipt_permitted(net, VE_NF_CONNTRACK))
+		return NULL;
+
+	if (!net->ct.can_alloc) {
+		/* No rules loaded */
+		return NULL;
+	}
+	smp_rmb(); /* Pairs with wmb in allow_conntrack_allocation() */
+
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
 			     l4proto)) {
@@ -1607,11 +1618,11 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
 	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
 	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
 	sz = nr_slots * sizeof(struct hlist_nulls_head);
-	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+	hash = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_ZERO,
 					get_order(sz));
 	if (!hash) {
 		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
-		hash = vzalloc(sz);
+		hash = vzalloc_account(sz);
 	}
 
 	if (hash && nulls)
@@ -1724,11 +1735,11 @@ int nf_conntrack_init_start(void)
 		 * entries. */
 		max_factor = 4;
 	}
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.max = max_factor * nf_conntrack_htable_size;
 
 	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
 	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
-	       nf_conntrack_max);
+	       init_net.ct.max);
 
 	ret = nf_conntrack_expect_init();
 	if (ret < 0)
@@ -1825,6 +1836,7 @@ int nf_conntrack_init_net(struct net *net)
 	int cpu;
 
 	atomic_set(&net->ct.count, 0);
+	net->ct.max = init_net.ct.max;
 	seqcount_init(&net->ct.generation);
 
 	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -199,7 +199,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 	table[1].data = &net->ct.sysctl_events_retry_timeout;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	net->ct.event_sysctl_header =
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -35,8 +35,6 @@
 unsigned int nf_ct_expect_hsize __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
 
-unsigned int nf_ct_expect_max __read_mostly;
-
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
 /* nf_conntrack_expect helper functions */
@@ -430,8 +428,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 		}
 	}
 
-	if (net->ct.expect_count >= nf_ct_expect_max) {
-		net_warn_ratelimited("nf_conntrack: expectation table full\n");
+	if (net->ct.expect_count >= init_net.ct.expect_max) {
+		net_veboth_ratelimited(KERN_WARNING "VE%s: "
+					"nf_conntrack: expectation table full\n",
+					net->owner_ve->ve_name);
 		ret = -EMFILE;
 	}
 out:
@@ -595,7 +595,7 @@ static int exp_proc_init(struct net *net)
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	struct proc_dir_entry *proc;
 
-	proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
+	proc = proc_net_create("nf_conntrack_expect", 0440, net->proc_net,
 			   &exp_file_ops);
 	if (!proc)
 		return -ENOMEM;
@@ -617,6 +617,7 @@ int nf_conntrack_expect_pernet_init(struct net *net)
 	int err = -ENOMEM;
 
 	net->ct.expect_count = 0;
+	net->ct.expect_max = init_net.ct.expect_max;
 	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
 	if (net->ct.expect_hash == NULL)
 		goto err1;
@@ -645,7 +646,7 @@ int nf_conntrack_expect_init(void)
 		if (!nf_ct_expect_hsize)
 			nf_ct_expect_hsize = 1;
 	}
-	nf_ct_expect_max = nf_ct_expect_hsize * 4;
+	init_net.ct.expect_max = nf_ct_expect_hsize * 4;
 	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
 				sizeof(struct nf_conntrack_expect),
 				0, 0, NULL);
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -54,6 +54,9 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
+#include <net/sock.h>
+#include <bc/beancounter.h>
+
 MODULE_LICENSE("GPL");
 
 static char __initdata version[] = "0.93";
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -32,6 +32,12 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <linux/rculist_nulls.h>
+#include <linux/ve.h>
+#include <linux/vziptable_defs.h>
+
+int ip_conntrack_disable_ve0 = 0;
+module_param(ip_conntrack_disable_ve0, int, 0440);
+EXPORT_SYMBOL(ip_conntrack_disable_ve0);
 
 MODULE_LICENSE("GPL");
 
@@ -397,11 +403,11 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
 {
 	struct proc_dir_entry *pde;
 
-	pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
+	pde = proc_net_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
 	if (!pde)
 		goto out_nf_conntrack;
 
-	pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+	pde = proc_net_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
 			  &ct_cpu_seq_fops);
 	if (!pde)
 		goto out_stat_nf_conntrack;
@@ -436,12 +442,10 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
-static struct ctl_table_header *nf_ct_netfilter_header;
-
 static struct ctl_table nf_ct_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -478,7 +482,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	},
 	{
 		.procname	= "nf_conntrack_expect_max",
-		.data		= &nf_ct_expect_max,
+		.data		= &init_net.ct.expect_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -491,7 +495,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -499,6 +503,44 @@ static struct ctl_table nf_ct_netfilter_table[] = {
 	{ }
 };
 
+static int zero;
+
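+/*
+ * Per-net registration of the "net/nf_conntrack_max" sysctl, backed by
+ * net->ct.max rather than the former global nf_conntrack_max.
+ */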
+static int nf_conntrack_netfilter_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(nf_ct_netfilter_table, sizeof(nf_ct_netfilter_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out_kmemdup;
+
+	table[0].data = &net->ct.max;
+
+	/* Don't export sysctls to unprivileged users */
+	if (ve_net_hide_sysctl(net))
+		table[0].procname = NULL;
+
+	net->ct.netfilter_header = register_net_sysctl(net, "net", table);
+	if (!net->ct.netfilter_header)
+		goto out_unregister_netfilter;
+
+	return 0;
+
+out_unregister_netfilter:
+	kfree(table);
+out_kmemdup:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_netfilter_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.netfilter_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.netfilter_header);
+	kfree(table);
+}
+
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	struct ctl_table *table;
@@ -508,15 +550,23 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	if (!table)
 		goto out_kmemdup;
 
+	table[0].data = &net->ct.max;
 	table[1].data = &net->ct.count;
 	table[2].data = &net->ct.htable_size;
 	table[3].data = &net->ct.sysctl_checksum;
 	table[4].data = &net->ct.sysctl_log_invalid;
+	table[5].data = &net->ct.expect_max;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
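+	/* Clamp a container's nf_conntrack_max to the host (init_net) value */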
+	if (!net_eq(net, &init_net)) {
+		table[0].proc_handler = proc_dointvec_minmax;
+		table[0].extra1 = &zero;
+		table[0].extra2 = &init_net.ct.max;
+	}
+
 	net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
 	if (!net->ct.sysctl_header)
 		goto out_unregister_netfilter;
@@ -538,6 +588,15 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 	kfree(table);
 }
 #else
+static int nf_conntrack_netfilter_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_netfilter_fini_sysctl(struct net *net)
+{
+}
+
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	return 0;
@@ -566,8 +625,14 @@ static int nf_conntrack_pernet_init(struct net *net)
 	if (ret < 0)
 		goto out_sysctl;
 
+	ret = nf_conntrack_netfilter_init_sysctl(net);
+	if (ret < 0)
+		goto out_netfilter_sysctl;
+
 	return 0;
 
+out_netfilter_sysctl:
+	nf_conntrack_standalone_fini_sysctl(net);
 out_sysctl:
 	nf_conntrack_standalone_fini_proc(net);
 out_proc:
@@ -581,6 +646,7 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
 	struct net *net;
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
+		nf_conntrack_netfilter_fini_sysctl(net);
 		nf_conntrack_standalone_fini_sysctl(net);
 		nf_conntrack_standalone_fini_proc(net);
 	}
@@ -594,20 +660,22 @@ static struct pernet_operations nf_conntrack_net_ops = {
 
 static int __init nf_conntrack_standalone_init(void)
 {
-	int ret = nf_conntrack_init_start();
-	if (ret < 0)
-		goto out_start;
+	int ret;
 
-#ifdef CONFIG_SYSCTL
-	nf_ct_netfilter_header =
-		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
-	if (!nf_ct_netfilter_header) {
-		pr_err("nf_conntrack: can't register to sysctl.\n");
-		ret = -ENOMEM;
-		goto out_sysctl;
+#ifdef CONFIG_VE_IPTABLES
+	if (ip_conntrack_disable_ve0) {
+		printk(KERN_INFO "Disabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask &= ~(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD);
+	} else {
+		printk(KERN_INFO "Enabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask |= VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD;
 	}
 #endif
 
+	ret = nf_conntrack_init_start();
+	if (ret < 0)
+		goto out_start;
+
 	ret = register_pernet_subsys(&nf_conntrack_net_ops);
 	if (ret < 0)
 		goto out_pernet;
@@ -616,10 +684,6 @@ static int __init nf_conntrack_standalone_init(void)
 	return 0;
 
 out_pernet:
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-out_sysctl:
-#endif
 	nf_conntrack_cleanup_end();
 out_start:
 	return ret;
@@ -629,9 +693,6 @@ static void __exit nf_conntrack_standalone_fini(void)
 {
 	nf_conntrack_cleanup_start();
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-#endif
 	nf_conntrack_cleanup_end();
 }
 
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -285,10 +285,10 @@ struct nf_log_buf *nf_log_buf_open(void)
 }
 EXPORT_SYMBOL_GPL(nf_log_buf_open);
 
-void nf_log_buf_close(struct nf_log_buf *m)
+void nf_log_buf_close(struct nf_log_buf *m, struct ve_struct *ve)
 {
 	m->buf[m->count] = 0;
-	printk("%s\n", m->buf);
+	ve_log_printk(ve, "%s\n", m->buf);
 
 	if (likely(m != &emergency))
 		kfree(m);
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -833,6 +833,9 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
 
 static int __net_init nf_nat_net_init(struct net *net)
 {
+	if (net_ipt_permitted(net, VE_IP_NAT))
+		net_ipt_module_set(net, VE_IP_NAT);
+
 	/* Leave them the same for the moment. */
 	net->ct.nat_htable_size = net->ct.htable_size;
 	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
@@ -848,6 +851,8 @@ static void __net_exit nf_nat_net_exit(struct net *net)
 	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
+
+	net_ipt_module_clear(net, VE_IP_NAT);
 }
 
 static struct pernet_operations nf_nat_net_ops = {
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -57,6 +57,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 		indev = __in_dev_get_rcu(skb->dev);
 		if (indev && indev->ifa_list) {
 			ifa = indev->ifa_list;
+#ifdef CONFIG_VE
+			/*
+			 * Because of the venet device specifics, we have to
+			 * use the first non-loopback ifa in the list.
+			 */
+			if (skb->dev->features & NETIF_F_VENET) {
+				while (IN_LOOPBACK(ntohl(ifa->ifa_local)) &&
+				       ifa->ifa_next)
+					ifa = ifa->ifa_next;
+			}
+#endif
 			newdst = ifa->ifa_local;
 		}
 		rcu_read_unlock();
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -6,6 +6,11 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif /* CONFIG_VE_IPTABLES */
+
 #include "nf_internals.h"
 
 /* Sockopts only registered and called from user context, so
@@ -91,6 +96,73 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 	mutex_unlock(&nf_sockopt_mutex);
 	return ops;
 }
+#ifdef CONFIG_VE_IPTABLES
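+/*
+ * Check whether a sockopt request (pf, val, get) falls into the set/get
+ * option ranges served by the module described by the other arguments.
+ */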
+static int sockopt_module_fits(u_int8_t pf, int val, int get,
+			       u_int8_t mod_pf,
+			       int set_optmin, int set_optmax,
+			       int get_optmin, int get_optmax)
+{
+	if (pf != mod_pf)
+		return 0;
+	if (get)
+		return val >= get_optmin && val < get_optmax;
+	else
+		return val >= set_optmin && val < set_optmax;
+}
+
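+/*
+ * Load ip_tables or ip6_tables on behalf of a CAP_NET_ADMIN caller whose
+ * get/setsockopt request falls into the corresponding option range.
+ */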
+static int ve0_load_sockopt_module(struct net *net, u8 pf, int val, int get)
+{
+	const char *name;
+	int ret = -EPERM;
+
+	if (!ve_capable(CAP_NET_ADMIN))
+		goto out;
+
+	if (sockopt_module_fits(pf, val, get, PF_INET,
+				     IPT_BASE_CTL, IPT_SO_SET_MAX + 1,
+				     IPT_BASE_CTL, IPT_SO_GET_MAX + 1)) {
+		name = "ip_tables";
+	} else if (sockopt_module_fits(pf, val, get, PF_INET6,
+				     IP6T_BASE_CTL, IP6T_SO_SET_MAX + 1,
+				     IP6T_BASE_CTL, IP6T_SO_GET_MAX + 1)) {
+		name = "ip6_tables";
+	} else {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * The modules currently loaded here do not take any locks
+	 * during their initialization.  If you add another module
+	 * here, research that first: you may have to use a nowait
+	 * module request in the call below.
+	 */
+	ret = request_module(name);
+out:
+	return ret;
+}
+
+static struct nf_sockopt_ops *nf_sockopt_find_ve(struct sock *sk, u_int8_t pf,
+		int val, int get)
+{
+	struct nf_sockopt_ops *ops = nf_sockopt_find(sk, pf, val, get);
+
+	if (!IS_ERR(ops) || ve_is_super(get_exec_env()))
+		return ops;
+
+	/*
+	 * Containers cannot load the appropriate modules from
+	 * userspace, so we quietly do it for them here.  To the
+	 * container it looks as if the module were already loaded
+	 * or built into the kernel.
+	 */
+	if (ve0_load_sockopt_module(sock_net(sk), pf, val, get) != 0)
+		return ops;
+
+	return nf_sockopt_find(sk, pf, val, get);
+}
+#else /* !CONFIG_VE_IPTABLES */
+#define nf_sockopt_find_ve(sk, pf, val, get)	nf_sockopt_find(sk, pf, val, get)
+#endif /* !CONFIG_VE_IPTABLES */
 
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
@@ -99,7 +171,7 @@ static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
@@ -133,7 +205,7 @@ static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -325,7 +325,7 @@ static const struct file_operations synproxy_cpu_seq_fops = {
 
 static int __net_init synproxy_proc_init(struct net *net)
 {
-	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+	if (!proc_net_create("synproxy", S_IRUGO, net->proc_net_stat,
 			 &synproxy_cpu_seq_fops))
 		return -ENOMEM;
 	return 0;
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
 	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,
+	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,
 	[NFNLGRP_NFTRACE]		= NFNL_SUBSYS_NFTABLES,
 };
 
@@ -327,7 +329,9 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlh = nlmsg_hdr(skb);
 		err = 0;
 
-		if (nlh->nlmsg_len < NLMSG_HDRLEN) {
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
 			err = -EINVAL;
 			goto ack;
 		}
@@ -482,7 +486,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int nfnetlink_bind(int group)
 {
 	const struct nfnetlink_subsystem *ss;
-	int type = nfnl_group2type[group];
+	int type;
+
+	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+		return -EINVAL;
+
+	type = nfnl_group2type[group];
 
 	rcu_read_lock();
 	ss = nfnetlink_get_subsys(type);
@@ -532,6 +541,9 @@ static int __init nfnetlink_init(void)
 {
 	int i;
 
+	for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
+		BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
+
 	for (i=0; i<NFNL_SUBSYS_COUNT; i++)
 		mutex_init(&table[i].mutex);
 
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -347,6 +347,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
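+	/* A rule now references conntrack: allow allocations in this netns */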
+	allow_conntrack_allocation(ctx->net);
+
 	return 0;
 }
 
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -209,6 +209,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 			return -EINVAL;
 	}
 
+	allow_conntrack_allocation(ctx->net);
+
 	return 0;
 }
 
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/audit.h>
 #include <net/net_namespace.h>
+#include <bc/beancounter.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp.h>
@@ -67,6 +68,43 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 	[NFPROTO_IPV6]   = "ip6",
 };
 
+#ifdef CONFIG_BEANCOUNTERS
+static void uncharge_xtables(struct xt_table_info *info, unsigned long size)
+{
+	uncharge_beancounter(info->ub, UB_NUMXTENT, size);
+}
+
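+/*
+ * Rebalance the UB_NUMXTENT charge when a ruleset is replaced: charge or
+ * uncharge the difference in rule counts, and switch beancounters if the
+ * old table was charged to a different one.
+ */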
+static int recharge_xtables(struct xt_table_info *new, struct xt_table_info *old)
+{
+	struct user_beancounter *ub, *old_ub;
+	long change;
+
+	ub = new->ub;
+	old_ub = old->number ? old->ub : ub;
+	change = (long)new->number - (long)old->number;
+	if (old_ub != ub) {
+		printk(KERN_WARNING "iptables resources are charged"
+				" from different UB (%s -> %s)\n",
+				old_ub->ub_name, ub->ub_name);
+		change = new->number;
+	}
+
+	if (change > 0) {
+		if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT))
+			return -ENOMEM;
+	} else if (change < 0)
+		uncharge_beancounter(ub, UB_NUMXTENT, -change);
+
+	if (old_ub != ub)
+		uncharge_beancounter(old_ub, UB_NUMXTENT, old->number);
+
+	return 0;
+}
+#else
+#define recharge_xtables(new, old)	(0)
+#define uncharge_xtables(info, s)	do { } while (0)
+#endif	/* CONFIG_BEANCOUNTERS */
+
 /* Allow this many total (re)entries. */
 static const unsigned int xt_jumpstack_multiplier = 2;
 
@@ -181,6 +219,29 @@ xt_unregister_matches(struct xt_match *match, unsigned int n)
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
+/*
+ * Convert xt_name to a module name and check whether it is allowed.
+ *
+ * xt_name is the module name without the family prefix.
+ */
+static bool xt_name_allowed(u8 af, const char *xt_name)
+{
+	char module_name[MODULE_NAME_LEN] = {'\0'};
+	const char *prefix = xt_prefix[af];
+	int len = strlen(prefix) + strlen("t_");
+
+	if (len + strnlen(xt_name, MODULE_NAME_LEN) >= MODULE_NAME_LEN)
+		return false;
+
+	/* Fallback targets (ipt_standard_target etc) */
+	if (strcmp(xt_name, XT_STANDARD_TARGET) == 0 ||
+	    strcmp(xt_name, XT_ERROR_TARGET) == 0)
+		return true;
+
+	sprintf(module_name, "%st_%s", prefix, xt_name);
+
+	return module_payload_allowed(module_name);
+}
 
 /*
  * These are weird, but module loading must not be done with mutex
@@ -194,6 +255,9 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 	struct xt_match *m;
 	int err = -ENOENT;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -239,6 +303,9 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 	struct xt_target *t;
 	int err = -ENOENT;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -391,7 +458,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 		 * ebt_among is exempt from centralized matchsize checking
 		 * because it uses a dynamic-size data set.
 		 */
-		pr_err("%s_tables: %s.%u match: invalid size "
+		ve_printk(VE_LOG, "%s_tables: %s.%u match: invalid size "
 		       "%u (kernel) != (user) %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->revision,
@@ -400,7 +467,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 	}
 	if (par->match->table != NULL &&
 	    strcmp(par->match->table, par->table) != 0) {
-		pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+		ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->table, par->table);
 		return -EINVAL;
@@ -408,7 +475,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
 		char used[64], allow[64];
 
-		pr_err("%s_tables: %s match: used from hooks %s, but only "
+		ve_printk(VE_LOG, "%s_tables: %s match: used from hooks %s, but only "
 		       "valid from %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       textify_hooks(used, sizeof(used), par->hook_mask,
@@ -418,7 +485,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 		return -EINVAL;
 	}
 	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
-		pr_err("%s_tables: %s match: only valid for protocol %u\n",
+		ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->proto);
 		return -EINVAL;
@@ -732,13 +799,14 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size)
 {
 	unsigned int *off;
 
-	off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN);
+	off = kcalloc(size, sizeof(unsigned int),
+		GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 
 	if (off)
 		return off;
 
 	if (size < (SIZE_MAX / sizeof(unsigned int)))
-		off = vmalloc(size * sizeof(unsigned int));
+		off = vmalloc_account(size * sizeof(unsigned int));
 
 	return off;
 }
@@ -777,7 +845,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	int ret;
 
 	if (XT_ALIGN(par->target->targetsize) != size) {
-		pr_err("%s_tables: %s.%u target: invalid size "
+		ve_printk(VE_LOG, "%s_tables: %s.%u target: invalid size "
 		       "%u (kernel) != (user) %u\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->revision,
@@ -786,7 +854,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	}
 	if (par->target->table != NULL &&
 	    strcmp(par->target->table, par->table) != 0) {
-		pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
+		ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->table, par->table);
 		return -EINVAL;
@@ -794,7 +862,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
 		char used[64], allow[64];
 
-		pr_err("%s_tables: %s target: used from hooks %s, but only "
+		ve_printk(VE_LOG, "%s_tables: %s target: used from hooks %s, but only "
 		       "usable from %s\n",
 		       xt_prefix[par->family], par->target->name,
 		       textify_hooks(used, sizeof(used), par->hook_mask,
@@ -804,7 +872,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 		return -EINVAL;
 	}
 	if (par->target->proto && (par->target->proto != proto || inv_proto)) {
-		pr_err("%s_tables: %s target: only valid for protocol %u\n",
+		ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->proto);
 		return -EINVAL;
@@ -883,7 +951,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 	if (size != (u64)len)
 		return ERR_PTR(-EINVAL);
 
-	mem = vmalloc(len);
+	mem = vmalloc_account(len);
 	if (!mem)
 		return ERR_PTR(-ENOMEM);
 
@@ -975,14 +1043,18 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 		return NULL;
 
 	if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
-		info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+		info = kmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_NORETRY);
 	if (!info) {
-		info = vmalloc(sz);
+		info = __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN |
+				     __GFP_NORETRY | __GFP_HIGHMEM,
+				 PAGE_KERNEL);
 		if (!info)
 			return NULL;
 	}
 	memset(info, 0, sizeof(*info));
 	info->size = size;
+	info->ub = get_beancounter(get_exec_ub());
+
 	return info;
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
@@ -999,6 +1071,8 @@ void xt_free_table_info(struct xt_table_info *info)
 
 	free_percpu(info->stackptr);
 
+	put_beancounter(info->ub);
+
 	kvfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
@@ -1109,6 +1183,12 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 
+	if (recharge_xtables(newinfo, private)) {
+		local_bh_enable();
+		*error = -ENOMEM;
+		return NULL;
+	}
+
 	newinfo->initial_entries = private->initial_entries;
 	/*
 	 * Ensure contents of newinfo are visible before assigning to
@@ -1206,6 +1286,7 @@ void *xt_unregister_table(struct xt_table *table)
 	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 	kfree(table);
+	uncharge_xtables(private, private->number);
 
 	return private;
 }
@@ -1558,6 +1639,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
 #ifdef CONFIG_PROC_FS
 	char buf[XT_FUNCTION_MAXNAMELEN];
 	struct proc_dir_entry *proc;
+	int mode = 0440;
 #endif
 
 	if (af >= ARRAY_SIZE(xt_prefix))
@@ -1565,23 +1647,26 @@ int xt_proto_init(struct net *net, u_int8_t af)
 
 
 #ifdef CONFIG_PROC_FS
+	if (likely(net_ipt_permitted(net, VE_IP_IPTABLES)))
+		mode |= S_ISVTX;
+
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TABLES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
+	proc = proc_create_data(buf, mode, net->proc_net, &xt_table_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out;
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops,
+	proc = proc_create_data(buf, mode, net->proc_net, &xt_match_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_tables;
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops,
+	proc = proc_create_data(buf, mode, net->proc_net, &xt_target_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_matches;
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -110,6 +110,8 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -248,6 +248,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	}
 	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
 	nf_conntrack_get(&ct->ct_general);
+	allow_conntrack_allocation(par->net);
 out:
 	info->ct = ct;
 	return 0;
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -73,6 +73,42 @@ static int dscp_tg_check(const struct xt_tgchk_param *par)
 	return 0;
 }
 
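+/* Revision 0 of the TOS target for older iptables userspace. */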
+static unsigned int
+tos_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	struct iphdr *iph = ip_hdr(skb);
+	u_int8_t oldtos;
+
+	if ((iph->tos & IPTOS_TOS_MASK) != info->tos) {
+		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		iph      = ip_hdr(skb);
+		oldtos   = iph->tos;
+		iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos;
+		csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+	}
+
+	return XT_CONTINUE;
+}
+
+static int
+tos_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	const uint8_t tos = info->tos;
+
+	if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT &&
+	    tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST &&
+	    tos != IPTOS_NORMALSVC) {
+		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static unsigned int
 tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -132,6 +168,16 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 		.table		= "mangle",
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "TOS",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tos_tg_v0,
+		.targetsize	= sizeof(struct ipt_tos_target_info),
+		.checkentry	= tos_tg_check_v0,
+		.me		= THIS_MODULE,
+	},
 	{
 		.name		= "TOS",
 		.revision	= 1,
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -334,6 +334,7 @@ static int hmark_tg_check(const struct xt_tgchk_param *par)
 		pr_info("xt_HMARK: spi-set and port-set can't be combined\n");
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -60,6 +60,7 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -111,6 +112,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u.\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -40,6 +40,7 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -56,6 +57,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u.\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -112,13 +112,13 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
 
 		if (dst_mtu(skb_dst(skb)) <= minlen) {
-			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    dst_mtu(skb_dst(skb)));
+			net_velog_ratelimited("unknown or invalid path-MTU (%u)\n",
+					      dst_mtu(skb_dst(skb)));
 			return -1;
 		}
 		if (in_mtu <= minlen) {
-			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    in_mtu);
+			net_velog_ratelimited("unknown or invalid path-MTU (%u)\n",
+					      in_mtu);
 			return -1;
 		}
 		newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
@@ -273,8 +273,8 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		pr_info("path-MTU clamping only supported in "
-			"FORWARD, OUTPUT and POSTROUTING hooks\n");
+		ve_printk(VE_LOG, "path-MTU clamping only supported in "
+				  "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
 	if (par->nft_compat)
@@ -283,7 +283,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
-	pr_info("Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "Only works on TCP SYN packets\n");
 	return -EINVAL;
 }
 
@@ -298,8 +298,8 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		pr_info("path-MTU clamping only supported in "
-			"FORWARD, OUTPUT and POSTROUTING hooks\n");
+		ve_printk(VE_LOG, "path-MTU clamping only supported in "
+				  "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
 	if (par->nft_compat)
@@ -308,7 +308,7 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
-	pr_info("Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "Only works on TCP SYN packets\n");
 	return -EINVAL;
 }
 #endif
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -148,6 +148,7 @@ static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
 			"higher than the total number of nodes\n");
 		return -EDOM;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -112,6 +112,8 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 
 	/*
 	 * This filter cannot function correctly unless connection tracking
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -53,7 +53,8 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
 		pr_info("cannot load conntrack support for proto=%u\n",
 							par->family);
 		return ret;
-	}
+	} else
+		allow_conntrack_allocation(par->net);
 
 	ret = nf_connlabels_get(par->net, info->bit);
 	if (ret < 0)
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -393,6 +393,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
 		info->data->climit_root6[i] = RB_ROOT;
 
+	allow_conntrack_allocation(par->net);
+
 	return 0;
 }
 
@@ -430,15 +432,27 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 	kfree(info->data);
 }
 
-static struct xt_match connlimit_mt_reg __read_mostly = {
-	.name       = "connlimit",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = connlimit_mt_check,
-	.match      = connlimit_mt,
-	.matchsize  = sizeof(struct xt_connlimit_info),
-	.destroy    = connlimit_mt_destroy,
-	.me         = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+	{
+		.name		= "connlimit",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connlimit_mt_check,
+		.match		= connlimit_mt,
+		.matchsize	= sizeof(struct xt_connlimit_info),
+		.destroy	= connlimit_mt_destroy,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "connlimit",
+		.revision	= 1,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connlimit_mt_check,
+		.match		= connlimit_mt,
+		.matchsize	= sizeof(struct xt_connlimit_info),
+		.destroy	= connlimit_mt_destroy,
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init connlimit_mt_init(void)
@@ -464,7 +478,8 @@ static int __init connlimit_mt_init(void)
 		kmem_cache_destroy(connlimit_conn_cachep);
 		return -ENOMEM;
 	}
-	ret = xt_register_match(&connlimit_mt_reg);
+	ret = xt_register_matches(connlimit_mt_reg,
+	      ARRAY_SIZE(connlimit_mt_reg));
 	if (ret != 0) {
 		kmem_cache_destroy(connlimit_conn_cachep);
 		kmem_cache_destroy(connlimit_rb_cachep);
@@ -474,7 +489,7 @@ static int __init connlimit_mt_init(void)
 
 static void __exit connlimit_mt_exit(void)
 {
-	xt_unregister_match(&connlimit_mt_reg);
+	xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
 	kmem_cache_destroy(connlimit_conn_cachep);
 	kmem_cache_destroy(connlimit_rb_cachep);
 }
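
xt_connlimit now registers an array with one xt_match per ABI revision instead of a single structure; the same conversion is applied to connmark, conntrack, MARK, owner and others below, so older iptables binaries inside containers can keep loading the revision they were built against. A compact, self-contained sketch of that registration pattern; everything named "demo", including the two match-info layouts, is invented for illustration:

	#include <linux/module.h>
	#include <linux/skbuff.h>
	#include <linux/netfilter/x_tables.h>

	/* Hypothetical match infos for two ABI revisions of the same match. */
	struct demo_info_v0 { __u32 value; };
	struct demo_info_v1 { __u32 value, mask; };

	static bool demo_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
	{
		const struct demo_info_v0 *info = par->matchinfo;

		return skb->mark == info->value;
	}

	static bool demo_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
	{
		const struct demo_info_v1 *info = par->matchinfo;

		return (skb->mark & info->mask) == info->value;
	}

	static struct xt_match demo_mt_reg[] __read_mostly = {
		{
			.name      = "demo",
			.revision  = 0,
			.family    = NFPROTO_UNSPEC,
			.match     = demo_mt_v0,
			.matchsize = sizeof(struct demo_info_v0),
			.me        = THIS_MODULE,
		},
		{
			.name      = "demo",
			.revision  = 1,
			.family    = NFPROTO_UNSPEC,
			.match     = demo_mt_v1,
			.matchsize = sizeof(struct demo_info_v1),
			.me        = THIS_MODULE,
		},
	};

	static int __init demo_mt_init(void)
	{
		/* Registers every entry; on failure nothing stays registered. */
		return xt_register_matches(demo_mt_reg, ARRAY_SIZE(demo_mt_reg));
	}

	static void __exit demo_mt_exit(void)
	{
		xt_unregister_matches(demo_mt_reg, ARRAY_SIZE(demo_mt_reg));
	}

	module_init(demo_mt_init);
	module_exit(demo_mt_exit);
	MODULE_LICENSE("GPL");
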
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -36,6 +36,45 @@ MODULE_ALIAS("ip6t_CONNMARK");
 MODULE_ALIAS("ipt_connmark");
 MODULE_ALIAS("ip6t_connmark");
 
+static unsigned int
+connmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_connmark_target_info *markinfo = par->targinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t diff;
+	u_int32_t mark;
+	u_int32_t newmark;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		switch(markinfo->mode) {
+		case XT_CONNMARK_SET:
+			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+			if (newmark != ct->mark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_SAVE:
+			newmark = (ct->mark & ~markinfo->mask) |
+				  (skb->mark & markinfo->mask);
+			if (ct->mark != newmark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_RESTORE:
+			mark = skb->mark;
+			diff = (ct->mark ^ mark) & markinfo->mask;
+			skb->mark = mark ^ diff;
+			break;
+		}
+	}
+
+	return XT_CONTINUE;
+}
+
 static unsigned int
 connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -74,6 +113,32 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
+static int connmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct xt_connmark_target_info *matchinfo = par->targinfo;
+	int ret = -EINVAL;
+
+	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
+		if (strcmp(par->table, "mangle") != 0) {
+			printk(KERN_WARNING "CONNMARK: restore can only be "
+			       "called from \"mangle\" table, not \"%s\"\n",
+			       par->table);
+			return ret;
+		}
+	}
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return ret;
+	}
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return ret;
+	}
+	return 0;
+}
+
 static int connmark_tg_check(const struct xt_tgchk_param *par)
 {
 	int ret;
@@ -90,6 +155,37 @@ static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_l3proto_module_put(par->family);
 }
 
+static bool
+connmark_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_connmark_info *info = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int connmark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_connmark_info *cm = par->matchinfo;
+
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only supports 32bit mark\n");
+		return -EINVAL;
+	}
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return -EINVAL;
+	}
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
 static bool
 connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -112,6 +208,8 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -120,38 +218,139 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
 	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_target connmark_tg_reg __read_mostly = {
-	.name           = "CONNMARK",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_tg_check,
-	.target         = connmark_tg,
-	.targetsize     = sizeof(struct xt_connmark_tginfo1),
-	.destroy        = connmark_tg_destroy,
-	.me             = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_target_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
 };
 
-static struct xt_match connmark_mt_reg __read_mostly = {
-	.name           = "connmark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_mt_check,
-	.match          = connmark_mt,
-	.matchsize      = sizeof(struct xt_connmark_mtinfo1),
-	.destroy        = connmark_mt_destroy,
-	.me             = THIS_MODULE,
+static void connmark_tg_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_connmark_target_info *cm = src;
+	struct xt_connmark_target_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_tg_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_connmark_target_info *m = src;
+	struct compat_xt_connmark_target_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark	= m->mark;
+	cm.mask	= m->mask;
+	cm.mode	= m->mode;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target connmark_tg_reg[] __read_mostly = {
+	{
+		.name		= "CONNMARK",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_tg_check_v0,
+		.destroy	= connmark_tg_destroy,
+		.target		= connmark_tg_v0,
+		.targetsize	= sizeof(struct xt_connmark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_target_info),
+		.compat_from_user = connmark_tg_compat_from_user_v0,
+		.compat_to_user	= connmark_tg_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE
+	},
+	{
+		.name           = "CONNMARK",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_tg_check,
+		.target         = connmark_tg,
+		.targetsize     = sizeof(struct xt_connmark_tginfo1),
+		.destroy        = connmark_tg_destroy,
+		.me             = THIS_MODULE,
+	},
+};
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void connmark_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_connmark_info *cm = src;
+	struct xt_connmark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_connmark_info *m = src;
+	struct compat_xt_connmark_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mask = m->mask;
+	cm.invert = m->invert;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match connmark_mt_reg[] __read_mostly = {
+	{
+		.name		= "connmark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_mt_check_v0,
+		.match		= connmark_mt_v0,
+		.destroy	= connmark_mt_destroy,
+		.matchsize	= sizeof(struct xt_connmark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_info),
+		.compat_from_user = connmark_mt_compat_from_user_v0,
+		.compat_to_user	= connmark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "connmark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_mt_check,
+		.match          = connmark_mt,
+		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
+		.destroy        = connmark_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init connmark_mt_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&connmark_tg_reg);
+	ret = xt_register_targets(connmark_tg_reg,
+				  ARRAY_SIZE(connmark_tg_reg));
 	if (ret < 0)
 		return ret;
-	ret = xt_register_match(&connmark_mt_reg);
+	ret = xt_register_matches(connmark_mt_reg,
+				  ARRAY_SIZE(connmark_mt_reg));
 	if (ret < 0) {
-		xt_unregister_target(&connmark_tg_reg);
+		xt_unregister_targets(connmark_tg_reg,
+				      ARRAY_SIZE(connmark_tg_reg));
 		return ret;
 	}
 	return 0;
@@ -159,8 +358,8 @@ static int __init connmark_mt_init(void)
 
 static void __exit connmark_mt_exit(void)
 {
-	xt_unregister_match(&connmark_mt_reg);
-	xt_unregister_target(&connmark_tg_reg);
+	xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg));
+	xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
 }
 
 module_init(connmark_mt_init);
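
The CONFIG_COMPAT helpers above exist because the legacy revision-0 structures carry unsigned long members: with a 64-bit kernel and 32-bit iptables the same field is 8 bytes kernel-side but only 4 bytes (compat_ulong_t) in userspace, so the blob exchanged through the x_tables sockopt interface has to be re-laid-out in both directions. A rough standalone illustration with invented structures (not the exact kernel ones; assumes CONFIG_COMPAT on a 64-bit kernel):

	#include <linux/types.h>
	#include <linux/string.h>
	#include <linux/compat.h>
	#include <linux/uaccess.h>

	/* Layout handed in by 32-bit iptables: compat_ulong_t is 32 bits. */
	struct compat_demo_info {
		compat_ulong_t	mark, mask;	/* 4 bytes each          */
		__u8		invert;
		__u8		__pad1;
		__u16		__pad2;		/* 12 bytes in total     */
	};

	/* Layout the 64-bit kernel works with: unsigned long is 64 bits. */
	struct demo_info {
		unsigned long	mark, mask;	/* 8 bytes each          */
		__u8		invert;		/* 24 bytes with padding */
	};

	/* .compat_from_user: widen the 32-bit blob into the native layout. */
	static void demo_compat_from_user(void *dst, const void *src)
	{
		const struct compat_demo_info *cm = src;
		struct demo_info m = {
			.mark	= cm->mark,
			.mask	= cm->mask,
			.invert	= cm->invert,
		};

		memcpy(dst, &m, sizeof(m));
	}

	/* .compat_to_user: narrow the native layout back for 32-bit readers. */
	static int demo_compat_to_user(void __user *dst, const void *src)
	{
		const struct demo_info *m = src;
		struct compat_demo_info cm;

		memset(&cm, 0, sizeof(cm));	/* don't leak padding to userspace */
		cm.mark   = m->mark;
		cm.mask   = m->mask;
		cm.invert = m->invert;
		return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
	}
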
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -119,6 +119,95 @@ port_match(u16 min, u16 max, u16 port, bool invert)
 	return (port >= min && port <= max) ^ invert;
 }
 
+static bool
+conntrack_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_conntrack_info *sinfo = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg)))
+
+	if (ct && nf_ct_is_untracked(ct))
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	else
+		statebit = XT_CONNTRACK_STATE_INVALID;
+
+	if (sinfo->flags & XT_CONNTRACK_STATE) {
+		if (ct) {
+			if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+			if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+		if (FWINV((statebit & sinfo->statemask) == 0,
+			  XT_CONNTRACK_STATE))
+			return false;
+	}
+
+	if (ct == NULL) {
+		if (sinfo->flags & ~XT_CONNTRACK_STATE)
+			return false;
+		return true;
+	}
+
+	if (sinfo->flags & XT_CONNTRACK_PROTO &&
+	    FWINV(nf_ct_protonum(ct) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum,
+		  XT_CONNTRACK_PROTO))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip,
+		  XT_CONNTRACK_ORIGSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip,
+		  XT_CONNTRACK_ORIGDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].src.ip,
+		  XT_CONNTRACK_REPLSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].dst.ip,
+		  XT_CONNTRACK_REPLDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_STATUS &&
+	    FWINV((ct->status & sinfo->statusmask) == 0,
+		  XT_CONNTRACK_STATUS))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires = timer_pending(&ct->timeout) ?
+					(ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min &&
+			    expires <= sinfo->expires_max),
+			  XT_CONNTRACK_EXPIRES))
+			return false;
+	}
+	return true;
+#undef FWINV
+}
+
 static inline bool
 ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
 		       const struct nf_conn *ct)
@@ -245,6 +334,56 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 	return true;
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_xt_conntrack_info
+{
+	compat_uint_t			statemask;
+	compat_uint_t			statusmask;
+	struct ip_conntrack_old_tuple	tuple[IP_CT_DIR_MAX];
+	struct in_addr			sipmsk[IP_CT_DIR_MAX];
+	struct in_addr			dipmsk[IP_CT_DIR_MAX];
+	compat_ulong_t			expires_min;
+	compat_ulong_t			expires_max;
+	u_int8_t			flags;
+	u_int8_t			invflags;
+};
+
+static void conntrack_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_conntrack_info *cm = src;
+	struct xt_conntrack_info m = {
+		.statemask	= cm->statemask,
+		.statusmask	= cm->statusmask,
+		.expires_min	= cm->expires_min,
+		.expires_max	= cm->expires_max,
+		.flags		= cm->flags,
+		.invflags	= cm->invflags,
+	};
+	memcpy(m.tuple, cm->tuple, sizeof(m.tuple));
+	memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk));
+	memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk));
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int conntrack_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_conntrack_info *m = src;
+	struct compat_xt_conntrack_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.statemask	= m->statemask;
+	cm.statusmask	= m->statusmask;
+	cm.expires_min	= m->expires_min;
+	cm.expires_max	= m->expires_max;
+	cm.flags	= m->flags;
+	cm.invflags	= m->invflags;
+	memcpy(cm.tuple, m->tuple, sizeof(cm.tuple));
+	memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk));
+	memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk));
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif
+
 static bool
 conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -277,6 +416,8 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -286,6 +427,21 @@ static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 }
 
 static struct xt_match conntrack_mt_reg[] __read_mostly = {
+	{
+		.name       = "conntrack",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.match      = conntrack_mt_v0,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.matchsize  = sizeof(struct xt_conntrack_info),
+		.me         = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+		.compatsize       = sizeof(struct compat_xt_conntrack_info),
+		.compat_from_user = conntrack_mt_compat_from_user_v0,
+		.compat_to_user   = conntrack_mt_compat_to_user_v0,
+#endif
+	},
 	{
 		.name       = "conntrack",
 		.revision   = 1,
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -54,6 +54,14 @@ static int dscp_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
+static bool
+tos_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_tos_match_info *info = par->matchinfo;
+
+	return (ip_hdr(skb)->tos == info->tos_value) ^ info->invert;
+}
+
 static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_tos_match_info *info = par->matchinfo;
@@ -83,6 +91,14 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 		.matchsize	= sizeof(struct xt_dscp_info),
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "tos",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.match		= tos_mt_v0,
+		.matchsize	= sizeof(struct xt_tos_match_info),
+		.me		= THIS_MODULE,
+	},
 	{
 		.name		= "tos",
 		.revision	= 1,
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -262,7 +262,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	}
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde = proc_create_data(minfo->name, 0,
+	hinfo->pde = proc_net_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
 		&dl_file_ops, hinfo);
@@ -867,11 +867,11 @@ static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 
-	hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
+	hashlimit_net->ipt_hashlimit = proc_net_mkdir(net, "ipt_hashlimit", net->proc_net);
 	if (!hashlimit_net->ipt_hashlimit)
 		return -ENOMEM;
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
+	hashlimit_net->ip6t_hashlimit = proc_net_mkdir(net, "ip6t_hashlimit", net->proc_net);
 	if (!hashlimit_net->ip6t_hashlimit) {
 		remove_proc_entry("ipt_hashlimit", net->proc_net);
 		return -ENOMEM;
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -66,6 +66,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
 		return ret;
 	}
 	info->name[29] = '\0';
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -16,6 +16,39 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_iprange.h>
 
+static bool
+iprange_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (info->flags & IPRANGE_SRC) {
+		if ((ntohl(iph->saddr) < ntohl(info->src_min.ip)
+			  || ntohl(iph->saddr) > ntohl(info->src_max.ip))
+			 ^ !!(info->flags & IPRANGE_SRC_INV)) {
+			pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->saddr,
+				 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+				 &info->src_min.ip,
+				 &info->src_max.ip);
+			return false;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {
+		if ((ntohl(iph->daddr) < ntohl(info->dst_min.ip)
+			  || ntohl(iph->daddr) > ntohl(info->dst_max.ip))
+			 ^ !!(info->flags & IPRANGE_DST_INV)) {
+			pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->daddr,
+				 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+				 &info->dst_min.ip,
+				 &info->dst_max.ip);
+			return false;
+		}
+	}
+	return true;
+}
+
 static bool
 iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -102,6 +135,14 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 }
 
 static struct xt_match iprange_mt_reg[] __read_mostly = {
+	{
+		.name      = "iprange",
+		.revision  = 0,
+		.family    = NFPROTO_IPV4,
+		.match     = iprange_mt_v0,
+		.matchsize = sizeof(struct xt_iprange_mtinfo),
+		.me        = THIS_MODULE,
+	},
 	{
 		.name      = "iprange",
 		.revision  = 1,
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -161,6 +161,7 @@ static int ipvs_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	}
 
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -107,8 +107,8 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
 	/* Check for overflow. */
 	if (r->burst == 0
 	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
-		pr_info("Overflow, try lower: %u/%u\n",
-			r->avg, r->burst);
+		ve_printk(VE_LOG, "Overflow, try lower: %u/%u\n",
+				  r->avg, r->burst);
 		return -ERANGE;
 	}
 
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -24,6 +24,55 @@ MODULE_ALIAS("ip6t_mark");
 MODULE_ALIAS("ipt_MARK");
 MODULE_ALIAS("ip6t_MARK");
 
+static bool
+mark_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_mark_info *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool
+mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_mark_mtinfo1 *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static unsigned int
+mark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_mark_target_info *markinfo = par->targinfo;
+
+	skb->mark = markinfo->mark;
+	return XT_CONTINUE;
+}
+
+static unsigned int
+mark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+	int mark = 0;
+
+	switch (markinfo->mode) {
+	case XT_MARK_SET:
+		mark = markinfo->mark;
+		break;
+
+	case XT_MARK_AND:
+		mark = skb->mark & markinfo->mark;
+		break;
+
+	case XT_MARK_OR:
+		mark = skb->mark | markinfo->mark;
+		break;
+	}
+
+	skb->mark = mark;
+	return XT_CONTINUE;
+}
+
 static unsigned int
 mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -33,42 +82,220 @@ mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+static int mark_tg_check_v0(const struct xt_tgchk_param *par)
 {
-	const struct xt_mark_mtinfo1 *info = par->matchinfo;
+	const struct xt_mark_target_info *markinfo = par->targinfo;
 
-	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return -EINVAL;
+	}
+	return 0;
 }
 
-static struct xt_target mark_tg_reg __read_mostly = {
-	.name           = "MARK",
-	.revision       = 2,
-	.family         = NFPROTO_UNSPEC,
-	.target         = mark_tg,
-	.targetsize     = sizeof(struct xt_mark_tginfo2),
-	.me             = THIS_MODULE,
+static int mark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+
+	if (markinfo->mode != XT_MARK_SET
+	    && markinfo->mode != XT_MARK_AND
+	    && markinfo->mode != XT_MARK_OR) {
+		printk(KERN_WARNING "MARK: unknown mode %u\n",
+		       markinfo->mode);
+		return -EINVAL;
+	}
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_target_info {
+	compat_ulong_t	mark;
 };
 
-static struct xt_match mark_mt_reg __read_mostly = {
-	.name           = "mark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.match          = mark_mt,
-	.matchsize      = sizeof(struct xt_mark_mtinfo1),
-	.me             = THIS_MODULE,
+static void mark_tg_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_mark_target_info *cm = src;
+	struct xt_mark_target_info m = {
+		.mark	= cm->mark,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_mark_target_info *m = src;
+	struct compat_xt_mark_target_info cm = {
+		.mark	= m->mark,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+
+struct compat_xt_mark_target_info_v1 {
+	compat_ulong_t	mark;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_tg_compat_from_user_v1(void *dst, const void *src)
+{
+	const struct compat_xt_mark_target_info_v1 *cm = src;
+	struct xt_mark_target_info_v1 m = {
+		.mark	= cm->mark,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v1(void __user *dst, const void *src)
+{
+	const struct xt_mark_target_info_v1 *m = src;
+	struct compat_xt_mark_target_info_v1 cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mode = m->mode;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target mark_tg_reg[] __read_mostly = {
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 0,
+		.checkentry	= mark_tg_check_v0,
+		.target		= mark_tg_v0,
+		.targetsize	= sizeof(struct xt_mark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info),
+		.compat_from_user = mark_tg_compat_from_user_v0,
+		.compat_to_user	= mark_tg_compat_to_user_v0,
+#endif
+		/*
+		 * To support rhel5 containers, whose iptables 1.3.5
+		 * series uses @revision = 1, we drop @table here so
+		 * that the kernel does not complain when a MARK rule
+		 * is set up in the iptables 1.4.2 fashion (which uses
+		 * @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 1,
+		.checkentry	= mark_tg_check_v1,
+		.target		= mark_tg_v1,
+		.targetsize	= sizeof(struct xt_mark_target_info_v1),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info_v1),
+		.compat_from_user = mark_tg_compat_from_user_v1,
+		.compat_to_user	= mark_tg_compat_to_user_v1,
+#endif
+		/*
+		 * To support rhel5 containers, whose iptables 1.3.5
+		 * series uses @revision = 1, we drop @table here so
+		 * that the kernel does not complain when a MARK rule
+		 * is set up in the iptables 1.4.2 fashion (which uses
+		 * @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "MARK",
+		.revision       = 2,
+		.family         = NFPROTO_UNSPEC,
+		.target         = mark_tg,
+		.targetsize     = sizeof(struct xt_mark_tginfo2),
+		.me             = THIS_MODULE,
+	},
+};
+
+static int mark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_mark_info *minfo = par->matchinfo;
+
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_mark_info *cm = src;
+	struct xt_mark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_mark_info *m = src;
+	struct compat_xt_mark_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mask = m->mask;
+	cm.invert = m->invert;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match mark_mt_reg[] __read_mostly = {
+	{
+		.name		= "mark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= mark_mt_check_v0,
+		.match		= mark_mt_v0,
+		.matchsize	= sizeof(struct xt_mark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_info),
+		.compat_from_user = mark_mt_compat_from_user_v0,
+		.compat_to_user	= mark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "mark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.match          = mark_mt,
+		.matchsize      = sizeof(struct xt_mark_mtinfo1),
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init mark_mt_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&mark_tg_reg);
+	ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 	if (ret < 0)
 		return ret;
-	ret = xt_register_match(&mark_mt_reg);
+	ret = xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
 	if (ret < 0) {
-		xt_unregister_target(&mark_tg_reg);
+		xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 		return ret;
 	}
 	return 0;
@@ -76,8 +303,8 @@ static int __init mark_mt_init(void)
 
 static void __exit mark_mt_exit(void)
 {
-	xt_unregister_match(&mark_mt_reg);
-	xt_unregister_target(&mark_tg_reg);
+	xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
+	xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 }
 
 module_init(mark_mt_init);
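
The comments in mark_tg_reg above explain why obsolete revisions are resurrected: iptables probes the kernel for each extension revision it knows about via the IPT_SO_GET_REVISION_TARGET getsockopt and falls back to an older revision (or refuses the rule) when the probe fails, so a container running an old iptables needs the old revision to exist. Roughly how that probe looks from userspace; a sketch modelled on iptables' own compatible_revision(), it needs a raw socket (CAP_NET_RAW) and trims error handling:

	#include <errno.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/netfilter/x_tables.h>
	#include <linux/netfilter_ipv4/ip_tables.h>

	/* Returns 1 when the kernel accepts this target name/revision pair,
	 * 0 when it definitely does not, -1 on other errors.
	 */
	static int target_revision_supported(const char *name, __u8 revision)
	{
		struct xt_get_revision rev;
		socklen_t len = sizeof(rev);
		int fd, ret;

		fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
		if (fd < 0)
			return -1;

		memset(&rev, 0, sizeof(rev));
		strncpy(rev.name, name, sizeof(rev.name) - 1);
		rev.revision = revision;

		/* A non-negative return means the kernel knows this
		 * name/revision; ENOENT/EPROTONOSUPPORT mean it does not.
		 */
		ret = getsockopt(fd, IPPROTO_IP, IPT_SO_GET_REVISION_TARGET,
				 &rev, &len);
		close(fd);
		if (ret >= 0)
			return 1;
		return (errno == ENOENT || errno == EPROTONOSUPPORT) ? 0 : -1;
	}
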
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,6 +23,13 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
 			par->target->name);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
+static int xt_nat_checkentry_v1(const struct xt_tgchk_param *par)
+{
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -129,6 +136,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 	{
 		.name		= "SNAT",
 		.revision	= 1,
+		.checkentry	= xt_nat_checkentry_v1,
 		.target		= xt_snat_target_v1,
 		.targetsize	= sizeof(struct nf_nat_range),
 		.table		= "nat",
@@ -139,6 +147,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 	{
 		.name		= "DNAT",
 		.revision	= 1,
+		.checkentry	= xt_nat_checkentry_v1,
 		.target		= xt_dnat_target_v1,
 		.targetsize	= sizeof(struct nf_nat_range),
 		.table		= "nat",
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -422,5 +422,7 @@ module_exit(xt_osf_fini);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
 MODULE_DESCRIPTION("Passive OS fingerprint matching.");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -17,14 +17,102 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_owner.h>
 
+static bool
+owner_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & XT_OWNER_UID) {
+		kuid_t uid = make_kuid(ve_init_user_ns(), info->uid);
+		if ((!uid_eq(filp->f_cred->fsuid, uid)) ^
+		    !!(info->invert & XT_OWNER_UID))
+			return false;
+	}
+
+	if (info->match & XT_OWNER_GID) {
+		kgid_t gid = make_kgid(ve_init_user_ns(), info->gid);
+		if ((!gid_eq(filp->f_cred->fsgid, gid)) ^
+		    !!(info->invert & XT_OWNER_GID))
+			return false;
+	}
+
+	return true;
+}
+
+static bool
+owner_mt6_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & XT_OWNER_UID) {
+		kuid_t uid = make_kuid(ve_init_user_ns(), info->uid);
+		if ((!uid_eq(filp->f_cred->fsuid, uid)) ^
+		    !!(info->invert & XT_OWNER_UID))
+			return false;
+	}
+
+	if (info->match & XT_OWNER_GID) {
+		kgid_t gid = make_kgid(ve_init_user_ns(), info->gid);
+		if ((!gid_eq(filp->f_cred->fsgid, gid)) ^
+		    !!(info->invert & XT_OWNER_GID))
+			return false;
+	}
+
+	return true;
+}
+
 static int owner_check(const struct xt_mtchk_param *par)
 {
 	struct xt_owner_match_info *info = par->matchinfo;
+	struct net *net = par->net;
 
-	/* For now only allow adding matches from the initial user namespace */
+	/* Only allow the common case where the userns of the writer
+	 * matches the userns of the network namespace.
+	 */
 	if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
-	    (current_user_ns() != &init_user_ns))
+	    (current_user_ns() != net->user_ns))
 		return -EINVAL;
+
+	/* Ensure the uids are valid */
+	if (info->match & XT_OWNER_UID) {
+		kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+		kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
+
+		if (!uid_valid(uid_min) || !uid_valid(uid_max) ||
+		    (info->uid_max < info->uid_min) ||
+		    uid_lt(uid_max, uid_min)) {
+			return -EINVAL;
+		}
+	}
+
+	/* Ensure the gids are valid */
+	if (info->match & XT_OWNER_GID) {
+		kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+		kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
+
+		if (!gid_valid(gid_min) || !gid_valid(gid_max) ||
+		    (info->gid_max < info->gid_min) ||
+		    gid_lt(gid_max, gid_min)) {
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -33,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_owner_match_info *info = par->matchinfo;
 	const struct file *filp;
+	struct net *net = dev_net(par->in ? par->in : par->out);
 
 	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
 		return (info->match ^ info->invert) == 0;
@@ -49,8 +138,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		       (XT_OWNER_UID | XT_OWNER_GID)) == 0;
 
 	if (info->match & XT_OWNER_UID) {
-		kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
-		kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+		kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+		kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
 		if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
 		     uid_lte(filp->f_cred->fsuid, uid_max)) ^
 		    !(info->invert & XT_OWNER_UID))
@@ -58,8 +147,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	if (info->match & XT_OWNER_GID) {
-		kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
-		kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+		kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+		kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
 		if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
 		     gid_lte(filp->f_cred->fsgid, gid_max)) ^
 		    !(info->invert & XT_OWNER_GID))
@@ -69,26 +158,77 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
-static struct xt_match owner_mt_reg __read_mostly = {
-	.name       = "owner",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = owner_check,
-	.match      = owner_mt,
-	.matchsize  = sizeof(struct xt_owner_match_info),
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-	              (1 << NF_INET_POST_ROUTING),
-	.me         = THIS_MODULE,
+static int owner_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+
+	if (info->match & ~(XT_OWNER_UID | XT_OWNER_GID)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID, SID and command matching is not "
+		       "supported anymore\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int owner_mt6_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+
+	if (info->match & ~(XT_OWNER_UID | XT_OWNER_GID)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID and SID matching is not supported anymore\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match owner_mt_reg[] __read_mostly = {
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.match      = owner_mt_v0,
+		.matchsize  = sizeof(struct ipt_owner_info),
+		.checkentry = owner_mt_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.match      = owner_mt6_v0,
+		.matchsize  = sizeof(struct ip6t_owner_info),
+		.checkentry = owner_mt6_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
 };
 
 static int __init owner_mt_init(void)
 {
-	return xt_register_match(&owner_mt_reg);
+	return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 static void __exit owner_mt_exit(void)
 {
-	xt_unregister_match(&owner_mt_reg);
+	xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 module_init(owner_mt_init);
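
owner_check() above now validates the rule against the user namespace that owns the rule's network namespace rather than against init_user_ns, and owner_mt() translates the uid/gid range through that namespace at match time. A small standalone sketch of the validation step, assuming a container netns whose owning user namespace maps uid 0..65535 onto host uids 100000..165535; demo_check_uid_range is an invented helper mirroring the checks above:

	#include <linux/errno.h>
	#include <linux/uidgid.h>
	#include <linux/user_namespace.h>

	/* Reject a uid range that is unmapped (or inverted) in the namespace
	 * owning the rule's netns.
	 */
	static int demo_check_uid_range(struct user_namespace *ns,
					uid_t uid_min, uid_t uid_max)
	{
		kuid_t kmin = make_kuid(ns, uid_min);	/* e.g. 1000 -> 101000 */
		kuid_t kmax = make_kuid(ns, uid_max);

		if (!uid_valid(kmin) || !uid_valid(kmax) || uid_lt(kmax, kmin))
			return -EINVAL;
		return 0;
	}
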
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -394,7 +394,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
 		ret = -EINVAL;
 		goto out;
 	}
-	pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
+	pde = proc_net_create_data(t->name, ip_list_perms, recent_net->xt_recent,
 		  &recent_mt_fops, t);
 	if (pde == NULL) {
 		recent_table_free(t);
@@ -618,7 +618,7 @@ static int __net_init recent_proc_net_init(struct net *net)
 {
 	struct recent_net *recent_net = recent_pernet(net);
 
-	recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
+	recent_net->xt_recent = proc_net_mkdir(net, "xt_recent", net->proc_net);
 	if (!recent_net->xt_recent)
 		return -ENOMEM;
 	return 0;
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -81,7 +81,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	struct xt_set_info_match_v0 *info = par->matchinfo;
 	ip_set_id_t index;
 
-	index = ip_set_nfnl_get_byindex(info->match_set.index);
+	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
 
 	if (index == IPSET_INVALID_ID) {
 		pr_warning("Cannot find set indentified by id %u to match\n",
@@ -91,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
-		ip_set_nfnl_put(info->match_set.index);
+		ip_set_nfnl_put(par->net, info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -106,7 +106,7 @@ set_match_v0_destroy(const struct xt_mtdtor_param *par)
 {
 	struct xt_set_info_match_v0 *info = par->matchinfo;
 
-	ip_set_nfnl_put(info->match_set.index);
+	ip_set_nfnl_put(par->net, info->match_set.index);
 }
 
 static unsigned int
@@ -133,7 +133,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	ip_set_id_t index;
 
 	if (info->add_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->add_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find add_set index %u as target\n",
 				   info->add_set.index);
@@ -142,12 +142,12 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	}
 
 	if (info->del_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->del_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
 			if (info->add_set.index != IPSET_INVALID_ID)
-				ip_set_nfnl_put(info->add_set.index);
+				ip_set_nfnl_put(par->net, info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -156,9 +156,9 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->add_set.index);
+			ip_set_nfnl_put(par->net, info->add_set.index);
 		if (info->del_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->del_set.index);
+			ip_set_nfnl_put(par->net, info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -175,9 +175,9 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par)
 	const struct xt_set_info_target_v0 *info = par->targinfo;
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->add_set.index);
+		ip_set_nfnl_put(par->net, info->add_set.index);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->del_set.index);
+		ip_set_nfnl_put(par->net, info->del_set.index);
 }
 
 /* Revision 1 match and target */
@@ -202,7 +202,7 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par)
 	struct xt_set_info_match_v1 *info = par->matchinfo;
 	ip_set_id_t index;
 
-	index = ip_set_nfnl_get_byindex(info->match_set.index);
+	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
 
 	if (index == IPSET_INVALID_ID) {
 		pr_warning("Cannot find set indentified by id %u to match\n",
@@ -212,7 +212,7 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
-		ip_set_nfnl_put(info->match_set.index);
+		ip_set_nfnl_put(par->net, info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -224,7 +224,7 @@ set_match_v1_destroy(const struct xt_mtdtor_param *par)
 {
 	struct xt_set_info_match_v1 *info = par->matchinfo;
 
-	ip_set_nfnl_put(info->match_set.index);
+	ip_set_nfnl_put(par->net, info->match_set.index);
 }
 
 static unsigned int
@@ -251,7 +251,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 	ip_set_id_t index;
 
 	if (info->add_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->add_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find add_set index %u as target\n",
 				   info->add_set.index);
@@ -260,12 +260,12 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 	}
 
 	if (info->del_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->del_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
 			if (info->add_set.index != IPSET_INVALID_ID)
-				ip_set_nfnl_put(info->add_set.index);
+				ip_set_nfnl_put(par->net, info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -274,9 +274,9 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->add_set.index);
+			ip_set_nfnl_put(par->net, info->add_set.index);
 		if (info->del_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->del_set.index);
+			ip_set_nfnl_put(par->net, info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -289,9 +289,9 @@ set_target_v1_destroy(const struct xt_tgdtor_param *par)
 	const struct xt_set_info_target_v1 *info = par->targinfo;
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->add_set.index);
+		ip_set_nfnl_put(par->net, info->add_set.index);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->del_set.index);
+		ip_set_nfnl_put(par->net, info->del_set.index);
 }
 
 /* Revision 2 target */
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -388,6 +388,12 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 }
 #endif
 
+static int socket_mt_v0_check(const struct xt_mtchk_param *par)
+{
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
 static int socket_mt_v1_check(const struct xt_mtchk_param *par)
 {
 	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
@@ -396,6 +402,7 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
 		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -407,6 +414,7 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par)
 		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -416,6 +424,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
 		.match		= socket_mt4_v0,
+		.checkentry	= socket_mt_v0_check,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -47,6 +47,8 @@ static int state_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
--- /dev/null
+++ b/net/netfilter/xt_wdog_tmo.c
@@ -0,0 +1,54 @@
+/*
+ *  net/netfilter/xt_wdog_tmo.c
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/fence-watchdog.h>
+
+static bool
+wdog_tmo_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	return fence_wdog_tmo_match();
+}
+
+static int wdog_tmo_mt_check(const struct xt_mtchk_param *par)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+	return 0;
+}
+
+static struct xt_match wdog_tmo_mt_reg __read_mostly = {
+		.name       = "wdog_tmo",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.match      = wdog_tmo_mt,
+		.checkentry = wdog_tmo_mt_check,
+		.matchsize  = 0,
+		.me         = THIS_MODULE,
+};
+
+static int __init wdog_tmo_mt_init(void)
+{
+	return xt_register_match(&wdog_tmo_mt_reg);
+}
+
+static void __exit wdog_tmo_mt_exit(void)
+{
+	xt_unregister_match(&wdog_tmo_mt_reg);
+}
+
+module_init(wdog_tmo_mt_init);
+module_exit(wdog_tmo_mt_exit);
+MODULE_AUTHOR("Dmitry Guryanov <dguryanov@parallels.com>");
+MODULE_DESCRIPTION("Xtables: fence watchdog timeout matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_wdog_tmo");
+MODULE_ALIAS("ip6t_wdog_tmo");
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -77,13 +77,6 @@ struct listeners {
 /* state bits */
 #define NETLINK_S_CONGESTED		0x0
 
-/* flags */
-#define NETLINK_F_KERNEL_SOCKET		0x1
-#define NETLINK_F_RECV_PKTINFO		0x2
-#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
-#define NETLINK_F_RECV_NO_ENOBUFS	0x8
-#define NETLINK_F_LISTEN_ALL_NSID	0x10
-
 static inline int netlink_is_kernel(struct sock *sk)
 {
 	return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
@@ -1223,6 +1216,9 @@ static void deferred_put_nlk_sk(struct rcu_head *head)
 {
 	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
 
+	kfree(nlk->groups);
+	nlk->groups = NULL;
+
 	sock_put(&nlk->sk);
 }
 
@@ -1280,9 +1276,6 @@ static int netlink_release(struct socket *sock)
 	}
 	netlink_table_ungrab();
 
-	kfree(nlk->groups);
-	nlk->groups = NULL;
-
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
@@ -1359,6 +1352,25 @@ bool netlink_ns_capable(const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(netlink_ns_capable);
 
+#ifdef CONFIG_VE
+bool netlink_ve_capable(const struct sk_buff *skb, int cap)
+{
+	struct cred *cred = get_exec_env()->init_cred;
+
+	if (cred == NULL) /* ve isn't running */
+		cred = ve0.init_cred;
+
+	return netlink_ns_capable(skb, cred->user_ns, cap);
+}
+#else
+bool netlink_ve_capable(const struct sk_buff *skb, int cap)
+{
+	return netlink_capable(skb, cap);
+}
+#endif
+
+EXPORT_SYMBOL(netlink_ve_capable);
+
 /**
  * netlink_capable - Netlink global message capability test
  * @skb: socket buffer holding a netlink command from userspace
@@ -1394,6 +1406,7 @@ static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
 {
 	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
 		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
+
 }
 
 static void
@@ -1630,7 +1643,13 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
 	struct sk_buff *skb;
 	void *data;
 
-	if (size <= NLMSG_GOODSIZE || broadcast)
+	if (size <= NLMSG_GOODSIZE || broadcast ||
+			/*
+			 * Once we have vmalloc_kmem() that would account
+			 * allocated pages into memcg, this check can be
+			 * removed.
+			 */
+			!ve_is_super(get_exec_env()))
 		return alloc_skb(size, GFP_KERNEL);
 
 	size = SKB_DATA_ALIGN(size) +
@@ -1786,6 +1805,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
 int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
 		    u32 portid, int nonblock)
 {
+	struct netlink_sock *nlk = nlk_sk(ssk);
 	struct sock *sk;
 	int err;
 	long timeo;
@@ -1794,19 +1814,24 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
 
 	timeo = sock_sndtimeo(ssk, nonblock);
 retry:
-	sk = netlink_getsockbyportid(ssk, portid);
-	if (IS_ERR(sk)) {
-		kfree_skb(skb);
-		return PTR_ERR(sk);
-	}
-	if (netlink_is_kernel(sk))
-		return netlink_unicast_kernel(sk, skb, ssk);
+	if (nlk->flags & NETLINK_F_REPAIR) {
+		sk = ssk;
+		sock_hold(sk);
+	} else {
+		sk = netlink_getsockbyportid(ssk, portid);
+		if (IS_ERR(sk)) {
+			kfree_skb(skb);
+			return PTR_ERR(sk);
+		}
+		if (netlink_is_kernel(sk))
+			return netlink_unicast_kernel(sk, skb, ssk);
 
-	if (sk_filter(sk, skb)) {
-		err = skb->len;
-		kfree_skb(skb);
-		sock_put(sk);
-		return err;
+		if (sk_filter(sk, skb)) {
+			err = skb->len;
+			kfree_skb(skb);
+			sock_put(sk);
+			return err;
+		}
 	}
 
 	err = netlink_attachskb(sk, skb, &timeo, ssk);
@@ -2168,6 +2193,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case NETLINK_REPAIR:
+		if (val)
+			nlk->flags |= NETLINK_F_REPAIR;
+		else
+			nlk->flags &= ~NETLINK_F_REPAIR;
+		err = 0;
+		break;
 	case NETLINK_PKTINFO:
 		if (val)
 			nlk->flags |= NETLINK_F_RECV_PKTINFO;
@@ -2332,6 +2364,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int err;
 	struct scm_cookie scm;
 	u32 netlink_skb_flags = 0;
+	bool repair = nlk->flags & NETLINK_F_REPAIR;
 
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
@@ -2351,7 +2384,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		dst_group = ffs(addr->nl_groups);
 		err =  -EPERM;
 		if ((dst_group || dst_portid) &&
-		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
+		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND) &&
+		    !repair)
 			goto out;
 		netlink_skb_flags |= NETLINK_SKB_DST;
 	} else {
@@ -2383,7 +2417,11 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
-	NETLINK_CB(skb).portid	= nlk->portid;
+	if (unlikely(repair))
+		NETLINK_CB(skb).portid = dst_portid;
+	else
+		NETLINK_CB(skb).portid	= nlk->portid;
+
 	NETLINK_CB(skb).dst_group = dst_group;
 	NETLINK_CB(skb).creds	= siocb->scm->creds;
 	NETLINK_CB(skb).flags	= netlink_skb_flags;
@@ -2400,7 +2438,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 	}
 
-	if (dst_group) {
+	if (dst_group && !repair) {
 		atomic_inc(&skb->users);
 		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
 	}
@@ -2419,17 +2457,18 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie scm;
 	struct sock *sk = sock->sk;
 	struct netlink_sock *nlk = nlk_sk(sk);
-	int noblock = flags&MSG_DONTWAIT;
 	size_t copied;
 	struct sk_buff *skb, *data_skb;
+	int peeked, skip;
 	int err, ret;
 
 	if (flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
 	copied = 0;
+	skip = sk_peek_offset(sk, flags);
 
-	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	skb = __skb_recv_datagram(sk, flags, NULL, &peeked, &skip, &err);
 	if (skb == NULL)
 		goto out;
 
@@ -2457,14 +2496,20 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
 				     16384);
 
-	copied = data_skb->len;
+	copied = data_skb->len - skip;
 	if (len < copied) {
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
+	err = skb_copy_datagram_msg(data_skb, skip, msg, copied);
+	if (!err) {
+		if (flags & MSG_PEEK)
+			sk_peek_offset_fwd(sk, copied);
+		else
+			sk_peek_offset_bwd(sk, skb->len);
+	}
 
 	if (msg->msg_name) {
 		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
@@ -2486,7 +2531,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 	siocb->scm->creds = *NETLINK_CREDS(skb);
 	if (flags & MSG_TRUNC)
-		copied = data_skb->len;
+		copied = data_skb->len - skip;
 
 	skb_free_datagram(sk, skb);
 
@@ -3138,6 +3183,13 @@ int netlink_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(netlink_unregister_notifier);
 
+static int netlink_set_peek_off(struct sock *sk, int val)
+{
+	sk->sk_peek_off = val;
+
+	return 0;
+}
+
 static const struct proto_ops netlink_ops = {
 	.family =	PF_NETLINK,
 	.owner =	THIS_MODULE,
@@ -3157,6 +3209,7 @@ static const struct proto_ops netlink_ops = {
 	.recvmsg =	netlink_recvmsg,
 	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
+	.set_peek_off = netlink_set_peek_off,
 };
 
 static const struct net_proto_family netlink_family_ops = {
@@ -3168,7 +3221,7 @@ static const struct net_proto_family netlink_family_ops = {
 static int __net_init netlink_net_init(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
+	if (!proc_net_create("netlink", 0, net->proc_net, &netlink_seq_fops))
 		return -ENOMEM;
 #endif
 	return 0;
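
The af_netlink.c changes add two things a checkpoint/restore tool can use: a NETLINK_REPAIR socket option that makes sendmsg() attach messages to the socket's own receive queue with the chosen portid, bypassing the usual destination lookup, filtering and NL_CFG_F_NONROOT_SEND check, and SO_PEEK_OFF support so the receive queue can be walked with MSG_PEEK without draining it. A userspace sketch of the intended usage; NETLINK_REPAIR is defined by this tree's uapi headers, not by mainline:

	#include <sys/socket.h>
	#include <linux/netlink.h>	/* this tree's copy defines NETLINK_REPAIR */

	static int netlink_enter_repair(int fd)
	{
		int one = 1, off = 0;

		/* Repair mode: sendmsg() on this socket loops messages back
		 * onto its own receive queue with the requested portid (see
		 * the netlink_unicast()/netlink_sendmsg() changes above).
		 */
		if (setsockopt(fd, SOL_NETLINK, NETLINK_REPAIR, &one, sizeof(one)) < 0)
			return -1;

		/* Peek-offset support added above: rewind to the queue head
		 * so it can be inspected message by message with MSG_PEEK.
		 */
		return setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
	}
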
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -4,6 +4,15 @@
 #include <linux/rhashtable.h>
 #include <net/sock.h>
 
+/* flags */
+#define NETLINK_F_KERNEL_SOCKET		0x1
+#define NETLINK_F_RECV_PKTINFO		0x2
+#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
+#define NETLINK_F_RECV_NO_ENOBUFS	0x8
+#define NETLINK_F_LISTEN_ALL_NSID	0x10
+#define NETLINK_F_CAP_ACK		0x20
+#define NETLINK_F_REPAIR		0x40
+
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -54,6 +54,27 @@ static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 		       nlk->groups);
 }
 
+static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	u32 flags = 0;
+
+	if (nlk->cb_running)
+		flags |= NDIAG_FLAG_CB_RUNNING;
+	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
+		flags |= NDIAG_FLAG_PKTINFO;
+	if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
+		flags |= NDIAG_FLAG_BROADCAST_ERROR;
+	if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)
+		flags |= NDIAG_FLAG_NO_ENOBUFS;
+	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
+		flags |= NDIAG_FLAG_LISTEN_ALL_NSID;
+	if (nlk->flags & NETLINK_F_CAP_ACK)
+		flags |= NDIAG_FLAG_CAP_ACK;
+
+	return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags);
+}
+
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			struct netlink_diag_req *req,
 			u32 portid, u32 seq, u32 flags, int sk_ino)
@@ -91,6 +112,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sk_diag_put_rings_cfg(sk, skb))
 		goto out_nlmsg_trim;
 
+	if ((req->ndiag_show & NDIAG_SHOW_FLAGS) &&
+	    sk_diag_put_flags(sk, skb))
+		goto out_nlmsg_trim;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -560,6 +560,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
+	if ((ops->flags & GENL_VE_ADMIN_PERM) &&
+	    !netlink_ve_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
 	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
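
GENL_VE_ADMIN_PERM parallels GENL_ADMIN_PERM but checks CAP_NET_ADMIN against the calling container's (VE's) credentials via netlink_ve_capable(). A sketch of how a generic netlink operation would opt into it; the flag comes from this tree's headers and all "demo" names are invented:

	#include <net/genetlink.h>

	static int demo_doit(struct sk_buff *skb, struct genl_info *info)
	{
		/* Reached only if the sender has CAP_NET_ADMIN in its VE. */
		return 0;
	}

	static const struct genl_ops demo_genl_ops[] = {
		{
			.cmd   = 1,
			.flags = GENL_VE_ADMIN_PERM,	/* flag added by this patch */
			.doit  = demo_doit,
		},
	};
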
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -89,6 +89,7 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 #include <linux/percpu.h>
+
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -1379,13 +1380,12 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		return -EINVAL;
 	}
 
-	if (!po->running)
-		return -EINVAL;
+	mutex_lock(&fanout_mutex);
 
+	err = -EALREADY;
 	if (po->fanout)
-		return -EALREADY;
+		goto out;
 
-	mutex_lock(&fanout_mutex);
 	match = NULL;
 	list_for_each_entry(f, &fanout_list, list) {
 		if (f->id == id &&
@@ -1419,7 +1419,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		list_add(&match->list, &fanout_list);
 	}
 	err = -EINVAL;
-	if (match->type == type &&
+
+	spin_lock(&po->bind_lock);
+	if (po->running &&
+	    match->type == type &&
 	    match->prot_hook.type == po->prot_hook.type &&
 	    match->prot_hook.dev == po->prot_hook.dev) {
 		err = -ENOSPC;
@@ -1431,6 +1434,14 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 			err = 0;
 		}
 	}
+
+	if (err && !atomic_read(&match->sk_ref)) {
+		__dev_remove_pack(&match->prot_hook);
+		list_del(&match->list);
+		kfree(match);
+	}
+	spin_unlock(&po->bind_lock);
+
 out:
 	mutex_unlock(&fanout_mutex);
 	return err;
@@ -1441,17 +1452,16 @@ static void fanout_release(struct sock *sk)
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f;
 
-	f = po->fanout;
-	if (!f)
-		return;
-
 	mutex_lock(&fanout_mutex);
-	po->fanout = NULL;
+	f = po->fanout;
+	if (f) {
+		po->fanout = NULL;
 
-	if (atomic_dec_and_test(&f->sk_ref)) {
-		list_del(&f->list);
-		dev_remove_pack(&f->prot_hook);
-		kfree(f);
+		if (atomic_dec_and_test(&f->sk_ref)) {
+			list_del(&f->list);
+			dev_remove_pack(&f->prot_hook);
+			kfree(f);
+		}
 	}
 	mutex_unlock(&fanout_mutex);
 }
@@ -2502,6 +2512,76 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		return packet_snd(sock, msg, len);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+struct packet_sk_charge {
+	struct mem_cgroup	*memcg;
+	unsigned long		nr_pages;
+};
+
+static struct cg_proto *packet_sk_charge(void)
+{
+	struct packet_sk_charge *psc;
+	int err = -ENOMEM;
+
+	psc = kmalloc(sizeof(*psc), GFP_KERNEL);
+	if (!psc)
+		goto out;
+
+	err = 0;
+	psc->memcg = get_mem_cgroup_from_mm(current->mm);
+	if (!psc->memcg)
+		goto out_free_psc;
+	if (!memcg_kmem_is_active(psc->memcg))
+		goto out_put_cg;
+
+	/*
+	 * Forcibly charge the maximum amount of data this socket may hold.
+	 * It's typically not huge, and packet sockets are rare guests in
+	 * containers, so this doesn't disturb memory consumption much.
+	 */
+	psc->nr_pages = ACCESS_ONCE(sysctl_rmem_max)/PAGE_SIZE;
+
+	err = memcg_charge_kmem(psc->memcg, GFP_KERNEL, psc->nr_pages);
+	if (!err)
+		goto out;
+
+out_put_cg:
+	mem_cgroup_put(psc->memcg);
+out_free_psc:
+	kfree(psc);
+	psc = NULL;
+out:
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * sk->sk_cgrp is not used for packet sockets,
+	 * so we just stash this smaller structure in it.
+	 */
+	return (struct cg_proto *)psc;
+}
+
+static void packet_sk_uncharge(struct cg_proto *cg)
+{
+	struct packet_sk_charge *psc = (struct packet_sk_charge *)cg;
+
+	if (psc) {
+		memcg_uncharge_kmem(psc->memcg, psc->nr_pages);
+		mem_cgroup_put(psc->memcg);
+		kfree(psc);
+	}
+}
+#else
+static struct cg_proto *packet_sk_charge(void)
+{
+	return NULL;
+}
+
+static void packet_sk_uncharge(struct cg_proto *cg)
+{
+}
+#endif
+
 /*
  *	Close a PACKET socket. This is fairly simple. We immediately go
  *	to 'closed' state and remove our protocol entry in the device list.
@@ -2551,6 +2631,8 @@ static int packet_release(struct socket *sock)
 	}
 
 	fanout_release(sk);
+	packet_sk_uncharge(sk->sk_cgrp);
+	sk->sk_cgrp = NULL;
 
 	synchronize_net();
 	/*
@@ -2584,13 +2666,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	int ret = 0;
 	bool unlisted = false;
 
-	if (po->fanout)
-		return -EINVAL;
-
 	lock_sock(sk);
 	spin_lock(&po->bind_lock);
 	rcu_read_lock();
 
+	if (po->fanout) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	if (name) {
 		dev = dev_get_by_name_rcu(sock_net(sk), name);
 		if (!dev) {
@@ -2713,6 +2797,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 	struct packet_sock *po;
+	struct cg_proto *cg;
 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
 	int err;
 
@@ -2723,11 +2808,16 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 
 	sock->state = SS_UNCONNECTED;
+	cg = packet_sk_charge();
+	if (IS_ERR(cg)) {
+		err = PTR_ERR(cg);
+		goto out;
+	}
 
 	err = -ENOBUFS;
 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
 	if (sk == NULL)
-		goto out;
+		goto outu;
 
 	sock->ops = &packet_ops;
 	if (sock->type == SOCK_PACKET)
@@ -2774,9 +2864,13 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_prot_inuse_add(net, &packet_proto, 1);
 	preempt_enable();
 
+	sk->sk_cgrp = cg;
+
 	return 0;
 out2:
 	sk_free(sk);
+outu:
+	packet_sk_uncharge(cg);
 out:
 	return err;
 }
@@ -3704,7 +3798,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!pg_vec))
 		goto out;
 
@@ -3726,6 +3820,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
+	struct packet_sk_charge *psc = (struct packet_sk_charge *)sk->sk_cgrp;
 	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
@@ -3739,7 +3834,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	lock_sock(sk);
 	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
 	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-		WARN(1, "Tx-ring is not supported.\n");
+		/* Hide warnings triggered from inside a CT */
+		if (ve_is_super(get_exec_env()))
+			WARN(1, "Tx-ring is not supported.\n");
 		goto out;
 	}
 
@@ -3798,9 +3895,16 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 
 		err = -ENOMEM;
 		order = get_order(req->tp_block_size);
+		if (psc && memcg_charge_kmem(psc->memcg, GFP_KERNEL,
+				(1 << order) * req->tp_block_nr))
+			goto out;
 		pg_vec = alloc_pg_vec(req, order);
-		if (unlikely(!pg_vec))
+		if (unlikely(!pg_vec)) {
+			if (psc)
+				memcg_uncharge_kmem(psc->memcg,
+					(1 << order) * req->tp_block_nr);
 			goto out;
+		}
 		switch (po->tp_version) {
 		case TPACKET_V3:
 		/* Transmit path is not supported. We checked
@@ -3808,7 +3912,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		 */
 			if (!tx_ring)
 				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
-				break;
+			break;
 		default:
 			break;
 		}
@@ -3869,8 +3973,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
 	}
 
-	if (pg_vec)
+	if (pg_vec) {
+		if (psc)
+			memcg_uncharge_kmem(psc->memcg,
+				(1 << order) * req->tp_block_nr);
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
+	}
 out:
 	release_sock(sk);
 	return err;
@@ -4065,7 +4173,7 @@ static int __net_init packet_net_init(struct net *net)
 	mutex_init(&net->packet.sklist_lock);
 	INIT_HLIST_HEAD(&net->packet.sklist);
 
-	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
+	if (!proc_net_create("packet", 0, net->proc_net, &packet_seq_fops))
 		return -ENOMEM;
 
 	return 0;
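
For reference, a compact sketch of how the kmem charge added above is meant to pair up over a packet socket's lifetime, mirroring packet_create() and packet_release() from the hunks; this is illustrative only and not part of the patch:

	/* Illustration only: charge once at create time, uncharge at release. */
	static int example_packet_create(struct net *net, struct socket *sock)
	{
		struct cg_proto *cg;
		struct sock *sk;

		cg = packet_sk_charge();	/* pre-charges sysctl_rmem_max worth of pages */
		if (IS_ERR(cg))
			return PTR_ERR(cg);

		sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
		if (!sk) {
			packet_sk_uncharge(cg);	/* every error path must return the charge */
			return -ENOBUFS;
		}

		sk->sk_cgrp = cg;		/* parked in the otherwise unused field */
		sock->sk = sk;
		return 0;
	}
	/* packet_release() later undoes it: packet_sk_uncharge(sk->sk_cgrp). */
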
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -320,7 +320,7 @@ static int __net_init phonet_init_net(struct net *net)
 {
 	struct phonet_net *pnn = phonet_pernet(net);
 
-	if (!proc_create("phonet", 0, net->proc_net, &pn_sock_seq_fops))
+	if (!proc_net_create("phonet", 0, net->proc_net, &pn_sock_seq_fops))
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&pnn->pndevs.list);
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -213,6 +213,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		}
 	}
 
+	if (trans == NULL) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(-ENODEV);
+		goto out;
+	}
+
 	conn->c_trans = trans;
 
 	init_waitqueue_head(&conn->c_hs_waitq);
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -28,7 +28,7 @@ static int em_ipset_change(struct net *net, void *data, int data_len,
 	if (data_len != sizeof(*set))
 		return -EINVAL;
 
-	index = ip_set_nfnl_get_byindex(set->index);
+	index = ip_set_nfnl_get_byindex(net, set->index);
 	if (index == IPSET_INVALID_ID)
 		return -ENOENT;
 
@@ -37,7 +37,7 @@ static int em_ipset_change(struct net *net, void *data, int data_len,
 	if (em->data)
 		return 0;
 
-	ip_set_nfnl_put(index);
+	ip_set_nfnl_put(net, index);
 	return -ENOMEM;
 }
 
@@ -45,7 +45,7 @@ static void em_ipset_destroy(struct tcf_ematch *em)
 {
 	const struct xt_set_info *set = (const void *) em->data;
 	if (set) {
-		ip_set_nfnl_put(set->index);
+		ip_set_nfnl_put(em->net, set->index);
 		kfree((void *) em->data);
 	}
 }
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1917,7 +1917,7 @@ static int __net_init psched_net_init(struct net *net)
 {
 	struct proc_dir_entry *e;
 
-	e = proc_create("psched", 0, net->proc_net, &psched_fops);
+	e = proc_net_create("psched", 0, net->proc_net, &psched_fops);
 	if (e == NULL)
 		return -ENOMEM;
 
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -159,7 +159,6 @@ struct cbq_sched_data {
 	struct cbq_class	*tx_borrowed;
 	int			tx_len;
 	psched_time_t		now;		/* Cached timestamp */
-	psched_time_t		now_rt;		/* Cached real time */
 	unsigned int		pmask;
 
 	struct hrtimer		delay_timer;
@@ -355,12 +354,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 	int toplevel = q->toplevel;
 
 	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
-		psched_time_t now;
-		psched_tdiff_t incr;
-
-		now = psched_get_time();
-		incr = now - q->now_rt;
-		now = q->now + incr;
+		psched_time_t now = psched_get_time();
 
 		do {
 			if (cl->undertime < now) {
@@ -702,8 +696,13 @@ cbq_update(struct cbq_sched_data *q)
 	struct cbq_class *this = q->tx_class;
 	struct cbq_class *cl = this;
 	int len = q->tx_len;
+	psched_time_t now;
 
 	q->tx_class = NULL;
+	/* Time integrator. We calculate EOS time
+	 * by adding expected packet transmission time.
+	 */
+	now = q->now + L2T(&q->link, len);
 
 	for ( ; cl; cl = cl->share) {
 		long avgidle = cl->avgidle;
@@ -719,7 +718,7 @@ cbq_update(struct cbq_sched_data *q)
 		 *	idle = (now - last) - last_pktlen/rate
 		 */
 
-		idle = q->now - cl->last;
+		idle = now - cl->last;
 		if ((unsigned long)idle > 128*1024*1024) {
 			avgidle = cl->maxidle;
 		} else {
@@ -763,7 +762,7 @@ cbq_update(struct cbq_sched_data *q)
 			idle -= L2T(&q->link, len);
 			idle += L2T(cl, len);
 
-			cl->undertime = q->now + idle;
+			cl->undertime = now + idle;
 		} else {
 			/* Underlimit */
 
@@ -773,7 +772,8 @@ cbq_update(struct cbq_sched_data *q)
 			else
 				cl->avgidle = avgidle;
 		}
-		cl->last = q->now;
+		if ((s64)(now - cl->last) > 0)
+			cl->last = now;
 	}
 
 	cbq_update_toplevel(q, this, q->tx_borrowed);
@@ -874,8 +874,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
 
 			if (cl->deficit <= 0) {
 				q->active[prio] = cl;
-				cl = cl->next_alive;
 				cl->deficit += cl->quantum;
+				cl = cl->next_alive;
 			}
 			return skb;
 
@@ -945,31 +945,13 @@ cbq_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	psched_time_t now;
-	psched_tdiff_t incr;
 
 	now = psched_get_time();
-	incr = now - q->now_rt;
-
-	if (q->tx_class) {
-		psched_tdiff_t incr2;
-		/* Time integrator. We calculate EOS time
-		 * by adding expected packet transmission time.
-		 * If real time is greater, we warp artificial clock,
-		 * so that:
-		 *
-		 * cbq_time = max(real_time, work);
-		 */
-		incr2 = L2T(&q->link, q->tx_len);
-		q->now += incr2;
+
+	if (q->tx_class)
 		cbq_update(q);
-		if ((incr -= incr2) < 0)
-			incr = 0;
-		q->now += incr;
-	} else {
-		if (now > q->now)
-			q->now = now;
-	}
-	q->now_rt = now;
+
+	q->now = now;
 
 	for (;;) {
 		q->wd_expires = 0;
@@ -1053,18 +1035,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+			long mtu;
 			/* BUGGGG... Beware! This expression suffer of
 			 * arithmetic overflows!
 			 */
 			if (cl->priority == prio) {
-				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
-					q->quanta[prio];
-			}
-			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
-				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
-					   cl->common.classid, cl->quantum);
-				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+				cl->quantum = (cl->weight * cl->allot) /
+					(q->quanta[prio] / q->nclasses[prio]);
 			}
+			mtu = qdisc_dev(cl->qdisc)->mtu;
+			if (cl->quantum <= mtu/2)
+				cl->quantum = mtu/2 + 1;
+			else if (cl->quantum > 32*mtu)
+				cl->quantum = 32*mtu;
 		}
 	}
 }
@@ -1224,7 +1207,6 @@ cbq_reset(struct Qdisc *sch)
 	hrtimer_cancel(&q->delay_timer);
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
 		q->active[prio] = NULL;
@@ -1408,7 +1390,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->delay_timer.function = cbq_undelay;
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	cbq_link_class(&q->link);
 
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -216,17 +216,21 @@ static inline int qdisc_restart(struct Qdisc *q)
 	spinlock_t *root_lock;
 	struct sk_buff *skb;
 	bool validate;
+	int ret;
 
 	/* Dequeue packet */
 	skb = dequeue_skb(q, &validate);
 	if (unlikely(!skb))
 		return 0;
 
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
-	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+	ret = sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+
+	return ret;
 }
 
 void __qdisc_run(struct Qdisc *q)
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1116,6 +1116,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
+	opt.rate.mpu = cl->rate.mpu;
+	opt.ceil.mpu = cl->ceil.mpu;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
 	opt.level = cl->level;
@@ -1510,6 +1512,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+	cl->rate.mpu = hopt->rate.mpu;
+	cl->ceil.mpu = hopt->ceil.mpu;
 
 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -180,6 +180,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
 	struct teql_master *m = (struct teql_master *)sch->ops;
 	struct teql_sched_data *q = qdisc_priv(sch);
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (dev->hard_header_len > m->dev->hard_header_len)
 		return -EINVAL;
 
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -133,8 +133,8 @@ void sctp_dbg_objcnt_init(struct net *net)
 {
 	struct proc_dir_entry *ent;
 
-	ent = proc_create("sctp_dbg_objcnt", 0,
-			  net->sctp.proc_net_sctp, &sctp_objcnt_ops);
+	ent = proc_net_create_data("sctp_dbg_objcnt", 0,
+			net->sctp.proc_net_sctp, &sctp_objcnt_ops, NULL);
 	if (!ent)
 		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
 }
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -110,8 +110,8 @@ int __net_init sctp_snmp_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_snmp_seq_fops);
+	p = proc_net_create_data("snmp", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_snmp_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -270,8 +270,8 @@ int __net_init sctp_eps_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_eps_seq_fops);
+	p = proc_net_create_data("eps", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_eps_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -405,8 +405,8 @@ int __net_init sctp_assocs_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_assocs_seq_fops);
+	p = proc_net_create_data("assocs", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_assocs_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -521,8 +521,8 @@ int __net_init sctp_remaddr_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_remaddr_seq_fops);
+	p = proc_net_create_data("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_remaddr_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 	return 0;
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4747,6 +4747,10 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
 	struct socket *sock;
 	int err = 0;
 
+	/* Do not peel off from one netns to another one. */
+	if (!net_eq(current->nsproxy->net_ns, sock_net(sk)))
+		return -EINVAL;
+
 	if (!asoc)
 		return -EINVAL;
 
@@ -6874,6 +6878,9 @@ int sctp_inet_listen(struct socket *sock, int backlog)
 	if (sock->state != SS_UNCONNECTED)
 		goto out;
 
+	if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED))
+		goto out;
+
 	/* If backlog is zero, disable listening. */
 	if (!backlog) {
 		if (sctp_sstate(sk, CLOSED))
--- a/net/socket.c
+++ b/net/socket.c
@@ -84,10 +84,12 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/in.h>
 #include <linux/nsproxy.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -196,6 +198,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k
 		return -EFAULT;
 	return audit_sockaddr(ulen, kaddr);
 }
+EXPORT_SYMBOL(move_addr_to_kernel);
 
 /**
  *	move_addr_to_user	-	copy an address to user space
@@ -294,7 +297,7 @@ static int init_inodecache(void)
 					      0,
 					      (SLAB_HWCACHE_ALIGN |
 					       SLAB_RECLAIM_ACCOUNT |
-					       SLAB_MEM_SPREAD),
+					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
 	if (sock_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1261,6 +1264,11 @@ int __sock_create(struct net *net, int family, int type, int protocol,
 		family = PF_PACKET;
 	}
 
+	/* VZ compatibility layer */
+	err = vz_security_family_check(net, family, 0);
+	if (err < 0)
+		return err;
+
 	err = security_socket_create(family, type, protocol, kern);
 	if (err)
 		return err;
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -449,12 +449,13 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
 /*
  * Remove stale credentials. Avoid sleeping inside the loop.
  */
-static int
+static long
 rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 {
 	spinlock_t *cache_lock;
 	struct rpc_cred *cred, *next;
 	unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
+	long freed = 0;
 
 	list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
 
@@ -466,10 +467,11 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		 */
 		if (time_in_range(cred->cr_expire, expired, jiffies) &&
 		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
-			return 0;
+			break;
 
 		list_del_init(&cred->cr_lru);
 		number_cred_unused--;
+		freed++;
 		if (atomic_read(&cred->cr_count) != 0)
 			continue;
 
@@ -482,7 +484,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		}
 		spin_unlock(cache_lock);
 	}
-	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+	return freed;
 }
 
 int rpcauth_cache_do_shrinker(int nr_to_scan)
@@ -501,18 +503,18 @@ int rpcauth_cache_do_shrinker(int nr_to_scan)
 /*
  * Run memory cache shrinker.
  */
-static int
-rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+
 {
-	int nr_to_scan = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
 
-	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return (nr_to_scan == 0) ? 0 : -1;
+	/* nothing left, don't come back */
 	if (list_empty(&cred_unused))
-		return 0;
+		return SHRINK_STOP;
 
-	return rpcauth_cache_do_shrinker(nr_to_scan);
+	return rpcauth_cache_do_shrinker(sc->nr_to_scan);
 }
 
 static void
@@ -530,6 +532,13 @@ rpcauth_cache_enforce_limit(void)
 	rpcauth_cache_do_shrinker(nr_to_scan);
 }
 
+static unsigned long
+rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+
+{
+	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+}
+
 /*
  * Look up a process' credentials in the authentication cache
  */
@@ -853,7 +862,8 @@ rpcauth_uptodatecred(struct rpc_task *task)
 }
 
 static struct shrinker rpc_cred_shrinker = {
-	.shrink = rpcauth_cache_shrinker,
+	.count_objects = rpcauth_cache_shrink_count,
+	.scan_objects = rpcauth_cache_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
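
For reference, a minimal sketch of the count/scan shrinker contract that rpcauth_cache_shrink_count() and rpcauth_cache_shrink_scan() above follow (assuming the split struct shrinker API from <linux/shrinker.h>; example_count_cached() and example_free_some() are hypothetical helpers, not part of the patch):

	unsigned long example_count_cached(void);		/* hypothetical */
	unsigned long example_free_some(unsigned long nr);	/* hypothetical */

	static unsigned long example_shrink_count(struct shrinker *s,
						  struct shrink_control *sc)
	{
		/* cheap estimate of freeable objects; 0 means nothing to do */
		return example_count_cached();
	}

	static unsigned long example_shrink_scan(struct shrinker *s,
						 struct shrink_control *sc)
	{
		/* refuse contexts we cannot reclaim in; SHRINK_STOP ends this pass */
		if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
			return SHRINK_STOP;
		/* free up to sc->nr_to_scan objects and report how many were freed */
		return example_free_some(sc->nr_to_scan);
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_shrink_count,
		.scan_objects	= example_shrink_scan,
		.seeks		= DEFAULT_SEEKS,
	};
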
 
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1640,13 +1640,13 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 	struct sunrpc_net *sn;
 
 	sn = net_generic(net, sunrpc_net_id);
-	cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
+	cd->u.procfs.proc_ent = proc_net_mkdir(net, cd->name, sn->proc_net_rpc);
 	if (cd->u.procfs.proc_ent == NULL)
 		goto out_nomem;
 	cd->u.procfs.channel_ent = NULL;
 	cd->u.procfs.content_ent = NULL;
 
-	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
+	p = proc_net_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
 			     cd->u.procfs.proc_ent,
 			     &cache_flush_operations_procfs, cd);
 	cd->u.procfs.flush_ent = p;
@@ -1654,7 +1654,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 		goto out_nomem;
 
 	if (cd->cache_request || cd->cache_parse) {
-		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
+		p = proc_net_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
 				     cd->u.procfs.proc_ent,
 				     &cache_file_operations_procfs, cd);
 		cd->u.procfs.channel_ent = p;
@@ -1662,7 +1662,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 			goto out_nomem;
 	}
 	if (cd->cache_show) {
-		p = proc_create_data("content", S_IFREG|S_IRUSR,
+		p = proc_net_create_data("content", S_IFREG|S_IRUSR,
 				cd->u.procfs.proc_ent,
 				&content_file_operations_procfs, cd);
 		cd->u.procfs.content_ent = p;
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2867,3 +2867,112 @@ rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
 #endif /* CONFIG_SUNRPC_SWAP */
+
+static void rpc_kill_tasks(struct net *net)
+{
+	struct rpc_clnt *clnt;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	spin_lock(&sn->rpc_client_lock);
+	list_for_each_entry(clnt, &sn->all_clients, cl_clients)
+		rpc_killall_tasks(clnt);
+	spin_unlock(&sn->rpc_client_lock);
+}
+
+static ssize_t write_kill_tasks(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct net *net = PDE_DATA(file->f_path.dentry->d_inode);
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	bool prev_kill_tasks = sn->kill_tasks;
+	char tbuf[20];
+	unsigned long kill_tasks;
+	int res;
+
+	if (*ppos || count > sizeof(tbuf)-1)
+		return -EINVAL;
+	if (copy_from_user(tbuf, buf, count))
+		return -EFAULT;
+
+	tbuf[count] = 0;
+	res = kstrtoul(tbuf, 0, &kill_tasks);
+	if (res)
+		return res;
+
+	sn->kill_tasks = !!kill_tasks;
+
+	/* Kill pending tasks */
+	if (sn->kill_tasks && !prev_kill_tasks)
+		rpc_kill_tasks(net);
+
+	return count;
+}
+
+static ssize_t read_kill_tasks(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct net *net = PDE_DATA(file->f_path.dentry->d_inode);
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	unsigned long p = *ppos;
+	char tbuf[10];
+	size_t len;
+
+	snprintf(tbuf, sizeof(tbuf), "%d\n", sn->kill_tasks);
+	len = strlen(tbuf);
+	if (p >= len)
+		return 0;
+	len -= p;
+	if (len > count)
+		len = count;
+	if (copy_to_user(buf, (void *)(tbuf+p), len))
+		return -EFAULT;
+	*ppos += len;
+	return len;
+}
+
+static const struct file_operations kill_tasks_ops = {
+	.open = nonseekable_open,
+	.write = write_kill_tasks,
+	.read = read_kill_tasks,
+};
+
+int rpc_task_kill_proc_init(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	sn->kill_tasks = 0;
+	sn->kill_tasks_proc = proc_create_data("kill-tasks",
+					      S_IFREG|S_IRUSR|S_IWUSR,
+					      sn->proc_net_rpc,
+					      &kill_tasks_ops, net);
+	return sn->kill_tasks_proc ? 0 : -ENOMEM;
+}
+
+void rpc_task_kill_proc_fini(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	if (sn->kill_tasks_proc)
+		remove_proc_entry("kill-tasks", sn->proc_net_rpc);
+}
+
+static struct net *rpc_task_net(struct rpc_task *task)
+{
+	if (task->tk_client)
+		return rpc_net_ns(task->tk_client);
+	return task->tk_rqstp->rq_xprt->xprt_net;
+}
+
+bool rpc_abort_task(struct rpc_task *task)
+{
+	struct net *net = rpc_task_net(task);
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	if (!sn->kill_tasks)
+		return false;
+
+	dprintk("RPC: SUNRPC traffic is suppressed. Drop task %5u with EIO.\n",
+			task->tk_pid);
+
+	return true;
+}
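
For reference, the knob added above should surface (assuming the standard /proc/net layout set up by rpc_proc_init()) as /proc/net/rpc/kill-tasks; a minimal userspace sketch of flipping it, purely illustrative:

	#include <fcntl.h>
	#include <unistd.h>

	/* Write "1" to suppress SUNRPC traffic: queued and new tasks fail with EIO. */
	static int example_suppress_sunrpc(void)
	{
		int fd = open("/proc/net/rpc/kill-tasks", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}
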
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -32,6 +32,9 @@ struct sunrpc_net {
 	int pipe_version;
 	atomic_t pipe_users;
 	struct proc_dir_entry *use_gssp_proc;
+
+	bool kill_tasks;
+	struct proc_dir_entry *kill_tasks_proc;
 };
 
 extern int sunrpc_net_id;
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1487,6 +1487,7 @@ static struct file_system_type rpc_pipe_fs_type = {
 	.name		= "rpc_pipefs",
 	.mount		= rpc_mount,
 	.kill_sb	= rpc_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("rpc_pipefs");
 MODULE_ALIAS("rpc_pipefs");
@@ -1509,7 +1510,7 @@ int register_rpc_pipefs(void)
 	rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
 				sizeof(struct rpc_inode),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				init_once);
 	if (!rpc_inode_cachep)
 		return -ENOMEM;
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -253,7 +253,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
 
 static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
@@ -356,6 +356,12 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
 		rpc_action action,
 		unsigned char queue_priority)
 {
+	if (rpc_abort_task(task)) {
+		task->tk_flags |= RPC_TASK_KILLED;
+		rpc_exit(task, -EIO);
+		return;
+	}
+
 	dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
 			task->tk_pid, rpc_qname(q), jiffies);
 
@@ -752,6 +758,11 @@ static void __rpc_execute(struct rpc_task *task)
 	if (RPC_IS_QUEUED(task))
 		return;
 
+	if (rpc_abort_task(task)) {
+		task->tk_flags |= RPC_TASK_KILLED;
+		rpc_exit(task, -EIO);
+	}
+
 	for (;;) {
 		void (*do_action)(struct rpc_task *);
 
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -247,7 +247,7 @@ do_register(struct net *net, const char *name, void *data,
 
 	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
 	sn = net_generic(net, sunrpc_net_id);
-	return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
+	return proc_net_create_data(name, 0, sn->proc_net_rpc, fops, data);
 }
 
 struct proc_dir_entry *
@@ -287,19 +287,27 @@ EXPORT_SYMBOL_GPL(svc_proc_unregister);
 int rpc_proc_init(struct net *net)
 {
 	struct sunrpc_net *sn;
+	int err;
 
 	dprintk("RPC:       registering /proc/net/rpc\n");
 	sn = net_generic(net, sunrpc_net_id);
-	sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
+	sn->proc_net_rpc = proc_net_mkdir(net, "rpc", net->proc_net);
 	if (sn->proc_net_rpc == NULL)
 		return -ENOMEM;
 
+	err = rpc_task_kill_proc_init(net);
+	if (err) {
+		remove_proc_entry("rpc", net->proc_net);
+		return err;
+	}
+
 	return 0;
 }
 
 void rpc_proc_exit(struct net *net)
 {
 	dprintk("RPC:       unregistering /proc/net/rpc\n");
+	rpc_task_kill_proc_fini(net);
 	remove_proc_entry("rpc", net->proc_net);
 }
 
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/sched.h>
 #include <linux/errno.h>
 #include <linux/net.h>
 #include <linux/in.h>
@@ -21,6 +20,8 @@
 #include <linux/kthread.h>
 #include <linux/slab.h>
 
+#include <linux/ve.h>
+
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/stats.h>
@@ -731,8 +732,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 		}
 
 		__module_get(serv->sv_ops->svo_module);
-		task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
-					      node, serv->sv_name);
+		task = kthread_create_on_node_ve(get_exec_env(),
+			serv->sv_ops->svo_function, rqstp, node, serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
 			module_put(serv->sv_ops->svo_module);
@@ -1337,6 +1338,21 @@ svc_process(struct svc_rqst *rqstp)
 EXPORT_SYMBOL_GPL(svc_process);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net)
+{
+	struct rpc_rqst *req, *tmp;
+
+	spin_lock_bh(&serv->sv_cb_lock);
+	list_for_each_entry_safe(req, tmp, &serv->sv_cb_list, rq_bc_list) {
+		if (req->rq_xprt->xprt_net == net) {
+			list_del(&req->rq_bc_list);
+			xprt_free_bc_request(req);
+		}
+	}
+	spin_unlock_bh(&serv->sv_cb_lock);
+}
+EXPORT_SYMBOL_GPL(bc_svc_flush_queue_net);
+
 /*
  * Process a backchannel RPC request that arrived over an existing
  * outbound connection
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -94,10 +94,14 @@ __init int net_sysctl_init(void)
 		goto out;
 	ret = register_pernet_subsys(&sysctl_pernet_ops);
 	if (ret)
-		goto out;
+		goto out1;
 	register_sysctl_root(&net_sysctl_root);
 out:
 	return ret;
+out1:
+	unregister_sysctl_table(net_header);
+	net_header = NULL;
+	goto out;
 }
 
 struct ctl_table_header *register_net_sysctl(struct net *net,
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -654,11 +654,11 @@ static int unix_set_peek_off(struct sock *sk, int val)
 {
 	struct unix_sock *u = unix_sk(sk);
 
-	if (mutex_lock_interruptible(&u->readlock))
+	if (mutex_lock_interruptible(&u->iolock))
 		return -EINTR;
 
 	sk->sk_peek_off = val;
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 
 	return 0;
 }
@@ -762,6 +762,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	lockdep_set_class(&sk->sk_receive_queue.lock,
 				&af_unix_sk_receive_queue_lock_key);
 
+	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
@@ -771,7 +772,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	spin_lock_init(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
-	mutex_init(&u->readlock); /* single task reading lock */
+	mutex_init(&u->iolock); /* single task reading lock */
+	mutex_init(&u->bindlock); /* single task binding lock */
 	init_waitqueue_head(&u->peer_wait);
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
@@ -840,7 +842,7 @@ static int unix_autobind(struct socket *sock)
 	int err;
 	unsigned int retries = 0;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		return err;
 
@@ -887,7 +889,7 @@ static int unix_autobind(struct socket *sock)
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
-out:	mutex_unlock(&u->readlock);
+out:	mutex_unlock(&u->bindlock);
 	return err;
 }
 
@@ -986,6 +988,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	unsigned int hash;
 	struct unix_address *addr;
 	struct hlist_head *list;
+	struct path path = { NULL, NULL };
 
 	err = -EINVAL;
 	if (sunaddr->sun_family != AF_UNIX)
@@ -1001,9 +1004,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	addr_len = err;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	if (sun_path[0]) {
+		umode_t mode = S_IFSOCK |
+		       (SOCK_INODE(sock)->i_mode & ~current_umask());
+		err = unix_mknod(sun_path, mode, &path);
+		if (err) {
+			if (err == -EEXIST)
+				err = -EADDRINUSE;
+			goto out;
+		}
+	}
+
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
-		goto out;
+		goto out_put;
 
 	err = -EINVAL;
 	if (u->addr)
@@ -1020,16 +1034,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	atomic_set(&addr->refcnt, 1);
 
 	if (sun_path[0]) {
-		struct path path;
-		umode_t mode = S_IFSOCK |
-		       (SOCK_INODE(sock)->i_mode & ~current_umask());
-		err = unix_mknod(sun_path, mode, &path);
-		if (err) {
-			if (err == -EEXIST)
-				err = -EADDRINUSE;
-			unix_release_addr(addr);
-			goto out_up;
-		}
 		addr->hash = UNIX_HASH_SIZE;
 		hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 		spin_lock(&unix_table_lock);
@@ -1055,7 +1059,10 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->bindlock);
+out_put:
+	if (err)
+		path_put(&path);
 out:
 	return err;
 }
@@ -1958,17 +1965,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 	if (false) {
 alloc_skb:
 		unix_state_unlock(other);
-		mutex_unlock(&unix_sk(other)->readlock);
+		mutex_unlock(&unix_sk(other)->iolock);
 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
 					      &err, 0);
 		if (!newskb)
 			goto err;
 	}
 
-	/* we must acquire readlock as we modify already present
+	/* we must acquire iolock as we modify already present
 	 * skbs in the sk_receive_queue and mess with skb->len
 	 */
-	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
+	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
 	if (err) {
 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
 		goto err;
@@ -2035,7 +2042,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 	}
 
 	unix_state_unlock(other);
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 
 	other->sk_data_ready(other, 0);
 	scm_destroy(&scm);
@@ -2044,7 +2051,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 err_state_unlock:
 	unix_state_unlock(other);
 err_unlock:
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 err:
 	kfree_skb(newskb);
 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
@@ -2112,7 +2119,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (flags&MSG_OOB)
 		goto out;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->iolock);
 	if (unlikely(err)) {
 		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
@@ -2190,7 +2197,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 out_free:
 	skb_free_datagram(sk, skb);
 out_unlock:
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 out:
 	return err;
 }
@@ -2296,7 +2303,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
 		siocb->scm = &tmp_scm;
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->iolock);
 	if (unlikely(err)) {
 		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
@@ -2337,13 +2344,13 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
 			err = -EAGAIN;
 			if (!timeo)
 				break;
-			mutex_unlock(&u->readlock);
+			mutex_unlock(&u->iolock);
 
 			timeo = unix_stream_data_wait(sk, timeo, last,
 						      last_len);
 
 			if (signal_pending(current) ||
-			    mutex_lock_interruptible(&u->readlock)) {
+			    mutex_lock_interruptible(&u->iolock)) {
 				err = sock_intr_errno(timeo);
 				goto out;
 			}
@@ -2439,7 +2446,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
 		}
 	} while (size);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	if (state->msg)
 		scm_recv(sock, state->msg, siocb->scm, flags);
 	else
@@ -2482,9 +2489,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk,
 	int ret;
 	struct unix_sock *u = unix_sk(sk);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	ret = splice_to_pipe(pipe, spd);
-	mutex_lock(&u->readlock);
+	mutex_lock(&u->iolock);
 
 	return ret;
 }
@@ -2875,7 +2882,7 @@ static int __net_init unix_net_init(struct net *net)
 		goto out;
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
+	if (!proc_net_create("unix", 0, net->proc_net, &unix_seq_fops)) {
 		unix_sysctl_unregister(net);
 		goto out;
 	}
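
A short illustrative note on the readlock split above; the lock chains below are an assumed reconstruction of the splice/bind inversion this avoids, not text taken from the patch:

	/*
	 * Assumed inversion (illustration only):
	 *
	 *   unix_bind()                        splice() from a unix socket
	 *     lock(u->readlock)                into a file:
	 *     unix_mknod()                       sb_start_write()
	 *       sb_start_write()       <-->      unix_stream_recvmsg()
	 *       lock(dir i_mutex)                  lock(u->readlock)
	 *
	 * With the inode created before a dedicated u->bindlock is taken, and
	 * the read/write paths switched to u->iolock, the two chains no longer
	 * acquire the same pair of locks in opposite order.
	 */
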
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -35,7 +35,7 @@ int __net_init unix_sysctl_register(struct net *net)
 		goto err_alloc;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	table[0].data = &net->unx.sysctl_max_dgram_qlen;
--- a/net/wireless/wext-proc.c
+++ b/net/wireless/wext-proc.c
@@ -143,7 +143,7 @@ static const struct file_operations wireless_seq_fops = {
 int __net_init wext_proc_init(struct net *net)
 {
 	/* Create /proc/net/wireless entry */
-	if (!proc_create("wireless", S_IRUGO, net->proc_net,
+	if (!proc_net_create("wireless", S_IRUGO, net->proc_net,
 			 &wireless_seq_fops))
 		return -ENOMEM;
 
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3327,6 +3327,11 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
 		goto out;
 
+	if (dir >= XFRM_POLICY_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	/* Stage 1 - find policy */
 	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
 		err = -ENOENT;
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -74,7 +74,7 @@ static const struct file_operations xfrm_statistics_seq_fops = {
 
 int __net_init xfrm_proc_init(struct net *net)
 {
-	if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net,
+	if (!proc_net_create("xfrm_stat", S_IRUGO, net->proc_net,
 			 &xfrm_statistics_seq_fops))
 		return -ENOMEM;
 	return 0;
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1616,6 +1616,14 @@ static int xfrm_dump_policy_done(struct netlink_callback *cb)
 	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
 	struct net *net = sock_net(cb->skb->sk);
 
+	/*
+	 * .done callback runs only once for a given 'cb', so there is no
+	 * need to set cb->args[0] to indicate that xfrm_policy_walk_init()
+	 * has already been called.
+	 */
+	if (!cb->args[0])
+		xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
+
 	xfrm_policy_walk_done(walk, net);
 	return 0;
 }
@@ -2142,7 +2150,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return 0;
 
 bad_policy:
-	WARN(1, "BAD policy passed\n");
+	ve_pr_warn_ratelimited(VE0_LOG, "CT%s: BAD xfrm policy passed\n",
+		net->owner_ve->ve_name);
 free_state:
 	kfree(x);
 nomem:
--- /dev/null
+++ b/scripts/Makefile.kasan
@@ -0,0 +1,29 @@
+ifdef CONFIG_KASAN
+ifdef CONFIG_KASAN_INLINE
+	call_threshold := 10000
+else
+	call_threshold := 0
+endif
+
+CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
+
+CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
+		-fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
+		--param asan-stack=1 --param asan-globals=1 \
+		--param asan-instrumentation-with-call-threshold=$(call_threshold))
+
+ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
+   ifneq ($(CONFIG_COMPILE_TEST),y)
+        $(warning Cannot use CONFIG_KASAN: \
+            -fsanitize=kernel-address is not supported by compiler)
+   endif
+else
+    ifeq ($(CFLAGS_KASAN),)
+        ifneq ($(CONFIG_COMPILE_TEST),y)
+            $(warning CONFIG_KASAN: compiler does not support all options.\
+                Trying minimal configuration)
+        endif
+        CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
+    endif
+endif
+endif
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -119,6 +119,22 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_GCOV))
 endif
 
+#
+# Enable address sanitizer flags for the kernel, except for files or directories
+# we don't want to check (controlled by the variables KASAN_SANITIZE_obj.o and KASAN_SANITIZE)
+#
+ifeq ($(CONFIG_KASAN),y)
+_c_flags += $(if $(patsubst n%,, \
+		$(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \
+		$(CFLAGS_KASAN))
+endif
+
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+	$(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+	$(CFLAGS_KCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -77,7 +77,7 @@ modpost = scripts/mod/modpost                    \
  $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \
  $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
- $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w)
+ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),$(if $(KBUILD_MODPOST_FAIL),,-w))
 
 # We can go over command line length here, so be careful.
 quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
--- a/scripts/module-common.lds
+++ b/scripts/module-common.lds
@@ -16,4 +16,8 @@ SECTIONS {
 	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
 	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
 	__kcrctab_gpl_future	0 : { *(SORT(___kcrctab_gpl_future+*)) }
+
+
+	. = ALIGN(8);
+	.init_array		0 : { *(SORT(.init_array.*)) *(.init_array) }
 }
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -20,7 +20,7 @@ config SECURITY_DMESG_RESTRICT
 
 config SECURITY
 	bool "Enable different security models"
-	depends on SYSFS
+	depends on SYSFS && !VE
 	help
 	  This allows you to choose different security modules to be
 	  configured into your kernel.
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -58,7 +58,7 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
 		return ERR_PTR(-EACCES);
 
 	/* freed by caller to simple_write_to_buffer */
-	data = kvmalloc(alloc_size);
+	data = kvmalloc(alloc_size, GFP_KERNEL);
 	if (data == NULL)
 		return ERR_PTR(-ENOMEM);
 
--- a/security/apparmor/include/apparmor.h
+++ b/security/apparmor/include/apparmor.h
@@ -64,8 +64,6 @@ extern int apparmor_initialized __initdata;
 /* fn's in lib */
 char *aa_split_fqname(char *args, char **ns_name);
 void aa_info_message(const char *str);
-void *kvmalloc(size_t size);
-void kvfree(void *buffer);
 
 
 /**
--- a/security/apparmor/lib.c
+++ b/security/apparmor/lib.c
@@ -74,64 +74,3 @@ void aa_info_message(const char *str)
 	printk(KERN_INFO "AppArmor: %s\n", str);
 }
 
-/**
- * kvmalloc - do allocation preferring kmalloc but falling back to vmalloc
- * @size: size of allocation
- *
- * Return: allocated buffer or NULL if failed
- *
- * It is possible that policy being loaded from the user is larger than
- * what can be allocated by kmalloc, in those cases fall back to vmalloc.
- */
-void *kvmalloc(size_t size)
-{
-	void *buffer = NULL;
-
-	if (size == 0)
-		return NULL;
-
-	/* do not attempt kmalloc if we need more than 16 pages at once */
-	if (size <= (16*PAGE_SIZE))
-		buffer = kmalloc(size, GFP_NOIO | __GFP_NOWARN);
-	if (!buffer) {
-		/* see kvfree for why size must be at least work_struct size
-		 * when allocated via vmalloc
-		 */
-		if (size < sizeof(struct work_struct))
-			size = sizeof(struct work_struct);
-		buffer = vmalloc(size);
-	}
-	return buffer;
-}
-
-/**
- * do_vfree - workqueue routine for freeing vmalloced memory
- * @work: data to be freed
- *
- * The work_struct is overlaid to the data being freed, as at the point
- * the work is scheduled the data is no longer valid, be its freeing
- * needs to be delayed until safe.
- */
-static void do_vfree(struct work_struct *work)
-{
-	vfree(work);
-}
-
-/**
- * kvfree - free an allocation do by kvmalloc
- * @buffer: buffer to free (MAYBE_NULL)
- *
- * Free a buffer allocated by kvmalloc
- */
-void kvfree(void *buffer)
-{
-	if (is_vmalloc_addr(buffer)) {
-		/* Data is no longer valid so just use the allocated space
-		 * as the work_struct
-		 */
-		struct work_struct *work = (struct work_struct *) buffer;
-		INIT_WORK(work, do_vfree);
-		schedule_work(work);
-	} else
-		kfree(buffer);
-}
--- a/security/apparmor/match.c
+++ b/security/apparmor/match.c
@@ -57,7 +57,7 @@ static struct table_header *unpack_table(char *blob, size_t bsize)
 	if (bsize < tsize)
 		goto out;
 
-	table = kvmalloc(tsize);
+	table = kvmalloc(tsize, GFP_KERNEL);
 	if (table) {
 		*table = th;
 		if (th.td_flags == YYTD_DATA8)
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -661,14 +661,14 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
 	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+		if (!ve_capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -687,14 +687,14 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
 	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+		if (!ve_capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -13,11 +13,19 @@
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
 #define ACC_WRITE 4
-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
+#define ACC_QUOTA 8	/* deprecated */
+#define ACC_HIDDEN 16
+#define ACC_MOUNT 64
+#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_MOUNT)
 
 #define DEV_BLOCK 1
 #define DEV_CHAR  2
@@ -75,7 +83,7 @@ static int devcgroup_can_attach(struct cgroup *new_cgrp,
 {
 	struct task_struct *task = cgroup_taskset_first(set);
 
-	if (current != task && !capable(CAP_SYS_ADMIN))
+	if (current != task && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -261,7 +269,7 @@ static void devcgroup_css_free(struct cgroup *cgroup)
 #define DEVCG_LIST 3
 
 #define MAJMINLEN 13
-#define ACCLEN 4
+#define ACCLEN 5
 
 static void set_access(char *acc, short access)
 {
@@ -273,6 +281,8 @@ static void set_access(char *acc, short access)
 		acc[idx++] = 'w';
 	if (access & ACC_MKNOD)
 		acc[idx++] = 'm';
+	if (access & ACC_MOUNT)
+		acc[idx++] = 'M';
 }
 
 static char type_to_char(short type)
@@ -347,6 +357,9 @@ static bool match_exception(struct list_head *exceptions, short type,
 	struct dev_exception_item *ex;
 
 	list_for_each_entry_rcu(ex, exceptions, list) {
+		short mismatched_bits;
+		bool allowed_mount;
+
 		if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
 			continue;
 		if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
@@ -356,7 +369,12 @@ static bool match_exception(struct list_head *exceptions, short type,
 		if (ex->minor != ~0 && ex->minor != minor)
 			continue;
 		/* provided access cannot have more than the exception rule */
-		if (access & (~ex->access))
+		mismatched_bits = access & (~ex->access) & ~ACC_MOUNT;
+		allowed_mount = !(mismatched_bits & ~ACC_WRITE) &&
+				(ex->access & ACC_MOUNT) &&
+				(access & ACC_MOUNT);
+
+		if (mismatched_bits && !allowed_mount)
 			continue;
 		return true;
 	}
@@ -657,7 +675,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	struct cgroup *p = devcgroup->css.cgroup;
 	struct dev_cgroup *parent = NULL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (p->parent)
@@ -673,8 +691,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 			if (has_children(devcgroup))
 				return -EINVAL;
 
-			if (!may_allow_all(parent))
-				return -EPERM;
+			if (!may_allow_all(parent)) {
+				if (ve_is_super(get_exec_env()))
+					return -EPERM;
+				else
+					/* Fool docker inside a CT - exit silently */
+					return 0;
+			}
 			dev_exception_clean(devcgroup);
 			devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
 			if (!parent)
@@ -750,7 +773,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	}
 	if (!isspace(*b))
 		return -EINVAL;
-	for (b++, count = 0; count < 3; count++, b++) {
+	for (b++, count = 0; count < ACCLEN - 1; count++, b++) {
 		switch (*b) {
 		case 'r':
 			ex.access |= ACC_READ;
@@ -761,9 +784,12 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 		case 'm':
 			ex.access |= ACC_MKNOD;
 			break;
+		case 'M':
+			ex.access |= ACC_MOUNT;
+			break;
 		case '\n':
 		case '\0':
-			count = 3;
+			count = ACCLEN - 1;
 			break;
 		default:
 			return -EINVAL;
@@ -881,8 +907,24 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 				     minor, access);
 	rcu_read_unlock();
 
+#ifdef CONFIG_VE
+	/*
+	 * When restoring a container, allow everything while it
+	 * is in the pseudosuper state. We need this for early
+	 * mounting of the second ploop device. Still, don't
+	 * change the behaviour on ve0.
+	 */
+	if (!rc) {
+		struct ve_struct *ve = get_exec_env();
+
+		if (!ve_is_super(ve) && ve->is_pseudosuper)
+			return 0;
+		return -EPERM;
+	}
+#else
 	if (!rc)
 		return -EPERM;
+#endif
 
 	return 0;
 }
@@ -899,11 +941,64 @@ int __devcgroup_inode_permission(struct inode *inode, int mask)
 		access |= ACC_WRITE;
 	if (mask & MAY_READ)
 		access |= ACC_READ;
+	if (mask & MAY_MOUNT)
+		access |= ACC_MOUNT;
 
 	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
 			access);
 }
 
+int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{
+	short type, access = 0;
+
+	if (S_ISBLK(mode))
+		type = DEV_BLOCK;
+	if (S_ISCHR(mode))
+		type = DEV_CHAR;
+	if (mask & MAY_WRITE)
+		access |= ACC_WRITE;
+	if (mask & MAY_READ)
+		access |= ACC_READ;
+
+	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), access);
+}
+
+int devcgroup_device_visible(umode_t mode, int major, int start_minor, int nr_minors)
+{
+	struct dev_cgroup *dev_cgroup;
+	struct dev_exception_item *ex;
+	short access = ACC_READ | ACC_WRITE;
+	bool match = false;
+
+	rcu_read_lock();
+	dev_cgroup = task_devcgroup(current);
+
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
+		match = true;
+		goto out;
+	}
+
+	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		if ((ex->type & DEV_BLOCK) && !S_ISBLK(mode))
+			continue;
+		if ((ex->type & DEV_CHAR) && !S_ISCHR(mode))
+			continue;
+		if (ex->major != ~0 && ex->major != major)
+			continue;
+		if (ex->minor != ~0 && (ex->minor < start_minor ||
+					ex->minor >= start_minor + nr_minors))
+			continue;
+		if (!(access & ex->access))
+			continue;
+		match = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return match;
+}
+
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
 	short type;
@@ -920,3 +1015,107 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
 			ACC_MKNOD);
 
 }
+
+#ifdef CONFIG_VE
+
+static unsigned decode_ve_perms(unsigned perm)
+{
+	unsigned mask = 0;
+
+	if (perm & S_IROTH)
+		mask |= ACC_READ;
+	if (perm & S_IWOTH)
+		mask |= ACC_WRITE;
+	if (perm & S_IXUSR)
+		mask |= ACC_MOUNT;
+
+	return mask;
+}
+
+static unsigned encode_ve_perms(unsigned mask)
+{
+	unsigned perm = 0;
+
+	if (mask & ACC_READ)
+		perm |= S_IROTH;
+	if (mask & ACC_WRITE)
+		perm |= S_IWOTH;
+	if (mask & ACC_MOUNT)
+		perm |= S_IXUSR;
+
+	return perm;
+}
+
+int devcgroup_set_perms_ve(struct ve_struct *ve,
+		unsigned type, dev_t dev, unsigned mask)
+{
+	int err = -EINVAL;
+	struct dev_exception_item new;
+	struct cgroup_subsys_state *css;
+
+	if ((type & S_IFMT) == S_IFBLK)
+		new.type = DEV_BLOCK;
+	else if ((type & S_IFMT) == S_IFCHR)
+		new.type = DEV_CHAR;
+	else
+		return -EINVAL;
+
+	new.access = decode_ve_perms(mask) | (mask ? ACC_MKNOD : 0);
+	new.major = new.minor = ~0;
+
+	switch (type & VE_USE_MASK) {
+	default:
+		new.minor = MINOR(dev);
+	case VE_USE_MAJOR:
+		new.major = MAJOR(dev);
+	case 0:
+		;
+	}
+
+	mutex_lock(&devcgroup_mutex);
+	css = ve_get_init_css(ve, devices_subsys_id);
+	err = dev_exception_add(cgroup_to_devcgroup(css->cgroup), &new);
+	css_put(css);
+	mutex_unlock(&devcgroup_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL(devcgroup_set_perms_ve);
+
+int devcgroup_seq_show_ve(struct ve_struct *ve, struct seq_file *m)
+{
+	struct dev_exception_item *wh;
+	struct dev_cgroup *devcgroup;
+	struct cgroup_subsys_state *css;
+
+	css = ve_get_init_css(ve, devices_subsys_id);
+	devcgroup = cgroup_to_devcgroup(css->cgroup);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(wh, &devcgroup->exceptions, list) {
+		char maj[MAJMINLEN], min[MAJMINLEN];
+		unsigned perm;
+
+		if (wh->access & ACC_HIDDEN)
+			continue;
+
+		set_majmin(maj, wh->major);
+		set_majmin(min, wh->minor);
+
+		perm = encode_ve_perms(wh->access);
+		if (perm & (S_IROTH | S_IWOTH))
+			perm |= S_IXOTH;
+
+		seq_printf(m, "%10u %c %03o %s:%s\n",
+				ve->veid,
+				type_to_char(wh->type),
+				perm, maj, min);
+	}
+	rcu_read_unlock();
+
+	css_put(css);
+	return 0;
+}
+EXPORT_SYMBOL(devcgroup_seq_show_ve);
+
+#endif /* CONFIG_VE */
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -845,6 +845,8 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep)
 	size_t datalen = prep->datalen;
 	int ret = 0;
 
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+		return -ENOKEY;
 	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
 
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -97,7 +97,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type,
 	payload = NULL;
 
 	vm = false;
-	if (_payload) {
+	if (plen) {
 		ret = -ENOMEM;
 		payload = kmalloc(plen, GFP_KERNEL | __GFP_NOWARN);
 		if (!payload) {
@@ -331,7 +331,7 @@ long keyctl_update_key(key_serial_t id,
 
 	/* pull the payload in if one was supplied */
 	payload = NULL;
-	if (_payload) {
+	if (plen) {
 		ret = -ENOMEM;
 		payload = kmalloc(plen, GFP_KERNEL);
 		if (!payload)
@@ -748,12 +748,17 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen)
 
 	key = key_ref_to_ptr(key_ref);
 
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
+		ret = -ENOKEY;
+		goto error2;
+	}
+
 	/* see if we can read it directly */
 	ret = key_permission(key_ref, KEY_READ);
 	if (ret == 0)
 		goto can_read_key;
 	if (ret != -EACCES)
-		goto error;
+		goto error2;
 
 	/* we can't; see if it's searchable from this process's keyrings
 	 * - we automatically take account of the fact that it may be
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -896,8 +896,8 @@ EXPORT_SYMBOL(keyring_search);
  * caller must also hold a lock on the keyring semaphore.
  *
  * Returns a pointer to the found key with usage count incremented if
- * successful and returns NULL if not found.  Revoked and invalidated keys are
- * skipped over.
+ * successful and returns NULL if not found.  Revoked, invalidated, and
+ * uninstantiated keys are skipped over.  (But negative keys are not!)
  *
  * If successful, the possession indicator is propagated from the keyring ref
  * to the returned key reference.
@@ -924,8 +924,10 @@ key_ref_t find_key_to_update(key_ref_t keyring_ref,
 
 found:
 	key = keyring_ptr_to_key(object);
-	if (key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-			  (1 << KEY_FLAG_REVOKED))) {
+	if ((key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+			   (1 << KEY_FLAG_REVOKED) |
+			   (1 << KEY_FLAG_INSTANTIATED))) !=
+	    (1 << KEY_FLAG_INSTANTIATED)) {
 		kleave(" = NULL [x]");
 		return NULL;
 	}
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -132,6 +132,9 @@ int install_thread_keyring_to_cred(struct cred *new)
 {
 	struct key *keyring;
 
+	if (new->thread_keyring)
+		return 0;
+
 	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
 				KEY_POS_ALL | KEY_USR_VIEW,
 				KEY_ALLOC_QUOTA_OVERRUN, NULL);
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -19,6 +19,8 @@
 #include <linux/slab.h>
 #include "internal.h"
 
+#include <keys/user-type.h>
+
 #define key_negative_timeout	60	/* default timeout on a negative key's existence */
 
 /**
@@ -517,7 +519,7 @@ struct key *request_key_and_link(struct key_type *type,
 		.index_key.type		= type,
 		.index_key.description	= description,
 		.cred			= current_cred(),
-		.match			= type->match,
+		.match			= type->match ? : user_match,
 		.match_data		= description,
 		.flags			= (KEYRING_SEARCH_LOOKUP_DIRECT |
 					   KEYRING_SEARCH_DO_STATE_CHECK |
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1060,13 +1060,16 @@ static void trusted_rcu_free(struct rcu_head *rcu)
  */
 static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
 {
-	struct trusted_key_payload *p = key->payload.data;
+	struct trusted_key_payload *p;
 	struct trusted_key_payload *new_p;
 	struct trusted_key_options *new_o;
 	size_t datalen = prep->datalen;
 	char *datablob;
 	int ret = 0;
 
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+		return -ENOKEY;
+	p = key->payload.data;
 	if (!p->migratable)
 		return -EPERM;
 	if (datalen <= 0 || datalen > 32767 || !prep->data)
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -121,7 +121,10 @@ int user_update(struct key *key, struct key_preparsed_payload *prep)
 
 	if (ret == 0) {
 		/* attach the new data, displacing the old */
-		zap = key->payload.data;
+		if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+			zap = key->payload.data;
+		else
+			zap = NULL;
 		rcu_assign_keypointer(key, upayload);
 		key->expiry = 0;
 	}
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -1,6 +1,6 @@
 config SECURITY_SELINUX
 	bool "NSA SELinux Support"
-	depends on SECURITY_NETWORK && AUDIT && NET && INET
+	depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
 	select NETWORK_SECMARK
 	default n
 	help
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -1259,6 +1259,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg)
 	struct snd_seq_port_info *info = arg;
 	struct snd_seq_client_port *port;
 	struct snd_seq_port_callback *callback;
+	int port_idx;
 
 	/* it is not allowed to create the port for an another client */
 	if (info->addr.client != client->number)
@@ -1269,7 +1270,9 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg)
 		return -ENOMEM;
 
 	if (client->type == USER_CLIENT && info->kernel) {
-		snd_seq_delete_port(client, port->addr.port);
+		port_idx = port->addr.port;
+		snd_seq_port_unlock(port);
+		snd_seq_delete_port(client, port_idx);
 		return -EINVAL;
 	}
 	if (client->type == KERNEL_CLIENT) {
@@ -1290,6 +1293,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg)
 
 	snd_seq_set_port_info(port, info);
 	snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port);
+	snd_seq_port_unlock(port);
 
 	return 0;
 }
--- a/sound/core/seq/seq_ports.c
+++ b/sound/core/seq/seq_ports.c
@@ -122,7 +122,9 @@ static void port_subs_info_init(struct snd_seq_port_subs_info *grp)
 }
 
 
-/* create a port, port number is returned (-1 on failure) */
+/* create a port, port number is returned (-1 on failure);
+ * the caller needs to unref the port via snd_seq_port_unlock() appropriately
+ */
 struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client,
 						int port)
 {
@@ -151,6 +153,7 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client,
 	snd_use_lock_init(&new_port->use_lock);
 	port_subs_info_init(&new_port->c_src);
 	port_subs_info_init(&new_port->c_dest);
+	snd_use_lock_use(&new_port->use_lock);
 
 	num = port >= 0 ? port : 0;
 	mutex_lock(&client->ports_mutex);
@@ -165,9 +168,9 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client,
 	list_add_tail(&new_port->list, &p->list);
 	client->num_ports++;
 	new_port->addr.port = num;	/* store the port number in the port */
+	sprintf(new_port->name, "port-%d", num);
 	write_unlock_irqrestore(&client->ports_lock, flags);
 	mutex_unlock(&client->ports_mutex);
-	sprintf(new_port->name, "port-%d", num);
 
 	return new_port;
 }
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -2,6 +2,7 @@ TARGETS = breakpoints
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += kcmp
+TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mqueue
 TARGETS += net
--- /dev/null
+++ b/tools/testing/selftests/memfd/.gitignore
@@ -0,0 +1,4 @@
+fuse_mnt
+fuse_test
+memfd_test
+memfd-test-file
--- /dev/null
+++ b/tools/testing/selftests/memfd/Makefile
@@ -0,0 +1,47 @@
+#
+# tools/testing/selftests/memfd/Makefile
+#
+# Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+#
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+	ARCH := X86
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH := X86
+endif
+
+CFLAGS += -D_FILE_OFFSET_BITS=64
+CFLAGS += -I../../../../arch/x86/include/generated/uapi/
+CFLAGS += -I../../../../arch/x86/include/uapi/
+CFLAGS += -I../../../../include/uapi/
+CFLAGS += -I../../../../include/
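+# Note: the -I paths above point at the in-tree uapi headers so the tests
+# pick up <linux/memfd.h> and the sealing-related fcntl definitions from
+# this source tree rather than from the host's installed headers.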
+
+all:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) memfd_test.c -o memfd_test
+else
+	echo "Not an x86 target, can't build memfd selftest"
+endif
+
+run_tests: all
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) memfd_test.c -o memfd_test
+endif
+	@./memfd_test || echo "memfd_test: [FAIL]"
+
+build_fuse:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt
+	gcc $(CFLAGS) fuse_test.c -o fuse_test
+else
+	echo "Not an x86 target, can't build memfd selftest"
+endif
+
+run_fuse: build_fuse
+	@./run_fuse_test.sh || echo "fuse_test: [FAIL]"
+
+clean:
+	$(RM) memfd_test fuse_test fuse_mnt
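+
+# Typical usage (a sketch of the targets above):
+#   make            - build memfd_test (x86 only)
+#   make run_tests  - build and run memfd_test
+#   make build_fuse - build fuse_mnt and fuse_test (requires libfuse/pkg-config)
+#   make run_fuse   - run the FUSE-backed GUP test via run_fuse_test.sh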
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_mnt.c
@@ -0,0 +1,117 @@
+/*
+ *  tools/testing/selftests/memfd/fuse_mnt.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * memfd test file-system
+ * This file uses FUSE to create a dummy file-system with only one file /memfd.
+ * This file is read-only and takes 1s per read.
+ *
+ * This file-system is used by the memfd test-cases to force the kernel to pin
+ * pages during reads(). Due to the 1s delay of this file-system, this is a
+ * nice way to test race-conditions against get_user_pages() in the kernel.
+ *
+ * We use direct_io==1 to force the kernel to use direct-IO for this
+ * file-system.
+ */
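+/*
+ * Rough usage (a sketch; run_fuse_test.sh drives this automatically):
+ *
+ *	./fuse_mnt ./mnt	mount the single-file FS on ./mnt
+ *	cat ./mnt/memfd		every read() is delayed by 1s (direct-IO)
+ *	fusermount -u ./mnt	unmount when done
+ */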
+
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+static const char memfd_content[] = "memfd-example-content";
+static const char memfd_path[] = "/memfd";
+
+static int memfd_getattr(const char *path, struct stat *st)
+{
+	memset(st, 0, sizeof(*st));
+
+	if (!strcmp(path, "/")) {
+		st->st_mode = S_IFDIR | 0755;
+		st->st_nlink = 2;
+	} else if (!strcmp(path, memfd_path)) {
+		st->st_mode = S_IFREG | 0444;
+		st->st_nlink = 1;
+		st->st_size = strlen(memfd_content);
+	} else {
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+static int memfd_readdir(const char *path,
+			 void *buf,
+			 fuse_fill_dir_t filler,
+			 off_t offset,
+			 struct fuse_file_info *fi)
+{
+	if (strcmp(path, "/"))
+		return -ENOENT;
+
+	filler(buf, ".", NULL, 0);
+	filler(buf, "..", NULL, 0);
+	filler(buf, memfd_path + 1, NULL, 0);
+
+	return 0;
+}
+
+static int memfd_open(const char *path, struct fuse_file_info *fi)
+{
+	if (strcmp(path, memfd_path))
+		return -ENOENT;
+
+	if ((fi->flags & 3) != O_RDONLY)
+		return -EACCES;
+
+	/* force direct-IO */
+	fi->direct_io = 1;
+
+	return 0;
+}
+
+static int memfd_read(const char *path,
+		      char *buf,
+		      size_t size,
+		      off_t offset,
+		      struct fuse_file_info *fi)
+{
+	size_t len;
+
+	if (strcmp(path, memfd_path) != 0)
+		return -ENOENT;
+
+	sleep(1);
+
+	len = strlen(memfd_content);
+	if (offset < len) {
+		if (offset + size > len)
+			size = len - offset;
+
+		memcpy(buf, memfd_content + offset, size);
+	} else {
+		size = 0;
+	}
+
+	return size;
+}
+
+static struct fuse_operations memfd_ops = {
+	.getattr	= memfd_getattr,
+	.readdir	= memfd_readdir,
+	.open		= memfd_open,
+	.read		= memfd_read,
+};
+
+int main(int argc, char *argv[])
+{
+	return fuse_main(argc, argv, &memfd_ops, NULL);
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_test.c
@@ -0,0 +1,318 @@
+/*
+ *  tools/testing/selftests/memfd/fuse_test.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * memfd GUP test-case
+ * This tests memfd interactions with get_user_pages(). We require the
+ * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This
+ * file-system delays _all_ reads by 1s and forces direct-IO. This means any
+ * read() on files in that file-system will pin the receive-buffer pages for at
+ * least 1s via get_user_pages().
+ *
+ * We use this trick to race ADD_SEALS against a write on a memfd object. The
+ * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use
+ * the read() syscall with our memory-mapped memfd object as receive buffer to
+ * force the kernel to write into our memfd object.
+ */
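+/*
+ * Rough flow (a sketch of main() below):
+ *
+ *   1. open the 1s-delayed FUSE file passed as argv[1]
+ *   2. memfd_create() + ftruncate() a fresh memfd and mmap() it MAP_SHARED
+ *   3. spawn a sealing thread that munmap()s the mapping and tries
+ *      F_ADD_SEALS(F_SEAL_WRITE) while the pages may still be pinned
+ *   4. read() from the FUSE file into the mapping, pinning its pages via
+ *      get_user_pages() for at least 1s
+ *   5. verify F_SEAL_WRITE ends up set and that the memfd contents match
+ *      what the (failed or successful) racing seal implies
+ */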
+
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65535
+
+static int sys_memfd_create(const char *name,
+			    unsigned int flags)
+{
+	return syscall(__NR_memfd_create, name, flags);
+}
+
+static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+{
+	int r, fd;
+
+	fd = sys_memfd_create(name, flags);
+	if (fd < 0) {
+		printf("memfd_create(\"%s\", %u) failed: %m\n",
+		       name, flags);
+		abort();
+	}
+
+	r = ftruncate(fd, sz);
+	if (r < 0) {
+		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+		abort();
+	}
+
+	return fd;
+}
+
+static __u64 mfd_assert_get_seals(int fd)
+{
+	long r;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0) {
+		printf("GET_SEALS(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_assert_has_seals(int fd, __u64 seals)
+{
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	if (s != seals) {
+		printf("%llu != %llu = GET_SEALS(%d)\n",
+		       (unsigned long long)seals, (unsigned long long)s, fd);
+		abort();
+	}
+}
+
+static void mfd_assert_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static int mfd_busy_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0)
+		s = 0;
+	else
+		s = r;
+
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0 && errno != EBUSY) {
+		printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+
+	return r;
+}
+
+static void *mfd_assert_mmap_shared(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static void *mfd_assert_mmap_private(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static int global_mfd = -1;
+static void *global_p = NULL;
+
+static int sealing_thread_fn(void *arg)
+{
+	int r;
+
+	/*
+	 * This thread first waits 200ms so any pending operation in the parent
+	 * is correctly started. After that, it tries to seal @global_mfd as
+	 * SEAL_WRITE. This _must_ fail as the parent thread has a read() into
+	 * that memory mapped object still ongoing.
+	 * We then wait one more second and try sealing again. This time it
+	 * must succeed as there shouldn't be anyone else pinning the pages.
+	 */
+
+	/* wait 200ms for FUSE-request to be active */
+	usleep(200000);
+
+	/* unmap the mapping before sealing to avoid i_mmap_writable failures */
+	munmap(global_p, MFD_DEF_SIZE);
+
+	/* Try sealing the global file; expect EBUSY or success. Current
+	 * kernels will never succeed, but in the future, kernels might
+	 * implement page-replacements or other fancy ways to avoid racing
+	 * writes. */
+	r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE);
+	if (r >= 0) {
+		printf("HURRAY! This kernel fixed GUP races!\n");
+	} else {
+		/* wait 1s more so the FUSE-request is done */
+		sleep(1);
+
+		/* try sealing the global file again */
+		mfd_assert_add_seals(global_mfd, F_SEAL_WRITE);
+	}
+
+	return 0;
+}
+
+static pid_t spawn_sealing_thread(void)
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(sealing_thread_fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_sealing_thread(pid_t pid)
+{
+	waitpid(pid, NULL, 0);
+}
+
+int main(int argc, char **argv)
+{
+	static const char zero[MFD_DEF_SIZE];
+	int fd, mfd, r;
+	void *p;
+	int was_sealed;
+	pid_t pid;
+
+	if (argc < 2) {
+		printf("error: please pass path to file in fuse_mnt mount-point\n");
+		abort();
+	}
+
+	/* open FUSE memfd file for GUP testing */
+	printf("opening: %s\n", argv[1]);
+	fd = open(argv[1], O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		printf("cannot open(\"%s\"): %m\n", argv[1]);
+		abort();
+	}
+
+	/* create new memfd-object */
+	mfd = mfd_assert_new("kern_memfd_fuse",
+			     MFD_DEF_SIZE,
+			     MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	/* mmap memfd-object for writing */
+	p = mfd_assert_mmap_shared(mfd);
+
+	/* pass mfd+mapping to a separate sealing-thread which tries to seal
+	 * the memfd objects with SEAL_WRITE while we write into it */
+	global_mfd = mfd;
+	global_p = p;
+	pid = spawn_sealing_thread();
+
+	/* Use read() on the FUSE file to read into our memory-mapped memfd
+	 * object. This races the other thread which tries to seal the
+	 * memfd-object.
+	 * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s.
+	 * This guarantees that the receive-buffer is pinned for 1s until the
+	 * data is written into it. The racing ADD_SEALS should thus fail as
+	 * the pages are still pinned. */
+	r = read(fd, p, MFD_DEF_SIZE);
+	if (r < 0) {
+		printf("read() failed: %m\n");
+		abort();
+	} else if (!r) {
+		printf("unexpected EOF on read()\n");
+		abort();
+	}
+
+	was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE;
+
+	/* Wait for sealing-thread to finish and verify that it
+	 * successfully sealed the file after the second try. */
+	join_sealing_thread(pid);
+	mfd_assert_has_seals(mfd, F_SEAL_WRITE);
+
+	/* *IF* the memfd-object was sealed at the time our read() returned,
+	 * then the kernel did a page-replacement or canceled the read() (or
+	 * whatever magic it did..). In that case, the memfd object is still
+	 * all zero.
+	 * In case the memfd-object was *not* sealed, the read() was successful
+	 * and the memfd object must *not* be all zero.
+	 * Note that in real scenarios, there might be a mixture of both, but
+	 * in this test case, we have explicit 200ms delays which should be
+	 * enough to avoid any in-flight writes. */
+
+	p = mfd_assert_mmap_private(mfd);
+	if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) {
+		printf("memfd sealed during read() but data not discarded\n");
+		abort();
+	} else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) {
+		printf("memfd sealed after read() but data discarded\n");
+		abort();
+	}
+
+	close(mfd);
+	close(fd);
+
+	printf("fuse: DONE\n");
+
+	return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -0,0 +1,920 @@
+/*
+ *  tools/testing/selftests/memfd/memfd_test.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
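+/*
+ * memfd_create() and file-sealing self-test.
+ * Roughly, main() below exercises: argument validation (test_create),
+ * basic F_ADD_SEALS/F_GET_SEALS handling (test_basic), the individual
+ * F_SEAL_WRITE/SHRINK/GROW semantics, and seal visibility across dup(),
+ * mmap(), /proc/self/fd re-open and fork(), optionally with a shared
+ * file-table.
+ */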
+
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65535
+
+static int sys_memfd_create(const char *name,
+			    unsigned int flags)
+{
+	return syscall(__NR_memfd_create, name, flags);
+}
+
+static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+{
+	int r, fd;
+
+	fd = sys_memfd_create(name, flags);
+	if (fd < 0) {
+		printf("memfd_create(\"%s\", %u) failed: %m\n",
+		       name, flags);
+		abort();
+	}
+
+	r = ftruncate(fd, sz);
+	if (r < 0) {
+		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+		abort();
+	}
+
+	return fd;
+}
+
+static void mfd_fail_new(const char *name, unsigned int flags)
+{
+	int r;
+
+	r = sys_memfd_create(name, flags);
+	if (r >= 0) {
+		printf("memfd_create(\"%s\", %u) succeeded, but failure expected\n",
+		       name, flags);
+		close(r);
+		abort();
+	}
+}
+
+static __u64 mfd_assert_get_seals(int fd)
+{
+	long r;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0) {
+		printf("GET_SEALS(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_assert_has_seals(int fd, __u64 seals)
+{
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	if (s != seals) {
+		printf("%llu != %llu = GET_SEALS(%d)\n",
+		       (unsigned long long)seals, (unsigned long long)s, fd);
+		abort();
+	}
+}
+
+static void mfd_assert_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static void mfd_fail_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0)
+		s = 0;
+	else
+		s = r;
+
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r >= 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static void mfd_assert_size(int fd, size_t size)
+{
+	struct stat st;
+	int r;
+
+	r = fstat(fd, &st);
+	if (r < 0) {
+		printf("fstat(%d) failed: %m\n", fd);
+		abort();
+	} else if (st.st_size != size) {
+		printf("wrong file size %lld, but expected %lld\n",
+		       (long long)st.st_size, (long long)size);
+		abort();
+	}
+}
+
+static int mfd_assert_dup(int fd)
+{
+	int r;
+
+	r = dup(fd);
+	if (r < 0) {
+		printf("dup(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void *mfd_assert_mmap_shared(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static void *mfd_assert_mmap_private(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static int mfd_assert_open(int fd, int flags, mode_t mode)
+{
+	char buf[512];
+	int r;
+
+	sprintf(buf, "/proc/self/fd/%d", fd);
+	r = open(buf, flags, mode);
+	if (r < 0) {
+		printf("open(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_fail_open(int fd, int flags, mode_t mode)
+{
+	char buf[512];
+	int r;
+
+	sprintf(buf, "/proc/self/fd/%d", fd);
+	r = open(buf, flags, mode);
+	if (r >= 0) {
+		printf("open(%s) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_read(int fd)
+{
+	char buf[16];
+	void *p;
+	ssize_t l;
+
+	l = read(fd, buf, sizeof(buf));
+	if (l != sizeof(buf)) {
+		printf("read() failed: %m\n");
+		abort();
+	}
+
+	/* verify PROT_READ *is* allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify MAP_PRIVATE is *always* allowed (even writable) */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	munmap(p, MFD_DEF_SIZE);
+}
+
+static void mfd_assert_write(int fd)
+{
+	ssize_t l;
+	void *p;
+	int r;
+
+	/* verify write() succeeds */
+	l = write(fd, "\0\0\0\0", 4);
+	if (l != 4) {
+		printf("write() failed: %m\n");
+		abort();
+	}
+
+	/* verify PROT_READ | PROT_WRITE is allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PROT_WRITE is allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PROT_READ with MAP_SHARED is allowed and a following
+	 * mprotect(PROT_WRITE) allows writing */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+	if (r < 0) {
+		printf("mprotect() failed: %m\n");
+		abort();
+	}
+
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PUNCH_HOLE works */
+	r = fallocate(fd,
+		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		      0,
+		      MFD_DEF_SIZE);
+	if (r < 0) {
+		printf("fallocate(PUNCH_HOLE) failed: %m\n");
+		abort();
+	}
+}
+
+static void mfd_fail_write(int fd)
+{
+	ssize_t l;
+	void *p;
+	int r;
+
+	/* verify write() fails */
+	l = write(fd, "data", 4);
+	if (l != -EPERM) {
+		printf("expected EPERM on write(), but got %d: %m\n", (int)l);
+		abort();
+	}
+
+	/* verify PROT_READ | PROT_WRITE is not allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		printf("mmap() didn't fail as expected\n");
+		abort();
+	}
+
+	/* verify PROT_WRITE is not allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		printf("mmap() didn't fail as expected\n");
+		abort();
+	}
+
+	/* Verify PROT_READ with MAP_SHARED with a following mprotect is not
+	 * allowed. Note that for r/w the kernel already prevents the mmap. */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+		if (r >= 0) {
+			printf("mmap()+mprotect() didn't fail as expected\n");
+			abort();
+		}
+	}
+
+	/* verify PUNCH_HOLE fails */
+	r = fallocate(fd,
+		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		      0,
+		      MFD_DEF_SIZE);
+	if (r >= 0) {
+		printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_shrink(int fd)
+{
+	int r, fd2;
+
+	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	if (r < 0) {
+		printf("ftruncate(SHRINK) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE / 2);
+
+	fd2 = mfd_assert_open(fd,
+			      O_RDWR | O_CREAT | O_TRUNC,
+			      S_IRUSR | S_IWUSR);
+	close(fd2);
+
+	mfd_assert_size(fd, 0);
+}
+
+static void mfd_fail_shrink(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	if (r >= 0) {
+		printf("ftruncate(SHRINK) didn't fail as expected\n");
+		abort();
+	}
+
+	mfd_fail_open(fd,
+		      O_RDWR | O_CREAT | O_TRUNC,
+		      S_IRUSR | S_IWUSR);
+}
+
+static void mfd_assert_grow(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	if (r < 0) {
+		printf("ftruncate(GROW) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 2);
+
+	r = fallocate(fd,
+		      0,
+		      0,
+		      MFD_DEF_SIZE * 4);
+	if (r < 0) {
+		printf("fallocate(ALLOC) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 4);
+}
+
+static void mfd_fail_grow(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	if (r >= 0) {
+		printf("ftruncate(GROW) didn't fail as expected\n");
+		abort();
+	}
+
+	r = fallocate(fd,
+		      0,
+		      0,
+		      MFD_DEF_SIZE * 4);
+	if (r >= 0) {
+		printf("fallocate(ALLOC) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_grow_write(int fd)
+{
+	static char buf[MFD_DEF_SIZE * 8];
+	ssize_t l;
+
+	l = pwrite(fd, buf, sizeof(buf), 0);
+	if (l != sizeof(buf)) {
+		printf("pwrite() failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 8);
+}
+
+static void mfd_fail_grow_write(int fd)
+{
+	static char buf[MFD_DEF_SIZE * 8];
+	ssize_t l;
+
+	l = pwrite(fd, buf, sizeof(buf), 0);
+	if (l == sizeof(buf)) {
+		printf("pwrite() didn't fail as expected\n");
+		abort();
+	}
+}
+
+static int idle_thread_fn(void *arg)
+{
+	sigset_t set;
+	int sig;
+
+	/* dummy waiter; SIGTERM terminates us anyway */
+	sigemptyset(&set);
+	sigaddset(&set, SIGTERM);
+	sigwait(&set, &sig);
+
+	return 0;
+}
+
+static pid_t spawn_idle_thread(unsigned int flags)
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(idle_thread_fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | flags,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_idle_thread(pid_t pid)
+{
+	kill(pid, SIGTERM);
+	waitpid(pid, NULL, 0);
+}
+
+/*
+ * Test memfd_create() syscall
+ * Verify syscall-argument validation, including name checks, flag validation
+ * and more.
+ */
+static void test_create(void)
+{
+	char buf[2048];
+	int fd;
+
+	/* test NULL name */
+	mfd_fail_new(NULL, 0);
+
+	/* test over-long name (not zero-terminated) */
+	memset(buf, 0xff, sizeof(buf));
+	mfd_fail_new(buf, 0);
+
+	/* test over-long zero-terminated name */
+	memset(buf, 0xff, sizeof(buf));
+	buf[sizeof(buf) - 1] = 0;
+	mfd_fail_new(buf, 0);
+
+	/* verify "" is a valid name */
+	fd = mfd_assert_new("", 0, 0);
+	close(fd);
+
+	/* verify invalid O_* open flags */
+	mfd_fail_new("", 0x0100);
+	mfd_fail_new("", ~MFD_CLOEXEC);
+	mfd_fail_new("", ~MFD_ALLOW_SEALING);
+	mfd_fail_new("", ~0);
+	mfd_fail_new("", 0x80000000U);
+
+	/* verify MFD_CLOEXEC is allowed */
+	fd = mfd_assert_new("", 0, MFD_CLOEXEC);
+	close(fd);
+
+	/* verify MFD_ALLOW_SEALING is allowed */
+	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
+	close(fd);
+
+	/* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
+	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+	close(fd);
+}
+
+/*
+ * Test basic sealing
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_basic(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_basic",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	/* add basic seals */
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+
+	/* add them again */
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+
+	/* add more seals and seal against sealing */
+	mfd_assert_add_seals(fd, F_SEAL_GROW | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_GROW |
+				 F_SEAL_WRITE |
+				 F_SEAL_SEAL);
+
+	/* verify that sealing no longer works */
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	mfd_fail_add_seals(fd, 0);
+
+	close(fd);
+
+	/* verify sealing does not work without MFD_ALLOW_SEALING */
+	fd = mfd_assert_new("kern_memfd_basic",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+	mfd_fail_add_seals(fd, F_SEAL_SHRINK |
+			       F_SEAL_GROW |
+			       F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+	close(fd);
+}
+
+/*
+ * Test SEAL_WRITE
+ * Test whether SEAL_WRITE actually prevents modifications.
+ */
+static void test_seal_write(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_write",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+
+	mfd_assert_read(fd);
+	mfd_fail_write(fd);
+	mfd_assert_shrink(fd);
+	mfd_assert_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK
+ * Test whether SEAL_SHRINK actually prevents shrinking
+ */
+static void test_seal_shrink(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_shrink",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_fail_shrink(fd);
+	mfd_assert_grow(fd);
+	mfd_assert_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_GROW
+ * Test whether SEAL_GROW actually prevents growing
+ */
+static void test_seal_grow(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_grow",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_GROW);
+	mfd_assert_has_seals(fd, F_SEAL_GROW);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_assert_shrink(fd);
+	mfd_fail_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK | SEAL_GROW
+ * Test whether SEAL_SHRINK | SEAL_GROW actually prevents resizing
+ */
+static void test_seal_resize(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_resize",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_fail_shrink(fd);
+	mfd_fail_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test sharing via dup()
+ * Test that seals are shared between dupped FDs and they're all equal.
+ */
+static void test_share_dup(void)
+{
+	int fd, fd2;
+
+	fd = mfd_assert_new("kern_memfd_share_dup",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	fd2 = mfd_assert_dup(fd);
+	mfd_assert_has_seals(fd2, 0);
+
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	mfd_assert_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	mfd_fail_add_seals(fd2, F_SEAL_GROW);
+	mfd_fail_add_seals(fd, F_SEAL_SEAL);
+	mfd_fail_add_seals(fd2, F_SEAL_SEAL);
+
+	close(fd2);
+
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	close(fd);
+}
+
+/*
+ * Test sealing with active mmap()s
+ * Adding F_SEAL_WRITE is only allowed while no shared, writable mmap() refs exist.
+ */
+static void test_share_mmap(void)
+{
+	int fd;
+	void *p;
+
+	fd = mfd_assert_new("kern_memfd_share_mmap",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	/* shared/writable ref prevents sealing WRITE, but allows others */
+	p = mfd_assert_mmap_shared(fd);
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+	munmap(p, MFD_DEF_SIZE);
+
+	/* readable ref allows sealing */
+	p = mfd_assert_mmap_private(fd);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	munmap(p, MFD_DEF_SIZE);
+
+	close(fd);
+}
+
+/*
+ * Test sealing with open(/proc/self/fd/%d)
+ * Via /proc we can get access to a separate file-context for the same memfd.
+ * This is *not* like dup(), but like a real separate open(). Make sure the
+ * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
+ */
+static void test_share_open(void)
+{
+	int fd, fd2;
+
+	fd = mfd_assert_new("kern_memfd_share_open",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	fd2 = mfd_assert_open(fd, O_RDWR, 0);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	close(fd);
+	fd = mfd_assert_open(fd2, O_RDONLY, 0);
+
+	mfd_fail_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	close(fd2);
+	fd2 = mfd_assert_open(fd, O_RDWR, 0);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+	close(fd2);
+	close(fd);
+}
+
+/*
+ * Test sharing via fork()
+ * Test whether seal-modifications work as expected with forked children.
+ */
+static void test_share_fork(void)
+{
+	int fd;
+	pid_t pid;
+
+	fd = mfd_assert_new("kern_memfd_share_fork",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	pid = spawn_idle_thread(0);
+	mfd_assert_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	join_idle_thread(pid);
+
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	close(fd);
+}
+
+int main(int argc, char **argv)
+{
+	pid_t pid;
+
+	printf("memfd: CREATE\n");
+	test_create();
+	printf("memfd: BASIC\n");
+	test_basic();
+
+	printf("memfd: SEAL-WRITE\n");
+	test_seal_write();
+	printf("memfd: SEAL-SHRINK\n");
+	test_seal_shrink();
+	printf("memfd: SEAL-GROW\n");
+	test_seal_grow();
+	printf("memfd: SEAL-RESIZE\n");
+	test_seal_resize();
+
+	printf("memfd: SHARE-DUP\n");
+	test_share_dup();
+	printf("memfd: SHARE-MMAP\n");
+	test_share_mmap();
+	printf("memfd: SHARE-OPEN\n");
+	test_share_open();
+	printf("memfd: SHARE-FORK\n");
+	test_share_fork();
+
+	/* Run test-suite in a multi-threaded environment with a shared
+	 * file-table. */
+	pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
+	printf("memfd: SHARE-DUP (shared file-table)\n");
+	test_share_dup();
+	printf("memfd: SHARE-MMAP (shared file-table)\n");
+	test_share_mmap();
+	printf("memfd: SHARE-OPEN (shared file-table)\n");
+	test_share_open();
+	printf("memfd: SHARE-FORK (shared file-table)\n");
+	test_share_fork();
+	join_idle_thread(pid);
+
+	printf("memfd: DONE\n");
+
+	return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_fuse_test.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
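+# Helper for the Makefile's run_fuse target: clean up any stale ./mnt from a
+# previous run, mount fuse_mnt on ./mnt, run fuse_test against ./mnt/memfd,
+# then unmount and remove the mount-point.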
+
+if test -d "./mnt" ; then
+	fusermount -u ./mnt
+	rmdir ./mnt
+fi
+
+set -e
+
+mkdir mnt
+./fuse_mnt ./mnt
+./fuse_test ./mnt/memfd
+fusermount -u ./mnt
+rmdir ./mnt
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -167,6 +167,14 @@ static int setup_routing_entry(struct kvm *kvm,
 	return r;
 }
 
+void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+}
+
+void __attribute__((weak)) kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+}
+
 int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *ue,
 			unsigned nr,
@@ -220,9 +228,10 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	old = kvm->irq_routing;
 	rcu_assign_pointer(kvm->irq_routing, new);
 	kvm_irq_routing_update(kvm);
+	kvm_arch_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
-	kvm_arch_irq_routing_update(kvm);
+	kvm_arch_post_irq_routing_update(kvm);
 
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -410,6 +410,35 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * Even though we do not flush TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear so that we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+	 * add a parameter to kvm_age_hva so that it effectively doesn't
+	 * do anything on clear_young.
+	 *
+	 * Also note that currently we never issue secondary TLB flushes
+	 * from clear_young, leaving this job up to the regular system
+	 * cadence. If we find this inaccurate, we might come up with a
+	 * more sophisticated heuristic later.
+	 */
+	young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -442,6 +471,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
@@ -1671,7 +1701,8 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 	if (!kvm_is_reserved_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (!PageReserved(page))
+		if (!PageReserved(page) &&
+		    (!page->mapping || PageAnon(page)))
 			SetPageDirty(page);
 	}
 }
