summaryrefslogtreecommitdiffstats
path: root/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch
blob: 7197ce71d4fb98c7d5d97675bb1f5d8463859b0d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
From c08f0e4d768db796098c8bdc64c3358baee076e7 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <sebastien.boeuf@intel.com>
Date: Mon, 23 Jan 2017 15:08:55 -0800
Subject: [PATCH 152/154] x86: kvm: Notify host to release pages

In context of hypervisors managing several virtual machines, we
want those virtual machines to give the memory they used back to
the host when they don't need it anymore.

This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing
the guest kernel to notify the host kernel when such event occurs.
And relying on do_madvise() function that we have previously exported,
it issues a call to this function when it receives the new hypercall.

Use of do_madvise() with MADV_DONTNEED flag will allow the guest to
ask for a new page without going through a new hypercall. Instead,
it will be able to start using that memory again as it will get
faulted back in as a fresh new page. That's why do_madvise() is more
efficient than doing vm_unmap() to return some memory to the host.

This patch introduces also a new sysctl kvm_madv_instant_free,
allowing user to set MADV_FREE advice instead of MADV_DONTNEED.
Indeed, MADV_FREE saves more performances than using MADV_DONTNEED
because it does not zero the pages in case the memory has not been
freed by the kernel. This can happen when there was no need for the
kernel to get this memory back, meaning it was keeping those pages
in the right state to be re-used by the same application.
MADV_FREE being a very recent advice introduced in kernel 4.5, we
only want to enable it through a sysctl in case the user want to
use it.

Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
---
 arch/x86/kvm/x86.c            | 17 +++++++++++++++++
 include/linux/mm.h            |  5 +++++
 include/uapi/linux/kvm_para.h |  3 +++
 kernel/sysctl.c               |  7 +++++++
 mm/Makefile                   |  2 +-
 mm/kvm.c                      | 26 ++++++++++++++++++++++++++
 6 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 mm/kvm.c

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0cb83b..7bd380ff8dfa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -45,6 +45,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/perf_event.h>
 #include <linux/uaccess.h>
 #include <linux/hash.h>
@@ -6206,6 +6207,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
 
+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+	unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+	if (len > KVM_MAX_RET_MEM_SIZE)
+		return KVM_EPERM;
+
+	if (kvm_is_error_hva(start + len))
+		return KVM_EFAULT;
+
+	return do_madvise(start, len, kvm_ret_mem_advice);
+}
+
 void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.apicv_active = false;
@@ -6257,6 +6271,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
 #endif
+	case KVM_HC_RETURN_MEM:
+		ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf52e0498247..d8bcf5c4b996 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2406,6 +2406,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
 extern int sysctl_drop_caches;
 int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+extern int sysctl_kvm_madv_instant_free;
+extern int kvm_ret_mem_advice;
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+					 void __user *buffer, size_t *length,
+					 loff_t *ppos);
 #endif
 
 void drop_slab(void);
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index fed506aeff62..ebc482ce7d38 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -25,6 +25,9 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
+#define KVM_HC_RETURN_MEM		10
+
+#define KVM_MAX_RET_MEM_SIZE		(1 << 22) // 4MiB
 
 /*
  * hypercalls use architecture specific
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..771a930cadfa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1387,6 +1387,13 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &one,
 		.extra2		= &four,
 	},
+	{
+		.procname	= "kvm_madv_instant_free",
+		.data		= &sysctl_kvm_madv_instant_free,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= kvm_madv_instant_free_sysctl_handler,
+	},
 #ifdef CONFIG_COMPACTION
 	{
 		.procname	= "compact_memory",
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..28d95bed7e1d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,7 +39,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o swap_slots.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   debug.o $(mmu-y)
+			   debug.o kvm.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/kvm.c b/mm/kvm.c
new file mode 100644
index 000000000000..1c5600788221
--- /dev/null
+++ b/mm/kvm.c
@@ -0,0 +1,26 @@
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+
+int sysctl_kvm_madv_instant_free;
+
+int kvm_ret_mem_advice = MADV_DONTNEED;
+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
+
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+#ifdef MADV_FREE
+	if (sysctl_kvm_madv_instant_free > 0)
+		kvm_ret_mem_advice = MADV_FREE;
+	else
+		kvm_ret_mem_advice = MADV_DONTNEED;
+#endif
+
+	return 0;
+}
-- 
2.13.2