summaryrefslogtreecommitdiffstats
path: root/patches/boot_time_opt_guest
diff options
context:
space:
mode:
Diffstat (limited to 'patches/boot_time_opt_guest')
-rw-r--r--patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch34
-rw-r--r--patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch38
-rw-r--r--patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch26
-rw-r--r--patches/boot_time_opt_guest/0105-vmstats-wakeups.patch28
-rw-r--r--patches/boot_time_opt_guest/0106-pci-probe.patch123
-rw-r--r--patches/boot_time_opt_guest/0107-cgroup.patch107
-rw-r--r--patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch45
-rw-r--r--patches/boot_time_opt_guest/0109-perf.patch28
-rw-r--r--patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch190
-rw-r--r--patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch39
-rw-r--r--patches/boot_time_opt_guest/0112-ksm-wakeups.patch32
-rw-r--r--patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch42
-rw-r--r--patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch56
-rw-r--r--patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch84
-rw-r--r--patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch180
-rw-r--r--patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch155
-rw-r--r--patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch137
-rw-r--r--patches/boot_time_opt_guest/guest_boot_time_opt.scc19
18 files changed, 1363 insertions, 0 deletions
diff --git a/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch
new file mode 100644
index 0000000..1de2a6b
--- /dev/null
+++ b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch
@@ -0,0 +1,34 @@
1From 6b0fb5b2a7a157c04d8ab6ad71b092034d0048bf Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:19:26 -0600
4Subject: [PATCH 102/114] cpuidle: skip synchronize_rcu() on single CPU systems
5
6synchronize_rcu() is pretty expensive, and on single CPU systems we don't need
7it in this specific case, so skip it.
8
9Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 drivers/cpuidle/cpuidle.c | 5 ++++-
13 1 file changed, 4 insertions(+), 1 deletion(-)
14
15diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
16index 62810ff3b00f..f1d110411098 100644
17--- a/drivers/cpuidle/cpuidle.c
18+++ b/drivers/cpuidle/cpuidle.c
19@@ -324,8 +324,11 @@ void cpuidle_uninstall_idle_handler(void)
20 /*
21 * Make sure external observers (such as the scheduler)
22 * are done looking at pointed idle states.
23+ * This is only relevant if there is more than one cpu,
24+ * if there is only one CPU, that is us... and we're
25+ * coherent to ourselves.
26 */
27- synchronize_rcu();
28+
29 }
30
31 /**
32--
332.11.1
34
diff --git a/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
new file mode 100644
index 0000000..d3a20fb
--- /dev/null
+++ b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
@@ -0,0 +1,38 @@
1From 7be707833bb35c295eb702d13cf73ac9390e4b31 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:25:16 -0600
4Subject: [PATCH 103/114] sysrq: skip synchronize_rcu() if there is no old op
5
6synchronize_rcu() is expensive. Currently it is called as part of the sysrq
7registration/unregistration, which happens during boot several times.
8Now, the reason for the synchronize_rcu() is to allow an old registered
9operation to expire properly... which is pointless if the old operation
10is NULL...
11So we can save the common case of the old operation being NULL a lot of time
12by just checking for non-NULL prior to the synchronize_rcu()
13
14Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
15Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
16---
17 drivers/tty/sysrq.c | 4 +++-
18 1 file changed, 3 insertions(+), 1 deletion(-)
19
20diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
21index 701c085bb19b..c60c7ba57ad9 100644
22--- a/drivers/tty/sysrq.c
23+++ b/drivers/tty/sysrq.c
24@@ -1065,8 +1065,10 @@ static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p,
25 * A concurrent __handle_sysrq either got the old op or the new op.
26 * Wait for it to go away before returning, so the code for an old
27 * op is not freed (eg. on module unload) while it is in use.
28+ * This is only relevant if the old op is not NULL of course.
29 */
30- synchronize_rcu();
31+ if (remove_op_p)
32+ synchronize_rcu();
33
34 return retval;
35 }
36--
372.11.1
38
diff --git a/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch
new file mode 100644
index 0000000..715c195
--- /dev/null
+++ b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch
@@ -0,0 +1,26 @@
1From 5899ff79ed4e3514420e1530a3588a922832dae5 Mon Sep 17 00:00:00 2001
2From: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
3Date: Mon, 13 Apr 2015 11:26:36 -0500
4Subject: [PATCH 104/114] fbcon: enable no blink by default
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7---
8 drivers/video/console/fbcon.c | 2 +-
9 1 file changed, 1 insertion(+), 1 deletion(-)
10
11diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
12index a44f5627b82a..95b73366b86f 100644
13--- a/drivers/video/console/fbcon.c
14+++ b/drivers/video/console/fbcon.c
15@@ -146,7 +146,7 @@ static const struct consw fb_con;
16
17 static int fbcon_set_origin(struct vc_data *);
18
19-static int fbcon_cursor_noblink;
20+static int fbcon_cursor_noblink = 1;
21
22 #define divides(a, b) ((!(a) || (b)%(a)) ? 0 : 1)
23
24--
252.11.1
26
diff --git a/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch
new file mode 100644
index 0000000..09b109a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch
@@ -0,0 +1,28 @@
1From ff47b4e9be8113b4ba05d6f2afee3db6904bc10f Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:47:20 -0600
4Subject: [PATCH 105/114] vmstats: wakeups
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 mm/vmstat.c | 2 +-
11 1 file changed, 1 insertion(+), 1 deletion(-)
12
13diff --git a/mm/vmstat.c b/mm/vmstat.c
14index 7c28df36f50f..efe1b6797139 100644
15--- a/mm/vmstat.c
16+++ b/mm/vmstat.c
17@@ -1549,7 +1549,7 @@ static const struct file_operations proc_vmstat_file_operations = {
18 #ifdef CONFIG_SMP
19 static struct workqueue_struct *vmstat_wq;
20 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
21-int sysctl_stat_interval __read_mostly = HZ;
22+int sysctl_stat_interval __read_mostly = 8 * HZ;
23
24 #ifdef CONFIG_PROC_FS
25 static void refresh_vm_stats(struct work_struct *work)
26--
272.11.1
28
diff --git a/patches/boot_time_opt_guest/0106-pci-probe.patch b/patches/boot_time_opt_guest/0106-pci-probe.patch
new file mode 100644
index 0000000..5045926
--- /dev/null
+++ b/patches/boot_time_opt_guest/0106-pci-probe.patch
@@ -0,0 +1,123 @@
1From b225caf8f743b9f5f9e84d0df711ee0c17e049ae Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:53:08 -0600
4Subject: [PATCH 106/114] pci: probe
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 drivers/pci/probe.c | 43 ++++++++++++++++++++++++++++++++++++++++---
11 1 file changed, 40 insertions(+), 3 deletions(-)
12
13diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
14index 204960e70333..7399a06698da 100644
15--- a/drivers/pci/probe.c
16+++ b/drivers/pci/probe.c
17@@ -182,6 +182,10 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
18
19 mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
20
21+ res->name = pci_name(dev);
22+
23+ printk("clr: Starting probe for %s\n", res->name);
24+
25 /* No printks while decoding is disabled! */
26 if (!dev->mmio_always_on) {
27 pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
28@@ -191,8 +195,6 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
29 }
30 }
31
32- res->name = pci_name(dev);
33-
34 pci_read_config_dword(dev, pos, &l);
35 pci_write_config_dword(dev, pos, l | mask);
36 pci_read_config_dword(dev, pos, &sz);
37@@ -324,6 +326,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
38 if (dev->non_compliant_bars)
39 return;
40
41+ printk("clr: pci_read_bases start\n");
42+
43 for (pos = 0; pos < howmany; pos++) {
44 struct resource *res = &dev->resource[pos];
45 reg = PCI_BASE_ADDRESS_0 + (pos << 2);
46@@ -332,11 +336,13 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
47
48 if (rom) {
49 struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
50+ printk("clr: rom path\n");
51 dev->rom_base_reg = rom;
52 res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
53 IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
54 __pci_read_base(dev, pci_bar_mem32, res, rom);
55 }
56+ printk("clr: pci_read_bases end\n");
57 }
58
59 static void pci_read_bridge_io(struct pci_bus *child)
60@@ -1311,6 +1317,28 @@ static void pci_msi_setup_pci_dev(struct pci_dev *dev)
61 pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
62 }
63
64+static int guess_bar_count(int class)
65+{
66+ if (class == 0x068000)
67+ return 0;
68+ if (class == 0x020000)
69+ return 2;
70+ if (class == 0x010000)
71+ return 2;
72+ if (class == 0x00ff00)
73+ return 1;
74+ return 6;
75+}
76+
77+static int has_rom(int class, int rom)
78+{
79+ if (class == 0x020000)
80+ return 0;
81+ if (class == 0x010000 || class == 0x00ff00)
82+ return 0;
83+ return rom;
84+}
85+
86 /**
87 * pci_setup_device - fill in class and map information of a device
88 * @dev: the device structure to fill
89@@ -1329,6 +1357,9 @@ int pci_setup_device(struct pci_dev *dev)
90 int pos = 0;
91 struct pci_bus_region region;
92 struct resource *res;
93+ int maxbar;
94+
95+ printk("clr: pci_setup_device start\n");
96
97 if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
98 return -EIO;
99@@ -1383,7 +1414,11 @@ int pci_setup_device(struct pci_dev *dev)
100 if (class == PCI_CLASS_BRIDGE_PCI)
101 goto bad;
102 pci_read_irq(dev);
103- pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
104+
105+ maxbar = guess_bar_count(dev->class);
106+
107+ if (class != PCI_CLASS_STORAGE_IDE)
108+ pci_read_bases(dev, maxbar, has_rom(dev->class, PCI_ROM_ADDRESS));
109 pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
110 pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
111
112@@ -1468,6 +1503,8 @@ int pci_setup_device(struct pci_dev *dev)
113 dev->class = PCI_CLASS_NOT_DEFINED << 8;
114 }
115
116+ printk("clr: pci_setup_device end\n");
117+
118 /* We found a fine healthy device, go go go... */
119 return 0;
120 }
121--
1222.11.1
123
diff --git a/patches/boot_time_opt_guest/0107-cgroup.patch b/patches/boot_time_opt_guest/0107-cgroup.patch
new file mode 100644
index 0000000..d68c686
--- /dev/null
+++ b/patches/boot_time_opt_guest/0107-cgroup.patch
@@ -0,0 +1,107 @@
1From 0adc5bfd84939d11d3c172eab0a00bfab4aadb46 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Fri, 28 Aug 2015 11:00:36 -0500
4Subject: [PATCH 107/114] cgroup
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9Signed-off-by: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
10---
11 include/linux/cgroup-defs.h | 2 +-
12 kernel/cgroup.c | 24 ++++++++++++++----------
13 2 files changed, 15 insertions(+), 11 deletions(-)
14
15diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
16index 861b4677fc5b..5d3c345ee60c 100644
17--- a/include/linux/cgroup-defs.h
18+++ b/include/linux/cgroup-defs.h
19@@ -137,7 +137,7 @@ struct cgroup_subsys_state {
20
21 /* percpu_ref killing and RCU release */
22 struct rcu_head rcu_head;
23- struct work_struct destroy_work;
24+ struct delayed_work destroy_work;
25 };
26
27 /*
28diff --git a/kernel/cgroup.c b/kernel/cgroup.c
29index 53bbca7c4859..6de39d8213ed 100644
30--- a/kernel/cgroup.c
31+++ b/kernel/cgroup.c
32@@ -73,7 +73,7 @@
33 * Expiring in the middle is a performance problem not a correctness one.
34 * 1 sec should be enough.
35 */
36-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
37+#define CGROUP_PIDLIST_DESTROY_DELAY round_jiffies_relative(HZ)
38
39 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
40 MAX_CFTYPE_NAME + 2)
41@@ -4986,8 +4986,9 @@ static struct cftype cgroup_legacy_base_files[] = {
42 */
43 static void css_free_work_fn(struct work_struct *work)
44 {
45+ struct delayed_work *dwork = to_delayed_work(work);
46 struct cgroup_subsys_state *css =
47- container_of(work, struct cgroup_subsys_state, destroy_work);
48+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
49 struct cgroup_subsys *ss = css->ss;
50 struct cgroup *cgrp = css->cgroup;
51
52@@ -5036,14 +5037,15 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
53 struct cgroup_subsys_state *css =
54 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
55
56- INIT_WORK(&css->destroy_work, css_free_work_fn);
57- queue_work(cgroup_destroy_wq, &css->destroy_work);
58+ INIT_DELAYED_WORK(&css->destroy_work, css_free_work_fn);
59+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
60 }
61
62 static void css_release_work_fn(struct work_struct *work)
63 {
64+ struct delayed_work *dwork = to_delayed_work(work);
65 struct cgroup_subsys_state *css =
66- container_of(work, struct cgroup_subsys_state, destroy_work);
67+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
68 struct cgroup_subsys *ss = css->ss;
69 struct cgroup *cgrp = css->cgroup;
70
71@@ -5088,8 +5090,9 @@ static void css_release(struct percpu_ref *ref)
72 struct cgroup_subsys_state *css =
73 container_of(ref, struct cgroup_subsys_state, refcnt);
74
75- INIT_WORK(&css->destroy_work, css_release_work_fn);
76- queue_work(cgroup_destroy_wq, &css->destroy_work);
77+ INIT_DELAYED_WORK(&css->destroy_work, css_release_work_fn);
78+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
79+
80 }
81
82 static void init_and_link_css(struct cgroup_subsys_state *css,
83@@ -5371,8 +5374,9 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
84 */
85 static void css_killed_work_fn(struct work_struct *work)
86 {
87+ struct delayed_work *dwork = to_delayed_work(work);
88 struct cgroup_subsys_state *css =
89- container_of(work, struct cgroup_subsys_state, destroy_work);
90+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
91
92 mutex_lock(&cgroup_mutex);
93
94@@ -5393,8 +5397,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
95 container_of(ref, struct cgroup_subsys_state, refcnt);
96
97 if (atomic_dec_and_test(&css->online_cnt)) {
98- INIT_WORK(&css->destroy_work, css_killed_work_fn);
99- queue_work(cgroup_destroy_wq, &css->destroy_work);
100+ INIT_DELAYED_WORK(&css->destroy_work, css_killed_work_fn);
101+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
102 }
103 }
104
105--
1062.11.1
107
diff --git a/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch
new file mode 100644
index 0000000..48be94a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch
@@ -0,0 +1,45 @@
1From 634947be6c24d844af5f6ecf59453f2ddc09e032 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 17:28:14 -0600
4Subject: [PATCH 108/114] smpboot: reuse timer calibration
5
6No point recalibrating for a known-constant TSC... saves 200ms+ of boot time.
7
8Author: Arjan van de Ven <arjan@linux.intel.com>
9
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 arch/x86/kernel/smpboot.c | 2 +-
13 arch/x86/kernel/tsc.c | 3 +++
14 2 files changed, 4 insertions(+), 1 deletion(-)
15
16diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
17index 99b920d0e516..e17bb425bb52 100644
18--- a/arch/x86/kernel/smpboot.c
19+++ b/arch/x86/kernel/smpboot.c
20@@ -761,7 +761,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
21 pr_debug("Waiting for send to finish...\n");
22 send_status = safe_apic_wait_icr_idle();
23
24- udelay(init_udelay);
25+ udelay(100);
26
27 pr_debug("Deasserting INIT\n");
28
29diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
30index 37e7cf544e51..e99be8a6a132 100644
31--- a/arch/x86/kernel/tsc.c
32+++ b/arch/x86/kernel/tsc.c
33@@ -1413,6 +1413,9 @@ unsigned long calibrate_delay_is_known(void)
34 if (!mask)
35 return 0;
36
37+ if (cpu !=0)
38+ return cpu_data(0).loops_per_jiffy;
39+
40 sibling = cpumask_any_but(mask, cpu);
41 if (sibling < nr_cpu_ids)
42 return cpu_data(sibling).loops_per_jiffy;
43--
442.11.1
45
diff --git a/patches/boot_time_opt_guest/0109-perf.patch b/patches/boot_time_opt_guest/0109-perf.patch
new file mode 100644
index 0000000..75f50f6
--- /dev/null
+++ b/patches/boot_time_opt_guest/0109-perf.patch
@@ -0,0 +1,28 @@
1From cce700dfbd5fdbf72b96e6479ca539ab4d880ce2 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 4 Nov 2015 15:17:10 -0600
4Subject: [PATCH 109/114] perf
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 arch/x86/events/intel/core.c | 2 +-
11 1 file changed, 1 insertion(+), 1 deletion(-)
12
13diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
14index eb1484c86bb4..c13ea26ac066 100644
15--- a/arch/x86/events/intel/core.c
16+++ b/arch/x86/events/intel/core.c
17@@ -4040,7 +4040,7 @@ __init int intel_pmu_init(void)
18 */
19 if (x86_pmu.extra_regs) {
20 for (er = x86_pmu.extra_regs; er->msr; er++) {
21- er->extra_msr_access = check_msr(er->msr, 0x11UL);
22+ er->extra_msr_access = false;
23 /* Disable LBR select mapping */
24 if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
25 x86_pmu.lbr_sel_map = NULL;
26--
272.11.1
28
diff --git a/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch
new file mode 100644
index 0000000..742a045
--- /dev/null
+++ b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch
@@ -0,0 +1,190 @@
1From c662d99134b67c58e63ecc17c2531588a3a51596 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sat, 14 Feb 2015 09:49:41 -0600
4Subject: [PATCH 110/114] pci: probe: identify known devices
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7Modify-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
8
9Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
10---
11 drivers/pci/probe.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++
12 1 file changed, 156 insertions(+)
13
14diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
15index 7399a06698da..4fb2d7fed4c5 100644
16--- a/drivers/pci/probe.c
17+++ b/drivers/pci/probe.c
18@@ -163,6 +163,159 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
19
20 #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
21
22+/* shortcut version of __pci_read_base where we know the sizes already */
23+int __pci_read_base_shortcut(struct pci_dev *dev, enum pci_bar_type type,
24+ struct resource *res, unsigned int pos, u32 sz_in, u32 sz2_in)
25+{
26+ u32 l, sz;
27+ u64 l64, sz64, mask64;
28+ struct pci_bus_region region, inverted_region;
29+
30+ res->name = pci_name(dev);
31+
32+ pci_read_config_dword(dev, pos, &l);
33+
34+ sz = sz_in;
35+
36+ /*
37+ * All bits set in sz means the device isn't working properly.
38+ * If the BAR isn't implemented, all bits must be 0. If it's a
39+ * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit
40+ * 1 must be clear.
41+ * Here we set the size and is not 0xffffffff
42+ */
43+
44+ /*
45+ * I don't know how l can have all bits set. Copied from old code.
46+ * Maybe it fixes a bug on some ancient platform.
47+ */
48+ if (l == 0xffffffff)
49+ l = 0;
50+
51+ if (type == pci_bar_unknown) {
52+ res->flags = decode_bar(dev, l);
53+ res->flags |= IORESOURCE_SIZEALIGN;
54+ if (res->flags & IORESOURCE_IO) {
55+ l64 = l & PCI_BASE_ADDRESS_IO_MASK;
56+ sz64 = sz & PCI_BASE_ADDRESS_IO_MASK;
57+ mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT;
58+ } else {
59+ l64 = l & PCI_BASE_ADDRESS_MEM_MASK;
60+ sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK;
61+ mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK;
62+ }
63+ } else {
64+ res->flags |= (l & IORESOURCE_ROM_ENABLE);
65+ l64 = l & PCI_ROM_ADDRESS_MASK;
66+ sz64 = sz & PCI_ROM_ADDRESS_MASK;
67+ mask64 = (u32)PCI_ROM_ADDRESS_MASK;
68+ }
69+
70+ if (res->flags & IORESOURCE_MEM_64) {
71+ pci_read_config_dword(dev, pos + 4, &l);
72+ sz = sz2_in;
73+
74+ l64 |= ((u64)l << 32);
75+ sz64 |= ((u64)sz << 32);
76+ mask64 |= ((u64)~0 << 32);
77+ }
78+
79+ if (!sz64)
80+ goto fail;
81+
82+ sz64 = pci_size(l64, sz64, mask64);
83+ if (!sz64) {
84+ dev_info(&dev->dev, FW_BUG "reg 0x%x: invalid BAR (can't size)\n",
85+ pos);
86+ goto fail;
87+ }
88+
89+ if (res->flags & IORESOURCE_MEM_64) {
90+ if ((sizeof(dma_addr_t) < 8 || sizeof(resource_size_t) < 8) &&
91+ sz64 > 0x100000000ULL) {
92+ res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED;
93+ res->start = 0;
94+ res->end = 0;
95+ dev_err(&dev->dev, "reg 0x%x: can't handle BAR larger than 4GB (size %#010llx)\n",
96+ pos, (unsigned long long)sz64);
97+ goto out;
98+ }
99+
100+ if ((sizeof(dma_addr_t) < 8) && l) {
101+ /* Above 32-bit boundary; try to reallocate */
102+ res->flags |= IORESOURCE_UNSET;
103+ res->start = 0;
104+ res->end = sz64;
105+ dev_info(&dev->dev, "reg 0x%x: can't handle BAR above 4GB (bus address %#010llx)\n",
106+ pos, (unsigned long long)l64);
107+ goto out;
108+ }
109+ }
110+
111+ region.start = l64;
112+ region.end = l64 + sz64;
113+
114+ pcibios_bus_to_resource(dev->bus, res, &region);
115+ pcibios_resource_to_bus(dev->bus, &inverted_region, res);
116+
117+ /*
118+ * If "A" is a BAR value (a bus address), "bus_to_resource(A)" is
119+ * the corresponding resource address (the physical address used by
120+ * the CPU. Converting that resource address back to a bus address
121+ * should yield the original BAR value:
122+ *
123+ * resource_to_bus(bus_to_resource(A)) == A
124+ *
125+ * If it doesn't, CPU accesses to "bus_to_resource(A)" will not
126+ * be claimed by the device.
127+ */
128+ if (inverted_region.start != region.start) {
129+ res->flags |= IORESOURCE_UNSET;
130+ res->start = 0;
131+ res->end = region.end - region.start;
132+ dev_info(&dev->dev, "reg 0x%x: initial BAR value %#010llx invalid\n",
133+ pos, (unsigned long long)region.start);
134+ }
135+
136+ goto out;
137+
138+
139+fail:
140+ res->flags = 0;
141+out:
142+ if (res->flags)
143+ dev_printk(KERN_DEBUG, &dev->dev, "reg 0x%x: %pR\n", pos, res);
144+
145+ return (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
146+}
147+
148+static int is_known_device(struct pci_dev *dev, int pos, int *sz)
149+{
150+ /* Red Hat, Inc : Virtio network device */
151+ if (dev->vendor == 0x1af4 && dev->device == 0x1000) {
152+ if (pos == 0x10) {
153+ *sz = 0xffffffe1;
154+ return 1;
155+ }
156+ if (pos == 0x14) {
157+ *sz = 0xfffff000;
158+ return 1;
159+ }
160+ }
161+ /* Red Hat, Inc : Virtio block device */
162+ if (dev->vendor == 0x1af4 && dev->device == 0x1001) {
163+ if (pos == 0x10) {
164+ *sz = 0xffffffc1;
165+ return 1;
166+ }
167+ if (pos == 0x14) {
168+ *sz = 0xfffff000;
169+ return 1;
170+ }
171+ }
172+ return 0;
173+}
174+
175 /**
176 * pci_read_base - read a PCI BAR
177 * @dev: the PCI device
178@@ -182,6 +335,9 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
179
180 mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
181
182+ if (is_known_device(dev, pos, &sz))
183+ return __pci_read_base_shortcut(dev, type, res, pos, sz, 0);
184+
185 res->name = pci_name(dev);
186
187 printk("clr: Starting probe for %s\n", res->name);
188--
1892.11.1
190
diff --git a/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch
new file mode 100644
index 0000000..701a18d
--- /dev/null
+++ b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch
@@ -0,0 +1,39 @@
1From be2ab4809c6b5058fbf3cd54c0f59c56416e572c Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 22 Jun 2015 09:33:33 -0500
4Subject: [PATCH 111/114] init: no wait for the known devices
5
6Do not wait for the known devices to complete their probing.
7
8Author: Arjan van de Ven <arjan@linux.intel.com>
9
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 init/do_mounts.c | 4 +++-
13 1 file changed, 3 insertions(+), 1 deletion(-)
14
15diff --git a/init/do_mounts.c b/init/do_mounts.c
16index c2de5104aad2..40725f0f5fb3 100644
17--- a/init/do_mounts.c
18+++ b/init/do_mounts.c
19@@ -28,6 +28,7 @@
20 #include <linux/slab.h>
21 #include <linux/ramfs.h>
22 #include <linux/shmem_fs.h>
23+#include <linux/async.h>
24
25 #include <linux/nfs_fs.h>
26 #include <linux/nfs_fs_sb.h>
27@@ -563,7 +564,8 @@ void __init prepare_namespace(void)
28 * For example, it is not atypical to wait 5 seconds here
29 * for the touchpad of a laptop to initialize.
30 */
31- wait_for_device_probe();
32+ //wait_for_device_probe();
33+ async_synchronize_full();
34
35 md_run_setup();
36
37--
382.11.1
39
diff --git a/patches/boot_time_opt_guest/0112-ksm-wakeups.patch b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch
new file mode 100644
index 0000000..b131e3f
--- /dev/null
+++ b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch
@@ -0,0 +1,32 @@
1From 2dc48e4b5c651691b7028991b64c935047b41b19 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 14 Mar 2016 11:06:46 -0600
4Subject: [PATCH 112/114] ksm-wakeups
5
6reduce wakeups in ksm
7---
8 mm/ksm.c | 8 ++++++--
9 1 file changed, 6 insertions(+), 2 deletions(-)
10
11diff --git a/mm/ksm.c b/mm/ksm.c
12index 9ae6011a41f8..eecd3ff669e2 100644
13--- a/mm/ksm.c
14+++ b/mm/ksm.c
15@@ -1725,8 +1725,12 @@ static int ksm_scan_thread(void *nothing)
16 try_to_freeze();
17
18 if (ksmd_should_run()) {
19- schedule_timeout_interruptible(
20- msecs_to_jiffies(ksm_thread_sleep_millisecs));
21+ if (ksm_thread_sleep_millisecs >= 1000)
22+ schedule_timeout_interruptible(
23+ msecs_to_jiffies(round_jiffies_relative(ksm_thread_sleep_millisecs)));
24+ else
25+ schedule_timeout_interruptible(
26+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
27 } else {
28 wait_event_freezable(ksm_thread_wait,
29 ksmd_should_run() || kthread_should_stop());
30--
312.11.1
32
diff --git a/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch
new file mode 100644
index 0000000..047eddb
--- /dev/null
+++ b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch
@@ -0,0 +1,42 @@
1From 179b7f41d5509f93cd297cc81c5d8da4a3123d9d Mon Sep 17 00:00:00 2001
2From: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
3Date: Fri, 20 Nov 2015 14:01:26 -0600
4Subject: [PATCH 113/114] init: do_mounts: recreate /dev/root
5
6Rootfs shows as mounted on /dev/root, but this device is not present in
7the /dev directory.
8
9Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
10---
11 init/do_mounts.c | 8 ++++++++
12 1 file changed, 8 insertions(+)
13
14diff --git a/init/do_mounts.c b/init/do_mounts.c
15index 40725f0f5fb3..78b5b1dba8ca 100644
16--- a/init/do_mounts.c
17+++ b/init/do_mounts.c
18@@ -550,6 +550,7 @@ void __init mount_root(void)
19 void __init prepare_namespace(void)
20 {
21 int is_floppy;
22+ int err;
23
24 if (root_delay) {
25 printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
26@@ -604,6 +605,13 @@ void __init prepare_namespace(void)
27 devtmpfs_mount("dev");
28 sys_mount(".", "/", NULL, MS_MOVE, NULL);
29 sys_chroot(".");
30+#ifdef CONFIG_BLOCK
31+ /* recreate the /dev/root */
32+ err = create_dev("/dev/root", ROOT_DEV);
33+
34+ if (err < 0)
35+ pr_emerg("Failed to create /dev/root: %d\n", err);
36+#endif
37 }
38
39 static bool is_tmpfs;
40--
412.11.1
42
diff --git a/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
new file mode 100644
index 0000000..dee9058
--- /dev/null
+++ b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
@@ -0,0 +1,56 @@
1From 02fd2e6a7c708bf973209f9b238c5c61cbf15239 Mon Sep 17 00:00:00 2001
2From: Alan Cox <alan@linux.intel.com>
3Date: Thu, 10 Mar 2016 15:11:28 +0000
4Subject: [PATCH 114/114] xattr: allow setting user.* attributes on symlinks by
5 owner
6
7Kvmtool and clear containers support using user attributes to label host
8files with the virtual uid/gid of the file in the container. This allows an
9end user to manage their files and a complete uid space without all the ugly
10namespace stuff.
11
12The one gap in the support is symlinks because an end user can change the
13ownership of a symbolic link. We support attributes on these files as you
14can already (as root) set security attributes on them.
15
16The current rules seem slightly over-paranoid and as we have a use case this
17patch enables updating the attributes on a symbolic link IFF you are the
18owner of the symlink (as permissions are not usually meaningful on the link
19itself).
20
21Signed-off-by: Alan Cox <alan@linux.intel.com>
22---
23 fs/xattr.c | 14 ++++++++------
24 1 file changed, 8 insertions(+), 6 deletions(-)
25
26diff --git a/fs/xattr.c b/fs/xattr.c
27index 7e3317cf4045..e005c30acb2c 100644
28--- a/fs/xattr.c
29+++ b/fs/xattr.c
30@@ -118,15 +118,17 @@ xattr_permission(struct inode *inode, const char *name, int mask)
31 }
32
33 /*
34- * In the user.* namespace, only regular files and directories can have
35- * extended attributes. For sticky directories, only the owner and
36- * privileged users can write attributes.
37+ * In the user.* namespace, only regular files, symbolic links, and
38+ * directories can have extended attributes. For symbolic links and
39+ * sticky directories, only the owner and privileged users can write
40+ * attributes.
41 */
42 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
43- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
44+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
45 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
46- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
47- (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
48+ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX))
49+ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE)
50+ && !inode_owner_or_capable(inode))
51 return -EPERM;
52 }
53
54--
552.11.1
56
diff --git a/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch
new file mode 100644
index 0000000..a6dbff7
--- /dev/null
+++ b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch
@@ -0,0 +1,84 @@
1From 99b4cdcce43ad0f706120bef26fef8c628c572cf Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:03:52 -0800
4Subject: [PATCH 151/154] mm: Export do_madvise()
5
6Combined with some interesting flags madvise() system call
7allows to free memory more smartly and more efficiently than
8we could do with a simple free(). The issue is that is not
9available for kernel modules that could need it.
10
11In order to solve this lack of support, this patch exports
12do_madvise() so as to make it available to the entire kernel.
13The already existing madvise() system call is unchanged and
14now relies on this new do_madvise() function.
15
16Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
17Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
18---
19 include/linux/mm.h | 2 ++
20 mm/madvise.c | 25 +++++++++++++++++++++----
21 2 files changed, 23 insertions(+), 4 deletions(-)
22
23diff --git a/include/linux/mm.h b/include/linux/mm.h
24index 0b5b2e4df14e..925ec25f99a8 100644
25--- a/include/linux/mm.h
26+++ b/include/linux/mm.h
27@@ -2450,5 +2450,7 @@ void __init setup_nr_node_ids(void);
28 static inline void setup_nr_node_ids(void) {}
29 #endif
30
31+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
32+
33 #endif /* __KERNEL__ */
34 #endif /* _LINUX_MM_H */
35diff --git a/mm/madvise.c b/mm/madvise.c
36index 93fb63e88b5e..c8bbf93d4978 100644
37--- a/mm/madvise.c
38+++ b/mm/madvise.c
39@@ -618,9 +618,7 @@ madvise_behavior_valid(int behavior)
40 }
41
42 /*
43- * The madvise(2) system call.
44- *
45- * Applications can use madvise() to advise the kernel how it should
46+ * Kernel modules can use do_madvise() to advise the kernel how it should
47 * handle paging I/O in this VM area. The idea is to help the kernel
48 * use appropriate read-ahead and caching techniques. The information
49 * provided is advisory only, and can be safely disregarded by the
50@@ -673,7 +671,7 @@ madvise_behavior_valid(int behavior)
51 * -EBADF - map exists, but area maps something that isn't a file.
52 * -EAGAIN - a kernel resource was temporarily unavailable.
53 */
54-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
55+int do_madvise(unsigned long start, size_t len_in, int behavior)
56 {
57 unsigned long end, tmp;
58 struct vm_area_struct *vma, *prev;
59@@ -767,3 +765,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
60
61 return error;
62 }
63+EXPORT_SYMBOL_GPL(do_madvise);
64+
65+/*
66+ * The madvise(2) system call.
67+ *
68+ * Applications can use madvise() system call to advise the kernel how
69+ * it should handle paging I/O in this VM area. The idea is to help
70+ * the kernel use appropriate read-ahead and caching techniques. The
71+ * information provided is advisory only, and can be safely disregarded
72+ * by the kernel without affecting the correct operation of the application.
73+ *
74+ * behavior values are the same than the ones defined in madvise()
75+ *
76+ * return values are the same than the ones defined in madvise()
77+ */
78+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
79+{
80+ return do_madvise(start, len_in, behavior);
81+}
82--
832.12.1
84
diff --git a/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch
new file mode 100644
index 0000000..ff9d8c0
--- /dev/null
+++ b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch
@@ -0,0 +1,180 @@
1From d28921b5f797829e4e676f7968ae688ef96b7992 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:08:55 -0800
4Subject: [PATCH 152/154] x86: kvm: Notify host to release pages
5
6In context of hypervisors managing several virtual machines, we
7want those virtual machines to give the memory they used back to
8the host when they don't need it anymore.
9
10This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing
11the guest kernel to notify the host kernel when such event occurs.
12On the host side, the handler relies on the previously exported
13do_madvise() function, calling it when it receives the new hypercall.
14
15Use of do_madvise() with MADV_DONTNEED flag will allow the guest to
16ask for a new page without going through a new hypercall. Instead,
17it will be able to start using that memory again as it will get
18faulted back in as a fresh new page. That's why do_madvise() is more
19efficient than doing vm_unmap() to return some memory to the host.
20
21This patch also introduces a new sysctl, kvm_madv_instant_free,
22allowing the user to select MADV_FREE advice instead of MADV_DONTNEED.
23Indeed, MADV_FREE performs better than MADV_DONTNEED
24because it does not zero the pages in case the memory has not been
25freed by the kernel. This can happen when there was no need for the
26kernel to get this memory back, meaning it was keeping those pages
27in the right state to be re-used by the same application.
28Since MADV_FREE is a very recent advice introduced in kernel 4.5, we
29only want to enable it through a sysctl in case the user wants to
30use it.
31
32Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
33Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
34---
35 arch/x86/kvm/x86.c | 17 +++++++++++++++++
36 include/linux/mm.h | 5 +++++
37 include/uapi/linux/kvm_para.h | 3 +++
38 kernel/sysctl.c | 7 +++++++
39 mm/Makefile | 2 +-
40 mm/kvm.c | 25 +++++++++++++++++++++++++
41 6 files changed, 58 insertions(+), 1 deletion(-)
42 create mode 100644 mm/kvm.c
43
44diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
45index 582c75311f95..683a94dd5f03 100644
46--- a/arch/x86/kvm/x86.c
47+++ b/arch/x86/kvm/x86.c
48@@ -46,6 +46,7 @@
49 #include <linux/user-return-notifier.h>
50 #include <linux/srcu.h>
51 #include <linux/slab.h>
52+#include <linux/mm.h>
53 #include <linux/perf_event.h>
54 #include <linux/uaccess.h>
55 #include <linux/hash.h>
56@@ -6019,6 +6020,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
57 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
58 }
59
60+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
61+{
62+ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
63+
64+ if (len > KVM_MAX_RET_MEM_SIZE)
65+ return KVM_EPERM;
66+
67+ if (kvm_is_error_hva(start + len))
68+ return KVM_EFAULT;
69+
70+ return do_madvise(start, len, kvm_ret_mem_advice);
71+}
72+
73 void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
74 {
75 vcpu->arch.apicv_active = false;
76@@ -6065,6 +6079,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
77 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
78 ret = 0;
79 break;
80+ case KVM_HC_RETURN_MEM:
81+ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
82+ break;
83 default:
84 ret = -KVM_ENOSYS;
85 break;
86diff --git a/include/linux/mm.h b/include/linux/mm.h
87index 925ec25f99a8..833f23d98baa 100644
88--- a/include/linux/mm.h
89+++ b/include/linux/mm.h
90@@ -2303,6 +2303,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
91 extern int sysctl_drop_caches;
92 int drop_caches_sysctl_handler(struct ctl_table *, int,
93 void __user *, size_t *, loff_t *);
94+extern int sysctl_kvm_madv_instant_free;
95+extern int kvm_ret_mem_advice;
96+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
97+ void __user *buffer, size_t *length,
98+ loff_t *ppos);
99 #endif
100
101 void drop_slab(void);
102diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
103index bf6cd7d5cac2..7d90f77d87d0 100644
104--- a/include/uapi/linux/kvm_para.h
105+++ b/include/uapi/linux/kvm_para.h
106@@ -23,6 +23,9 @@
107 #define KVM_HC_MIPS_GET_CLOCK_FREQ 6
108 #define KVM_HC_MIPS_EXIT_VM 7
109 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8
110+#define KVM_HC_RETURN_MEM 10
111+
112+#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB
113
114 /*
115 * hypercalls use architecture specific
116diff --git a/kernel/sysctl.c b/kernel/sysctl.c
117index c1095cdc0fe2..d8ae774fa042 100644
118--- a/kernel/sysctl.c
119+++ b/kernel/sysctl.c
120@@ -1398,6 +1398,13 @@ static struct ctl_table vm_table[] = {
121 .extra1 = &one,
122 .extra2 = &four,
123 },
124+ {
125+ .procname = "kvm_madv_instant_free",
126+ .data = &sysctl_kvm_madv_instant_free,
127+ .maxlen = sizeof(int),
128+ .mode = 0644,
129+ .proc_handler = kvm_madv_instant_free_sysctl_handler,
130+ },
131 #ifdef CONFIG_COMPACTION
132 {
133 .procname = "compact_memory",
134diff --git a/mm/Makefile b/mm/Makefile
135index 295bd7a9f76b..651ce0aff140 100644
136--- a/mm/Makefile
137+++ b/mm/Makefile
138@@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
139 mm_init.o mmu_context.o percpu.o slab_common.o \
140 compaction.o vmacache.o \
141 interval_tree.o list_lru.o workingset.o \
142- prfile.o debug.o $(mmu-y)
143+ prfile.o debug.o kvm.o $(mmu-y)
144
145 obj-y += init-mm.o
146
147diff --git a/mm/kvm.c b/mm/kvm.c
148new file mode 100644
149index 000000000000..8945f6a311b9
150--- /dev/null
151+++ b/mm/kvm.c
152@@ -0,0 +1,25 @@
153+#include <linux/mman.h>
154+
155+int sysctl_kvm_madv_instant_free;
156+
157+int kvm_ret_mem_advice = MADV_DONTNEED;
158+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
159+
160+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
161+ void __user *buffer, size_t *length, loff_t *ppos)
162+{
163+ int ret;
164+
165+ ret = proc_dointvec(table, write, buffer, length, ppos);
166+ if (ret)
167+ return ret;
168+
169+#ifdef MADV_FREE
170+ if (sysctl_kvm_madv_instant_free > 0)
171+ kvm_ret_mem_advice = MADV_FREE;
172+ else
173+ kvm_ret_mem_advice = MADV_DONTNEED;
174+#endif
175+
176+ return 0;
177+}
178--
1792.12.1
180
diff --git a/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch
new file mode 100644
index 0000000..cdb876a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch
@@ -0,0 +1,155 @@
1From 855ef164854307839c08c60688eaeac14f9a649e Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:26:13 -0800
4Subject: [PATCH 153/154] x86: Return memory from guest to host kernel
5
6All virtual machines need memory to perform various tasks, but this
7memory is not released to the host after it is not used anymore. We
8have to wait for the termination of the virtual machine to get this
9memory back into the host.
10
11Ballooning mechanism is close but not designed for the same purpose.
12In case we hit memory limits of the system, the host predicts how much
13memory can be asked back from a guest, and it issues a hypercall to
14retrieve this memory.
15
16The proposed solution is different because it does not wait for the host
17to need memory before returning it, and it knows precisely how much memory
18it can return.
19
20The way to notify the host side about such a return is to rely on
21the new hypercall KVM_HC_RETURN_MEM. In order to avoid overloading the
22CPU with too many hypercalls, we only return memory blocks of
23order 7 (512k blocks) and higher. This value has been found running
24memory tests using multiple threads allocating/freeing high amount
25of memory. Those tests were run for different order values, and 7 was
26the best tradeoff between the number of hypercalls issued and the
27amount of memory returned to the host.
28
29In order to limit the performance impact of this code addition,
30we check for blocks of order 7 or higher. This means it only costs an
31additional function call and a branch to perform this check.
32
33Furthermore, this code has been added to the "merge" codepath of the
34buddy allocator, which is not as sensitive as the "free" codepath.
35Not all blocks going through the "free" codepath will end up in the
36"merge" codepath because some of them won't find their free buddy.
37But this is a negligible amount since the kernel does not use many
38high order blocks directly. Instead, those bigger blocks are often
39broken into smaller chunks used as low order blocks. At the time
40those small blocks are released, they go through the merge path.
41
42Benchmarks such as ebizzy and will-it-scale have been run in order
43to make sure this patch does not affect kernel performance, and no
44significant differences were observed.
45
46Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
47Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
48---
49 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
50 arch/x86/kernel/kvm.c | 10 ++++++++++
51 include/linux/mm-arch-hooks.h | 8 ++++++++
52 mm/page_alloc.c | 2 ++
53 4 files changed, 42 insertions(+)
54
55diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
56index bc62e7cbf1b1..4a2f6d1adbd2 100644
57--- a/arch/x86/include/asm/kvm_para.h
58+++ b/arch/x86/include/asm/kvm_para.h
59@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token);
60 void kvm_async_pf_task_wake(u32 token);
61 u32 kvm_read_and_reset_pf_reason(void);
62 extern void kvm_disable_steal_time(void);
63+void kvm_arch_return_memory(struct page *page, unsigned int order);
64+
65+/*
66+ * This order has been found in an empirical way, running memory tests
67+ * through many iterations to assess the number of hypercalls issued
68+ * and the amount of memory returned. In case you change this order to
69+ * 6 or 8, it should not impact your performances significantly.
70+ *
71+ * Smaller values lead to less memory waste, but consume more CPU on
72+ * hypercalls. Larger values use less CPU, but do not as precisely
73+ * inform the hypervisor of which memory is free.
74+ */
75+#define RET_MEM_BUDDY_ORDER 7
76+
77+static inline void arch_buddy_merge(struct page *page, unsigned int order)
78+{
79+ if (order < RET_MEM_BUDDY_ORDER)
80+ return;
81+
82+ kvm_arch_return_memory(page, order);
83+}
84+#define arch_buddy_merge arch_buddy_merge
85
86 #ifdef CONFIG_PARAVIRT_SPINLOCKS
87 void __init kvm_spinlock_init(void);
88diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
89index edbbfc854e39..14167b3f6514 100644
90--- a/arch/x86/kernel/kvm.c
91+++ b/arch/x86/kernel/kvm.c
92@@ -552,6 +552,16 @@ static __init int activate_jump_labels(void)
93 }
94 arch_initcall(activate_jump_labels);
95
96+void kvm_arch_return_memory(struct page *page, unsigned int order)
97+{
98+ if (!kvm_para_available())
99+ return;
100+
101+ kvm_hypercall2(KVM_HC_RETURN_MEM,
102+ page_to_phys(page),
103+ PAGE_SIZE << order);
104+}
105+
106 #ifdef CONFIG_PARAVIRT_SPINLOCKS
107
108 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
109diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
110index 4efc3f56e6df..26eb3a05a8a3 100644
111--- a/include/linux/mm-arch-hooks.h
112+++ b/include/linux/mm-arch-hooks.h
113@@ -12,6 +12,7 @@
114 #define _LINUX_MM_ARCH_HOOKS_H
115
116 #include <asm/mm-arch-hooks.h>
117+#include <asm/kvm_para.h>
118
119 #ifndef arch_remap
120 static inline void arch_remap(struct mm_struct *mm,
121@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm,
122 #define arch_remap arch_remap
123 #endif
124
125+#ifndef arch_buddy_merge
126+static inline void arch_buddy_merge(struct page *page, unsigned int order)
127+{
128+}
129+#define arch_buddy_merge arch_buddy_merge
130+#endif
131+
132 #endif /* _LINUX_MM_ARCH_HOOKS_H */
133diff --git a/mm/page_alloc.c b/mm/page_alloc.c
134index 1460e6ad5e14..5f6e6371bc6f 100644
135--- a/mm/page_alloc.c
136+++ b/mm/page_alloc.c
137@@ -64,6 +64,7 @@
138 #include <linux/page_owner.h>
139 #include <linux/kthread.h>
140 #include <linux/memcontrol.h>
141+#include <linux/mm-arch-hooks.h>
142
143 #include <asm/sections.h>
144 #include <asm/tlbflush.h>
145@@ -855,6 +856,7 @@ static inline void __free_one_page(struct page *page,
146 }
147
148 done_merging:
149+ arch_buddy_merge(page, order);
150 set_page_order(page, order);
151
152 /*
153--
1542.12.1
155
diff --git a/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
new file mode 100644
index 0000000..07d4a83
--- /dev/null
+++ b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
@@ -0,0 +1,137 @@
1From 2c145b5233b504f5226a0f4bc44baeef33b444d8 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:32:39 -0800
4Subject: [PATCH 154/154] sysctl: vm: Fine-grained cache shrinking
5
6Lots of virtual machines are left in an idle state for days until they
7are terminated, and they can keep a large amount of memory in their
8cache, meaning this memory cannot be used by other processes.
9
10We tried to release this memory using the existing drop_caches sysctl,
11but it led to complete cache loss, while the cache could have been used
12if the idle process woke up. Indeed, the process then can't find any
13cached data, and rebuilding it from scratch directly affects
14performance.
15
16Instead, the solution we want is based on gradually shrinking the system
17cache over time. This patch adds a new sysctl, shrink_caches_mb, so as
18to allow userspace applications to tell the kernel that it should shrink
19the system cache by up to the amount (in MiB) specified.
20
21There is an application called "memshrinker" which uses this new
22mechanism. It runs in the background and periodically releases a
23specified amount of cache. This amount is based on the remaining
24cache on the system, and period is computed to follow a shrinking
25model. It results in saving a lot of memory for other processes
26running on the system.
27
28Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
29Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
30---
31 fs/drop_caches.c | 25 +++++++++++++++++++++++++
32 include/linux/mm.h | 4 ++++
33 kernel/sysctl.c | 8 ++++++++
34 mm/vmscan.c | 2 --
35 4 files changed, 37 insertions(+), 2 deletions(-)
36
37diff --git a/fs/drop_caches.c b/fs/drop_caches.c
38index d72d52b90433..f564dfcc13a4 100644
39--- a/fs/drop_caches.c
40+++ b/fs/drop_caches.c
41@@ -8,10 +8,12 @@
42 #include <linux/writeback.h>
43 #include <linux/sysctl.h>
44 #include <linux/gfp.h>
45+#include <linux/swap.h>
46 #include "internal.h"
47
48 /* A global variable is a bit ugly, but it keeps the code simple */
49 int sysctl_drop_caches;
50+int sysctl_shrink_caches_mb;
51
52 static void drop_pagecache_sb(struct super_block *sb, void *unused)
53 {
54@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
55 }
56 return 0;
57 }
58+
59+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
60+ void __user *buffer, size_t *length, loff_t *ppos)
61+{
62+ int ret;
63+ unsigned long nr_to_reclaim, page_reclaimed;
64+
65+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
66+ if (ret)
67+ return ret;
68+
69+ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE;
70+ if (write) {
71+ page_reclaimed = shrink_all_memory(nr_to_reclaim);
72+ if (page_reclaimed > 0)
73+ lru_add_drain_all();
74+
75+ if (page_reclaimed != nr_to_reclaim)
76+ return page_reclaimed;
77+ }
78+
79+ return 0;
80+}
81diff --git a/include/linux/mm.h b/include/linux/mm.h
82index 833f23d98baa..0bb66c1c31c9 100644
83--- a/include/linux/mm.h
84+++ b/include/linux/mm.h
85@@ -2308,6 +2308,10 @@ extern int kvm_ret_mem_advice;
86 int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
87 void __user *buffer, size_t *length,
88 loff_t *ppos);
89+extern int sysctl_shrink_caches_mb;
90+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
91+ void __user *buffer, size_t *length,
92+ loff_t *ppos);
93 #endif
94
95 void drop_slab(void);
96diff --git a/kernel/sysctl.c b/kernel/sysctl.c
97index d8ae774fa042..5dc9a46ae212 100644
98--- a/kernel/sysctl.c
99+++ b/kernel/sysctl.c
100@@ -1405,6 +1405,14 @@ static struct ctl_table vm_table[] = {
101 .mode = 0644,
102 .proc_handler = kvm_madv_instant_free_sysctl_handler,
103 },
104+ {
105+ .procname = "shrink_caches_mb",
106+ .data = &sysctl_shrink_caches_mb,
107+ .maxlen = sizeof(int),
108+ .mode = 0644,
109+ .proc_handler = shrink_caches_sysctl_handler,
110+ .extra1 = &one,
111+ },
112 #ifdef CONFIG_COMPACTION
113 {
114 .procname = "compact_memory",
115diff --git a/mm/vmscan.c b/mm/vmscan.c
116index 30a88b945a44..1198e74d1860 100644
117--- a/mm/vmscan.c
118+++ b/mm/vmscan.c
119@@ -3525,7 +3525,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
120 wake_up_interruptible(&pgdat->kswapd_wait);
121 }
122
123-#ifdef CONFIG_HIBERNATION
124 /*
125 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
126 * freed pages.
127@@ -3564,7 +3563,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
128
129 return nr_reclaimed;
130 }
131-#endif /* CONFIG_HIBERNATION */
132
133 /* It's optimal to keep kswapds on the same CPUs as their memory, but
134 not required for correctness. So if the last cpu in a node goes
135--
1362.12.1
137
diff --git a/patches/boot_time_opt_guest/guest_boot_time_opt.scc b/patches/boot_time_opt_guest/guest_boot_time_opt.scc
new file mode 100644
index 0000000..3636c01
--- /dev/null
+++ b/patches/boot_time_opt_guest/guest_boot_time_opt.scc
@@ -0,0 +1,19 @@
1define KFEATURE_DESCRIPTION "Boot time optimization changes ported from ClearLinux , https://github.com/clearlinux-pkgs/linux-kvm"
2define KFEATURE_COMPATIBILITY all
3
4patch 0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
5patch 0104-fbcon-enable-no-blink-by-default.patch
6patch 0105-vmstats-wakeups.patch
7# Patch removed because it causes ixgbevf to not initialize correctly in the guest
8#patch 0106-pci-probe.patch
9patch 0107-cgroup.patch
10patch 0108-smpboot-reuse-timer-calibration.patch
11patch 0109-perf.patch
12patch 0110-pci-probe-identify-known-devices.patch
13patch 0111-init-no-wait-for-the-known-devices.patch
14patch 0112-ksm-wakeups.patch
15
16patch 0151-mm-Export-do_madvise.patch
17patch 0152-x86-kvm-Notify-host-to-release-pages.patch
18patch 0153-x86-Return-memory-from-guest-to-host-kernel.patch
19patch 0154-sysctl-vm-Fine-grained-cache-shrinking.patch