summaryrefslogtreecommitdiffstats
path: root/patches
diff options
context:
space:
mode:
Diffstat (limited to 'patches')
-rw-r--r--patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch25
-rw-r--r--patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch29
-rw-r--r--patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch65
-rw-r--r--patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch42
-rw-r--r--patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch35
-rw-r--r--patches/boot_time_opt/0105-silence-rapl.patch25
-rw-r--r--patches/boot_time_opt/0106-pci-pme-wakeups.patch27
-rw-r--r--patches/boot_time_opt/0107-ksm-wakeups.patch34
-rw-r--r--patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch227
-rw-r--r--patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch56
-rw-r--r--patches/boot_time_opt/0110-init_task-faster-timerslack.patch32
-rw-r--r--patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch158
-rw-r--r--patches/boot_time_opt/0113-overload-on-wakeup.patch43
-rw-r--r--patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch83
-rw-r--r--patches/boot_time_opt/0115-fix-initcall-timestamps.patch42
-rw-r--r--patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch31
-rw-r--r--patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch156
-rw-r--r--patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch47
-rw-r--r--patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch311
-rw-r--r--patches/boot_time_opt/0120-give-rdrand-some-credit.patch30
-rw-r--r--patches/boot_time_opt/0121-e1000e-change-default-policy.patch27
-rw-r--r--patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch28
-rw-r--r--patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch27
-rw-r--r--patches/boot_time_opt/0124-tweak-perfbias.patch32
-rw-r--r--patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch33
-rw-r--r--patches/boot_time_opt/0151-mm-Export-do_madvise.patch84
-rw-r--r--patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch180
-rw-r--r--patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch155
-rw-r--r--patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch137
-rw-r--r--patches/boot_time_opt/host_boot_time_opt.scc29
-rw-r--r--patches/boot_time_opt/raid_alg.cfg3
-rw-r--r--patches/boot_time_opt/raid_alg.scc5
-rw-r--r--patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch34
-rw-r--r--patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch38
-rw-r--r--patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch26
-rw-r--r--patches/boot_time_opt_guest/0105-vmstats-wakeups.patch28
-rw-r--r--patches/boot_time_opt_guest/0106-pci-probe.patch123
-rw-r--r--patches/boot_time_opt_guest/0107-cgroup.patch107
-rw-r--r--patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch45
-rw-r--r--patches/boot_time_opt_guest/0109-perf.patch28
-rw-r--r--patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch190
-rw-r--r--patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch39
-rw-r--r--patches/boot_time_opt_guest/0112-ksm-wakeups.patch32
-rw-r--r--patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch42
-rw-r--r--patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch56
-rw-r--r--patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch84
-rw-r--r--patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch180
-rw-r--r--patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch155
-rw-r--r--patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch137
-rw-r--r--patches/boot_time_opt_guest/guest_boot_time_opt.scc19
-rw-r--r--patches/ipv4/0001-IPV4-unlock-rtnl_mutex-before-waiting-for-carrier-on.patch44
-rw-r--r--patches/ipv4/ipv4wait.scc1
-rw-r--r--patches/kernel_startend_msg/0001-printk-add-Enea-Linux-boot-start-end-messages.patch95
-rw-r--r--patches/kernel_startend_msg/kernel_startend_msg.scc4
54 files changed, 3745 insertions, 0 deletions
diff --git a/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch b/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch
new file mode 100644
index 0000000..33debcd
--- /dev/null
+++ b/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch
@@ -0,0 +1,25 @@
1From 07639791f247ae7a807444106b9b7611f070d02b Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Fri, 6 Jan 2017 13:28:29 +0000
4Subject: [PATCH] drm/i915/fbc: sanitize fbc GEN greater than 9
5
6---
7 drivers/gpu/drm/i915/intel_fbc.c | 2 +-
8 1 file changed, 1 insertion(+), 1 deletion(-)
9
10diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c
11index c43dd9abce79..f5a2560840f3 100644
12--- a/drivers/gpu/drm/i915/intel_fbc.c
13+++ b/drivers/gpu/drm/i915/intel_fbc.c
14@@ -1262,7 +1262,7 @@ static int intel_sanitize_fbc_option(struct drm_i915_private *dev_priv)
15 if (!HAS_FBC(dev_priv))
16 return 0;
17
18- if (IS_BROADWELL(dev_priv))
19+ if (IS_BROADWELL(dev_priv) || INTEL_GEN(dev_priv) >= 9)
20 return 1;
21
22 return 0;
23--
242.11.1
25
diff --git a/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch b/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch
new file mode 100644
index 0000000..aeb3abf
--- /dev/null
+++ b/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch
@@ -0,0 +1,29 @@
1From f45c353859fc0ceb75fef3a2f4a2c179dfa378d7 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Tue, 23 Jun 2015 01:16:45 -0500
4Subject: [PATCH 101/124] kvm: silence kvm unhandled rdmsr
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9Signed-off-by: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
10---
11 arch/x86/kvm/x86.c | 2 +-
12 1 file changed, 1 insertion(+), 1 deletion(-)
13
14diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
15index 731044efb195..582c75311f95 100644
16--- a/arch/x86/kvm/x86.c
17+++ b/arch/x86/kvm/x86.c
18@@ -2506,7 +2506,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
19 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
20 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
21 if (!ignore_msrs) {
22- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
23+// vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
24 return 1;
25 } else {
26 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
27--
282.11.1
29
diff --git a/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch b/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch
new file mode 100644
index 0000000..96fd92b
--- /dev/null
+++ b/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch
@@ -0,0 +1,65 @@
1From 7e847b13b753ec632fef2f1ffa0d8f5b444c967b Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Tue, 23 Jun 2015 01:26:52 -0500
4Subject: [PATCH 102/124] i8042: decrease debug message level to info
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9Signed-off-by: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
10---
11 drivers/input/serio/i8042.c | 10 +++++-----
12 1 file changed, 5 insertions(+), 5 deletions(-)
13
14diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
15index 89abfdb539ac..5317c41b049e 100644
16--- a/drivers/input/serio/i8042.c
17+++ b/drivers/input/serio/i8042.c
18@@ -593,7 +593,7 @@ static int i8042_enable_kbd_port(void)
19 if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
20 i8042_ctr &= ~I8042_CTR_KBDINT;
21 i8042_ctr |= I8042_CTR_KBDDIS;
22- pr_err("Failed to enable KBD port\n");
23+ pr_info("Failed to enable KBD port\n");
24 return -EIO;
25 }
26
27@@ -612,7 +612,7 @@ static int i8042_enable_aux_port(void)
28 if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
29 i8042_ctr &= ~I8042_CTR_AUXINT;
30 i8042_ctr |= I8042_CTR_AUXDIS;
31- pr_err("Failed to enable AUX port\n");
32+ pr_info("Failed to enable AUX port\n");
33 return -EIO;
34 }
35
36@@ -704,7 +704,7 @@ static int __init i8042_check_mux(void)
37 i8042_ctr &= ~I8042_CTR_AUXINT;
38
39 if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
40- pr_err("Failed to disable AUX port, can't use MUX\n");
41+ pr_info("Failed to disable AUX port, can't use MUX\n");
42 return -EIO;
43 }
44
45@@ -927,7 +927,7 @@ static int i8042_controller_selftest(void)
46 do {
47
48 if (i8042_command(&param, I8042_CMD_CTL_TEST)) {
49- pr_err("i8042 controller selftest timeout\n");
50+ pr_info("i8042 controller selftest timeout\n");
51 return -ENODEV;
52 }
53
54@@ -949,7 +949,7 @@ static int i8042_controller_selftest(void)
55 pr_info("giving up on controller selftest, continuing anyway...\n");
56 return 0;
57 #else
58- pr_err("i8042 controller selftest failed\n");
59+ pr_info("i8042 controller selftest failed\n");
60 return -EIO;
61 #endif
62 }
63--
642.11.1
65
diff --git a/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch b/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch
new file mode 100644
index 0000000..bb7bb9f
--- /dev/null
+++ b/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch
@@ -0,0 +1,42 @@
1From 838abc7e5f43ea40a2cc05ebd6c7321b6d84b057 Mon Sep 17 00:00:00 2001
2From: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
3Date: Fri, 20 Nov 2015 14:01:26 -0600
4Subject: [PATCH 103/124] init: do_mounts: recreate /dev/root
5
6Rootfs is shown as mounted on /dev/root, but this device is not present in
7the /dev directory.
8
9Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
10---
11 init/do_mounts.c | 8 ++++++++
12 1 file changed, 8 insertions(+)
13
14diff --git a/init/do_mounts.c b/init/do_mounts.c
15index dea5de95c2dd..d74a346b2dfa 100644
16--- a/init/do_mounts.c
17+++ b/init/do_mounts.c
18@@ -549,6 +549,7 @@ void __init mount_root(void)
19 void __init prepare_namespace(void)
20 {
21 int is_floppy;
22+ int err;
23
24 if (root_delay) {
25 printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
26@@ -602,6 +603,13 @@ void __init prepare_namespace(void)
27 devtmpfs_mount("dev");
28 sys_mount(".", "/", NULL, MS_MOVE, NULL);
29 sys_chroot(".");
30+#ifdef CONFIG_BLOCK
31+ /* recreate the /dev/root */
32+ err = create_dev("/dev/root", ROOT_DEV);
33+
34+ if (err < 0)
35+ pr_emerg("Failed to create /dev/root: %d\n", err);
36+#endif
37 }
38
39 static bool is_tmpfs;
40--
412.11.1
42
diff --git a/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch b/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch
new file mode 100644
index 0000000..fb709b4
--- /dev/null
+++ b/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch
@@ -0,0 +1,35 @@
1From b6970d43f97325c9acc7bd942dcd192586d8d407 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 11 Jan 2016 10:01:44 -0600
4Subject: [PATCH 104/124] Increase the ext4 default commit age
5
6Both the VM and EXT4 have a "commit to disk after X seconds" time.
7Currently the EXT4 time is shorter than our VM time, which is a bit
8suboptimal,
9it's better for performance to let the VM do the writeouts in bulk
10rather than something deep in the journalling layer.
11
12(DISTRO TWEAK -- NOT FOR UPSTREAM)
13
14Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
15Signed-off-by: Jose Carlos Venegas Munoz <jose.carlos.venegas.munoz@intel.com>
16---
17 include/linux/jbd2.h | 2 +-
18 1 file changed, 1 insertion(+), 1 deletion(-)
19
20diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
21index dfaa1f4dcb0c..9955fd6c6159 100644
22--- a/include/linux/jbd2.h
23+++ b/include/linux/jbd2.h
24@@ -47,7 +47,7 @@
25 /*
26 * The default maximum commit age, in seconds.
27 */
28-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5
29+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30
30
31 #ifdef CONFIG_JBD2_DEBUG
32 /*
33--
342.11.1
35
diff --git a/patches/boot_time_opt/0105-silence-rapl.patch b/patches/boot_time_opt/0105-silence-rapl.patch
new file mode 100644
index 0000000..4dd78fc
--- /dev/null
+++ b/patches/boot_time_opt/0105-silence-rapl.patch
@@ -0,0 +1,25 @@
1From 558d32869c8d8e302dd3810610d62e1c69a8ebce Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 14 Mar 2016 11:22:09 -0600
4Subject: [PATCH 105/124] silence rapl
5
6---
7 drivers/powercap/intel_rapl.c | 2 +-
8 1 file changed, 1 insertion(+), 1 deletion(-)
9
10diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
11index 3c71f608b444..450aff027d42 100644
12--- a/drivers/powercap/intel_rapl.c
13+++ b/drivers/powercap/intel_rapl.c
14@@ -1684,7 +1684,7 @@ static int __init rapl_init(void)
15
16 id = x86_match_cpu(rapl_ids);
17 if (!id) {
18- pr_err("driver does not support CPU family %d model %d\n",
19+ pr_info("driver does not support CPU family %d model %d\n",
20 boot_cpu_data.x86, boot_cpu_data.x86_model);
21
22 return -ENODEV;
23--
242.11.1
25
diff --git a/patches/boot_time_opt/0106-pci-pme-wakeups.patch b/patches/boot_time_opt/0106-pci-pme-wakeups.patch
new file mode 100644
index 0000000..f0a4799
--- /dev/null
+++ b/patches/boot_time_opt/0106-pci-pme-wakeups.patch
@@ -0,0 +1,27 @@
1From 1f44219cd74f5c3b97e2c85af87141e1bddf0555 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 14 Mar 2016 11:10:58 -0600
4Subject: [PATCH 106/124] pci pme wakeups
5
6Reduce wakeups for PME checks, which are a workaround for miswired
7boards (sadly, too many of them) in laptops.
8---
9 drivers/pci/pci.c | 2 +-
10 1 file changed, 1 insertion(+), 1 deletion(-)
11
12diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
13index eda6a7cf0e54..82a623255059 100644
14--- a/drivers/pci/pci.c
15+++ b/drivers/pci/pci.c
16@@ -57,7 +57,7 @@ struct pci_pme_device {
17 struct pci_dev *dev;
18 };
19
20-#define PME_TIMEOUT 1000 /* How long between PME checks */
21+#define PME_TIMEOUT 4000 /* How long between PME checks */
22
23 static void pci_dev_d3_sleep(struct pci_dev *dev)
24 {
25--
262.11.1
27
diff --git a/patches/boot_time_opt/0107-ksm-wakeups.patch b/patches/boot_time_opt/0107-ksm-wakeups.patch
new file mode 100644
index 0000000..2b25625
--- /dev/null
+++ b/patches/boot_time_opt/0107-ksm-wakeups.patch
@@ -0,0 +1,34 @@
1From a5de04044d428bf54472365e7dc07958aa184daf Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 14 Mar 2016 11:06:46 -0600
4Subject: [PATCH 107/124] ksm-wakeups
5
6reduce wakeups in ksm by adding rounding (aligning) when the sleep times are 1 second or longer
7
8Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
9---
10 mm/ksm.c | 8 ++++++--
11 1 file changed, 6 insertions(+), 2 deletions(-)
12
13diff --git a/mm/ksm.c b/mm/ksm.c
14index 9ae6011a41f8..eecd3ff669e2 100644
15--- a/mm/ksm.c
16+++ b/mm/ksm.c
17@@ -1725,8 +1725,12 @@ static int ksm_scan_thread(void *nothing)
18 try_to_freeze();
19
20 if (ksmd_should_run()) {
21- schedule_timeout_interruptible(
22- msecs_to_jiffies(ksm_thread_sleep_millisecs));
23+ if (ksm_thread_sleep_millisecs >= 1000)
24+ schedule_timeout_interruptible(
25+ msecs_to_jiffies(round_jiffies_relative(ksm_thread_sleep_millisecs)));
26+ else
27+ schedule_timeout_interruptible(
28+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
29 } else {
30 wait_event_freezable(ksm_thread_wait,
31 ksmd_should_run() || kthread_should_stop());
32--
332.11.1
34
diff --git a/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch b/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch
new file mode 100644
index 0000000..da5396c
--- /dev/null
+++ b/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch
@@ -0,0 +1,227 @@
1From bf7e0cebaafe790f62cbc5815648d556847b7d27 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sat, 19 Mar 2016 21:32:19 -0400
4Subject: [PATCH 108/124] intel_idle: tweak cpuidle cstates
5
6Increase target_residency in cpuidle cstate
7
8Tune intel_idle to be a bit less aggressive;
9Clear Linux is cleaner in hygiene (wakeups) than the average Linux,
10so we can afford changing these in a way that increases
11performance while keeping power efficiency
12---
13 drivers/idle/intel_idle.c | 74 +++++++++++------------------------------------
14 1 file changed, 17 insertions(+), 57 deletions(-)
15
16diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
17index 4466a2f969d7..cbab050b83f0 100644
18--- a/drivers/idle/intel_idle.c
19+++ b/drivers/idle/intel_idle.c
20@@ -475,7 +475,7 @@ static struct cpuidle_state hsw_cstates[] = {
21 .desc = "MWAIT 0x10",
22 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
23 .exit_latency = 33,
24- .target_residency = 100,
25+ .target_residency = 1000,
26 .enter = &intel_idle,
27 .enter_freeze = intel_idle_freeze, },
28 {
29@@ -483,7 +483,7 @@ static struct cpuidle_state hsw_cstates[] = {
30 .desc = "MWAIT 0x20",
31 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
32 .exit_latency = 133,
33- .target_residency = 400,
34+ .target_residency = 4000,
35 .enter = &intel_idle,
36 .enter_freeze = intel_idle_freeze, },
37 {
38@@ -491,7 +491,7 @@ static struct cpuidle_state hsw_cstates[] = {
39 .desc = "MWAIT 0x32",
40 .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
41 .exit_latency = 166,
42- .target_residency = 500,
43+ .target_residency = 5000,
44 .enter = &intel_idle,
45 .enter_freeze = intel_idle_freeze, },
46 {
47@@ -499,7 +499,7 @@ static struct cpuidle_state hsw_cstates[] = {
48 .desc = "MWAIT 0x40",
49 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
50 .exit_latency = 300,
51- .target_residency = 900,
52+ .target_residency = 9000,
53 .enter = &intel_idle,
54 .enter_freeze = intel_idle_freeze, },
55 {
56@@ -507,7 +507,7 @@ static struct cpuidle_state hsw_cstates[] = {
57 .desc = "MWAIT 0x50",
58 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
59 .exit_latency = 600,
60- .target_residency = 1800,
61+ .target_residency = 18000,
62 .enter = &intel_idle,
63 .enter_freeze = intel_idle_freeze, },
64 {
65@@ -515,7 +515,7 @@ static struct cpuidle_state hsw_cstates[] = {
66 .desc = "MWAIT 0x60",
67 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
68 .exit_latency = 2600,
69- .target_residency = 7700,
70+ .target_residency = 77000,
71 .enter = &intel_idle,
72 .enter_freeze = intel_idle_freeze, },
73 {
74@@ -531,27 +531,11 @@ static struct cpuidle_state bdw_cstates[] = {
75 .enter = &intel_idle,
76 .enter_freeze = intel_idle_freeze, },
77 {
78- .name = "C1E-BDW",
79- .desc = "MWAIT 0x01",
80- .flags = MWAIT2flg(0x01),
81- .exit_latency = 10,
82- .target_residency = 20,
83- .enter = &intel_idle,
84- .enter_freeze = intel_idle_freeze, },
85- {
86- .name = "C3-BDW",
87- .desc = "MWAIT 0x10",
88- .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
89- .exit_latency = 40,
90- .target_residency = 100,
91- .enter = &intel_idle,
92- .enter_freeze = intel_idle_freeze, },
93- {
94 .name = "C6-BDW",
95 .desc = "MWAIT 0x20",
96 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
97 .exit_latency = 133,
98- .target_residency = 400,
99+ .target_residency = 4000,
100 .enter = &intel_idle,
101 .enter_freeze = intel_idle_freeze, },
102 {
103@@ -559,7 +543,7 @@ static struct cpuidle_state bdw_cstates[] = {
104 .desc = "MWAIT 0x32",
105 .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
106 .exit_latency = 166,
107- .target_residency = 500,
108+ .target_residency = 5000,
109 .enter = &intel_idle,
110 .enter_freeze = intel_idle_freeze, },
111 {
112@@ -567,7 +551,7 @@ static struct cpuidle_state bdw_cstates[] = {
113 .desc = "MWAIT 0x40",
114 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
115 .exit_latency = 300,
116- .target_residency = 900,
117+ .target_residency = 9000,
118 .enter = &intel_idle,
119 .enter_freeze = intel_idle_freeze, },
120 {
121@@ -575,7 +559,7 @@ static struct cpuidle_state bdw_cstates[] = {
122 .desc = "MWAIT 0x50",
123 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
124 .exit_latency = 600,
125- .target_residency = 1800,
126+ .target_residency = 18000,
127 .enter = &intel_idle,
128 .enter_freeze = intel_idle_freeze, },
129 {
130@@ -583,7 +567,7 @@ static struct cpuidle_state bdw_cstates[] = {
131 .desc = "MWAIT 0x60",
132 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
133 .exit_latency = 2600,
134- .target_residency = 7700,
135+ .target_residency = 77000,
136 .enter = &intel_idle,
137 .enter_freeze = intel_idle_freeze, },
138 {
139@@ -600,27 +584,11 @@ static struct cpuidle_state skl_cstates[] = {
140 .enter = &intel_idle,
141 .enter_freeze = intel_idle_freeze, },
142 {
143- .name = "C1E-SKL",
144- .desc = "MWAIT 0x01",
145- .flags = MWAIT2flg(0x01),
146- .exit_latency = 10,
147- .target_residency = 20,
148- .enter = &intel_idle,
149- .enter_freeze = intel_idle_freeze, },
150- {
151- .name = "C3-SKL",
152- .desc = "MWAIT 0x10",
153- .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
154- .exit_latency = 70,
155- .target_residency = 100,
156- .enter = &intel_idle,
157- .enter_freeze = intel_idle_freeze, },
158- {
159 .name = "C6-SKL",
160 .desc = "MWAIT 0x20",
161 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
162 .exit_latency = 85,
163- .target_residency = 200,
164+ .target_residency = 2000,
165 .enter = &intel_idle,
166 .enter_freeze = intel_idle_freeze, },
167 {
168@@ -628,7 +596,7 @@ static struct cpuidle_state skl_cstates[] = {
169 .desc = "MWAIT 0x33",
170 .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
171 .exit_latency = 124,
172- .target_residency = 800,
173+ .target_residency = 8000,
174 .enter = &intel_idle,
175 .enter_freeze = intel_idle_freeze, },
176 {
177@@ -636,7 +604,7 @@ static struct cpuidle_state skl_cstates[] = {
178 .desc = "MWAIT 0x40",
179 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
180 .exit_latency = 200,
181- .target_residency = 800,
182+ .target_residency = 8000,
183 .enter = &intel_idle,
184 .enter_freeze = intel_idle_freeze, },
185 {
186@@ -644,7 +612,7 @@ static struct cpuidle_state skl_cstates[] = {
187 .desc = "MWAIT 0x50",
188 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
189 .exit_latency = 480,
190- .target_residency = 5000,
191+ .target_residency = 50000,
192 .enter = &intel_idle,
193 .enter_freeze = intel_idle_freeze, },
194 {
195@@ -652,7 +620,7 @@ static struct cpuidle_state skl_cstates[] = {
196 .desc = "MWAIT 0x60",
197 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
198 .exit_latency = 890,
199- .target_residency = 5000,
200+ .target_residency = 50000,
201 .enter = &intel_idle,
202 .enter_freeze = intel_idle_freeze, },
203 {
204@@ -669,19 +637,11 @@ static struct cpuidle_state skx_cstates[] = {
205 .enter = &intel_idle,
206 .enter_freeze = intel_idle_freeze, },
207 {
208- .name = "C1E-SKX",
209- .desc = "MWAIT 0x01",
210- .flags = MWAIT2flg(0x01),
211- .exit_latency = 10,
212- .target_residency = 20,
213- .enter = &intel_idle,
214- .enter_freeze = intel_idle_freeze, },
215- {
216 .name = "C6-SKX",
217 .desc = "MWAIT 0x20",
218 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
219 .exit_latency = 133,
220- .target_residency = 600,
221+ .target_residency = 1600,
222 .enter = &intel_idle,
223 .enter_freeze = intel_idle_freeze, },
224 {
225--
2262.11.1
227
diff --git a/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch b/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
new file mode 100644
index 0000000..70247a0
--- /dev/null
+++ b/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
@@ -0,0 +1,56 @@
1From 4170571f7bb0897c90e13b2fcf3ee06990a9e774 Mon Sep 17 00:00:00 2001
2From: Alan Cox <alan@linux.intel.com>
3Date: Thu, 10 Mar 2016 15:11:28 +0000
4Subject: [PATCH 109/124] xattr: allow setting user.* attributes on symlinks by
5 owner
6
7Kvmtool and clear containers support using user attributes to label host
8files with the virtual uid/gid of the file in the container. This allows an
9end user to manage their files and a complete uid space without all the ugly
10namespace stuff.
11
12The one gap in the support is symlinks because an end user can change the
13ownership of a symbolic link. We support attributes on these files as you
14can already (as root) set security attributes on them.
15
16The current rules seem slightly over-paranoid and as we have a use case this
17patch enables updating the attributes on a symbolic link IFF you are the
18owner of the symlink (as permissions are not usually meaningful on the link
19itself).
20
21Signed-off-by: Alan Cox <alan@linux.intel.com>
22---
23 fs/xattr.c | 14 ++++++++------
24 1 file changed, 8 insertions(+), 6 deletions(-)
25
26diff --git a/fs/xattr.c b/fs/xattr.c
27index 2d13b4e62fae..580a5aeddfd2 100644
28--- a/fs/xattr.c
29+++ b/fs/xattr.c
30@@ -118,15 +118,17 @@ xattr_permission(struct inode *inode, const char *name, int mask)
31 }
32
33 /*
34- * In the user.* namespace, only regular files and directories can have
35- * extended attributes. For sticky directories, only the owner and
36- * privileged users can write attributes.
37+ * In the user.* namespace, only regular files, symbolic links, and
38+ * directories can have extended attributes. For symbolic links and
39+ * sticky directories, only the owner and privileged users can write
40+ * attributes.
41 */
42 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
43- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
44+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
45 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
46- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
47- (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
48+ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX))
49+ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE)
50+ && !inode_owner_or_capable(inode))
51 return -EPERM;
52 }
53
54--
552.11.1
56
diff --git a/patches/boot_time_opt/0110-init_task-faster-timerslack.patch b/patches/boot_time_opt/0110-init_task-faster-timerslack.patch
new file mode 100644
index 0000000..b0075ff
--- /dev/null
+++ b/patches/boot_time_opt/0110-init_task-faster-timerslack.patch
@@ -0,0 +1,32 @@
1From 42c2cb32259b76fb1f6713d99c4f0922e97bcc8d Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 23 Mar 2016 14:52:41 +0000
4Subject: [PATCH 110/124] init_task: faster timerslack
5
6the default tuning is a compromise between client power and server performance;
7for a server distro like Clear Linux, we don't need to compromise.
8(for non-server usages we have different kernel binaries)
9
10in principle this can be done as a patch to systemd as well, but we have a shared
11systemd between usages while we have different kernels, so the logistics
12for where the patch goes work out better here
13---
14 include/linux/init_task.h | 2 +-
15 1 file changed, 1 insertion(+), 1 deletion(-)
16
17diff --git a/include/linux/init_task.h b/include/linux/init_task.h
18index 325f649d77ff..e0eb261e17cb 100644
19--- a/include/linux/init_task.h
20+++ b/include/linux/init_task.h
21@@ -249,7 +249,7 @@ extern struct task_group root_task_group;
22 .journal_info = NULL, \
23 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
24 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
25- .timer_slack_ns = 50000, /* 50 usec default slack */ \
26+ .timer_slack_ns = 1000, /* 1 usec default slack */ \
27 .pids = { \
28 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
29 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
30--
312.11.1
32
diff --git a/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch b/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch
new file mode 100644
index 0000000..7d0def8
--- /dev/null
+++ b/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch
@@ -0,0 +1,158 @@
1From 3152053ea1ea3aa77bcc7e990d48ef84621ff6c9 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sat, 9 Apr 2016 22:41:37 +0000
4Subject: [PATCH 112/124] fs: ext4: fsync: optimize double-fsync() a bunch
5
6There are cases where EXT4 is a bit too conservative sending barriers down to the disk;
7there are cases where the transaction in progress is not the one that sent the barrier
8(in other words: the fsync is for a file for which the IO happened some time ago
9and all data was already sent to the disk). For that case, a more performing tradeoff
10can be made on SSD devices (which have the ability to flush their dram caches in a hurry
11on a power fail event) where the barrier gets sent to the disk, but we don't need to wait
12for the barrier to complete. Any consecutive IO will block on the barrier correctly.
13---
14 block/bio.c | 20 ++++++++++++++++++++
15 block/blk-flush.c | 41 +++++++++++++++++++++++++++++++++++++++++
16 fs/ext4/fsync.c | 6 +++++-
17 include/linux/bio.h | 1 +
18 include/linux/blkdev.h | 5 +++++
19 5 files changed, 72 insertions(+), 1 deletion(-)
20
21diff --git a/block/bio.c b/block/bio.c
22index db85c5753a76..80f5ab6b536a 100644
23--- a/block/bio.c
24+++ b/block/bio.c
25@@ -882,6 +882,26 @@ int submit_bio_wait(struct bio *bio)
26 }
27 EXPORT_SYMBOL(submit_bio_wait);
28
29+static void submit_bio_nowait_endio(struct bio *bio)
30+{
31+ bio_put(bio);
32+}
33+
34+/**
35+ * submit_bio_nowait - submit a bio for fire-and-forget
36+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
37+ * @bio: The &struct bio which describes the I/O
38+ *
39+ * Simple wrapper around submit_bio() that takes care of bio_put() on completion
40+ */
41+void submit_bio_nowait(struct bio *bio)
42+{
43+ bio->bi_end_io = submit_bio_nowait_endio;
44+ bio->bi_opf |= REQ_SYNC;
45+ submit_bio(bio);
46+}
47+EXPORT_SYMBOL(submit_bio_nowait);
48+
49 /**
50 * bio_advance - increment/complete a bio by some number of bytes
51 * @bio: bio to advance
52diff --git a/block/blk-flush.c b/block/blk-flush.c
53index 3c882cbc7541..b2dfcfe01ed7 100644
54--- a/block/blk-flush.c
55+++ b/block/blk-flush.c
56@@ -530,6 +530,47 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
57 }
58 EXPORT_SYMBOL(blkdev_issue_flush);
59
60+/**
61+ * blkdev_issue_flush_nowait - queue a flush
62+ * @bdev: blockdev to issue flush for
63+ * @gfp_mask: memory allocation flags (for bio_alloc)
64+ * @error_sector: error sector
65+ *
66+ * Description:
67+ * Issue a flush for the block device in question. Caller can supply
68+ * room for storing the error offset in case of a flush error, if they
69+ * wish to. If WAIT flag is not passed then caller may check only what
70+ * request was pushed in some internal queue for later handling.
71+ */
72+void blkdev_issue_flush_nowait(struct block_device *bdev, gfp_t gfp_mask)
73+{
74+ struct request_queue *q;
75+ struct bio *bio;
76+
77+ if (bdev->bd_disk == NULL)
78+ return;
79+
80+ q = bdev_get_queue(bdev);
81+ if (!q)
82+ return;
83+
84+ /*
85+ * some block devices may not have their queue correctly set up here
86+ * (e.g. loop device without a backing file) and so issuing a flush
87+ * here will panic. Ensure there is a request function before issuing
88+ * the flush.
89+ */
90+ if (!q->make_request_fn)
91+ return;
92+
93+ bio = bio_alloc(gfp_mask, 0);
94+ bio->bi_bdev = bdev;
95+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
96+
97+ submit_bio_nowait(bio);
98+}
99+EXPORT_SYMBOL(blkdev_issue_flush_nowait);
100+
101 struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
102 int node, int cmd_size)
103 {
104diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
105index 88effb1053c7..a58966c18172 100644
106--- a/fs/ext4/fsync.c
107+++ b/fs/ext4/fsync.c
108@@ -150,7 +150,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
109 ret = jbd2_complete_transaction(journal, commit_tid);
110 if (needs_barrier) {
111 issue_flush:
112- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
113+ err = 0;
114+ if (!blk_queue_nonrot(bdev_get_queue(inode->i_sb->s_bdev)))
115+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
116+ else
117+ blkdev_issue_flush_nowait(inode->i_sb->s_bdev, GFP_KERNEL);
118 if (!ret)
119 ret = err;
120 }
121diff --git a/include/linux/bio.h b/include/linux/bio.h
122index 97cb48f03dc7..3f055e6541e0 100644
123--- a/include/linux/bio.h
124+++ b/include/linux/bio.h
125@@ -421,6 +421,7 @@ struct request_queue;
126 extern int bio_phys_segments(struct request_queue *, struct bio *);
127
128 extern int submit_bio_wait(struct bio *bio);
129+extern void submit_bio_nowait(struct bio *bio);
130 extern void bio_advance(struct bio *, unsigned);
131
132 extern void bio_init(struct bio *);
133diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
134index f6a816129856..727684abf21e 100644
135--- a/include/linux/blkdev.h
136+++ b/include/linux/blkdev.h
137@@ -1144,6 +1144,7 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
138 #define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */
139
140 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
141+extern void blkdev_issue_flush_nowait(struct block_device *, gfp_t);
142 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
143 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
144 extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
145@@ -1745,6 +1746,10 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
146 return 0;
147 }
148
149+static inline void blkdev_issue_flush_nowait(struct block_device *bdev, gfp_t gfp_mask)
150+{
151+}
152+
153 #endif /* CONFIG_BLOCK */
154
155 #endif
156--
1572.11.1
158
diff --git a/patches/boot_time_opt/0113-overload-on-wakeup.patch b/patches/boot_time_opt/0113-overload-on-wakeup.patch
new file mode 100644
index 0000000..a3a6bce
--- /dev/null
+++ b/patches/boot_time_opt/0113-overload-on-wakeup.patch
@@ -0,0 +1,43 @@
1From 9f25d18f45a8391488feb9783404f2f79b7090f4 Mon Sep 17 00:00:00 2001
2From: jplozi <jplozi@unice.fr>
3Date: Fri, 11 Mar 2016 15:18:06 +0100
4Subject: [PATCH 113/124] overload on wakeup
5
6source https://github.com/jplozi/wastedcores
7
8as an experiment, apply the learnings from the wasted-cores paper
9and see how the performance works out. With the data from this we should
10be able to work with Peter and the rest of the scheduler folks on
11a more permanent/elegant solution.
12---
13 kernel/sched/fair.c | 14 ++++++++++++++
14 1 file changed, 14 insertions(+)
15
16diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
17index c242944f5cbd..5132c828161e 100644
18--- a/kernel/sched/fair.c
19+++ b/kernel/sched/fair.c
20@@ -5638,6 +5638,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
21 }
22
23 rcu_read_lock();
24+
25+ if (cpu_rq(prev_cpu)->nr_running) {
26+ int _cpu;
27+
28+ for_each_online_cpu(_cpu) {
29+ if (!cpumask_test_cpu(_cpu, tsk_cpus_allowed(p)) ||
30+ cpu_rq(_cpu)->nr_running)
31+ continue;
32+
33+ rcu_read_unlock();
34+ return _cpu;
35+ }
36+ }
37+
38 for_each_domain(cpu, tmp) {
39 if (!(tmp->flags & SD_LOAD_BALANCE))
40 break;
41--
422.11.1
43
diff --git a/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch b/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch
new file mode 100644
index 0000000..c6bf036
--- /dev/null
+++ b/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch
@@ -0,0 +1,83 @@
1From 3a1512b4ed3922f88936b95731aaff706e7286a9 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:05:23 -0600
4Subject: [PATCH 114/124] bootstats: add printk's to measure boot time in more
5 detail
6
7Few distro-tweaks to add printk's to visualize boot time better
8
9Author: Arjan van de Ven <arjan@linux.intel.com>
10
11Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
12---
13 arch/x86/kernel/alternative.c | 4 ++++
14 drivers/base/firmware_class.c | 2 ++
15 init/main.c | 2 +-
16 kernel/kmod.c | 2 ++
17 4 files changed, 9 insertions(+), 1 deletion(-)
18
19diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
20index 5cb272a7a5a3..d28fb7aae4ce 100644
21--- a/arch/x86/kernel/alternative.c
22+++ b/arch/x86/kernel/alternative.c
23@@ -626,7 +626,9 @@ void __init alternative_instructions(void)
24 * patching.
25 */
26
27+ printk("clr: Applying alternatives\n");
28 apply_alternatives(__alt_instructions, __alt_instructions_end);
29+ printk("clr: Applying alternatives done\n");
30
31 #ifdef CONFIG_SMP
32 /* Patch to UP if other cpus not imminent. */
33@@ -637,6 +639,8 @@ void __init alternative_instructions(void)
34 _text, _etext);
35 }
36
37+ printk("clr: Applying alternatives smp done\n");
38+
39 if (!uniproc_patched || num_possible_cpus() == 1)
40 free_init_pages("SMP alternatives",
41 (unsigned long)__smp_locks,
42diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
43index a95e1e572697..b29467031be6 100644
44--- a/drivers/base/firmware_class.c
45+++ b/drivers/base/firmware_class.c
46@@ -1224,6 +1224,8 @@ request_firmware(const struct firmware **firmware_p, const char *name,
47 {
48 int ret;
49
50+ printk("clr: request_firmware: %s\n", name);
51+
52 /* Need to pin this module until return */
53 __module_get(THIS_MODULE);
54 ret = _request_firmware(firmware_p, name, device, NULL, 0,
55diff --git a/init/main.c b/init/main.c
56index 2858be732f6d..f1d8c3fdbf05 100644
57--- a/init/main.c
58+++ b/init/main.c
59@@ -751,7 +751,7 @@ static int __init_or_module do_one_initcall_debug(initcall_t fn)
60 unsigned long long duration;
61 int ret;
62
63- printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
64+ printk(KERN_DEBUG "calling %pF @ %i\n", fn, raw_smp_processor_id());
65 calltime = ktime_get();
66 ret = fn();
67 rettime = ktime_get();
68diff --git a/kernel/kmod.c b/kernel/kmod.c
69index 0277d1216f80..dc5a6edd3895 100644
70--- a/kernel/kmod.c
71+++ b/kernel/kmod.c
72@@ -76,6 +76,8 @@ static int call_modprobe(char *module_name, int wait)
73 NULL
74 };
75
76+ printk("clr: call_modprobe: %s %i \n", module_name, wait);
77+
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81--
822.11.1
83
diff --git a/patches/boot_time_opt/0115-fix-initcall-timestamps.patch b/patches/boot_time_opt/0115-fix-initcall-timestamps.patch
new file mode 100644
index 0000000..cdf2af1
--- /dev/null
+++ b/patches/boot_time_opt/0115-fix-initcall-timestamps.patch
@@ -0,0 +1,42 @@
1From 5b5ad2c9b9b555d20aeba1f895d0c9d1c2a77776 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Thu, 2 Jun 2016 23:36:32 -0500
4Subject: [PATCH 115/124] fix initcall timestamps
5
6Print more fine-grained initcall timings
7
8use the tsc instead of the jiffies clock for initcall_debug
9---
10 init/main.c | 12 ++++++------
11 1 file changed, 6 insertions(+), 6 deletions(-)
12
13diff --git a/init/main.c b/init/main.c
14index f1d8c3fdbf05..8358cbe6ab13 100644
15--- a/init/main.c
16+++ b/init/main.c
17@@ -747,16 +747,16 @@ __setup("initcall_blacklist=", initcall_blacklist);
18
19 static int __init_or_module do_one_initcall_debug(initcall_t fn)
20 {
21- ktime_t calltime, delta, rettime;
22+ unsigned long long calltime, delta, rettime;
23 unsigned long long duration;
24 int ret;
25
26- printk(KERN_DEBUG "calling %pF @ %i\n", fn, raw_smp_processor_id());
27- calltime = ktime_get();
28+ printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
29+ calltime = local_clock();
30 ret = fn();
31- rettime = ktime_get();
32- delta = ktime_sub(rettime, calltime);
33- duration = (unsigned long long) ktime_to_ns(delta) >> 10;
34+ rettime = local_clock();
35+ delta = rettime - calltime;
36+ duration = delta >> 10;
37 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
38 fn, ret, duration);
39
40--
412.11.1
42
diff --git a/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch b/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch
new file mode 100644
index 0000000..d1f71b5
--- /dev/null
+++ b/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch
@@ -0,0 +1,31 @@
1From 16104411cc5a7b20f310e3ecede85343ee6ce6b9 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 17:28:14 -0600
4Subject: [PATCH 116/124] smpboot: reuse timer calibration
5
6No point recalibrating for a known-constant TSC... saves 200ms+ of boot time.
7
8Author: Arjan van de Ven <arjan@linux.intel.com>
9
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 arch/x86/kernel/tsc.c | 3 +++
13 1 file changed, 3 insertions(+)
14
15diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
16index 46b2f41f8b05..88553c1f21f1 100644
17--- a/arch/x86/kernel/tsc.c
18+++ b/arch/x86/kernel/tsc.c
19@@ -1384,6 +1384,9 @@ unsigned long calibrate_delay_is_known(void)
20 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
21 return 0;
22
23+ if (cpu != 0)
24+ return cpu_data(0).loops_per_jiffy;
25+
26 if (!mask)
27 return 0;
28
29--
302.11.1
31
diff --git a/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch b/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch
new file mode 100644
index 0000000..978e09f
--- /dev/null
+++ b/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch
@@ -0,0 +1,156 @@
1From fd1f55138c242bd9aeec374ff611064bdc89b359 Mon Sep 17 00:00:00 2001
2From: Jim Kukunas <james.t.kukunas@linux.intel.com>
3Date: Fri, 27 May 2016 09:26:51 -0400
4Subject: [PATCH 117/124] raid6: add Kconfig option to skip raid6 benchmarking
5
6Adds CONFIG_RAID6_FORCE_ALGO, which causes the kernel to not benchmark
7each raid recovery and syndrome generation algorithm, and instead use
8the version selected via Kconfig (CONFIG_RAID6_FORCE_{INT,SSSE3,AVX2}).
9If the selected algorithm is not supported by the processor at
10runtime, a fallback is used.
11
12Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
13---
14 lib/Kconfig | 3 +--
15 lib/raid6/Kconfig | 38 ++++++++++++++++++++++++++++++++++++
16 lib/raid6/algos.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
17 3 files changed, 97 insertions(+), 2 deletions(-)
18 create mode 100644 lib/raid6/Kconfig
19
20diff --git a/lib/Kconfig b/lib/Kconfig
21index 260a80e313b9..b3efd21db2fd 100644
22--- a/lib/Kconfig
23+++ b/lib/Kconfig
24@@ -7,8 +7,7 @@ config BINARY_PRINTF
25
26 menu "Library routines"
27
28-config RAID6_PQ
29- tristate
30+source "lib/raid6/Kconfig"
31
32 config BITREVERSE
33 tristate
34diff --git a/lib/raid6/Kconfig b/lib/raid6/Kconfig
35new file mode 100644
36index 000000000000..d881d6be89bb
37--- /dev/null
38+++ b/lib/raid6/Kconfig
39@@ -0,0 +1,38 @@
40+menu "RAID 6"
41+
42+config RAID6_PQ
43+ tristate
44+
45+config RAID6_FORCE_ALGO
46+ bool "Always use specified recovery algorithm"
47+ default n
48+ depends on RAID6_PQ
49+ help
50+ If this option is not set, on every boot the kernel will
51+ benchmark each optimized version of the RAID6 recovery and
52+ syndrome generation algorithms and will select the one that
53+ performs best. Microbenchmarking each version negatively
54+ affects boot time.
55+
56+ Enabling this option skips the benchmark at boot, and
57+ instead always uses the algorithm selected. The only exception
58+ is if the selected algorithm relies on a cpu feature not
59+ supported at runtime. In this case, one of the lower performance
60+ fallbacks are used.
61+
62+choice
63+ prompt "RAID6 Recovery Algorithm"
64+ default RAID6_FORCE_INT
65+ depends on RAID6_FORCE_ALGO
66+ ---help---
67+ Select the RAID6 recovery algorithm to unconditionally use
68+
69+ config RAID6_FORCE_INT
70+ bool "Reference Implementation"
71+ config RAID6_FORCE_SSSE3
72+ bool "SSSE3"
73+ config RAID6_FORCE_AVX2
74+ bool "AVX2"
75+endchoice
76+
77+endmenu
78diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
79index 7857049fd7d3..29332d2a04a5 100644
80--- a/lib/raid6/algos.c
81+++ b/lib/raid6/algos.c
82@@ -125,6 +125,63 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
83 #define time_before(x, y) ((x) < (y))
84 #endif
85
86+#ifdef CONFIG_RAID6_FORCE_ALGO
87+/* TODO don't compile in algos that will never be used */
88+int __init raid6_select_algo(void)
89+{
90+ const struct raid6_recov_calls *recov_fallback = &raid6_recov_intx1;
91+ const struct raid6_recov_calls *recov_algo;
92+ const struct raid6_calls *gen_fallback;
93+ const struct raid6_calls *gen_algo;
94+
95+#if defined(__i386__)
96+ gen_fallback = &raid6_intx32;
97+#elif defined(__x86_64__)
98+ gen_fallback = &raid6_sse2x2;
99+#else
100+# error "TODO"
101+#endif
102+
103+#if defined(CONFIG_RAID6_FORCE_INT)
104+ recov_algo = &raid6_recov_intx1;
105+ gen_algo = &raid6_intx32;
106+
107+#elif defined(CONFIG_RAID6_FORCE_SSSE3)
108+ recov_algo = &raid6_recov_ssse3;
109+#if defined(__i386__)
110+ gen_algo = &raid6_sse2x2;
111+#else
112+ gen_algo = &raid6_sse2x4;
113+#endif
114+
115+#elif defined(CONFIG_RAID6_FORCE_AVX2)
116+ recov_algo = &raid6_recov_avx2;
117+
118+#if defined(__i386__)
119+ gen_algo = &raid6_avx2x2;
120+#else
121+ gen_algo = &raid6_avx2x4;
122+#endif
123+
124+#else
125+#error "RAID6 Forced Recov Algo: Unsupported selection"
126+#endif
127+
128+ if (recov_algo->valid != NULL && recov_algo->valid() == 0)
129+ recov_algo = recov_fallback;
130+
131+ pr_info("raid6: Forced to use recovery algorithm %s\n", recov_algo->name);
132+
133+ raid6_2data_recov = recov_algo->data2;
134+ raid6_datap_recov = recov_algo->datap;
135+
136+ pr_info("raid6: Forced gen() algo %s\n", gen_algo->name);
137+
138+ raid6_call = *gen_algo;
139+
140+ return gen_algo && recov_algo ? 0 : -EINVAL;
141+}
142+#else
143 static inline const struct raid6_recov_calls *raid6_choose_recov(void)
144 {
145 const struct raid6_recov_calls *const *algo;
146@@ -256,6 +313,7 @@ int __init raid6_select_algo(void)
147
148 return gen_best && rec_best ? 0 : -EINVAL;
149 }
150+#endif
151
152 static void raid6_exit(void)
153 {
154--
1552.11.1
156
diff --git a/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch b/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch
new file mode 100644
index 0000000..70e07c8
--- /dev/null
+++ b/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch
@@ -0,0 +1,47 @@
1From fbc1ab7c18a9c960a0bff293a93620d581658f8d Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Thu, 2 Jun 2016 23:36:32 -0500
4Subject: [PATCH 118/124] Initialize ata before graphics
5
6ATA init is the long pole in the boot process, and it's asynchronous.
7Move the graphics init after it so that ata and graphics initialize
8in parallel
9---
10 drivers/Makefile | 11 ++++++-----
11 1 file changed, 6 insertions(+), 5 deletions(-)
12
13diff --git a/drivers/Makefile b/drivers/Makefile
14index 194d20bee7dc..2785e4c6b30f 100644
15--- a/drivers/Makefile
16+++ b/drivers/Makefile
17@@ -55,14 +55,9 @@ obj-y += char/
18 # iommu/ comes before gpu as gpu are using iommu controllers
19 obj-$(CONFIG_IOMMU_SUPPORT) += iommu/
20
21-# gpu/ comes after char for AGP vs DRM startup and after iommu
22-obj-y += gpu/
23
24 obj-$(CONFIG_CONNECTOR) += connector/
25
26-# i810fb and intelfb depend on char/agp/
27-obj-$(CONFIG_FB_I810) += video/fbdev/i810/
28-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
29
30 obj-$(CONFIG_PARPORT) += parport/
31 obj-$(CONFIG_NVM) += lightnvm/
32@@ -76,6 +71,12 @@ obj-$(CONFIG_IDE) += ide/
33 obj-$(CONFIG_SCSI) += scsi/
34 obj-y += nvme/
35 obj-$(CONFIG_ATA) += ata/
36+
37+# gpu/ comes after char for AGP vs DRM startup and after iommu
38+obj-y += gpu/
39+# i810fb and intelfb depend on char/agp/
40+obj-$(CONFIG_FB_I810) += video/fbdev/i810/
41+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
42 obj-$(CONFIG_TARGET_CORE) += target/
43 obj-$(CONFIG_MTD) += mtd/
44 obj-$(CONFIG_SPI) += spi/
45--
462.11.1
47
diff --git a/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch b/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch
new file mode 100644
index 0000000..a068afb
--- /dev/null
+++ b/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch
@@ -0,0 +1,311 @@
1From d9390cb702de5cbef64f893efd2344c4f58dae82 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 25 Jul 2016 06:44:34 -0500
4Subject: [PATCH 119/124] reduce e1000e boot time by tightening sleep ranges
5
6The e1000e driver is a great user of the usleep_range() API,
7and has many nice ranges that in principle help power management.
8
9However the ranges that are used only during system startup are
10very long (and can add easily 100 msec to the boot time) while
11the power savings of such long ranges is irrelevant due to the
12one-off, boot only, nature of these functions.
13
14This patch shrinks some of the longest ranges to be shorter
15(while still using a power friendly 1 msec range); this saves
16100msec+ of boot time on my BDW NUCs
17
18Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
19---
20 drivers/net/ethernet/intel/e1000e/80003es2lan.c | 2 +-
21 drivers/net/ethernet/intel/e1000e/82571.c | 2 +-
22 drivers/net/ethernet/intel/e1000e/ethtool.c | 14 +++++++-------
23 drivers/net/ethernet/intel/e1000e/ich8lan.c | 20 ++++++++++----------
24 drivers/net/ethernet/intel/e1000e/mac.c | 2 +-
25 drivers/net/ethernet/intel/e1000e/netdev.c | 14 +++++++-------
26 drivers/net/ethernet/intel/e1000e/nvm.c | 2 +-
27 7 files changed, 28 insertions(+), 28 deletions(-)
28
29diff --git a/drivers/net/ethernet/intel/e1000e/80003es2lan.c b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
30index cd391376036c..b5759899eeb8 100644
31--- a/drivers/net/ethernet/intel/e1000e/80003es2lan.c
32+++ b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
33@@ -698,7 +698,7 @@ static s32 e1000_reset_hw_80003es2lan(struct e1000_hw *hw)
34 ew32(TCTL, E1000_TCTL_PSP);
35 e1e_flush();
36
37- usleep_range(10000, 20000);
38+ usleep_range(10000, 11000);
39
40 ctrl = er32(CTRL);
41
42diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c
43index 6b03c8553e59..d31145269dd9 100644
44--- a/drivers/net/ethernet/intel/e1000e/82571.c
45+++ b/drivers/net/ethernet/intel/e1000e/82571.c
46@@ -977,7 +977,7 @@ static s32 e1000_reset_hw_82571(struct e1000_hw *hw)
47 ew32(TCTL, tctl);
48 e1e_flush();
49
50- usleep_range(10000, 20000);
51+ usleep_range(10000, 11000);
52
53 /* Must acquire the MDIO ownership before MAC reset.
54 * Ownership defaults to firmware after a reset.
55diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
56index 7aff68a4a4df..7cb689bd41f8 100644
57--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
58+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
59@@ -1023,7 +1023,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data)
60 /* Disable all the interrupts */
61 ew32(IMC, 0xFFFFFFFF);
62 e1e_flush();
63- usleep_range(10000, 20000);
64+ usleep_range(10000, 11000);
65
66 /* Test each interrupt */
67 for (i = 0; i < 10; i++) {
68@@ -1055,7 +1055,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data)
69 ew32(IMC, mask);
70 ew32(ICS, mask);
71 e1e_flush();
72- usleep_range(10000, 20000);
73+ usleep_range(10000, 11000);
74
75 if (adapter->test_icr & mask) {
76 *data = 3;
77@@ -1073,7 +1073,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data)
78 ew32(IMS, mask);
79 ew32(ICS, mask);
80 e1e_flush();
81- usleep_range(10000, 20000);
82+ usleep_range(10000, 11000);
83
84 if (!(adapter->test_icr & mask)) {
85 *data = 4;
86@@ -1091,7 +1091,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data)
87 ew32(IMC, ~mask & 0x00007FFF);
88 ew32(ICS, ~mask & 0x00007FFF);
89 e1e_flush();
90- usleep_range(10000, 20000);
91+ usleep_range(10000, 11000);
92
93 if (adapter->test_icr) {
94 *data = 5;
95@@ -1103,7 +1103,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data)
96 /* Disable all the interrupts */
97 ew32(IMC, 0xFFFFFFFF);
98 e1e_flush();
99- usleep_range(10000, 20000);
100+ usleep_range(10000, 11000);
101
102 /* Unhook test interrupt handler */
103 free_irq(irq, netdev);
104@@ -1479,7 +1479,7 @@ static int e1000_set_82571_fiber_loopback(struct e1000_adapter *adapter)
105 */
106 ew32(SCTL, E1000_SCTL_ENABLE_SERDES_LOOPBACK);
107 e1e_flush();
108- usleep_range(10000, 20000);
109+ usleep_range(10000, 11000);
110
111 return 0;
112 }
113@@ -1592,7 +1592,7 @@ static void e1000_loopback_cleanup(struct e1000_adapter *adapter)
114 hw->phy.media_type == e1000_media_type_internal_serdes) {
115 ew32(SCTL, E1000_SCTL_DISABLE_SERDES_LOOPBACK);
116 e1e_flush();
117- usleep_range(10000, 20000);
118+ usleep_range(10000, 11000);
119 break;
120 }
121 /* Fall Through */
122diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
123index f3aaca743ea3..bef75cec259f 100644
124--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
125+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
126@@ -289,7 +289,7 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw)
127 u16 count = 20;
128
129 do {
130- usleep_range(5000, 10000);
131+ usleep_range(5000, 6000);
132 } while (!(er32(CTRL_EXT) & E1000_CTRL_EXT_LPCD) && count--);
133
134 msleep(30);
135@@ -422,7 +422,7 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw)
136 /* Ungate automatic PHY configuration on non-managed 82579 */
137 if ((hw->mac.type == e1000_pch2lan) &&
138 !(fwsm & E1000_ICH_FWSM_FW_VALID)) {
139- usleep_range(10000, 20000);
140+ usleep_range(10000, 11000);
141 e1000_gate_hw_phy_config_ich8lan(hw, false);
142 }
143
144@@ -547,7 +547,7 @@ static s32 e1000_init_phy_params_ich8lan(struct e1000_hw *hw)
145 phy->id = 0;
146 while ((e1000_phy_unknown == e1000e_get_phy_type_from_id(phy->id)) &&
147 (i++ < 100)) {
148- usleep_range(1000, 2000);
149+ usleep_range(1000, 1100);
150 ret_val = e1000e_get_phy_id(hw);
151 if (ret_val)
152 return ret_val;
153@@ -1259,7 +1259,7 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force)
154 goto out;
155 }
156
157- usleep_range(10000, 20000);
158+ usleep_range(10000, 11000);
159 }
160 e_dbg("ULP_CONFIG_DONE cleared after %dmsec\n", i * 10);
161
162@@ -2011,7 +2011,7 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw)
163
164 while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) &&
165 (i++ < 30))
166- usleep_range(10000, 20000);
167+ usleep_range(10000, 11000);
168 return blocked ? E1000_BLK_PHY_RESET : 0;
169 }
170
171@@ -2827,7 +2827,7 @@ static s32 e1000_post_phy_reset_ich8lan(struct e1000_hw *hw)
172 return 0;
173
174 /* Allow time for h/w to get to quiescent state after reset */
175- usleep_range(10000, 20000);
176+ usleep_range(10000, 11000);
177
178 /* Perform any necessary post-reset workarounds */
179 switch (hw->mac.type) {
180@@ -2863,7 +2863,7 @@ static s32 e1000_post_phy_reset_ich8lan(struct e1000_hw *hw)
181 if (hw->mac.type == e1000_pch2lan) {
182 /* Ungate automatic PHY configuration on non-managed 82579 */
183 if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) {
184- usleep_range(10000, 20000);
185+ usleep_range(10000, 11000);
186 e1000_gate_hw_phy_config_ich8lan(hw, false);
187 }
188
189@@ -3884,7 +3884,7 @@ static s32 e1000_update_nvm_checksum_spt(struct e1000_hw *hw)
190 */
191 if (!ret_val) {
192 nvm->ops.reload(hw);
193- usleep_range(10000, 20000);
194+ usleep_range(10000, 11000);
195 }
196
197 out:
198@@ -4035,7 +4035,7 @@ static s32 e1000_update_nvm_checksum_ich8lan(struct e1000_hw *hw)
199 */
200 if (!ret_val) {
201 nvm->ops.reload(hw);
202- usleep_range(10000, 20000);
203+ usleep_range(10000, 11000);
204 }
205
206 out:
207@@ -4658,7 +4658,7 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw)
208 ew32(TCTL, E1000_TCTL_PSP);
209 e1e_flush();
210
211- usleep_range(10000, 20000);
212+ usleep_range(10000, 11000);
213
214 /* Workaround for ICH8 bit corruption issue in FIFO memory */
215 if (hw->mac.type == e1000_ich8lan) {
216diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c
217index b322011ec282..eecbf7a12735 100644
218--- a/drivers/net/ethernet/intel/e1000e/mac.c
219+++ b/drivers/net/ethernet/intel/e1000e/mac.c
220@@ -815,7 +815,7 @@ static s32 e1000_poll_fiber_serdes_link_generic(struct e1000_hw *hw)
221 * milliseconds even if the other end is doing it in SW).
222 */
223 for (i = 0; i < FIBER_LINK_UP_LIMIT; i++) {
224- usleep_range(10000, 20000);
225+ usleep_range(10000, 11000);
226 status = er32(STATUS);
227 if (status & E1000_STATUS_LU)
228 break;
229diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
230index 7017281ba2dc..7d68d694ed9e 100644
231--- a/drivers/net/ethernet/intel/e1000e/netdev.c
232+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
233@@ -3206,7 +3206,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
234 if (!(adapter->flags2 & FLAG2_NO_DISABLE_RX))
235 ew32(RCTL, rctl & ~E1000_RCTL_EN);
236 e1e_flush();
237- usleep_range(10000, 20000);
238+ usleep_range(10000, 11000);
239
240 if (adapter->flags2 & FLAG2_DMA_BURST) {
241 /* set the writeback threshold (only takes effect if the RDTR
242@@ -4258,7 +4258,7 @@ void e1000e_down(struct e1000_adapter *adapter, bool reset)
243
244 /* flush both disables and wait for them to finish */
245 e1e_flush();
246- usleep_range(10000, 20000);
247+ usleep_range(10000, 11000);
248
249 e1000_irq_disable(adapter);
250
251@@ -4296,7 +4296,7 @@ void e1000e_reinit_locked(struct e1000_adapter *adapter)
252 {
253 might_sleep();
254 while (test_and_set_bit(__E1000_RESETTING, &adapter->state))
255- usleep_range(1000, 2000);
256+ usleep_range(1000, 1100);
257 e1000e_down(adapter, true);
258 e1000e_up(adapter);
259 clear_bit(__E1000_RESETTING, &adapter->state);
260@@ -4671,7 +4671,7 @@ int e1000e_close(struct net_device *netdev)
261 int count = E1000_CHECK_RESET_COUNT;
262
263 while (test_bit(__E1000_RESETTING, &adapter->state) && count--)
264- usleep_range(10000, 20000);
265+ usleep_range(10000, 11000);
266
267 WARN_ON(test_bit(__E1000_RESETTING, &adapter->state));
268
269@@ -5996,7 +5996,7 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu)
270 }
271
272 while (test_and_set_bit(__E1000_RESETTING, &adapter->state))
273- usleep_range(1000, 2000);
274+ usleep_range(1000, 1100);
275 /* e1000e_down -> e1000e_reset dependent on max_frame_size & mtu */
276 adapter->max_frame_size = max_frame;
277 e_info("changing MTU from %d to %d\n", netdev->mtu, new_mtu);
278@@ -6276,7 +6276,7 @@ static int e1000e_pm_freeze(struct device *dev)
279 int count = E1000_CHECK_RESET_COUNT;
280
281 while (test_bit(__E1000_RESETTING, &adapter->state) && count--)
282- usleep_range(10000, 20000);
283+ usleep_range(10000, 11000);
284
285 WARN_ON(test_bit(__E1000_RESETTING, &adapter->state));
286
287@@ -6687,7 +6687,7 @@ static int e1000e_pm_runtime_suspend(struct device *dev)
288 int count = E1000_CHECK_RESET_COUNT;
289
290 while (test_bit(__E1000_RESETTING, &adapter->state) && count--)
291- usleep_range(10000, 20000);
292+ usleep_range(10000, 11000);
293
294 WARN_ON(test_bit(__E1000_RESETTING, &adapter->state));
295
296diff --git a/drivers/net/ethernet/intel/e1000e/nvm.c b/drivers/net/ethernet/intel/e1000e/nvm.c
297index 2efd80dfd88e..38f7c8fb3061 100644
298--- a/drivers/net/ethernet/intel/e1000e/nvm.c
299+++ b/drivers/net/ethernet/intel/e1000e/nvm.c
300@@ -410,7 +410,7 @@ s32 e1000e_write_nvm_spi(struct e1000_hw *hw, u16 offset, u16 words, u16 *data)
301 break;
302 }
303 }
304- usleep_range(10000, 20000);
305+ usleep_range(10000, 11000);
306 nvm->ops.release(hw);
307 }
308
309--
3102.11.1
311
diff --git a/patches/boot_time_opt/0120-give-rdrand-some-credit.patch b/patches/boot_time_opt/0120-give-rdrand-some-credit.patch
new file mode 100644
index 0000000..4b1669c
--- /dev/null
+++ b/patches/boot_time_opt/0120-give-rdrand-some-credit.patch
@@ -0,0 +1,30 @@
1From 5cc978db25b2c92707f68b15098ac39901fb5aac Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Fri, 29 Jul 2016 19:10:52 +0000
4Subject: [PATCH 120/124] give rdrand some credit
5
6try to credit rdrand/rdseed with some entropy
7
8In VMs, and even on modern hardware, we're super starved for entropy, and while we can
9and do wear a tin foil hat, it's very hard to argue that
10rdrand and rdtsc add zero entropy.
11---
12 drivers/char/random.c | 2 ++
13 1 file changed, 2 insertions(+)
14
15diff --git a/drivers/char/random.c b/drivers/char/random.c
16index d6876d506220..fca09af81b2c 100644
17--- a/drivers/char/random.c
18+++ b/drivers/char/random.c
19@@ -1638,6 +1638,8 @@ static void init_std_data(struct entropy_store *r)
20 if (!arch_get_random_seed_long(&rv) &&
21 !arch_get_random_long(&rv))
22 rv = random_get_entropy();
23+ else
24+ credit_entropy_bits(r, 1);
25 mix_pool_bytes(r, &rv, sizeof(rv));
26 }
27 mix_pool_bytes(r, utsname(), sizeof(*(utsname())));
28--
292.11.1
30
diff --git a/patches/boot_time_opt/0121-e1000e-change-default-policy.patch b/patches/boot_time_opt/0121-e1000e-change-default-policy.patch
new file mode 100644
index 0000000..bf3e13d
--- /dev/null
+++ b/patches/boot_time_opt/0121-e1000e-change-default-policy.patch
@@ -0,0 +1,27 @@
1From 5b4707fc2aa8c49aa18a60136880bf05a3e29071 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sat, 10 Dec 2016 14:29:52 +0000
4Subject: [PATCH 121/124] e1000e: change default policy
5
6change the default irq mitigation policy for e1000e to be
7more HPC/cluster friendly
8---
9 drivers/net/ethernet/intel/e1000e/param.c | 2 +-
10 1 file changed, 1 insertion(+), 1 deletion(-)
11
12diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
13index 6d8c39abee16..ef1122ad3b98 100644
14--- a/drivers/net/ethernet/intel/e1000e/param.c
15+++ b/drivers/net/ethernet/intel/e1000e/param.c
16@@ -92,7 +92,7 @@ E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
17 * Valid Range: 100-100000 or one of: 0=off, 1=dynamic, 3=dynamic conservative
18 */
19 E1000_PARAM(InterruptThrottleRate, "Interrupt Throttling Rate");
20-#define DEFAULT_ITR 3
21+#define DEFAULT_ITR 1
22 #define MAX_ITR 100000
23 #define MIN_ITR 100
24
25--
262.11.1
27
diff --git a/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch b/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch
new file mode 100644
index 0000000..eb44cec
--- /dev/null
+++ b/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch
@@ -0,0 +1,28 @@
1From 5cf7ba4ba9c9d770aad9e52deaa3730f259df9f1 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Fri, 6 Jan 2017 15:34:09 +0000
4Subject: [PATCH 122/124] ipv4/tcp: allow the memory tuning for tcp to go a
5 little bigger than default
6
7---
8 net/ipv4/tcp.c | 4 ++--
9 1 file changed, 2 insertions(+), 2 deletions(-)
10
11diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
12index 6a90a0e130dc..32e43ce7c60e 100644
13--- a/net/ipv4/tcp.c
14+++ b/net/ipv4/tcp.c
15@@ -3341,8 +3341,8 @@ void __init tcp_init(void)
16 tcp_init_mem();
17 /* Set per-socket limits to no more than 1/128 the pressure threshold */
18 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
19- max_wshare = min(4UL*1024*1024, limit);
20- max_rshare = min(6UL*1024*1024, limit);
21+ max_wshare = min(16UL*1024*1024, limit);
22+ max_rshare = min(16UL*1024*1024, limit);
23
24 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
25 sysctl_tcp_wmem[1] = 16*1024;
26--
272.11.1
28
diff --git a/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch b/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch
new file mode 100644
index 0000000..ce4964e
--- /dev/null
+++ b/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch
@@ -0,0 +1,27 @@
1From 10f0c995ce6aaf6b3ffa78377f1a12ad0477057a Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Thu, 12 Jan 2017 18:17:14 +0000
4Subject: [PATCH 123/124] igb: no runtime pm to fix reboot oops
5
6Causes oops on reboot due to a race between runtime resume and shutdown
7---
8 drivers/net/ethernet/intel/igb/igb_main.c | 3 ---
9 1 file changed, 3 deletions(-)
10
11diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
12index 9affd7c198bd..8ade77e75b36 100644
13--- a/drivers/net/ethernet/intel/igb/igb_main.c
14+++ b/drivers/net/ethernet/intel/igb/igb_main.c
15@@ -238,9 +238,6 @@ static struct pci_driver igb_driver = {
16 .id_table = igb_pci_tbl,
17 .probe = igb_probe,
18 .remove = igb_remove,
19-#ifdef CONFIG_PM
20- .driver.pm = &igb_pm_ops,
21-#endif
22 .shutdown = igb_shutdown,
23 .sriov_configure = igb_pci_sriov_configure,
24 .err_handler = &igb_err_handler
25--
262.11.1
27
diff --git a/patches/boot_time_opt/0124-tweak-perfbias.patch b/patches/boot_time_opt/0124-tweak-perfbias.patch
new file mode 100644
index 0000000..56a2865
--- /dev/null
+++ b/patches/boot_time_opt/0124-tweak-perfbias.patch
@@ -0,0 +1,32 @@
1From 03e2c414a860264511dae5bbfc6d7e62b8b94f0f Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sun, 22 Jan 2017 18:51:13 +0000
4Subject: [PATCH 124/124] tweak perfbias
5
6---
7 arch/x86/kernel/cpu/intel.c | 6 +++---
8 1 file changed, 3 insertions(+), 3 deletions(-)
9
10diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
11index fcd484d2bb03..13ae40f10bd4 100644
12--- a/arch/x86/kernel/cpu/intel.c
13+++ b/arch/x86/kernel/cpu/intel.c
14@@ -434,12 +434,12 @@ static void init_intel_energy_perf(struct cpuinfo_x86 *c)
15 return;
16
17 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
18- if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
19+ if ((epb & 0xF) >= ENERGY_PERF_BIAS_NORMAL)
20 return;
21
22- pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
23+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'performance', was 'normal'\n");
24 pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
25- epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
26+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_PERFORMANCE;
27 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
28 }
29
30--
312.11.1
32
diff --git a/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch b/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch
new file mode 100644
index 0000000..1c50e74
--- /dev/null
+++ b/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch
@@ -0,0 +1,33 @@
1From 6730c1ae12a567d56092d15540d2f971be95b936 Mon Sep 17 00:00:00 2001
2From: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
3Date: Mon, 27 Mar 2017 16:01:56 -0600
4Subject: [PATCH] e1000e: increase pause and refresh time
5
6Suggested-by: Tim Pepper <timothy.c.pepper@linux.intel.com>
7Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
8---
9 drivers/net/ethernet/intel/e1000e/netdev.c | 4 ++--
10 1 file changed, 2 insertions(+), 2 deletions(-)
11
12diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
13index 7d68d694ed9e..1db390a52656 100644
14--- a/drivers/net/ethernet/intel/e1000e/netdev.c
15+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
16@@ -4032,12 +4032,12 @@ void e1000e_reset(struct e1000_adapter *adapter)
17 case e1000_pch2lan:
18 case e1000_pch_lpt:
19 case e1000_pch_spt:
20- fc->refresh_time = 0x0400;
21+ fc->refresh_time = 0xFFFF;
22+ fc->pause_time = 0xFFFF;
23
24 if (adapter->netdev->mtu <= ETH_DATA_LEN) {
25 fc->high_water = 0x05C20;
26 fc->low_water = 0x05048;
27- fc->pause_time = 0x0650;
28 break;
29 }
30
31--
322.12.2
33
diff --git a/patches/boot_time_opt/0151-mm-Export-do_madvise.patch b/patches/boot_time_opt/0151-mm-Export-do_madvise.patch
new file mode 100644
index 0000000..a6dbff7
--- /dev/null
+++ b/patches/boot_time_opt/0151-mm-Export-do_madvise.patch
@@ -0,0 +1,84 @@
1From 99b4cdcce43ad0f706120bef26fef8c628c572cf Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:03:52 -0800
4Subject: [PATCH 151/154] mm: Export do_madvise()
5
6Combined with some interesting flags, the madvise() system call
7allows freeing memory more smartly and more efficiently than
8we could with a simple free(). The issue is that it is not
9available to kernel modules that could need it.
10
11In order to solve this lack of support, this patch exports
12do_madvise() so as to make it available to the entire kernel.
13The already existing madvise() system call is unchanged and
14now relies on this new do_madvise() function.
15
16Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
17Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
18---
19 include/linux/mm.h | 2 ++
20 mm/madvise.c | 25 +++++++++++++++++++++----
21 2 files changed, 23 insertions(+), 4 deletions(-)
22
23diff --git a/include/linux/mm.h b/include/linux/mm.h
24index 0b5b2e4df14e..925ec25f99a8 100644
25--- a/include/linux/mm.h
26+++ b/include/linux/mm.h
27@@ -2450,5 +2450,7 @@ void __init setup_nr_node_ids(void);
28 static inline void setup_nr_node_ids(void) {}
29 #endif
30
31+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
32+
33 #endif /* __KERNEL__ */
34 #endif /* _LINUX_MM_H */
35diff --git a/mm/madvise.c b/mm/madvise.c
36index 93fb63e88b5e..c8bbf93d4978 100644
37--- a/mm/madvise.c
38+++ b/mm/madvise.c
39@@ -618,9 +618,7 @@ madvise_behavior_valid(int behavior)
40 }
41
42 /*
43- * The madvise(2) system call.
44- *
45- * Applications can use madvise() to advise the kernel how it should
46+ * Kernel modules can use do_madvise() to advise the kernel how it should
47 * handle paging I/O in this VM area. The idea is to help the kernel
48 * use appropriate read-ahead and caching techniques. The information
49 * provided is advisory only, and can be safely disregarded by the
50@@ -673,7 +671,7 @@ madvise_behavior_valid(int behavior)
51 * -EBADF - map exists, but area maps something that isn't a file.
52 * -EAGAIN - a kernel resource was temporarily unavailable.
53 */
54-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
55+int do_madvise(unsigned long start, size_t len_in, int behavior)
56 {
57 unsigned long end, tmp;
58 struct vm_area_struct *vma, *prev;
59@@ -767,3 +765,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
60
61 return error;
62 }
63+EXPORT_SYMBOL_GPL(do_madvise);
64+
65+/*
66+ * The madvise(2) system call.
67+ *
68+ * Applications can use madvise() system call to advise the kernel how
69+ * it should handle paging I/O in this VM area. The idea is to help
70+ * the kernel use appropriate read-ahead and caching techniques. The
71+ * information provided is advisory only, and can be safely disregarded
72+ * by the kernel without affecting the correct operation of the application.
73+ *
74+ * behavior values are the same as the ones defined in do_madvise()
75+ *
76+ * return values are the same as the ones defined in do_madvise()
77+ */
78+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
79+{
80+ return do_madvise(start, len_in, behavior);
81+}
82--
832.12.1
84
diff --git a/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch b/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch
new file mode 100644
index 0000000..5f44930
--- /dev/null
+++ b/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch
@@ -0,0 +1,180 @@
1From d28921b5f797829e4e676f7968ae688ef96b7992 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:08:55 -0800
4Subject: [PATCH 152/154] x86: kvm: Notify host to release pages
5
6In context of hypervisors managing several virtual machines, we
7want those virtual machines to give the memory they used back to
8the host when they don't need it anymore.
9
10This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing
11the guest kernel to notify the host kernel when such event occurs.
12And relying on do_madvise() function that we have previously exported,
13it issues a call to this function when it receives the new hypercall.
14
15Use of do_madvise() with MADV_DONTNEED flag will allow the guest to
16ask for a new page without going through a new hypercall. Instead,
17it will be able to start using that memory again as it will get
18faulted back in as a fresh new page. That's why do_madvise() is more
19efficient than doing vm_unmap() to return some memory to the host.
20
21This patch also introduces a new sysctl, kvm_madv_instant_free,
22allowing the user to set MADV_FREE advice instead of MADV_DONTNEED.
23Indeed, MADV_FREE is cheaper than using MADV_DONTNEED
24because it does not zero the pages in case the memory has not been
25freed by the kernel. This can happen when there was no need for the
26kernel to get this memory back, meaning it was keeping those pages
27in the right state to be re-used by the same application.
28MADV_FREE being a very recent advice introduced in kernel 4.5, we
29only want to enable it through a sysctl in case the user wants to
30use it.
31
32Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
33Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
34---
35 arch/x86/kvm/x86.c | 17 +++++++++++++++++
36 include/linux/mm.h | 5 +++++
37 include/uapi/linux/kvm_para.h | 3 +++
38 kernel/sysctl.c | 7 +++++++
39 mm/Makefile | 2 +-
40 mm/kvm.c | 25 +++++++++++++++++++++++++
41 6 files changed, 58 insertions(+), 1 deletion(-)
42 create mode 100644 mm/kvm.c
43
44diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
45index 582c75311f95..683a94dd5f03 100644
46--- a/arch/x86/kvm/x86.c
47+++ b/arch/x86/kvm/x86.c
48@@ -46,6 +46,7 @@
49 #include <linux/user-return-notifier.h>
50 #include <linux/srcu.h>
51 #include <linux/slab.h>
52+#include <linux/mm.h>
53 #include <linux/perf_event.h>
54 #include <linux/uaccess.h>
55 #include <linux/hash.h>
56@@ -6019,6 +6020,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
57 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
58 }
59
60+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
61+{
62+ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
63+
64+ if (len > KVM_MAX_RET_MEM_SIZE)
65+ return KVM_EPERM;
66+
67+ if (kvm_is_error_hva(start + len))
68+ return KVM_EFAULT;
69+
70+ return do_madvise(start, len, kvm_ret_mem_advice);
71+}
72+
73 void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
74 {
75 vcpu->arch.apicv_active = false;
76@@ -6065,6 +6079,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
77 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
78 ret = 0;
79 break;
80+ case KVM_HC_RETURN_MEM:
81+ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
82+ break;
83 default:
84 ret = -KVM_ENOSYS;
85 break;
86diff --git a/include/linux/mm.h b/include/linux/mm.h
87index 925ec25f99a8..833f23d98baa 100644
88--- a/include/linux/mm.h
89+++ b/include/linux/mm.h
90@@ -2303,6 +2303,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
91 extern int sysctl_drop_caches;
92 int drop_caches_sysctl_handler(struct ctl_table *, int,
93 void __user *, size_t *, loff_t *);
94+extern int sysctl_kvm_madv_instant_free;
95+extern int kvm_ret_mem_advice;
96+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
97+ void __user *buffer, size_t *length,
98+ loff_t *ppos);
99 #endif
100
101 void drop_slab(void);
102diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
103index bf6cd7d5cac2..7d90f77d87d0 100644
104--- a/include/uapi/linux/kvm_para.h
105+++ b/include/uapi/linux/kvm_para.h
106@@ -23,6 +23,9 @@
107 #define KVM_HC_MIPS_GET_CLOCK_FREQ 6
108 #define KVM_HC_MIPS_EXIT_VM 7
109 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8
110+#define KVM_HC_RETURN_MEM 10
111+
112+#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB
113
114 /*
115 * hypercalls use architecture specific
116diff --git a/kernel/sysctl.c b/kernel/sysctl.c
117index c1095cdc0fe2..d8ae774fa042 100644
118--- a/kernel/sysctl.c
119+++ b/kernel/sysctl.c
120@@ -1398,6 +1398,13 @@ static struct ctl_table vm_table[] = {
121 .extra1 = &one,
122 .extra2 = &four,
123 },
124+ {
125+ .procname = "kvm_madv_instant_free",
126+ .data = &sysctl_kvm_madv_instant_free,
127+ .maxlen = sizeof(int),
128+ .mode = 0644,
129+ .proc_handler = kvm_madv_instant_free_sysctl_handler,
130+ },
131 #ifdef CONFIG_COMPACTION
132 {
133 .procname = "compact_memory",
134diff --git a/mm/Makefile b/mm/Makefile
135index 295bd7a..6455723 100644
136--- a/mm/Makefile
137+++ b/mm/Makefile
138@@ -47,6 +47,8 @@ else
139 obj-y += bootmem.o
140 endif
141
142+obj-y += kvm.o
143+
144 obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
145 ifdef CONFIG_MMU
146 obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
147diff --git a/mm/kvm.c b/mm/kvm.c
148new file mode 100644
149index 000000000000..8945f6a311b9
150--- /dev/null
151+++ b/mm/kvm.c
152@@ -0,0 +1,25 @@
153+#include <linux/mman.h>
154+
155+int sysctl_kvm_madv_instant_free;
156+
157+int kvm_ret_mem_advice = MADV_DONTNEED;
158+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
159+
160+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
161+ void __user *buffer, size_t *length, loff_t *ppos)
162+{
163+ int ret;
164+
165+ ret = proc_dointvec(table, write, buffer, length, ppos);
166+ if (ret)
167+ return ret;
168+
169+#ifdef MADV_FREE
170+ if (sysctl_kvm_madv_instant_free > 0)
171+ kvm_ret_mem_advice = MADV_FREE;
172+ else
173+ kvm_ret_mem_advice = MADV_DONTNEED;
174+#endif
175+
176+ return 0;
177+}
178--
1792.12.1
180
diff --git a/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch b/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch
new file mode 100644
index 0000000..cdb876a
--- /dev/null
+++ b/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch
@@ -0,0 +1,155 @@
1From 855ef164854307839c08c60688eaeac14f9a649e Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:26:13 -0800
4Subject: [PATCH 153/154] x86: Return memory from guest to host kernel
5
6All virtual machines need memory to perform various tasks, but this
7memory is not released to the host after it is not used anymore. We
8have to wait for the termination of the virtual machine to get this
9memory back into the host.
10
11The ballooning mechanism is close but not designed for the same purpose.
12In case we hit memory limits of the system, the host predicts how much
13memory can be asked back from a guest, and it issues a hypercall to
14retrieve this memory.
15
16The solution proposed is different because it does not wait for host
17needs before returning memory, and it knows precisely how much memory
18it can return.
19
20The way to notify the host side about such a return is to rely on
21the new hypercall KVM_HC_RETURN_MEM. In order to avoid overloading
22the CPU with too many hypercalls, we only return memory blocks of
23order 7 (512k blocks) and higher. This value has been found running
24memory tests using multiple threads allocating/freeing high amount
25of memory. Those tests were run for different order values, and 7 was
26the best tradeoff between the number of hypercalls issued and the
27amount of memory returned to the host.
28
29In order to limit the performance impact of this code addition,
30we check for blocks of order 7 or higher. This means it only costs an
31additional function call and a branch to perform this check.
32
33Furthermore, this code has been added to the "merge" codepath of the
34buddy allocator, which is not as sensitive as the "free" codepath.
35Not all blocks going through the "free" codepath will end up in the
36"merge" codepath because some of them won't find their free buddy.
37But this is a negligible amount since the kernel does not use many
38high order blocks directly. Instead, those bigger blocks are often
39broken into smaller chunks used as low order blocks. At the time
40those small blocks are released, they go through the merge path.
41
42Benchmarks such as ebizzy and will-it-scale have been run in order
43to make sure this patch does not affect kernel performances and no
44significant differences were observed.
45
46Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
47Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
48---
49 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
50 arch/x86/kernel/kvm.c | 10 ++++++++++
51 include/linux/mm-arch-hooks.h | 8 ++++++++
52 mm/page_alloc.c | 2 ++
53 4 files changed, 42 insertions(+)
54
55diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
56index bc62e7cbf1b1..4a2f6d1adbd2 100644
57--- a/arch/x86/include/asm/kvm_para.h
58+++ b/arch/x86/include/asm/kvm_para.h
59@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token);
60 void kvm_async_pf_task_wake(u32 token);
61 u32 kvm_read_and_reset_pf_reason(void);
62 extern void kvm_disable_steal_time(void);
63+void kvm_arch_return_memory(struct page *page, unsigned int order);
64+
65+/*
66+ * This order has been found in an empirical way, running memory tests
67+ * through many iterations to assess the number of hypercalls issued
68+ * and the amount of memory returned. In case you change this order to
69+ * 6 or 8, it should not impact your performances significantly.
70+ *
71+ * Smaller values lead to less memory waste, but consume more CPU on
72+ * hypercalls. Larger values use less CPU, but do not as precisely
73+ * inform the hypervisor of which memory is free.
74+ */
75+#define RET_MEM_BUDDY_ORDER 7
76+
77+static inline void arch_buddy_merge(struct page *page, unsigned int order)
78+{
79+ if (order < RET_MEM_BUDDY_ORDER)
80+ return;
81+
82+ kvm_arch_return_memory(page, order);
83+}
84+#define arch_buddy_merge arch_buddy_merge
85
86 #ifdef CONFIG_PARAVIRT_SPINLOCKS
87 void __init kvm_spinlock_init(void);
88diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
89index edbbfc854e39..14167b3f6514 100644
90--- a/arch/x86/kernel/kvm.c
91+++ b/arch/x86/kernel/kvm.c
92@@ -552,6 +552,16 @@ static __init int activate_jump_labels(void)
93 }
94 arch_initcall(activate_jump_labels);
95
96+void kvm_arch_return_memory(struct page *page, unsigned int order)
97+{
98+ if (!kvm_para_available())
99+ return;
100+
101+ kvm_hypercall2(KVM_HC_RETURN_MEM,
102+ page_to_phys(page),
103+ PAGE_SIZE << order);
104+}
105+
106 #ifdef CONFIG_PARAVIRT_SPINLOCKS
107
108 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
109diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
110index 4efc3f56e6df..26eb3a05a8a3 100644
111--- a/include/linux/mm-arch-hooks.h
112+++ b/include/linux/mm-arch-hooks.h
113@@ -12,6 +12,7 @@
114 #define _LINUX_MM_ARCH_HOOKS_H
115
116 #include <asm/mm-arch-hooks.h>
117+#include <asm/kvm_para.h>
118
119 #ifndef arch_remap
120 static inline void arch_remap(struct mm_struct *mm,
121@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm,
122 #define arch_remap arch_remap
123 #endif
124
125+#ifndef arch_buddy_merge
126+static inline void arch_buddy_merge(struct page *page, unsigned int order)
127+{
128+}
129+#define arch_buddy_merge arch_buddy_merge
130+#endif
131+
132 #endif /* _LINUX_MM_ARCH_HOOKS_H */
133diff --git a/mm/page_alloc.c b/mm/page_alloc.c
134index 1460e6ad5e14..5f6e6371bc6f 100644
135--- a/mm/page_alloc.c
136+++ b/mm/page_alloc.c
137@@ -64,6 +64,7 @@
138 #include <linux/page_owner.h>
139 #include <linux/kthread.h>
140 #include <linux/memcontrol.h>
141+#include <linux/mm-arch-hooks.h>
142
143 #include <asm/sections.h>
144 #include <asm/tlbflush.h>
145@@ -855,6 +856,7 @@ static inline void __free_one_page(struct page *page,
146 }
147
148 done_merging:
149+ arch_buddy_merge(page, order);
150 set_page_order(page, order);
151
152 /*
153--
1542.12.1
155
diff --git a/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch b/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
new file mode 100644
index 0000000..07d4a83
--- /dev/null
+++ b/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
@@ -0,0 +1,137 @@
1From 2c145b5233b504f5226a0f4bc44baeef33b444d8 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:32:39 -0800
4Subject: [PATCH 154/154] sysctl: vm: Fine-grained cache shrinking
5
6Lots of virtual machines are left in an idle state for days until they
7are terminated, and they can keep a large amount of memory in their
8cache, meaning this memory cannot be used by other processes.
9
10We tried to release this memory using the existing drop_caches sysctl,
11but it led to complete loss of the cache, which could still have been
12useful if the idle process woke up. Indeed, the process then can't find
13any available cached data, and rebuilding it from scratch directly
14affects performance.
15
16Instead, the solution we want is based on gradually shrinking the system
17cache over time. This patch adds a new sysctl, shrink_caches_mb, so as
18to allow userspace applications to tell the kernel it should shrink
19the system cache by up to the amount (in MiB) specified.
20
21There is an application called "memshrinker" which uses this new
22mechanism. It runs in the background and periodically releases a
23specified amount of cache. This amount is based on the remaining
24cache on the system, and period is computed to follow a shrinking
25model. It results in saving a lot of memory for other processes
26running on the system.
27
28Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
29Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
30---
31 fs/drop_caches.c | 25 +++++++++++++++++++++++++
32 include/linux/mm.h | 4 ++++
33 kernel/sysctl.c | 8 ++++++++
34 mm/vmscan.c | 2 --
35 4 files changed, 37 insertions(+), 2 deletions(-)
36
37diff --git a/fs/drop_caches.c b/fs/drop_caches.c
38index d72d52b90433..f564dfcc13a4 100644
39--- a/fs/drop_caches.c
40+++ b/fs/drop_caches.c
41@@ -8,10 +8,12 @@
42 #include <linux/writeback.h>
43 #include <linux/sysctl.h>
44 #include <linux/gfp.h>
45+#include <linux/swap.h>
46 #include "internal.h"
47
48 /* A global variable is a bit ugly, but it keeps the code simple */
49 int sysctl_drop_caches;
50+int sysctl_shrink_caches_mb;
51
52 static void drop_pagecache_sb(struct super_block *sb, void *unused)
53 {
54@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
55 }
56 return 0;
57 }
58+
59+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
60+ void __user *buffer, size_t *length, loff_t *ppos)
61+{
62+ int ret;
63+ unsigned long nr_to_reclaim, page_reclaimed;
64+
65+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
66+ if (ret)
67+ return ret;
68+
69+ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE;
70+ if (write) {
71+ page_reclaimed = shrink_all_memory(nr_to_reclaim);
72+ if (page_reclaimed > 0)
73+ lru_add_drain_all();
74+
75+ if (page_reclaimed != nr_to_reclaim)
76+ return page_reclaimed;
77+ }
78+
79+ return 0;
80+}
81diff --git a/include/linux/mm.h b/include/linux/mm.h
82index 833f23d98baa..0bb66c1c31c9 100644
83--- a/include/linux/mm.h
84+++ b/include/linux/mm.h
85@@ -2308,6 +2308,10 @@ extern int kvm_ret_mem_advice;
86 int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
87 void __user *buffer, size_t *length,
88 loff_t *ppos);
89+extern int sysctl_shrink_caches_mb;
90+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
91+ void __user *buffer, size_t *length,
92+ loff_t *ppos);
93 #endif
94
95 void drop_slab(void);
96diff --git a/kernel/sysctl.c b/kernel/sysctl.c
97index d8ae774fa042..5dc9a46ae212 100644
98--- a/kernel/sysctl.c
99+++ b/kernel/sysctl.c
100@@ -1405,6 +1405,14 @@ static struct ctl_table vm_table[] = {
101 .mode = 0644,
102 .proc_handler = kvm_madv_instant_free_sysctl_handler,
103 },
104+ {
105+ .procname = "shrink_caches_mb",
106+ .data = &sysctl_shrink_caches_mb,
107+ .maxlen = sizeof(int),
108+ .mode = 0644,
109+ .proc_handler = shrink_caches_sysctl_handler,
110+ .extra1 = &one,
111+ },
112 #ifdef CONFIG_COMPACTION
113 {
114 .procname = "compact_memory",
115diff --git a/mm/vmscan.c b/mm/vmscan.c
116index 30a88b945a44..1198e74d1860 100644
117--- a/mm/vmscan.c
118+++ b/mm/vmscan.c
119@@ -3525,7 +3525,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
120 wake_up_interruptible(&pgdat->kswapd_wait);
121 }
122
123-#ifdef CONFIG_HIBERNATION
124 /*
125 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
126 * freed pages.
127@@ -3564,7 +3563,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
128
129 return nr_reclaimed;
130 }
131-#endif /* CONFIG_HIBERNATION */
132
133 /* It's optimal to keep kswapds on the same CPUs as their memory, but
134 not required for correctness. So if the last cpu in a node goes
135--
1362.12.1
137
diff --git a/patches/boot_time_opt/host_boot_time_opt.scc b/patches/boot_time_opt/host_boot_time_opt.scc
new file mode 100644
index 0000000..ec93999
--- /dev/null
+++ b/patches/boot_time_opt/host_boot_time_opt.scc
@@ -0,0 +1,29 @@
1define KFEATURE_DESCRIPTION "Boot time optimization changes ported from ClearLinux, https://github.com/clearlinux-pkgs/linux-lts and https://github.com/clearlinux-pkgs/linux-kvm"
2define KFEATURE_COMPATIBILITY all
3
4patch 0101-kvm-silence-kvm-unhandled-rdmsr.patch
5patch 0102-i8042-decrease-debug-message-level-to-info.patch
6patch 0104-Increase-the-ext4-default-commit-age.patch
7patch 0105-silence-rapl.patch
8patch 0106-pci-pme-wakeups.patch
9patch 0107-ksm-wakeups.patch
10patch 0108-intel_idle-tweak-cpuidle-cstates.patch
11patch 0110-init_task-faster-timerslack.patch
12patch 0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch
13patch 0113-overload-on-wakeup.patch
14patch 0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch
15patch 0115-fix-initcall-timestamps.patch
16patch 0116-smpboot-reuse-timer-calibration.patch
17patch 0118-Initialize-ata-before-graphics.patch
18patch 0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch
19patch 0120-give-rdrand-some-credit.patch
20patch 0121-e1000e-change-default-policy.patch
21patch 0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch
22patch 0123-igb-no-runtime-pm-to-fix-reboot-oops.patch
23patch 0124-tweak-perfbias.patch
24patch 0125-e1000e-increase-pause-and-refresh-time.patch
25
26patch 0151-mm-Export-do_madvise.patch
27patch 0152-x86-kvm-Notify-host-to-release-pages.patch
28patch 0153-x86-Return-memory-from-guest-to-host-kernel.patch
29patch 0154-sysctl-vm-Fine-grained-cache-shrinking.patch
diff --git a/patches/boot_time_opt/raid_alg.cfg b/patches/boot_time_opt/raid_alg.cfg
new file mode 100644
index 0000000..6df4a7c
--- /dev/null
+++ b/patches/boot_time_opt/raid_alg.cfg
@@ -0,0 +1,3 @@
1CONFIG_RAID6_FORCE_ALGO=y
2CONFIG_RAID6_FORCE_INT=y
3CONFIG_RAID6_FORCE_AVX2=y
diff --git a/patches/boot_time_opt/raid_alg.scc b/patches/boot_time_opt/raid_alg.scc
new file mode 100644
index 0000000..98dd713
--- /dev/null
+++ b/patches/boot_time_opt/raid_alg.scc
@@ -0,0 +1,5 @@
1define KFEATURE_DESCRIPTION "Use AVX2 for RAID recovery algorithm"
2define KFEATURE_COMPATIBILITY all
3
4patch 0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch
5kconf non-hardware raid_alg.cfg
diff --git a/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch
new file mode 100644
index 0000000..1de2a6b
--- /dev/null
+++ b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch
@@ -0,0 +1,34 @@
1From 6b0fb5b2a7a157c04d8ab6ad71b092034d0048bf Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:19:26 -0600
4Subject: [PATCH 102/114] cpuidle: skip synchronize_rcu() on single CPU systems
5
6synchronize_rcu() is pretty expensive, and on single CPU systems we don't need
7it in this specific case, so skip it.
8
9Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 drivers/cpuidle/cpuidle.c | 5 ++++-
13 1 file changed, 4 insertions(+), 1 deletion(-)
14
15diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
16index 62810ff3b00f..f1d110411098 100644
17--- a/drivers/cpuidle/cpuidle.c
18+++ b/drivers/cpuidle/cpuidle.c
19@@ -324,8 +324,11 @@ void cpuidle_uninstall_idle_handler(void)
20 /*
21 * Make sure external observers (such as the scheduler)
22 * are done looking at pointed idle states.
23+ * This is only relevant if there is more than one cpu,
24+ * if there is only one CPU, that is us... and we're
25+ * coherent to ourselves.
26 */
27- synchronize_rcu();
28+
29 }
30
31 /**
32--
332.11.1
34
diff --git a/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
new file mode 100644
index 0000000..d3a20fb
--- /dev/null
+++ b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
@@ -0,0 +1,38 @@
1From 7be707833bb35c295eb702d13cf73ac9390e4b31 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:25:16 -0600
4Subject: [PATCH 103/114] sysrq: skip synchronize_rcu() if there is no old op
5
6synchronize_rcu() is expensive. Currently it is called as part of the sysrq
7registration/unregistration, which happens during boot several times.
8Now, the reason for the synchronize_rcu() is to allow an old registered
9operation to expire properly... which is pointless if the old operation
10is NULL...
11So we can save the common case of the old operation being NULL a lot of time
12by just checking for non-NULL prior to the synchronize_rcu()
13
14Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
15Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
16---
17 drivers/tty/sysrq.c | 4 +++-
18 1 file changed, 3 insertions(+), 1 deletion(-)
19
20diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
21index 701c085bb19b..c60c7ba57ad9 100644
22--- a/drivers/tty/sysrq.c
23+++ b/drivers/tty/sysrq.c
24@@ -1065,8 +1065,10 @@ static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p,
25 * A concurrent __handle_sysrq either got the old op or the new op.
26 * Wait for it to go away before returning, so the code for an old
27 * op is not freed (eg. on module unload) while it is in use.
28+ * This is only relevant if the old op is not NULL of course.
29 */
30- synchronize_rcu();
31+ if (remove_op_p)
32+ synchronize_rcu();
33
34 return retval;
35 }
36--
372.11.1
38
diff --git a/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch
new file mode 100644
index 0000000..715c195
--- /dev/null
+++ b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch
@@ -0,0 +1,26 @@
1From 5899ff79ed4e3514420e1530a3588a922832dae5 Mon Sep 17 00:00:00 2001
2From: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
3Date: Mon, 13 Apr 2015 11:26:36 -0500
4Subject: [PATCH 104/114] fbcon: enable no blink by default
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7---
8 drivers/video/console/fbcon.c | 2 +-
9 1 file changed, 1 insertion(+), 1 deletion(-)
10
11diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
12index a44f5627b82a..95b73366b86f 100644
13--- a/drivers/video/console/fbcon.c
14+++ b/drivers/video/console/fbcon.c
15@@ -146,7 +146,7 @@ static const struct consw fb_con;
16
17 static int fbcon_set_origin(struct vc_data *);
18
19-static int fbcon_cursor_noblink;
20+static int fbcon_cursor_noblink = 1;
21
22 #define divides(a, b) ((!(a) || (b)%(a)) ? 0 : 1)
23
24--
252.11.1
26
diff --git a/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch
new file mode 100644
index 0000000..09b109a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch
@@ -0,0 +1,28 @@
1From ff47b4e9be8113b4ba05d6f2afee3db6904bc10f Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:47:20 -0600
4Subject: [PATCH 105/114] vmstats: wakeups
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 mm/vmstat.c | 2 +-
11 1 file changed, 1 insertion(+), 1 deletion(-)
12
13diff --git a/mm/vmstat.c b/mm/vmstat.c
14index 7c28df36f50f..efe1b6797139 100644
15--- a/mm/vmstat.c
16+++ b/mm/vmstat.c
17@@ -1549,7 +1549,7 @@ static const struct file_operations proc_vmstat_file_operations = {
18 #ifdef CONFIG_SMP
19 static struct workqueue_struct *vmstat_wq;
20 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
21-int sysctl_stat_interval __read_mostly = HZ;
22+int sysctl_stat_interval __read_mostly = 8 * HZ;
23
24 #ifdef CONFIG_PROC_FS
25 static void refresh_vm_stats(struct work_struct *work)
26--
272.11.1
28
diff --git a/patches/boot_time_opt_guest/0106-pci-probe.patch b/patches/boot_time_opt_guest/0106-pci-probe.patch
new file mode 100644
index 0000000..5045926
--- /dev/null
+++ b/patches/boot_time_opt_guest/0106-pci-probe.patch
@@ -0,0 +1,123 @@
1From b225caf8f743b9f5f9e84d0df711ee0c17e049ae Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 16:53:08 -0600
4Subject: [PATCH 106/114] pci: probe
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 drivers/pci/probe.c | 43 ++++++++++++++++++++++++++++++++++++++++---
11 1 file changed, 40 insertions(+), 3 deletions(-)
12
13diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
14index 204960e70333..7399a06698da 100644
15--- a/drivers/pci/probe.c
16+++ b/drivers/pci/probe.c
17@@ -182,6 +182,10 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
18
19 mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
20
21+ res->name = pci_name(dev);
22+
23+ printk("clr: Starting probe for %s\n", res->name);
24+
25 /* No printks while decoding is disabled! */
26 if (!dev->mmio_always_on) {
27 pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
28@@ -191,8 +195,6 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
29 }
30 }
31
32- res->name = pci_name(dev);
33-
34 pci_read_config_dword(dev, pos, &l);
35 pci_write_config_dword(dev, pos, l | mask);
36 pci_read_config_dword(dev, pos, &sz);
37@@ -324,6 +326,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
38 if (dev->non_compliant_bars)
39 return;
40
41+ printk("clr: pci_read_bases start\n");
42+
43 for (pos = 0; pos < howmany; pos++) {
44 struct resource *res = &dev->resource[pos];
45 reg = PCI_BASE_ADDRESS_0 + (pos << 2);
46@@ -332,11 +336,13 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
47
48 if (rom) {
49 struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
50+ printk("clr: rom path\n");
51 dev->rom_base_reg = rom;
52 res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
53 IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
54 __pci_read_base(dev, pci_bar_mem32, res, rom);
55 }
56+ printk("clr: pci_read_bases end\n");
57 }
58
59 static void pci_read_bridge_io(struct pci_bus *child)
60@@ -1311,6 +1317,28 @@ static void pci_msi_setup_pci_dev(struct pci_dev *dev)
61 pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
62 }
63
64+static int guess_bar_count(int class)
65+{
66+ if (class == 0x068000)
67+ return 0;
68+ if (class == 0x020000)
69+ return 2;
70+ if (class == 0x010000)
71+ return 2;
72+ if (class == 0x00ff00)
73+ return 1;
74+ return 6;
75+}
76+
77+static int has_rom(int class, int rom)
78+{
79+ if (class == 0x020000)
80+ return 0;
81+ if (class == 0x010000 || class == 0x00ff00)
82+ return 0;
83+ return rom;
84+}
85+
86 /**
87 * pci_setup_device - fill in class and map information of a device
88 * @dev: the device structure to fill
89@@ -1329,6 +1357,9 @@ int pci_setup_device(struct pci_dev *dev)
90 int pos = 0;
91 struct pci_bus_region region;
92 struct resource *res;
93+ int maxbar;
94+
95+ printk("clr: pci_setup_device start\n");
96
97 if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
98 return -EIO;
99@@ -1383,7 +1414,11 @@ int pci_setup_device(struct pci_dev *dev)
100 if (class == PCI_CLASS_BRIDGE_PCI)
101 goto bad;
102 pci_read_irq(dev);
103- pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
104+
105+ maxbar = guess_bar_count(dev->class);
106+
107+ if (class != PCI_CLASS_STORAGE_IDE)
108+ pci_read_bases(dev, maxbar, has_rom(dev->class, PCI_ROM_ADDRESS));
109 pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
110 pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
111
112@@ -1468,6 +1503,8 @@ int pci_setup_device(struct pci_dev *dev)
113 dev->class = PCI_CLASS_NOT_DEFINED << 8;
114 }
115
116+ printk("clr: pci_setup_device end\n");
117+
118 /* We found a fine healthy device, go go go... */
119 return 0;
120 }
121--
1222.11.1
123
diff --git a/patches/boot_time_opt_guest/0107-cgroup.patch b/patches/boot_time_opt_guest/0107-cgroup.patch
new file mode 100644
index 0000000..d68c686
--- /dev/null
+++ b/patches/boot_time_opt_guest/0107-cgroup.patch
@@ -0,0 +1,107 @@
1From 0adc5bfd84939d11d3c172eab0a00bfab4aadb46 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Fri, 28 Aug 2015 11:00:36 -0500
4Subject: [PATCH 107/114] cgroup
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9Signed-off-by: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com>
10---
11 include/linux/cgroup-defs.h | 2 +-
12 kernel/cgroup.c | 24 ++++++++++++++----------
13 2 files changed, 15 insertions(+), 11 deletions(-)
14
15diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
16index 861b4677fc5b..5d3c345ee60c 100644
17--- a/include/linux/cgroup-defs.h
18+++ b/include/linux/cgroup-defs.h
19@@ -137,7 +137,7 @@ struct cgroup_subsys_state {
20
21 /* percpu_ref killing and RCU release */
22 struct rcu_head rcu_head;
23- struct work_struct destroy_work;
24+ struct delayed_work destroy_work;
25 };
26
27 /*
28diff --git a/kernel/cgroup.c b/kernel/cgroup.c
29index 53bbca7c4859..6de39d8213ed 100644
30--- a/kernel/cgroup.c
31+++ b/kernel/cgroup.c
32@@ -73,7 +73,7 @@
33 * Expiring in the middle is a performance problem not a correctness one.
34 * 1 sec should be enough.
35 */
36-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
37+#define CGROUP_PIDLIST_DESTROY_DELAY round_jiffies_relative(HZ)
38
39 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
40 MAX_CFTYPE_NAME + 2)
41@@ -4986,8 +4986,9 @@ static struct cftype cgroup_legacy_base_files[] = {
42 */
43 static void css_free_work_fn(struct work_struct *work)
44 {
45+ struct delayed_work *dwork = to_delayed_work(work);
46 struct cgroup_subsys_state *css =
47- container_of(work, struct cgroup_subsys_state, destroy_work);
48+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
49 struct cgroup_subsys *ss = css->ss;
50 struct cgroup *cgrp = css->cgroup;
51
52@@ -5036,14 +5037,15 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
53 struct cgroup_subsys_state *css =
54 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
55
56- INIT_WORK(&css->destroy_work, css_free_work_fn);
57- queue_work(cgroup_destroy_wq, &css->destroy_work);
58+ INIT_DELAYED_WORK(&css->destroy_work, css_free_work_fn);
59+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
60 }
61
62 static void css_release_work_fn(struct work_struct *work)
63 {
64+ struct delayed_work *dwork = to_delayed_work(work);
65 struct cgroup_subsys_state *css =
66- container_of(work, struct cgroup_subsys_state, destroy_work);
67+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
68 struct cgroup_subsys *ss = css->ss;
69 struct cgroup *cgrp = css->cgroup;
70
71@@ -5088,8 +5090,9 @@ static void css_release(struct percpu_ref *ref)
72 struct cgroup_subsys_state *css =
73 container_of(ref, struct cgroup_subsys_state, refcnt);
74
75- INIT_WORK(&css->destroy_work, css_release_work_fn);
76- queue_work(cgroup_destroy_wq, &css->destroy_work);
77+ INIT_DELAYED_WORK(&css->destroy_work, css_release_work_fn);
78+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
79+
80 }
81
82 static void init_and_link_css(struct cgroup_subsys_state *css,
83@@ -5371,8 +5374,9 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
84 */
85 static void css_killed_work_fn(struct work_struct *work)
86 {
87+ struct delayed_work *dwork = to_delayed_work(work);
88 struct cgroup_subsys_state *css =
89- container_of(work, struct cgroup_subsys_state, destroy_work);
90+ container_of(dwork, struct cgroup_subsys_state, destroy_work);
91
92 mutex_lock(&cgroup_mutex);
93
94@@ -5393,8 +5397,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
95 container_of(ref, struct cgroup_subsys_state, refcnt);
96
97 if (atomic_dec_and_test(&css->online_cnt)) {
98- INIT_WORK(&css->destroy_work, css_killed_work_fn);
99- queue_work(cgroup_destroy_wq, &css->destroy_work);
100+ INIT_DELAYED_WORK(&css->destroy_work, css_killed_work_fn);
101+ queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY);
102 }
103 }
104
105--
1062.11.1
107
diff --git a/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch
new file mode 100644
index 0000000..48be94a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch
@@ -0,0 +1,45 @@
1From 634947be6c24d844af5f6ecf59453f2ddc09e032 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 11 Feb 2015 17:28:14 -0600
4Subject: [PATCH 108/114] smpboot: reuse timer calibration
5
6No point recalibrating for a known-constant TSC; this saves 200ms+ of boot time.
7
8Author: Arjan van de Ven <arjan@linux.intel.com>
9
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 arch/x86/kernel/smpboot.c | 2 +-
13 arch/x86/kernel/tsc.c | 3 +++
14 2 files changed, 4 insertions(+), 1 deletion(-)
15
16diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
17index 99b920d0e516..e17bb425bb52 100644
18--- a/arch/x86/kernel/smpboot.c
19+++ b/arch/x86/kernel/smpboot.c
20@@ -761,7 +761,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
21 pr_debug("Waiting for send to finish...\n");
22 send_status = safe_apic_wait_icr_idle();
23
24- udelay(init_udelay);
25+ udelay(100);
26
27 pr_debug("Deasserting INIT\n");
28
29diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
30index 37e7cf544e51..e99be8a6a132 100644
31--- a/arch/x86/kernel/tsc.c
32+++ b/arch/x86/kernel/tsc.c
33@@ -1413,6 +1413,9 @@ unsigned long calibrate_delay_is_known(void)
34 if (!mask)
35 return 0;
36
37+ if (cpu !=0)
38+ return cpu_data(0).loops_per_jiffy;
39+
40 sibling = cpumask_any_but(mask, cpu);
41 if (sibling < nr_cpu_ids)
42 return cpu_data(sibling).loops_per_jiffy;
43--
442.11.1
45
diff --git a/patches/boot_time_opt_guest/0109-perf.patch b/patches/boot_time_opt_guest/0109-perf.patch
new file mode 100644
index 0000000..75f50f6
--- /dev/null
+++ b/patches/boot_time_opt_guest/0109-perf.patch
@@ -0,0 +1,28 @@
1From cce700dfbd5fdbf72b96e6479ca539ab4d880ce2 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Wed, 4 Nov 2015 15:17:10 -0600
4Subject: [PATCH 109/114] perf
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7
8Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
9---
10 arch/x86/events/intel/core.c | 2 +-
11 1 file changed, 1 insertion(+), 1 deletion(-)
12
13diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
14index eb1484c86bb4..c13ea26ac066 100644
15--- a/arch/x86/events/intel/core.c
16+++ b/arch/x86/events/intel/core.c
17@@ -4040,7 +4040,7 @@ __init int intel_pmu_init(void)
18 */
19 if (x86_pmu.extra_regs) {
20 for (er = x86_pmu.extra_regs; er->msr; er++) {
21- er->extra_msr_access = check_msr(er->msr, 0x11UL);
22+ er->extra_msr_access = false;
23 /* Disable LBR select mapping */
24 if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
25 x86_pmu.lbr_sel_map = NULL;
26--
272.11.1
28
diff --git a/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch
new file mode 100644
index 0000000..742a045
--- /dev/null
+++ b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch
@@ -0,0 +1,190 @@
1From c662d99134b67c58e63ecc17c2531588a3a51596 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Sat, 14 Feb 2015 09:49:41 -0600
4Subject: [PATCH 110/114] pci: probe: identify known devices
5
6Author: Arjan van de Ven <arjan@linux.intel.com>
7Modify-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
8
9Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
10---
11 drivers/pci/probe.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++
12 1 file changed, 156 insertions(+)
13
14diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
15index 7399a06698da..4fb2d7fed4c5 100644
16--- a/drivers/pci/probe.c
17+++ b/drivers/pci/probe.c
18@@ -163,6 +163,159 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
19
20 #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
21
22+/* shortcut version of __pci_read_base where we know the sizes already */
23+int __pci_read_base_shortcut(struct pci_dev *dev, enum pci_bar_type type,
24+ struct resource *res, unsigned int pos, u32 sz_in, u32 sz2_in)
25+{
26+ u32 l, sz;
27+ u64 l64, sz64, mask64;
28+ struct pci_bus_region region, inverted_region;
29+
30+ res->name = pci_name(dev);
31+
32+ pci_read_config_dword(dev, pos, &l);
33+
34+ sz = sz_in;
35+
36+ /*
37+ * All bits set in sz means the device isn't working properly.
38+ * If the BAR isn't implemented, all bits must be 0. If it's a
39+ * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit
40+ * 1 must be clear.
41+ * Here we set the size and is not 0xffffffff
42+ */
43+
44+ /*
45+ * I don't know how l can have all bits set. Copied from old code.
46+ * Maybe it fixes a bug on some ancient platform.
47+ */
48+ if (l == 0xffffffff)
49+ l = 0;
50+
51+ if (type == pci_bar_unknown) {
52+ res->flags = decode_bar(dev, l);
53+ res->flags |= IORESOURCE_SIZEALIGN;
54+ if (res->flags & IORESOURCE_IO) {
55+ l64 = l & PCI_BASE_ADDRESS_IO_MASK;
56+ sz64 = sz & PCI_BASE_ADDRESS_IO_MASK;
57+ mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT;
58+ } else {
59+ l64 = l & PCI_BASE_ADDRESS_MEM_MASK;
60+ sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK;
61+ mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK;
62+ }
63+ } else {
64+ res->flags |= (l & IORESOURCE_ROM_ENABLE);
65+ l64 = l & PCI_ROM_ADDRESS_MASK;
66+ sz64 = sz & PCI_ROM_ADDRESS_MASK;
67+ mask64 = (u32)PCI_ROM_ADDRESS_MASK;
68+ }
69+
70+ if (res->flags & IORESOURCE_MEM_64) {
71+ pci_read_config_dword(dev, pos + 4, &l);
72+ sz = sz2_in;
73+
74+ l64 |= ((u64)l << 32);
75+ sz64 |= ((u64)sz << 32);
76+ mask64 |= ((u64)~0 << 32);
77+ }
78+
79+ if (!sz64)
80+ goto fail;
81+
82+ sz64 = pci_size(l64, sz64, mask64);
83+ if (!sz64) {
84+ dev_info(&dev->dev, FW_BUG "reg 0x%x: invalid BAR (can't size)\n",
85+ pos);
86+ goto fail;
87+ }
88+
89+ if (res->flags & IORESOURCE_MEM_64) {
90+ if ((sizeof(dma_addr_t) < 8 || sizeof(resource_size_t) < 8) &&
91+ sz64 > 0x100000000ULL) {
92+ res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED;
93+ res->start = 0;
94+ res->end = 0;
95+ dev_err(&dev->dev, "reg 0x%x: can't handle BAR larger than 4GB (size %#010llx)\n",
96+ pos, (unsigned long long)sz64);
97+ goto out;
98+ }
99+
100+ if ((sizeof(dma_addr_t) < 8) && l) {
101+ /* Above 32-bit boundary; try to reallocate */
102+ res->flags |= IORESOURCE_UNSET;
103+ res->start = 0;
104+ res->end = sz64;
105+ dev_info(&dev->dev, "reg 0x%x: can't handle BAR above 4GB (bus address %#010llx)\n",
106+ pos, (unsigned long long)l64);
107+ goto out;
108+ }
109+ }
110+
111+ region.start = l64;
112+ region.end = l64 + sz64;
113+
114+ pcibios_bus_to_resource(dev->bus, res, &region);
115+ pcibios_resource_to_bus(dev->bus, &inverted_region, res);
116+
117+ /*
118+ * If "A" is a BAR value (a bus address), "bus_to_resource(A)" is
119+ * the corresponding resource address (the physical address used by
120+ * the CPU. Converting that resource address back to a bus address
121+ * should yield the original BAR value:
122+ *
123+ * resource_to_bus(bus_to_resource(A)) == A
124+ *
125+ * If it doesn't, CPU accesses to "bus_to_resource(A)" will not
126+ * be claimed by the device.
127+ */
128+ if (inverted_region.start != region.start) {
129+ res->flags |= IORESOURCE_UNSET;
130+ res->start = 0;
131+ res->end = region.end - region.start;
132+ dev_info(&dev->dev, "reg 0x%x: initial BAR value %#010llx invalid\n",
133+ pos, (unsigned long long)region.start);
134+ }
135+
136+ goto out;
137+
138+
139+fail:
140+ res->flags = 0;
141+out:
142+ if (res->flags)
143+ dev_printk(KERN_DEBUG, &dev->dev, "reg 0x%x: %pR\n", pos, res);
144+
145+ return (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
146+}
147+
148+static int is_known_device(struct pci_dev *dev, int pos, int *sz)
149+{
150+ /* Red Hat, Inc : Virtio network device */
151+ if (dev->vendor == 0x1af4 && dev->device == 0x1000) {
152+ if (pos == 0x10) {
153+ *sz = 0xffffffe1;
154+ return 1;
155+ }
156+ if (pos == 0x14) {
157+ *sz = 0xfffff000;
158+ return 1;
159+ }
160+ }
161+ /* Red Hat, Inc : Virtio block device */
162+ if (dev->vendor == 0x1af4 && dev->device == 0x1001) {
163+ if (pos == 0x10) {
164+ *sz = 0xffffffc1;
165+ return 1;
166+ }
167+ if (pos == 0x14) {
168+ *sz = 0xfffff000;
169+ return 1;
170+ }
171+ }
172+ return 0;
173+}
174+
175 /**
176 * pci_read_base - read a PCI BAR
177 * @dev: the PCI device
178@@ -182,6 +335,9 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
179
180 mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
181
182+ if (is_known_device(dev, pos, &sz))
183+ return __pci_read_base_shortcut(dev, type, res, pos, sz, 0);
184+
185 res->name = pci_name(dev);
186
187 printk("clr: Starting probe for %s\n", res->name);
188--
1892.11.1
190
diff --git a/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch
new file mode 100644
index 0000000..701a18d
--- /dev/null
+++ b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch
@@ -0,0 +1,39 @@
1From be2ab4809c6b5058fbf3cd54c0f59c56416e572c Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 22 Jun 2015 09:33:33 -0500
4Subject: [PATCH 111/114] init: no wait for the known devices
5
6Do not wait for the known devices to complete their probing.
7
8Author: Arjan van de Ven <arjan@linux.intel.com>
9
10Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
11---
12 init/do_mounts.c | 4 +++-
13 1 file changed, 3 insertions(+), 1 deletion(-)
14
15diff --git a/init/do_mounts.c b/init/do_mounts.c
16index c2de5104aad2..40725f0f5fb3 100644
17--- a/init/do_mounts.c
18+++ b/init/do_mounts.c
19@@ -28,6 +28,7 @@
20 #include <linux/slab.h>
21 #include <linux/ramfs.h>
22 #include <linux/shmem_fs.h>
23+#include <linux/async.h>
24
25 #include <linux/nfs_fs.h>
26 #include <linux/nfs_fs_sb.h>
27@@ -563,7 +564,8 @@ void __init prepare_namespace(void)
28 * For example, it is not atypical to wait 5 seconds here
29 * for the touchpad of a laptop to initialize.
30 */
31- wait_for_device_probe();
32+ //wait_for_device_probe();
33+ async_synchronize_full();
34
35 md_run_setup();
36
37--
382.11.1
39
diff --git a/patches/boot_time_opt_guest/0112-ksm-wakeups.patch b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch
new file mode 100644
index 0000000..b131e3f
--- /dev/null
+++ b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch
@@ -0,0 +1,32 @@
1From 2dc48e4b5c651691b7028991b64c935047b41b19 Mon Sep 17 00:00:00 2001
2From: Arjan van de Ven <arjan@linux.intel.com>
3Date: Mon, 14 Mar 2016 11:06:46 -0600
4Subject: [PATCH 112/114] ksm-wakeups
5
6reduce wakeups in ksm
7---
8 mm/ksm.c | 8 ++++++--
9 1 file changed, 6 insertions(+), 2 deletions(-)
10
11diff --git a/mm/ksm.c b/mm/ksm.c
12index 9ae6011a41f8..eecd3ff669e2 100644
13--- a/mm/ksm.c
14+++ b/mm/ksm.c
15@@ -1725,8 +1725,12 @@ static int ksm_scan_thread(void *nothing)
16 try_to_freeze();
17
18 if (ksmd_should_run()) {
19- schedule_timeout_interruptible(
20- msecs_to_jiffies(ksm_thread_sleep_millisecs));
21+ if (ksm_thread_sleep_millisecs >= 1000)
22+ schedule_timeout_interruptible(
23+ msecs_to_jiffies(round_jiffies_relative(ksm_thread_sleep_millisecs)));
24+ else
25+ schedule_timeout_interruptible(
26+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
27 } else {
28 wait_event_freezable(ksm_thread_wait,
29 ksmd_should_run() || kthread_should_stop());
30--
312.11.1
32
diff --git a/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch
new file mode 100644
index 0000000..047eddb
--- /dev/null
+++ b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch
@@ -0,0 +1,42 @@
1From 179b7f41d5509f93cd297cc81c5d8da4a3123d9d Mon Sep 17 00:00:00 2001
2From: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
3Date: Fri, 20 Nov 2015 14:01:26 -0600
4Subject: [PATCH 113/114] init: do_mounts: recreate /dev/root
5
6The rootfs shows as mounted on /dev/root, but this device is not present in
7the /dev directory.
8
9Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
10---
11 init/do_mounts.c | 8 ++++++++
12 1 file changed, 8 insertions(+)
13
14diff --git a/init/do_mounts.c b/init/do_mounts.c
15index 40725f0f5fb3..78b5b1dba8ca 100644
16--- a/init/do_mounts.c
17+++ b/init/do_mounts.c
18@@ -550,6 +550,7 @@ void __init mount_root(void)
19 void __init prepare_namespace(void)
20 {
21 int is_floppy;
22+ int err;
23
24 if (root_delay) {
25 printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
26@@ -604,6 +605,13 @@ void __init prepare_namespace(void)
27 devtmpfs_mount("dev");
28 sys_mount(".", "/", NULL, MS_MOVE, NULL);
29 sys_chroot(".");
30+#ifdef CONFIG_BLOCK
31+ /* recreate the /dev/root */
32+ err = create_dev("/dev/root", ROOT_DEV);
33+
34+ if (err < 0)
35+ pr_emerg("Failed to create /dev/root: %d\n", err);
36+#endif
37 }
38
39 static bool is_tmpfs;
40--
412.11.1
42
diff --git a/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
new file mode 100644
index 0000000..dee9058
--- /dev/null
+++ b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch
@@ -0,0 +1,56 @@
1From 02fd2e6a7c708bf973209f9b238c5c61cbf15239 Mon Sep 17 00:00:00 2001
2From: Alan Cox <alan@linux.intel.com>
3Date: Thu, 10 Mar 2016 15:11:28 +0000
4Subject: [PATCH 114/114] xattr: allow setting user.* attributes on symlinks by
5 owner
6
7Kvmtool and clear containers support using user attributes to label host
8files with the virtual uid/guid of the file in the container. This allows an
9end user to manage their files and a complete uid space without all the ugly
10namespace stuff.
11
12The one gap in the support is symlinks because an end user can change the
13ownership of a symbolic link. We support attributes on these files as you
14can already (as root) set security attributes on them.
15
16The current rules seem slightly over-paranoid and as we have a use case this
17patch enables updating the attributes on a symbolic link IFF you are the
18owner of the symlink (as permissions are not usually meaningful on the link
19itself).
20
21Signed-off-by: Alan Cox <alan@linux.intel.com>
22---
23 fs/xattr.c | 14 ++++++++------
24 1 file changed, 8 insertions(+), 6 deletions(-)
25
26diff --git a/fs/xattr.c b/fs/xattr.c
27index 7e3317cf4045..e005c30acb2c 100644
28--- a/fs/xattr.c
29+++ b/fs/xattr.c
30@@ -118,15 +118,17 @@ xattr_permission(struct inode *inode, const char *name, int mask)
31 }
32
33 /*
34- * In the user.* namespace, only regular files and directories can have
35- * extended attributes. For sticky directories, only the owner and
36- * privileged users can write attributes.
37+ * In the user.* namespace, only regular files, symbolic links, and
38+ * directories can have extended attributes. For symbolic links and
39+ * sticky directories, only the owner and privileged users can write
40+ * attributes.
41 */
42 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
43- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
44+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
45 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
46- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
47- (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
48+ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX))
49+ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE)
50+ && !inode_owner_or_capable(inode))
51 return -EPERM;
52 }
53
54--
552.11.1
56
diff --git a/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch
new file mode 100644
index 0000000..a6dbff7
--- /dev/null
+++ b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch
@@ -0,0 +1,84 @@
1From 99b4cdcce43ad0f706120bef26fef8c628c572cf Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:03:52 -0800
4Subject: [PATCH 151/154] mm: Export do_madvise()
5
6Combined with some interesting flags, the madvise() system call
7allows freeing memory more smartly and more efficiently than
8we could with a simple free(). The issue is that it is not
9available to kernel modules that could need it.
10
11In order to solve this lack of support, this patch exports
12do_madvise() so as to make it available to the entire kernel.
13The already existing madvise() system call is unchanged and
14now relies on this new do_madvise() function.
15
16Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
17Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
18---
19 include/linux/mm.h | 2 ++
20 mm/madvise.c | 25 +++++++++++++++++++++----
21 2 files changed, 23 insertions(+), 4 deletions(-)
22
23diff --git a/include/linux/mm.h b/include/linux/mm.h
24index 0b5b2e4df14e..925ec25f99a8 100644
25--- a/include/linux/mm.h
26+++ b/include/linux/mm.h
27@@ -2450,5 +2450,7 @@ void __init setup_nr_node_ids(void);
28 static inline void setup_nr_node_ids(void) {}
29 #endif
30
31+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
32+
33 #endif /* __KERNEL__ */
34 #endif /* _LINUX_MM_H */
35diff --git a/mm/madvise.c b/mm/madvise.c
36index 93fb63e88b5e..c8bbf93d4978 100644
37--- a/mm/madvise.c
38+++ b/mm/madvise.c
39@@ -618,9 +618,7 @@ madvise_behavior_valid(int behavior)
40 }
41
42 /*
43- * The madvise(2) system call.
44- *
45- * Applications can use madvise() to advise the kernel how it should
46+ * Kernel modules can use do_madvise() to advise the kernel how it should
47 * handle paging I/O in this VM area. The idea is to help the kernel
48 * use appropriate read-ahead and caching techniques. The information
49 * provided is advisory only, and can be safely disregarded by the
50@@ -673,7 +671,7 @@ madvise_behavior_valid(int behavior)
51 * -EBADF - map exists, but area maps something that isn't a file.
52 * -EAGAIN - a kernel resource was temporarily unavailable.
53 */
54-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
55+int do_madvise(unsigned long start, size_t len_in, int behavior)
56 {
57 unsigned long end, tmp;
58 struct vm_area_struct *vma, *prev;
59@@ -767,3 +765,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
60
61 return error;
62 }
63+EXPORT_SYMBOL_GPL(do_madvise);
64+
65+/*
66+ * The madvise(2) system call.
67+ *
68+ * Applications can use madvise() system call to advise the kernel how
69+ * it should handle paging I/O in this VM area. The idea is to help
70+ * the kernel use appropriate read-ahead and caching techniques. The
71+ * information provided is advisory only, and can be safely disregarded
72+ * by the kernel without affecting the correct operation of the application.
73+ *
74+ * behavior values are the same as the ones documented for do_madvise()
75+ *
76+ * return values are the same as the ones documented for do_madvise()
77+ */
78+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
79+{
80+ return do_madvise(start, len_in, behavior);
81+}
82--
832.12.1
84
diff --git a/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch
new file mode 100644
index 0000000..5f44930
--- /dev/null
+++ b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch
@@ -0,0 +1,180 @@
1From d28921b5f797829e4e676f7968ae688ef96b7992 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:08:55 -0800
4Subject: [PATCH 152/154] x86: kvm: Notify host to release pages
5
6In context of hypervisors managing several virtual machines, we
7want those virtual machines to give the memory they used back to
8the host when they don't need it anymore.
9
10This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing
11the guest kernel to notify the host kernel when such event occurs.
12And relying on do_madvise() function that we have previously exported,
13it issues a call to this function when it receives the new hypercall.
14
15Use of do_madvise() with MADV_DONTNEED flag will allow the guest to
16ask for a new page without going through a new hypercall. Instead,
17it will be able to start using that memory again as it will get
18faulted back in as a fresh new page. That's why do_madvise() is more
19efficient than doing vm_unmap() to return some memory to the host.
20
21This patch also introduces a new sysctl, kvm_madv_instant_free,
22allowing the user to select MADV_FREE advice instead of MADV_DONTNEED.
23Indeed, MADV_FREE performs better than MADV_DONTNEED
24because it does not zero the pages in case the memory has not been
25freed by the kernel. This can happen when there was no need for the
26kernel to get this memory back, meaning it was keeping those pages
27in the right state to be re-used by the same application.
28MADV_FREE being a very recent advice introduced in kernel 4.5, we
29only want to enable it through a sysctl in case the user wants to
30use it.
31
32Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
33Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
34---
35 arch/x86/kvm/x86.c | 17 +++++++++++++++++
36 include/linux/mm.h | 5 +++++
37 include/uapi/linux/kvm_para.h | 3 +++
38 kernel/sysctl.c | 7 +++++++
39 mm/Makefile | 2 +-
40 mm/kvm.c | 25 +++++++++++++++++++++++++
41 6 files changed, 58 insertions(+), 1 deletion(-)
42 create mode 100644 mm/kvm.c
43
44diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
45index 582c75311f95..683a94dd5f03 100644
46--- a/arch/x86/kvm/x86.c
47+++ b/arch/x86/kvm/x86.c
48@@ -46,6 +46,7 @@
49 #include <linux/user-return-notifier.h>
50 #include <linux/srcu.h>
51 #include <linux/slab.h>
52+#include <linux/mm.h>
53 #include <linux/perf_event.h>
54 #include <linux/uaccess.h>
55 #include <linux/hash.h>
56@@ -6019,6 +6020,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
57 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
58 }
59
60+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
61+{
62+ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
63+
64+ if (len > KVM_MAX_RET_MEM_SIZE)
65+ return KVM_EPERM;
66+
67+ if (kvm_is_error_hva(start + len))
68+ return KVM_EFAULT;
69+
70+ return do_madvise(start, len, kvm_ret_mem_advice);
71+}
72+
73 void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
74 {
75 vcpu->arch.apicv_active = false;
76@@ -6065,6 +6079,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
77 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
78 ret = 0;
79 break;
80+ case KVM_HC_RETURN_MEM:
81+ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
82+ break;
83 default:
84 ret = -KVM_ENOSYS;
85 break;
86diff --git a/include/linux/mm.h b/include/linux/mm.h
87index 925ec25f99a8..833f23d98baa 100644
88--- a/include/linux/mm.h
89+++ b/include/linux/mm.h
90@@ -2303,6 +2303,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
91 extern int sysctl_drop_caches;
92 int drop_caches_sysctl_handler(struct ctl_table *, int,
93 void __user *, size_t *, loff_t *);
94+extern int sysctl_kvm_madv_instant_free;
95+extern int kvm_ret_mem_advice;
96+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
97+ void __user *buffer, size_t *length,
98+ loff_t *ppos);
99 #endif
100
101 void drop_slab(void);
102diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
103index bf6cd7d5cac2..7d90f77d87d0 100644
104--- a/include/uapi/linux/kvm_para.h
105+++ b/include/uapi/linux/kvm_para.h
106@@ -23,6 +23,9 @@
107 #define KVM_HC_MIPS_GET_CLOCK_FREQ 6
108 #define KVM_HC_MIPS_EXIT_VM 7
109 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8
110+#define KVM_HC_RETURN_MEM 10
111+
112+#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB
113
114 /*
115 * hypercalls use architecture specific
116diff --git a/kernel/sysctl.c b/kernel/sysctl.c
117index c1095cdc0fe2..d8ae774fa042 100644
118--- a/kernel/sysctl.c
119+++ b/kernel/sysctl.c
120@@ -1398,6 +1398,13 @@ static struct ctl_table vm_table[] = {
121 .extra1 = &one,
122 .extra2 = &four,
123 },
124+ {
125+ .procname = "kvm_madv_instant_free",
126+ .data = &sysctl_kvm_madv_instant_free,
127+ .maxlen = sizeof(int),
128+ .mode = 0644,
129+ .proc_handler = kvm_madv_instant_free_sysctl_handler,
130+ },
131 #ifdef CONFIG_COMPACTION
132 {
133 .procname = "compact_memory",
134diff --git a/mm/Makefile b/mm/Makefile
135index 295bd7a..6455723 100644
136--- a/mm/Makefile
137+++ b/mm/Makefile
138@@ -47,6 +47,8 @@ else
139 obj-y += bootmem.o
140 endif
141
142+obj-y += kvm.o
143+
144 obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
145 ifdef CONFIG_MMU
146 obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
147diff --git a/mm/kvm.c b/mm/kvm.c
148new file mode 100644
149index 000000000000..8945f6a311b9
150--- /dev/null
151+++ b/mm/kvm.c
152@@ -0,0 +1,25 @@
153+#include <linux/mman.h>
154+
155+int sysctl_kvm_madv_instant_free;
156+
157+int kvm_ret_mem_advice = MADV_DONTNEED;
158+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
159+
160+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
161+ void __user *buffer, size_t *length, loff_t *ppos)
162+{
163+ int ret;
164+
165+ ret = proc_dointvec(table, write, buffer, length, ppos);
166+ if (ret)
167+ return ret;
168+
169+#ifdef MADV_FREE
170+ if (sysctl_kvm_madv_instant_free > 0)
171+ kvm_ret_mem_advice = MADV_FREE;
172+ else
173+ kvm_ret_mem_advice = MADV_DONTNEED;
174+#endif
175+
176+ return 0;
177+}
178--
1792.12.1
180
diff --git a/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch
new file mode 100644
index 0000000..cdb876a
--- /dev/null
+++ b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch
@@ -0,0 +1,155 @@
1From 855ef164854307839c08c60688eaeac14f9a649e Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:26:13 -0800
4Subject: [PATCH 153/154] x86: Return memory from guest to host kernel
5
6All virtual machines need memory to perform various tasks, but this
7memory is not released to the host after it is not used anymore. We
8have to wait for the termination of the virtual machine to get this
9memory back into the host.
10
11Ballooning is similar, but it is not designed for the same purpose.
12In case we hit memory limits of the system, the host predicts how much
13memory can be asked back from a guest, and it issues a hypercall to
14retrieve this memory.
15
16The proposed solution is different because it does not wait until the
17host needs memory before returning it, and it knows precisely how much
18memory it can return.
19
20The way to notify the host side about such a return is to rely on
21the new hypercall KVM_HC_RETURN_MEM. To avoid overloading the CPU
22with too many hypercalls, we only return memory blocks of
23order 7 (512k blocks) and higher. This value was found by running
24memory tests using multiple threads allocating/freeing large amounts
25of memory. Those tests were run for different order values, and 7 was
26the best tradeoff between the number of hypercalls issued and the
27amount of memory returned to the host.
28
29In order to limit the performance impact of this code addition,
30we check for blocks of order 7 or higher. This means it only costs an
31additional function call and a branch to perform this check.
32
33Furthermore, this code has been added to the "merge" codepath of the
34buddy allocator, which is not as sensitive as the "free" codepath.
35Not all blocks going through the "free" codepath will end up in the
36"merge" codepath because some of them won't find their free buddy.
37But this is a negligible amount since the kernel does not use many
38high order blocks directly. Instead, those bigger blocks are often
39broken into smaller chunks used as low order blocks. At the time
40those small blocks are released, they go through the merge path.
41
42Benchmarks such as ebizzy and will-it-scale have been run in order
43to make sure this patch does not affect kernel performance, and no
44significant differences were observed.
45
46Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
47Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
48---
49 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
50 arch/x86/kernel/kvm.c | 10 ++++++++++
51 include/linux/mm-arch-hooks.h | 8 ++++++++
52 mm/page_alloc.c | 2 ++
53 4 files changed, 42 insertions(+)
54
55diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
56index bc62e7cbf1b1..4a2f6d1adbd2 100644
57--- a/arch/x86/include/asm/kvm_para.h
58+++ b/arch/x86/include/asm/kvm_para.h
59@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token);
60 void kvm_async_pf_task_wake(u32 token);
61 u32 kvm_read_and_reset_pf_reason(void);
62 extern void kvm_disable_steal_time(void);
63+void kvm_arch_return_memory(struct page *page, unsigned int order);
64+
65+/*
66+ * This order was determined empirically by running memory tests
67+ * through many iterations to assess the number of hypercalls issued
68+ * and the amount of memory returned. If you change this order to
69+ * 6 or 8, it should not impact performance significantly.
70+ *
71+ * Smaller values lead to less memory waste, but consume more CPU on
72+ * hypercalls. Larger values use less CPU, but inform the hypervisor
73+ * less precisely about which memory is free.
74+ */
75+#define RET_MEM_BUDDY_ORDER 7
76+
77+static inline void arch_buddy_merge(struct page *page, unsigned int order)
78+{
79+ if (order < RET_MEM_BUDDY_ORDER)
80+ return;
81+
82+ kvm_arch_return_memory(page, order);
83+}
84+#define arch_buddy_merge arch_buddy_merge
85
86 #ifdef CONFIG_PARAVIRT_SPINLOCKS
87 void __init kvm_spinlock_init(void);
88diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
89index edbbfc854e39..14167b3f6514 100644
90--- a/arch/x86/kernel/kvm.c
91+++ b/arch/x86/kernel/kvm.c
92@@ -552,6 +552,16 @@ static __init int activate_jump_labels(void)
93 }
94 arch_initcall(activate_jump_labels);
95
96+void kvm_arch_return_memory(struct page *page, unsigned int order)
97+{
98+ if (!kvm_para_available())
99+ return;
100+
101+ kvm_hypercall2(KVM_HC_RETURN_MEM,
102+ page_to_phys(page),
103+ PAGE_SIZE << order);
104+}
105+
106 #ifdef CONFIG_PARAVIRT_SPINLOCKS
107
108 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
109diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
110index 4efc3f56e6df..26eb3a05a8a3 100644
111--- a/include/linux/mm-arch-hooks.h
112+++ b/include/linux/mm-arch-hooks.h
113@@ -12,6 +12,7 @@
114 #define _LINUX_MM_ARCH_HOOKS_H
115
116 #include <asm/mm-arch-hooks.h>
117+#include <asm/kvm_para.h>
118
119 #ifndef arch_remap
120 static inline void arch_remap(struct mm_struct *mm,
121@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm,
122 #define arch_remap arch_remap
123 #endif
124
125+#ifndef arch_buddy_merge
126+static inline void arch_buddy_merge(struct page *page, unsigned int order)
127+{
128+}
129+#define arch_buddy_merge arch_buddy_merge
130+#endif
131+
132 #endif /* _LINUX_MM_ARCH_HOOKS_H */
133diff --git a/mm/page_alloc.c b/mm/page_alloc.c
134index 1460e6ad5e14..5f6e6371bc6f 100644
135--- a/mm/page_alloc.c
136+++ b/mm/page_alloc.c
137@@ -64,6 +64,7 @@
138 #include <linux/page_owner.h>
139 #include <linux/kthread.h>
140 #include <linux/memcontrol.h>
141+#include <linux/mm-arch-hooks.h>
142
143 #include <asm/sections.h>
144 #include <asm/tlbflush.h>
145@@ -855,6 +856,7 @@ static inline void __free_one_page(struct page *page,
146 }
147
148 done_merging:
149+ arch_buddy_merge(page, order);
150 set_page_order(page, order);
151
152 /*
153--
1542.12.1
155
diff --git a/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
new file mode 100644
index 0000000..07d4a83
--- /dev/null
+++ b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch
@@ -0,0 +1,137 @@
1From 2c145b5233b504f5226a0f4bc44baeef33b444d8 Mon Sep 17 00:00:00 2001
2From: Sebastien Boeuf <sebastien.boeuf@intel.com>
3Date: Mon, 23 Jan 2017 15:32:39 -0800
4Subject: [PATCH 154/154] sysctl: vm: Fine-grained cache shrinking
5
6Lots of virtual machines are left in an idle state for days until they
7are terminated, and they can keep a large amount of memory in their
8cache, meaning this memory cannot be used by other processes.
9
10We tried to release this memory using the existing drop_caches sysctl,
11but it led to complete cache loss, even though the cache could have
12been used if the idle process woke up. Indeed, the process then can't
13find any cached data, and rebuilding it from scratch directly affects
14performance.
15
16Instead, the solution we want is based on gradually shrinking the system
17cache over time. This patch adds a new sysctl shrink_caches_mb to
18allow userspace applications to tell the kernel that it should shrink
19the system cache by up to the amount (in MiB) specified.
20
21There is an application called "memshrinker" which uses this new
22mechanism. It runs in the background and periodically releases a
23specified amount of cache. This amount is based on the remaining
24cache on the system, and period is computed to follow a shrinking
25model. It results in saving a lot of memory for other processes
26running on the system.
27
28Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
29Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
30---
31 fs/drop_caches.c | 25 +++++++++++++++++++++++++
32 include/linux/mm.h | 4 ++++
33 kernel/sysctl.c | 8 ++++++++
34 mm/vmscan.c | 2 --
35 4 files changed, 37 insertions(+), 2 deletions(-)
36
37diff --git a/fs/drop_caches.c b/fs/drop_caches.c
38index d72d52b90433..f564dfcc13a4 100644
39--- a/fs/drop_caches.c
40+++ b/fs/drop_caches.c
41@@ -8,10 +8,12 @@
42 #include <linux/writeback.h>
43 #include <linux/sysctl.h>
44 #include <linux/gfp.h>
45+#include <linux/swap.h>
46 #include "internal.h"
47
48 /* A global variable is a bit ugly, but it keeps the code simple */
49 int sysctl_drop_caches;
50+int sysctl_shrink_caches_mb;
51
52 static void drop_pagecache_sb(struct super_block *sb, void *unused)
53 {
54@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
55 }
56 return 0;
57 }
58+
59+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
60+ void __user *buffer, size_t *length, loff_t *ppos)
61+{
62+ int ret;
63+ unsigned long nr_to_reclaim, page_reclaimed;
64+
65+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
66+ if (ret)
67+ return ret;
68+
69+ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE;
70+ if (write) {
71+ page_reclaimed = shrink_all_memory(nr_to_reclaim);
72+ if (page_reclaimed > 0)
73+ lru_add_drain_all();
74+
75+ if (page_reclaimed != nr_to_reclaim)
76+ return page_reclaimed;
77+ }
78+
79+ return 0;
80+}
81diff --git a/include/linux/mm.h b/include/linux/mm.h
82index 833f23d98baa..0bb66c1c31c9 100644
83--- a/include/linux/mm.h
84+++ b/include/linux/mm.h
85@@ -2308,6 +2308,10 @@ extern int kvm_ret_mem_advice;
86 int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
87 void __user *buffer, size_t *length,
88 loff_t *ppos);
89+extern int sysctl_shrink_caches_mb;
90+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
91+ void __user *buffer, size_t *length,
92+ loff_t *ppos);
93 #endif
94
95 void drop_slab(void);
96diff --git a/kernel/sysctl.c b/kernel/sysctl.c
97index d8ae774fa042..5dc9a46ae212 100644
98--- a/kernel/sysctl.c
99+++ b/kernel/sysctl.c
100@@ -1405,6 +1405,14 @@ static struct ctl_table vm_table[] = {
101 .mode = 0644,
102 .proc_handler = kvm_madv_instant_free_sysctl_handler,
103 },
104+ {
105+ .procname = "shrink_caches_mb",
106+ .data = &sysctl_shrink_caches_mb,
107+ .maxlen = sizeof(int),
108+ .mode = 0644,
109+ .proc_handler = shrink_caches_sysctl_handler,
110+ .extra1 = &one,
111+ },
112 #ifdef CONFIG_COMPACTION
113 {
114 .procname = "compact_memory",
115diff --git a/mm/vmscan.c b/mm/vmscan.c
116index 30a88b945a44..1198e74d1860 100644
117--- a/mm/vmscan.c
118+++ b/mm/vmscan.c
119@@ -3525,7 +3525,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
120 wake_up_interruptible(&pgdat->kswapd_wait);
121 }
122
123-#ifdef CONFIG_HIBERNATION
124 /*
125 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
126 * freed pages.
127@@ -3564,7 +3563,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
128
129 return nr_reclaimed;
130 }
131-#endif /* CONFIG_HIBERNATION */
132
133 /* It's optimal to keep kswapds on the same CPUs as their memory, but
134 not required for correctness. So if the last cpu in a node goes
135--
1362.12.1
137
diff --git a/patches/boot_time_opt_guest/guest_boot_time_opt.scc b/patches/boot_time_opt_guest/guest_boot_time_opt.scc
new file mode 100644
index 0000000..3636c01
--- /dev/null
+++ b/patches/boot_time_opt_guest/guest_boot_time_opt.scc
@@ -0,0 +1,19 @@
1define KFEATURE_DESCRIPTION "Boot time optimization changes ported from ClearLinux , https://github.com/clearlinux-pkgs/linux-kvm"
2define KFEATURE_COMPATIBILITY all
3
4patch 0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch
5patch 0104-fbcon-enable-no-blink-by-default.patch
6patch 0105-vmstats-wakeups.patch
7# Patch disabled because it prevents ixgbevf from initializing correctly in the guest
8#patch 0106-pci-probe.patch
9patch 0107-cgroup.patch
10patch 0108-smpboot-reuse-timer-calibration.patch
11patch 0109-perf.patch
12patch 0110-pci-probe-identify-known-devices.patch
13patch 0111-init-no-wait-for-the-known-devices.patch
14patch 0112-ksm-wakeups.patch
15
16patch 0151-mm-Export-do_madvise.patch
17patch 0152-x86-kvm-Notify-host-to-release-pages.patch
18patch 0153-x86-Return-memory-from-guest-to-host-kernel.patch
19patch 0154-sysctl-vm-Fine-grained-cache-shrinking.patch
diff --git a/patches/ipv4/0001-IPV4-unlock-rtnl_mutex-before-waiting-for-carrier-on.patch b/patches/ipv4/0001-IPV4-unlock-rtnl_mutex-before-waiting-for-carrier-on.patch
new file mode 100644
index 0000000..5133075
--- /dev/null
+++ b/patches/ipv4/0001-IPV4-unlock-rtnl_mutex-before-waiting-for-carrier-on.patch
@@ -0,0 +1,44 @@
1From 1828e68d8f0b99dbe388de4b6703afd90fdd7493 Mon Sep 17 00:00:00 2001
2From: Dragos Motrea <Dragos.Motrea@enea.com>
3Date: Thu, 16 Mar 2017 14:04:17 +0100
4Subject: [PATCH] IPV4: unlock rtnl_mutex before waiting for carrier on
5
6There is a race condition between IP auto-configuration and the ethernet
7driver. The IP configuration code takes the rtnl_mutex in the ic_open_devs()
8function and then waits up to 120 seconds for carrier-on on at least
9one network device. The driver is blocked on the mutex, so the carrier-on
10event is never sent. After 120 seconds, the mutex is unlocked and the driver
11continues its task execution.
12
13The mutex should be unlocked by the IP auto-configuration code before it waits
14for carrier-on from the ethernet driver.
15
16Signed-off-by: Dragos Motrea <Dragos.Motrea@enea.com>
17---
18 net/ipv4/ipconfig.c | 3 +--
19 1 file changed, 1 insertion(+), 2 deletions(-)
20
21diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
22index 071a785..55c95cc 100644
23--- a/net/ipv4/ipconfig.c
24+++ b/net/ipv4/ipconfig.c
25@@ -254,6 +254,7 @@ static int __init ic_open_devs(void)
26 dev->name, able, d->xid);
27 }
28 }
29+ rtnl_unlock();
30
31 /* no point in waiting if we could not bring up at least one device */
32 if (!ic_first_dev)
33@@ -281,8 +282,6 @@ static int __init ic_open_devs(void)
34 next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
35 }
36 have_carrier:
37- rtnl_unlock();
38-
39 *last = NULL;
40
41 if (!ic_first_dev) {
42--
432.7.4
44
diff --git a/patches/ipv4/ipv4wait.scc b/patches/ipv4/ipv4wait.scc
new file mode 100644
index 0000000..93e8cdc
--- /dev/null
+++ b/patches/ipv4/ipv4wait.scc
@@ -0,0 +1 @@
patch 0001-IPV4-unlock-rtnl_mutex-before-waiting-for-carrier-on.patch
diff --git a/patches/kernel_startend_msg/0001-printk-add-Enea-Linux-boot-start-end-messages.patch b/patches/kernel_startend_msg/0001-printk-add-Enea-Linux-boot-start-end-messages.patch
new file mode 100644
index 0000000..0fa8756
--- /dev/null
+++ b/patches/kernel_startend_msg/0001-printk-add-Enea-Linux-boot-start-end-messages.patch
@@ -0,0 +1,95 @@
1From b91730ba705d151577974d5fb9f5371a4569b467 Mon Sep 17 00:00:00 2001
2From: Adrian Calianu <adrian.calianu@enea.com>
3Date: Tue, 6 Jun 2017 15:47:54 +0200
4Subject: [PATCH 1/1] printk: add Enea Linux boot start/end messages
5
6Signed-off-by: Adrian Calianu <adrian.calianu@enea.com>
7---
8 arch/x86/boot/compressed/misc.c | 23 ++++++++++++-----------
9 init/main.c | 4 +++-
10 2 files changed, 15 insertions(+), 12 deletions(-)
11
12diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
13index b3c5a5f0..9fdf3c6 100644
14--- a/arch/x86/boot/compressed/misc.c
15+++ b/arch/x86/boot/compressed/misc.c
16@@ -202,10 +202,10 @@ static void handle_relocations(void *output, unsigned long output_len,
17 delta = virt_addr - LOAD_PHYSICAL_ADDR;
18
19 if (!delta) {
20- debug_putstr("No relocation needed... ");
21+ /* debug_putstr("No relocation needed... "); */
22 return;
23 }
24- debug_putstr("Performing relocations... ");
25+ /* debug_putstr("Performing relocations... "); */
26
27 /*
28 * Process relocations: 32 bit relocations first then 64 bit after.
29@@ -286,7 +286,7 @@ static void parse_elf(void *output)
30 return;
31 }
32
33- debug_putstr("Parsing ELF... ");
34+ /* debug_putstr("Parsing ELF... ");*/
35
36 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
37 if (!phdrs)
38@@ -360,17 +360,18 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
39 cols = boot_params->screen_info.orig_video_cols;
40
41 console_init();
42- debug_putstr("early console in extract_kernel\n");
43+ /* debug_putstr("early console in extract_kernel\n");*/
44+ debug_putstr("\n");debug_putstr("Enea Linux kernel boot start\n");
45
46 free_mem_ptr = heap; /* Heap */
47 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
48
49 /* Report initial kernel position details. */
50- debug_putaddr(input_data);
51- debug_putaddr(input_len);
52- debug_putaddr(output);
53- debug_putaddr(output_len);
54- debug_putaddr(kernel_total_size);
55+ /*debug_putaddr(input_data); */
56+ /*debug_putaddr(input_len); */
57+ /*debug_putaddr(output); */
58+ /*debug_putaddr(output_len); */
59+ /*debug_putaddr(kernel_total_size);*/
60
61 /*
62 * The memory hole needed for the kernel is the larger of either
63@@ -401,11 +402,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
64 error("Destination virtual address changed when not relocatable");
65 #endif
66
67- debug_putstr("\nDecompressing Linux... ");
68+ /*debug_putstr("\nDecompressing Linux... ");*/
69 __decompress(input_data, input_len, NULL, NULL, output, output_len,
70 NULL, error);
71 parse_elf(output);
72 handle_relocations(output, output_len, virt_addr);
73- debug_putstr("done.\nBooting the kernel.\n");
74+ /*debug_putstr("done.\nBooting the kernel.\n");*/
75 return output;
76 }
77diff --git a/init/main.c b/init/main.c
78index 8358cbe..613caa1 100644
79--- a/init/main.c
80+++ b/init/main.c
81@@ -976,8 +976,10 @@ static int __ref kernel_init(void *unused)
82 if (!try_to_run_init_process("/sbin/init") ||
83 !try_to_run_init_process("/etc/init") ||
84 !try_to_run_init_process("/bin/init") ||
85- !try_to_run_init_process("/bin/sh"))
86+ !try_to_run_init_process("/bin/sh")) {
87+ printk(KERN_EMERG "Enea Linux kernel boot end\n");
88 return 0;
89+ }
90
91 panic("No working init found. Try passing init= option to kernel. "
92 "See Linux Documentation/init.txt for guidance.");
93--
942.7.4
95
diff --git a/patches/kernel_startend_msg/kernel_startend_msg.scc b/patches/kernel_startend_msg/kernel_startend_msg.scc
new file mode 100644
index 0000000..e6da49c
--- /dev/null
+++ b/patches/kernel_startend_msg/kernel_startend_msg.scc
@@ -0,0 +1,4 @@
1define KFEATURE_DESCRIPTION "Enable the kernel to output messages when it starts and ends booting"
2define KFEATURE_COMPATIBILITY all
3
4patch 0001-printk-add-Enea-Linux-boot-start-end-messages.patch