From 16b0e3313f53566481c106ace9992e477f8efe9b Mon Sep 17 00:00:00 2001 From: Adrian Calianu Date: Mon, 22 May 2017 08:43:50 +0200 Subject: patches: Boot time optimizations with ClearLinux patches Signed-off-by: Adrian Calianu Signed-off-by: Adrian Dudau --- ...-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch | 25 ++ .../0101-kvm-silence-kvm-unhandled-rdmsr.patch | 29 ++ ...8042-decrease-debug-message-level-to-info.patch | 65 +++++ .../0103-init-do_mounts-recreate-dev-root.patch | 42 +++ ...0104-Increase-the-ext4-default-commit-age.patch | 35 +++ patches/boot_time_opt/0105-silence-rapl.patch | 25 ++ patches/boot_time_opt/0106-pci-pme-wakeups.patch | 27 ++ patches/boot_time_opt/0107-ksm-wakeups.patch | 34 +++ .../0108-intel_idle-tweak-cpuidle-cstates.patch | 227 +++++++++++++++ ...-setting-user.-attributes-on-symlinks-by-.patch | 56 ++++ .../0110-init_task-faster-timerslack.patch | 32 +++ ...-ext4-fsync-optimize-double-fsync-a-bunch.patch | 158 +++++++++++ .../boot_time_opt/0113-overload-on-wakeup.patch | 43 +++ ...dd-printk-s-to-measure-boot-time-in-more-.patch | 83 ++++++ .../0115-fix-initcall-timestamps.patch | 42 +++ .../0116-smpboot-reuse-timer-calibration.patch | 31 ++ ...Kconfig-option-to-skip-raid6-benchmarking.patch | 156 +++++++++++ .../0118-Initialize-ata-before-graphics.patch | 47 ++++ ...000e-boot-time-by-tightening-sleep-ranges.patch | 311 +++++++++++++++++++++ .../0120-give-rdrand-some-credit.patch | 30 ++ .../0121-e1000e-change-default-policy.patch | 27 ++ ...low-the-memory-tuning-for-tcp-to-go-a-lit.patch | 28 ++ ...0123-igb-no-runtime-pm-to-fix-reboot-oops.patch | 27 ++ patches/boot_time_opt/0124-tweak-perfbias.patch | 32 +++ ...25-e1000e-increase-pause-and-refresh-time.patch | 33 +++ .../boot_time_opt/0151-mm-Export-do_madvise.patch | 84 ++++++ ...0152-x86-kvm-Notify-host-to-release-pages.patch | 180 ++++++++++++ ...6-Return-memory-from-guest-to-host-kernel.patch | 155 ++++++++++ ...54-sysctl-vm-Fine-grained-cache-shrinking.patch | 137 +++++++++ 
patches/boot_time_opt/boot_time_opt.scc | 29 ++ patches/boot_time_opt/raid_alg.cfg | 3 + patches/boot_time_opt/raid_alg.scc | 5 + 32 files changed, 2238 insertions(+) create mode 100644 patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch create mode 100644 patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch create mode 100644 patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch create mode 100644 patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch create mode 100644 patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch create mode 100644 patches/boot_time_opt/0105-silence-rapl.patch create mode 100644 patches/boot_time_opt/0106-pci-pme-wakeups.patch create mode 100644 patches/boot_time_opt/0107-ksm-wakeups.patch create mode 100644 patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch create mode 100644 patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch create mode 100644 patches/boot_time_opt/0110-init_task-faster-timerslack.patch create mode 100644 patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch create mode 100644 patches/boot_time_opt/0113-overload-on-wakeup.patch create mode 100644 patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch create mode 100644 patches/boot_time_opt/0115-fix-initcall-timestamps.patch create mode 100644 patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch create mode 100644 patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch create mode 100644 patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch create mode 100644 patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch create mode 100644 patches/boot_time_opt/0120-give-rdrand-some-credit.patch create mode 100644 patches/boot_time_opt/0121-e1000e-change-default-policy.patch create mode 100644 
patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch create mode 100644 patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch create mode 100644 patches/boot_time_opt/0124-tweak-perfbias.patch create mode 100644 patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch create mode 100644 patches/boot_time_opt/0151-mm-Export-do_madvise.patch create mode 100644 patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch create mode 100644 patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch create mode 100644 patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch create mode 100644 patches/boot_time_opt/boot_time_opt.scc create mode 100644 patches/boot_time_opt/raid_alg.cfg create mode 100644 patches/boot_time_opt/raid_alg.scc diff --git a/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch b/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch new file mode 100644 index 0000000..33debcd --- /dev/null +++ b/patches/boot_time_opt/0011-drm-i915-fbc-sanitize-fbc-GEN-greater-than-9.patch @@ -0,0 +1,25 @@ +From 07639791f247ae7a807444106b9b7611f070d02b Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 13:28:29 +0000 +Subject: [PATCH] drm/i915/fbc: sanitize fbc GEN greater than 9 + +--- + drivers/gpu/drm/i915/intel_fbc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c +index c43dd9abce79..f5a2560840f3 100644 +--- a/drivers/gpu/drm/i915/intel_fbc.c ++++ b/drivers/gpu/drm/i915/intel_fbc.c +@@ -1262,7 +1262,7 @@ static int intel_sanitize_fbc_option(struct drm_i915_private *dev_priv) + if (!HAS_FBC(dev_priv)) + return 0; + +- if (IS_BROADWELL(dev_priv)) ++ if (IS_BROADWELL(dev_priv) || INTEL_GEN(dev_priv) >= 9) + return 1; + + return 0; +-- +2.11.1 + diff --git 
a/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch b/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch new file mode 100644 index 0000000..aeb3abf --- /dev/null +++ b/patches/boot_time_opt/0101-kvm-silence-kvm-unhandled-rdmsr.patch @@ -0,0 +1,29 @@ +From f45c353859fc0ceb75fef3a2f4a2c179dfa378d7 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Tue, 23 Jun 2015 01:16:45 -0500 +Subject: [PATCH 101/124] kvm: silence kvm unhandled rdmsr + +Author: Arjan van de Ven + +Signed-off-by: Miguel Bernal Marin +Signed-off-by: Jose Carlos Venegas Munoz +--- + arch/x86/kvm/x86.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 731044efb195..582c75311f95 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -2506,7 +2506,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + if (!ignore_msrs) { +- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); ++// vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); + return 1; + } else { + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); +-- +2.11.1 + diff --git a/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch b/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch new file mode 100644 index 0000000..96fd92b --- /dev/null +++ b/patches/boot_time_opt/0102-i8042-decrease-debug-message-level-to-info.patch @@ -0,0 +1,65 @@ +From 7e847b13b753ec632fef2f1ffa0d8f5b444c967b Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Tue, 23 Jun 2015 01:26:52 -0500 +Subject: [PATCH 102/124] i8042: decrease debug message level to info + +Author: Arjan van de Ven + +Signed-off-by: Miguel Bernal Marin +Signed-off-by: Jose Carlos Venegas Munoz +--- + drivers/input/serio/i8042.c | 10 +++++----- + 1 file changed, 5 
insertions(+), 5 deletions(-) + +diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c +index 89abfdb539ac..5317c41b049e 100644 +--- a/drivers/input/serio/i8042.c ++++ b/drivers/input/serio/i8042.c +@@ -593,7 +593,7 @@ static int i8042_enable_kbd_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_KBDINT; + i8042_ctr |= I8042_CTR_KBDDIS; +- pr_err("Failed to enable KBD port\n"); ++ pr_info("Failed to enable KBD port\n"); + return -EIO; + } + +@@ -612,7 +612,7 @@ static int i8042_enable_aux_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_AUXINT; + i8042_ctr |= I8042_CTR_AUXDIS; +- pr_err("Failed to enable AUX port\n"); ++ pr_info("Failed to enable AUX port\n"); + return -EIO; + } + +@@ -704,7 +704,7 @@ static int __init i8042_check_mux(void) + i8042_ctr &= ~I8042_CTR_AUXINT; + + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { +- pr_err("Failed to disable AUX port, can't use MUX\n"); ++ pr_info("Failed to disable AUX port, can't use MUX\n"); + return -EIO; + } + +@@ -927,7 +927,7 @@ static int i8042_controller_selftest(void) + do { + + if (i8042_command(&param, I8042_CMD_CTL_TEST)) { +- pr_err("i8042 controller selftest timeout\n"); ++ pr_info("i8042 controller selftest timeout\n"); + return -ENODEV; + } + +@@ -949,7 +949,7 @@ static int i8042_controller_selftest(void) + pr_info("giving up on controller selftest, continuing anyway...\n"); + return 0; + #else +- pr_err("i8042 controller selftest failed\n"); ++ pr_info("i8042 controller selftest failed\n"); + return -EIO; + #endif + } +-- +2.11.1 + diff --git a/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch b/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch new file mode 100644 index 0000000..bb7bb9f --- /dev/null +++ b/patches/boot_time_opt/0103-init-do_mounts-recreate-dev-root.patch @@ -0,0 +1,42 @@ +From 838abc7e5f43ea40a2cc05ebd6c7321b6d84b057 Mon Sep 17 00:00:00 2001 +From: 
Miguel Bernal Marin +Date: Fri, 20 Nov 2015 14:01:26 -0600 +Subject: [PATCH 103/124] init: do_mounts: recreate /dev/root + +Rootfs is shown as mounted on /dev/root, but this device is not present in +the /dev directory. + +Signed-off-by: Miguel Bernal Marin +--- + init/do_mounts.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/init/do_mounts.c b/init/do_mounts.c +index dea5de95c2dd..d74a346b2dfa 100644 +--- a/init/do_mounts.c ++++ b/init/do_mounts.c +@@ -549,6 +549,7 @@ void __init mount_root(void) + void __init prepare_namespace(void) + { + int is_floppy; ++ int err; + + if (root_delay) { + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", +@@ -602,6 +603,13 @@ void __init prepare_namespace(void) + devtmpfs_mount("dev"); + sys_mount(".", "/", NULL, MS_MOVE, NULL); + sys_chroot("."); ++#ifdef CONFIG_BLOCK ++ /* recreate the /dev/root */ ++ err = create_dev("/dev/root", ROOT_DEV); ++ ++ if (err < 0) ++ pr_emerg("Failed to create /dev/root: %d\n", err); ++#endif + } + + static bool is_tmpfs; +-- +2.11.1 + diff --git a/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch b/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch new file mode 100644 index 0000000..fb709b4 --- /dev/null +++ b/patches/boot_time_opt/0104-Increase-the-ext4-default-commit-age.patch @@ -0,0 +1,35 @@ +From b6970d43f97325c9acc7bd942dcd192586d8d407 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 11 Jan 2016 10:01:44 -0600 +Subject: [PATCH 104/124] Increase the ext4 default commit age + +Both the VM and EXT4 have a "commit to disk after X seconds" time. +Currently the EXT4 time is shorter than our VM time, which is a bit +suboptimal; +it's better for performance to let the VM do the writeouts in bulk +rather than something deep in the journalling layer. 
+ +(DISTRO TWEAK -- NOT FOR UPSTREAM) + +Signed-off-by: Arjan van de Ven +Signed-off-by: Jose Carlos Venegas Munoz +--- + include/linux/jbd2.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h +index dfaa1f4dcb0c..9955fd6c6159 100644 +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -47,7 +47,7 @@ + /* + * The default maximum commit age, in seconds. + */ +-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 ++#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 + + #ifdef CONFIG_JBD2_DEBUG + /* +-- +2.11.1 + diff --git a/patches/boot_time_opt/0105-silence-rapl.patch b/patches/boot_time_opt/0105-silence-rapl.patch new file mode 100644 index 0000000..4dd78fc --- /dev/null +++ b/patches/boot_time_opt/0105-silence-rapl.patch @@ -0,0 +1,25 @@ +From 558d32869c8d8e302dd3810610d62e1c69a8ebce Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:22:09 -0600 +Subject: [PATCH 105/124] silence rapl + +--- + drivers/powercap/intel_rapl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c +index 3c71f608b444..450aff027d42 100644 +--- a/drivers/powercap/intel_rapl.c ++++ b/drivers/powercap/intel_rapl.c +@@ -1684,7 +1684,7 @@ static int __init rapl_init(void) + + id = x86_match_cpu(rapl_ids); + if (!id) { +- pr_err("driver does not support CPU family %d model %d\n", ++ pr_info("driver does not support CPU family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return -ENODEV; +-- +2.11.1 + diff --git a/patches/boot_time_opt/0106-pci-pme-wakeups.patch b/patches/boot_time_opt/0106-pci-pme-wakeups.patch new file mode 100644 index 0000000..f0a4799 --- /dev/null +++ b/patches/boot_time_opt/0106-pci-pme-wakeups.patch @@ -0,0 +1,27 @@ +From 1f44219cd74f5c3b97e2c85af87141e1bddf0555 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH 106/124] pci pme wakeups + +Reduce 
wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index eda6a7cf0e54..82a623255059 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -57,7 +57,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.11.1 + diff --git a/patches/boot_time_opt/0107-ksm-wakeups.patch b/patches/boot_time_opt/0107-ksm-wakeups.patch new file mode 100644 index 0000000..2b25625 --- /dev/null +++ b/patches/boot_time_opt/0107-ksm-wakeups.patch @@ -0,0 +1,34 @@ +From a5de04044d428bf54472365e7dc07958aa184daf Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:06:46 -0600 +Subject: [PATCH 107/124] ksm-wakeups + +reduce wakeups in ksm by adding rounding (aligning) when the sleep times are 1 second or longer + +Signed-off-by: Arjan van de Ven +--- + mm/ksm.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/mm/ksm.c b/mm/ksm.c +index 9ae6011a41f8..eecd3ff669e2 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -1725,8 +1725,12 @@ static int ksm_scan_thread(void *nothing) + try_to_freeze(); + + if (ksmd_should_run()) { +- schedule_timeout_interruptible( +- msecs_to_jiffies(ksm_thread_sleep_millisecs)); ++ if (ksm_thread_sleep_millisecs >= 1000) /* align long sleeps; rounding is done on the jiffies value */ ++ schedule_timeout_interruptible( ++ round_jiffies_relative(msecs_to_jiffies(ksm_thread_sleep_millisecs))); ++ else ++ schedule_timeout_interruptible( ++ msecs_to_jiffies(ksm_thread_sleep_millisecs)); + } else { + wait_event_freezable(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); +-- +2.11.1 + diff --git a/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch 
b/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch new file mode 100644 index 0000000..da5396c --- /dev/null +++ b/patches/boot_time_opt/0108-intel_idle-tweak-cpuidle-cstates.patch @@ -0,0 +1,227 @@ +From bf7e0cebaafe790f62cbc5815648d556847b7d27 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH 108/124] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less aggressive; +Clear Linux is cleaner in hygiene (wakeups) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 74 +++++++++++------------------------------------ + 1 file changed, 17 insertions(+), 57 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 4466a2f969d7..cbab050b83f0 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -475,7 +475,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -483,7 +483,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -491,7 +491,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -499,7 +499,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -507,7 +507,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 18000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -515,7 +515,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 77000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -531,27 +531,11 @@ static struct cpuidle_state bdw_cstates[] = { + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +- .name = "C1E-BDW", +- .desc = "MWAIT 0x01", +- .flags = MWAIT2flg(0x01), +- .exit_latency = 10, +- .target_residency = 20, +- .enter = &intel_idle, +- .enter_freeze = intel_idle_freeze, }, +- { +- .name = "C3-BDW", +- .desc = "MWAIT 0x10", +- .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, +- .exit_latency = 40, +- .target_residency = 100, +- .enter = &intel_idle, +- .enter_freeze = intel_idle_freeze, }, +- { + .name = "C6-BDW", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -559,7 +543,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -567,7 +551,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -575,7 +559,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 18000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -583,7 +567,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 77000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -600,27 +584,11 @@ static struct cpuidle_state skl_cstates[] = { + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +- .name = "C1E-SKL", +- .desc = "MWAIT 0x01", +- .flags = MWAIT2flg(0x01), +- .exit_latency = 10, +- .target_residency = 20, +- .enter = &intel_idle, +- .enter_freeze = intel_idle_freeze, }, +- { +- .name = "C3-SKL", +- .desc = "MWAIT 0x10", +- .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, +- .exit_latency = 70, +- .target_residency = 100, +- .enter = &intel_idle, +- .enter_freeze = intel_idle_freeze, }, +- { + .name = "C6-SKL", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -628,7 +596,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 8000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -636,7 +604,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 8000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -644,7 +612,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 50000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -652,7 +620,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 50000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +@@ -669,19 +637,11 @@ static struct cpuidle_state skx_cstates[] = { + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +- .name = "C1E-SKX", +- .desc = "MWAIT 0x01", +- .flags = MWAIT2flg(0x01), +- .exit_latency = 10, +- .target_residency = 20, +- .enter = &intel_idle, +- .enter_freeze = intel_idle_freeze, }, +- { + .name = "C6-SKX", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 600, ++ .target_residency = 1600, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { +-- +2.11.1 + diff --git a/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch b/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch new file mode 100644 index 0000000..70247a0 --- /dev/null +++ b/patches/boot_time_opt/0109-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch @@ -0,0 +1,56 @@ +From 4170571f7bb0897c90e13b2fcf3ee06990a9e774 Mon Sep 17 00:00:00 2001 +From: Alan Cox +Date: Thu, 10 Mar 2016 15:11:28 +0000 +Subject: [PATCH 109/124] xattr: allow setting user.* attributes on symlinks by + owner + +Kvmtool and clear containers supports using user attributes to 
label host +files with the virtual uid/gid of the file in the container. This allows an +end user to manage their files and a complete uid space without all the ugly +namespace stuff. + +The one gap in the support is symlinks because an end user can change the +ownership of a symbolic link. We support attributes on these files as you +can already (as root) set security attributes on them. + +The current rules seem slightly over-paranoid and as we have a use case this +patch enables updating the attributes on a symbolic link IFF you are the +owner of the symlink (as permissions are not usually meaningful on the link +itself). + +Signed-off-by: Alan Cox +--- + fs/xattr.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +diff --git a/fs/xattr.c b/fs/xattr.c +index 2d13b4e62fae..580a5aeddfd2 100644 +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -118,15 +118,17 @@ xattr_permission(struct inode *inode, const char *name, int mask) + } + + /* +- * In the user.* namespace, only regular files and directories can have +- * extended attributes. For sticky directories, only the owner and +- * privileged users can write attributes. ++ * In the user.* namespace, only regular files, symbolic links, and ++ * directories can have extended attributes. For symbolic links and ++ * sticky directories, only the owner and privileged users can write ++ * attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { +- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) ++ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return (mask & MAY_WRITE) ? 
-EPERM : -ENODATA; +- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && +- (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) ++ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) ++ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) ++ && !inode_owner_or_capable(inode)) + return -EPERM; + } + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0110-init_task-faster-timerslack.patch b/patches/boot_time_opt/0110-init_task-faster-timerslack.patch new file mode 100644 index 0000000..b0075ff --- /dev/null +++ b/patches/boot_time_opt/0110-init_task-faster-timerslack.patch @@ -0,0 +1,32 @@ +From 42c2cb32259b76fb1f6713d99c4f0922e97bcc8d Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Wed, 23 Mar 2016 14:52:41 +0000 +Subject: [PATCH 110/124] init_task: faster timerslack + +the default tuning is a compromise between client power and server performance; +for a server distro like Clear Linux, we don't need to compromise. +(for non-server usages we have different kernel binaries) + +in principle this can be done as a patch to systemd as well, but we have a shared +systemd between usages while we have different kernels, so the logistics +for where the patch goes work out better here +--- + include/linux/init_task.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 325f649d77ff..e0eb261e17cb 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -249,7 +249,7 @@ extern struct task_group root_task_group; + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ +- .timer_slack_ns = 50000, /* 50 usec default slack */ \ ++ .timer_slack_ns = 1000, /* 1 usec default slack */ \ + .pids = { \ + [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ + [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ +-- +2.11.1 + diff --git 
a/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch b/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch new file mode 100644 index 0000000..7d0def8 --- /dev/null +++ b/patches/boot_time_opt/0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch @@ -0,0 +1,158 @@ +From 3152053ea1ea3aa77bcc7e990d48ef84621ff6c9 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 9 Apr 2016 22:41:37 +0000 +Subject: [PATCH 112/124] fs: ext4: fsync: optimize double-fsync() a bunch + +There are cases where EXT4 is a bit too conservative sending barriers down to the disk; +there are cases where the transaction in progress is not the one that sent the barrier +(in other words: the fsync is for a file for which the IO happened more time ago +and all data was already sent to the disk). For that case, a more performing tradeoff +can be made on SSD devices (which have the ability to flush their dram caches in a hurry +on a power fail event) where the barrier gets sent to the disk, but we don't need to wait +for the barrier to complete. Any consecutive IO will block on the barrier correctly. 
+--- + block/bio.c | 20 ++++++++++++++++++++ + block/blk-flush.c | 41 +++++++++++++++++++++++++++++++++++++++++ + fs/ext4/fsync.c | 6 +++++- + include/linux/bio.h | 1 + + include/linux/blkdev.h | 5 +++++ + 5 files changed, 72 insertions(+), 1 deletion(-) + +diff --git a/block/bio.c b/block/bio.c +index db85c5753a76..80f5ab6b536a 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -882,6 +882,26 @@ int submit_bio_wait(struct bio *bio) + } + EXPORT_SYMBOL(submit_bio_wait); + ++static void submit_bio_nowait_endio(struct bio *bio) ++{ ++ bio_put(bio); ++} ++ ++/** ++ * submit_bio_nowait - submit a bio for fire-and-forget ++ * @bio: The &struct bio which describes the I/O; its bi_end_io is ++ * replaced and REQ_SYNC is set in bi_opf before it is submitted ++ * ++ * Simple wrapper around submit_bio() that takes care of bio_put() on completion ++ */ ++void submit_bio_nowait(struct bio *bio) ++{ ++ bio->bi_end_io = submit_bio_nowait_endio; ++ bio->bi_opf |= REQ_SYNC; ++ submit_bio(bio); ++} ++EXPORT_SYMBOL(submit_bio_nowait); ++ + /** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance +diff --git a/block/blk-flush.c b/block/blk-flush.c +index 3c882cbc7541..b2dfcfe01ed7 100644 +--- a/block/blk-flush.c ++++ b/block/blk-flush.c +@@ -530,6 +530,47 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + } + EXPORT_SYMBOL(blkdev_issue_flush); + ++/** ++ * blkdev_issue_flush_nowait - queue a flush ++ * @bdev: blockdev to issue flush for ++ * @gfp_mask: memory allocation flags (for bio_alloc) ++ * ++ * Description: ++ * Queue a flush for the block device in question and return without ++ * waiting for it to complete. The function returns nothing: it bails ++ * out silently when the device has no disk, no request queue or no ++ * make_request_fn, and the completion handler only drops the bio, so ++ * flush errors are not reported to the caller. 
++ */ ++void blkdev_issue_flush_nowait(struct block_device *bdev, gfp_t gfp_mask) ++{ ++ struct request_queue *q; ++ struct bio *bio; ++ ++ if (bdev->bd_disk == NULL) ++ return; ++ ++ q = bdev_get_queue(bdev); ++ if (!q) ++ return; ++ ++ /* ++ * some block devices may not have their queue correctly set up here ++ * (e.g. loop device without a backing file) and so issuing a flush ++ * here will panic. Ensure there is a request function before issuing ++ * the flush. ++ */ ++ if (!q->make_request_fn) ++ return; ++ ++ bio = bio_alloc(gfp_mask, 0); ++ bio->bi_bdev = bdev; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); ++ ++ submit_bio_nowait(bio); ++} ++EXPORT_SYMBOL(blkdev_issue_flush_nowait); ++ + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size) + { +diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c +index 88effb1053c7..a58966c18172 100644 +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -150,7 +150,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + ret = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + issue_flush: +- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); ++ err = 0; ++ if (!blk_queue_nonrot(bdev_get_queue(inode->i_sb->s_bdev))) ++ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); ++ else ++ blkdev_issue_flush_nowait(inode->i_sb->s_bdev, GFP_KERNEL); + if (!ret) + ret = err; + } +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 97cb48f03dc7..3f055e6541e0 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -421,6 +421,7 @@ struct request_queue; + extern int bio_phys_segments(struct request_queue *, struct bio *); + + extern int submit_bio_wait(struct bio *bio); ++extern void submit_bio_nowait(struct bio *bio); + extern void bio_advance(struct bio *, unsigned); + + extern void bio_init(struct bio *); +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 
f6a816129856..727684abf21e 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1144,6 +1144,7 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, + #define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ + + extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); ++extern void blkdev_issue_flush_nowait(struct block_device *, gfp_t); + extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); + extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, +@@ -1745,6 +1746,10 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + return 0; + } + ++static inline void blkdev_issue_flush_nowait(struct block_device *bdev, gfp_t gfp_mask) ++{ ++} ++ + #endif /* CONFIG_BLOCK */ + + #endif +-- +2.11.1 + diff --git a/patches/boot_time_opt/0113-overload-on-wakeup.patch b/patches/boot_time_opt/0113-overload-on-wakeup.patch new file mode 100644 index 0000000..a3a6bce --- /dev/null +++ b/patches/boot_time_opt/0113-overload-on-wakeup.patch @@ -0,0 +1,43 @@ +From 9f25d18f45a8391488feb9783404f2f79b7090f4 Mon Sep 17 00:00:00 2001 +From: jplozi +Date: Fri, 11 Mar 2016 15:18:06 +0100 +Subject: [PATCH 113/124] overload on wakeup + +source https://github.com/jplozi/wastedcores + +as an experiment, apply the learnings from the wasted-cores paper +and see how the performance works out. With the data from this we should +be able to work with Peter and the rest of the scheduler folks on +a more permanent/elegant solution. 
+--- + kernel/sched/fair.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c242944f5cbd..5132c828161e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -5638,6 +5638,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + } + + rcu_read_lock(); ++ ++ if (cpu_rq(prev_cpu)->nr_running) { ++ int _cpu; ++ ++ for_each_online_cpu(_cpu) { ++ if (!cpumask_test_cpu(_cpu, tsk_cpus_allowed(p)) || ++ cpu_rq(_cpu)->nr_running) ++ continue; ++ ++ rcu_read_unlock(); ++ return _cpu; ++ } ++ } ++ + for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + break; +-- +2.11.1 + diff --git a/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch b/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch new file mode 100644 index 0000000..c6bf036 --- /dev/null +++ b/patches/boot_time_opt/0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch @@ -0,0 +1,83 @@ +From 3a1512b4ed3922f88936b95731aaff706e7286a9 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Wed, 11 Feb 2015 16:05:23 -0600 +Subject: [PATCH 114/124] bootstats: add printk's to measure boot time in more + detail + +Few distro-tweaks to add printk's to visualize boot time better + +Author: Arjan van de Ven + +Signed-off-by: Miguel Bernal Marin +--- + arch/x86/kernel/alternative.c | 4 ++++ + drivers/base/firmware_class.c | 2 ++ + init/main.c | 2 +- + kernel/kmod.c | 2 ++ + 4 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 5cb272a7a5a3..d28fb7aae4ce 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -626,7 +626,9 @@ void __init alternative_instructions(void) + * patching. 
+ */ + ++ printk("clr: Applying alternatives\n"); + apply_alternatives(__alt_instructions, __alt_instructions_end); ++ printk("clr: Applying alternatives done\n"); + + #ifdef CONFIG_SMP + /* Patch to UP if other cpus not imminent. */ +@@ -637,6 +639,8 @@ void __init alternative_instructions(void) + _text, _etext); + } + ++ printk("clr: Applying alternatives smp done\n"); ++ + if (!uniproc_patched || num_possible_cpus() == 1) + free_init_pages("SMP alternatives", + (unsigned long)__smp_locks, +diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c +index a95e1e572697..b29467031be6 100644 +--- a/drivers/base/firmware_class.c ++++ b/drivers/base/firmware_class.c +@@ -1224,6 +1224,8 @@ request_firmware(const struct firmware **firmware_p, const char *name, + { + int ret; + ++ printk("clr: request_firmware: %s\n", name); ++ + /* Need to pin this module until return */ + __module_get(THIS_MODULE); + ret = _request_firmware(firmware_p, name, device, NULL, 0, +diff --git a/init/main.c b/init/main.c +index 2858be732f6d..f1d8c3fdbf05 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -751,7 +751,7 @@ static int __init_or_module do_one_initcall_debug(initcall_t fn) + unsigned long long duration; + int ret; + +- printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); ++ printk(KERN_DEBUG "calling %pF @ %i\n", fn, raw_smp_processor_id()); + calltime = ktime_get(); + ret = fn(); + rettime = ktime_get(); +diff --git a/kernel/kmod.c b/kernel/kmod.c +index 0277d1216f80..dc5a6edd3895 100644 +--- a/kernel/kmod.c ++++ b/kernel/kmod.c +@@ -76,6 +76,8 @@ static int call_modprobe(char *module_name, int wait) + NULL + }; + ++ printk("clr: call_modprobe: %s %i \n", module_name, wait); ++ + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; +-- +2.11.1 + diff --git a/patches/boot_time_opt/0115-fix-initcall-timestamps.patch b/patches/boot_time_opt/0115-fix-initcall-timestamps.patch new file mode 100644 index 0000000..cdf2af1 --- 
/dev/null +++ b/patches/boot_time_opt/0115-fix-initcall-timestamps.patch @@ -0,0 +1,42 @@ +From 5b5ad2c9b9b555d20aeba1f895d0c9d1c2a77776 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH 115/124] fix initcall timestamps + +Print more finegrained initcall timings + +use the tsc instead of the jiffies clock for initcall_debug +--- + init/main.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/init/main.c b/init/main.c +index f1d8c3fdbf05..8358cbe6ab13 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -747,16 +747,16 @@ __setup("initcall_blacklist=", initcall_blacklist); + + static int __init_or_module do_one_initcall_debug(initcall_t fn) + { +- ktime_t calltime, delta, rettime; ++ unsigned long long calltime, delta, rettime; + unsigned long long duration; + int ret; + +- printk(KERN_DEBUG "calling %pF @ %i\n", fn, raw_smp_processor_id()); +- calltime = ktime_get(); ++ printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); ++ calltime = local_clock(); + ret = fn(); +- rettime = ktime_get(); +- delta = ktime_sub(rettime, calltime); +- duration = (unsigned long long) ktime_to_ns(delta) >> 10; ++ rettime = local_clock(); ++ delta = rettime - calltime; ++ duration = delta >> 10; + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + fn, ret, duration); + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch b/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch new file mode 100644 index 0000000..d1f71b5 --- /dev/null +++ b/patches/boot_time_opt/0116-smpboot-reuse-timer-calibration.patch @@ -0,0 +1,31 @@ +From 16104411cc5a7b20f310e3ecede85343ee6ce6b9 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Wed, 11 Feb 2015 17:28:14 -0600 +Subject: [PATCH 116/124] smpboot: reuse timer calibration + +NO point recalibrating for known-constant tsc... saves 200ms+ of boot time. 
+ +Author: Arjan van de Ven + +Signed-off-by: Miguel Bernal Marin +--- + arch/x86/kernel/tsc.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c +index 46b2f41f8b05..88553c1f21f1 100644 +--- a/arch/x86/kernel/tsc.c ++++ b/arch/x86/kernel/tsc.c +@@ -1384,6 +1384,9 @@ unsigned long calibrate_delay_is_known(void) + if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + ++ if (cpu != 0) ++ return cpu_data(0).loops_per_jiffy; ++ + if (!mask) + return 0; + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch b/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch new file mode 100644 index 0000000..978e09f --- /dev/null +++ b/patches/boot_time_opt/0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch @@ -0,0 +1,156 @@ +From fd1f55138c242bd9aeec374ff611064bdc89b359 Mon Sep 17 00:00:00 2001 +From: Jim Kukunas +Date: Fri, 27 May 2016 09:26:51 -0400 +Subject: [PATCH 117/124] raid6: add Kconfig option to skip raid6 benchmarking + +Adds CONFIG_RAID6_FORCE_ALGO, which causes the kernel to not benchmark +each raid recovery and syndrome generation algorithm, and instead use +the version selected via Kconfig (CONFIG_RAID6_FORCE_{INT,SSSE3,AVX2}). +In case the selected algorithm is not supported by the processor at +runtime, a fallback is used. 
+ +Signed-off-by: Jim Kukunas +--- + lib/Kconfig | 3 +-- + lib/raid6/Kconfig | 38 ++++++++++++++++++++++++++++++++++++ + lib/raid6/algos.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 97 insertions(+), 2 deletions(-) + create mode 100644 lib/raid6/Kconfig + +diff --git a/lib/Kconfig b/lib/Kconfig +index 260a80e313b9..b3efd21db2fd 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -7,8 +7,7 @@ config BINARY_PRINTF + + menu "Library routines" + +-config RAID6_PQ +- tristate ++source "lib/raid6/Kconfig" + + config BITREVERSE + tristate +diff --git a/lib/raid6/Kconfig b/lib/raid6/Kconfig +new file mode 100644 +index 000000000000..d881d6be89bb +--- /dev/null ++++ b/lib/raid6/Kconfig +@@ -0,0 +1,38 @@ ++menu "RAID 6" ++ ++config RAID6_PQ ++ tristate ++ ++config RAID6_FORCE_ALGO ++ bool "Always use specified recovery algorithm" ++ default n ++ depends on RAID6_PQ ++ help ++ If this option is not set, on every boot the kernel will ++ benchmark each optimized version of the RAID6 recovery and ++ syndrome generation algorithms and will select the one that ++ performs best. Microbenchmarking each version negatively ++ affects boot time. ++ ++ Enabling this option skips the benchmark at boot, and ++ instead always uses the algorithm selected. The only exception ++ is if the selected algorithm relies on a cpu feature not ++ supported at runtime. In this case, one of the lower performance ++ fallbacks are used. 
++ ++choice ++ prompt "RAID6 Recovery Algorithm" ++ default RAID6_FORCE_INT ++ depends on RAID6_FORCE_ALGO ++ ---help--- ++ Select the RAID6 recovery algorithm to unconditionally use ++ ++ config RAID6_FORCE_INT ++ bool "Reference Implementation" ++ config RAID6_FORCE_SSSE3 ++ bool "SSSE3" ++ config RAID6_FORCE_AVX2 ++ bool "AVX2" ++endchoice ++ ++endmenu +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index 7857049fd7d3..29332d2a04a5 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -125,6 +125,63 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { + #define time_before(x, y) ((x) < (y)) + #endif + ++#ifdef CONFIG_RAID6_FORCE_ALGO ++/* TODO don't compile in algos that will never be used */ ++int __init raid6_select_algo(void) ++{ ++ const struct raid6_recov_calls *recov_fallback = &raid6_recov_intx1; ++ const struct raid6_recov_calls *recov_algo; ++ const struct raid6_calls *gen_fallback; ++ const struct raid6_calls *gen_algo; ++ ++#if defined(__i386__) ++ gen_fallback = &raid6_intx32; ++#elif defined(__x86_64__) ++ gen_fallback = &raid6_sse2x2; ++#else ++# error "TODO" ++#endif ++ ++#if defined(CONFIG_RAID6_FORCE_INT) ++ recov_algo = &raid6_recov_intx1; ++ gen_algo = &raid6_intx32; ++ ++#elif defined(CONFIG_RAID6_FORCE_SSSE3) ++ recov_algo = &raid6_recov_ssse3; ++#if defined(__i386__) ++ gen_algo = &raid6_sse2x2; ++#else ++ gen_algo = &raid6_sse2x4; ++#endif ++ ++#elif defined(CONFIG_RAID6_FORCE_AVX2) ++ recov_algo = &raid6_recov_avx2; ++ ++#if defined(__i386__) ++ gen_algo = &raid6_avx2x2; ++#else ++ gen_algo = &raid6_avx2x4; ++#endif ++ ++#else ++#error "RAID6 Forced Recov Algo: Unsupported selection" ++#endif ++ ++ if (recov_algo->valid != NULL && recov_algo->valid() == 0) ++ recov_algo = recov_fallback; ++ ++ pr_info("raid6: Forced to use recovery algorithm %s\n", recov_algo->name); ++ ++ raid6_2data_recov = recov_algo->data2; ++ raid6_datap_recov = recov_algo->datap; ++ ++ pr_info("raid6: Forced gen() algo %s\n", gen_algo->name); 
++ ++ raid6_call = *gen_algo; ++ ++ return gen_algo && recov_algo ? 0 : -EINVAL; ++} ++#else + static inline const struct raid6_recov_calls *raid6_choose_recov(void) + { + const struct raid6_recov_calls *const *algo; +@@ -256,6 +313,7 @@ int __init raid6_select_algo(void) + + return gen_best && rec_best ? 0 : -EINVAL; + } ++#endif + + static void raid6_exit(void) + { +-- +2.11.1 + diff --git a/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch b/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch new file mode 100644 index 0000000..70e07c8 --- /dev/null +++ b/patches/boot_time_opt/0118-Initialize-ata-before-graphics.patch @@ -0,0 +1,47 @@ +From fbc1ab7c18a9c960a0bff293a93620d581658f8d Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH 118/124] Initialize ata before graphics + +ATA init is the long pole in the boot process, and it's asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index 194d20bee7dc..2785e4c6b30f 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -55,14 +55,9 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-$(CONFIG_IOMMU_SUPPORT) += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ + + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ +@@ -76,6 +71,12 @@ obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_SCSI) += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += 
video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +-- +2.11.1 + diff --git a/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch b/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch new file mode 100644 index 0000000..a068afb --- /dev/null +++ b/patches/boot_time_opt/0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch @@ -0,0 +1,311 @@ +From d9390cb702de5cbef64f893efd2344c4f58dae82 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 25 Jul 2016 06:44:34 -0500 +Subject: [PATCH 119/124] reduce e1000e boot time by tightening sleep ranges + +The e1000e driver is a great user of the usleep_range() API, +and has many nice ranges that in principle help power management. + +However the ranges that are used only during system startup are +very long (and can easily add 100 msec to the boot time) while +the power savings of such long ranges is irrelevant due to the +one-off, boot only, nature of these functions. 
+ +This patch shrinks some of the longest ranges to be shorter +(while still using a power friendly 1 msec range); this saves +100msec+ of boot time on my BDW NUCs + +Signed-off-by: Arjan van de Ven +--- + drivers/net/ethernet/intel/e1000e/80003es2lan.c | 2 +- + drivers/net/ethernet/intel/e1000e/82571.c | 2 +- + drivers/net/ethernet/intel/e1000e/ethtool.c | 14 +++++++------- + drivers/net/ethernet/intel/e1000e/ich8lan.c | 20 ++++++++++---------- + drivers/net/ethernet/intel/e1000e/mac.c | 2 +- + drivers/net/ethernet/intel/e1000e/netdev.c | 14 +++++++------- + drivers/net/ethernet/intel/e1000e/nvm.c | 2 +- + 7 files changed, 28 insertions(+), 28 deletions(-) + +diff --git a/drivers/net/ethernet/intel/e1000e/80003es2lan.c b/drivers/net/ethernet/intel/e1000e/80003es2lan.c +index cd391376036c..b5759899eeb8 100644 +--- a/drivers/net/ethernet/intel/e1000e/80003es2lan.c ++++ b/drivers/net/ethernet/intel/e1000e/80003es2lan.c +@@ -698,7 +698,7 @@ static s32 e1000_reset_hw_80003es2lan(struct e1000_hw *hw) + ew32(TCTL, E1000_TCTL_PSP); + e1e_flush(); + +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + ctrl = er32(CTRL); + +diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c +index 6b03c8553e59..d31145269dd9 100644 +--- a/drivers/net/ethernet/intel/e1000e/82571.c ++++ b/drivers/net/ethernet/intel/e1000e/82571.c +@@ -977,7 +977,7 @@ static s32 e1000_reset_hw_82571(struct e1000_hw *hw) + ew32(TCTL, tctl); + e1e_flush(); + +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + /* Must acquire the MDIO ownership before MAC reset. + * Ownership defaults to firmware after a reset. 
+diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c +index 7aff68a4a4df..7cb689bd41f8 100644 +--- a/drivers/net/ethernet/intel/e1000e/ethtool.c ++++ b/drivers/net/ethernet/intel/e1000e/ethtool.c +@@ -1023,7 +1023,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) + /* Disable all the interrupts */ + ew32(IMC, 0xFFFFFFFF); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + /* Test each interrupt */ + for (i = 0; i < 10; i++) { +@@ -1055,7 +1055,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) + ew32(IMC, mask); + ew32(ICS, mask); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + if (adapter->test_icr & mask) { + *data = 3; +@@ -1073,7 +1073,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) + ew32(IMS, mask); + ew32(ICS, mask); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + if (!(adapter->test_icr & mask)) { + *data = 4; +@@ -1091,7 +1091,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) + ew32(IMC, ~mask & 0x00007FFF); + ew32(ICS, ~mask & 0x00007FFF); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + if (adapter->test_icr) { + *data = 5; +@@ -1103,7 +1103,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) + /* Disable all the interrupts */ + ew32(IMC, 0xFFFFFFFF); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + /* Unhook test interrupt handler */ + free_irq(irq, netdev); +@@ -1479,7 +1479,7 @@ static int e1000_set_82571_fiber_loopback(struct e1000_adapter *adapter) + */ + ew32(SCTL, E1000_SCTL_ENABLE_SERDES_LOOPBACK); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + return 0; + } +@@ -1592,7 +1592,7 @@ static void e1000_loopback_cleanup(struct e1000_adapter *adapter) + hw->phy.media_type == 
e1000_media_type_internal_serdes) { + ew32(SCTL, E1000_SCTL_DISABLE_SERDES_LOOPBACK); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + break; + } + /* Fall Through */ +diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c +index f3aaca743ea3..bef75cec259f 100644 +--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c ++++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c +@@ -289,7 +289,7 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw) + u16 count = 20; + + do { +- usleep_range(5000, 10000); ++ usleep_range(5000, 6000); + } while (!(er32(CTRL_EXT) & E1000_CTRL_EXT_LPCD) && count--); + + msleep(30); +@@ -422,7 +422,7 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) + /* Ungate automatic PHY configuration on non-managed 82579 */ + if ((hw->mac.type == e1000_pch2lan) && + !(fwsm & E1000_ICH_FWSM_FW_VALID)) { +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + e1000_gate_hw_phy_config_ich8lan(hw, false); + } + +@@ -547,7 +547,7 @@ static s32 e1000_init_phy_params_ich8lan(struct e1000_hw *hw) + phy->id = 0; + while ((e1000_phy_unknown == e1000e_get_phy_type_from_id(phy->id)) && + (i++ < 100)) { +- usleep_range(1000, 2000); ++ usleep_range(1000, 1100); + ret_val = e1000e_get_phy_id(hw); + if (ret_val) + return ret_val; +@@ -1259,7 +1259,7 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) + goto out; + } + +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + } + e_dbg("ULP_CONFIG_DONE cleared after %dmsec\n", i * 10); + +@@ -2011,7 +2011,7 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw) + + while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) && + (i++ < 30)) +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + return blocked ? 
E1000_BLK_PHY_RESET : 0; + } + +@@ -2827,7 +2827,7 @@ static s32 e1000_post_phy_reset_ich8lan(struct e1000_hw *hw) + return 0; + + /* Allow time for h/w to get to quiescent state after reset */ +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + /* Perform any necessary post-reset workarounds */ + switch (hw->mac.type) { +@@ -2863,7 +2863,7 @@ static s32 e1000_post_phy_reset_ich8lan(struct e1000_hw *hw) + if (hw->mac.type == e1000_pch2lan) { + /* Ungate automatic PHY configuration on non-managed 82579 */ + if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) { +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + e1000_gate_hw_phy_config_ich8lan(hw, false); + } + +@@ -3884,7 +3884,7 @@ static s32 e1000_update_nvm_checksum_spt(struct e1000_hw *hw) + */ + if (!ret_val) { + nvm->ops.reload(hw); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + } + + out: +@@ -4035,7 +4035,7 @@ static s32 e1000_update_nvm_checksum_ich8lan(struct e1000_hw *hw) + */ + if (!ret_val) { + nvm->ops.reload(hw); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + } + + out: +@@ -4658,7 +4658,7 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw) + ew32(TCTL, E1000_TCTL_PSP); + e1e_flush(); + +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + /* Workaround for ICH8 bit corruption issue in FIFO memory */ + if (hw->mac.type == e1000_ich8lan) { +diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c +index b322011ec282..eecbf7a12735 100644 +--- a/drivers/net/ethernet/intel/e1000e/mac.c ++++ b/drivers/net/ethernet/intel/e1000e/mac.c +@@ -815,7 +815,7 @@ static s32 e1000_poll_fiber_serdes_link_generic(struct e1000_hw *hw) + * milliseconds even if the other end is doing it in SW). 
+ */ + for (i = 0; i < FIBER_LINK_UP_LIMIT; i++) { +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + status = er32(STATUS); + if (status & E1000_STATUS_LU) + break; +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index 7017281ba2dc..7d68d694ed9e 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -3206,7 +3206,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) + if (!(adapter->flags2 & FLAG2_NO_DISABLE_RX)) + ew32(RCTL, rctl & ~E1000_RCTL_EN); + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + if (adapter->flags2 & FLAG2_DMA_BURST) { + /* set the writeback threshold (only takes effect if the RDTR +@@ -4258,7 +4258,7 @@ void e1000e_down(struct e1000_adapter *adapter, bool reset) + + /* flush both disables and wait for them to finish */ + e1e_flush(); +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + e1000_irq_disable(adapter); + +@@ -4296,7 +4296,7 @@ void e1000e_reinit_locked(struct e1000_adapter *adapter) + { + might_sleep(); + while (test_and_set_bit(__E1000_RESETTING, &adapter->state)) +- usleep_range(1000, 2000); ++ usleep_range(1000, 1100); + e1000e_down(adapter, true); + e1000e_up(adapter); + clear_bit(__E1000_RESETTING, &adapter->state); +@@ -4671,7 +4671,7 @@ int e1000e_close(struct net_device *netdev) + int count = E1000_CHECK_RESET_COUNT; + + while (test_bit(__E1000_RESETTING, &adapter->state) && count--) +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + WARN_ON(test_bit(__E1000_RESETTING, &adapter->state)); + +@@ -5996,7 +5996,7 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) + } + + while (test_and_set_bit(__E1000_RESETTING, &adapter->state)) +- usleep_range(1000, 2000); ++ usleep_range(1000, 1100); + /* e1000e_down -> e1000e_reset dependent on max_frame_size & mtu */ + adapter->max_frame_size = max_frame; + e_info("changing MTU 
from %d to %d\n", netdev->mtu, new_mtu); +@@ -6276,7 +6276,7 @@ static int e1000e_pm_freeze(struct device *dev) + int count = E1000_CHECK_RESET_COUNT; + + while (test_bit(__E1000_RESETTING, &adapter->state) && count--) +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + WARN_ON(test_bit(__E1000_RESETTING, &adapter->state)); + +@@ -6687,7 +6687,7 @@ static int e1000e_pm_runtime_suspend(struct device *dev) + int count = E1000_CHECK_RESET_COUNT; + + while (test_bit(__E1000_RESETTING, &adapter->state) && count--) +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + + WARN_ON(test_bit(__E1000_RESETTING, &adapter->state)); + +diff --git a/drivers/net/ethernet/intel/e1000e/nvm.c b/drivers/net/ethernet/intel/e1000e/nvm.c +index 2efd80dfd88e..38f7c8fb3061 100644 +--- a/drivers/net/ethernet/intel/e1000e/nvm.c ++++ b/drivers/net/ethernet/intel/e1000e/nvm.c +@@ -410,7 +410,7 @@ s32 e1000e_write_nvm_spi(struct e1000_hw *hw, u16 offset, u16 words, u16 *data) + break; + } + } +- usleep_range(10000, 20000); ++ usleep_range(10000, 11000); + nvm->ops.release(hw); + } + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0120-give-rdrand-some-credit.patch b/patches/boot_time_opt/0120-give-rdrand-some-credit.patch new file mode 100644 index 0000000..4b1669c --- /dev/null +++ b/patches/boot_time_opt/0120-give-rdrand-some-credit.patch @@ -0,0 +1,30 @@ +From 5cc978db25b2c92707f68b15098ac39901fb5aac Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 29 Jul 2016 19:10:52 +0000 +Subject: [PATCH 120/124] give rdrand some credit + +try to credit rdrand/rdseed with some entropy + +In VMs but even modern hardware, we're super starved for entropy, and while we can +and do wear a tin foil hat, it's very hard to argue that +rdrand and rdtsc add zero entropy. 
+--- + drivers/char/random.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/char/random.c b/drivers/char/random.c +index d6876d506220..fca09af81b2c 100644 +--- a/drivers/char/random.c ++++ b/drivers/char/random.c +@@ -1638,6 +1638,8 @@ static void init_std_data(struct entropy_store *r) + if (!arch_get_random_seed_long(&rv) && + !arch_get_random_long(&rv)) + rv = random_get_entropy(); ++ else ++ credit_entropy_bits(r, 1); + mix_pool_bytes(r, &rv, sizeof(rv)); + } + mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); +-- +2.11.1 + diff --git a/patches/boot_time_opt/0121-e1000e-change-default-policy.patch b/patches/boot_time_opt/0121-e1000e-change-default-policy.patch new file mode 100644 index 0000000..bf3e13d --- /dev/null +++ b/patches/boot_time_opt/0121-e1000e-change-default-policy.patch @@ -0,0 +1,27 @@ +From 5b4707fc2aa8c49aa18a60136880bf05a3e29071 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 10 Dec 2016 14:29:52 +0000 +Subject: [PATCH 121/124] e1000e: change default policy + +change the default irq mitigation policy for e1000e to be +more HPC/cluster friendly +--- + drivers/net/ethernet/intel/e1000e/param.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c +index 6d8c39abee16..ef1122ad3b98 100644 +--- a/drivers/net/ethernet/intel/e1000e/param.c ++++ b/drivers/net/ethernet/intel/e1000e/param.c +@@ -92,7 +92,7 @@ E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay"); + * Valid Range: 100-100000 or one of: 0=off, 1=dynamic, 3=dynamic conservative + */ + E1000_PARAM(InterruptThrottleRate, "Interrupt Throttling Rate"); +-#define DEFAULT_ITR 3 ++#define DEFAULT_ITR 1 + #define MAX_ITR 100000 + #define MIN_ITR 100 + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch b/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch new 
file mode 100644 index 0000000..eb44cec --- /dev/null +++ b/patches/boot_time_opt/0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch @@ -0,0 +1,28 @@ +From 5cf7ba4ba9c9d770aad9e52deaa3730f259df9f1 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH 122/124] ipv4/tcp: allow the memory tuning for tcp to go a + little bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 6a90a0e130dc..32e43ce7c60e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3341,8 +3341,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + sysctl_tcp_wmem[1] = 16*1024; +-- +2.11.1 + diff --git a/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch b/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch new file mode 100644 index 0000000..ce4964e --- /dev/null +++ b/patches/boot_time_opt/0123-igb-no-runtime-pm-to-fix-reboot-oops.patch @@ -0,0 +1,27 @@ +From 10f0c995ce6aaf6b3ffa78377f1a12ad0477057a Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 12 Jan 2017 18:17:14 +0000 +Subject: [PATCH 123/124] igb: no runtime pm to fix reboot oops + +Causes oops on reboot due to a race between runtime resume and shutdown +--- + drivers/net/ethernet/intel/igb/igb_main.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c +index 9affd7c198bd..8ade77e75b36 100644 +--- a/drivers/net/ethernet/intel/igb/igb_main.c ++++ b/drivers/net/ethernet/intel/igb/igb_main.c +@@ -238,9 
+238,6 @@ static struct pci_driver igb_driver = { + .id_table = igb_pci_tbl, + .probe = igb_probe, + .remove = igb_remove, +-#ifdef CONFIG_PM +- .driver.pm = &igb_pm_ops, +-#endif + .shutdown = igb_shutdown, + .sriov_configure = igb_pci_sriov_configure, + .err_handler = &igb_err_handler +-- +2.11.1 + diff --git a/patches/boot_time_opt/0124-tweak-perfbias.patch b/patches/boot_time_opt/0124-tweak-perfbias.patch new file mode 100644 index 0000000..56a2865 --- /dev/null +++ b/patches/boot_time_opt/0124-tweak-perfbias.patch @@ -0,0 +1,32 @@ +From 03e2c414a860264511dae5bbfc6d7e62b8b94f0f Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 22 Jan 2017 18:51:13 +0000 +Subject: [PATCH 124/124] tweak perfbias + +--- + arch/x86/kernel/cpu/intel.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index fcd484d2bb03..13ae40f10bd4 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -434,12 +434,12 @@ static void init_intel_energy_perf(struct cpuinfo_x86 *c) + return; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +- if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) ++ if ((epb & 0xF) >= ENERGY_PERF_BIAS_NORMAL) + return; + +- pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); ++ pr_warn_once("ENERGY_PERF_BIAS: Set to 'performance', was 'normal'\n"); + pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); +- epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; ++ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_PERFORMANCE; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + } + +-- +2.11.1 + diff --git a/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch b/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch new file mode 100644 index 0000000..1c50e74 --- /dev/null +++ b/patches/boot_time_opt/0125-e1000e-increase-pause-and-refresh-time.patch @@ -0,0 +1,33 @@ +From 
6730c1ae12a567d56092d15540d2f971be95b936 Mon Sep 17 00:00:00 2001 +From: Miguel Bernal Marin +Date: Mon, 27 Mar 2017 16:01:56 -0600 +Subject: [PATCH] e1000e: increase pause and refresh time + +Suggested-by: Tim Pepper +Signed-off-by: Miguel Bernal Marin +--- + drivers/net/ethernet/intel/e1000e/netdev.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index 7d68d694ed9e..1db390a52656 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -4032,12 +4032,12 @@ void e1000e_reset(struct e1000_adapter *adapter) + case e1000_pch2lan: + case e1000_pch_lpt: + case e1000_pch_spt: +- fc->refresh_time = 0x0400; ++ fc->refresh_time = 0xFFFF; ++ fc->pause_time = 0xFFFF; + + if (adapter->netdev->mtu <= ETH_DATA_LEN) { + fc->high_water = 0x05C20; + fc->low_water = 0x05048; +- fc->pause_time = 0x0650; + break; + } + +-- +2.12.2 + diff --git a/patches/boot_time_opt/0151-mm-Export-do_madvise.patch b/patches/boot_time_opt/0151-mm-Export-do_madvise.patch new file mode 100644 index 0000000..a6dbff7 --- /dev/null +++ b/patches/boot_time_opt/0151-mm-Export-do_madvise.patch @@ -0,0 +1,84 @@ +From 99b4cdcce43ad0f706120bef26fef8c628c572cf Mon Sep 17 00:00:00 2001 +From: Sebastien Boeuf +Date: Mon, 23 Jan 2017 15:03:52 -0800 +Subject: [PATCH 151/154] mm: Export do_madvise() + +Combined with some interesting flags madvise() system call +allows to free memory more smartly and more efficiently than +we could do with a simple free(). The issue is that is not +available for kernel modules that could need it. + +In order to solve this lack of support, this patch exports +do_madvise() so as to make it available to the entire kernel. +The already existing madvise() system call is unchanged and +now relies on this new do_madvise() function. 
+ +Suggested-by: Arjan van de Ven +Signed-off-by: Sebastien Boeuf +--- + include/linux/mm.h | 2 ++ + mm/madvise.c | 25 +++++++++++++++++++++---- + 2 files changed, 23 insertions(+), 4 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 0b5b2e4df14e..925ec25f99a8 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2450,5 +2450,7 @@ void __init setup_nr_node_ids(void); + static inline void setup_nr_node_ids(void) {} + #endif + ++extern int do_madvise(unsigned long start, size_t len_in, int behavior); ++ + #endif /* __KERNEL__ */ + #endif /* _LINUX_MM_H */ +diff --git a/mm/madvise.c b/mm/madvise.c +index 93fb63e88b5e..c8bbf93d4978 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -618,9 +618,7 @@ madvise_behavior_valid(int behavior) + } + + /* +- * The madvise(2) system call. +- * +- * Applications can use madvise() to advise the kernel how it should ++ * Kernel modules can use do_madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the +@@ -673,7 +671,7 @@ madvise_behavior_valid(int behavior) + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) ++int do_madvise(unsigned long start, size_t len_in, int behavior) + { + unsigned long end, tmp; + struct vm_area_struct *vma, *prev; +@@ -767,3 +765,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) + + return error; + } ++EXPORT_SYMBOL_GPL(do_madvise); ++ ++/* ++ * The madvise(2) system call. ++ * ++ * Applications can use madvise() system call to advise the kernel how ++ * it should handle paging I/O in this VM area. The idea is to help ++ * the kernel use appropriate read-ahead and caching techniques. 
The ++ * information provided is advisory only, and can be safely disregarded ++ * by the kernel without affecting the correct operation of the application. ++ * ++ * behavior values are the same than the ones defined in madvise() ++ * ++ * return values are the same than the ones defined in madvise() ++ */ ++SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) ++{ ++ return do_madvise(start, len_in, behavior); ++} +-- +2.12.1 + diff --git a/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch b/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch new file mode 100644 index 0000000..5f44930 --- /dev/null +++ b/patches/boot_time_opt/0152-x86-kvm-Notify-host-to-release-pages.patch @@ -0,0 +1,180 @@ +From d28921b5f797829e4e676f7968ae688ef96b7992 Mon Sep 17 00:00:00 2001 +From: Sebastien Boeuf +Date: Mon, 23 Jan 2017 15:08:55 -0800 +Subject: [PATCH 152/154] x86: kvm: Notify host to release pages + +In context of hypervisors managing several virtual machines, we +want those virtual machines to give the memory they used back to +the host when they don't need it anymore. + +This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing +the guest kernel to notify the host kernel when such event occurs. +And relying on do_madvise() function that we have previously exported, +it issues a call to this function when it receives the new hypercall. + +Use of do_madvise() with MADV_DONTNEED flag will allow the guest to +ask for a new page without going through a new hypercall. Instead, +it will be able to start using that memory again as it will get +faulted back in as a fresh new page. That's why do_madvise() is more +efficient than doing vm_unmap() to return some memory to the host. + +This patch introduces also a new sysctl kvm_madv_instant_free, +allowing user to set MADV_FREE advice instead of MADV_DONTNEED. 
+Indeed, MADV_FREE saves more performances than using MADV_DONTNEED +because it does not zero the pages in case the memory has not been +freed by the kernel. This can happen when there was no need for the +kernel to get this memory back, meaning it was keeping those pages +in the right state to be re-used by the same application. +MADV_FREE being a very recent advice introduced in kernel 4.5, we +only want to enable it through a sysctl in case the user want to +use it. + +Suggested-by: Arjan van de Ven +Signed-off-by: Sebastien Boeuf +--- + arch/x86/kvm/x86.c | 17 +++++++++++++++++ + include/linux/mm.h | 5 +++++ + include/uapi/linux/kvm_para.h | 3 +++ + kernel/sysctl.c | 7 +++++++ + mm/Makefile | 2 +- + mm/kvm.c | 25 +++++++++++++++++++++++++ + 6 files changed, 58 insertions(+), 1 deletion(-) + create mode 100644 mm/kvm.c + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 582c75311f95..683a94dd5f03 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -6019,6 +6020,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) + kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); + } + ++static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len) ++{ ++ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa)); ++ ++ if (len > KVM_MAX_RET_MEM_SIZE) ++ return KVM_EPERM; ++ ++ if (kvm_is_error_hva(start + len)) ++ return KVM_EFAULT; ++ ++ return do_madvise(start, len, kvm_ret_mem_advice); ++} ++ + void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) + { + vcpu->arch.apicv_active = false; +@@ -6065,6 +6079,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; ++ case KVM_HC_RETURN_MEM: ++ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1); ++ break; + default: + ret = -KVM_ENOSYS; + break; +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 
925ec25f99a8..833f23d98baa 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2303,6 +2303,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); + extern int sysctl_drop_caches; + int drop_caches_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); ++extern int sysctl_kvm_madv_instant_free; ++extern int kvm_ret_mem_advice; ++int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *length, ++ loff_t *ppos); + #endif + + void drop_slab(void); +diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h +index bf6cd7d5cac2..7d90f77d87d0 100644 +--- a/include/uapi/linux/kvm_para.h ++++ b/include/uapi/linux/kvm_para.h +@@ -23,6 +23,9 @@ + #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 + #define KVM_HC_MIPS_EXIT_VM 7 + #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 ++#define KVM_HC_RETURN_MEM 10 ++ ++#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB + + /* + * hypercalls use architecture specific +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index c1095cdc0fe2..d8ae774fa042 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -1398,6 +1398,13 @@ static struct ctl_table vm_table[] = { + .extra1 = &one, + .extra2 = &four, + }, ++ { ++ .procname = "kvm_madv_instant_free", ++ .data = &sysctl_kvm_madv_instant_free, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = kvm_madv_instant_free_sysctl_handler, ++ }, + #ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", +diff --git a/mm/Makefile b/mm/Makefile +index 295bd7a..6455723 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -47,6 +47,8 @@ else + obj-y += bootmem.o + endif + ++obj-y += kvm.o ++ + obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o + ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o +diff --git a/mm/kvm.c b/mm/kvm.c +new file mode 100644 +index 000000000000..8945f6a311b9 +--- /dev/null ++++ b/mm/kvm.c +@@ -0,0 +1,25 @@ ++#include ++ ++int sysctl_kvm_madv_instant_free; ++ 
++int kvm_ret_mem_advice = MADV_DONTNEED; ++EXPORT_SYMBOL_GPL(kvm_ret_mem_advice); ++ ++int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ int ret; ++ ++ ret = proc_dointvec(table, write, buffer, length, ppos); ++ if (ret) ++ return ret; ++ ++#ifdef MADV_FREE ++ if (sysctl_kvm_madv_instant_free > 0) ++ kvm_ret_mem_advice = MADV_FREE; ++ else ++ kvm_ret_mem_advice = MADV_DONTNEED; ++#endif ++ ++ return 0; ++} +-- +2.12.1 + diff --git a/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch b/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch new file mode 100644 index 0000000..cdb876a --- /dev/null +++ b/patches/boot_time_opt/0153-x86-Return-memory-from-guest-to-host-kernel.patch @@ -0,0 +1,155 @@ +From 855ef164854307839c08c60688eaeac14f9a649e Mon Sep 17 00:00:00 2001 +From: Sebastien Boeuf +Date: Mon, 23 Jan 2017 15:26:13 -0800 +Subject: [PATCH 153/154] x86: Return memory from guest to host kernel + +All virtual machines need memory to perform various tasks, but this +memory is not released to the host after it is not used anymore. We +have to wait for the termination of the virtual machine to get this +memory back into the host. + +Ballooning mechanism is close but not designed for the same purpose. +In case we hit memory limits of the system, the host predicts how much +memory can be asked back from a guest, and it issues an hypercall to +retrieve this memory. + +The solution proposed is different because it does not wait for host +needs before to return memory, and it knows precisely how much memory +it can return. + +The way to notify the host side about such a return is to rely on +the new hypercall KVM_HC_RETURN_MEM. In order to avoid the CPU to be +overloaded with too many hypercalls, we only return memory blocks of +order 7 (512k blocks) and higher. 
This value was found by running +memory tests using multiple threads allocating/freeing large amounts +of memory. Those tests were run for different order values, and 7 was +the best tradeoff between the number of hypercalls issued and the +amount of memory returned to the host. + +In order to limit the performance impact related to this code addition, +we check for blocks of order 7 or higher. This means it only costs an +additional function call and a branch to perform this check. + +Furthermore, this code has been added to the "merge" codepath of the +buddy allocator, which is not as sensitive as the "free" codepath. +Not all blocks going through the "free" codepath will end up in the +"merge" codepath because some of them won't find their free buddy. +But this is a negligible amount since the kernel does not use many +high order blocks directly. Instead, those bigger blocks are often +broken into smaller chunks used as low order blocks. At the time +those small blocks are released, they go through the merge path. + +Benchmarks such as ebizzy and will-it-scale have been run in order +to make sure this patch does not affect kernel performance and no +significant differences were observed.
+ +Suggested-by: Arjan van de Ven +Signed-off-by: Sebastien Boeuf +--- + arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++ + arch/x86/kernel/kvm.c | 10 ++++++++++ + include/linux/mm-arch-hooks.h | 8 ++++++++ + mm/page_alloc.c | 2 ++ + 4 files changed, 42 insertions(+) + +diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h +index bc62e7cbf1b1..4a2f6d1adbd2 100644 +--- a/arch/x86/include/asm/kvm_para.h ++++ b/arch/x86/include/asm/kvm_para.h +@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token); + void kvm_async_pf_task_wake(u32 token); + u32 kvm_read_and_reset_pf_reason(void); + extern void kvm_disable_steal_time(void); ++void kvm_arch_return_memory(struct page *page, unsigned int order); ++ ++/* ++ * This order has been found in an empirical way, running memory tests ++ * through many iterations to assess the number of hypercalls issued ++ * and the amount of memory returned. In case you change this order to ++ * 6 or 8, it should not impact your performances significantly. ++ * ++ * Smaller values lead to less memory waste, but consume more CPU on ++ * hypercalls. Larger values use less CPU, but do not as precisely ++ * inform the hypervisor of which memory is free. 
++ */ ++#define RET_MEM_BUDDY_ORDER 7 ++ ++static inline void arch_buddy_merge(struct page *page, unsigned int order) ++{ ++ if (order < RET_MEM_BUDDY_ORDER) ++ return; ++ ++ kvm_arch_return_memory(page, order); ++} ++#define arch_buddy_merge arch_buddy_merge + + #ifdef CONFIG_PARAVIRT_SPINLOCKS + void __init kvm_spinlock_init(void); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index edbbfc854e39..14167b3f6514 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -552,6 +552,16 @@ static __init int activate_jump_labels(void) + } + arch_initcall(activate_jump_labels); + ++void kvm_arch_return_memory(struct page *page, unsigned int order) ++{ ++ if (!kvm_para_available()) ++ return; ++ ++ kvm_hypercall2(KVM_HC_RETURN_MEM, ++ page_to_phys(page), ++ PAGE_SIZE << order); ++} ++ + #ifdef CONFIG_PARAVIRT_SPINLOCKS + + /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h +index 4efc3f56e6df..26eb3a05a8a3 100644 +--- a/include/linux/mm-arch-hooks.h ++++ b/include/linux/mm-arch-hooks.h +@@ -12,6 +12,7 @@ + #define _LINUX_MM_ARCH_HOOKS_H + + #include ++#include + + #ifndef arch_remap + static inline void arch_remap(struct mm_struct *mm, +@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm, + #define arch_remap arch_remap + #endif + ++#ifndef arch_buddy_merge ++static inline void arch_buddy_merge(struct page *page, unsigned int order) ++{ ++} ++#define arch_buddy_merge arch_buddy_merge ++#endif ++ + #endif /* _LINUX_MM_ARCH_HOOKS_H */ +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 1460e6ad5e14..5f6e6371bc6f 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -64,6 +64,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -855,6 +856,7 @@ static inline void __free_one_page(struct page *page, + } + + done_merging: ++ arch_buddy_merge(page, order); + set_page_order(page, order); + + /* +-- +2.12.1 + diff --git 
a/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch b/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch new file mode 100644 index 0000000..07d4a83 --- /dev/null +++ b/patches/boot_time_opt/0154-sysctl-vm-Fine-grained-cache-shrinking.patch @@ -0,0 +1,137 @@ +From 2c145b5233b504f5226a0f4bc44baeef33b444d8 Mon Sep 17 00:00:00 2001 +From: Sebastien Boeuf +Date: Mon, 23 Jan 2017 15:32:39 -0800 +Subject: [PATCH 154/154] sysctl: vm: Fine-grained cache shrinking + +Lots of virtual machines are left in an idle state for days until they +are terminated, and they can keep a large amount of memory in their +cache, meaning this memory cannot be used by other processes. + +We tried to release this memory using the existing drop_caches sysctl, +but it led to the complete cache loss while it could have been used +if the idle process wakes up. Indeed, the process can't find any +available cached data and it directly affects performance to rebuild +it from scratch. + +Instead, the solution we want is based on gradually shrinking the system +cache over time. This patch adds a new sysctl shrink_caches_mb so as +to allow userspace applications to tell the kernel that it should shrink +system cache up to the amount (in MiB) specified. + +There is an application called "memshrinker" which uses this new +mechanism. It runs in the background and periodically releases a +specified amount of cache. This amount is based on the remaining +cache on the system, and period is computed to follow a shrinking +model. It results in saving a lot of memory for other processes +running on the system.
+ +Suggested-by: Arjan van de Ven +Signed-off-by: Sebastien Boeuf +--- + fs/drop_caches.c | 25 +++++++++++++++++++++++++ + include/linux/mm.h | 4 ++++ + kernel/sysctl.c | 8 ++++++++ + mm/vmscan.c | 2 -- + 4 files changed, 37 insertions(+), 2 deletions(-) + +diff --git a/fs/drop_caches.c b/fs/drop_caches.c +index d72d52b90433..f564dfcc13a4 100644 +--- a/fs/drop_caches.c ++++ b/fs/drop_caches.c +@@ -8,10 +8,12 @@ + #include + #include + #include ++#include + #include "internal.h" + + /* A global variable is a bit ugly, but it keeps the code simple */ + int sysctl_drop_caches; ++int sysctl_shrink_caches_mb; + + static void drop_pagecache_sb(struct super_block *sb, void *unused) + { +@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, + } + return 0; + } ++ ++int shrink_caches_sysctl_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ int ret; ++ unsigned long nr_to_reclaim, page_reclaimed; ++ ++ ret = proc_dointvec_minmax(table, write, buffer, length, ppos); ++ if (ret) ++ return ret; ++ ++ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE; ++ if (write) { ++ page_reclaimed = shrink_all_memory(nr_to_reclaim); ++ if (page_reclaimed > 0) ++ lru_add_drain_all(); ++ ++ if (page_reclaimed != nr_to_reclaim) ++ return page_reclaimed; ++ } ++ ++ return 0; ++} +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 833f23d98baa..0bb66c1c31c9 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2308,6 +2308,10 @@ extern int kvm_ret_mem_advice; + int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); ++extern int sysctl_shrink_caches_mb; ++int shrink_caches_sysctl_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *length, ++ loff_t *ppos); + #endif + + void drop_slab(void); +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index d8ae774fa042..5dc9a46ae212 100644 
+--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -1405,6 +1405,14 @@ static struct ctl_table vm_table[] = { + .mode = 0644, + .proc_handler = kvm_madv_instant_free_sysctl_handler, + }, ++ { ++ .procname = "shrink_caches_mb", ++ .data = &sysctl_shrink_caches_mb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = shrink_caches_sysctl_handler, ++ .extra1 = &one, ++ }, + #ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 30a88b945a44..1198e74d1860 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3525,7 +3525,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) + wake_up_interruptible(&pgdat->kswapd_wait); + } + +-#ifdef CONFIG_HIBERNATION + /* + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of + * freed pages. +@@ -3564,7 +3563,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) + + return nr_reclaimed; + } +-#endif /* CONFIG_HIBERNATION */ + + /* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. 
So if the last cpu in a node goes +-- +2.12.1 + diff --git a/patches/boot_time_opt/boot_time_opt.scc b/patches/boot_time_opt/boot_time_opt.scc new file mode 100644 index 0000000..1ffb857 --- /dev/null +++ b/patches/boot_time_opt/boot_time_opt.scc @@ -0,0 +1,29 @@ +define KFEATURE_DESCRIPTION "Boot time optimization changes ported from ClearLinux, https://github.com/clearlinux-pkgs/linux-lts and https://github.com/clearlinux-pkgs/linux-kvm" +define KFEATURE_COMPATIBILITY all + +patch 0101-kvm-silence-kvm-unhandled-rdmsr.patch +patch 0102-i8042-decrease-debug-message-level-to-info.patch +watch 0104-Increase-the-ext4-default-commit-age.patch +patch 0105-silence-rapl.patch +patch 0106-pci-pme-wakeups.patch +patch 0107-ksm-wakeups.patch +patch 0108-intel_idle-tweak-cpuidle-cstates.patch +patch 0110-init_task-faster-timerslack.patch +patch 0112-fs-ext4-fsync-optimize-double-fsync-a-bunch.patch +patch 0113-overload-on-wakeup.patch +patch 0114-bootstats-add-printk-s-to-measure-boot-time-in-more-.patch +patch 0115-fix-initcall-timestamps.patch +patch 0116-smpboot-reuse-timer-calibration.patch +patch 0118-Initialize-ata-before-graphics.patch +patch 0119-reduce-e1000e-boot-time-by-tightening-sleep-ranges.patch +patch 0120-give-rdrand-some-credit.patch +patch 0121-e1000e-change-default-policy.patch +watch 0122-ipv4-tcp-allow-the-memory-tuning-for-tcp-to-go-a-lit.patch +patch 0123-igb-no-runtime-pm-to-fix-reboot-oops.patch +patch 0124-tweak-perfbias.patch +patch 0125-e1000e-increase-pause-and-refresh-time.patch + +patch 0151-mm-Export-do_madvise.patch +patch 0152-x86-kvm-Notify-host-to-release-pages.patch +patch 0153-x86-Return-memory-from-guest-to-host-kernel.patch +patch 0154-sysctl-vm-Fine-grained-cache-shrinking.patch diff --git a/patches/boot_time_opt/raid_alg.cfg b/patches/boot_time_opt/raid_alg.cfg new file mode 100644 index 0000000..6df4a7c --- /dev/null +++ b/patches/boot_time_opt/raid_alg.cfg @@ -0,0 +1,3 @@ +CONFIG_RAID6_FORCE_ALGO=y +CONFIG_RAID6_FORCE_INT=y 
+CONFIG_RAID6_FORCE_AVX2=y diff --git a/patches/boot_time_opt/raid_alg.scc b/patches/boot_time_opt/raid_alg.scc new file mode 100644 index 0000000..98dd713 --- /dev/null +++ b/patches/boot_time_opt/raid_alg.scc @@ -0,0 +1,5 @@ +define KFEATURE_DESCRIPTION "Use AVX2 for RAID recovery algorithm" +define KFEATURE_COMPATIBILITY all + +patch 0117-raid6-add-Kconfig-option-to-skip-raid6-benchmarking.patch +kconf non-hardware raid_alg.cfg -- cgit v1.2.3-54-g00ecf