diff options
Diffstat (limited to 'patches')
18 files changed, 1363 insertions, 0 deletions
diff --git a/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch new file mode 100644 index 0000000..1de2a6b --- /dev/null +++ b/patches/boot_time_opt_guest/0102-cpuidle-skip-synchronize_rcu-on-single-CPU-systems.patch | |||
@@ -0,0 +1,34 @@ | |||
1 | From 6b0fb5b2a7a157c04d8ab6ad71b092034d0048bf Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 11 Feb 2015 16:19:26 -0600 | ||
4 | Subject: [PATCH 102/114] cpuidle: skip synchronize_rcu() on single CPU systems | ||
5 | |||
6 | synchronize_rcu() is pretty expensive, and on single CPU systems we don't need | ||
7 | it in this specific case, so skip it. | ||
8 | |||
9 | Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> | ||
10 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
11 | --- | ||
12 | drivers/cpuidle/cpuidle.c | 5 ++++- | ||
13 | 1 file changed, 4 insertions(+), 1 deletion(-) | ||
14 | |||
15 | diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c | ||
16 | index 62810ff3b00f..f1d110411098 100644 | ||
17 | --- a/drivers/cpuidle/cpuidle.c | ||
18 | +++ b/drivers/cpuidle/cpuidle.c | ||
19 | @@ -324,8 +324,11 @@ void cpuidle_uninstall_idle_handler(void) | ||
20 | /* | ||
21 | * Make sure external observers (such as the scheduler) | ||
22 | * are done looking at pointed idle states. | ||
23 | + * This is only relevant if there is more than one cpu, | ||
24 | + * if there is only one CPU, that is us... and we're | ||
25 | + * coherent to ourselves. | ||
26 | */ | ||
27 | - synchronize_rcu(); | ||
28 | + | ||
29 | } | ||
30 | |||
31 | /** | ||
32 | -- | ||
33 | 2.11.1 | ||
34 | |||
diff --git a/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch new file mode 100644 index 0000000..d3a20fb --- /dev/null +++ b/patches/boot_time_opt_guest/0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch | |||
@@ -0,0 +1,38 @@ | |||
1 | From 7be707833bb35c295eb702d13cf73ac9390e4b31 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 11 Feb 2015 16:25:16 -0600 | ||
4 | Subject: [PATCH 103/114] sysrq: skip synchronize_rcu() if there is no old op | ||
5 | |||
6 | synchronize_rcu() is expensive. Currently it is called as part of the sysrq | ||
7 | registration/unregistration, which happens during boot several times. | ||
8 | Now, the reason for the synchronize_rcu() is to allow an old registered | ||
9 | operation to expire properly... which is pointless if the old operation | ||
10 | is NULL... | ||
11 | So we can save a lot of time in the common case where the old operation is NULL | ||
12 | by just checking for non-NULL prior to calling synchronize_rcu(). | ||
13 | |||
14 | Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> | ||
15 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
16 | --- | ||
17 | drivers/tty/sysrq.c | 4 +++- | ||
18 | 1 file changed, 3 insertions(+), 1 deletion(-) | ||
19 | |||
20 | diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c | ||
21 | index 701c085bb19b..c60c7ba57ad9 100644 | ||
22 | --- a/drivers/tty/sysrq.c | ||
23 | +++ b/drivers/tty/sysrq.c | ||
24 | @@ -1065,8 +1065,10 @@ static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, | ||
25 | * A concurrent __handle_sysrq either got the old op or the new op. | ||
26 | * Wait for it to go away before returning, so the code for an old | ||
27 | * op is not freed (eg. on module unload) while it is in use. | ||
28 | + * This is only relevant if the old op is not NULL of course. | ||
29 | */ | ||
30 | - synchronize_rcu(); | ||
31 | + if (remove_op_p) | ||
32 | + synchronize_rcu(); | ||
33 | |||
34 | return retval; | ||
35 | } | ||
36 | -- | ||
37 | 2.11.1 | ||
38 | |||
diff --git a/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch new file mode 100644 index 0000000..715c195 --- /dev/null +++ b/patches/boot_time_opt_guest/0104-fbcon-enable-no-blink-by-default.patch | |||
@@ -0,0 +1,26 @@ | |||
1 | From 5899ff79ed4e3514420e1530a3588a922832dae5 Mon Sep 17 00:00:00 2001 | ||
2 | From: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com> | ||
3 | Date: Mon, 13 Apr 2015 11:26:36 -0500 | ||
4 | Subject: [PATCH 104/114] fbcon: enable no blink by default | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | --- | ||
8 | drivers/video/console/fbcon.c | 2 +- | ||
9 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
10 | |||
11 | diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c | ||
12 | index a44f5627b82a..95b73366b86f 100644 | ||
13 | --- a/drivers/video/console/fbcon.c | ||
14 | +++ b/drivers/video/console/fbcon.c | ||
15 | @@ -146,7 +146,7 @@ static const struct consw fb_con; | ||
16 | |||
17 | static int fbcon_set_origin(struct vc_data *); | ||
18 | |||
19 | -static int fbcon_cursor_noblink; | ||
20 | +static int fbcon_cursor_noblink = 1; | ||
21 | |||
22 | #define divides(a, b) ((!(a) || (b)%(a)) ? 0 : 1) | ||
23 | |||
24 | -- | ||
25 | 2.11.1 | ||
26 | |||
diff --git a/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch new file mode 100644 index 0000000..09b109a --- /dev/null +++ b/patches/boot_time_opt_guest/0105-vmstats-wakeups.patch | |||
@@ -0,0 +1,28 @@ | |||
1 | From ff47b4e9be8113b4ba05d6f2afee3db6904bc10f Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 11 Feb 2015 16:47:20 -0600 | ||
4 | Subject: [PATCH 105/114] vmstats: wakeups | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | |||
8 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
9 | --- | ||
10 | mm/vmstat.c | 2 +- | ||
11 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
12 | |||
13 | diff --git a/mm/vmstat.c b/mm/vmstat.c | ||
14 | index 7c28df36f50f..efe1b6797139 100644 | ||
15 | --- a/mm/vmstat.c | ||
16 | +++ b/mm/vmstat.c | ||
17 | @@ -1549,7 +1549,7 @@ static const struct file_operations proc_vmstat_file_operations = { | ||
18 | #ifdef CONFIG_SMP | ||
19 | static struct workqueue_struct *vmstat_wq; | ||
20 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | ||
21 | -int sysctl_stat_interval __read_mostly = HZ; | ||
22 | +int sysctl_stat_interval __read_mostly = 8 * HZ; | ||
23 | |||
24 | #ifdef CONFIG_PROC_FS | ||
25 | static void refresh_vm_stats(struct work_struct *work) | ||
26 | -- | ||
27 | 2.11.1 | ||
28 | |||
diff --git a/patches/boot_time_opt_guest/0106-pci-probe.patch b/patches/boot_time_opt_guest/0106-pci-probe.patch new file mode 100644 index 0000000..5045926 --- /dev/null +++ b/patches/boot_time_opt_guest/0106-pci-probe.patch | |||
@@ -0,0 +1,123 @@ | |||
1 | From b225caf8f743b9f5f9e84d0df711ee0c17e049ae Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 11 Feb 2015 16:53:08 -0600 | ||
4 | Subject: [PATCH 106/114] pci: probe | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | |||
8 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
9 | --- | ||
10 | drivers/pci/probe.c | 43 ++++++++++++++++++++++++++++++++++++++++--- | ||
11 | 1 file changed, 40 insertions(+), 3 deletions(-) | ||
12 | |||
13 | diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c | ||
14 | index 204960e70333..7399a06698da 100644 | ||
15 | --- a/drivers/pci/probe.c | ||
16 | +++ b/drivers/pci/probe.c | ||
17 | @@ -182,6 +182,10 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, | ||
18 | |||
19 | mask = type ? PCI_ROM_ADDRESS_MASK : ~0; | ||
20 | |||
21 | + res->name = pci_name(dev); | ||
22 | + | ||
23 | + printk("clr: Starting probe for %s\n", res->name); | ||
24 | + | ||
25 | /* No printks while decoding is disabled! */ | ||
26 | if (!dev->mmio_always_on) { | ||
27 | pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); | ||
28 | @@ -191,8 +195,6 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, | ||
29 | } | ||
30 | } | ||
31 | |||
32 | - res->name = pci_name(dev); | ||
33 | - | ||
34 | pci_read_config_dword(dev, pos, &l); | ||
35 | pci_write_config_dword(dev, pos, l | mask); | ||
36 | pci_read_config_dword(dev, pos, &sz); | ||
37 | @@ -324,6 +326,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) | ||
38 | if (dev->non_compliant_bars) | ||
39 | return; | ||
40 | |||
41 | + printk("clr: pci_read_bases start\n"); | ||
42 | + | ||
43 | for (pos = 0; pos < howmany; pos++) { | ||
44 | struct resource *res = &dev->resource[pos]; | ||
45 | reg = PCI_BASE_ADDRESS_0 + (pos << 2); | ||
46 | @@ -332,11 +336,13 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) | ||
47 | |||
48 | if (rom) { | ||
49 | struct resource *res = &dev->resource[PCI_ROM_RESOURCE]; | ||
50 | + printk("clr: rom path\n"); | ||
51 | dev->rom_base_reg = rom; | ||
52 | res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | | ||
53 | IORESOURCE_READONLY | IORESOURCE_SIZEALIGN; | ||
54 | __pci_read_base(dev, pci_bar_mem32, res, rom); | ||
55 | } | ||
56 | + printk("clr: pci_read_bases end\n"); | ||
57 | } | ||
58 | |||
59 | static void pci_read_bridge_io(struct pci_bus *child) | ||
60 | @@ -1311,6 +1317,28 @@ static void pci_msi_setup_pci_dev(struct pci_dev *dev) | ||
61 | pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); | ||
62 | } | ||
63 | |||
64 | +static int guess_bar_count(int class) | ||
65 | +{ | ||
66 | + if (class == 0x068000) | ||
67 | + return 0; | ||
68 | + if (class == 0x020000) | ||
69 | + return 2; | ||
70 | + if (class == 0x010000) | ||
71 | + return 2; | ||
72 | + if (class == 0x00ff00) | ||
73 | + return 1; | ||
74 | + return 6; | ||
75 | +} | ||
76 | + | ||
77 | +static int has_rom(int class, int rom) | ||
78 | +{ | ||
79 | + if (class == 0x020000) | ||
80 | + return 0; | ||
81 | + if (class == 0x010000 || class == 0x00ff00) | ||
82 | + return 0; | ||
83 | + return rom; | ||
84 | +} | ||
85 | + | ||
86 | /** | ||
87 | * pci_setup_device - fill in class and map information of a device | ||
88 | * @dev: the device structure to fill | ||
89 | @@ -1329,6 +1357,9 @@ int pci_setup_device(struct pci_dev *dev) | ||
90 | int pos = 0; | ||
91 | struct pci_bus_region region; | ||
92 | struct resource *res; | ||
93 | + int maxbar; | ||
94 | + | ||
95 | + printk("clr: pci_setup_device start\n"); | ||
96 | |||
97 | if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type)) | ||
98 | return -EIO; | ||
99 | @@ -1383,7 +1414,11 @@ int pci_setup_device(struct pci_dev *dev) | ||
100 | if (class == PCI_CLASS_BRIDGE_PCI) | ||
101 | goto bad; | ||
102 | pci_read_irq(dev); | ||
103 | - pci_read_bases(dev, 6, PCI_ROM_ADDRESS); | ||
104 | + | ||
105 | + maxbar = guess_bar_count(dev->class); | ||
106 | + | ||
107 | + if (class != PCI_CLASS_STORAGE_IDE) | ||
108 | + pci_read_bases(dev, maxbar, has_rom(dev->class, PCI_ROM_ADDRESS)); | ||
109 | pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); | ||
110 | pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device); | ||
111 | |||
112 | @@ -1468,6 +1503,8 @@ int pci_setup_device(struct pci_dev *dev) | ||
113 | dev->class = PCI_CLASS_NOT_DEFINED << 8; | ||
114 | } | ||
115 | |||
116 | + printk("clr: pci_setup_device end\n"); | ||
117 | + | ||
118 | /* We found a fine healthy device, go go go... */ | ||
119 | return 0; | ||
120 | } | ||
121 | -- | ||
122 | 2.11.1 | ||
123 | |||
diff --git a/patches/boot_time_opt_guest/0107-cgroup.patch b/patches/boot_time_opt_guest/0107-cgroup.patch new file mode 100644 index 0000000..d68c686 --- /dev/null +++ b/patches/boot_time_opt_guest/0107-cgroup.patch | |||
@@ -0,0 +1,107 @@ | |||
1 | From 0adc5bfd84939d11d3c172eab0a00bfab4aadb46 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Fri, 28 Aug 2015 11:00:36 -0500 | ||
4 | Subject: [PATCH 107/114] cgroup | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | |||
8 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
9 | Signed-off-by: Jose Carlos Venegas Munoz <jos.c.venegas.munoz@intel.com> | ||
10 | --- | ||
11 | include/linux/cgroup-defs.h | 2 +- | ||
12 | kernel/cgroup.c | 24 ++++++++++++++---------- | ||
13 | 2 files changed, 15 insertions(+), 11 deletions(-) | ||
14 | |||
15 | diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h | ||
16 | index 861b4677fc5b..5d3c345ee60c 100644 | ||
17 | --- a/include/linux/cgroup-defs.h | ||
18 | +++ b/include/linux/cgroup-defs.h | ||
19 | @@ -137,7 +137,7 @@ struct cgroup_subsys_state { | ||
20 | |||
21 | /* percpu_ref killing and RCU release */ | ||
22 | struct rcu_head rcu_head; | ||
23 | - struct work_struct destroy_work; | ||
24 | + struct delayed_work destroy_work; | ||
25 | }; | ||
26 | |||
27 | /* | ||
28 | diff --git a/kernel/cgroup.c b/kernel/cgroup.c | ||
29 | index 53bbca7c4859..6de39d8213ed 100644 | ||
30 | --- a/kernel/cgroup.c | ||
31 | +++ b/kernel/cgroup.c | ||
32 | @@ -73,7 +73,7 @@ | ||
33 | * Expiring in the middle is a performance problem not a correctness one. | ||
34 | * 1 sec should be enough. | ||
35 | */ | ||
36 | -#define CGROUP_PIDLIST_DESTROY_DELAY HZ | ||
37 | +#define CGROUP_PIDLIST_DESTROY_DELAY round_jiffies_relative(HZ) | ||
38 | |||
39 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ | ||
40 | MAX_CFTYPE_NAME + 2) | ||
41 | @@ -4986,8 +4986,9 @@ static struct cftype cgroup_legacy_base_files[] = { | ||
42 | */ | ||
43 | static void css_free_work_fn(struct work_struct *work) | ||
44 | { | ||
45 | + struct delayed_work *dwork = to_delayed_work(work); | ||
46 | struct cgroup_subsys_state *css = | ||
47 | - container_of(work, struct cgroup_subsys_state, destroy_work); | ||
48 | + container_of(dwork, struct cgroup_subsys_state, destroy_work); | ||
49 | struct cgroup_subsys *ss = css->ss; | ||
50 | struct cgroup *cgrp = css->cgroup; | ||
51 | |||
52 | @@ -5036,14 +5037,15 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | ||
53 | struct cgroup_subsys_state *css = | ||
54 | container_of(rcu_head, struct cgroup_subsys_state, rcu_head); | ||
55 | |||
56 | - INIT_WORK(&css->destroy_work, css_free_work_fn); | ||
57 | - queue_work(cgroup_destroy_wq, &css->destroy_work); | ||
58 | + INIT_DELAYED_WORK(&css->destroy_work, css_free_work_fn); | ||
59 | + queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY); | ||
60 | } | ||
61 | |||
62 | static void css_release_work_fn(struct work_struct *work) | ||
63 | { | ||
64 | + struct delayed_work *dwork = to_delayed_work(work); | ||
65 | struct cgroup_subsys_state *css = | ||
66 | - container_of(work, struct cgroup_subsys_state, destroy_work); | ||
67 | + container_of(dwork, struct cgroup_subsys_state, destroy_work); | ||
68 | struct cgroup_subsys *ss = css->ss; | ||
69 | struct cgroup *cgrp = css->cgroup; | ||
70 | |||
71 | @@ -5088,8 +5090,9 @@ static void css_release(struct percpu_ref *ref) | ||
72 | struct cgroup_subsys_state *css = | ||
73 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
74 | |||
75 | - INIT_WORK(&css->destroy_work, css_release_work_fn); | ||
76 | - queue_work(cgroup_destroy_wq, &css->destroy_work); | ||
77 | + INIT_DELAYED_WORK(&css->destroy_work, css_release_work_fn); | ||
78 | + queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY); | ||
79 | + | ||
80 | } | ||
81 | |||
82 | static void init_and_link_css(struct cgroup_subsys_state *css, | ||
83 | @@ -5371,8 +5374,9 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
84 | */ | ||
85 | static void css_killed_work_fn(struct work_struct *work) | ||
86 | { | ||
87 | + struct delayed_work *dwork = to_delayed_work(work); | ||
88 | struct cgroup_subsys_state *css = | ||
89 | - container_of(work, struct cgroup_subsys_state, destroy_work); | ||
90 | + container_of(dwork, struct cgroup_subsys_state, destroy_work); | ||
91 | |||
92 | mutex_lock(&cgroup_mutex); | ||
93 | |||
94 | @@ -5393,8 +5397,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | ||
95 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
96 | |||
97 | if (atomic_dec_and_test(&css->online_cnt)) { | ||
98 | - INIT_WORK(&css->destroy_work, css_killed_work_fn); | ||
99 | - queue_work(cgroup_destroy_wq, &css->destroy_work); | ||
100 | + INIT_DELAYED_WORK(&css->destroy_work, css_killed_work_fn); | ||
101 | + queue_delayed_work(cgroup_destroy_wq, &css->destroy_work, CGROUP_PIDLIST_DESTROY_DELAY); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | -- | ||
106 | 2.11.1 | ||
107 | |||
diff --git a/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch new file mode 100644 index 0000000..48be94a --- /dev/null +++ b/patches/boot_time_opt_guest/0108-smpboot-reuse-timer-calibration.patch | |||
@@ -0,0 +1,45 @@ | |||
1 | From 634947be6c24d844af5f6ecf59453f2ddc09e032 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 11 Feb 2015 17:28:14 -0600 | ||
4 | Subject: [PATCH 108/114] smpboot: reuse timer calibration | ||
5 | |||
6 | No point recalibrating for a known-constant TSC... saves 200ms+ of boot time. | ||
7 | |||
8 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
9 | |||
10 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
11 | --- | ||
12 | arch/x86/kernel/smpboot.c | 2 +- | ||
13 | arch/x86/kernel/tsc.c | 3 +++ | ||
14 | 2 files changed, 4 insertions(+), 1 deletion(-) | ||
15 | |||
16 | diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c | ||
17 | index 99b920d0e516..e17bb425bb52 100644 | ||
18 | --- a/arch/x86/kernel/smpboot.c | ||
19 | +++ b/arch/x86/kernel/smpboot.c | ||
20 | @@ -761,7 +761,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) | ||
21 | pr_debug("Waiting for send to finish...\n"); | ||
22 | send_status = safe_apic_wait_icr_idle(); | ||
23 | |||
24 | - udelay(init_udelay); | ||
25 | + udelay(100); | ||
26 | |||
27 | pr_debug("Deasserting INIT\n"); | ||
28 | |||
29 | diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c | ||
30 | index 37e7cf544e51..e99be8a6a132 100644 | ||
31 | --- a/arch/x86/kernel/tsc.c | ||
32 | +++ b/arch/x86/kernel/tsc.c | ||
33 | @@ -1413,6 +1413,9 @@ unsigned long calibrate_delay_is_known(void) | ||
34 | if (!mask) | ||
35 | return 0; | ||
36 | |||
37 | + if (cpu !=0) | ||
38 | + return cpu_data(0).loops_per_jiffy; | ||
39 | + | ||
40 | sibling = cpumask_any_but(mask, cpu); | ||
41 | if (sibling < nr_cpu_ids) | ||
42 | return cpu_data(sibling).loops_per_jiffy; | ||
43 | -- | ||
44 | 2.11.1 | ||
45 | |||
diff --git a/patches/boot_time_opt_guest/0109-perf.patch b/patches/boot_time_opt_guest/0109-perf.patch new file mode 100644 index 0000000..75f50f6 --- /dev/null +++ b/patches/boot_time_opt_guest/0109-perf.patch | |||
@@ -0,0 +1,28 @@ | |||
1 | From cce700dfbd5fdbf72b96e6479ca539ab4d880ce2 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Wed, 4 Nov 2015 15:17:10 -0600 | ||
4 | Subject: [PATCH 109/114] perf | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | |||
8 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
9 | --- | ||
10 | arch/x86/events/intel/core.c | 2 +- | ||
11 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
12 | |||
13 | diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c | ||
14 | index eb1484c86bb4..c13ea26ac066 100644 | ||
15 | --- a/arch/x86/events/intel/core.c | ||
16 | +++ b/arch/x86/events/intel/core.c | ||
17 | @@ -4040,7 +4040,7 @@ __init int intel_pmu_init(void) | ||
18 | */ | ||
19 | if (x86_pmu.extra_regs) { | ||
20 | for (er = x86_pmu.extra_regs; er->msr; er++) { | ||
21 | - er->extra_msr_access = check_msr(er->msr, 0x11UL); | ||
22 | + er->extra_msr_access = false; | ||
23 | /* Disable LBR select mapping */ | ||
24 | if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) | ||
25 | x86_pmu.lbr_sel_map = NULL; | ||
26 | -- | ||
27 | 2.11.1 | ||
28 | |||
diff --git a/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch new file mode 100644 index 0000000..742a045 --- /dev/null +++ b/patches/boot_time_opt_guest/0110-pci-probe-identify-known-devices.patch | |||
@@ -0,0 +1,190 @@ | |||
1 | From c662d99134b67c58e63ecc17c2531588a3a51596 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Sat, 14 Feb 2015 09:49:41 -0600 | ||
4 | Subject: [PATCH 110/114] pci: probe: identify known devices | ||
5 | |||
6 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
7 | Modify-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
8 | |||
9 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
10 | --- | ||
11 | drivers/pci/probe.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++ | ||
12 | 1 file changed, 156 insertions(+) | ||
13 | |||
14 | diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c | ||
15 | index 7399a06698da..4fb2d7fed4c5 100644 | ||
16 | --- a/drivers/pci/probe.c | ||
17 | +++ b/drivers/pci/probe.c | ||
18 | @@ -163,6 +163,159 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar) | ||
19 | |||
20 | #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO) | ||
21 | |||
22 | +/* shortcut version of __pci_read_base where we know the sizes already */ | ||
23 | +int __pci_read_base_shortcut(struct pci_dev *dev, enum pci_bar_type type, | ||
24 | + struct resource *res, unsigned int pos, u32 sz_in, u32 sz2_in) | ||
25 | +{ | ||
26 | + u32 l, sz; | ||
27 | + u64 l64, sz64, mask64; | ||
28 | + struct pci_bus_region region, inverted_region; | ||
29 | + | ||
30 | + res->name = pci_name(dev); | ||
31 | + | ||
32 | + pci_read_config_dword(dev, pos, &l); | ||
33 | + | ||
34 | + sz = sz_in; | ||
35 | + | ||
36 | + /* | ||
37 | + * All bits set in sz means the device isn't working properly. | ||
38 | + * If the BAR isn't implemented, all bits must be 0. If it's a | ||
39 | + * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit | ||
40 | + * 1 must be clear. | ||
41 | + * Here we set the size and is not 0xffffffff | ||
42 | + */ | ||
43 | + | ||
44 | + /* | ||
45 | + * I don't know how l can have all bits set. Copied from old code. | ||
46 | + * Maybe it fixes a bug on some ancient platform. | ||
47 | + */ | ||
48 | + if (l == 0xffffffff) | ||
49 | + l = 0; | ||
50 | + | ||
51 | + if (type == pci_bar_unknown) { | ||
52 | + res->flags = decode_bar(dev, l); | ||
53 | + res->flags |= IORESOURCE_SIZEALIGN; | ||
54 | + if (res->flags & IORESOURCE_IO) { | ||
55 | + l64 = l & PCI_BASE_ADDRESS_IO_MASK; | ||
56 | + sz64 = sz & PCI_BASE_ADDRESS_IO_MASK; | ||
57 | + mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT; | ||
58 | + } else { | ||
59 | + l64 = l & PCI_BASE_ADDRESS_MEM_MASK; | ||
60 | + sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK; | ||
61 | + mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK; | ||
62 | + } | ||
63 | + } else { | ||
64 | + res->flags |= (l & IORESOURCE_ROM_ENABLE); | ||
65 | + l64 = l & PCI_ROM_ADDRESS_MASK; | ||
66 | + sz64 = sz & PCI_ROM_ADDRESS_MASK; | ||
67 | + mask64 = (u32)PCI_ROM_ADDRESS_MASK; | ||
68 | + } | ||
69 | + | ||
70 | + if (res->flags & IORESOURCE_MEM_64) { | ||
71 | + pci_read_config_dword(dev, pos + 4, &l); | ||
72 | + sz = sz2_in; | ||
73 | + | ||
74 | + l64 |= ((u64)l << 32); | ||
75 | + sz64 |= ((u64)sz << 32); | ||
76 | + mask64 |= ((u64)~0 << 32); | ||
77 | + } | ||
78 | + | ||
79 | + if (!sz64) | ||
80 | + goto fail; | ||
81 | + | ||
82 | + sz64 = pci_size(l64, sz64, mask64); | ||
83 | + if (!sz64) { | ||
84 | + dev_info(&dev->dev, FW_BUG "reg 0x%x: invalid BAR (can't size)\n", | ||
85 | + pos); | ||
86 | + goto fail; | ||
87 | + } | ||
88 | + | ||
89 | + if (res->flags & IORESOURCE_MEM_64) { | ||
90 | + if ((sizeof(dma_addr_t) < 8 || sizeof(resource_size_t) < 8) && | ||
91 | + sz64 > 0x100000000ULL) { | ||
92 | + res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED; | ||
93 | + res->start = 0; | ||
94 | + res->end = 0; | ||
95 | + dev_err(&dev->dev, "reg 0x%x: can't handle BAR larger than 4GB (size %#010llx)\n", | ||
96 | + pos, (unsigned long long)sz64); | ||
97 | + goto out; | ||
98 | + } | ||
99 | + | ||
100 | + if ((sizeof(dma_addr_t) < 8) && l) { | ||
101 | + /* Above 32-bit boundary; try to reallocate */ | ||
102 | + res->flags |= IORESOURCE_UNSET; | ||
103 | + res->start = 0; | ||
104 | + res->end = sz64; | ||
105 | + dev_info(&dev->dev, "reg 0x%x: can't handle BAR above 4GB (bus address %#010llx)\n", | ||
106 | + pos, (unsigned long long)l64); | ||
107 | + goto out; | ||
108 | + } | ||
109 | + } | ||
110 | + | ||
111 | + region.start = l64; | ||
112 | + region.end = l64 + sz64; | ||
113 | + | ||
114 | + pcibios_bus_to_resource(dev->bus, res, ®ion); | ||
115 | + pcibios_resource_to_bus(dev->bus, &inverted_region, res); | ||
116 | + | ||
117 | + /* | ||
118 | + * If "A" is a BAR value (a bus address), "bus_to_resource(A)" is | ||
119 | + * the corresponding resource address (the physical address used by | ||
120 | + * the CPU. Converting that resource address back to a bus address | ||
121 | + * should yield the original BAR value: | ||
122 | + * | ||
123 | + * resource_to_bus(bus_to_resource(A)) == A | ||
124 | + * | ||
125 | + * If it doesn't, CPU accesses to "bus_to_resource(A)" will not | ||
126 | + * be claimed by the device. | ||
127 | + */ | ||
128 | + if (inverted_region.start != region.start) { | ||
129 | + res->flags |= IORESOURCE_UNSET; | ||
130 | + res->start = 0; | ||
131 | + res->end = region.end - region.start; | ||
132 | + dev_info(&dev->dev, "reg 0x%x: initial BAR value %#010llx invalid\n", | ||
133 | + pos, (unsigned long long)region.start); | ||
134 | + } | ||
135 | + | ||
136 | + goto out; | ||
137 | + | ||
138 | + | ||
139 | +fail: | ||
140 | + res->flags = 0; | ||
141 | +out: | ||
142 | + if (res->flags) | ||
143 | + dev_printk(KERN_DEBUG, &dev->dev, "reg 0x%x: %pR\n", pos, res); | ||
144 | + | ||
145 | + return (res->flags & IORESOURCE_MEM_64) ? 1 : 0; | ||
146 | +} | ||
147 | + | ||
148 | +static int is_known_device(struct pci_dev *dev, int pos, int *sz) | ||
149 | +{ | ||
150 | + /* Red Hat, Inc : Virtio network device */ | ||
151 | + if (dev->vendor == 0x1af4 && dev->device == 0x1000) { | ||
152 | + if (pos == 0x10) { | ||
153 | + *sz = 0xffffffe1; | ||
154 | + return 1; | ||
155 | + } | ||
156 | + if (pos == 0x14) { | ||
157 | + *sz = 0xfffff000; | ||
158 | + return 1; | ||
159 | + } | ||
160 | + } | ||
161 | + /* Red Hat, Inc : Virtio block device */ | ||
162 | + if (dev->vendor == 0x1af4 && dev->device == 0x1001) { | ||
163 | + if (pos == 0x10) { | ||
164 | + *sz = 0xffffffc1; | ||
165 | + return 1; | ||
166 | + } | ||
167 | + if (pos == 0x14) { | ||
168 | + *sz = 0xfffff000; | ||
169 | + return 1; | ||
170 | + } | ||
171 | + } | ||
172 | + return 0; | ||
173 | +} | ||
174 | + | ||
175 | /** | ||
176 | * pci_read_base - read a PCI BAR | ||
177 | * @dev: the PCI device | ||
178 | @@ -182,6 +335,9 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, | ||
179 | |||
180 | mask = type ? PCI_ROM_ADDRESS_MASK : ~0; | ||
181 | |||
182 | + if (is_known_device(dev, pos, &sz)) | ||
183 | + return __pci_read_base_shortcut(dev, type, res, pos, sz, 0); | ||
184 | + | ||
185 | res->name = pci_name(dev); | ||
186 | |||
187 | printk("clr: Starting probe for %s\n", res->name); | ||
188 | -- | ||
189 | 2.11.1 | ||
190 | |||
diff --git a/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch new file mode 100644 index 0000000..701a18d --- /dev/null +++ b/patches/boot_time_opt_guest/0111-init-no-wait-for-the-known-devices.patch | |||
@@ -0,0 +1,39 @@ | |||
1 | From be2ab4809c6b5058fbf3cd54c0f59c56416e572c Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Mon, 22 Jun 2015 09:33:33 -0500 | ||
4 | Subject: [PATCH 111/114] init: no wait for the known devices | ||
5 | |||
6 | Do not wait for the known devices to complete their probing. | ||
7 | |||
8 | Author: Arjan van de Ven <arjan@linux.intel.com> | ||
9 | |||
10 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
11 | --- | ||
12 | init/do_mounts.c | 4 +++- | ||
13 | 1 file changed, 3 insertions(+), 1 deletion(-) | ||
14 | |||
15 | diff --git a/init/do_mounts.c b/init/do_mounts.c | ||
16 | index c2de5104aad2..40725f0f5fb3 100644 | ||
17 | --- a/init/do_mounts.c | ||
18 | +++ b/init/do_mounts.c | ||
19 | @@ -28,6 +28,7 @@ | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/ramfs.h> | ||
22 | #include <linux/shmem_fs.h> | ||
23 | +#include <linux/async.h> | ||
24 | |||
25 | #include <linux/nfs_fs.h> | ||
26 | #include <linux/nfs_fs_sb.h> | ||
27 | @@ -563,7 +564,8 @@ void __init prepare_namespace(void) | ||
28 | * For example, it is not atypical to wait 5 seconds here | ||
29 | * for the touchpad of a laptop to initialize. | ||
30 | */ | ||
31 | - wait_for_device_probe(); | ||
32 | + //wait_for_device_probe(); | ||
33 | + async_synchronize_full(); | ||
34 | |||
35 | md_run_setup(); | ||
36 | |||
37 | -- | ||
38 | 2.11.1 | ||
39 | |||
diff --git a/patches/boot_time_opt_guest/0112-ksm-wakeups.patch b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch new file mode 100644 index 0000000..b131e3f --- /dev/null +++ b/patches/boot_time_opt_guest/0112-ksm-wakeups.patch | |||
@@ -0,0 +1,32 @@ | |||
1 | From 2dc48e4b5c651691b7028991b64c935047b41b19 Mon Sep 17 00:00:00 2001 | ||
2 | From: Arjan van de Ven <arjan@linux.intel.com> | ||
3 | Date: Mon, 14 Mar 2016 11:06:46 -0600 | ||
4 | Subject: [PATCH 112/114] ksm-wakeups | ||
5 | |||
6 | reduce wakeups in ksm | ||
7 | --- | ||
8 | mm/ksm.c | 8 ++++++-- | ||
9 | 1 file changed, 6 insertions(+), 2 deletions(-) | ||
10 | |||
11 | diff --git a/mm/ksm.c b/mm/ksm.c | ||
12 | index 9ae6011a41f8..eecd3ff669e2 100644 | ||
13 | --- a/mm/ksm.c | ||
14 | +++ b/mm/ksm.c | ||
15 | @@ -1725,8 +1725,12 @@ static int ksm_scan_thread(void *nothing) | ||
16 | try_to_freeze(); | ||
17 | |||
18 | if (ksmd_should_run()) { | ||
19 | - schedule_timeout_interruptible( | ||
20 | - msecs_to_jiffies(ksm_thread_sleep_millisecs)); | ||
21 | + if (ksm_thread_sleep_millisecs >= 1000) | ||
22 | + schedule_timeout_interruptible( | ||
23 | + msecs_to_jiffies(round_jiffies_relative(ksm_thread_sleep_millisecs))); | ||
24 | + else | ||
25 | + schedule_timeout_interruptible( | ||
26 | + msecs_to_jiffies(ksm_thread_sleep_millisecs)); | ||
27 | } else { | ||
28 | wait_event_freezable(ksm_thread_wait, | ||
29 | ksmd_should_run() || kthread_should_stop()); | ||
30 | -- | ||
31 | 2.11.1 | ||
32 | |||
diff --git a/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch new file mode 100644 index 0000000..047eddb --- /dev/null +++ b/patches/boot_time_opt_guest/0113-init-do_mounts-recreate-dev-root.patch | |||
@@ -0,0 +1,42 @@ | |||
1 | From 179b7f41d5509f93cd297cc81c5d8da4a3123d9d Mon Sep 17 00:00:00 2001 | ||
2 | From: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
3 | Date: Fri, 20 Nov 2015 14:01:26 -0600 | ||
4 | Subject: [PATCH 113/114] init: do_mounts: recreate /dev/root | ||
5 | |||
6 | Rootfs shows as mounted on /dev/root, but this device is not present in | ||
7 | the /dev directory. | ||
8 | |||
9 | Signed-off-by: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com> | ||
10 | --- | ||
11 | init/do_mounts.c | 8 ++++++++ | ||
12 | 1 file changed, 8 insertions(+) | ||
13 | |||
14 | diff --git a/init/do_mounts.c b/init/do_mounts.c | ||
15 | index 40725f0f5fb3..78b5b1dba8ca 100644 | ||
16 | --- a/init/do_mounts.c | ||
17 | +++ b/init/do_mounts.c | ||
18 | @@ -550,6 +550,7 @@ void __init mount_root(void) | ||
19 | void __init prepare_namespace(void) | ||
20 | { | ||
21 | int is_floppy; | ||
22 | + int err; | ||
23 | |||
24 | if (root_delay) { | ||
25 | printk(KERN_INFO "Waiting %d sec before mounting root device...\n", | ||
26 | @@ -604,6 +605,13 @@ void __init prepare_namespace(void) | ||
27 | devtmpfs_mount("dev"); | ||
28 | sys_mount(".", "/", NULL, MS_MOVE, NULL); | ||
29 | sys_chroot("."); | ||
30 | +#ifdef CONFIG_BLOCK | ||
31 | + /* recreate the /dev/root */ | ||
32 | + err = create_dev("/dev/root", ROOT_DEV); | ||
33 | + | ||
34 | + if (err < 0) | ||
35 | + pr_emerg("Failed to create /dev/root: %d\n", err); | ||
36 | +#endif | ||
37 | } | ||
38 | |||
39 | static bool is_tmpfs; | ||
40 | -- | ||
41 | 2.11.1 | ||
42 | |||
diff --git a/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch new file mode 100644 index 0000000..dee9058 --- /dev/null +++ b/patches/boot_time_opt_guest/0114-xattr-allow-setting-user.-attributes-on-symlinks-by-.patch | |||
@@ -0,0 +1,56 @@ | |||
1 | From 02fd2e6a7c708bf973209f9b238c5c61cbf15239 Mon Sep 17 00:00:00 2001 | ||
2 | From: Alan Cox <alan@linux.intel.com> | ||
3 | Date: Thu, 10 Mar 2016 15:11:28 +0000 | ||
4 | Subject: [PATCH 114/114] xattr: allow setting user.* attributes on symlinks by | ||
5 | owner | ||
6 | |||
7 | Kvmtool and clear containers support using user attributes to label host | ||
8 | files with the virtual uid/guid of the file in the container. This allows an | ||
9 | end user to manage their files and a complete uid space without all the ugly | ||
10 | namespace stuff. | ||
11 | |||
12 | The one gap in the support is symlinks because an end user can change the | ||
13 | ownership of a symbolic link. We support attributes on these files as you | ||
14 | can already (as root) set security attributes on them. | ||
15 | |||
16 | The current rules seem slightly over-paranoid and as we have a use case this | ||
17 | patch enables updating the attributes on a symbolic link IFF you are the | ||
18 | owner of the symlink (as permissions are not usually meaningful on the link | ||
19 | itself). | ||
20 | |||
21 | Signed-off-by: Alan Cox <alan@linux.intel.com> | ||
22 | --- | ||
23 | fs/xattr.c | 14 ++++++++------ | ||
24 | 1 file changed, 8 insertions(+), 6 deletions(-) | ||
25 | |||
26 | diff --git a/fs/xattr.c b/fs/xattr.c | ||
27 | index 7e3317cf4045..e005c30acb2c 100644 | ||
28 | --- a/fs/xattr.c | ||
29 | +++ b/fs/xattr.c | ||
30 | @@ -118,15 +118,17 @@ xattr_permission(struct inode *inode, const char *name, int mask) | ||
31 | } | ||
32 | |||
33 | /* | ||
34 | - * In the user.* namespace, only regular files and directories can have | ||
35 | - * extended attributes. For sticky directories, only the owner and | ||
36 | - * privileged users can write attributes. | ||
37 | + * In the user.* namespace, only regular files, symbolic links, and | ||
38 | + * directories can have extended attributes. For symbolic links and | ||
39 | + * sticky directories, only the owner and privileged users can write | ||
40 | + * attributes. | ||
41 | */ | ||
42 | if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { | ||
43 | - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) | ||
44 | + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) | ||
45 | return (mask & MAY_WRITE) ? -EPERM : -ENODATA; | ||
46 | - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && | ||
47 | - (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) | ||
48 | + if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) | ||
49 | + || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) | ||
50 | + && !inode_owner_or_capable(inode)) | ||
51 | return -EPERM; | ||
52 | } | ||
53 | |||
54 | -- | ||
55 | 2.11.1 | ||
56 | |||
diff --git a/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch new file mode 100644 index 0000000..a6dbff7 --- /dev/null +++ b/patches/boot_time_opt_guest/0151-mm-Export-do_madvise.patch | |||
@@ -0,0 +1,84 @@ | |||
1 | From 99b4cdcce43ad0f706120bef26fef8c628c572cf Mon Sep 17 00:00:00 2001 | ||
2 | From: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
3 | Date: Mon, 23 Jan 2017 15:03:52 -0800 | ||
4 | Subject: [PATCH 151/154] mm: Export do_madvise() | ||
5 | |||
6 | Combined with some interesting flags, the madvise() system call | ||
7 | allows freeing memory more smartly and more efficiently than | ||
8 | we could do with a simple free(). The issue is that it is not | ||
9 | available to kernel modules that could need it. | ||
10 | |||
11 | In order to solve this lack of support, this patch exports | ||
12 | do_madvise() so as to make it available to the entire kernel. | ||
13 | The already existing madvise() system call is unchanged and | ||
14 | now relies on this new do_madvise() function. | ||
15 | |||
16 | Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com> | ||
17 | Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
18 | --- | ||
19 | include/linux/mm.h | 2 ++ | ||
20 | mm/madvise.c | 25 +++++++++++++++++++++---- | ||
21 | 2 files changed, 23 insertions(+), 4 deletions(-) | ||
22 | |||
23 | diff --git a/include/linux/mm.h b/include/linux/mm.h | ||
24 | index 0b5b2e4df14e..925ec25f99a8 100644 | ||
25 | --- a/include/linux/mm.h | ||
26 | +++ b/include/linux/mm.h | ||
27 | @@ -2450,5 +2450,7 @@ void __init setup_nr_node_ids(void); | ||
28 | static inline void setup_nr_node_ids(void) {} | ||
29 | #endif | ||
30 | |||
31 | +extern int do_madvise(unsigned long start, size_t len_in, int behavior); | ||
32 | + | ||
33 | #endif /* __KERNEL__ */ | ||
34 | #endif /* _LINUX_MM_H */ | ||
35 | diff --git a/mm/madvise.c b/mm/madvise.c | ||
36 | index 93fb63e88b5e..c8bbf93d4978 100644 | ||
37 | --- a/mm/madvise.c | ||
38 | +++ b/mm/madvise.c | ||
39 | @@ -618,9 +618,7 @@ madvise_behavior_valid(int behavior) | ||
40 | } | ||
41 | |||
42 | /* | ||
43 | - * The madvise(2) system call. | ||
44 | - * | ||
45 | - * Applications can use madvise() to advise the kernel how it should | ||
46 | + * Kernel modules can use do_madvise() to advise the kernel how it should | ||
47 | * handle paging I/O in this VM area. The idea is to help the kernel | ||
48 | * use appropriate read-ahead and caching techniques. The information | ||
49 | * provided is advisory only, and can be safely disregarded by the | ||
50 | @@ -673,7 +671,7 @@ madvise_behavior_valid(int behavior) | ||
51 | * -EBADF - map exists, but area maps something that isn't a file. | ||
52 | * -EAGAIN - a kernel resource was temporarily unavailable. | ||
53 | */ | ||
54 | -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | ||
55 | +int do_madvise(unsigned long start, size_t len_in, int behavior) | ||
56 | { | ||
57 | unsigned long end, tmp; | ||
58 | struct vm_area_struct *vma, *prev; | ||
59 | @@ -767,3 +765,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | ||
60 | |||
61 | return error; | ||
62 | } | ||
63 | +EXPORT_SYMBOL_GPL(do_madvise); | ||
64 | + | ||
65 | +/* | ||
66 | + * The madvise(2) system call. | ||
67 | + * | ||
68 | + * Applications can use madvise() system call to advise the kernel how | ||
69 | + * it should handle paging I/O in this VM area. The idea is to help | ||
70 | + * the kernel use appropriate read-ahead and caching techniques. The | ||
71 | + * information provided is advisory only, and can be safely disregarded | ||
72 | + * by the kernel without affecting the correct operation of the application. | ||
73 | + * | ||
74 | + * behavior values are the same as the ones defined in do_madvise() | ||
75 | + * | ||
76 | + * return values are the same as the ones defined in do_madvise() | ||
77 | + */ | ||
78 | +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | ||
79 | +{ | ||
80 | + return do_madvise(start, len_in, behavior); | ||
81 | +} | ||
82 | -- | ||
83 | 2.12.1 | ||
84 | |||
diff --git a/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch new file mode 100644 index 0000000..ff9d8c0 --- /dev/null +++ b/patches/boot_time_opt_guest/0152-x86-kvm-Notify-host-to-release-pages.patch | |||
@@ -0,0 +1,180 @@ | |||
1 | From d28921b5f797829e4e676f7968ae688ef96b7992 Mon Sep 17 00:00:00 2001 | ||
2 | From: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
3 | Date: Mon, 23 Jan 2017 15:08:55 -0800 | ||
4 | Subject: [PATCH 152/154] x86: kvm: Notify host to release pages | ||
5 | |||
6 | In context of hypervisors managing several virtual machines, we | ||
7 | want those virtual machines to give the memory they used back to | ||
8 | the host when they don't need it anymore. | ||
9 | |||
10 | This patch introduces a new hypercall KVM_HC_RETURN_MEM, allowing | ||
11 | the guest kernel to notify the host kernel when such event occurs. | ||
12 | And relying on do_madvise() function that we have previously exported, | ||
13 | it issues a call to this function when it receives the new hypercall. | ||
14 | |||
15 | Use of do_madvise() with MADV_DONTNEED flag will allow the guest to | ||
16 | ask for a new page without going through a new hypercall. Instead, | ||
17 | it will be able to start using that memory again as it will get | ||
18 | faulted back in as a fresh new page. That's why do_madvise() is more | ||
19 | efficient than doing vm_unmap() to return some memory to the host. | ||
20 | |||
21 | This patch also introduces a new sysctl, kvm_madv_instant_free, | ||
22 | allowing the user to set MADV_FREE advice instead of MADV_DONTNEED. | ||
23 | Indeed, MADV_FREE performs better than MADV_DONTNEED | ||
24 | because it does not zero the pages in case the memory has not been | ||
25 | freed by the kernel. This can happen when there was no need for the | ||
26 | kernel to get this memory back, meaning it was keeping those pages | ||
27 | in the right state to be re-used by the same application. | ||
28 | MADV_FREE being a very recent advice introduced in kernel 4.5, we | ||
29 | only want to enable it through a sysctl in case the user wants to | ||
30 | use it. | ||
31 | |||
32 | Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com> | ||
33 | Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
34 | --- | ||
35 | arch/x86/kvm/x86.c | 17 +++++++++++++++++ | ||
36 | include/linux/mm.h | 5 +++++ | ||
37 | include/uapi/linux/kvm_para.h | 3 +++ | ||
38 | kernel/sysctl.c | 7 +++++++ | ||
39 | mm/Makefile | 2 +- | ||
40 | mm/kvm.c | 25 +++++++++++++++++++++++++ | ||
41 | 6 files changed, 58 insertions(+), 1 deletion(-) | ||
42 | create mode 100644 mm/kvm.c | ||
43 | |||
44 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c | ||
45 | index 582c75311f95..683a94dd5f03 100644 | ||
46 | --- a/arch/x86/kvm/x86.c | ||
47 | +++ b/arch/x86/kvm/x86.c | ||
48 | @@ -46,6 +46,7 @@ | ||
49 | #include <linux/user-return-notifier.h> | ||
50 | #include <linux/srcu.h> | ||
51 | #include <linux/slab.h> | ||
52 | +#include <linux/mm.h> | ||
53 | #include <linux/perf_event.h> | ||
54 | #include <linux/uaccess.h> | ||
55 | #include <linux/hash.h> | ||
56 | @@ -6019,6 +6020,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) | ||
57 | kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); | ||
58 | } | ||
59 | |||
60 | +static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len) | ||
61 | +{ | ||
62 | + unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa)); | ||
63 | + | ||
64 | + if (len > KVM_MAX_RET_MEM_SIZE) | ||
65 | + return KVM_EPERM; | ||
66 | + | ||
67 | + if (kvm_is_error_hva(start + len)) | ||
68 | + return KVM_EFAULT; | ||
69 | + | ||
70 | + return do_madvise(start, len, kvm_ret_mem_advice); | ||
71 | +} | ||
72 | + | ||
73 | void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) | ||
74 | { | ||
75 | vcpu->arch.apicv_active = false; | ||
76 | @@ -6065,6 +6079,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | ||
77 | kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); | ||
78 | ret = 0; | ||
79 | break; | ||
80 | + case KVM_HC_RETURN_MEM: | ||
81 | + ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1); | ||
82 | + break; | ||
83 | default: | ||
84 | ret = -KVM_ENOSYS; | ||
85 | break; | ||
86 | diff --git a/include/linux/mm.h b/include/linux/mm.h | ||
87 | index 925ec25f99a8..833f23d98baa 100644 | ||
88 | --- a/include/linux/mm.h | ||
89 | +++ b/include/linux/mm.h | ||
90 | @@ -2303,6 +2303,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); | ||
91 | extern int sysctl_drop_caches; | ||
92 | int drop_caches_sysctl_handler(struct ctl_table *, int, | ||
93 | void __user *, size_t *, loff_t *); | ||
94 | +extern int sysctl_kvm_madv_instant_free; | ||
95 | +extern int kvm_ret_mem_advice; | ||
96 | +int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, | ||
97 | + void __user *buffer, size_t *length, | ||
98 | + loff_t *ppos); | ||
99 | #endif | ||
100 | |||
101 | void drop_slab(void); | ||
102 | diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h | ||
103 | index bf6cd7d5cac2..7d90f77d87d0 100644 | ||
104 | --- a/include/uapi/linux/kvm_para.h | ||
105 | +++ b/include/uapi/linux/kvm_para.h | ||
106 | @@ -23,6 +23,9 @@ | ||
107 | #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 | ||
108 | #define KVM_HC_MIPS_EXIT_VM 7 | ||
109 | #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 | ||
110 | +#define KVM_HC_RETURN_MEM 10 | ||
111 | + | ||
112 | +#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB | ||
113 | |||
114 | /* | ||
115 | * hypercalls use architecture specific | ||
116 | diff --git a/kernel/sysctl.c b/kernel/sysctl.c | ||
117 | index c1095cdc0fe2..d8ae774fa042 100644 | ||
118 | --- a/kernel/sysctl.c | ||
119 | +++ b/kernel/sysctl.c | ||
120 | @@ -1398,6 +1398,13 @@ static struct ctl_table vm_table[] = { | ||
121 | .extra1 = &one, | ||
122 | .extra2 = &four, | ||
123 | }, | ||
124 | + { | ||
125 | + .procname = "kvm_madv_instant_free", | ||
126 | + .data = &sysctl_kvm_madv_instant_free, | ||
127 | + .maxlen = sizeof(int), | ||
128 | + .mode = 0644, | ||
129 | + .proc_handler = kvm_madv_instant_free_sysctl_handler, | ||
130 | + }, | ||
131 | #ifdef CONFIG_COMPACTION | ||
132 | { | ||
133 | .procname = "compact_memory", | ||
134 | diff --git a/mm/Makefile b/mm/Makefile | ||
135 | index 295bd7a9f76b..651ce0aff140 100644 | ||
136 | --- a/mm/Makefile | ||
137 | +++ b/mm/Makefile | ||
138 | @@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ | ||
139 | mm_init.o mmu_context.o percpu.o slab_common.o \ | ||
140 | compaction.o vmacache.o \ | ||
141 | interval_tree.o list_lru.o workingset.o \ | ||
142 | - prfile.o debug.o $(mmu-y) | ||
143 | + prfile.o debug.o kvm.o $(mmu-y) | ||
144 | |||
145 | obj-y += init-mm.o | ||
146 | |||
147 | diff --git a/mm/kvm.c b/mm/kvm.c | ||
148 | new file mode 100644 | ||
149 | index 000000000000..8945f6a311b9 | ||
150 | --- /dev/null | ||
151 | +++ b/mm/kvm.c | ||
152 | @@ -0,0 +1,25 @@ | ||
153 | +#include <linux/mman.h> | ||
154 | + | ||
155 | +int sysctl_kvm_madv_instant_free; | ||
156 | + | ||
157 | +int kvm_ret_mem_advice = MADV_DONTNEED; | ||
158 | +EXPORT_SYMBOL_GPL(kvm_ret_mem_advice); | ||
159 | + | ||
160 | +int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, | ||
161 | + void __user *buffer, size_t *length, loff_t *ppos) | ||
162 | +{ | ||
163 | + int ret; | ||
164 | + | ||
165 | + ret = proc_dointvec(table, write, buffer, length, ppos); | ||
166 | + if (ret) | ||
167 | + return ret; | ||
168 | + | ||
169 | +#ifdef MADV_FREE | ||
170 | + if (sysctl_kvm_madv_instant_free > 0) | ||
171 | + kvm_ret_mem_advice = MADV_FREE; | ||
172 | + else | ||
173 | + kvm_ret_mem_advice = MADV_DONTNEED; | ||
174 | +#endif | ||
175 | + | ||
176 | + return 0; | ||
177 | +} | ||
178 | -- | ||
179 | 2.12.1 | ||
180 | |||
diff --git a/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch new file mode 100644 index 0000000..cdb876a --- /dev/null +++ b/patches/boot_time_opt_guest/0153-x86-Return-memory-from-guest-to-host-kernel.patch | |||
@@ -0,0 +1,155 @@ | |||
1 | From 855ef164854307839c08c60688eaeac14f9a649e Mon Sep 17 00:00:00 2001 | ||
2 | From: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
3 | Date: Mon, 23 Jan 2017 15:26:13 -0800 | ||
4 | Subject: [PATCH 153/154] x86: Return memory from guest to host kernel | ||
5 | |||
6 | All virtual machines need memory to perform various tasks, but this | ||
7 | memory is not released to the host after it is not used anymore. We | ||
8 | have to wait for the termination of the virtual machine to get this | ||
9 | memory back into the host. | ||
10 | |||
11 | Ballooning mechanism is close but not designed for the same purpose. | ||
12 | In case we hit memory limits of the system, the host predicts how much | ||
13 | memory can be asked back from a guest, and it issues a hypercall to | ||
14 | retrieve this memory. | ||
15 | |||
16 | The solution proposed is different because it does not wait for the | ||
17 | host to need memory before returning it, and it knows precisely how | ||
18 | much memory it can return. | ||
19 | |||
20 | The way to notify the host side about such a return is to rely on | ||
21 | the new hypercall KVM_HC_RETURN_MEM. In order to avoid overloading the | ||
22 | CPU with too many hypercalls, we only return memory blocks of | ||
23 | order 7 (512k blocks) and higher. This value has been found by running | ||
24 | memory tests using multiple threads allocating/freeing large amounts | ||
25 | of memory. Those tests were run for different order values, and 7 was | ||
26 | the best tradeoff between the number of hypercalls issued and the | ||
27 | amount of memory returned to the host. | ||
28 | |||
29 | In order to limit the performance impact of this code addition, | ||
30 | we check for blocks of order 7 or higher. This means it only costs an | ||
31 | additional function call and a branch to perform this check. | ||
32 | |||
33 | Furthermore, this code has been added to the "merge" codepath of the | ||
34 | buddy allocator, which is not as sensitive as the "free" codepath. | ||
35 | Not all blocks going through the "free" codepath will end up in the | ||
36 | "merge" codepath because some of them won't find their free buddy. | ||
37 | But this is a negligible amount since the kernel does not use many | ||
38 | high order blocks directly. Instead, those bigger blocks are often | ||
39 | broken into smaller chunks used as low order blocks. At the time | ||
40 | those small blocks are released, they go through the merge path. | ||
41 | |||
42 | Benchmarks such as ebizzy and will-it-scale have been run in order | ||
43 | to make sure this patch does not affect kernel performance, and no | ||
44 | significant differences were observed. | ||
45 | |||
46 | Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com> | ||
47 | Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
48 | --- | ||
49 | arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++ | ||
50 | arch/x86/kernel/kvm.c | 10 ++++++++++ | ||
51 | include/linux/mm-arch-hooks.h | 8 ++++++++ | ||
52 | mm/page_alloc.c | 2 ++ | ||
53 | 4 files changed, 42 insertions(+) | ||
54 | |||
55 | diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h | ||
56 | index bc62e7cbf1b1..4a2f6d1adbd2 100644 | ||
57 | --- a/arch/x86/include/asm/kvm_para.h | ||
58 | +++ b/arch/x86/include/asm/kvm_para.h | ||
59 | @@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token); | ||
60 | void kvm_async_pf_task_wake(u32 token); | ||
61 | u32 kvm_read_and_reset_pf_reason(void); | ||
62 | extern void kvm_disable_steal_time(void); | ||
63 | +void kvm_arch_return_memory(struct page *page, unsigned int order); | ||
64 | + | ||
65 | +/* | ||
66 | + * This order has been found in an empirical way, running memory tests | ||
67 | + * through many iterations to assess the number of hypercalls issued | ||
68 | + * and the amount of memory returned. In case you change this order to | ||
69 | + * 6 or 8, it should not impact your performance significantly. | ||
70 | + * | ||
71 | + * Smaller values lead to less memory waste, but consume more CPU on | ||
72 | + * hypercalls. Larger values use less CPU, but do not as precisely | ||
73 | + * inform the hypervisor of which memory is free. | ||
74 | + */ | ||
75 | +#define RET_MEM_BUDDY_ORDER 7 | ||
76 | + | ||
77 | +static inline void arch_buddy_merge(struct page *page, unsigned int order) | ||
78 | +{ | ||
79 | + if (order < RET_MEM_BUDDY_ORDER) | ||
80 | + return; | ||
81 | + | ||
82 | + kvm_arch_return_memory(page, order); | ||
83 | +} | ||
84 | +#define arch_buddy_merge arch_buddy_merge | ||
85 | |||
86 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
87 | void __init kvm_spinlock_init(void); | ||
88 | diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c | ||
89 | index edbbfc854e39..14167b3f6514 100644 | ||
90 | --- a/arch/x86/kernel/kvm.c | ||
91 | +++ b/arch/x86/kernel/kvm.c | ||
92 | @@ -552,6 +552,16 @@ static __init int activate_jump_labels(void) | ||
93 | } | ||
94 | arch_initcall(activate_jump_labels); | ||
95 | |||
96 | +void kvm_arch_return_memory(struct page *page, unsigned int order) | ||
97 | +{ | ||
98 | + if (!kvm_para_available()) | ||
99 | + return; | ||
100 | + | ||
101 | + kvm_hypercall2(KVM_HC_RETURN_MEM, | ||
102 | + page_to_phys(page), | ||
103 | + PAGE_SIZE << order); | ||
104 | +} | ||
105 | + | ||
106 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
107 | |||
108 | /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ | ||
109 | diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h | ||
110 | index 4efc3f56e6df..26eb3a05a8a3 100644 | ||
111 | --- a/include/linux/mm-arch-hooks.h | ||
112 | +++ b/include/linux/mm-arch-hooks.h | ||
113 | @@ -12,6 +12,7 @@ | ||
114 | #define _LINUX_MM_ARCH_HOOKS_H | ||
115 | |||
116 | #include <asm/mm-arch-hooks.h> | ||
117 | +#include <asm/kvm_para.h> | ||
118 | |||
119 | #ifndef arch_remap | ||
120 | static inline void arch_remap(struct mm_struct *mm, | ||
121 | @@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm, | ||
122 | #define arch_remap arch_remap | ||
123 | #endif | ||
124 | |||
125 | +#ifndef arch_buddy_merge | ||
126 | +static inline void arch_buddy_merge(struct page *page, unsigned int order) | ||
127 | +{ | ||
128 | +} | ||
129 | +#define arch_buddy_merge arch_buddy_merge | ||
130 | +#endif | ||
131 | + | ||
132 | #endif /* _LINUX_MM_ARCH_HOOKS_H */ | ||
133 | diff --git a/mm/page_alloc.c b/mm/page_alloc.c | ||
134 | index 1460e6ad5e14..5f6e6371bc6f 100644 | ||
135 | --- a/mm/page_alloc.c | ||
136 | +++ b/mm/page_alloc.c | ||
137 | @@ -64,6 +64,7 @@ | ||
138 | #include <linux/page_owner.h> | ||
139 | #include <linux/kthread.h> | ||
140 | #include <linux/memcontrol.h> | ||
141 | +#include <linux/mm-arch-hooks.h> | ||
142 | |||
143 | #include <asm/sections.h> | ||
144 | #include <asm/tlbflush.h> | ||
145 | @@ -855,6 +856,7 @@ static inline void __free_one_page(struct page *page, | ||
146 | } | ||
147 | |||
148 | done_merging: | ||
149 | + arch_buddy_merge(page, order); | ||
150 | set_page_order(page, order); | ||
151 | |||
152 | /* | ||
153 | -- | ||
154 | 2.12.1 | ||
155 | |||
diff --git a/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch new file mode 100644 index 0000000..07d4a83 --- /dev/null +++ b/patches/boot_time_opt_guest/0154-sysctl-vm-Fine-grained-cache-shrinking.patch | |||
@@ -0,0 +1,137 @@ | |||
1 | From 2c145b5233b504f5226a0f4bc44baeef33b444d8 Mon Sep 17 00:00:00 2001 | ||
2 | From: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
3 | Date: Mon, 23 Jan 2017 15:32:39 -0800 | ||
4 | Subject: [PATCH 154/154] sysctl: vm: Fine-grained cache shrinking | ||
5 | |||
6 | Lots of virtual machines are left in an idle state for days until they | ||
7 | are terminated, and they can keep a large amount of memory in their | ||
8 | cache, meaning this memory cannot be used by other processes. | ||
9 | |||
10 | We tried to release this memory using the existing drop_caches sysctl, | ||
11 | but it led to complete loss of a cache that could still have been used | ||
12 | if the idle process wakes up. Indeed, the process then can't find any | ||
13 | cached data, and rebuilding it from scratch directly affects | ||
14 | performance. | ||
15 | |||
16 | Instead, the solution we want is based on gradually shrinking the system | ||
17 | cache over time. This patch adds a new sysctl, shrink_caches_mb, so as | ||
18 | to allow userspace applications to tell the kernel that it should shrink | ||
19 | the system cache by up to the amount (in MiB) specified. | ||
20 | |||
21 | There is an application called "memshrinker" which uses this new | ||
22 | mechanism. It runs in the background and periodically releases a | ||
23 | specified amount of cache. This amount is based on the remaining | ||
24 | cache on the system, and period is computed to follow a shrinking | ||
25 | model. It results in saving a lot of memory for other processes | ||
26 | running on the system. | ||
27 | |||
28 | Suggested-by: Arjan van de Ven <arjan.van.de.ven@intel.com> | ||
29 | Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com> | ||
30 | --- | ||
31 | fs/drop_caches.c | 25 +++++++++++++++++++++++++ | ||
32 | include/linux/mm.h | 4 ++++ | ||
33 | kernel/sysctl.c | 8 ++++++++ | ||
34 | mm/vmscan.c | 2 -- | ||
35 | 4 files changed, 37 insertions(+), 2 deletions(-) | ||
36 | |||
37 | diff --git a/fs/drop_caches.c b/fs/drop_caches.c | ||
38 | index d72d52b90433..f564dfcc13a4 100644 | ||
39 | --- a/fs/drop_caches.c | ||
40 | +++ b/fs/drop_caches.c | ||
41 | @@ -8,10 +8,12 @@ | ||
42 | #include <linux/writeback.h> | ||
43 | #include <linux/sysctl.h> | ||
44 | #include <linux/gfp.h> | ||
45 | +#include <linux/swap.h> | ||
46 | #include "internal.h" | ||
47 | |||
48 | /* A global variable is a bit ugly, but it keeps the code simple */ | ||
49 | int sysctl_drop_caches; | ||
50 | +int sysctl_shrink_caches_mb; | ||
51 | |||
52 | static void drop_pagecache_sb(struct super_block *sb, void *unused) | ||
53 | { | ||
54 | @@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, | ||
55 | } | ||
56 | return 0; | ||
57 | } | ||
58 | + | ||
59 | +int shrink_caches_sysctl_handler(struct ctl_table *table, int write, | ||
60 | + void __user *buffer, size_t *length, loff_t *ppos) | ||
61 | +{ | ||
62 | + int ret; | ||
63 | + unsigned long nr_to_reclaim, page_reclaimed; | ||
64 | + | ||
65 | + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
66 | + if (ret) | ||
67 | + return ret; | ||
68 | + | ||
69 | + nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE; | ||
70 | + if (write) { | ||
71 | + page_reclaimed = shrink_all_memory(nr_to_reclaim); | ||
72 | + if (page_reclaimed > 0) | ||
73 | + lru_add_drain_all(); | ||
74 | + | ||
75 | + if (page_reclaimed != nr_to_reclaim) | ||
76 | + return page_reclaimed; | ||
77 | + } | ||
78 | + | ||
79 | + return 0; | ||
80 | +} | ||
81 | diff --git a/include/linux/mm.h b/include/linux/mm.h | ||
82 | index 833f23d98baa..0bb66c1c31c9 100644 | ||
83 | --- a/include/linux/mm.h | ||
84 | +++ b/include/linux/mm.h | ||
85 | @@ -2308,6 +2308,10 @@ extern int kvm_ret_mem_advice; | ||
86 | int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write, | ||
87 | void __user *buffer, size_t *length, | ||
88 | loff_t *ppos); | ||
89 | +extern int sysctl_shrink_caches_mb; | ||
90 | +int shrink_caches_sysctl_handler(struct ctl_table *table, int write, | ||
91 | + void __user *buffer, size_t *length, | ||
92 | + loff_t *ppos); | ||
93 | #endif | ||
94 | |||
95 | void drop_slab(void); | ||
96 | diff --git a/kernel/sysctl.c b/kernel/sysctl.c | ||
97 | index d8ae774fa042..5dc9a46ae212 100644 | ||
98 | --- a/kernel/sysctl.c | ||
99 | +++ b/kernel/sysctl.c | ||
100 | @@ -1405,6 +1405,14 @@ static struct ctl_table vm_table[] = { | ||
101 | .mode = 0644, | ||
102 | .proc_handler = kvm_madv_instant_free_sysctl_handler, | ||
103 | }, | ||
104 | + { | ||
105 | + .procname = "shrink_caches_mb", | ||
106 | + .data = &sysctl_shrink_caches_mb, | ||
107 | + .maxlen = sizeof(int), | ||
108 | + .mode = 0644, | ||
109 | + .proc_handler = shrink_caches_sysctl_handler, | ||
110 | + .extra1 = &one, | ||
111 | + }, | ||
112 | #ifdef CONFIG_COMPACTION | ||
113 | { | ||
114 | .procname = "compact_memory", | ||
115 | diff --git a/mm/vmscan.c b/mm/vmscan.c | ||
116 | index 30a88b945a44..1198e74d1860 100644 | ||
117 | --- a/mm/vmscan.c | ||
118 | +++ b/mm/vmscan.c | ||
119 | @@ -3525,7 +3525,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | ||
120 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
121 | } | ||
122 | |||
123 | -#ifdef CONFIG_HIBERNATION | ||
124 | /* | ||
125 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of | ||
126 | * freed pages. | ||
127 | @@ -3564,7 +3563,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | ||
128 | |||
129 | return nr_reclaimed; | ||
130 | } | ||
131 | -#endif /* CONFIG_HIBERNATION */ | ||
132 | |||
133 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | ||
134 | not required for correctness. So if the last cpu in a node goes | ||
135 | -- | ||
136 | 2.12.1 | ||
137 | |||
diff --git a/patches/boot_time_opt_guest/guest_boot_time_opt.scc b/patches/boot_time_opt_guest/guest_boot_time_opt.scc new file mode 100644 index 0000000..3636c01 --- /dev/null +++ b/patches/boot_time_opt_guest/guest_boot_time_opt.scc | |||
@@ -0,0 +1,19 @@ | |||
1 | define KFEATURE_DESCRIPTION "Boot time optimization changes ported from ClearLinux , https://github.com/clearlinux-pkgs/linux-kvm" | ||
2 | define KFEATURE_COMPATIBILITY all | ||
3 | |||
4 | patch 0103-sysrq-skip-synchronize_rcu-if-there-is-no-old-op.patch | ||
5 | patch 0104-fbcon-enable-no-blink-by-default.patch | ||
6 | patch 0105-vmstats-wakeups.patch | ||
7 | # Remove patch because it causes ixgbevf to not initialize correctly in the guest | ||
8 | #patch 0106-pci-probe.patch | ||
9 | patch 0107-cgroup.patch | ||
10 | patch 0108-smpboot-reuse-timer-calibration.patch | ||
11 | patch 0109-perf.patch | ||
12 | patch 0110-pci-probe-identify-known-devices.patch | ||
13 | patch 0111-init-no-wait-for-the-known-devices.patch | ||
14 | patch 0112-ksm-wakeups.patch | ||
15 | |||
16 | patch 0151-mm-Export-do_madvise.patch | ||
17 | patch 0152-x86-kvm-Notify-host-to-release-pages.patch | ||
18 | patch 0153-x86-Return-memory-from-guest-to-host-kernel.patch | ||
19 | patch 0154-sysctl-vm-Fine-grained-cache-shrinking.patch | ||