From 3207d4a42e4e8297a07fdc5a22cddc05dc906bf0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Tue, 23 Jun 2015 01:26:52 -0500
Subject: [PATCH 01/38] i8042: decrease debug message level to info

Author: Arjan van de Ven

Signed-off-by: Miguel Bernal Marin
Signed-off-by: Jose Carlos Venegas Munoz
---
 drivers/input/serio/i8042.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 8ec4872b447145..baf7a0d9c332f4 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -625,7 +625,7 @@ static int i8042_enable_kbd_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_KBDINT;
 		i8042_ctr |= I8042_CTR_KBDDIS;
-		pr_err("Failed to enable KBD port\n");
+		pr_info("Failed to enable KBD port\n");
 		return -EIO;
 	}
 
@@ -644,7 +644,7 @@ static int i8042_enable_aux_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_AUXINT;
 		i8042_ctr |= I8042_CTR_AUXDIS;
-		pr_err("Failed to enable AUX port\n");
+		pr_info("Failed to enable AUX port\n");
 		return -EIO;
 	}
 
@@ -736,7 +736,7 @@ static int i8042_check_mux(void)
 	i8042_ctr &= ~I8042_CTR_AUXINT;
 
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
-		pr_err("Failed to disable AUX port, can't use MUX\n");
+		pr_info("Failed to disable AUX port, can't use MUX\n");
 		return -EIO;
 	}
 
@@ -959,7 +959,7 @@ static int i8042_controller_selftest(void)
 
 	do {
 		if (i8042_command(&param, I8042_CMD_CTL_TEST)) {
-			pr_err("i8042 controller selftest timeout\n");
+			pr_info("i8042 controller selftest timeout\n");
 			return -ENODEV;
 		}
 
@@ -981,7 +981,7 @@ static int i8042_controller_selftest(void)
 	pr_info("giving up on controller selftest, continuing anyway...\n");
 	return 0;
 #else
-	pr_err("i8042 controller selftest failed\n");
+	pr_info("i8042 controller selftest failed\n");
 	return -EIO;
 #endif
 }

From 532300cb711cd22b87a5ee613d1d992304b2891c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 11 Jan 2016 10:01:44 -0600
Subject: [PATCH 02/38] increase the ext4 default commit age

Both the VM and EXT4 have a "commit to disk after X seconds" time.
Currently the EXT4 time is shorter than our VM time, which is a bit
suboptimal: it's better for performance to let the VM do the writeouts
in bulk rather than something deep in the journalling layer.

(DISTRO TWEAK -- NOT FOR UPSTREAM)

Signed-off-by: Arjan van de Ven
Signed-off-by: Jose Carlos Venegas Munoz
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8aef9bb6ad5735..ff2eb4ab1842b2 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -45,7 +45,7 @@
 /*
  * The default maximum commit age, in seconds.
  */
-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5
+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30
 
 #ifdef CONFIG_JBD2_DEBUG
 /*

From 0d78620362ca1a3e7af197b76006731e4430dea7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 14 Mar 2016 11:10:58 -0600
Subject: [PATCH 03/38] pci pme wakeups

Reduce wakeups for PME checks, which are a workaround for miswired
boards (sadly, too many of them) in laptops.
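For context, the PME check is a self-rearming poll. A minimal sketch of
the pattern (not the exact drivers/pci/pci.c code; the device-list walk
is omitted) shows where the larger interval takes effect:

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	#define PME_TIMEOUT 4000	/* ms between polls after this patch (was 1000) */

	static void pme_poll_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(pme_poll_work, pme_poll_fn);

	static void pme_poll_fn(struct work_struct *work)
	{
		/* ... check devices whose PME pin is miswired ... */

		/* re-arm: 4x fewer timer wakeups than the old 1000 ms interval */
		queue_delayed_work(system_freezable_wq, &pme_poll_work,
				   msecs_to_jiffies(PME_TIMEOUT));
	}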
Signed-off-by: Kai Krakow
---
 drivers/pci/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 1aa5d6f98ebda2..9f006c30eca554 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -61,7 +61,7 @@ struct pci_pme_device {
 	struct pci_dev *dev;
 };
 
-#define PME_TIMEOUT 1000 /* How long between PME checks */
+#define PME_TIMEOUT 4000 /* How long between PME checks */
 
 /*
 * Following exit from Conventional Reset, devices must be ready within 1 sec

From 1277ad058c2fb176fcded7dc1002cbf44594f17e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 19 Mar 2016 21:32:19 -0400
Subject: [PATCH 04/38] intel_idle: tweak cpuidle cstates

Increase target_residency in cpuidle cstates.

Tune intel_idle to be a bit less aggressive: Clear Linux is cleaner in
hygiene (wakeups) than the average Linux, so we can afford changing
these in a way that increases performance while keeping power
efficiency.

Signed-off-by: Kai Krakow
---
 drivers/idle/intel_idle.c | 50 +++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 524ed143f875d3..fb0e8d5db68982 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -575,7 +575,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -583,7 +583,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 33,
-		.target_residency = 100,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -591,7 +591,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -599,7 +599,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 1500,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -607,7 +607,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -615,7 +615,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 5000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -623,7 +623,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -643,7 +643,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -651,7 +651,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 40,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -659,7 +659,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -667,7 +667,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -675,7 +675,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 4000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -683,7 +683,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 7000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -691,7 +691,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -712,7 +712,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -720,7 +720,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 70,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -728,7 +728,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 85,
-		.target_residency = 200,
+		.target_residency = 600,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -736,7 +736,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x33",
 		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 124,
-		.target_residency = 800,
+		.target_residency = 3000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -744,7 +744,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 200,
-		.target_residency = 800,
+		.target_residency = 3200,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -752,7 +752,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 480,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -760,7 +760,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 890,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -781,7 +781,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 300,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -810,7 +810,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 4,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -818,7 +818,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 170,
-		.target_residency = 600,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -968,7 +968,7 @@ static struct cpuidle_state gmt_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 2,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{

From a6f7f4b9509155c1470dee7b9413e91ab46d38a4 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Wed, 11 Feb 2015 17:28:14 -0600
Subject: [PATCH 05/38] smpboot: reuse timer calibration

No point in recalibrating for a known-constant TSC; this saves 200ms+
of boot time.
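A simplified sketch of the resulting logic (the real
calibrate_delay_is_known() also reuses a sibling CPU's value, as the
hunk below shows; constant_tsc corresponds to X86_FEATURE_CONSTANT_TSC):

	/* sketch: non-boot CPUs reuse the boot CPU's calibration result */
	unsigned long calibrate_delay_is_known(void)
	{
		if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
			return 0;	/* unknown: do the full calibration */

		if (smp_processor_id() != 0)
			return cpu_data(0).loops_per_jiffy;

		return 0;		/* CPU 0 still calibrates once */
	}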
Signed-off-by: Kai Krakow
---
 arch/x86/kernel/tsc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index dfe6847fd99e5e..8bc41b2ac6c3a3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1592,6 +1592,9 @@ unsigned long calibrate_delay_is_known(void)
 	if (!constant_tsc || !mask)
 		return 0;
 
+	if (cpu != 0)
+		return cpu_data(0).loops_per_jiffy;
+
 	sibling = cpumask_any_but(mask, cpu);
 	if (sibling < nr_cpu_ids)
 		return cpu_data(sibling).loops_per_jiffy;

From 65e6127cbc9dc935c1fc6e9ec0713b7b4c163d9f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 6 Jan 2017 15:34:09 +0000
Subject: [PATCH 06/38] ipv4/tcp: allow the memory tuning for tcp to go a little bigger than default

Signed-off-by: Kai Krakow
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b731a4a8f2b0d5..654eb58ecd2006 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5144,8 +5144,8 @@ void __init tcp_init(void)
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_wshare = min(16UL*1024*1024, limit);
+	max_rshare = min(16UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;

From 21cf48ea632e4d2db7d5178675b431a376c4949d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Wed, 17 May 2017 01:52:11 +0000
Subject: [PATCH 07/38] init: wait for partition and retry scan

Because Clear Linux boots fast, the root device may not be ready yet
when the mounting code is reached, so retry the device scan every
0.5 sec for up to 40 sec and synchronize with the async tasks.

Signed-off-by: Miguel Bernal Marin
---
 block/early-lookup.c | 15 +++++++++++++--
 init/do_mounts.c     |  2 ++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/block/early-lookup.c b/block/early-lookup.c
index 3fb57f7d2b1276..243ad0ca102121 100644
--- a/block/early-lookup.c
+++ b/block/early-lookup.c
@@ -5,6 +5,7 @@
 */
 #include <linux/blkdev.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 struct uuidcmp {
 	const char *uuid;
@@ -243,8 +244,18 @@ static int __init devt_from_devnum(const char *name, dev_t *devt)
 */
 int __init early_lookup_bdev(const char *name, dev_t *devt)
 {
-	if (strncmp(name, "PARTUUID=", 9) == 0)
-		return devt_from_partuuid(name + 9, devt);
+	if (strncmp(name, "PARTUUID=", 9) == 0) {
+		int res;
+		int needtowait = 40<<1;
+		res = devt_from_partuuid(name + 9, devt);
+		if (!res) return res;
+		while (res && needtowait) {
+			msleep(500);
+			res = devt_from_partuuid(name + 9, devt);
+			needtowait--;
+		}
+		return res;
+	}
 	if (strncmp(name, "PARTLABEL=", 10) == 0)
 		return devt_from_partlabel(name + 10, devt);
 	if (strncmp(name, "/dev/", 5) == 0)
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 6af29da8889ebf..a83c82ad7150a2 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -476,7 +476,9 @@ void __init prepare_namespace(void)
 	 * For example, it is not atypical to wait 5 seconds here
 	 * for the touchpad of a laptop to initialize.
 	 */
+	async_synchronize_full();
 	wait_for_device_probe();
+	async_synchronize_full();
 
 	md_run_setup();

From 06cb33105b87c5c3445c4bd6b0ecfcb2e6e9e65b Mon Sep 17 00:00:00 2001
From: "Brett T. Warden"
Date: Mon, 13 Aug 2018 04:01:21 -0500
Subject: [PATCH 08/38] add boot option to allow unsigned modules

Add module.sig_unenforce boot parameter to allow loading unsigned
kernel modules.
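For example, enforcement can then be disabled by booting with the
following kernel command line flag (the parameter lives under the
"module." prefix set via MODULE_PARAM_PREFIX in kernel/module/signing.c):

	module.sig_unenforce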
Parameter is only effective if CONFIG_MODULE_SIG_FORCE is enabled and
the system is *not* booted with Secure Boot.

Signed-off-by: Brett T. Warden
Signed-off-by: Miguel Bernal Marin
---
 kernel/module/signing.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/module/signing.c b/kernel/module/signing.c
index a2ff4242e623d5..8cd7c6f5d8bbfe 100644
--- a/kernel/module/signing.c
+++ b/kernel/module/signing.c
@@ -14,6 +14,8 @@
 #include <linux/security.h>
 #include <crypto/public_key.h>
 #include <uapi/linux/module.h>
+#include <asm/setup.h>
+
 #include "internal.h"
 
 #undef MODULE_PARAM_PREFIX
@@ -21,6 +23,11 @@
 static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
 module_param(sig_enforce, bool_enable_only, 0644);
 
+/* Allow disabling module signature requirement by adding boot param */
+static bool sig_unenforce = false;
+module_param(sig_unenforce, bool_enable_only, 0644);
+
+extern struct boot_params boot_params;
 
 /*
 * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
@@ -28,6 +35,8 @@ module_param(sig_enforce, bool_enable_only, 0644);
 */
 bool is_module_sig_enforced(void)
 {
+	if (sig_unenforce)
+		return false;
 	return sig_enforce;
 }
 EXPORT_SYMBOL(is_module_sig_enforced);

From db361ec399b7bc28691fc6625ac7b64333c1d4f9 Mon Sep 17 00:00:00 2001
From: William Douglas
Date: Wed, 20 Jun 2018 17:23:21 +0000
Subject: [PATCH 09/38] enable stateless firmware loading

Prefer the order of specific version before generic and /etc before
/lib to enable the user to give specific overrides for generic firmware
and distribution firmware.

Signed-off-by: Kai Krakow
---
 drivers/base/firmware_loader/main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index c6664a78796979..413b0aa5c19d52 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -471,6 +471,8 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv,
 static char fw_path_para[256];
 static const char * const fw_path[] = {
 	fw_path_para,
+	"/etc/firmware/" UTS_RELEASE,
+	"/etc/firmware",
 	"/lib/firmware/updates/" UTS_RELEASE,
 	"/lib/firmware/updates",
 	"/lib/firmware/" UTS_RELEASE,

From a06f2cc0bbd7a98936060257b1b99298e5975dc5 Mon Sep 17 00:00:00 2001
From: Auke Kok
Date: Thu, 2 Aug 2018 12:03:22 -0700
Subject: [PATCH 10/38] migrate some systemd defaults to the kernel defaults.

These settings are needed to prevent networking issues when the
networking modules come up by default without explicit settings, which
breaks some use cases.

We don't want the modprobe settings to be read at boot time if we're
not going to do anything else ever.
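Users who still want the old auto-created devices can restore them via
module options, e.g. in a (hypothetical) /etc/modprobe.d/legacy-net.conf:

	options dummy numdummies=1
	options bonding max_bonds=1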
Signed-off-by: Kai Krakow
---
 drivers/net/dummy.c             | 2 +-
 include/uapi/linux/if_bonding.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index e9c5e1e11fa02d..d912cdbd26a4d2 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -43,7 +43,7 @@
 
 #define DRV_NAME	"dummy"
 
-static int numdummies = 1;
+static int numdummies = 0;
 
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index d174914a837dbf..bf8e2af101a3c8 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -82,7 +82,7 @@
 #define BOND_STATE_ACTIVE	0   /* link is active */
 #define BOND_STATE_BACKUP	1   /* link is backup */
 
-#define BOND_DEFAULT_MAX_BONDS  1   /* Default maximum number of devices to support */
+#define BOND_DEFAULT_MAX_BONDS  0   /* Default maximum number of devices to support */
 
 #define BOND_DEFAULT_TX_QUEUES 16   /* Default number of tx queues per device */

From fdd37916f51a7e21ca5d56478b6b1ce5f2361711 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Thu, 10 Mar 2016 15:11:28 +0000
Subject: [PATCH 11/38] xattr: allow setting user.* attributes on symlinks by owner

Kvmtool and Clear Containers support using user attributes to label
host files with the virtual uid/gid of the file in the container. This
allows an end user to manage their files and a complete uid space
without all the ugly namespace stuff.

The one gap in the support is symlinks because an end user can change
the ownership of a symbolic link. We support attributes on these files
as you can already (as root) set security attributes on them.

The current rules seem slightly over-paranoid and as we have a use case
this patch enables updating the attributes on a symbolic link IFF you
are the owner of the symlink (as permissions are not usually meaningful
on the link itself).

Signed-off-by: Alan Cox
---
 fs/xattr.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 05ec7e7d9e87e2..09723a7857feaa 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -139,16 +139,17 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
 	}
 
 	/*
-	 * In the user.* namespace, only regular files and directories can have
-	 * extended attributes. For sticky directories, only the owner and
-	 * privileged users can write attributes.
+	 * In the user.* namespace, only regular files, symbolic links, and
+	 * directories can have extended attributes. For symbolic links and
+	 * sticky directories, only the owner and privileged users can write
+	 * attributes.
 	 */
 	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
-		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
-		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-		    (mask & MAY_WRITE) &&
-		    !inode_owner_or_capable(idmap, inode))
+		if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX))
+		    || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE)
+		    && !inode_owner_or_capable(idmap, inode))
 			return -EPERM;
 	}

From facb2e2c7e4b77840b73b28de7f523288ee08b0c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Thu, 13 Dec 2018 01:00:49 +0000
Subject: [PATCH 12/38] do accept() in LIFO order for cache efficiency

Signed-off-by: Kai Krakow
---
 include/linux/wait.h            |  2 ++
 kernel/sched/wait.c             | 24 ++++++++++++++++++++++++
 net/ipv4/inet_connection_sock.c |  2 +-
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2b322a9b88a2bd..6f977c21fd81d5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -163,6 +163,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
@@ -1192,6 +1193,7 @@ do {										\
 */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f47018c..c5cc616484badd 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
@@ -258,6 +269,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	if (list_empty(&wq_entry->entry))
+		__add_wait_queue(wq_head, wq_entry);
+	set_current_state(state);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo);
+
 void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 {
 	wq_entry->flags = flags;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fe7947f7740623..99e138cfdd9522 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+		prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))

From 475a5951f72d8de794dda211581e388de42d0b58 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 18 Feb 2018 23:35:41 +0000
Subject: [PATCH 13/38] locking: rwsem: spin faster

Tweak rwsem owner spinning a bit.

Signed-off-by: Kai Krakow
---
 kernel/locking/rwsem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2bbb6eca51445b..125cdf85741c5e 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -747,6 +747,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *new, *owner;
 	unsigned long flags, new_flags;
 	enum owner_state state;
+	int i = 0;
 
 	lockdep_assert_preemption_disabled();
 
@@ -783,7 +784,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 			break;
 		}
 
-		cpu_relax();
+		if (i++ > 1000)
+			cpu_relax();
 	}
 
 	return state;

From 87cf75d5f94fab32c69620e235477de1f07fd3cd Mon Sep 17 00:00:00 2001
From: Joe Konno
Date: Tue, 25 Jun 2019 10:35:54 -0700
Subject: [PATCH 14/38] ata: libahci: ignore staggered spin-up

Change libahci to ignore firmware's staggered spin-up flag. End-users
who wish to honor firmware's SSS flag can add the following kernel
parameter to a new file at /etc/kernel/cmdline.d/ignore_sss.conf:

	libahci.ignore_sss=0

And then run

	sudo clr-boot-manager update

Signed-off-by: Joe Konno
---
 drivers/ata/libahci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index fdfa7b2662180b..13229e9341081e 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -34,14 +34,14 @@
 #include "libata.h"
 
 static int ahci_skip_host_reset;
-int ahci_ignore_sss;
+int ahci_ignore_sss=1;
 EXPORT_SYMBOL_GPL(ahci_ignore_sss);
 
 module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444);
 MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip)");
 
 module_param_named(ignore_sss, ahci_ignore_sss, int, 0444);
-MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore)");
+MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore [default])");
 
 static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			unsigned hints);

From ce3da53c3782393f79ed1549a25ffe353fd5960a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 10 Aug 2019 03:19:04 +0000
Subject: [PATCH 15/38] print CPU that faults

Print the CPU number when we print a crash.

Signed-off-by: Kai Krakow
---
 arch/x86/mm/fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ac52255fab01f4..eae6d65b901464 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -756,9 +756,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i",
 		loglvl, tsk->comm, task_pid_nr(tsk), address,
-		(void *)regs->ip, (void *)regs->sp, error_code);
+		(void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id());
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);

From 8a680632d3204d6d62e94dff51b0014d89b2591d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 11 Nov 2019 23:12:11 +0000
Subject: [PATCH 16/38] nvme workaround

Lower the default power-saving (APST) maximum latency for newly probed
devices from 100000 us to 200 us.

Signed-off-by: Kai Krakow
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8da50df56b0795..81c80b5fc542ab 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -62,7 +62,7 @@ static u8 nvme_max_retries = 5;
 module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 
-static unsigned long default_ps_max_latency_us = 100000;
+static unsigned long default_ps_max_latency_us = 200;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
 		 "max power saving latency for new devices; use PM QOS to change per device");

From 9f10d5a3fa87b399b151ff1bc8eb5e7c9390cf25 Mon Sep 17 00:00:00 2001
From: Alexander Koskovich
Date: Wed, 12 Feb 2020 22:47:12 +0000
Subject: [PATCH 17/38] don't report an error if PowerClamp is run on an AMD CPU

Signed-off-by: Kai Krakow
---
 drivers/thermal/intel/intel_powerclamp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index 4ba649370aa1a7..d2957742992266 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -710,6 +710,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 	.set_cur_state = powerclamp_set_cur_state,
 };
 
+static const struct x86_cpu_id amd_cpu[] = {
+	{ X86_VENDOR_AMD },
+	{},
+};
+
 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
 	{}
@@ -719,6 +724,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 
 static int __init powerclamp_probe(void)
 {
+	if (x86_match_cpu(amd_cpu)) {
+		pr_info("Intel PowerClamp does not support AMD CPUs\n");
+		return -ENODEV;
+	}
+
 	if (!x86_match_cpu(intel_powerclamp_ids)) {
 		pr_err("CPU does not support MWAIT\n");
 		return -ENODEV;

From c123e705593e5f01cd1b722b95ce618313a0fec4 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 27 Sep 2021 17:43:01 +0000
Subject: [PATCH 18/38] lib/raid6: stop scanning once a valid recovery algorithm is found

Signed-off-by: Kai Krakow
---
 lib/raid6/algos.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index cd2e88ee1f148c..81565997933625 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -138,8 +138,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 
 	for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
 		if (!best || (*algo)->priority > best->priority)
-			if (!(*algo)->valid || (*algo)->valid())
+			if (!(*algo)->valid || (*algo)->valid()) {
 				best = *algo;
+				break;
+			}
 
 	if (best) {
 		raid6_2data_recov = best->data2;

From 28ffaff850b02cb4ce953ecbbbb51351e48093e7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Tue, 16 Nov 2021 17:39:25 +0000
Subject: [PATCH 19/38] itmt_epb: use epb to scale itmt

Signed-off-by: Kai Krakow
---
 arch/x86/include/asm/topology.h |  1 +
 arch/x86/kernel/cpu/intel_epb.c |  4 ++++
 arch/x86/kernel/itmt.c          | 29 ++++++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 92f3664dd933b1..a9bbadb21d9ed5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -245,6 +245,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
 
 /* Interface to set priority of a cpu */
 void sched_set_itmt_core_prio(int prio, int core_cpu);
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu);
 
 /* Interface to notify scheduler that system supports ITMT */
 int sched_set_itmt_support(void);
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index 30b1d63b97f3a1..fe0f6e2bb29765 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev,
 	if (ret < 0)
 		return ret;
 
+	/* update the ITMT scheduler logic to use the power policy data */
+	/* scale the val up by 2 so the range is 224 - 256 */
+	sched_set_itmt_power_ratio(256 - val * 2, cpu);
+
 	return count;
 }
 
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 51b805c727fc8f..81e7cf72e83d7d 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -25,6 +25,7 @@ static DEFINE_MUTEX(itmt_update_mutex);
 
 DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio);
 
 /* Boolean to track if system has ITMT capabilities */
 static bool __read_mostly sched_itmt_capable;
 
@@ -159,7 +160,12 @@ void sched_clear_itmt_support(void)
 
 int arch_asym_cpu_priority(int cpu)
 {
-	return per_cpu(sched_core_priority, cpu);
+	int power_ratio = per_cpu(sched_power_ratio, cpu);
+
+	/* a power ratio of 0 (uninitialized) is assumed to be maximum */
+	if (power_ratio == 0)
+		power_ratio = 256 - 2 * 6;
+	return per_cpu(sched_core_priority, cpu) * power_ratio / 256;
 }
 
 /**
@@ -180,3 +186,24 @@ void sched_set_itmt_core_prio(int prio, int cpu)
 {
 	per_cpu(sched_core_priority, cpu) = prio;
 }
+
+/**
+ * sched_set_itmt_power_ratio() - Set CPU priority based on ITMT
+ * @power_ratio:	The power scaling ratio [1..256] for the core
+ * @core_cpu:		The cpu number associated with the core
+ *
+ * Set a scaling to the cpu performance based on long term power
+ * settings (like EPB).
+ *
+ * Note this is for the policy not for the actual dynamic frequency;
+ * the frequency will increase itself as workloads run on a core.
+ */
+
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu)
+{
+	int cpu;
+
+	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+		per_cpu(sched_power_ratio, cpu) = power_ratio;
+	}
+}

From c05a1065ca9491b37c96b70e6392abd60d5520eb Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada
Date: Thu, 18 Nov 2021 16:09:47 +0000
Subject: [PATCH 20/38] itmt2 ADL fixes

On systems with overclocking enabled, CPPC Highest Performance can be
hard coded to 0xff. In this case even if we have cores with different
highest performance, ITMT can't be enabled as the current
implementation depends on CPPC Highest Performance.

On such systems we can use the MSR_HWP_CAPABILITIES maximum performance
field when CPPC Highest Performance is 0xff.

Due to legacy reasons, we can't solely depend on MSR_HWP_CAPABILITIES
as in some older systems CPPC Highest Performance is the only way to
identify different performing cores.
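A minimal sketch of the fallback decode (the patch itself uses the
cached capability value, as the hunk below shows; this variant does a
direct MSR read instead — MSR_HWP_CAPABILITIES is 0x771 and
HWP_HIGHEST_PERF() extracts its low byte):

	u64 cap;

	if (cppc_perf.highest_perf == 0xff) {
		rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
		cppc_perf.highest_perf = HWP_HIGHEST_PERF(cap);
	}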
Signed-off-by: Srinivas Pandruvada
---
 drivers/cpufreq/intel_pstate.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 400337f3b572da..d158197677c908 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -376,6 +376,13 @@ static void intel_pstate_set_itmt_prio(int cpu)
 	 * update them at any time after it has been called.
 	 */
 	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
+	/*
+	 * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff.
+	 * In this case we can't use CPPC.highest_perf to enable ITMT.
+	 * Instead we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide.
+	 */
+	if (cppc_perf.highest_perf == 0xff)
+		cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));
 
 	if (max_highest_perf <= min_highest_perf) {
 		if (cppc_perf.highest_perf > max_highest_perf)

From 31441123871c0f0bdd16af268401b3f27cadee7c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Tue, 23 Nov 2021 17:38:50 +0000
Subject: [PATCH 21/38] add a per cpu minimum high watermark and tune batch size

Make sure there are at least 1024 per-CPU pages... a reasonably small
amount for today's systems.

Signed-off-by: Kai Krakow
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de65e8b4f75f21..5af8600f9726e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5529,11 +5529,11 @@ static int zone_batchsize(struct zone *zone)
 
 	/*
 	 * The number of pages to batch allocate is either ~0.1%
-	 * of the zone or 1MB, whichever is smaller. The batch
+	 * of the zone or 4MB, whichever is smaller. The batch
 	 * size is striking a balance between allocation latency
 	 * and zone lock contention.
 	 */
-	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
+	batch = min(zone_managed_pages(zone) >> 10, 4 * SZ_1M / PAGE_SIZE);
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;

From 120dcf083d15cc774ed5a7e2c28a9150f597579c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 15 Apr 2022 00:07:38 +0000
Subject: [PATCH 22/38] novector gcc12/build workarounds

Signed-off-by: Kai Krakow
---
 arch/x86/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b773b34768d12..906f568b9a9e1a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -74,7 +74,7 @@ export BITS
 #
 #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
 #
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -O3 -fno-tree-vectorize -march=westmere -mpopcnt -fivopts -fmodulo-sched
 
 KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
 KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2

From 49cf3783062b5b5441f3cd7659e1ce2ed671aa50 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Fri, 28 Apr 2023 17:01:35 +0100
Subject: [PATCH 23/38] md/raid6 algorithms: scale test duration for speedier boots

Instead of using jiffies and waiting for jiffies to wrap before
measuring, use the higher-precision local_clock() for benchmarking.
Measure 2500 loops, which works out to be accurate enough for
benchmarking the raid algo data rates. Also add division by zero
checking in case timing measurements are bogus.

Speeds up raid benchmarking from 48,000 usecs to 4,000 usecs, saving
0.044 seconds on boot.
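As a worked example of the new reporting math: with 2500 loops, 4 KiB
pages and 6 data disks, one measurement moves 2500 * 4096 * 6 =
61,440,000 bytes; if that takes 4,000,000 ns, the reported rate is
roughly 61,440,000 bytes / 1 MiB / 0.004 s ~= 14,600 MB/s, which is
what the n / ns expression below computes via the ns_per_mb factor
(10^9 >> 20 = 953).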
Signed-off-by: Colin Ian King
---
 lib/raid6/algos.c | 53 ++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 81565997933625..4d86cfe17c3fde 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,6 +18,8 @@
 #else
 #include <linux/module.h>
 #include <linux/gfp.h>
+#include <linux/sched/clock.h>
+
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 EXPORT_SYMBOL(raid6_empty_zero_page);
@@ -157,12 +159,15 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 static inline const struct raid6_calls *raid6_choose_gen(
 	void *(*const dptrs)[RAID6_TEST_DISKS], const int disks)
 {
-	unsigned long perf, bestgenperf, j0, j1;
+	unsigned long perf;
+	const unsigned long max_perf = 2500;
 	int start = (disks>>1)-1, stop = disks-3;	/* work on the second half of the disks */
 	const struct raid6_calls *const *algo;
 	const struct raid6_calls *best;
+	const u64 ns_per_mb = 1000000000 >> 20;
+	u64 n, ns, t, ns_best = ~0ULL;
 
-	for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+	for (best = NULL, algo = raid6_algos; *algo; algo++) {
 		if (!best || (*algo)->priority >= best->priority) {
 			if ((*algo)->valid && !(*algo)->valid())
 				continue;
@@ -172,26 +177,20 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				break;
 			}
 
-			perf = 0;
-
 			preempt_disable();
-			j0 = jiffies;
-			while ((j1 = jiffies) == j0)
-				cpu_relax();
-			while (time_before(jiffies,
-					   j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+			t = local_clock();
+			for (perf = 0; perf < max_perf; perf++) {
 				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
-				perf++;
 			}
+			ns = local_clock() - t;
 			preempt_enable();
 
-			if (perf > bestgenperf) {
-				bestgenperf = perf;
+			if (ns < ns_best) {
+				ns_best = ns;
 				best = *algo;
 			}
-			pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
-				(perf * HZ * (disks-2)) >>
-				(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+			n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+			pr_info("raid6: %-8s gen() %5llu MB/s (%llu ns)\n", (*algo)->name, (ns > 0) ? n / ns : 0, ns);
 		}
 	}
 
@@ -208,31 +207,23 @@ static inline const struct raid6_calls *raid6_choose_gen(
 		goto out;
 	}
 
-	pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
-		best->name,
-		(bestgenperf * HZ * (disks - 2)) >>
-		(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+	n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+	pr_info("raid6: using algorithm %s gen() %llu MB/s (%llu ns)\n",
+		best->name, (ns_best > 0) ? n / ns_best : 0, ns_best);
 
 	if (best->xor_syndrome) {
-		perf = 0;
-
 		preempt_disable();
-		j0 = jiffies;
-		while ((j1 = jiffies) == j0)
-			cpu_relax();
-		while (time_before(jiffies,
-				   j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
+		t = local_clock();
+		for (perf = 0; perf < max_perf; perf++) {
 			best->xor_syndrome(disks, start, stop, PAGE_SIZE, *dptrs);
-			perf++;
 		}
+		ns = local_clock() - t;
 		preempt_enable();
 
-		pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
-			(perf * HZ * (disks - 2)) >>
-			(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+		n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+		pr_info("raid6: .... xor() %llu MB/s, rmw enabled (%llu ns)\n", (ns > 0) ? n / ns : 0, ns);
 	}
-
 out:
 	return best;

From 98cb7c52d536c681cc889a5e8c18f8642488af5e Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Fri, 20 Jan 2023 11:16:42 +0000
Subject: [PATCH 24/38] initcall: only print non-zero initcall debug to speed up boot

Printing initcall timings that successfully return after 0 usecs
provides not much useful information and takes a small amount of time
to do so. Disable the initcall timings for these specific cases.
On an Alderlake i9-12900 this reduces kernel boot time by 0.67% (timed
up to the invocation of systemd starting) based on 10 boot
measurements.

Signed-off-by: Colin Ian King
---
 init/main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/init/main.c b/init/main.c
index c4778edae7972f..f5128cc7b34bf1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1220,10 +1220,13 @@ static __init_or_module void
 trace_initcall_finish_cb(void *data, initcall_t fn, int ret)
 {
 	ktime_t rettime, *calltime = data;
+	long long delta;
 
 	rettime = ktime_get();
-	printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n",
-	       fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime));
+	delta = ktime_us_delta(rettime, *calltime);
+	if (ret || delta)
+		printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n",
+		       fn, ret, delta);
 }
 
 static ktime_t initcall_calltime;

From 99386860a161daaf7a23c2cac67b75d467e18714 Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Mon, 18 Nov 2024 18:32:58 +0100
Subject: [PATCH 25/38] Place libraries right below the binary for PIE binaries

Author: Intel ClearLinux

Place libraries right below the binary for PIE binaries; this helps
code locality (and thus performance).

Signed-off-by: Kai Krakow
---
 fs/binfmt_elf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0a216a078c3155..e0b51138141e3b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1282,6 +1282,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	mm = current->mm;
 	mm->end_code = end_code;
 	mm->start_code = start_code;
+	if (start_code >= ELF_ET_DYN_BASE)
+		mm->mmap_base = start_code;
 	mm->start_data = start_data;
 	mm->end_data = end_data;
 	mm->start_stack = bprm->p;

From 55f367770f8952470fb29204cc52fcd3ee2cef54 Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Mon, 18 Nov 2024 18:32:58 +0100
Subject: [PATCH 26/38] epp retune

Signed-off-by: Kai Krakow
---
 arch/x86/include/asm/msr-index.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 61e991507353eb..d931f3f5258931 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -544,7 +544,7 @@
 #define HWP_MAX_PERF(x)			((x & 0xff) << 8)
 #define HWP_DESIRED_PERF(x)		((x & 0xff) << 16)
 #define HWP_ENERGY_PERF_PREFERENCE(x)	(((unsigned long long) x & 0xff) << 24)
-#define HWP_EPP_PERFORMANCE		0x00
+#define HWP_EPP_PERFORMANCE		0x01
 #define HWP_EPP_BALANCE_PERFORMANCE	0x80
 #define HWP_EPP_BALANCE_POWERSAVE	0xC0
 #define HWP_EPP_POWERSAVE		0xFF

From e7c1f1cc0298f74d034b6550a05b07ba5d4d9524 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 1 Feb 2023 11:53:51 +0000
Subject: [PATCH 27/38] sched/core: add some branch hints based on gcov analysis

Signed-off-by: Colin Ian King
---
 kernel/sched/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9803f10a082a7b..04c73988e7370c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -594,7 +594,7 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 	/* Matches synchronize_rcu() in __sched_core_enable() */
 	preempt_disable();
-	if (sched_core_disabled()) {
+	if (likely(sched_core_disabled())) {
 		raw_spin_lock_nested(&rq->__lock, subclass);
 		/* preempt_count *MUST* be > 1 */
 		preempt_enable_no_resched();
@@ -804,7 +804,7 @@ void update_rq_clock(struct rq *rq)
 #endif
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
-	if (delta < 0)
+	if (unlikely(delta < 0))
 		return;
 	rq->clock += delta;
 	update_rq_clock_task(rq, delta);
@@ -6055,7 +6055,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	struct rq *rq_i;
 	bool need_sync;
 
-	if (!sched_core_enabled(rq))
+	if (likely(!sched_core_enabled(rq)))
 		return __pick_next_task(rq, prev, rf);
 
 	cpu = cpu_of(rq);
@@ -7229,7 +7229,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
 int __sched __cond_resched(void)
 {
-	if (should_resched(0) && !irqs_disabled()) {
+	if (unlikely(should_resched(0) && !irqs_disabled())) {
 		preempt_schedule_common();
 		return 1;
 	}

From f6aabffb5f98bf7ba9e669be5b506eec2f9d897d Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Mon, 6 Mar 2023 12:25:29 +0000
Subject: [PATCH 28/38] crypto: kdf: make the module init call a late init call

Signed-off-by: Colin Ian King
---
 crypto/kdf_sp800108.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/kdf_sp800108.c b/crypto/kdf_sp800108.c
index c3f9938e1ad27f..e77478e064d8f6 100644
--- a/crypto/kdf_sp800108.c
+++ b/crypto/kdf_sp800108.c
@@ -149,7 +149,7 @@ static int __init crypto_kdf108_init(void)
 
 static void __exit crypto_kdf108_exit(void) { }
 
-module_init(crypto_kdf108_init);
+late_initcall(crypto_kdf108_init);
 module_exit(crypto_kdf108_exit);
 
 MODULE_LICENSE("GPL v2");

From 105a2b639dc54cc2aeab65d53b31a23fbd1e7185 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 17 Oct 2024 16:29:50 +0100
Subject: [PATCH 29/38] handle sched_yield gracefully when being hammered

Some misguided apps hammer sched_yield() in a tight loop (they should
be using futexes instead) which causes massive lock contention even if
there is little work to do or to yield to.
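The pattern being penalized looks like this in user space (a
hypothetical spin-wait that should be a futex or condition variable):

	#include <sched.h>
	#include <stdatomic.h>

	extern atomic_int ready;

	void wait_for_ready(void)
	{
		while (!atomic_load(&ready))
			sched_yield();	/* one run-queue lock round trip per call */
	}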
Rate-limit yielding, since the base scheduler already does a pretty
good job of just running the right things.

Signed-off-by: Colin Ian King
---
 kernel/sched/syscalls.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index f9cb7896c1b966..45723ca32d1d8b 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1390,10 +1390,22 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 	return ret;
 }
 
+static DEFINE_PER_CPU(unsigned long, last_yield);
+
 static void do_sched_yield(void)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+	int cpu = raw_smp_processor_id();
+
+	cond_resched();
+
+	/* rate limit yielding to something sensible */
+
+	if (!time_after(jiffies, per_cpu(last_yield, cpu)))
+		return;
+
+	per_cpu(last_yield, cpu) = jiffies;
 
 	rq = this_rq_lock_irq(&rf);

From 3f144b781b51038359b45b2cefc03fd58055a797 Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Mon, 18 Nov 2024 18:32:59 +0100
Subject: [PATCH 30/38] scale net alloc

Author: Intel ClearLinux

Signed-off-by: Kai Krakow
---
 include/net/sock.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index fa055cf1785efd..584fc86d684038 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1579,10 +1579,17 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
+	int reclaimable, reclaim_threshold;
+
+	reclaim_threshold = 64 * 1024;
 	if (!sk_has_account(sk))
 		return;
 	sk_forward_alloc_add(sk, size);
-	sk_mem_reclaim(sk);
+	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
+	if (reclaimable > reclaim_threshold) {
+		reclaimable -= reclaim_threshold;
+		__sk_mem_reclaim(sk, reclaimable);
+	}
 }
 
 /*

From 9b5878663716b64391059e76fa886e8f41b972d0 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 3 May 2023 17:31:05 +0100
Subject: [PATCH 31/38] clocksource: only perform extended clocksource checks for AMD systems

Signed-off-by: Colin Ian King
---
 drivers/clocksource/acpi_pm.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index b4330a01a566bf..e41cd5c6885530 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -208,13 +208,16 @@ static int verify_pmtmr_rate(void)
 static int __init init_acpi_pm_clocksource(void)
 {
 	u64 value1, value2;
-	unsigned int i, j = 0;
+	unsigned int i, j = 0, checks = 1;
 
 	if (!pmtmr_ioport)
 		return -ENODEV;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		checks = ACPI_PM_MONOTONICITY_CHECKS;
+
 	/* "verify" this timing source: */
-	for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
+	for (j = 0; j < checks; j++) {
 		udelay(100 * j);
 		value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 		for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {

From 701158d33a999c32d0646283229c66fd78bf0632 Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Mon, 18 Nov 2024 18:32:59 +0100
Subject: [PATCH 32/38] better idle balance

Author: Intel ClearLinux

Signed-off-by: Kai Krakow
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ddc096d6b0c203..e792e85ae8607c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12866,7 +12866,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		update_next_balance(sd, &next_balance);
 
-		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+		if (this_rq->avg_idle/2 < curr_cost + sd->max_newidle_lb_cost)
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {

From 9cf138293f39c1fa57e22742eae279f31efad627 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 27 Jun 2023 14:12:27 +0100
Subject: [PATCH 33/38] ACPI: align slab for improved memory performance

Enabling SLAB_HWCACHE_ALIGN for the ACPI object caches improves boot
speed in the ACPICA core for object allocation and freeing, especially
in the AML parsing and execution phases of boot. Testing with 100 boots
shows an average boot saving in acpi_init of ~35000 usecs compared to
the unaligned version. Most of the ACPI objects being allocated and
freed have very short lifetimes in the critical paths for parsing and
execution, so the extra memory used for alignment isn't too onerous.

Signed-off-by: Colin Ian King
---
 drivers/acpi/osl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 70af3fbbebe54c..dab3d50896351a 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1549,7 +1549,7 @@ void acpi_os_release_lock(acpi_spinlock lockp, acpi_cpu_flags not_used)
 acpi_status
 acpi_os_create_cache(char *name, u16 size, u16 depth, acpi_cache_t **cache)
 {
-	*cache = kmem_cache_create(name, size, 0, 0, NULL);
+	*cache = kmem_cache_create(name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
 	if (*cache == NULL)
 		return AE_ERROR;
 	else

From a654e20e2b791a7f85701241feeb25aee5b12cdd Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 19 Sep 2023 14:16:21 +0100
Subject: [PATCH 34/38] thermal: intel: powerclamp: check MWAIT first, use pr_info instead of pr_err

For x86 targets it is more pertinent to check for the lack of MWAIT
than for AMD-specific CPUs, so swap the order of the tests. Also make
the pr_err a pr_info to align with the other ENODEV messages.

Signed-off-by: Colin Ian King
---
 drivers/thermal/intel/intel_powerclamp.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index d2957742992266..a3a03061b41d42 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -723,14 +723,13 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 
 static int __init powerclamp_probe(void)
 {
-
-	if (x86_match_cpu(amd_cpu)){
-		pr_info("Intel PowerClamp does not support AMD CPUs\n");
+	if (!x86_match_cpu(intel_powerclamp_ids)) {
+		pr_info("CPU does not support MWAIT\n");
 		return -ENODEV;
 	}
 
-	if (!x86_match_cpu(intel_powerclamp_ids)) {
-		pr_err("CPU does not support MWAIT\n");
+	if (x86_match_cpu(amd_cpu)) {
+		pr_info("Intel PowerClamp does not support AMD CPUs\n");
 		return -ENODEV;
 	}

From 9ad61f29954da0ffedc7ede4fae415eb77eb7aa1 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 10 Oct 2023 12:41:00 +0100
Subject: [PATCH 35/38] KVM: VMX: make vmx_init a late init call to get to init process faster

Making vmx_init a late initcall improves QEMU kernel boot times to get
to the init process. Averaged over 100 boots, QEMU boot time is reduced
from 0.776 seconds to 0.622 seconds (~19.8% faster) on an Alderlake
i9-12900, and non-QEMU UEFI boots are ~0.5% faster.
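For reference, a built-in module_init() is a device_initcall() (level
6), while late_initcall() is level 7, so vmx_init now runs after all
device initcalls; from include/linux/init.h:

	#define device_initcall(fn)		__define_initcall(fn, 6)
	#define late_initcall(fn)		__define_initcall(fn, 7)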
Signed-off-by: Colin Ian King
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1af30e3472cdd9..bea10effa18f3a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8675,4 +8675,4 @@ static int __init vmx_init(void)
 	kvm_x86_vendor_exit();
 	return r;
 }
-module_init(vmx_init);
+late_initcall(vmx_init);

From 1c61dd483ff765fdef8f834b699f637043ac8059 Mon Sep 17 00:00:00 2001
From: Kai Krakow
Date: Mon, 18 Nov 2024 18:33:25 +0100
Subject: [PATCH 36/38] timer slack

Author: Intel ClearLinux

Signed-off-by: Kai Krakow
---
 init/init_task.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/init_task.c b/init/init_task.c
index 136a8231355ab7..15d7392020cae9 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -139,7 +139,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.journal_info	= NULL,
 	INIT_CPU_TIMERS(init_task)
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
-	.timer_slack_ns	= 50000, /* 50 usec default slack */
+	.timer_slack_ns	= 50, /* 50 nsec default slack */
 	.thread_pid	= &init_struct_pid,
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
 #ifdef CONFIG_AUDIT

From f2a5b7386571cd0e41d2c6794682c84f7ca289c1 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 14 Nov 2023 13:29:45 +0000
Subject: [PATCH 37/38] sched/fair: remove upper limit on cpu number

Signed-off-by: Colin Ian King
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e792e85ae8607c..e25af5af5434e7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -190,7 +190,7 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
 */
 static unsigned int get_update_sysctl_factor(void)
 {
-	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
+	unsigned int cpus = num_online_cpus();
 	unsigned int factor;
 
 	switch (sysctl_sched_tunable_scaling) {

From d0c43823f97b2b1f9ac2330abe4ffa066ec888c9 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 24 Apr 2024 16:45:47 +0100
Subject: [PATCH 38/38] net: sock: increase default number of _SK_MEM_PACKETS to 1024

Scale these by a factor of 4 to improve socket performance. On typical
x86-64 builds, where SKB_TRUESIZE(256) works out to 832 bytes, this
raises SK_WMEM_MAX and SK_RMEM_MAX from 256 * 832 = 212992 bytes (the
familiar 208 KiB default) to 1024 * 832 = 851968 bytes (832 KiB).

Signed-off-by: Colin Ian King
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 584fc86d684038..da0a3e5bc99f5b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2820,7 +2820,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
-#define _SK_MEM_PACKETS		256
+#define _SK_MEM_PACKETS		1024
 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)