From e6e8b21fc9d6ef31c6d6301a7293842e8b0b7129 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 23 Jun 2015 01:26:52 -0500 Subject: [PATCH 01/30] i8042: decrease debug message level to info Author: Arjan van de Ven Signed-off-by: Miguel Bernal Marin Signed-off-by: Jose Carlos Venegas Munoz --- drivers/input/serio/i8042.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 6dac7c1853a541..fab04cd8a7a095 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; - pr_err("Failed to enable KBD port\n"); + pr_info("Failed to enable KBD port\n"); return -EIO; } @@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; - pr_err("Failed to enable AUX port\n"); + pr_info("Failed to enable AUX port\n"); return -EIO; } @@ -732,7 +732,7 @@ static int i8042_check_mux(void) i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - pr_err("Failed to disable AUX port, can't use MUX\n"); + pr_info("Failed to disable AUX port, can't use MUX\n"); return -EIO; } @@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) do { if (i8042_command(&param, I8042_CMD_CTL_TEST)) { - pr_err("i8042 controller selftest timeout\n"); + pr_info("i8042 controller selftest timeout\n"); return -ENODEV; } @@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else - pr_err("i8042 controller selftest failed\n"); + pr_info("i8042 controller selftest failed\n"); return -EIO; #endif } From 16ba690d60713879712a49a1eb8e177708d7bf97 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 11 Jan 2016 10:01:44 -0600 Subject: [PATCH 02/30] increase the ext4 default commit age Both the VM and EXT4 have a "commit to disk after X seconds" time. Currently the EXT4 time is shorter than our VM time, which is a bit suboptimal; it's better for performance to let the VM do the writeouts in bulk rather than something deep in the journalling layer. (DISTRO TWEAK -- NOT FOR UPSTREAM) Signed-off-by: Arjan van de Ven Signed-off-by: Jose Carlos Venegas Munoz --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0b7242370b5673..16b8fc483b3d35 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -45,7 +45,7 @@ /* * The default maximum commit age, in seconds.
*/ -#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 +#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 #ifdef CONFIG_JBD2_DEBUG /* From 78aae73f3e7d38423a201712cbdfb2f1df2ba858 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 14 Mar 2016 11:22:09 -0600 Subject: [PATCH 03/30] silence rapl --- drivers/powercap/intel_rapl_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 26d00b1853b421..3e239d6548b523 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1518,7 +1518,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (!id) { - pr_err("driver does not support CPU family %d model %d\n", + pr_info("driver does not support CPU family %d model %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; From f243bb0d107144dabab04f63eeb4cf251295d965 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 14 Mar 2016 11:10:58 -0600 Subject: [PATCH 04/30] pci pme wakeups Reduce wakeups for PME checks, which are a workaround for miswired boards (sadly, too many of them) in laptops. --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2127aba3550b5d..cf5c72a88c2bae 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -62,7 +62,7 @@ struct pci_pme_device { struct pci_dev *dev; }; -#define PME_TIMEOUT 1000 /* How long between PME checks */ +#define PME_TIMEOUT 4000 /* How long between PME checks */ static void pci_dev_d3_sleep(struct pci_dev *dev) { From 9071085217a6c530425ea8521537f1c270e3dac5 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 6 May 2019 12:57:09 -0500 Subject: [PATCH 05/30] ksm-wakeups reduce wakeups in ksm by adding rounding (aligning) when the sleep times are 1 second or longer Signed-off-by: Arjan van de Ven --- kernel/watchdog.c | 2 +- mm/ksm.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8e61f21e7e33e2..be1439d38f260c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; -int __read_mostly watchdog_thresh = 10; +int __read_mostly watchdog_thresh = 40; static int __read_mostly nmi_watchdog_available; struct cpumask watchdog_cpumask __read_mostly; diff --git a/mm/ksm.c b/mm/ksm.c index c19fcca9bc03dc..7009cf42be76b1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2427,9 +2427,14 @@ static int ksm_scan_thread(void *nothing) if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); - wait_event_interruptible_timeout(ksm_iter_wait, - sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), - msecs_to_jiffies(sleep_ms)); + if (sleep_ms >= 1000) + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(round_jiffies_relative(sleep_ms))); + else + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); From cd4b867e2751c014479881acc23bf7baf351a1e1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 19 Mar 2016 21:32:19 -0400 Subject: [PATCH 06/30] intel_idle: tweak cpuidle cstates Increase 
target_residency in cpuidle cstates. Tune intel_idle to be a bit less aggressive; Clear Linux is cleaner in hygiene (wakeups) than the average Linux, so we can afford to change these in a way that increases performance while keeping power efficiency. --- drivers/idle/intel_idle.c | 50 +++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cfeb24d40d3789..8ac40b7c7b4127 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, - .target_residency = 100, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 1500, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 2000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 5000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 40, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags =
MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 2000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 4000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 7000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 70, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 85, - .target_residency = 200, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x33", .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 124, - .target_residency = 800, + .target_residency = 3000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 200, - .target_residency = 800, + .target_residency = 3200, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 480, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 890, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 300, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc 
= "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 4, - .target_residency = 4, + .target_residency = 40, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 170, - .target_residency = 600, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -942,7 +942,7 @@ static struct cpuidle_state adl_n_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, - .target_residency = 4, + .target_residency = 40, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { From d768ec76c021a4cfd9b85854f141702c7e2ddb5a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 11 Feb 2015 17:28:14 -0600 Subject: [PATCH 07/30] smpboot: reuse timer calibration No point recalibrating for known-constant TSC ... saves 200ms+ of boot time. --- arch/x86/kernel/tsc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cceb..c2f80184fd3325 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) if (!constant_tsc || !mask) return 0; + if (cpu != 0) + return cpu_data(0).loops_per_jiffy; + sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; From ee7c9cb0b9c9f596b2ee24d4883f27fe994a9ecb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 2 Jun 2016 23:36:32 -0500 Subject: [PATCH 08/30] initialize ata before graphics ATA init is the long pole in the boot process, and it's asynchronous.
Move the graphics init after it so that ATA and graphics initialize in parallel. --- drivers/Makefile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9bd..1e1a0832fb48a1 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -59,15 +59,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb and intelfb depend on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -79,6 +72,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb and intelfb depend on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ From df6ed5ed28a8543a425e6ef839102027069f1a78 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 6 Jan 2017 15:34:09 +0000 Subject: [PATCH 09/30] ipv4/tcp: allow the memory tuning for tcp to go a little bigger than default --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4f2205756cfeee..2d20275dd6504d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4810,8 +4810,8 @@ void __init tcp_init(void) tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL*1024*1024, limit); - max_rshare = min(6UL*1024*1024, limit); + max_wshare = min(16UL*1024*1024, limit); + max_rshare = min(16UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; From 010e3c9ce23b3a2e326859e6e889f4190bd16310 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 17 May 2017 01:52:11 +0000 Subject: [PATCH 10/30] init: wait for partition and retry scan As Clear Linux boots fast, the device may not be ready when the mounting code is reached, so a retry device scan is performed every 0.5 sec for up to 40 sec, synchronized with the async tasks. Signed-off-by: Miguel Bernal Marin --- init/do_mounts.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 811e94daf0a84a..06fef7f97c028e 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) if (strcmp(name, "/dev/ram") == 0) return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) - return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTUUID=", 9) == 0) { + dev_t res; + int needtowait = 40<<1; + res = devt_from_partuuid(name + 9); + while (!res && needtowait) { + /* waiting 0.5 sec */ + msleep(500); + res = devt_from_partuuid(name + 9); + needtowait--; + } + return res; + } if (strncmp(name, "PARTLABEL=", 10) == 0) return devt_from_partlabel(name + 10); if (strncmp(name, "/dev/", 5) == 0) @@ -612,7 +622,9 @@ void __init prepare_namespace(void) * For example, it is not atypical to wait 5 seconds here * for the touchpad of a laptop to initialize.
*/ + async_synchronize_full(); wait_for_device_probe(); + async_synchronize_full(); md_run_setup(); From b202143bbc9a33f6bd89479d31e865dfa061d767 Mon Sep 17 00:00:00 2001 From: William Douglas Date: Wed, 20 Jun 2018 17:23:21 +0000 Subject: [PATCH 11/30] enable stateless firmware loading Prefer the order of specific version before generic and /etc before /lib to enable the user to give specific overrides for generic firmware and distribution firmware. Signed-off-by: Kai Krakow --- drivers/base/firmware_loader/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 7c3590fd97c28d..bb4880e10581f5 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -470,6 +470,8 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv, static char fw_path_para[256]; static const char * const fw_path[] = { fw_path_para, + "/etc/firmware/" UTS_RELEASE, + "/etc/firmware", "/lib/firmware/updates/" UTS_RELEASE, "/lib/firmware/updates", "/lib/firmware/" UTS_RELEASE, From d7589d0f6baa5066790468a1fd6d6779ebbf173c Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 2 Aug 2018 12:03:22 -0700 Subject: [PATCH 12/30] migrate some systemd defaults to the kernel defaults. These settings are needed to prevent networking issues when the networking modules come up by default without explicit settings, which breaks some cases. We don't want the modprobe settings to be read at boot time if we're not going to do anything else ever. Signed-off-by: Kai Krakow --- drivers/net/dummy.c | 2 +- include/uapi/linux/if_bonding.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index aa0fc00faecbe8..b93a4d71be29e0 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -43,7 +43,7 @@ #define DRV_NAME "dummy" -static int numdummies = 1; +static int numdummies = 0; /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837dbf..bf8e2af101a3c8 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -82,7 +82,7 @@ #define BOND_STATE_ACTIVE 0 /* link is active */ #define BOND_STATE_BACKUP 1 /* link is backup */ -#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ +#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ From fc80743c581f8c314d1485b44781f1517293475d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 8 Dec 2018 18:21:32 +0000 Subject: [PATCH 13/30] use lfence instead of rep and nop Signed-off-by: Kai Krakow --- arch/x86/include/asm/vdso/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index 57b1a7034c640a..e2c45674f98975 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -10,7 +10,7 @@ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ static __always_inline void rep_nop(void) { - asm volatile("rep; nop" ::: "memory"); + asm volatile("lfence" ::: "memory"); } static __always_inline void cpu_relax(void) From 44b41a32eade6cafee849f695a981ce81ac3190b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 13 Dec 2018 01:00:49 +0000 Subject: [PATCH 14/30] do accept() in LIFO order for cache efficiency Signed-off-by: Kai Krakow --- include/linux/wait.h | 2 ++ kernel/sched/wait.c | 24 ++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 7f5a51aae0a73d..8e57dcc0469811 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); @@ -1192,6 +1193,7 @@ do { \ */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 9860bb9a847cf0..c7f045873a5fbd 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -289,6 +300,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); + set_current_state(state); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { wq_entry->flags = flags; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4e84ed21d16fed..c6e54eca42d983 100644 --- 
a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -588,7 +588,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. */ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, + prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) From 6a6df016143aa13cf1e7e5bdb3b9b0c56000316d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 18 Feb 2018 23:35:41 +0000 Subject: [PATCH 15/30] locking: rwsem: spin faster tweak rwsem owner spinning a bit Signed-off-by: Kai Krakow --- kernel/locking/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44873594de0316..fe62d59f2bdcf4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; + int i = 0; lockdep_assert_preemption_disabled(); @@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - cpu_relax(); + if (i++ > 1000) + cpu_relax(); } return state; From 05488791300742e01c12cdc9329aa4e794d6c207 Mon Sep 17 00:00:00 2001 From: Joe Konno Date: Tue, 25 Jun 2019 10:35:54 -0700 Subject: [PATCH 16/30] ata: libahci: ignore staggered spin-up Change libahci to ignore firmware's staggered spin-up flag. End-users who wish to honor firmware's SSS flag can add the following kernel parameter to a new file at /etc/kernel/cmdline.d/ignore_sss.conf: libahci.ignore_sss=0 And then run sudo clr-boot-manager update Signed-off-by: Joe Konno --- drivers/ata/libahci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 954386a2b5002b..cef60f30278c53 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -34,14 +34,14 @@ #include "libata.h" static int ahci_skip_host_reset; -int ahci_ignore_sss; +int ahci_ignore_sss=1; EXPORT_SYMBOL_GPL(ahci_ignore_sss); module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444); MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip)"); module_param_named(ignore_sss, ahci_ignore_sss, int, 0444); -MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore)"); +MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore [default])"); static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy, unsigned hints); From 3fafb0fce224919107438f3d0572538ff44fa9a4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 10 Aug 2019 03:19:04 +0000 Subject: [PATCH 17/30] print CPU that faults print cpu number when we print a crash Signed-off-by: Kai Krakow --- arch/x86/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8bc..1a14f52added00 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", loglvl, tsk->comm, task_pid_nr(tsk), address, - (void *)regs->ip, (void *)regs->sp, error_code); + (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); print_vma_addr(KERN_CONT " in ", 
regs->ip); From 66dc1ca43088f13c0800aee46a7e8c650dcb8905 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 19 Aug 2021 14:49:47 -0700 Subject: [PATCH 18/30] x86/microcode: Add an option to reload microcode even if revision is the same This is a POC to support rollback. It is a simple version: the admin uses echo 2 instead of echo 1 to reload, and no version checks are done. #echo 1 > /sys/devices/system/cpu/microcode/reload Writing 2 to the reload file, as follows, reloads the microcode even if the revision is lower than the one already loaded. #echo 2 > /sys/devices/system/cpu/microcode/reload Signed-off-by: Ashok Raj --- arch/x86/kernel/cpu/microcode/core.c | 40 ++++++++++++++++++++++++++- arch/x86/kernel/cpu/microcode/intel.c | 14 ++++++---- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6a41cee242f6d7..18dc2dd80c890b 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -44,6 +44,8 @@ static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr = true; +bool ucode_rollback = false; +int enable_rollback = 0; bool initrd_gone; @@ -80,6 +82,26 @@ static u32 final_levels[] = { 0, /* T-101 terminator */ }; +static int __init ucode_setup(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "rollback", 8)) { + enable_rollback = 1; + pr_info("Microcode Rollback Enabled\n"); + } + str += strcspn(str, ","); + while (*str == ',') + str++; + } + return 0; +} + +__setup("ucode=", ucode_setup); + + /* * Check the current patch level on this CPU. * @@ -513,6 +535,7 @@ static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { + struct cpuinfo_x86 *c = &boot_cpu_data; enum ucode_state tmp_ret = UCODE_OK; int bsp = boot_cpu_data.cpu_index; unsigned long val; @@ -522,7 +545,7 @@ static ssize_t reload_store(struct device *dev, if (ret) return ret; - if (val != 1) + if (!val || val > 2) return size; cpus_read_lock(); @@ -530,6 +553,20 @@ static ssize_t reload_store(struct device *dev, ret = check_online_cpus(); if (ret) goto put; + /* + * Check if the vendor is Intel to permit reloading + * microcode even if the revision is unchanged. + * This is typically used during development of microcode + * and changing rev is a pain. + */ + if ((val == 2) && ((c->x86_vendor != X86_VENDOR_INTEL) || !enable_rollback)) + return size; + else if (val == 2) { + mutex_lock(&microcode_mutex); + ucode_rollback = true; + mutex_unlock(&microcode_mutex); + } tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); if (tmp_ret != UCODE_NEW) @@ -540,6 +577,7 @@ static ssize_t reload_store(struct device *dev, mutex_unlock(&microcode_mutex); put: + ucode_rollback = false; cpus_read_unlock(); if (ret == 0) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 1fcbd671f1dffc..11acd1a4ca9166 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -44,6 +44,7 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; +extern bool ucode_rollback; /* * Returns 1 if update has been found, 0 otherwise.
@@ -80,7 +81,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev { struct microcode_header_intel *mc_hdr = mc; - if (mc_hdr->rev <= new_rev) + if (!ucode_rollback && mc_hdr->rev <= new_rev) return 0; return find_matching_signature(mc, csig, cpf); @@ -120,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne if (find_matching_signature(data, sig, pf)) { prev_found = true; - if (mc_hdr->rev <= mc_saved_hdr->rev) + if (!ucode_rollback && mc_hdr->rev <= mc_saved_hdr->rev) continue; p = memdup_patch(data, size); @@ -649,7 +650,7 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) phdr = (struct microcode_header_intel *)iter->data; - if (phdr->rev <= uci->cpu_sig.rev) + if (!ucode_rollback && phdr->rev <= uci->cpu_sig.rev) continue; if (!find_matching_signature(phdr, @@ -734,10 +735,11 @@ static enum ucode_state apply_microcode_intel(int cpu) * already. */ rev = intel_get_microcode_revision(); - if (rev >= mc->hdr.rev) { + if (!ucode_rollback && rev >= mc->hdr.rev) { ret = UCODE_OK; goto out; - } + } else if (ucode_rollback) + ret = UCODE_OK; /* * Writeback and invalidate caches before updating microcode to avoid @@ -756,7 +758,7 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_ERROR; } - if (bsp && rev != prev_rev) { + if (bsp && ((rev != prev_rev) || ucode_rollback)) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", rev, mc->hdr.date & 0xffff, From 202c459e48e9429fc97d7c79bd0a3e7e65b01201 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 11 Nov 2019 23:12:11 +0000 Subject: [PATCH 19/30] nvme workaround Signed-off-by: Kai Krakow --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7e3893d06babdf..8984f328a4f6fe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -58,7 +58,7 @@ static u8 nvme_max_retries = 5; module_param_named(max_retries, nvme_max_retries, byte, 0644); MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); -static unsigned long default_ps_max_latency_us = 100000; +static unsigned long default_ps_max_latency_us = 200; module_param(default_ps_max_latency_us, ulong, 0644); MODULE_PARM_DESC(default_ps_max_latency_us, "max power saving latency for new devices; use PM QOS to change per device"); From 451db4b662ac48371fabf0665203f2369d18fbb7 Mon Sep 17 00:00:00 2001 From: Alexander Koskovich Date: Wed, 12 Feb 2020 22:47:12 +0000 Subject: [PATCH 20/30] don't report an error if PowerClamp runs on another CPU Signed-off-by: Kai Krakow --- drivers/thermal/intel/intel_powerclamp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index b80e25ec12615f..187b4ee6e9f5df 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .set_cur_state = powerclamp_set_cur_state, }; +static const struct x86_cpu_id amd_cpu[] = { + { X86_VENDOR_AMD }, + {}, +}; + static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), {} @@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); static int __init powerclamp_probe(void) { + if (x86_match_cpu(amd_cpu)){ + pr_info("Intel PowerClamp does not support AMD CPUs\n"); + return
-ENODEV; + } + if (!x86_match_cpu(intel_powerclamp_ids)) { pr_err("CPU does not support MWAIT\n"); return -ENODEV; From 93c895dd2633113e84a20559db6efccc0607cde7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 16 Nov 2021 17:39:25 +0000 Subject: [PATCH 21/30] itmt_epb: use epb to scale itmt Signed-off-by: Kai Krakow --- arch/x86/include/asm/topology.h | 1 + arch/x86/kernel/cpu/intel_epb.c | 4 ++++ arch/x86/kernel/itmt.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 458c891a827365..d86eb1ebf59f5b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -175,6 +175,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled; /* Interface to set priority of a cpu */ void sched_set_itmt_core_prio(int prio, int core_cpu); +void sched_set_itmt_power_ratio(int power_ratio, int core_cpu); /* Interface to notify scheduler that system supports ITMT */ int sched_set_itmt_support(void); diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index fbaf12e43f4160..c8c2d6f1a8aca2 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev, if (ret < 0) return ret; + /* update the ITMT scheduler logic to use the power policy data */ + /* scale the val up by 2 so the range is 224 - 256 */ + sched_set_itmt_power_ratio(256 - val * 2, cpu); + return count; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9ff480e94511b8..d4326e050fb70c 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -25,6 +25,7 @@ static DEFINE_MUTEX(itmt_update_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio); /* Boolean to track if system has ITMT capabilities */ static bool __read_mostly sched_itmt_capable; @@ -169,7 +170,12 @@ void sched_clear_itmt_support(void) int arch_asym_cpu_priority(int cpu) { - return per_cpu(sched_core_priority, cpu); + int power_ratio = per_cpu(sched_power_ratio, cpu); + + /* a power ratio of 0 (uninitialized) is assumed to be maximum */ + if (power_ratio == 0) + power_ratio = 256 - 2 * 6; + return per_cpu(sched_core_priority, cpu) * power_ratio / 256; } /** @@ -203,3 +209,24 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) i++; } } + +/** + * sched_set_itmt_power_ratio() - Set CPU priority based on ITMT + * @power_ratio: The power scaling ratio [1..256] for the core + * @core_cpu: The cpu number associated with the core + * + * Set a scaling to the cpu performance based on long term power + * settings (like EPB). + * + * Note this is for the policy not for the actual dynamic frequency; + * the frequency will increase itself as workloads run on a core. + */ + +void sched_set_itmt_power_ratio(int power_ratio, int core_cpu) +{ + int cpu; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + per_cpu(sched_power_ratio, cpu) = power_ratio; + } +} From 1ab5b8829bed2f7bf4d36861b38c6434120f2418 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 18 Nov 2021 16:09:47 +0000 Subject: [PATCH 22/30] itmt2 ADL fixes On systems with overclocking enabled, CPPC Highest Performance can be hard coded to 0xff. In this case even if we have cores with different highest performance, ITMT can't be enabled as the current implementation depends on CPPC Highest Performance. 
On such systems we can use the MSR_HWP_CAPABILITIES maximum performance field when CPPC.Highest Performance is 0xff. Due to legacy reasons, we can't solely depend on MSR_HWP_CAPABILITIES as in some older systems CPPC Highest Performance is the only way to identify different performing cores. Signed-off-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6ff73c30769fae..46516074bfd032 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -365,6 +365,13 @@ static void intel_pstate_set_itmt_prio(int cpu) * update them at any time after it has been called. */ sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); + /* + * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. + * In this case we can't use CPPC.highest_perf to enable ITMT. + * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. + */ + if (cppc_perf.highest_perf == 0xff) + cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); if (max_highest_perf <= min_highest_perf) { if (cppc_perf.highest_perf > max_highest_perf) From abf6581254b041be3b48da5d12e78d4f18ceea8a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 23 Nov 2021 17:38:50 +0000 Subject: [PATCH 23/30] add a per cpu minimum high watermark and tune batch size make sure there are at least 1024 per-cpu pages... a reasonably small amount for today's systems Signed-off-by: Kai Krakow --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e60657875d328..688ebfafd58ecf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7069,11 +7069,11 @@ static int zone_batchsize(struct zone *zone) /* * The number of pages to batch allocate is either ~0.1% - * of the zone or 1MB, whichever is smaller. The batch + * of the zone or 4MB, whichever is smaller. The batch * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, 4 * SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; From 9fa32b94f52cd1d8a4bfb11ede172cafcbf889c9 Mon Sep 17 00:00:00 2001 From: "Brett T. Warden" Date: Mon, 19 Sep 2022 08:52:45 -0700 Subject: [PATCH 24/30] scale Signed-off-by: Kai Krakow --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 266a1ab054341c..25a5289cb334de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -611,7 +611,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); x = __this_cpu_add_return(stats_updates, abs(val)); - if (x > MEMCG_CHARGE_BATCH) { + if (x > MEMCG_CHARGE_BATCH * 128) { /* * If stats_flush_threshold exceeds the threshold * (>num_online_cpus()), cgroup stats update will be triggered From 0471ed8289eef53eb281b9e06c30386790554cb4 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 25 Aug 2022 15:55:26 -0700 Subject: [PATCH 25/30] sched/fair: Simplify asym_packing logic for SMT sched groups When the destination CPU is an SMT sibling and idle, it can only help the busiest group if all of its other SMT siblings are also idle. Otherwise, there is no increase in throughput. It does not matter whether the busiest group has SMT siblings.
Simply check if there are any tasks running on the local group before proceeding. Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Len Brown Cc: Mel Gorman Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim C. Chen Cc: Valentin Schneider Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Len Brown Signed-off-by: Ricardo Neri --- kernel/sched/fair.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4a0b8bd941c78..048959ebbc9c08 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8900,12 +8900,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sched_group *sg) { #ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; + bool local_is_smt; int sg_busy_cpus; local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; if (!local_is_smt) { @@ -8926,25 +8924,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); } - /* @dst_cpu has SMT siblings. */ - - if (sg_is_smt) { - int local_busy_cpus = sds->local->group_weight - - sds->local_stat.idle_cpus; - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; - - if (busy_cpus_delta == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - - return false; - } - /* - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). + * @dst_cpu has SMT siblings. When both @dst_cpu and the busiest core + * have one or more busy siblings, moving tasks between them results + * in the same throughput. Only if all the siblings of @dst_cpu are + * idle throughput can increase. + * + * If the difference in the number of busy CPUs is two or more, let + * find_busiest_group() take care of it. */ - if (!sds->local_stat.sum_nr_running) + if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running) return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); return false; From 08777e6e9c3792e136828a97fe0e15235222b841 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 25 Aug 2022 15:55:28 -0700 Subject: [PATCH 26/30] sched/fair: Let lower-priority CPUs do active balancing When more than one SMT sibling of a physical core is busy, an idle CPU of lower priority can help. Indicate that the low-priority CPU can do active balancing from the high-priority CPU only if they belong to separate cores. Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Len Brown Cc: Mel Gorman Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim C. Chen Cc: Valentin Schneider Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Len Brown Signed-off-by: Ricardo Neri --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 048959ebbc9c08..155862be793155 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10151,9 +10151,14 @@ asym_active_balance(struct lb_env *env) * ASYM_PACKING needs to force migrate tasks from busy but * lower priority CPUs in order to pack all tasks in the * highest priority CPUs.
+ * + * If the busy CPU has higher priority but is an SMT sibling + * in which other SMT siblings are also busy, a lower-priority + * CPU in a separate core can help. */ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !(env->sd->flags & SD_SHARE_CPUCAPACITY)); } static inline bool From 73171cbbd2e43120e6a0fd278340bed192152d77 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 25 Aug 2022 15:55:29 -0700 Subject: [PATCH 27/30] x86/sched: Avoid unnecessary migrations within SMT domains Having different priorities for each SMT sibling triggers unnecessary load balancing towards the higher-priority sibling. The scheduler now has logic to allow lower-priority CPUs to relieve load from scheduling groups composed of SMT siblings with more than one busy sibling. Hence, it is no longer necessary to give different priorities to each of the SMT siblings of a physical core. Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Len Brown Cc: Mel Gorman Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim C. Chen Cc: Valentin Schneider Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Len Brown Signed-off-by: Ricardo Neri --- arch/x86/kernel/itmt.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index d4326e050fb70c..2158b43f7cd9fd 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -180,34 +180,21 @@ int arch_asym_cpu_priority(int cpu) /** * sched_set_itmt_core_prio() - Set CPU priority based on ITMT - * @prio: Priority of cpu core - * @core_cpu: The cpu number associated with the core + * @prio: Priority of @cpu + * @cpu: The CPU number * * The pstate driver will find out the max boost frequency * and call this function to set a priority proportional - * to the max boost frequency. CPU with higher boost + * to the max boost frequency. CPUs with higher boost * frequency will receive higher priority. * * No need to rebuild sched domain after updating * the CPU priorities. The sched domains have no * dependency on CPU priorities. */ -void sched_set_itmt_core_prio(int prio, int core_cpu) +void sched_set_itmt_core_prio(int prio, int cpu) { - int cpu, i = 1; - - for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { - int smt_prio; - - /* - * Ensure that the siblings are moved to the end - * of the priority chain and only used when - * all other high priority cpus are out of capacity. 
- */ - smt_prio = prio * smp_num_siblings / (i * i); - per_cpu(sched_core_priority, cpu) = smt_prio; - i++; - } + per_cpu(sched_core_priority, cpu) = prio * 64 - cpu; } /** From a4b7c0ed8b70c63a5e1383a483fb9a352af60457 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 4 Jan 2023 21:22:00 +0000 Subject: [PATCH 28/30] powerbump functionality --- arch/x86/kernel/Makefile | 2 + arch/x86/kernel/powerbump.c | 80 ++++++++++++++++++++++++++++++++ block/bio.c | 4 ++ drivers/cpuidle/governors/menu.c | 4 ++ fs/buffer.c | 4 ++ fs/jbd2/transaction.c | 2 + include/linux/powerbump.h | 14 ++++++ 7 files changed, 110 insertions(+) create mode 100644 arch/x86/kernel/powerbump.c create mode 100644 include/linux/powerbump.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f901658d9f7c08..7d931995efdc31 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-y += powerbump.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/powerbump.c b/arch/x86/kernel/powerbump.c new file mode 100644 index 00000000000000..c6b3762113bf8c --- /dev/null +++ b/arch/x86/kernel/powerbump.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Intel Corporation + * Author: Arjan van de Ven + * + * Kernel power-bump infrastructure + */ +#include +#include +#include + +static DEFINE_PER_CPU(unsigned long, bump_timeout); /* jiffies at which the lease for the bump times out */ + + + +/* + * a note about the use of the current cpu versus preemption. + * + * Most uses of in_power_bump() are inside local power management code, + * and are pinned to that cpu already. + * + * On the "set" side, interrupt level code is obviously also fully + * migration-race free. + * + * All other cases are exposed to a migration-race. + * + * The goal of powerbump is statistical rather than deterministic, + * e.g. on average the CPU that hits event X will go towards Y more + * often than not, and the impact of being wrong is a bit of extra + * power potentially for some short durations. + * Weighed against the costs in performance and complexity of dealing + * with the race, the race condition is acceptable. + * + * The second known race is where interrupt context might set a bump + * time in the middle of process context setting a different but smaller bump time, + * with the result that process context will win incorrectly, and the + * actual bump time will be less than expected, but still non-zero. + * Here also the cost of dealing with the race outweighs its + * limited impact.
+ */ + + +int in_power_bump(void) +{ + int cpu = raw_smp_processor_id(); + if (time_before(jiffies, per_cpu(bump_timeout, cpu))) + return 1; + + /* deal with wrap issues by keeping the stored bump value close to current */ + per_cpu(bump_timeout, cpu) = jiffies; + return 0; +} +EXPORT_SYMBOL_GPL(in_power_bump); + +void give_power_bump(int msecs) +{ + unsigned long nextjiffies; + int cpu; + /* we need to round up an extra jiffie */ + nextjiffies = jiffies + msecs_to_jiffies(msecs) + 1; + + cpu = raw_smp_processor_id(); + if (time_before(per_cpu(bump_timeout, cpu), nextjiffies)) + per_cpu(bump_timeout, cpu) = nextjiffies; + +} +EXPORT_SYMBOL_GPL(give_power_bump); + +static __init int powerbump_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(bump_timeout, cpu) = jiffies; + } + + return 0; +} + +late_initcall(powerbump_init); \ No newline at end of file diff --git a/block/bio.c b/block/bio.c index 57c2f327225bd1..08ba43fe3242b3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "blk.h" @@ -1294,6 +1295,7 @@ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) { + give_power_bump(BUMP_FOR_DISK); complete(bio->bi_private); } @@ -1319,6 +1321,8 @@ int submit_bio_wait(struct bio *bio) bio->bi_opf |= REQ_SYNC; submit_bio(bio); + give_power_bump(BUMP_FOR_DISK); + /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c4922684f30583..5bc5de2c1c694a 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -18,6 +18,7 @@ #include #include #include +#include #define BUCKETS 12 #define INTERVAL_SHIFT 3 @@ -279,6 +280,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } + if (in_power_bump() && latency_req > BUMP_LATENCY_THRESHOLD) + latency_req = BUMP_LATENCY_THRESHOLD; + /* determine the expected residency time, round up */ delta = tick_nohz_get_sleep_length(&delta_tick); if (unlikely(delta < 0)) { diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dde5..139a1b18b24014 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "internal.h" @@ -119,6 +120,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback); */ void __wait_on_buffer(struct buffer_head * bh) { + give_power_bump(BUMP_FOR_DISK); wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); @@ -156,6 +158,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { + give_power_bump(BUMP_FOR_DISK); __end_buffer_read_notouch(bh, uptodate); put_bh(bh); } @@ -163,6 +166,7 @@ EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { + give_power_bump(BUMP_FOR_DISK); if (uptodate) { set_buffer_uptodate(bh); } else { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a404ac1c178f0..f451099d9343a3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1104,6 +1105,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); spin_unlock(&jh->b_state_lock); + give_power_bump(BUMP_FOR_DISK); 
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } diff --git a/include/linux/powerbump.h b/include/linux/powerbump.h new file mode 100644 index 00000000000000..a740b147a54d85 --- /dev/null +++ b/include/linux/powerbump.h @@ -0,0 +1,14 @@ +#pragma once + +/* in nsecs */ +#define BUMP_LATENCY_THRESHOLD 2000 + + +/* bump time constants, in msec */ +#define BUMP_FOR_DISK 3 + + + +/* API prototypes */ +extern void give_power_bump(int msecs); +extern int in_power_bump(void); From 66c6adf5ffe518ea4953f543adeccf9e940b0c6d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 5 Jan 2023 16:52:33 +0000 Subject: [PATCH 29/30] add networking support for powerbump --- include/linux/powerbump.h | 3 ++- net/core/dev.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/powerbump.h b/include/linux/powerbump.h index a740b147a54d85..10df80d6f03f8e 100644 --- a/include/linux/powerbump.h +++ b/include/linux/powerbump.h @@ -5,7 +5,8 @@ /* bump time constants, in msec */ -#define BUMP_FOR_DISK 3 +#define BUMP_FOR_DISK 3 +#define BUMP_FOR_NETWORK 3 diff --git a/net/core/dev.c b/net/core/dev.c index 3be256051e99b9..bed00f3a22694d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -150,6 +150,7 @@ #include #include #include +#include <linux/powerbump.h> #include "dev.h" #include "net-sysfs.h" @@ -5744,6 +5745,7 @@ int netif_receive_skb(struct sk_buff *skb) int ret; trace_netif_receive_skb_entry(skb); + give_power_bump(BUMP_FOR_NETWORK); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); @@ -5768,6 +5770,7 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; + give_power_bump(BUMP_FOR_NETWORK); if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); From 7ee30f8d6c51aba3effe98184b32f18aae300f56 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 12 Jan 2023 19:19:04 +0000 Subject: [PATCH 30/30] futex bump --- include/linux/powerbump.h | 1 + kernel/futex/waitwake.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/linux/powerbump.h b/include/linux/powerbump.h index 10df80d6f03f8e..a17ed37744668e 100644 --- a/include/linux/powerbump.h +++ b/include/linux/powerbump.h @@ -7,6 +7,7 @@ /* bump time constants, in msec */ #define BUMP_FOR_DISK 3 #define BUMP_FOR_NETWORK 3 +#define BUMP_FOR_FUTEX 3 diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b94082033b..e4fc09a98cbcd6 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -3,6 +3,7 @@ #include #include #include +#include <linux/powerbump.h> #include "futex.h" @@ -336,6 +337,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); + give_power_bump(BUMP_FOR_FUTEX); /* Arm the timer */ if (timeout)
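
Taken together, patches 28-30 add a per-CPU "power bump" lease: hot paths that expect an imminent wakeup call give_power_bump(), and the menu cpuidle governor honors the lease by clamping its latency requirement to BUMP_LATENCY_THRESHOLD (2000 ns) while in_power_bump() returns nonzero, so only shallow C-states are chosen until the per-CPU bump_timeout expires. As a minimal sketch of how another subsystem could consume the same API (the IRQ handler and the BUMP_FOR_EXAMPLE constant below are illustrative assumptions, not part of the series):

/*
 * Hypothetical out-of-tree consumer of the powerbump lease API.
 * Only give_power_bump(), in_power_bump() and the 3 ms BUMP_*
 * constants come from the patches above; everything else here is
 * assumed for illustration.
 */
#include <linux/interrupt.h>
#include <linux/powerbump.h>

#define BUMP_FOR_EXAMPLE 3	/* msec, mirroring BUMP_FOR_DISK */

static irqreturn_t example_completion_irq(int irq, void *dev_id)
{
	/*
	 * A completion interrupt is usually followed by a task wakeup
	 * on this CPU; lease a few milliseconds of shallow-idle bias
	 * so the woken task does not pay a deep C-state exit penalty.
	 */
	give_power_bump(BUMP_FOR_EXAMPLE);
	return IRQ_HANDLED;
}

Like the in-tree call sites in bio.c, buffer.c and dev.c, the bump is fire-and-forget: a lease is just a per-CPU jiffies deadline, so a stale or migrated bump costs at most a few milliseconds of slightly higher power, the trade-off the comment block in powerbump.c argues is acceptable.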