From 0112c941af194dbf3978767addf784989b3d39b6 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Wed, 14 May 2025 09:37:49 -0400 Subject: [PATCH 001/353] ASoC: SOF: Intel: hda: Fix UAF when reloading module hda_generic_machine_select() appends -idisp to the tplg filename by allocating a new string with devm_kasprintf(), then stores the string right back into the global variable snd_soc_acpi_intel_hda_machines. When the module is unloaded, this memory is freed, resulting in a global variable pointing to freed memory. Reloading the module then triggers a use-after-free: BUG: KFENCE: use-after-free read in string+0x48/0xe0 Use-after-free read at 0x00000000967e0109 (in kfence-#99): string+0x48/0xe0 vsnprintf+0x329/0x6e0 devm_kvasprintf+0x54/0xb0 devm_kasprintf+0x58/0x80 hda_machine_select.cold+0x198/0x17a2 [snd_sof_intel_hda_generic] sof_probe_work+0x7f/0x600 [snd_sof] process_one_work+0x17b/0x330 worker_thread+0x2ce/0x3f0 kthread+0xcf/0x100 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x1a/0x30 kfence-#99: 0x00000000198a940f-0x00000000ace47d9d, size=64, cache=kmalloc-64 allocated by task 333 on cpu 8 at 17.798069s (130.453553s ago): devm_kmalloc+0x52/0x120 devm_kvasprintf+0x66/0xb0 devm_kasprintf+0x58/0x80 hda_machine_select.cold+0x198/0x17a2 [snd_sof_intel_hda_generic] sof_probe_work+0x7f/0x600 [snd_sof] process_one_work+0x17b/0x330 worker_thread+0x2ce/0x3f0 kthread+0xcf/0x100 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x1a/0x30 freed by task 1543 on cpu 4 at 141.586686s (6.665010s ago): release_nodes+0x43/0xb0 devres_release_all+0x90/0xf0 device_unbind_cleanup+0xe/0x70 device_release_driver_internal+0x1c1/0x200 driver_detach+0x48/0x90 bus_remove_driver+0x6d/0xf0 pci_unregister_driver+0x42/0xb0 __do_sys_delete_module+0x1d1/0x310 do_syscall_64+0x82/0x190 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by copying the match array with devm_kmemdup_array() before we modify it. Fixes: 5458411d7594 ("ASoC: SOF: Intel: hda: refactoring topology name fixup for HDA mach") Suggested-by: Peter Ujfalusi Acked-by: Peter Ujfalusi Signed-off-by: Tavian Barnes Link: https://patch.msgid.link/570b15570b274520a0d9052f4e0f064a29c950ef.1747229716.git.tavianator@tavianator.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index 96f00adc89091f..5347c433ac8eb0 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -1057,7 +1057,21 @@ static void hda_generic_machine_select(struct snd_sof_dev *sdev, if (!*mach && codec_num <= 2) { bool tplg_fixup = false; - hda_mach = snd_soc_acpi_intel_hda_machines; + /* + * make a local copy of the match array since we might + * be modifying it + */ + hda_mach = devm_kmemdup_array(sdev->dev, + snd_soc_acpi_intel_hda_machines, + 2, /* we have one entry + sentinel in the array */ + sizeof(snd_soc_acpi_intel_hda_machines[0]), + GFP_KERNEL); + if (!hda_mach) { + dev_err(bus->dev, + "%s: failed to duplicate the HDA match table\n", + __func__); + return; + } dev_info(bus->dev, "using HDA machine driver %s now\n", hda_mach->drv_name); From d082eb23321395cf2031a5214adc107ca7a9c91a Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Tue, 13 May 2025 20:37:44 +0800 Subject: [PATCH 002/353] ASoC: tegra210_ahub: Add check to of_device_get_match_data() In tegra_ahub_probe(), check the result of function of_device_get_match_data(), return an error code in case it fails. Signed-off-by: Yuanjun Gong Link: https://patch.msgid.link/20250513123744.3041724-1-ruc_gongyuanjun@163.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_ahub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/tegra/tegra210_ahub.c b/sound/soc/tegra/tegra210_ahub.c index 99683f292b5d84..ae4965a9f7649e 100644 --- a/sound/soc/tegra/tegra210_ahub.c +++ b/sound/soc/tegra/tegra210_ahub.c @@ -1359,6 +1359,8 @@ static int tegra_ahub_probe(struct platform_device *pdev) return -ENOMEM; ahub->soc_data = of_device_get_match_data(&pdev->dev); + if (!ahub->soc_data) + return -ENODEV; platform_set_drvdata(pdev, ahub); From da763124036d56f3fc2a65139c4e928322fbf94f Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 6 May 2025 02:54:23 +0800 Subject: [PATCH 003/353] ASoC: intel/sdw_utils: Assign initial value in asoc_sdw_rt_amp_spk_rtd_init() Initialize "ret" with "-EINVAL" to handle cases where "strstr()" for "codec_dai->component->name_prefix" doesn't find "-1" nor "-2". In that case "name_prefix" is invalid because for current implementation it's expected to have either "-1" or "-2" in it. (Maybe "-3", "-4" and so on in the future.) Link: https://scan5.scan.coverity.com/#/project-view/36179/10063?selectedIssue=1627120 Signed-off-by: I Hsin Cheng Link: https://patch.msgid.link/20250505185423.680608-1-richard120310@gmail.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_rt_amp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sdw_utils/soc_sdw_rt_amp.c b/sound/soc/sdw_utils/soc_sdw_rt_amp.c index 0538c252ba69b1..83c2368170cb5e 100644 --- a/sound/soc/sdw_utils/soc_sdw_rt_amp.c +++ b/sound/soc/sdw_utils/soc_sdw_rt_amp.c @@ -190,7 +190,7 @@ int asoc_sdw_rt_amp_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc const struct snd_soc_dapm_route *rt_amp_map; char codec_name[CODEC_NAME_SIZE]; struct snd_soc_dai *codec_dai; - int ret; + int ret = -EINVAL; int i; rt_amp_map = get_codec_name_and_route(dai, codec_name); From 088fe609df05a46e966856ab5e6618098b75ed1d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 May 2025 12:57:03 +0200 Subject: [PATCH 004/353] ASoC: dt-bindings: mediatek: Simplify mediatek,clk-provider "mediatek,clk-provider" property is a string, not an string array, thus "items" is not really correct. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250514105702.28622-2-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml | 7 +++---- .../bindings/sound/mt8186-mt6366-da7219-max98357.yaml | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml index 76d5a437dc8f4a..7ba2ea2dfa0b17 100644 --- a/Documentation/devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml @@ -96,10 +96,9 @@ patternProperties: mediatek,clk-provider: $ref: /schemas/types.yaml#/definitions/string description: Indicates dai-link clock master. - items: - enum: - - cpu - - codec + enum: + - cpu + - codec additionalProperties: false diff --git a/Documentation/devicetree/bindings/sound/mt8186-mt6366-da7219-max98357.yaml b/Documentation/devicetree/bindings/sound/mt8186-mt6366-da7219-max98357.yaml index cbc641ecbe94af..037f21443ad14c 100644 --- a/Documentation/devicetree/bindings/sound/mt8186-mt6366-da7219-max98357.yaml +++ b/Documentation/devicetree/bindings/sound/mt8186-mt6366-da7219-max98357.yaml @@ -124,10 +124,9 @@ patternProperties: mediatek,clk-provider: $ref: /schemas/types.yaml#/definitions/string description: Indicates dai-link clock master. - items: - enum: - - cpu - - codec + enum: + - cpu + - codec required: - link-name From 07ed7a6f28a1ad999585a1f6d1b3dc94839834f7 Mon Sep 17 00:00:00 2001 From: Brady Norander Date: Sun, 30 Mar 2025 09:08:45 -0400 Subject: [PATCH 005/353] ASoC: amd: use new ACP dev names for DAI links On AMD SoC platforms with an ACP2x gpu ip block (such as stoneyridge), the amdgpu driver will create several platform devices for the ACP ASoC driver to communicate with the ACP hardware block on the gpu. These platform devices include dma for audio and one or multiple i2s interfaces. The amdgpu driver has always created these platform devices with automatic ids. The ASoC machine drives hardcode the platform device name. This creates an issue where if the ACP platform devices are not the first to be created, the ids can be different to what the machine drivers expect, causing them to not find the ACP platform devices and failing to load. Switch to using static ids for these ACP platform devices so that the names never change. Depends on patch: drm/amdgpu: use static ids for ACP platform devs [1] [1] https://lore.kernel.org/all/20250325210517.2097188-1-bradynorander@gmail.com/ Signed-off-by: Brady Norander Link: https://patch.msgid.link/20250330130844.37870-2-bradynorander@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/acp-da7219-max98357a.c | 8 ++++---- sound/soc/amd/acp-es8336.c | 4 ++-- sound/soc/amd/acp-rt5645.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sound/soc/amd/acp-da7219-max98357a.c b/sound/soc/amd/acp-da7219-max98357a.c index 02b04f355ca663..42aa009c4e13d7 100644 --- a/sound/soc/amd/acp-da7219-max98357a.c +++ b/sound/soc/amd/acp-da7219-max98357a.c @@ -517,11 +517,11 @@ static const struct snd_soc_ops cz_rt5682_dmic1_cap_ops = { }; SND_SOC_DAILINK_DEF(designware1, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.1.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.1"))); SND_SOC_DAILINK_DEF(designware2, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.2.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.2"))); SND_SOC_DAILINK_DEF(designware3, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.3.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.3"))); SND_SOC_DAILINK_DEF(dlgs, DAILINK_COMP_ARRAY(COMP_CODEC("i2c-DLGS7219:00", "da7219-hifi"))); @@ -533,7 +533,7 @@ SND_SOC_DAILINK_DEF(adau, DAILINK_COMP_ARRAY(COMP_CODEC("ADAU7002:00", "adau7002-hifi"))); SND_SOC_DAILINK_DEF(platform, - DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.0.auto"))); + DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.0"))); static struct snd_soc_dai_link cz_dai_7219_98357[] = { { diff --git a/sound/soc/amd/acp-es8336.c b/sound/soc/amd/acp-es8336.c index 0193b3eae7a660..b16dde0e29871f 100644 --- a/sound/soc/amd/acp-es8336.c +++ b/sound/soc/amd/acp-es8336.c @@ -137,11 +137,11 @@ static const struct snd_soc_ops st_es8336_ops = { }; SND_SOC_DAILINK_DEF(designware1, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.2.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.1"))); SND_SOC_DAILINK_DEF(codec, DAILINK_COMP_ARRAY(COMP_CODEC("i2c-ESSX8336:00", "ES8316 HiFi"))); SND_SOC_DAILINK_DEF(platform, - DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.1.auto"))); + DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.0"))); static struct snd_soc_dai_link st_dai_es8336[] = { { diff --git a/sound/soc/amd/acp-rt5645.c b/sound/soc/amd/acp-rt5645.c index 72ddad24dbda7c..11d3731693801f 100644 --- a/sound/soc/amd/acp-rt5645.c +++ b/sound/soc/amd/acp-rt5645.c @@ -108,15 +108,15 @@ static const struct snd_soc_ops cz_aif1_ops = { }; SND_SOC_DAILINK_DEF(designware1, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.1.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.1"))); SND_SOC_DAILINK_DEF(designware2, - DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.2.auto"))); + DAILINK_COMP_ARRAY(COMP_CPU("designware-i2s.2"))); SND_SOC_DAILINK_DEF(codec, DAILINK_COMP_ARRAY(COMP_CODEC("i2c-10EC5650:00", "rt5645-aif1"))); SND_SOC_DAILINK_DEF(platform, - DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.0.auto"))); + DAILINK_COMP_ARRAY(COMP_PLATFORM("acp_audio_dma.0"))); static struct snd_soc_dai_link cz_dai_rt5650[] = { { From a066ab2721c42f5f9817210fde50e1d327fc70e8 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 14 May 2025 19:53:38 +0200 Subject: [PATCH 006/353] ASoC: q6apm-lpass-dais: Print APM port id in decimal on enable error Change the port enable failure error message format specifier to make it less confusing. Take the chance to align the style ('fail'->'Failed') while at it. Signed-off-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250514-topic-asoc_print_hexdec-v1-1-85e90947ec4f@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index 9c98a35ad09945..a0d90462fd6a38 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -206,7 +206,7 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s rc = q6apm_graph_start(dai_data->graph[dai->id]); if (rc < 0) { - dev_err(dai->dev, "fail to start APM port %x\n", dai->id); + dev_err(dai->dev, "Failed to start APM port %d\n", dai->id); goto err; } dai_data->is_port_started[dai->id] = true; From 867956e4f71bbda045cc04cb7f33c9a1c5b74f06 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 14 May 2025 17:45:45 +0800 Subject: [PATCH 007/353] ASoC: codecs: add support for ES8389 The driver is for codec es8389 of everest which is different from ES8388 Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250514094546.35508-2-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 5 + sound/soc/codecs/Makefile | 4 +- sound/soc/codecs/es8389.c | 962 ++++++++++++++++++++++++++++++++++++++ sound/soc/codecs/es8389.h | 140 ++++++ 4 files changed, 1110 insertions(+), 1 deletion(-) create mode 100644 sound/soc/codecs/es8389.c create mode 100644 sound/soc/codecs/es8389.h diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 21df6783dc267a..8fe795504dbb5c 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -120,6 +120,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_ES8326 imply SND_SOC_ES8328_SPI imply SND_SOC_ES8328_I2C + imply SND_SOC_ES8389 imply SND_SOC_ES7134 imply SND_SOC_ES7241 imply SND_SOC_FRAMER @@ -1211,6 +1212,10 @@ config SND_SOC_ES8328_SPI depends on SPI_MASTER select SND_SOC_ES8328 +config SND_SOC_ES8389 + tristate "Everest Semi ES8389 CODEC" + depends on I2C + config SND_SOC_FRAMER tristate "Framer codec" depends on GENERIC_FRAMER diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index cc2f9b589b56d6..c92824713df069 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -134,6 +134,7 @@ snd-soc-es8326-y := es8326.o snd-soc-es8328-y := es8328.o snd-soc-es8328-i2c-y := es8328-i2c.o snd-soc-es8328-spi-y := es8328-spi.o +snd-soc-es8389-y := es8389.o snd-soc-framer-y := framer-codec.o snd-soc-gtm601-y := gtm601.o snd-soc-hdac-hdmi-y := hdac_hdmi.o @@ -556,6 +557,7 @@ obj-$(CONFIG_SND_SOC_ES8326) += snd-soc-es8326.o obj-$(CONFIG_SND_SOC_ES8328) += snd-soc-es8328.o obj-$(CONFIG_SND_SOC_ES8328_I2C)+= snd-soc-es8328-i2c.o obj-$(CONFIG_SND_SOC_ES8328_SPI)+= snd-soc-es8328-spi.o +obj-$(CONFIG_SND_SOC_ES8389) += snd-soc-es8389.o obj-$(CONFIG_SND_SOC_FRAMER) += snd-soc-framer.o obj-$(CONFIG_SND_SOC_GTM601) += snd-soc-gtm601.o obj-$(CONFIG_SND_SOC_HDAC_HDMI) += snd-soc-hdac-hdmi.o @@ -849,4 +851,4 @@ obj-$(CONFIG_SND_SOC_LPASS_RX_MACRO) += snd-soc-lpass-rx-macro.o obj-$(CONFIG_SND_SOC_LPASS_TX_MACRO) += snd-soc-lpass-tx-macro.o # Mux -obj-$(CONFIG_SND_SOC_SIMPLE_MUX) += snd-soc-simple-mux.o +obj-$(CONFIG_SND_SOC_SIMPLE_MUX) += snd-soc-simple-mux.o \ No newline at end of file diff --git a/sound/soc/codecs/es8389.c b/sound/soc/codecs/es8389.c new file mode 100644 index 00000000000000..ba1763f36f1712 --- /dev/null +++ b/sound/soc/codecs/es8389.c @@ -0,0 +1,962 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * es8389.c -- ES8389 ALSA SoC Audio Codec + * + * Copyright Everest Semiconductor Co., Ltd + * + * Authors: Michael Zhang (zhangyi@everest-semi.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "es8389.h" + + +/* codec private data */ + +struct es8389_private { + struct regmap *regmap; + struct clk *mclk; + unsigned int sysclk; + int mastermode; + + u8 mclk_src; + enum snd_soc_bias_level bias_level; +}; + +static bool es8389_volatile_register(struct device *dev, + unsigned int reg) +{ + if ((reg <= 0xff)) + return true; + else + return false; +} + +static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -9550, 50, 0); +static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -9550, 50, 0); +static const DECLARE_TLV_DB_SCALE(pga_vol_tlv, 0, 300, 0); +static const DECLARE_TLV_DB_SCALE(mix_vol_tlv, -9500, 100, 0); +static const DECLARE_TLV_DB_SCALE(alc_target_tlv, -3200, 200, 0); +static const DECLARE_TLV_DB_SCALE(alc_max_level, -3200, 200, 0); + +static int es8389_dmic_set(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_dapm_kcontrol_component(kcontrol); + struct snd_soc_dapm_context *dapm = snd_soc_dapm_kcontrol_dapm(kcontrol); + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; + unsigned int val; + bool changed1, changed2; + + val = ucontrol->value.integer.value[0]; + if (val > 1) + return -EINVAL; + + if (val) { + regmap_update_bits_check(es8389->regmap, ES8389_DMIC_EN, 0xC0, 0xC0, &changed1); + regmap_update_bits_check(es8389->regmap, ES8389_ADC_MODE, 0x03, 0x03, &changed2); + } else { + regmap_update_bits_check(es8389->regmap, ES8389_DMIC_EN, 0xC0, 0x00, &changed1); + regmap_update_bits_check(es8389->regmap, ES8389_ADC_MODE, 0x03, 0x00, &changed2); + } + + if (changed1 & changed2) + return snd_soc_dapm_mux_update_power(dapm, kcontrol, val, e, NULL); + else + return 0; +} + +static const char *const alc[] = { + "ALC OFF", + "ADCR ALC ON", + "ADCL ALC ON", + "ADCL & ADCL ALC ON", +}; + +static const char *const ramprate[] = { + "0.125db/1 LRCK", + "0.125db/4 LRCK", + "0.125db/8 LRCK", + "0.125db/16 LRCK", + "0.125db/32 LRCK", + "0.125db/64 LRCK", + "0.125db/128 LRCK", + "0.125db/256 LRCK", + "0.125db/512 LRCK", + "0.125db/1024 LRCK", + "0.125db/2048 LRCK", + "0.125db/4096 LRCK", + "0.125db/8192 LRCK", + "0.125db/16384 LRCK", + "0.125db/32768 LRCK", + "0.125db/65536 LRCK", +}; + +static const char *const winsize[] = { + "2 LRCK", + "4 LRCK", + "8 LRCK", + "16 LRCK", + "32 LRCK", + "64 LRCK", + "128 LRCK", + "256 LRCK", + "512 LRCK", + "1024 LRCK", + "2048 LRCK", + "4096 LRCK", + "8192 LRCK", + "16384 LRCK", + "32768 LRCK", + "65536 LRCK", +}; + +static const struct soc_enum alc_enable = + SOC_ENUM_SINGLE(ES8389_ALC_ON, 5, 4, alc); +static const struct soc_enum alc_ramprate = + SOC_ENUM_SINGLE(ES8389_ALC_CTL, 4, 16, ramprate); +static const struct soc_enum alc_winsize = + SOC_ENUM_SINGLE(ES8389_ALC_CTL, 0, 16, winsize); + +static const char *const es8389_outl_mux_txt[] = { + "Normal", + "DAC2 channel to DAC1 channel", +}; + +static const char *const es8389_outr_mux_txt[] = { + "Normal", + "DAC1 channel to DAC2 channel", +}; + +static const char *const es8389_dmic_mux_txt[] = { + "AMIC", + "DMIC", +}; + +static const char *const es8389_pga1_texts[] = { + "DifferentialL", "Line 1P", "Line 2P" +}; + +static const char *const es8389_pga2_texts[] = { + "DifferentialR", "Line 2N", "Line 1N" +}; + +static const unsigned int es8389_pga_values[] = { + 1, 5, 6 +}; + +static const struct soc_enum es8389_outl_mux_enum = + SOC_ENUM_SINGLE(ES8389_DAC_MIX, 5, + ARRAY_SIZE(es8389_outl_mux_txt), es8389_outl_mux_txt); + +static const struct snd_kcontrol_new es8389_outl_mux_controls = + SOC_DAPM_ENUM("OUTL MUX", es8389_outl_mux_enum); + +static const struct soc_enum es8389_outr_mux_enum = + SOC_ENUM_SINGLE(ES8389_DAC_MIX, 4, + ARRAY_SIZE(es8389_outr_mux_txt), es8389_outr_mux_txt); + +static const struct snd_kcontrol_new es8389_outr_mux_controls = + SOC_DAPM_ENUM("OUTR MUX", es8389_outr_mux_enum); + +static SOC_ENUM_SINGLE_DECL( + es8389_dmic_mux_enum, ES8389_DMIC_EN, 6, es8389_dmic_mux_txt); + +static const struct soc_enum es8389_pgal_enum = + SOC_VALUE_ENUM_SINGLE(ES8389_MIC1_GAIN, 4, 7, + ARRAY_SIZE(es8389_pga1_texts), es8389_pga1_texts, + es8389_pga_values); + +static const struct soc_enum es8389_pgar_enum = + SOC_VALUE_ENUM_SINGLE(ES8389_MIC2_GAIN, 4, 7, + ARRAY_SIZE(es8389_pga2_texts), es8389_pga2_texts, + es8389_pga_values); + +static const struct snd_kcontrol_new es8389_dmic_mux_controls = + SOC_DAPM_ENUM_EXT("ADC MUX", es8389_dmic_mux_enum, + snd_soc_dapm_get_enum_double, es8389_dmic_set); + +static const struct snd_kcontrol_new es8389_left_mixer_controls[] = { + SOC_DAPM_SINGLE("DACR DACL Mixer", ES8389_DAC_MIX, 3, 1, 0), +}; + +static const struct snd_kcontrol_new es8389_right_mixer_controls[] = { + SOC_DAPM_SINGLE("DACL DACR Mixer", ES8389_DAC_MIX, 2, 1, 0), +}; + +static const struct snd_kcontrol_new es8389_leftadc_mixer_controls[] = { + SOC_DAPM_SINGLE("ADCL DACL Mixer", ES8389_DAC_MIX, 1, 1, 0), +}; + +static const struct snd_kcontrol_new es8389_rightadc_mixer_controls[] = { + SOC_DAPM_SINGLE("ADCR DACR Mixer", ES8389_DAC_MIX, 0, 1, 0), +}; + +static const struct snd_kcontrol_new es8389_adc_mixer_controls[] = { + SOC_DAPM_SINGLE("DACL ADCL Mixer", ES8389_ADC_RESET, 7, 1, 0), + SOC_DAPM_SINGLE("DACR ADCR Mixer", ES8389_ADC_RESET, 6, 1, 0), +}; + +static const struct snd_kcontrol_new es8389_snd_controls[] = { + SOC_SINGLE_TLV("ADCL Capture Volume", ES8389_ADCL_VOL, 0, 0xFF, 0, adc_vol_tlv), + SOC_SINGLE_TLV("ADCR Capture Volume", ES8389_ADCR_VOL, 0, 0xFF, 0, adc_vol_tlv), + SOC_SINGLE_TLV("ADCL PGA Volume", ES8389_MIC1_GAIN, 0, 0x0E, 0, pga_vol_tlv), + SOC_SINGLE_TLV("ADCR PGA Volume", ES8389_MIC2_GAIN, 0, 0x0E, 0, pga_vol_tlv), + + SOC_ENUM("PGAL Select", es8389_pgal_enum), + SOC_ENUM("PGAR Select", es8389_pgar_enum), + SOC_ENUM("ALC Capture Switch", alc_enable), + SOC_SINGLE_TLV("ALC Capture Target Level", ES8389_ALC_TARGET, + 0, 0x0f, 0, alc_target_tlv), + SOC_SINGLE_TLV("ALC Capture Max Gain", ES8389_ALC_GAIN, + 0, 0x0f, 0, alc_max_level), + SOC_ENUM("ADC Ramp Rate", alc_ramprate), + SOC_ENUM("ALC Capture Winsize", alc_winsize), + SOC_DOUBLE("ADC OSR Volume ON Switch", ES8389_ADC_MUTE, 6, 7, 1, 0), + SOC_SINGLE_TLV("ADC OSR Volume", ES8389_OSR_VOL, 0, 0xFF, 0, adc_vol_tlv), + SOC_DOUBLE("ADC OUTPUT Invert Switch", ES8389_ADC_HPF2, 5, 6, 1, 0), + + SOC_SINGLE_TLV("DACL Playback Volume", ES8389_DACL_VOL, 0, 0xFF, 0, dac_vol_tlv), + SOC_SINGLE_TLV("DACR Playback Volume", ES8389_DACR_VOL, 0, 0xFF, 0, dac_vol_tlv), + SOC_DOUBLE("DAC OUTPUT Invert Switch", ES8389_DAC_INV, 5, 6, 1, 0), + SOC_SINGLE_TLV("ADC2DAC Mixer Volume", ES8389_MIX_VOL, 0, 0x7F, 0, mix_vol_tlv), +}; + +static const struct snd_soc_dapm_widget es8389_dapm_widgets[] = { + /*Input Side*/ + SND_SOC_DAPM_INPUT("INPUT1"), + SND_SOC_DAPM_INPUT("INPUT2"), + SND_SOC_DAPM_INPUT("DMIC"), + SND_SOC_DAPM_PGA("PGAL", SND_SOC_NOPM, 4, 0, NULL, 0), + SND_SOC_DAPM_PGA("PGAR", SND_SOC_NOPM, 4, 0, NULL, 0), + + /*ADCs*/ + SND_SOC_DAPM_ADC("ADCL", NULL, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_ADC("ADCR", NULL, SND_SOC_NOPM, 0, 0), + + /* Audio Interface */ + SND_SOC_DAPM_AIF_OUT("I2S OUT", "I2S Capture", 0, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_IN("I2S IN", "I2S Playback", 0, SND_SOC_NOPM, 0, 0), + + /*DACs*/ + SND_SOC_DAPM_DAC("DACL", NULL, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_DAC("DACR", NULL, SND_SOC_NOPM, 0, 0), + + /*Output Side*/ + SND_SOC_DAPM_OUTPUT("HPOL"), + SND_SOC_DAPM_OUTPUT("HPOR"), + + /* Digital Interface */ + SND_SOC_DAPM_PGA("IF DAC", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACL1", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACR1", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACL2", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACR2", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACL3", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DACR3", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* Digital Interface Select */ + SND_SOC_DAPM_MIXER("IF DACL Mixer", SND_SOC_NOPM, 0, 0, + &es8389_left_mixer_controls[0], + ARRAY_SIZE(es8389_left_mixer_controls)), + SND_SOC_DAPM_MIXER("IF DACR Mixer", SND_SOC_NOPM, 0, 0, + &es8389_right_mixer_controls[0], + ARRAY_SIZE(es8389_right_mixer_controls)), + SND_SOC_DAPM_MIXER("IF ADCDACL Mixer", SND_SOC_NOPM, 0, 0, + &es8389_leftadc_mixer_controls[0], + ARRAY_SIZE(es8389_leftadc_mixer_controls)), + SND_SOC_DAPM_MIXER("IF ADCDACR Mixer", SND_SOC_NOPM, 0, 0, + &es8389_rightadc_mixer_controls[0], + ARRAY_SIZE(es8389_rightadc_mixer_controls)), + + SND_SOC_DAPM_MIXER("ADC Mixer", SND_SOC_NOPM, 0, 0, + &es8389_adc_mixer_controls[0], + ARRAY_SIZE(es8389_adc_mixer_controls)), + SND_SOC_DAPM_MUX("ADC MUX", SND_SOC_NOPM, 0, 0, &es8389_dmic_mux_controls), + + SND_SOC_DAPM_MUX("OUTL MUX", SND_SOC_NOPM, 0, 0, &es8389_outl_mux_controls), + SND_SOC_DAPM_MUX("OUTR MUX", SND_SOC_NOPM, 0, 0, &es8389_outr_mux_controls), +}; + + +static const struct snd_soc_dapm_route es8389_dapm_routes[] = { + {"PGAL", NULL, "INPUT1"}, + {"PGAR", NULL, "INPUT2"}, + + {"ADCL", NULL, "PGAL"}, + {"ADCR", NULL, "PGAR"}, + + {"ADC Mixer", "DACL ADCL Mixer", "DACL"}, + {"ADC Mixer", "DACR ADCR Mixer", "DACR"}, + {"ADC Mixer", NULL, "ADCL"}, + {"ADC Mixer", NULL, "ADCR"}, + + {"ADC MUX", "AMIC", "ADC Mixer"}, + {"ADC MUX", "DMIC", "DMIC"}, + + {"I2S OUT", NULL, "ADC MUX"}, + + {"DACL", NULL, "I2S IN"}, + {"DACR", NULL, "I2S IN"}, + + {"IF DACL1", NULL, "DACL"}, + {"IF DACR1", NULL, "DACR"}, + {"IF DACL2", NULL, "DACL"}, + {"IF DACR2", NULL, "DACR"}, + {"IF DACL3", NULL, "DACL"}, + {"IF DACR3", NULL, "DACR"}, + + {"IF DACL Mixer", NULL, "IF DACL2"}, + {"IF DACL Mixer", "DACR DACL Mixer", "IF DACR1"}, + {"IF DACR Mixer", NULL, "IF DACR2"}, + {"IF DACR Mixer", "DACL DACR Mixer", "IF DACL1"}, + + {"IF ADCDACL Mixer", NULL, "IF DACL Mixer"}, + {"IF ADCDACL Mixer", "ADCL DACL Mixer", "IF DACL3"}, + {"IF ADCDACR Mixer", NULL, "IF DACR Mixer"}, + {"IF ADCDACR Mixer", "ADCR DACR Mixer", "IF DACR3"}, + + {"OUTL MUX", "Normal", "IF ADCDACL Mixer"}, + {"OUTL MUX", "DAC2 channel to DAC1 channel", "IF ADCDACR Mixer"}, + {"OUTR MUX", "Normal", "IF ADCDACR Mixer"}, + {"OUTR MUX", "DAC1 channel to DAC2 channel", "IF ADCDACL Mixer"}, + + {"HPOL", NULL, "OUTL MUX"}, + {"HPOR", NULL, "OUTR MUX"}, + +}; + +struct _coeff_div { + u16 fs; + u32 mclk; + u32 rate; + u8 Reg0x04; + u8 Reg0x05; + u8 Reg0x06; + u8 Reg0x07; + u8 Reg0x08; + u8 Reg0x09; + u8 Reg0x0A; + u8 Reg0x0F; + u8 Reg0x11; + u8 Reg0x21; + u8 Reg0x22; + u8 Reg0x26; + u8 Reg0x30; + u8 Reg0x41; + u8 Reg0x42; + u8 Reg0x43; + u8 Reg0xF0; + u8 Reg0xF1; + u8 Reg0x16; + u8 Reg0x18; + u8 Reg0x19; +}; + +/* codec hifi mclk clock divider coefficients */ +static const struct _coeff_div coeff_div[] = { + {32, 256000, 8000, 0x00, 0x57, 0x84, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {36, 288000, 8000, 0x00, 0x55, 0x84, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {48, 384000, 8000, 0x02, 0x5F, 0x04, 0xC0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {64, 512000, 8000, 0x00, 0x4D, 0x24, 0xC0, 0x03, 0xD1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {72, 576000, 8000, 0x00, 0x45, 0x24, 0xC0, 0x01, 0xD1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {96, 768000, 8000, 0x02, 0x57, 0x84, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {128, 1024000, 8000, 0x00, 0x45, 0x04, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {192, 1536000, 8000, 0x02, 0x4D, 0x24, 0xC0, 0x03, 0xD1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {256, 2048000, 8000, 0x01, 0x45, 0x04, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {288, 2304000, 8000, 0x01, 0x51, 0x00, 0xC0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {384, 3072000, 8000, 0x02, 0x45, 0x04, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {512, 4096000, 8000, 0x00, 0x41, 0x04, 0xE0, 0x00, 0xD1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {768, 6144000, 8000, 0x05, 0x45, 0x04, 0xD0, 0x03, 0xC1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {1024, 8192000, 8000, 0x01, 0x41, 0x06, 0xE0, 0x00, 0xD1, 0xB0, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {1536, 12288000, 8000, 0x02, 0x41, 0x04, 0xE0, 0x00, 0xD1, 0xB0, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {1625, 13000000, 8000, 0x40, 0x6E, 0x05, 0xC8, 0x01, 0xC2, 0x90, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x09, 0x19, 0x07}, + {2048, 16384000, 8000, 0x03, 0x44, 0x01, 0xC0, 0x00, 0xD2, 0x80, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {2304, 18432000, 8000, 0x11, 0x45, 0x25, 0xF0, 0x00, 0xD1, 0xB0, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {3072, 24576000, 8000, 0x05, 0x44, 0x01, 0xC0, 0x00, 0xD2, 0x80, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {32, 512000, 16000, 0x00, 0x55, 0x84, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {36, 576000, 16000, 0x00, 0x55, 0x84, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {48, 768000, 16000, 0x02, 0x57, 0x04, 0xC0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {50, 800000, 16000, 0x00, 0x7E, 0x01, 0xD9, 0x00, 0xC2, 0x80, 0x00, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {64, 1024000, 16000, 0x00, 0x45, 0x24, 0xC0, 0x01, 0xD1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {72, 1152000, 16000, 0x00, 0x45, 0x24, 0xC0, 0x01, 0xD1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {96, 1536000, 16000, 0x02, 0x55, 0x84, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {128, 2048000, 16000, 0x00, 0x51, 0x04, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {144, 2304000, 16000, 0x00, 0x51, 0x00, 0xC0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x23, 0x8F, 0xB7, 0xC0, 0x1F, 0x8F, 0x01, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {192, 3072000, 16000, 0x02, 0x65, 0x25, 0xE0, 0x00, 0xE1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {256, 4096000, 16000, 0x00, 0x41, 0x04, 0xC0, 0x01, 0xD1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {300, 4800000, 16000, 0x02, 0x66, 0x01, 0xD9, 0x00, 0xC2, 0x80, 0x00, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {384, 6144000, 16000, 0x02, 0x51, 0x04, 0xD0, 0x01, 0xC1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {512, 8192000, 16000, 0x01, 0x41, 0x04, 0xC0, 0x01, 0xD1, 0x90, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {750, 12000000, 16000, 0x0E, 0x7E, 0x01, 0xC9, 0x00, 0xC2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {768, 12288000, 16000, 0x02, 0x41, 0x04, 0xC0, 0x01, 0xD1, 0x90, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1024, 16384000, 16000, 0x03, 0x41, 0x04, 0xC0, 0x01, 0xD1, 0x90, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1152, 18432000, 16000, 0x08, 0x51, 0x04, 0xD0, 0x01, 0xC1, 0x90, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1200, 19200000, 16000, 0x0B, 0x66, 0x01, 0xD9, 0x00, 0xC2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1500, 24000000, 16000, 0x0E, 0x26, 0x01, 0xD9, 0x00, 0xC2, 0x80, 0xC0, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1536, 24576000, 16000, 0x05, 0x41, 0x04, 0xC0, 0x01, 0xD1, 0x90, 0xC0, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0xFF, 0x7F, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {1625, 26000000, 16000, 0x40, 0x6E, 0x05, 0xC8, 0x01, 0xC2, 0x90, 0xC0, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x12, 0x31, 0x0E}, + {800, 19200000, 24000, 0x07, 0x66, 0x01, 0xD9, 0x00, 0xC2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0xC7, 0x95, 0x00, 0x12, 0x00, 0x1A, 0x49, 0x14}, + {600, 19200000, 32000, 0x05, 0x46, 0x01, 0xD8, 0x10, 0xD2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x23, 0x61, 0x1B}, + {32, 1411200, 44100, 0x00, 0x45, 0xA4, 0xD0, 0x10, 0xD1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {64, 2822400, 44100, 0x00, 0x51, 0x00, 0xC0, 0x10, 0xC1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {128, 5644800, 44100, 0x00, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {256, 11289600, 44100, 0x01, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {512, 22579200, 44100, 0x03, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0xC0, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {32, 1536000, 48000, 0x00, 0x45, 0xA4, 0xD0, 0x10, 0xD1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {48, 2304000, 48000, 0x02, 0x55, 0x04, 0xC0, 0x10, 0xC1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {50, 2400000, 48000, 0x00, 0x76, 0x01, 0xC8, 0x10, 0xC2, 0x80, 0x00, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {64, 3072000, 48000, 0x00, 0x51, 0x04, 0xC0, 0x10, 0xC1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {100, 4800000, 48000, 0x00, 0x46, 0x01, 0xD8, 0x10, 0xD2, 0x80, 0x00, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {125, 6000000, 48000, 0x04, 0x6E, 0x05, 0xC8, 0x10, 0xC2, 0x80, 0x00, 0x01, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {128, 6144000, 48000, 0x00, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0x00, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {200, 9600000, 48000, 0x01, 0x46, 0x01, 0xD8, 0x10, 0xD2, 0x80, 0x00, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {250, 12000000, 48000, 0x04, 0x76, 0x01, 0xC8, 0x10, 0xC2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {256, 12288000, 48000, 0x01, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {384, 18432000, 48000, 0x02, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0x40, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {400, 19200000, 48000, 0x03, 0x46, 0x01, 0xD8, 0x10, 0xD2, 0x80, 0x40, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {500, 24000000, 48000, 0x04, 0x46, 0x01, 0xD8, 0x10, 0xD2, 0x80, 0xC0, 0x00, 0x18, 0x95, 0xD0, 0xC0, 0x63, 0x95, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {512, 24576000, 48000, 0x03, 0x41, 0x04, 0xD0, 0x10, 0xD1, 0x80, 0xC0, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {800, 38400000, 48000, 0x18, 0x45, 0x04, 0xC0, 0x10, 0xC1, 0x80, 0xC0, 0x00, 0x1F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x00, 0x12, 0x00, 0x35, 0x91, 0x28}, + {128, 11289600, 88200, 0x00, 0x50, 0x00, 0xC0, 0x10, 0xC1, 0x80, 0x40, 0x00, 0x9F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x80, 0x12, 0xC0, 0x32, 0x89, 0x25}, + {64, 6144000, 96000, 0x00, 0x41, 0x00, 0xD0, 0x10, 0xD1, 0x80, 0x00, 0x00, 0x9F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x80, 0x12, 0xC0, 0x35, 0x91, 0x28}, + {128, 12288000, 96000, 0x00, 0x50, 0x00, 0xC0, 0x10, 0xC1, 0x80, 0xC0, 0x00, 0x9F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x80, 0x12, 0xC0, 0x35, 0x91, 0x28}, + {256, 24576000, 96000, 0x00, 0x40, 0x00, 0xC0, 0x10, 0xC1, 0x80, 0xC0, 0x00, 0x9F, 0x7F, 0xBF, 0xC0, 0x7F, 0x7F, 0x80, 0x12, 0xC0, 0x35, 0x91, 0x28}, + {128, 24576000, 192000, 0x00, 0x50, 0x00, 0xC0, 0x18, 0xC1, 0x81, 0xC0, 0x00, 0x8F, 0x7F, 0xEF, 0xC0, 0x3F, 0x7F, 0x80, 0x12, 0xC0, 0x3F, 0xF9, 0x3F}, + + {50, 400000, 8000, 0x00, 0x75, 0x05, 0xC8, 0x01, 0xC1, 0x90, 0x10, 0x00, 0x18, 0xC7, 0xD0, 0xC0, 0x8F, 0xC7, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {600, 4800000, 8000, 0x05, 0x65, 0x25, 0xF9, 0x00, 0xD1, 0x90, 0x10, 0x00, 0x18, 0xC7, 0xD0, 0xC0, 0x8F, 0xC7, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {1500, 12000000, 8000, 0x0E, 0x25, 0x25, 0xE8, 0x00, 0xD1, 0x90, 0x40, 0x00, 0x31, 0xC7, 0xC5, 0x00, 0x8F, 0xC7, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {2400, 19200000, 8000, 0x0B, 0x01, 0x00, 0xD0, 0x00, 0xD1, 0x80, 0x90, 0x00, 0x31, 0xC7, 0xC5, 0x00, 0xC7, 0xC7, 0x00, 0x12, 0x00, 0x09, 0x19, 0x07}, + {3000, 24000000, 8000, 0x0E, 0x24, 0x05, 0xD0, 0x00, 0xC2, 0x80, 0xC0, 0x00, 0x31, 0xC7, 0xC5, 0x00, 0x8F, 0xC7, 0x01, 0x12, 0x00, 0x09, 0x19, 0x07}, + {3250, 26000000, 8000, 0x40, 0x05, 0xA4, 0xC0, 0x00, 0xD1, 0x80, 0xD0, 0x00, 0x31, 0xC7, 0xC5, 0x00, 0xC7, 0xC7, 0x00, 0x12, 0x00, 0x09, 0x19, 0x07}, +}; + +static inline int get_coeff(int mclk, int rate) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(coeff_div); i++) { + if (coeff_div[i].rate == rate && coeff_div[i].mclk == mclk) + return i; + } + return -EINVAL; +} + +/* + * if PLL not be used, use internal clk1 for mclk,otherwise, use internal clk2 for PLL source. + */ +static int es8389_set_dai_sysclk(struct snd_soc_dai *dai, + int clk_id, unsigned int freq, int dir) +{ + struct snd_soc_component *component = dai->component; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + es8389->sysclk = freq; + + return 0; +} + +static int es8389_set_tdm_slot(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, int slots, int slot_width) +{ + struct snd_soc_component *component = dai->component; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + regmap_update_bits(es8389->regmap, ES8389_PTDM_SLOT, + ES8389_TDM_SLOT, (slots << ES8389_TDM_SHIFT)); + regmap_update_bits(es8389->regmap, ES8389_DAC_RAMP, + ES8389_TDM_SLOT, (slots << ES8389_TDM_SHIFT)); + + return 0; +} + +static int es8389_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) +{ + struct snd_soc_component *component = dai->component; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + u8 state = 0; + + switch (fmt & SND_SOC_DAIFMT_CLOCK_PROVIDER_MASK) { + case SND_SOC_DAIFMT_CBC_CFP: + regmap_update_bits(es8389->regmap, ES8389_MASTER_MODE, + ES8389_MASTER_MODE_EN, ES8389_MASTER_MODE_EN); + es8389->mastermode = 1; + break; + case SND_SOC_DAIFMT_CBC_CFC: + es8389->mastermode = 0; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + state |= ES8389_DAIFMT_I2S; + break; + case SND_SOC_DAIFMT_RIGHT_J: + dev_err(component->dev, "component driver does not support right justified\n"); + return -EINVAL; + case SND_SOC_DAIFMT_LEFT_J: + state |= ES8389_DAIFMT_LEFT_J; + break; + case SND_SOC_DAIFMT_DSP_A: + state |= ES8389_DAIFMT_DSP_A; + break; + case SND_SOC_DAIFMT_DSP_B: + state |= ES8389_DAIFMT_DSP_B; + break; + default: + break; + } + regmap_update_bits(es8389->regmap, ES8389_ADC_FORMAT_MUTE, ES8389_DAIFMT_MASK, state); + regmap_update_bits(es8389->regmap, ES8389_DAC_FORMAT_MUTE, ES8389_DAIFMT_MASK, state); + + return 0; +} + +static int es8389_pcm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + int coeff; + u8 state = 0; + + switch (params_format(params)) { + case SNDRV_PCM_FORMAT_S16_LE: + state |= ES8389_S16_LE; + break; + case SNDRV_PCM_FORMAT_S20_3LE: + state |= ES8389_S20_3_LE; + break; + case SNDRV_PCM_FORMAT_S18_3LE: + state |= ES8389_S18_LE; + break; + case SNDRV_PCM_FORMAT_S24_LE: + state |= ES8389_S24_LE; + break; + case SNDRV_PCM_FORMAT_S32_LE: + state |= ES8389_S32_LE; + break; + default: + return -EINVAL; + } + + regmap_update_bits(es8389->regmap, ES8389_ADC_FORMAT_MUTE, ES8389_DATA_LEN_MASK, state); + regmap_update_bits(es8389->regmap, ES8389_DAC_FORMAT_MUTE, ES8389_DATA_LEN_MASK, state); + + if (es8389->mclk_src == ES8389_SCLK_PIN) { + regmap_update_bits(es8389->regmap, ES8389_MASTER_CLK, + ES8389_MCLK_SOURCE, es8389->mclk_src); + es8389->sysclk = params_channels(params) * params_width(params) * params_rate(params); + } + + coeff = get_coeff(es8389->sysclk, params_rate(params)); + if (coeff >= 0) { + regmap_write(es8389->regmap, ES8389_CLK_DIV1, coeff_div[coeff].Reg0x04); + regmap_write(es8389->regmap, ES8389_CLK_MUL, coeff_div[coeff].Reg0x05); + regmap_write(es8389->regmap, ES8389_CLK_MUX1, coeff_div[coeff].Reg0x06); + regmap_write(es8389->regmap, ES8389_CLK_MUX2, coeff_div[coeff].Reg0x07); + regmap_write(es8389->regmap, ES8389_CLK_CTL1, coeff_div[coeff].Reg0x08); + regmap_write(es8389->regmap, ES8389_CLK_CTL2, coeff_div[coeff].Reg0x09); + regmap_write(es8389->regmap, ES8389_CLK_CTL3, coeff_div[coeff].Reg0x0A); + regmap_update_bits(es8389->regmap, ES8389_OSC_CLK, + 0xC0, coeff_div[coeff].Reg0x0F); + regmap_write(es8389->regmap, ES8389_CLK_DIV2, coeff_div[coeff].Reg0x11); + regmap_write(es8389->regmap, ES8389_ADC_OSR, coeff_div[coeff].Reg0x21); + regmap_write(es8389->regmap, ES8389_ADC_DSP, coeff_div[coeff].Reg0x22); + regmap_write(es8389->regmap, ES8389_OSR_VOL, coeff_div[coeff].Reg0x26); + regmap_update_bits(es8389->regmap, ES8389_SYSTEM30, + 0xC0, coeff_div[coeff].Reg0x30); + regmap_write(es8389->regmap, ES8389_DAC_DSM_OSR, coeff_div[coeff].Reg0x41); + regmap_write(es8389->regmap, ES8389_DAC_DSP_OSR, coeff_div[coeff].Reg0x42); + regmap_update_bits(es8389->regmap, ES8389_DAC_MISC, + 0x81, coeff_div[coeff].Reg0x43); + regmap_update_bits(es8389->regmap, ES8389_CHIP_MISC, + 0x72, coeff_div[coeff].Reg0xF0); + regmap_write(es8389->regmap, ES8389_CSM_STATE1, coeff_div[coeff].Reg0xF1); + regmap_write(es8389->regmap, ES8389_SYSTEM16, coeff_div[coeff].Reg0x16); + regmap_write(es8389->regmap, ES8389_SYSTEM18, coeff_div[coeff].Reg0x18); + regmap_write(es8389->regmap, ES8389_SYSTEM19, coeff_div[coeff].Reg0x19); + } else { + dev_warn(component->dev, "Clock coefficients do not match"); + } + + return 0; +} + +static int es8389_set_bias_level(struct snd_soc_component *component, + enum snd_soc_bias_level level) +{ + int ret; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + switch (level) { + case SND_SOC_BIAS_ON: + ret = clk_prepare_enable(es8389->mclk); + if (ret) + return ret; + + regmap_update_bits(es8389->regmap, ES8389_HPSW, 0x20, 0x20); + regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0xD9); + regmap_write(es8389->regmap, ES8389_ADC_EN, 0x8F); + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0xE4); + regmap_write(es8389->regmap, ES8389_RESET, 0x01); + regmap_write(es8389->regmap, ES8389_CLK_OFF1, 0xC3); + regmap_update_bits(es8389->regmap, ES8389_ADC_HPF1, 0x0f, 0x0a); + regmap_update_bits(es8389->regmap, ES8389_ADC_HPF2, 0x0f, 0x0a); + usleep_range(70000, 72000); + regmap_write(es8389->regmap, ES8389_DAC_RESET, 0X00); + break; + case SND_SOC_BIAS_PREPARE: + break; + case SND_SOC_BIAS_STANDBY: + regmap_update_bits(es8389->regmap, ES8389_ADC_HPF1, 0x0f, 0x04); + regmap_update_bits(es8389->regmap, ES8389_ADC_HPF2, 0x0f, 0x04); + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0xD4); + usleep_range(70000, 72000); + regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0x59); + regmap_write(es8389->regmap, ES8389_ADC_EN, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_OFF1, 0x00); + regmap_write(es8389->regmap, ES8389_RESET, 0x7E); + regmap_update_bits(es8389->regmap, ES8389_DAC_INV, 0x80, 0x80); + usleep_range(8000, 8500); + regmap_update_bits(es8389->regmap, ES8389_DAC_INV, 0x80, 0x00); + + clk_disable_unprepare(es8389->mclk); + break; + case SND_SOC_BIAS_OFF: + break; + } + return 0; +} + + + +static int es8389_mute(struct snd_soc_dai *dai, int mute, int direction) +{ + struct snd_soc_component *component = dai->component; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + if (mute) { + if (direction == SNDRV_PCM_STREAM_PLAYBACK) { + regmap_update_bits(es8389->regmap, ES8389_DAC_FORMAT_MUTE, + 0x03, 0x03); + } else { + regmap_update_bits(es8389->regmap, ES8389_ADC_FORMAT_MUTE, + 0x03, 0x03); + } + } else { + if (direction == SNDRV_PCM_STREAM_PLAYBACK) { + regmap_update_bits(es8389->regmap, ES8389_DAC_FORMAT_MUTE, + 0x03, 0x00); + } else { + regmap_update_bits(es8389->regmap, ES8389_ADC_FORMAT_MUTE, + 0x03, 0x00); + } + } + + return 0; +} + +#define es8389_RATES SNDRV_PCM_RATE_8000_96000 + +#define es8389_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE |\ + SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S32_LE) + +static const struct snd_soc_dai_ops es8389_ops = { + .hw_params = es8389_pcm_hw_params, + .set_fmt = es8389_set_dai_fmt, + .set_sysclk = es8389_set_dai_sysclk, + .set_tdm_slot = es8389_set_tdm_slot, + .mute_stream = es8389_mute, +}; + +static struct snd_soc_dai_driver es8389_dai = { + .name = "ES8389 HiFi", + .playback = { + .stream_name = "Playback", + .channels_min = 1, + .channels_max = 2, + .rates = es8389_RATES, + .formats = es8389_FORMATS, + }, + .capture = { + .stream_name = "Capture", + .channels_min = 1, + .channels_max = 2, + .rates = es8389_RATES, + .formats = es8389_FORMATS, + }, + .ops = &es8389_ops, + .symmetric_rate = 1, +}; + +static void es8389_init(struct snd_soc_component *component) +{ + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + regmap_write(es8389->regmap, ES8389_ISO_CTL, 0x00); + regmap_write(es8389->regmap, ES8389_RESET, 0x7E); + regmap_write(es8389->regmap, ES8389_ISO_CTL, 0x38); + regmap_write(es8389->regmap, ES8389_ADC_HPF1, 0x64); + regmap_write(es8389->regmap, ES8389_ADC_HPF2, 0x04); + regmap_write(es8389->regmap, ES8389_DAC_INV, 0x03); + + regmap_write(es8389->regmap, ES8389_VMID, 0x2A); + regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0xC9); + regmap_write(es8389->regmap, ES8389_ANA_VSEL, 0x4F); + regmap_write(es8389->regmap, ES8389_ANA_CTL2, 0x06); + regmap_write(es8389->regmap, ES8389_LOW_POWER1, 0x00); + regmap_write(es8389->regmap, ES8389_DMIC_EN, 0x16); + + regmap_write(es8389->regmap, ES8389_PGA_SW, 0xAA); + regmap_write(es8389->regmap, ES8389_MOD_SW1, 0x66); + regmap_write(es8389->regmap, ES8389_MOD_SW2, 0x99); + regmap_write(es8389->regmap, ES8389_ADC_MODE, (0x00 | ES8389_TDM_MODE)); + regmap_update_bits(es8389->regmap, ES8389_DMIC_EN, 0xC0, 0x00); + regmap_update_bits(es8389->regmap, ES8389_ADC_MODE, 0x03, 0x00); + + regmap_update_bits(es8389->regmap, ES8389_MIC1_GAIN, + ES8389_MIC_SEL_MASK, ES8389_MIC_DEFAULT); + regmap_update_bits(es8389->regmap, ES8389_MIC2_GAIN, + ES8389_MIC_SEL_MASK, ES8389_MIC_DEFAULT); + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0xC4); + regmap_write(es8389->regmap, ES8389_MASTER_MODE, 0x08); + regmap_write(es8389->regmap, ES8389_CSM_STATE1, 0x00); + regmap_write(es8389->regmap, ES8389_SYSTEM12, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM13, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM14, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM15, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM16, 0x35); + regmap_write(es8389->regmap, ES8389_SYSTEM17, 0x09); + regmap_write(es8389->regmap, ES8389_SYSTEM18, 0x91); + regmap_write(es8389->regmap, ES8389_SYSTEM19, 0x28); + regmap_write(es8389->regmap, ES8389_SYSTEM1A, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM1B, 0x01); + regmap_write(es8389->regmap, ES8389_SYSTEM1C, 0x11); + + regmap_write(es8389->regmap, ES8389_CHIP_MISC, 0x13); + regmap_write(es8389->regmap, ES8389_MASTER_CLK, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_DIV1, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_MUL, 0x10); + regmap_write(es8389->regmap, ES8389_CLK_MUX1, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_MUX2, 0xC0); + regmap_write(es8389->regmap, ES8389_CLK_CTL1, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_CTL2, 0xC0); + regmap_write(es8389->regmap, ES8389_CLK_CTL3, 0x80); + regmap_write(es8389->regmap, ES8389_SCLK_DIV, 0x04); + regmap_write(es8389->regmap, ES8389_LRCK_DIV1, 0x01); + regmap_write(es8389->regmap, ES8389_LRCK_DIV2, 0x00); + regmap_write(es8389->regmap, ES8389_OSC_CLK, 0x00); + regmap_write(es8389->regmap, ES8389_ADC_OSR, 0x1F); + regmap_write(es8389->regmap, ES8389_ADC_DSP, 0x7F); + regmap_write(es8389->regmap, ES8389_ADC_MUTE, 0xC0); + regmap_write(es8389->regmap, ES8389_SYSTEM30, 0xF4); + regmap_write(es8389->regmap, ES8389_DAC_DSM_OSR, 0x7F); + regmap_write(es8389->regmap, ES8389_DAC_DSP_OSR, 0x7F); + regmap_write(es8389->regmap, ES8389_DAC_MISC, 0x10); + regmap_write(es8389->regmap, ES8389_DAC_RAMP, 0x0F); + regmap_write(es8389->regmap, ES8389_SYSTEM4C, 0xC0); + regmap_write(es8389->regmap, ES8389_RESET, 0x00); + regmap_write(es8389->regmap, ES8389_CLK_OFF1, 0xC1); + regmap_write(es8389->regmap, ES8389_RESET, 0x01); + regmap_write(es8389->regmap, ES8389_DAC_RESET, 0x02); + + regmap_update_bits(es8389->regmap, ES8389_ADC_FORMAT_MUTE, 0x03, 0x03); + regmap_update_bits(es8389->regmap, ES8389_DAC_FORMAT_MUTE, 0x03, 0x03); +} + +static int es8389_suspend(struct snd_soc_component *component) +{ + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + es8389_set_bias_level(component, SND_SOC_BIAS_STANDBY); + regcache_cache_only(es8389->regmap, true); + regcache_mark_dirty(es8389->regmap); + + return 0; +} + +static int es8389_resume(struct snd_soc_component *component) +{ + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + unsigned int regv; + + regcache_cache_only(es8389->regmap, false); + regcache_cache_bypass(es8389->regmap, true); + regmap_read(es8389->regmap, ES8389_RESET, ®v); + regcache_cache_bypass(es8389->regmap, false); + + if (regv == 0xff) + es8389_init(component); + else + es8389_set_bias_level(component, SND_SOC_BIAS_ON); + + regcache_sync(es8389->regmap); + + return 0; +} + +static int es8389_probe(struct snd_soc_component *component) +{ + int ret; + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + ret = device_property_read_u8(component->dev, "everest,mclk-src", &es8389->mclk_src); + if (ret != 0) { + dev_dbg(component->dev, "mclk-src return %d", ret); + es8389->mclk_src = ES8389_MCLK_SOURCE; + } + + es8389->mclk = devm_clk_get(component->dev, "mclk"); + if (IS_ERR(es8389->mclk)) + return dev_err_probe(component->dev, PTR_ERR(es8389->mclk), + "ES8389 is unable to get mclk\n"); + + if (!es8389->mclk) + dev_err(component->dev, "%s, assuming static mclk\n", __func__); + + ret = clk_prepare_enable(es8389->mclk); + if (ret) { + dev_err(component->dev, "%s, unable to enable mclk\n", __func__); + return ret; + } + + es8389_init(component); + es8389_set_bias_level(component, SND_SOC_BIAS_STANDBY); + + return 0; +} + +static void es8389_remove(struct snd_soc_component *component) +{ + struct es8389_private *es8389 = snd_soc_component_get_drvdata(component); + + regmap_write(es8389->regmap, ES8389_MASTER_MODE, 0x28); + regmap_write(es8389->regmap, ES8389_HPSW, 0x00); + regmap_write(es8389->regmap, ES8389_VMID, 0x00); + regmap_write(es8389->regmap, ES8389_RESET, 0x00); + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0xCC); + usleep_range(500000, 550000);//500MS + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0x00); + regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0x08); + regmap_write(es8389->regmap, ES8389_ISO_CTL, 0xC1); + regmap_write(es8389->regmap, ES8389_PULL_DOWN, 0x00); + +} + +static const struct snd_soc_component_driver soc_codec_dev_es8389 = { + .probe = es8389_probe, + .remove = es8389_remove, + .suspend = es8389_suspend, + .resume = es8389_resume, + .set_bias_level = es8389_set_bias_level, + + .controls = es8389_snd_controls, + .num_controls = ARRAY_SIZE(es8389_snd_controls), + .dapm_widgets = es8389_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(es8389_dapm_widgets), + .dapm_routes = es8389_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(es8389_dapm_routes), + .idle_bias_on = 1, + .use_pmdown_time = 1, +}; + +static const struct regmap_config es8389_regmap = { + .reg_bits = 8, + .val_bits = 8, + + .max_register = ES8389_MAX_REGISTER, + + .volatile_reg = es8389_volatile_register, + .cache_type = REGCACHE_MAPLE, +}; + +static void es8389_i2c_shutdown(struct i2c_client *i2c) +{ + struct es8389_private *es8389; + + es8389 = i2c_get_clientdata(i2c); + + regmap_write(es8389->regmap, ES8389_MASTER_MODE, 0x28); + regmap_write(es8389->regmap, ES8389_HPSW, 0x00); + regmap_write(es8389->regmap, ES8389_VMID, 0x00); + regmap_write(es8389->regmap, ES8389_RESET, 0x00); + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0xCC); + usleep_range(500000, 550000);//500MS + regmap_write(es8389->regmap, ES8389_CSM_JUMP, 0x00); + regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0x08); + regmap_write(es8389->regmap, ES8389_ISO_CTL, 0xC1); + regmap_write(es8389->regmap, ES8389_PULL_DOWN, 0x00); +} + +static int es8389_i2c_probe(struct i2c_client *i2c_client) +{ + struct es8389_private *es8389; + int ret; + + es8389 = devm_kzalloc(&i2c_client->dev, sizeof(*es8389), GFP_KERNEL); + if (es8389 == NULL) + return -ENOMEM; + + i2c_set_clientdata(i2c_client, es8389); + es8389->regmap = devm_regmap_init_i2c(i2c_client, &es8389_regmap); + if (IS_ERR(es8389->regmap)) + return dev_err_probe(&i2c_client->dev, PTR_ERR(es8389->regmap), + "regmap_init() failed\n"); + + ret = devm_snd_soc_register_component(&i2c_client->dev, + &soc_codec_dev_es8389, + &es8389_dai, + 1); + + return ret; +} + +#ifdef CONFIG_OF +static const struct of_device_id es8389_if_dt_ids[] = { + { .compatible = "everest,es8389", }, + { } +}; +MODULE_DEVICE_TABLE(of, es8389_if_dt_ids); +#endif + +static const struct i2c_device_id es8389_i2c_id[] = { + {"es8389"}, + { } +}; +MODULE_DEVICE_TABLE(i2c, es8389_i2c_id); + +static struct i2c_driver es8389_i2c_driver = { + .driver = { + .name = "es8389", + .of_match_table = of_match_ptr(es8389_if_dt_ids), + }, + .shutdown = es8389_i2c_shutdown, + .probe = es8389_i2c_probe, + .id_table = es8389_i2c_id, +}; +module_i2c_driver(es8389_i2c_driver); + +MODULE_DESCRIPTION("ASoC es8389 driver"); +MODULE_AUTHOR("Michael Zhang "); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/es8389.h b/sound/soc/codecs/es8389.h new file mode 100644 index 00000000000000..123d1e4b2d5390 --- /dev/null +++ b/sound/soc/codecs/es8389.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* +* ES8389.h -- ES8389 ALSA SoC Audio Codec +* +* Authors: +* +* Based on ES8374.h by Michael Zhang +*/ + +#ifndef _ES8389_H +#define _ES8389_H + +/* +* ES8389_REGISTER NAME_REG_REGISTER ADDRESS +*/ +#define ES8389_RESET 0x00 /*reset digital,csm,clock manager etc.*/ + +/* +* Clock Scheme Register definition +*/ +#define ES8389_MASTER_MODE 0x01 +#define ES8389_MASTER_CLK 0x02 +#define ES8389_CLK_OFF1 0x03 +#define ES8389_CLK_DIV1 0x04 +#define ES8389_CLK_MUL 0x05 +#define ES8389_CLK_MUX1 0x06 +#define ES8389_CLK_MUX2 0x07 +#define ES8389_CLK_CTL1 0x08 +#define ES8389_CLK_CTL2 0x09 +#define ES8389_CLK_CTL3 0x0A +#define ES8389_SCLK_DIV 0x0B +#define ES8389_LRCK_DIV1 0x0C +#define ES8389_LRCK_DIV2 0x0D +#define ES8389_CLK_OFF2 0x0E +#define ES8389_OSC_CLK 0x0F +#define ES8389_CSM_JUMP 0x10 +#define ES8389_CLK_DIV2 0x11 +#define ES8389_SYSTEM12 0x12 +#define ES8389_SYSTEM13 0x13 +#define ES8389_SYSTEM14 0x14 +#define ES8389_SYSTEM15 0x15 +#define ES8389_SYSTEM16 0x16 +#define ES8389_SYSTEM17 0x17 +#define ES8389_SYSTEM18 0x18 +#define ES8389_SYSTEM19 0x19 +#define ES8389_SYSTEM1A 0x1A +#define ES8389_SYSTEM1B 0x1B +#define ES8389_SYSTEM1C 0x1C +#define ES8389_ADC_FORMAT_MUTE 0x20 +#define ES8389_ADC_OSR 0x21 +#define ES8389_ADC_DSP 0x22 +#define ES8389_ADC_MODE 0x23 +#define ES8389_ADC_HPF1 0x24 +#define ES8389_ADC_HPF2 0x25 +#define ES8389_OSR_VOL 0x26 +#define ES8389_ADCL_VOL 0x27 +#define ES8389_ADCR_VOL 0x28 +#define ES8389_ALC_CTL 0x29 +#define ES8389_PTDM_SLOT 0x2A +#define ES8389_ALC_ON 0x2B +#define ES8389_ALC_TARGET 0x2C +#define ES8389_ALC_GAIN 0x2D +#define ES8389_SYSTEM2E 0x2E +#define ES8389_ADC_MUTE 0x2F +#define ES8389_SYSTEM30 0x30 +#define ES8389_ADC_RESET 0x31 +#define ES8389_DAC_FORMAT_MUTE 0x40 +#define ES8389_DAC_DSM_OSR 0x41 +#define ES8389_DAC_DSP_OSR 0x42 +#define ES8389_DAC_MISC 0x43 +#define ES8389_DAC_MIX 0x44 +#define ES8389_DAC_INV 0x45 +#define ES8389_DACL_VOL 0x46 +#define ES8389_DACR_VOL 0x47 +#define ES8389_MIX_VOL 0x48 +#define ES8389_DAC_RAMP 0x49 +#define ES8389_SYSTEM4C 0x4C +#define ES8389_DAC_RESET 0x4D +#define ES8389_VMID 0x60 +#define ES8389_ANA_CTL1 0x61 +#define ES8389_ANA_VSEL 0x62 +#define ES8389_ANA_CTL2 0x63 +#define ES8389_ADC_EN 0x64 +#define ES8389_HPSW 0x69 +#define ES8389_LOW_POWER1 0x6B +#define ES8389_LOW_POWER2 0x6C +#define ES8389_DMIC_EN 0x6D +#define ES8389_PGA_SW 0x6E +#define ES8389_MOD_SW1 0x6F +#define ES8389_MOD_SW2 0x70 +#define ES8389_MOD_SW3 0x71 +#define ES8389_MIC1_GAIN 0x72 +#define ES8389_MIC2_GAIN 0x73 + +#define ES8389_CHIP_MISC 0xF0 +#define ES8389_CSM_STATE1 0xF1 +#define ES8389_PULL_DOWN 0xF2 +#define ES8389_ISO_CTL 0xF3 +#define ES8389_CSM_STATE2 0xF4 + +#define ES8389_CHIP_ID0 0xFD +#define ES8389_CHIP_ID1 0xFE + +#define ES8389_MAX_REGISTER 0xFF + +#define ES8389_MIC_SEL_MASK (7 << 4) +#define ES8389_MIC_DEFAULT (1 << 4) + +#define ES8389_MASTER_MODE_EN (1 << 0) + +#define ES8389_TDM_OFF (0 << 0) +#define ES8389_STDM_ON (1 << 7) +#define ES8389_PTDM_ON (1 << 6) + +#define ES8389_TDM_MODE ES8389_TDM_OFF +#define ES8389_TDM_SLOT (0x70 << 0) +#define ES8389_TDM_SHIFT 4 + +#define ES8389_MCLK_SOURCE (1 << 6) +#define ES8389_MCLK_PIN (1 << 6) +#define ES8389_SCLK_PIN (0 << 6) + +/* ES8389_FMT */ +#define ES8389_S24_LE (0 << 5) +#define ES8389_S20_3_LE (1 << 5) +#define ES8389_S18_LE (2 << 5) +#define ES8389_S16_LE (3 << 5) +#define ES8389_S32_LE (4 << 5) +#define ES8389_DATA_LEN_MASK (7 << 5) + +#define ES8389_DAIFMT_MASK (7 << 2) +#define ES8389_DAIFMT_I2S 0 +#define ES8389_DAIFMT_LEFT_J (1 << 2) +#define ES8389_DAIFMT_DSP_A (1 << 3) +#define ES8389_DAIFMT_DSP_B (3 << 3) + +#define ES8389_STATE_ON (13 << 0) +#define ES8389_STATE_STANDBY (7 << 0) + +#endif From 900ed3447978e911b8780268ba85f518a435a2ad Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 14 May 2025 17:45:46 +0800 Subject: [PATCH 008/353] ASoC: dt-bindings: Add Everest ES8389 audio CODEC Add device tree binding documentation for Everest ES8389 which is different from ES8388 Signed-off-by: Zhang Yi Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250514094546.35508-3-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- .../bindings/sound/everest,es8389.yaml | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/everest,es8389.yaml diff --git a/Documentation/devicetree/bindings/sound/everest,es8389.yaml b/Documentation/devicetree/bindings/sound/everest,es8389.yaml new file mode 100644 index 00000000000000..a673df485ab30f --- /dev/null +++ b/Documentation/devicetree/bindings/sound/everest,es8389.yaml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/everest,es8389.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Everest ES8389 audio CODEC + +maintainers: + - Michael Zhang + +allOf: + - $ref: dai-common.yaml# + +properties: + compatible: + const: everest,es8389 + + reg: + maxItems: 1 + + clocks: + items: + - description: clock for master clock (MCLK) + + clock-names: + items: + - const: mclk + + "#sound-dai-cells": + const: 0 + +required: + - compatible + - reg + - "#sound-dai-cells" + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + es8389: codec@10 { + compatible = "everest,es8389"; + reg = <0x10>; + #sound-dai-cells = <0>; + }; + }; From 3ea62b1678c00716af11a5c4aaac6e4e1cd7ac3e Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Tue, 13 May 2025 13:39:23 +0000 Subject: [PATCH 009/353] ASoC: cs35l56: Log tuning unique identifiers during firmware load The cs35l56 smart amplifier has some informational firmware controls that are populated by a tuning bin file to unique values - logging these during firmware load identifies the specific configuration being used on that device instance. Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/47762a5f1ce2b178ad863c6698296aea09b72e10.1747142267.git.simont@opensource.cirrus.com Acked-by: Takashi Iwai Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 1 + sound/soc/codecs/cs35l56-shared.c | 28 ++++++++++++++++++++++++++++ sound/soc/codecs/cs35l56.c | 1 + 3 files changed, 30 insertions(+) diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 63f2c63f7c59c6..e17c4cadd04d1a 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -360,6 +360,7 @@ void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_ds int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base); int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base, bool *fw_missing, unsigned int *fw_version); +void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); int cs35l56_hw_init(struct cs35l56_base *cs35l56_base); int cs35l56_get_speaker_id(struct cs35l56_base *cs35l56_base); int cs35l56_get_bclk_freq_id(unsigned int freq); diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 7f768718b69bcd..d0831d60958486 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -909,6 +909,33 @@ int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base, } EXPORT_SYMBOL_NS_GPL(cs35l56_read_prot_status, "SND_SOC_CS35L56_SHARED"); +void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp) +{ + __be32 pid, sid, tid; + int ret; + + scoped_guard(mutex, &cs_dsp->pwr_lock) { + ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_PRJCT_ID", + WMFW_ADSP2_XM, 0x9f212), + 0, &pid, sizeof(pid)); + if (!ret) + ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_CHNNL_ID", + WMFW_ADSP2_XM, 0x9f212), + 0, &sid, sizeof(sid)); + if (!ret) + ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_SNPSHT_ID", + WMFW_ADSP2_XM, 0x9f212), + 0, &tid, sizeof(tid)); + } + + if (ret) + dev_warn(cs35l56_base->dev, "Can't read tuning IDs"); + else + dev_info(cs35l56_base->dev, "Tuning PID: %#x, SID: %#x, TID: %#x\n", + be32_to_cpu(pid), be32_to_cpu(sid), be32_to_cpu(tid)); +} +EXPORT_SYMBOL_NS_GPL(cs35l56_log_tuning, "SND_SOC_CS35L56_SHARED"); + int cs35l56_hw_init(struct cs35l56_base *cs35l56_base) { int ret; @@ -1249,3 +1276,4 @@ MODULE_AUTHOR("Richard Fitzgerald "); MODULE_AUTHOR("Simon Trimmer "); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("SND_SOC_CS_AMP_LIB"); +MODULE_IMPORT_NS("FW_CS_DSP"); diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index cdb283ed938cfd..c78e4746e42870 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -847,6 +847,7 @@ static void cs35l56_dsp_work(struct work_struct *work) else cs35l56_patch(cs35l56, firmware_missing); + cs35l56_log_tuning(&cs35l56->base, &cs35l56->dsp.cs_dsp); err: pm_runtime_mark_last_busy(cs35l56->base.dev); pm_runtime_put_autosuspend(cs35l56->base.dev); From 9a174191bbfc3adf6001991fd3cc5709592a8473 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Tue, 13 May 2025 13:39:24 +0000 Subject: [PATCH 010/353] ALSA: hda: cs35l56: Log tuning unique identifiers during firmware load The cs35l56 smart amplifier has some informational firmware controls that are populated by a tuning bin file to unique values - logging these during firmware load identifies the specific configuration being used on that device instance. Signed-off-by: Simon Trimmer Acked-by: Takashi Iwai Link: https://patch.msgid.link/2fcc0e6fc5b8669acb026bebe44a4995ac83b967.1747142267.git.simont@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/pci/hda/cs35l56_hda.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 8ee8cb76ecbd88..3f2fd32f4ad916 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -675,6 +675,8 @@ static void cs35l56_hda_fw_load(struct cs35l56_hda *cs35l56) if (ret) cs_dsp_stop(&cs35l56->cs_dsp); + cs35l56_log_tuning(&cs35l56->base, &cs35l56->cs_dsp); + err_powered_up: if (!cs35l56->base.fw_patched) cs_dsp_power_down(&cs35l56->cs_dsp); From 46b7a6dac7af1cdca47ddb1a712e548de0c49f7d Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 15 May 2025 15:38:21 +0800 Subject: [PATCH 011/353] ASoC: mediatek: mt8183-afe-pcm: Shorten memif_data table using macros The memif_data table describes all the supported PCM channels for the audio frontend. Most of the fields are either the same or can be derived from the interface's name. This results in a very long table (in source code) that can be shortened with macros. Do just that. Some "convenience" macros were added to cover non-existent register fields that would otherwise require multiple layers of macros to handle. Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20250515073825.4155297-2-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8183/mt8183-afe-pcm.c | 243 +++++---------------- 1 file changed, 54 insertions(+), 189 deletions(-) diff --git a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c index d083b4bf0f9545..a4bed8e335d897 100644 --- a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c +++ b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c @@ -424,196 +424,61 @@ static const struct snd_soc_component_driver mt8183_afe_pcm_dai_component = { .name = "mt8183-afe-pcm-dai", }; +#define MT8183_MEMIF_BASE(_id, _en_reg, _fs_reg, _mono_reg) \ + [MT8183_MEMIF_##_id] = { \ + .name = #_id, \ + .id = MT8183_MEMIF_##_id, \ + .reg_ofs_base = AFE_##_id##_BASE, \ + .reg_ofs_cur = AFE_##_id##_CUR, \ + .reg_ofs_end = AFE_##_id##_END, \ + .fs_reg = (_fs_reg), \ + .fs_shift = _id##_MODE_SFT, \ + .fs_maskbit = _id##_MODE_MASK, \ + .mono_reg = (_mono_reg), \ + .mono_shift = _id##_DATA_SFT, \ + .enable_reg = (_en_reg), \ + .enable_shift = _id##_ON_SFT, \ + .hd_reg = AFE_MEMIF_HD_MODE, \ + .hd_align_reg = AFE_MEMIF_HDALIGN, \ + .hd_shift = _id##_HD_SFT, \ + .hd_align_mshift = _id##_HD_ALIGN_SFT, \ + .agent_disable_reg = -1, \ + .agent_disable_shift = -1, \ + .msb_reg = -1, \ + .msb_shift = -1, \ + } + +#define MT8183_MEMIF(_id, _fs_reg, _mono_reg) \ + MT8183_MEMIF_BASE(_id, AFE_DAC_CON0, _fs_reg, _mono_reg) + +/* For convenience with macros: missing register fields */ +#define MOD_DAI_DATA_SFT -1 +#define HDMI_MODE_SFT -1 +#define HDMI_MODE_MASK -1 +#define HDMI_DATA_SFT -1 +#define HDMI_ON_SFT -1 + +/* For convenience with macros: register name differences */ +#define AFE_VUL12_BASE AFE_VUL_D2_BASE +#define AFE_VUL12_CUR AFE_VUL_D2_CUR +#define AFE_VUL12_END AFE_VUL_D2_END +#define AWB2_HD_ALIGN_SFT AWB2_ALIGN_SFT +#define VUL12_DATA_SFT VUL12_MONO_SFT +#define AFE_HDMI_BASE AFE_HDMI_OUT_BASE +#define AFE_HDMI_CUR AFE_HDMI_OUT_CUR +#define AFE_HDMI_END AFE_HDMI_OUT_END + static const struct mtk_base_memif_data memif_data[MT8183_MEMIF_NUM] = { - [MT8183_MEMIF_DL1] = { - .name = "DL1", - .id = MT8183_MEMIF_DL1, - .reg_ofs_base = AFE_DL1_BASE, - .reg_ofs_cur = AFE_DL1_CUR, - .fs_reg = AFE_DAC_CON1, - .fs_shift = DL1_MODE_SFT, - .fs_maskbit = DL1_MODE_MASK, - .mono_reg = AFE_DAC_CON1, - .mono_shift = DL1_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = DL1_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = DL1_HD_SFT, - .hd_align_mshift = DL1_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_DL2] = { - .name = "DL2", - .id = MT8183_MEMIF_DL2, - .reg_ofs_base = AFE_DL2_BASE, - .reg_ofs_cur = AFE_DL2_CUR, - .fs_reg = AFE_DAC_CON1, - .fs_shift = DL2_MODE_SFT, - .fs_maskbit = DL2_MODE_MASK, - .mono_reg = AFE_DAC_CON1, - .mono_shift = DL2_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = DL2_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = DL2_HD_SFT, - .hd_align_mshift = DL2_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_DL3] = { - .name = "DL3", - .id = MT8183_MEMIF_DL3, - .reg_ofs_base = AFE_DL3_BASE, - .reg_ofs_cur = AFE_DL3_CUR, - .fs_reg = AFE_DAC_CON2, - .fs_shift = DL3_MODE_SFT, - .fs_maskbit = DL3_MODE_MASK, - .mono_reg = AFE_DAC_CON1, - .mono_shift = DL3_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = DL3_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = DL3_HD_SFT, - .hd_align_mshift = DL3_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_VUL2] = { - .name = "VUL2", - .id = MT8183_MEMIF_VUL2, - .reg_ofs_base = AFE_VUL2_BASE, - .reg_ofs_cur = AFE_VUL2_CUR, - .fs_reg = AFE_DAC_CON2, - .fs_shift = VUL2_MODE_SFT, - .fs_maskbit = VUL2_MODE_MASK, - .mono_reg = AFE_DAC_CON2, - .mono_shift = VUL2_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = VUL2_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = VUL2_HD_SFT, - .hd_align_mshift = VUL2_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_AWB] = { - .name = "AWB", - .id = MT8183_MEMIF_AWB, - .reg_ofs_base = AFE_AWB_BASE, - .reg_ofs_cur = AFE_AWB_CUR, - .fs_reg = AFE_DAC_CON1, - .fs_shift = AWB_MODE_SFT, - .fs_maskbit = AWB_MODE_MASK, - .mono_reg = AFE_DAC_CON1, - .mono_shift = AWB_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = AWB_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = AWB_HD_SFT, - .hd_align_mshift = AWB_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_AWB2] = { - .name = "AWB2", - .id = MT8183_MEMIF_AWB2, - .reg_ofs_base = AFE_AWB2_BASE, - .reg_ofs_cur = AFE_AWB2_CUR, - .fs_reg = AFE_DAC_CON2, - .fs_shift = AWB2_MODE_SFT, - .fs_maskbit = AWB2_MODE_MASK, - .mono_reg = AFE_DAC_CON2, - .mono_shift = AWB2_DATA_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = AWB2_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = AWB2_HD_SFT, - .hd_align_mshift = AWB2_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_VUL12] = { - .name = "VUL12", - .id = MT8183_MEMIF_VUL12, - .reg_ofs_base = AFE_VUL_D2_BASE, - .reg_ofs_cur = AFE_VUL_D2_CUR, - .fs_reg = AFE_DAC_CON0, - .fs_shift = VUL12_MODE_SFT, - .fs_maskbit = VUL12_MODE_MASK, - .mono_reg = AFE_DAC_CON0, - .mono_shift = VUL12_MONO_SFT, - .enable_reg = AFE_DAC_CON0, - .enable_shift = VUL12_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = VUL12_HD_SFT, - .hd_align_mshift = VUL12_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_MOD_DAI] = { - .name = "MOD_DAI", - .id = MT8183_MEMIF_MOD_DAI, - .reg_ofs_base = AFE_MOD_DAI_BASE, - .reg_ofs_cur = AFE_MOD_DAI_CUR, - .fs_reg = AFE_DAC_CON1, - .fs_shift = MOD_DAI_MODE_SFT, - .fs_maskbit = MOD_DAI_MODE_MASK, - .mono_reg = -1, - .mono_shift = 0, - .enable_reg = AFE_DAC_CON0, - .enable_shift = MOD_DAI_ON_SFT, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = MOD_DAI_HD_SFT, - .hd_align_mshift = MOD_DAI_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, - [MT8183_MEMIF_HDMI] = { - .name = "HDMI", - .id = MT8183_MEMIF_HDMI, - .reg_ofs_base = AFE_HDMI_OUT_BASE, - .reg_ofs_cur = AFE_HDMI_OUT_CUR, - .fs_reg = -1, - .fs_shift = -1, - .fs_maskbit = -1, - .mono_reg = -1, - .mono_shift = -1, - .enable_reg = -1, /* control in tdm for sync start */ - .enable_shift = -1, - .hd_reg = AFE_MEMIF_HD_MODE, - .hd_align_reg = AFE_MEMIF_HDALIGN, - .hd_shift = HDMI_HD_SFT, - .hd_align_mshift = HDMI_HD_ALIGN_SFT, - .agent_disable_reg = -1, - .agent_disable_shift = -1, - .msb_reg = -1, - .msb_shift = -1, - }, + MT8183_MEMIF(DL1, AFE_DAC_CON1, AFE_DAC_CON1), + MT8183_MEMIF(DL2, AFE_DAC_CON1, AFE_DAC_CON1), + MT8183_MEMIF(DL3, AFE_DAC_CON2, AFE_DAC_CON1), + MT8183_MEMIF(VUL2, AFE_DAC_CON2, AFE_DAC_CON2), + MT8183_MEMIF(AWB, AFE_DAC_CON1, AFE_DAC_CON1), + MT8183_MEMIF(AWB2, AFE_DAC_CON2, AFE_DAC_CON2), + MT8183_MEMIF(VUL12, AFE_DAC_CON0, AFE_DAC_CON0), + MT8183_MEMIF(MOD_DAI, AFE_DAC_CON1, -1), + /* enable control in tdm for sync start */ + MT8183_MEMIF_BASE(HDMI, -1, -1, -1), }; static const struct mtk_base_irq_data irq_data[MT8183_IRQ_NUM] = { From 5461e730fab28d17722df8833bc4f3d7c0df5da2 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 15 May 2025 15:38:22 +0800 Subject: [PATCH 012/353] ASoC: mediatek: mt8183-afe-pcm: Shorten irq_data table using macros The irq_data table describes all the supported interrupts for the audio frontend. The parameters are either the same or can be derived from the interrupt number. This results in a very long table (in source code) that can be shortened with macros. Do just that. Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20250515073825.4155297-3-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8183/mt8183-afe-pcm.c | 176 ++++----------------- 1 file changed, 33 insertions(+), 143 deletions(-) diff --git a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c index a4bed8e335d897..fe315224a25974 100644 --- a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c +++ b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c @@ -481,150 +481,40 @@ static const struct mtk_base_memif_data memif_data[MT8183_MEMIF_NUM] = { MT8183_MEMIF_BASE(HDMI, -1, -1, -1), }; +#define MT8183_AFE_IRQ_BASE(_id, _fs_reg, _fs_shift, _fs_maskbit) \ + [MT8183_IRQ_##_id] = { \ + .id = MT8183_IRQ_##_id, \ + .irq_cnt_reg = AFE_IRQ_MCU_CNT##_id, \ + .irq_cnt_shift = 0, \ + .irq_cnt_maskbit = 0x3ffff, \ + .irq_fs_reg = _fs_reg, \ + .irq_fs_shift = _fs_shift, \ + .irq_fs_maskbit = _fs_maskbit, \ + .irq_en_reg = AFE_IRQ_MCU_CON0, \ + .irq_en_shift = IRQ##_id##_MCU_ON_SFT, \ + .irq_clr_reg = AFE_IRQ_MCU_CLR, \ + .irq_clr_shift = IRQ##_id##_MCU_CLR_SFT, \ + } + +#define MT8183_AFE_IRQ(_id) \ + MT8183_AFE_IRQ_BASE(_id, AFE_IRQ_MCU_CON1 + _id / 8 * 4, \ + IRQ##_id##_MCU_MODE_SFT, \ + IRQ##_id##_MCU_MODE_MASK) + +#define MT8183_AFE_IRQ_NOFS(_id) MT8183_AFE_IRQ_BASE(_id, -1, -1, -1) + static const struct mtk_base_irq_data irq_data[MT8183_IRQ_NUM] = { - [MT8183_IRQ_0] = { - .id = MT8183_IRQ_0, - .irq_cnt_reg = AFE_IRQ_MCU_CNT0, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ0_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ0_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ0_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ0_MCU_CLR_SFT, - }, - [MT8183_IRQ_1] = { - .id = MT8183_IRQ_1, - .irq_cnt_reg = AFE_IRQ_MCU_CNT1, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ1_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ1_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ1_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ1_MCU_CLR_SFT, - }, - [MT8183_IRQ_2] = { - .id = MT8183_IRQ_2, - .irq_cnt_reg = AFE_IRQ_MCU_CNT2, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ2_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ2_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ2_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ2_MCU_CLR_SFT, - }, - [MT8183_IRQ_3] = { - .id = MT8183_IRQ_3, - .irq_cnt_reg = AFE_IRQ_MCU_CNT3, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ3_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ3_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ3_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ3_MCU_CLR_SFT, - }, - [MT8183_IRQ_4] = { - .id = MT8183_IRQ_4, - .irq_cnt_reg = AFE_IRQ_MCU_CNT4, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ4_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ4_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ4_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ4_MCU_CLR_SFT, - }, - [MT8183_IRQ_5] = { - .id = MT8183_IRQ_5, - .irq_cnt_reg = AFE_IRQ_MCU_CNT5, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ5_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ5_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ5_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ5_MCU_CLR_SFT, - }, - [MT8183_IRQ_6] = { - .id = MT8183_IRQ_6, - .irq_cnt_reg = AFE_IRQ_MCU_CNT6, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ6_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ6_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ6_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ6_MCU_CLR_SFT, - }, - [MT8183_IRQ_7] = { - .id = MT8183_IRQ_7, - .irq_cnt_reg = AFE_IRQ_MCU_CNT7, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON1, - .irq_fs_shift = IRQ7_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ7_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ7_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ7_MCU_CLR_SFT, - }, - [MT8183_IRQ_8] = { - .id = MT8183_IRQ_8, - .irq_cnt_reg = AFE_IRQ_MCU_CNT8, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = -1, - .irq_fs_shift = -1, - .irq_fs_maskbit = -1, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ8_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ8_MCU_CLR_SFT, - }, - [MT8183_IRQ_11] = { - .id = MT8183_IRQ_11, - .irq_cnt_reg = AFE_IRQ_MCU_CNT11, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON2, - .irq_fs_shift = IRQ11_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ11_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ11_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ11_MCU_CLR_SFT, - }, - [MT8183_IRQ_12] = { - .id = MT8183_IRQ_12, - .irq_cnt_reg = AFE_IRQ_MCU_CNT12, - .irq_cnt_shift = 0, - .irq_cnt_maskbit = 0x3ffff, - .irq_fs_reg = AFE_IRQ_MCU_CON2, - .irq_fs_shift = IRQ12_MCU_MODE_SFT, - .irq_fs_maskbit = IRQ12_MCU_MODE_MASK, - .irq_en_reg = AFE_IRQ_MCU_CON0, - .irq_en_shift = IRQ12_MCU_ON_SFT, - .irq_clr_reg = AFE_IRQ_MCU_CLR, - .irq_clr_shift = IRQ12_MCU_CLR_SFT, - }, + MT8183_AFE_IRQ(0), + MT8183_AFE_IRQ(1), + MT8183_AFE_IRQ(2), + MT8183_AFE_IRQ(3), + MT8183_AFE_IRQ(4), + MT8183_AFE_IRQ(5), + MT8183_AFE_IRQ(6), + MT8183_AFE_IRQ(7), + MT8183_AFE_IRQ_NOFS(8), + MT8183_AFE_IRQ(11), + MT8183_AFE_IRQ(12), }; static bool mt8183_is_volatile_reg(struct device *dev, unsigned int reg) From 31f1f32ad2bb08531986035db340259b71efa3f2 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 15 May 2025 15:38:23 +0800 Subject: [PATCH 013/353] ASoC: mediatek: mt8183-afe-pcm: shorten mt8183_is_volatile_reg() mt8183_is_volatile_reg() is a large switch-case block that lists out every register that is volatile. Since many pairs of registers have consecutive addresses, the cases can be compressed down with the ellipsis, i.e. GCC extension "case ranges" [1] to cover more addresses in one case, shortening the source code. This is not completely the same, since the addresses are 4-byte aligned, and using the case ranges feature adds all unaligned addresses in between. In practice this doesn't matter since the unaligned addresses are blocked by the regmap core. This also ends up compiling slightly smaller with a reduction of 128 bytes in the text section. [1] https://gcc.gnu.org/onlinedocs/gcc/Case-Ranges.html Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20250515073825.4155297-4-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8183/mt8183-afe-pcm.c | 152 ++++++--------------- 1 file changed, 40 insertions(+), 112 deletions(-) diff --git a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c index fe315224a25974..e8884354995cb1 100644 --- a/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c +++ b/sound/soc/mediatek/mt8183/mt8183-afe-pcm.c @@ -522,86 +522,46 @@ static bool mt8183_is_volatile_reg(struct device *dev, unsigned int reg) /* these auto-gen reg has read-only bit, so put it as volatile */ /* volatile reg cannot be cached, so cannot be set when power off */ switch (reg) { - case AUDIO_TOP_CON0: /* reg bit controlled by CCF */ - case AUDIO_TOP_CON1: /* reg bit controlled by CCF */ + case AUDIO_TOP_CON0 ... AUDIO_TOP_CON1: /* reg bit controlled by CCF */ case AUDIO_TOP_CON3: - case AFE_DL1_CUR: - case AFE_DL1_END: - case AFE_DL2_CUR: - case AFE_DL2_END: - case AFE_AWB_END: - case AFE_AWB_CUR: - case AFE_VUL_END: - case AFE_VUL_CUR: - case AFE_MEMIF_MON0: - case AFE_MEMIF_MON1: - case AFE_MEMIF_MON2: - case AFE_MEMIF_MON3: - case AFE_MEMIF_MON4: - case AFE_MEMIF_MON5: - case AFE_MEMIF_MON6: - case AFE_MEMIF_MON7: - case AFE_MEMIF_MON8: - case AFE_MEMIF_MON9: - case AFE_ADDA_SRC_DEBUG_MON0: - case AFE_ADDA_SRC_DEBUG_MON1: - case AFE_ADDA_UL_SRC_MON0: - case AFE_ADDA_UL_SRC_MON1: + case AFE_DL1_CUR ... AFE_DL1_END: + case AFE_DL2_CUR ... AFE_DL2_END: + case AFE_AWB_END ... AFE_AWB_CUR: + case AFE_VUL_END ... AFE_VUL_CUR: + case AFE_MEMIF_MON0 ... AFE_MEMIF_MON9: + case AFE_ADDA_SRC_DEBUG_MON0 ... AFE_ADDA_SRC_DEBUG_MON1: + case AFE_ADDA_UL_SRC_MON0 ... AFE_ADDA_UL_SRC_MON1: case AFE_SIDETONE_MON: - case AFE_SIDETONE_CON0: - case AFE_SIDETONE_COEFF: + case AFE_SIDETONE_CON0 ... AFE_SIDETONE_COEFF: case AFE_BUS_MON0: - case AFE_MRGIF_MON0: - case AFE_MRGIF_MON1: - case AFE_MRGIF_MON2: - case AFE_I2S_MON: + case AFE_MRGIF_MON0 ... AFE_I2S_MON: case AFE_DAC_MON: - case AFE_VUL2_END: - case AFE_VUL2_CUR: - case AFE_IRQ0_MCU_CNT_MON: - case AFE_IRQ6_MCU_CNT_MON: - case AFE_MOD_DAI_END: - case AFE_MOD_DAI_CUR: - case AFE_VUL_D2_END: - case AFE_VUL_D2_CUR: - case AFE_DL3_CUR: - case AFE_DL3_END: + case AFE_VUL2_END ... AFE_VUL2_CUR: + case AFE_IRQ0_MCU_CNT_MON ... AFE_IRQ6_MCU_CNT_MON: + case AFE_MOD_DAI_END ... AFE_MOD_DAI_CUR: + case AFE_VUL_D2_END ... AFE_VUL_D2_CUR: + case AFE_DL3_CUR ... AFE_DL3_END: case AFE_HDMI_OUT_CON0: - case AFE_HDMI_OUT_CUR: - case AFE_HDMI_OUT_END: - case AFE_IRQ3_MCU_CNT_MON: - case AFE_IRQ4_MCU_CNT_MON: - case AFE_IRQ_MCU_STATUS: - case AFE_IRQ_MCU_CLR: + case AFE_HDMI_OUT_CUR ... AFE_HDMI_OUT_END: + case AFE_IRQ3_MCU_CNT_MON... AFE_IRQ4_MCU_CNT_MON: + case AFE_IRQ_MCU_STATUS ... AFE_IRQ_MCU_CLR: case AFE_IRQ_MCU_MON2: - case AFE_IRQ1_MCU_CNT_MON: - case AFE_IRQ2_MCU_CNT_MON: - case AFE_IRQ1_MCU_EN_CNT_MON: - case AFE_IRQ5_MCU_CNT_MON: + case AFE_IRQ1_MCU_CNT_MON ... AFE_IRQ5_MCU_CNT_MON: case AFE_IRQ7_MCU_CNT_MON: case AFE_GAIN1_CUR: case AFE_GAIN2_CUR: case AFE_SRAM_DELSEL_CON0: - case AFE_SRAM_DELSEL_CON2: - case AFE_SRAM_DELSEL_CON3: - case AFE_ASRC_2CH_CON12: - case AFE_ASRC_2CH_CON13: + case AFE_SRAM_DELSEL_CON2 ... AFE_SRAM_DELSEL_CON3: + case AFE_ASRC_2CH_CON12 ... AFE_ASRC_2CH_CON13: case PCM_INTF_CON2: - case FPGA_CFG0: - case FPGA_CFG1: - case FPGA_CFG2: - case FPGA_CFG3: - case AUDIO_TOP_DBG_MON0: - case AUDIO_TOP_DBG_MON1: - case AFE_IRQ8_MCU_CNT_MON: - case AFE_IRQ11_MCU_CNT_MON: - case AFE_IRQ12_MCU_CNT_MON: + case FPGA_CFG0 ... FPGA_CFG1: + case FPGA_CFG2 ... FPGA_CFG3: + case AUDIO_TOP_DBG_MON0 ... AUDIO_TOP_DBG_MON1: + case AFE_IRQ8_MCU_CNT_MON ... AFE_IRQ12_MCU_CNT_MON: case AFE_CBIP_MON0: - case AFE_CBIP_SLV_MUX_MON0: - case AFE_CBIP_SLV_DECODER_MON0: + case AFE_CBIP_SLV_MUX_MON0 ... AFE_CBIP_SLV_DECODER_MON0: case AFE_ADDA6_SRC_DEBUG_MON0: - case AFE_ADD6A_UL_SRC_MON0: - case AFE_ADDA6_UL_SRC_MON1: + case AFE_ADD6A_UL_SRC_MON0... AFE_ADDA6_UL_SRC_MON1: case AFE_DL1_CUR_MSB: case AFE_DL2_CUR_MSB: case AFE_AWB_CUR_MSB: @@ -611,55 +571,23 @@ static bool mt8183_is_volatile_reg(struct device *dev, unsigned int reg) case AFE_VUL_D2_CUR_MSB: case AFE_DL3_CUR_MSB: case AFE_HDMI_OUT_CUR_MSB: - case AFE_AWB2_END: - case AFE_AWB2_CUR: + case AFE_AWB2_END ... AFE_AWB2_CUR: case AFE_AWB2_CUR_MSB: - case AFE_ADDA_DL_SDM_FIFO_MON: - case AFE_ADDA_DL_SRC_LCH_MON: - case AFE_ADDA_DL_SRC_RCH_MON: - case AFE_ADDA_DL_SDM_OUT_MON: - case AFE_CONNSYS_I2S_MON: - case AFE_ASRC_2CH_CON0: - case AFE_ASRC_2CH_CON2: - case AFE_ASRC_2CH_CON3: - case AFE_ASRC_2CH_CON4: - case AFE_ASRC_2CH_CON5: - case AFE_ASRC_2CH_CON7: - case AFE_ASRC_2CH_CON8: - case AFE_MEMIF_MON12: - case AFE_MEMIF_MON13: - case AFE_MEMIF_MON14: - case AFE_MEMIF_MON15: - case AFE_MEMIF_MON16: - case AFE_MEMIF_MON17: - case AFE_MEMIF_MON18: - case AFE_MEMIF_MON19: - case AFE_MEMIF_MON20: - case AFE_MEMIF_MON21: - case AFE_MEMIF_MON22: - case AFE_MEMIF_MON23: - case AFE_MEMIF_MON24: - case AFE_ADDA_MTKAIF_MON0: - case AFE_ADDA_MTKAIF_MON1: + case AFE_ADDA_DL_SDM_FIFO_MON ... AFE_ADDA_DL_SDM_OUT_MON: + case AFE_CONNSYS_I2S_MON ... AFE_ASRC_2CH_CON0: + case AFE_ASRC_2CH_CON2 ... AFE_ASRC_2CH_CON5: + case AFE_ASRC_2CH_CON7 ... AFE_ASRC_2CH_CON8: + case AFE_MEMIF_MON12 ... AFE_MEMIF_MON24: + case AFE_ADDA_MTKAIF_MON0 ... AFE_ADDA_MTKAIF_MON1: case AFE_AUD_PAD_TOP: case AFE_GENERAL1_ASRC_2CH_CON0: - case AFE_GENERAL1_ASRC_2CH_CON2: - case AFE_GENERAL1_ASRC_2CH_CON3: - case AFE_GENERAL1_ASRC_2CH_CON4: - case AFE_GENERAL1_ASRC_2CH_CON5: - case AFE_GENERAL1_ASRC_2CH_CON7: - case AFE_GENERAL1_ASRC_2CH_CON8: - case AFE_GENERAL1_ASRC_2CH_CON12: - case AFE_GENERAL1_ASRC_2CH_CON13: + case AFE_GENERAL1_ASRC_2CH_CON2 ... AFE_GENERAL1_ASRC_2CH_CON5: + case AFE_GENERAL1_ASRC_2CH_CON7 ... AFE_GENERAL1_ASRC_2CH_CON8: + case AFE_GENERAL1_ASRC_2CH_CON12 ... AFE_GENERAL1_ASRC_2CH_CON13: case AFE_GENERAL2_ASRC_2CH_CON0: - case AFE_GENERAL2_ASRC_2CH_CON2: - case AFE_GENERAL2_ASRC_2CH_CON3: - case AFE_GENERAL2_ASRC_2CH_CON4: - case AFE_GENERAL2_ASRC_2CH_CON5: - case AFE_GENERAL2_ASRC_2CH_CON7: - case AFE_GENERAL2_ASRC_2CH_CON8: - case AFE_GENERAL2_ASRC_2CH_CON12: - case AFE_GENERAL2_ASRC_2CH_CON13: + case AFE_GENERAL2_ASRC_2CH_CON2 ... AFE_GENERAL2_ASRC_2CH_CON5: + case AFE_GENERAL2_ASRC_2CH_CON7 ... AFE_GENERAL2_ASRC_2CH_CON8: + case AFE_GENERAL2_ASRC_2CH_CON12 ... AFE_GENERAL2_ASRC_2CH_CON13: return true; default: return false; From 3c5ab6410f8cea24ce993d9688083f67befe9a1b Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 16 May 2025 13:05:48 +0800 Subject: [PATCH 014/353] ASoC: soc-utils: Check string pointer validity in snd_soc_dlc_is_dummy() In the recently added snd_soc_dlc_is_dummy(), the helper uses the .name and .dai_name fields without checking their validity. For .name, this field is NULL if the component is matched by .of_node instead. In fact, only one of these fields may be set. This caused a NULL pointer dereference on MediaTek MT8195 and MT8188 platforms with the subsequent conversion to snd_soc_dlc_is_dummy() in their machine drivers. The codecs are all matches through the device tree, so their .name fields are empty. For .dai_name, there are cases where this field is empty, such as for the following component definitions: #define COMP_EMPTY() { } #define COMP_PLATFORM(_name) { .name = _name } #define COMP_AUX(_name) { .name = _name } #define COMP_CODEC_CONF(_name) { .name = _name } Or the single link CPU DAI case in the simple audio card family, as covered by simple_util_canonicalize_cpu(), in which the .dai_name field is explicitly cleared. To fix this, check the validity of the fields before using them in string comparison. Fixes: 3e021f3b8115 ("ASoC: soc-utils: add snd_soc_dlc_is_dummy()") Cc: Kuninori Morimoto Signed-off-by: Chen-Yu Tsai Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20250516050549.407133-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/soc-utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-utils.c b/sound/soc/soc-utils.c index a1bcc852a6de0b..c8adfff826bd47 100644 --- a/sound/soc/soc-utils.c +++ b/sound/soc/soc-utils.c @@ -267,8 +267,8 @@ int snd_soc_dlc_is_dummy(struct snd_soc_dai_link_component *dlc) if (dlc == &snd_soc_dummy_dlc) return true; - if (strcmp(dlc->name, snd_soc_dummy_dlc.name) == 0 || - strcmp(dlc->dai_name, snd_soc_dummy_dlc.dai_name) == 0) + if ((dlc->name && strcmp(dlc->name, snd_soc_dummy_dlc.name) == 0) || + (dlc->dai_name && strcmp(dlc->dai_name, snd_soc_dummy_dlc.dai_name) == 0)) return true; return false; From 16c49ed5d12b276160a5927873f5f075399cae57 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 16 May 2025 16:03:34 +0800 Subject: [PATCH 015/353] ASoC: fsl_xcvr: update platform driver name XCVR driver is not only used for i.MX8MP platform, so update driver name to make it more generic. Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20250516080334.3272878-1-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_xcvr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl_xcvr.c b/sound/soc/fsl/fsl_xcvr.c index 83aea341c1b609..e3111dd80be486 100644 --- a/sound/soc/fsl/fsl_xcvr.c +++ b/sound/soc/fsl/fsl_xcvr.c @@ -1827,7 +1827,7 @@ static const struct dev_pm_ops fsl_xcvr_pm_ops = { static struct platform_driver fsl_xcvr_driver = { .probe = fsl_xcvr_probe, .driver = { - .name = "fsl,imx8mp-audio-xcvr", + .name = "fsl-xcvr", .pm = pm_ptr(&fsl_xcvr_pm_ops), .of_match_table = fsl_xcvr_dt_ids, }, From f8b2f84079a4998421c7eae024f0b8f2a495e925 Mon Sep 17 00:00:00 2001 From: Talhah Peerbhai Date: Fri, 16 May 2025 01:27:41 +0300 Subject: [PATCH 016/353] ASoC: amd: yc: Add quirk for Lenovo Yoga Pro 7 14ASP9 Similar to many other Lenovo models with AMD chips, the Lenovo Yoga Pro 7 14ASP9 (product name 83HN) requires a specific quirk to ensure internal mic detection. This patch adds a quirk fixing this. Signed-off-by: Talhah Peerbhai Link: https://patch.msgid.link/20250515222741.144616-1-talhah.peerbhai@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index e632f16c910250..3d9da93d22ee84 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -311,6 +311,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "83AS"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83HN"), + } + }, { .driver_data = &acp6x_card, .matches = { @@ -360,7 +367,7 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "M5402RA"), } }, - { + { .driver_data = &acp6x_card, .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), From 363c38b72bd82c817ab627ab6e1fccb282e2a788 Mon Sep 17 00:00:00 2001 From: Sheetal Date: Mon, 12 May 2025 06:42:56 +0000 Subject: [PATCH 017/353] dt-bindings: Update Tegra194 and Tegra234 HDA bindings - Tegra194 and Tegra234 HDA is not compatible with Tegra30, hence update them as standalone compatibles. Also, add necessary logic to the binding doc as HDA clocks and resets for Tegra194 and Tegra234 are different from Tegra30. This fixes below dtbs_check errors: - compatible: 'oneOf' conditional failed, one must be fixed: ['nvidia,tegra194-hda'] is too short 'nvidia,tegra30-hda' was expected 'nvidia,tegra132-hda' was expected - compatible: 'oneOf' conditional failed, one must be fixed: ['nvidia,tegra234-hda'] is too short 'nvidia,tegra30-hda' was expected 'nvidia,tegra132-hda' was expected - hda@3510000: clock-names:1: 'hda2hdmi' was expected - hda@3510000: reset-names:1: 'hda2hdmi' was expected Signed-off-by: Sheetal Reviewed-by: Rob Herring (Arm) Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250512064258.1028331-2-sheetal@nvidia.com --- .../bindings/sound/nvidia,tegra30-hda.yaml | 83 ++++++++++++++++--- 1 file changed, 72 insertions(+), 11 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml index 3ca9affb79a20a..703f009862a489 100644 --- a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml +++ b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml @@ -20,11 +20,12 @@ properties: compatible: oneOf: - - const: nvidia,tegra30-hda + - enum: + - nvidia,tegra30-hda + - nvidia,tegra194-hda + - nvidia,tegra234-hda - items: - enum: - - nvidia,tegra234-hda - - nvidia,tegra194-hda - nvidia,tegra186-hda - nvidia,tegra210-hda - nvidia,tegra124-hda @@ -48,10 +49,7 @@ properties: clock-names: minItems: 2 - items: - - const: hda - - const: hda2hdmi - - const: hda2codec_2x + maxItems: 3 resets: minItems: 2 @@ -59,10 +57,7 @@ properties: reset-names: minItems: 2 - items: - - const: hda - - const: hda2hdmi - - const: hda2codec_2x + maxItems: 3 power-domains: maxItems: 1 @@ -93,6 +88,72 @@ required: additionalProperties: false +allOf: + - if: + properties: + compatible: + contains: + enum: + - nvidia,tegra30-hda + then: + properties: + clocks: + minItems: 3 + clock-names: + items: + - const: hda + - const: hda2hdmi + - const: hda2codec_2x + resets: + minItems: 3 + reset-names: + items: + - const: hda + - const: hda2hdmi + - const: hda2codec_2x + - if: + properties: + compatible: + contains: + enum: + - nvidia,tegra194-hda + then: + properties: + clocks: + minItems: 3 + clock-names: + items: + - const: hda + - const: hda2hdmi + - const: hda2codec_2x + resets: + maxItems: 2 + reset-names: + items: + - const: hda + - const: hda2hdmi + - if: + properties: + compatible: + contains: + enum: + - nvidia,tegra234-hda + then: + properties: + clocks: + minItems: 2 + maxItems: 2 + clock-names: + items: + - const: hda + - const: hda2codec_2x + resets: + maxItems: 2 + reset-names: + items: + - const: hda + - const: hda2codec_2x + examples: - | #include From ca278e658e9309dd7ac88817db7af01df804c387 Mon Sep 17 00:00:00 2001 From: Sheetal Date: Mon, 12 May 2025 06:42:57 +0000 Subject: [PATCH 018/353] dt-bindings: Document Tegra264 HDA Support For Tegra264 device there is one clock, two resets and no power-domain. Add the Tegra264 compatible and necessary logic to the binding doc to determine appropriate clocks, resets and power-domain properties based on the compatible string. Signed-off-by: Sheetal Reviewed-by: Rob Herring (Arm) Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250512064258.1028331-3-sheetal@nvidia.com --- .../bindings/sound/nvidia,tegra30-hda.yaml | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml index 703f009862a489..8a8767589ee08d 100644 --- a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml +++ b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml @@ -24,6 +24,7 @@ properties: - nvidia,tegra30-hda - nvidia,tegra194-hda - nvidia,tegra234-hda + - nvidia,tegra264-hda - items: - enum: - nvidia,tegra186-hda @@ -44,11 +45,11 @@ properties: maxItems: 1 clocks: - minItems: 2 + minItems: 1 maxItems: 3 clock-names: - minItems: 2 + minItems: 1 maxItems: 3 resets: @@ -153,6 +154,26 @@ allOf: items: - const: hda - const: hda2codec_2x + - if: + properties: + compatible: + contains: + enum: + - nvidia,tegra264-hda + then: + properties: + clocks: + maxItems: 1 + clock-names: + items: + - const: hda + resets: + maxItems: 2 + reset-names: + items: + - const: hda + - const: hda2codec_2x + power-domains: false examples: - | From c8dfcbeb3a36f0465ab5fb5c185c3fdeb5b3eb4d Mon Sep 17 00:00:00 2001 From: Mohan Kumar D Date: Mon, 12 May 2025 06:42:58 +0000 Subject: [PATCH 019/353] ALSA: hda/tegra: Add Tegra264 support Update HDA driver to support Tegra264 differences from legacy HDA, which includes: clocks/resets, always power on, and hardware-managed FPCI/IPFS initialization. The driver retrieves this chip-specific information from soc_data. Signed-off-by: Mohan Kumar D Signed-off-by: Sheetal Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250512064258.1028331-4-sheetal@nvidia.com --- sound/pci/hda/hda_tegra.c | 51 +++++++++++++++++++++++++++++++++----- sound/pci/hda/patch_hdmi.c | 1 + 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/sound/pci/hda/hda_tegra.c b/sound/pci/hda/hda_tegra.c index a590d431c5ff6e..8c0dd439f5a55c 100644 --- a/sound/pci/hda/hda_tegra.c +++ b/sound/pci/hda/hda_tegra.c @@ -72,6 +72,10 @@ struct hda_tegra_soc { bool has_hda2codec_2x_reset; bool has_hda2hdmi; + bool has_hda2codec_2x; + bool input_stream; + bool always_on; + bool requires_init; }; struct hda_tegra { @@ -187,7 +191,9 @@ static int hda_tegra_runtime_resume(struct device *dev) if (rc != 0) return rc; if (chip->running) { - hda_tegra_init(hda); + if (hda->soc->requires_init) + hda_tegra_init(hda); + azx_init_chip(chip, 1); /* disable controller wake up event*/ azx_writew(chip, WAKEEN, azx_readw(chip, WAKEEN) & @@ -250,7 +256,8 @@ static int hda_tegra_init_chip(struct azx *chip, struct platform_device *pdev) bus->remap_addr = hda->regs + HDA_BAR0; bus->addr = res->start + HDA_BAR0; - hda_tegra_init(hda); + if (hda->soc->requires_init) + hda_tegra_init(hda); return 0; } @@ -323,7 +330,7 @@ static int hda_tegra_first_init(struct azx *chip, struct platform_device *pdev) * starts with offset 0 which is wrong as HW register for output stream * offset starts with 4. */ - if (of_device_is_compatible(np, "nvidia,tegra234-hda")) + if (!hda->soc->input_stream) chip->capture_streams = 4; chip->playback_streams = (gcap >> 12) & 0x0f; @@ -419,7 +426,6 @@ static int hda_tegra_create(struct snd_card *card, chip->driver_caps = driver_caps; chip->driver_type = driver_caps & 0xff; chip->dev_index = 0; - chip->jackpoll_interval = msecs_to_jiffies(5000); INIT_LIST_HEAD(&chip->pcm_list); chip->codec_probe_mask = -1; @@ -436,7 +442,16 @@ static int hda_tegra_create(struct snd_card *card, chip->bus.core.sync_write = 0; chip->bus.core.needs_damn_long_delay = 1; chip->bus.core.aligned_mmio = 1; - chip->bus.jackpoll_in_suspend = 1; + + /* + * HDA power domain and clocks are always on for Tegra264 and + * the jack detection logic would work always, so no need of + * jack polling mechanism running. + */ + if (!hda->soc->always_on) { + chip->jackpoll_interval = msecs_to_jiffies(5000); + chip->bus.jackpoll_in_suspend = 1; + } err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); if (err < 0) { @@ -450,22 +465,44 @@ static int hda_tegra_create(struct snd_card *card, static const struct hda_tegra_soc tegra30_data = { .has_hda2codec_2x_reset = true, .has_hda2hdmi = true, + .has_hda2codec_2x = true, + .input_stream = true, + .always_on = false, + .requires_init = true, }; static const struct hda_tegra_soc tegra194_data = { .has_hda2codec_2x_reset = false, .has_hda2hdmi = true, + .has_hda2codec_2x = true, + .input_stream = true, + .always_on = false, + .requires_init = true, }; static const struct hda_tegra_soc tegra234_data = { .has_hda2codec_2x_reset = true, .has_hda2hdmi = false, + .has_hda2codec_2x = true, + .input_stream = false, + .always_on = false, + .requires_init = true, +}; + +static const struct hda_tegra_soc tegra264_data = { + .has_hda2codec_2x_reset = true, + .has_hda2hdmi = false, + .has_hda2codec_2x = false, + .input_stream = false, + .always_on = true, + .requires_init = false, }; static const struct of_device_id hda_tegra_match[] = { { .compatible = "nvidia,tegra30-hda", .data = &tegra30_data }, { .compatible = "nvidia,tegra194-hda", .data = &tegra194_data }, { .compatible = "nvidia,tegra234-hda", .data = &tegra234_data }, + { .compatible = "nvidia,tegra264-hda", .data = &tegra264_data }, {}, }; MODULE_DEVICE_TABLE(of, hda_tegra_match); @@ -520,7 +557,9 @@ static int hda_tegra_probe(struct platform_device *pdev) hda->clocks[hda->nclocks++].id = "hda"; if (hda->soc->has_hda2hdmi) hda->clocks[hda->nclocks++].id = "hda2hdmi"; - hda->clocks[hda->nclocks++].id = "hda2codec_2x"; + + if (hda->soc->has_hda2codec_2x) + hda->clocks[hda->nclocks++].id = "hda2codec_2x"; err = devm_clk_bulk_get(&pdev->dev, hda->nclocks, hda->clocks); if (err < 0) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index f451b9a671c001..08308231b4ed3d 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4551,6 +4551,7 @@ HDA_CODEC_ENTRY(0x10de002e, "Tegra186 HDMI/DP1", patch_tegra_hdmi), HDA_CODEC_ENTRY(0x10de002f, "Tegra194 HDMI/DP2", patch_tegra_hdmi), HDA_CODEC_ENTRY(0x10de0030, "Tegra194 HDMI/DP3", patch_tegra_hdmi), HDA_CODEC_ENTRY(0x10de0031, "Tegra234 HDMI/DP", patch_tegra234_hdmi), +HDA_CODEC_ENTRY(0x10de0034, "Tegra264 HDMI/DP", patch_tegra234_hdmi), HDA_CODEC_ENTRY(0x10de0040, "GPU 40 HDMI/DP", patch_nvhdmi), HDA_CODEC_ENTRY(0x10de0041, "GPU 41 HDMI/DP", patch_nvhdmi), HDA_CODEC_ENTRY(0x10de0042, "GPU 42 HDMI/DP", patch_nvhdmi), From 7e07bd9aa20ba44e235223f8c05033e33e941efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20Ku=C4=8Dera?= Date: Thu, 15 May 2025 14:23:46 +0200 Subject: [PATCH 020/353] ALSA: usb-audio: Rename Pioneer mixer channel controls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit so alsamixer can show the controls in correct categories: capture/playback Fixes: 171bb5123fbc ("ALSA: usb-audio: Add Pioneer DJ DJM-V10 support") Signed-off-by: František Kučera Link: https://patch.msgid.link/20250515122345.3911-2-franta-linux@frantovo.cz Signed-off-by: Takashi Iwai --- sound/usb/mixer_quirks.c | 104 +++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index 2cabf3bf63d791..aad205df93b263 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -3713,13 +3713,13 @@ static const u16 snd_djm_opts_250mk2_pb2[] = { 0x0200, 0x0201, 0x0204 }; static const u16 snd_djm_opts_250mk2_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_250mk2[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 250mk2_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 250mk2_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 250mk2_cap3, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch1 Output", 250mk2_pb1, 0, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch2 Output", 250mk2_pb2, 1, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch3 Output", 250mk2_pb3, 2, SND_DJM_WINDEX_PB) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 250mk2_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 250mk2_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 250mk2_cap3, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Output 1 Playback Switch", 250mk2_pb1, 0, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 2 Playback Switch", 250mk2_pb2, 1, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 3 Playback Switch", 250mk2_pb3, 2, SND_DJM_WINDEX_PB) }; @@ -3738,13 +3738,13 @@ static const u16 snd_djm_opts_450_pb2[] = { 0x0200, 0x0201, 0x0204 }; static const u16 snd_djm_opts_450_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_450[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 450_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 450_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 450_cap3, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch1 Output", 450_pb1, 0, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch2 Output", 450_pb2, 1, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch3 Output", 450_pb3, 2, SND_DJM_WINDEX_PB) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 450_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 450_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 450_cap3, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Output 1 Playback Switch", 450_pb1, 0, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 2 Playback Switch", 450_pb2, 1, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 3 Playback Switch", 450_pb3, 2, SND_DJM_WINDEX_PB) }; @@ -3759,11 +3759,11 @@ static const u16 snd_djm_opts_750_cap4[] = { 0x0401, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040f }; static const struct snd_djm_ctl snd_djm_ctls_750[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 750_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 750_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 750_cap3, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch4 Input", 750_cap4, 0, SND_DJM_WINDEX_CAP) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 750_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 750_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 750_cap3, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", 750_cap4, 0, SND_DJM_WINDEX_CAP) }; @@ -3778,11 +3778,11 @@ static const u16 snd_djm_opts_850_cap4[] = { 0x0400, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040f }; static const struct snd_djm_ctl snd_djm_ctls_850[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 850_cap1, 1, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 850_cap2, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 850_cap3, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch4 Input", 850_cap4, 1, SND_DJM_WINDEX_CAP) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 850_cap1, 1, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 850_cap2, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 850_cap3, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", 850_cap4, 1, SND_DJM_WINDEX_CAP) }; @@ -3799,12 +3799,12 @@ static const u16 snd_djm_opts_900nxs2_cap5[] = { 0x0507, 0x0508, 0x0509, 0x050a, 0x0511, 0x0512, 0x0513, 0x0514 }; static const struct snd_djm_ctl snd_djm_ctls_900nxs2[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 900nxs2_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 900nxs2_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 900nxs2_cap3, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch4 Input", 900nxs2_cap4, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch5 Input", 900nxs2_cap5, 3, SND_DJM_WINDEX_CAP) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 900nxs2_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 900nxs2_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 900nxs2_cap3, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", 900nxs2_cap4, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 5 Capture Switch", 900nxs2_cap5, 3, SND_DJM_WINDEX_CAP) }; // DJM-750MK2 @@ -3825,15 +3825,15 @@ static const u16 snd_djm_opts_750mk2_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_750mk2[] = { - SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Ch1 Input", 750mk2_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", 750mk2_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", 750mk2_cap3, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch4 Input", 750mk2_cap4, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch5 Input", 750mk2_cap5, 3, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch1 Output", 750mk2_pb1, 0, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch2 Output", 750mk2_pb2, 1, SND_DJM_WINDEX_PB), - SND_DJM_CTL("Ch3 Output", 750mk2_pb3, 2, SND_DJM_WINDEX_PB) + SND_DJM_CTL("Master Input Level Capture Switch", cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", 750mk2_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", 750mk2_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", 750mk2_cap3, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", 750mk2_cap4, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 5 Capture Switch", 750mk2_cap5, 3, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Output 1 Playback Switch", 750mk2_pb1, 0, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 2 Playback Switch", 750mk2_pb2, 1, SND_DJM_WINDEX_PB), + SND_DJM_CTL("Output 3 Playback Switch", 750mk2_pb3, 2, SND_DJM_WINDEX_PB) }; @@ -3853,12 +3853,12 @@ static const u16 snd_djm_opts_a9_cap5[] = { 0x0501, 0x0502, 0x0503, 0x0505, 0x0506, 0x0507, 0x0508, 0x0509, 0x050a, 0x050e }; static const struct snd_djm_ctl snd_djm_ctls_a9[] = { - SND_DJM_CTL("Capture Level", a9_cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Master Input", a9_cap1, 3, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch1 Input", a9_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch2 Input", a9_cap3, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch3 Input", a9_cap4, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Ch4 Input", a9_cap5, 2, SND_DJM_WINDEX_CAP) + SND_DJM_CTL("Master Input Level Capture Switch", a9_cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Master Input Capture Switch", a9_cap1, 3, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 1 Capture Switch", a9_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", a9_cap3, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", a9_cap4, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", a9_cap5, 2, SND_DJM_WINDEX_CAP) }; // DJM-V10 @@ -3895,13 +3895,13 @@ static const u16 snd_djm_opts_v10_cap6[] = { }; static const struct snd_djm_ctl snd_djm_ctls_v10[] = { - SND_DJM_CTL("Master Capture Level Capture Switch", v10_cap_level, 0, SND_DJM_WINDEX_CAPLVL), - SND_DJM_CTL("Capture Ch1 Capture Switch", v10_cap1, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Capture Ch2 Capture Switch", v10_cap2, 2, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Capture Ch3 Capture Switch", v10_cap3, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Capture Ch4 Capture Switch", v10_cap4, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Capture Ch5 Capture Switch", v10_cap5, 0, SND_DJM_WINDEX_CAP), - SND_DJM_CTL("Capture Ch6 Capture Switch", v10_cap6, 0, SND_DJM_WINDEX_CAP) + SND_DJM_CTL("Master Input Level Capture Switch", v10_cap_level, 0, SND_DJM_WINDEX_CAPLVL), + SND_DJM_CTL("Input 1 Capture Switch", v10_cap1, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 2 Capture Switch", v10_cap2, 2, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 3 Capture Switch", v10_cap3, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 4 Capture Switch", v10_cap4, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 5 Capture Switch", v10_cap5, 0, SND_DJM_WINDEX_CAP), + SND_DJM_CTL("Input 6 Capture Switch", v10_cap6, 0, SND_DJM_WINDEX_CAP) // playback channels are fixed and controlled by hardware knobs on the mixer }; From 37402083130f47816e1a1fb06d6c334a16b9accb Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Fri, 16 May 2025 15:04:16 +0800 Subject: [PATCH 021/353] ALSA: scarlett2: Use USB API functions rather than constants Use the function usb_endpoint_num() rather than constants. The Coccinelle semantic patch is as follows: @@ struct usb_endpoint_descriptor *epd; @@ - (epd->bEndpointAddress & \(USB_ENDPOINT_NUMBER_MASK\|0x0f\)) + usb_endpoint_num(epd) Signed-off-by: Chen Ni Link: https://patch.msgid.link/20250516070416.12458-1-nichen@iscas.ac.cn Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 288d22e6a0b253..93589e86828a3e 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -8574,8 +8574,7 @@ static int scarlett2_find_fc_interface(struct usb_device *dev, epd = get_endpoint(intf->altsetting, 0); private->bInterfaceNumber = desc->bInterfaceNumber; - private->bEndpointAddress = epd->bEndpointAddress & - USB_ENDPOINT_NUMBER_MASK; + private->bEndpointAddress = usb_endpoint_num(epd); private->wMaxPacketSize = le16_to_cpu(epd->wMaxPacketSize); private->bInterval = epd->bInterval; return 0; From 17785370f8d6ab8551df1b56cedf7bf7d4440a23 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Fri, 16 May 2025 15:09:38 +0800 Subject: [PATCH 022/353] ALSA: usb: fcp: Use USB API functions rather than constants Use the function usb_endpoint_num() rather than constants. The Coccinelle semantic patch is as follows: @@ struct usb_endpoint_descriptor *epd; @@ - (epd->bEndpointAddress & \(USB_ENDPOINT_NUMBER_MASK\|0x0f\)) + usb_endpoint_num(epd) Signed-off-by: Chen Ni Link: https://patch.msgid.link/20250516070938.12520-1-nichen@iscas.ac.cn Signed-off-by: Takashi Iwai --- sound/usb/fcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/usb/fcp.c b/sound/usb/fcp.c index 7df65041ace556..98f9964311a735 100644 --- a/sound/usb/fcp.c +++ b/sound/usb/fcp.c @@ -1092,8 +1092,7 @@ static int fcp_find_fc_interface(struct usb_mixer_interface *mixer) epd = get_endpoint(intf->altsetting, 0); private->bInterfaceNumber = desc->bInterfaceNumber; - private->bEndpointAddress = epd->bEndpointAddress & - USB_ENDPOINT_NUMBER_MASK; + private->bEndpointAddress = usb_endpoint_num(epd); private->wMaxPacketSize = le16_to_cpu(epd->wMaxPacketSize); private->bInterval = epd->bInterval; return 0; From bfb7f2477424e4be43ae3553e15746d28495df39 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 11 May 2025 15:45:27 +0200 Subject: [PATCH 023/353] ALSA: seq: Fix delivery of UMP events to group ports When an event with UMP message is sent to a UMP client, the EP port receives always no matter where the event is sent to, as it's a catch-all port. OTOH, if an event is sent to EP port, and if the event has a certain UMP Group, it should have been delivered to the associated UMP Group port, too, but this was ignored, so far. This patch addresses the behavior. Now a UMP event sent to the Endpoint port will be delivered to the subscribers of the UMP group port the event is associated with. The patch also does a bit of refactoring to simplify the code about __deliver_to_subscribers(). Fixes: 177ccf811df4 ("ALSA: seq: Support MIDI 2.0 UMP Endpoint port") Link: https://patch.msgid.link/20250511134528.6314-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 52 ++++++++++++++++++++------------ sound/core/seq/seq_ump_convert.c | 18 +++++++++++ sound/core/seq/seq_ump_convert.h | 1 + 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 198c598a539398..880240924bfd05 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -732,15 +732,21 @@ static int snd_seq_deliver_single_event(struct snd_seq_client *client, */ static int __deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, - struct snd_seq_client_port *src_port, - int atomic, int hop) + int port, int atomic, int hop) { + struct snd_seq_client_port *src_port; struct snd_seq_subscribers *subs; int err, result = 0, num_ev = 0; union __snd_seq_event event_saved; size_t saved_size; struct snd_seq_port_subs_info *grp; + if (port < 0) + return 0; + src_port = snd_seq_port_use_ptr(client, port); + if (!src_port) + return 0; + /* save original event record */ saved_size = snd_seq_event_packet_size(event); memcpy(&event_saved, event, saved_size); @@ -775,6 +781,7 @@ static int __deliver_to_subscribers(struct snd_seq_client *client, read_unlock(&grp->list_lock); else up_read(&grp->list_mutex); + snd_seq_port_unlock(src_port); memcpy(event, &event_saved, saved_size); return (result < 0) ? result : num_ev; } @@ -783,25 +790,32 @@ static int deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { - struct snd_seq_client_port *src_port; - int ret = 0, ret2; - - src_port = snd_seq_port_use_ptr(client, event->source.port); - if (src_port) { - ret = __deliver_to_subscribers(client, event, src_port, atomic, hop); - snd_seq_port_unlock(src_port); - } - - if (client->ump_endpoint_port < 0 || - event->source.port == client->ump_endpoint_port) - return ret; + int ret; +#if IS_ENABLED(CONFIG_SND_SEQ_UMP) + int ret2; +#endif - src_port = snd_seq_port_use_ptr(client, client->ump_endpoint_port); - if (!src_port) + ret = __deliver_to_subscribers(client, event, + event->source.port, atomic, hop); +#if IS_ENABLED(CONFIG_SND_SEQ_UMP) + if (!snd_seq_client_is_ump(client) || client->ump_endpoint_port < 0) return ret; - ret2 = __deliver_to_subscribers(client, event, src_port, atomic, hop); - snd_seq_port_unlock(src_port); - return ret2 < 0 ? ret2 : ret; + /* If it's an event from EP port (and with a UMP group), + * deliver to subscribers of the corresponding UMP group port, too. + * Or, if it's from non-EP port, deliver to subscribers of EP port, too. + */ + if (event->source.port == client->ump_endpoint_port) + ret2 = __deliver_to_subscribers(client, event, + snd_seq_ump_group_port(event), + atomic, hop); + else + ret2 = __deliver_to_subscribers(client, event, + client->ump_endpoint_port, + atomic, hop); + if (ret2 < 0) + return ret2; +#endif + return ret; } /* deliver an event to the destination port(s). diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index ff7e558b4d51d0..db2f169cae11ea 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -1285,3 +1285,21 @@ int snd_seq_deliver_to_ump(struct snd_seq_client *source, else return cvt_to_ump_midi1(dest, dest_port, event, atomic, hop); } + +/* return the UMP group-port number of the event; + * return -1 if groupless or non-UMP event + */ +int snd_seq_ump_group_port(const struct snd_seq_event *event) +{ + const struct snd_seq_ump_event *ump_ev = + (const struct snd_seq_ump_event *)event; + unsigned char type; + + if (!snd_seq_ev_is_ump(event)) + return -1; + type = ump_message_type(ump_ev->ump[0]); + if (ump_is_groupless_msg(type)) + return -1; + /* group-port number starts from 1 */ + return ump_message_group(ump_ev->ump[0]) + 1; +} diff --git a/sound/core/seq/seq_ump_convert.h b/sound/core/seq/seq_ump_convert.h index 6c146d8032804f..4abf0a7637d701 100644 --- a/sound/core/seq/seq_ump_convert.h +++ b/sound/core/seq/seq_ump_convert.h @@ -18,5 +18,6 @@ int snd_seq_deliver_to_ump(struct snd_seq_client *source, struct snd_seq_client_port *dest_port, struct snd_seq_event *event, int atomic, int hop); +int snd_seq_ump_group_port(const struct snd_seq_event *event); #endif /* __SEQ_UMP_CONVERT_H */ From 981c133a49331f906501da1233d40b16994e3bd1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 11 May 2025 16:11:45 +0200 Subject: [PATCH 024/353] ALSA: ump: Fix a typo of snd_ump_stream_msg_device_info s/devince/device/ It's used only internally, so no any behavior changes. Fixes: 37e0e14128e0 ("ALSA: ump: Support UMP Endpoint and Function Block parsing") Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250511141147.10246-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- drivers/usb/gadget/function/f_midi2.c | 2 +- include/sound/ump_msg.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index 12e866fb311d63..0a800ba53816a8 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -475,7 +475,7 @@ static void reply_ump_stream_ep_info(struct f_midi2_ep *ep) /* reply a UMP EP device info */ static void reply_ump_stream_ep_device(struct f_midi2_ep *ep) { - struct snd_ump_stream_msg_devince_info rep = { + struct snd_ump_stream_msg_device_info rep = { .type = UMP_MSG_TYPE_STREAM, .status = UMP_STREAM_MSG_STATUS_DEVICE_INFO, .manufacture_id = ep->info.manufacturer, diff --git a/include/sound/ump_msg.h b/include/sound/ump_msg.h index 72f60ddfea7535..9556b4755a1ed8 100644 --- a/include/sound/ump_msg.h +++ b/include/sound/ump_msg.h @@ -604,7 +604,7 @@ struct snd_ump_stream_msg_ep_info { } __packed; /* UMP Stream Message: Device Info Notification (128bit) */ -struct snd_ump_stream_msg_devince_info { +struct snd_ump_stream_msg_device_info { #ifdef __BIG_ENDIAN_BITFIELD /* 0 */ u32 type:4; @@ -754,7 +754,7 @@ struct snd_ump_stream_msg_fb_name { union snd_ump_stream_msg { struct snd_ump_stream_msg_ep_discovery ep_discovery; struct snd_ump_stream_msg_ep_info ep_info; - struct snd_ump_stream_msg_devince_info device_info; + struct snd_ump_stream_msg_device_info device_info; struct snd_ump_stream_msg_stream_cfg stream_cfg; struct snd_ump_stream_msg_fb_discovery fb_discovery; struct snd_ump_stream_msg_fb_info fb_info; From ff2a69fa02ab8ceb3dd7bd0f35cd795cd0c8e9e8 Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Mon, 12 May 2025 22:23:37 +0200 Subject: [PATCH 025/353] ALSA: usb-audio: Add sample rate quirk for Audioengine D1 A user reported on the Arch Linux Forums that their device is emitting the following message in the kernel journal, which is fixed by adding the quirk as submitted in this patch: > kernel: usb 1-2: current rate 8436480 is different from the runtime rate 48000 There also is an entry for this product line added long time ago. Their specific device has the following ID: $ lsusb | grep Audio Bus 001 Device 002: ID 1101:0003 EasyPass Industrial Co., Ltd Audioengine D1 Link: https://bbs.archlinux.org/viewtopic.php?id=305494 Fixes: 93f9d1a4ac593 ("ALSA: usb-audio: Apply sample rate quirk for Audioengine D1") Cc: stable@vger.kernel.org Signed-off-by: Christian Heusel Link: https://patch.msgid.link/20250512-audioengine-quirk-addition-v1-1-4c370af6eff7@heusel.eu Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 4dcddab1a0e3d4..3164c54a79ff39 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2251,6 +2251,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x1395, 0x740a, /* Sennheiser DECT */ From b56fa1863a4702550a9c629a43ce594af4b04033 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 13 May 2025 09:31:04 +0200 Subject: [PATCH 026/353] ALSA: sh: SND_AICA should depend on SH_DMA_API If CONFIG_SH_DMA_API=n: WARNING: unmet direct dependencies detected for G2_DMA Depends on [n]: SH_DREAMCAST [=y] && SH_DMA_API [=n] Selected by [y]: - SND_AICA [=y] && SOUND [=y] && SND [=y] && SND_SUPERH [=y] && SH_DREAMCAST [=y] SND_AICA selects G2_DMA. As the latter depends on SH_DMA_API, the former should depend on SH_DMA_API, too. Fixes: f477a538c14d07f8 ("sh: dma: fix kconfig dependency for G2_DMA") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505131320.PzgTtl9H-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/b90625f8a9078d0d304bafe862cbe3a3fab40082.1747121335.git.geert+renesas@glider.be Signed-off-by: Takashi Iwai --- sound/sh/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sh/Kconfig b/sound/sh/Kconfig index b75fbb3236a7b9..f5fa09d740b4c9 100644 --- a/sound/sh/Kconfig +++ b/sound/sh/Kconfig @@ -14,7 +14,7 @@ if SND_SUPERH config SND_AICA tristate "Dreamcast Yamaha AICA sound" - depends on SH_DREAMCAST + depends on SH_DREAMCAST && SH_DMA_API select SND_PCM select G2_DMA help From 0af6ddeabbdd4eeea22889b41e24fde445555978 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 14 May 2025 17:24:44 +0800 Subject: [PATCH 027/353] ALSA: es1968: Add error handling for snd_pcm_hw_constraint_pow2() The function snd_es1968_capture_open() calls the function snd_pcm_hw_constraint_pow2(), but does not check its return value. A proper implementation can be found in snd_cx25821_pcm_open(). Add error handling for snd_pcm_hw_constraint_pow2() and propagate its error code. Fixes: b942cf815b57 ("[ALSA] es1968 - Fix stuttering capture") Cc: stable@vger.kernel.org # v2.6.22 Signed-off-by: Wentao Liang Link: https://patch.msgid.link/20250514092444.331-1-vulab@iscas.ac.cn Signed-off-by: Takashi Iwai --- sound/pci/es1968.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c index 899135d3922e4d..37bac890613eee 100644 --- a/sound/pci/es1968.c +++ b/sound/pci/es1968.c @@ -1561,7 +1561,7 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream) struct snd_pcm_runtime *runtime = substream->runtime; struct es1968 *chip = snd_pcm_substream_chip(substream); struct esschan *es; - int apu1, apu2; + int err, apu1, apu2; apu1 = snd_es1968_alloc_apu_pair(chip, ESM_APU_PCM_CAPTURE); if (apu1 < 0) @@ -1605,7 +1605,9 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream) runtime->hw = snd_es1968_capture; runtime->hw.buffer_bytes_max = runtime->hw.period_bytes_max = calc_available_memory_size(chip) - 1024; /* keep MIXBUF size */ - snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES); + err = snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES); + if (err < 0) + return err; spin_lock_irq(&chip->substream_lock); list_add(&es->list, &chip->substream_list); From 266c67b5c5df1d6f75093fcc56d1aa612cbfc663 Mon Sep 17 00:00:00 2001 From: Nicolas Chauvet Date: Thu, 15 May 2025 12:21:32 +0200 Subject: [PATCH 028/353] ALSA: usb-audio: Add sample rate quirk for Microdia JP001 USB Camera Microdia JP001 does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x84". This patch adds the USB ID to quirks.c and avoids those error messages. usb 7-4: New USB device found, idVendor=0c45, idProduct=636b, bcdDevice= 1.00 usb 7-4: New USB device strings: Mfr=2, Product=1, SerialNumber=3 usb 7-4: Product: JP001 usb 7-4: Manufacturer: JP001 usb 7-4: SerialNumber: JP001 usb 7-4: 3:1: cannot get freq at ep 0x84 Cc: Signed-off-by: Nicolas Chauvet Link: https://patch.msgid.link/20250515102132.73062-1-kwizart@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3164c54a79ff39..5ed523b13fadba 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2243,6 +2243,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), + DEVICE_FLG(0x0c45, 0x636b, /* Microdia JP001 USB Camera */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0d8c, 0x0014, /* USB Audio Device */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ From d2532bc87ad2780fbe4b55f591c83a0ce41c77bf Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 15 May 2025 17:28:32 +0100 Subject: [PATCH 029/353] ALSA: hda/realtek: Add support for Acer Helios Laptops using CS35L41 HDA Laptops use 2 CS35L41 Amps with HDA, using External boost with I2C. Similar to previous Acer laptops, these laptops also need the ALC255_FIXUP_PREDATOR_SUBWOOFER quirk to function properly. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20250515162848.405055-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 61c4bcb2087341..f6932f8c987a01 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8026,6 +8026,7 @@ enum { ALC283_FIXUP_DELL_HP_RESUME, ALC294_FIXUP_ASUS_CS35L41_SPI_2, ALC274_FIXUP_HP_AIO_BIND_DACS, + ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9298,6 +9299,12 @@ static const struct hda_fixup alc269_fixups[] = { { } } }, + [ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l41_fixup_i2c_two, + .chained = true, + .chain_id = ALC255_FIXUP_PREDATOR_SUBWOOFER + }, [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -10453,6 +10460,9 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x1534, "Acer Predator PH315-54", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x159c, "Acer Nitro 5 AN515-58", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x169a, "Acer Swift SFG16", ALC256_FIXUP_ACER_SFG16_MICMUTE_LED), + SND_PCI_QUIRK(0x1025, 0x1826, "Acer Helios ZPC", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1025, 0x182c, "Acer Helios ZPD", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1025, 0x1844, "Acer Helios ZPS", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x053c, "Dell Latitude E5430", ALC292_FIXUP_DELL_E7X), SND_PCI_QUIRK(0x1028, 0x054b, "Dell XPS one 2710", ALC275_FIXUP_DELL_XPS), From f19304b6f28312d71b74ed1b0d4d38979080acd5 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 15 May 2025 17:28:33 +0100 Subject: [PATCH 030/353] ALSA: hda: cs35l41: Fix swapped l/r audio channels for Acer Helios laptops Fixes audio channel assignment from ACPI using configuration table. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20250515162848.405055-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda_property.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 61d2314834e7b1..d8249d997c2a0b 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -31,6 +31,9 @@ struct cs35l41_config { }; static const struct cs35l41_config cs35l41_config_table[] = { + { "10251826", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, + { "1025182C", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, + { "10251844", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, { "10280B27", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, { "10280B28", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, { "10280BEB", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, @@ -452,6 +455,9 @@ struct cs35l41_prop_model { static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CLSA0100", NULL, lenovo_legion_no_acpi }, { "CLSA0101", NULL, lenovo_legion_no_acpi }, + { "CSC3551", "10251826", generic_dsd_config }, + { "CSC3551", "1025182C", generic_dsd_config }, + { "CSC3551", "10251844", generic_dsd_config }, { "CSC3551", "10280B27", generic_dsd_config }, { "CSC3551", "10280B28", generic_dsd_config }, { "CSC3551", "10280BEB", generic_dsd_config }, From e147c53990d24fcbcc165b0d2f0c1f836958d1d2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 19 May 2025 12:56:25 +0200 Subject: [PATCH 031/353] ALSA: n64: Replace deprecated strcpy() with strscpy() strcpy() is deprecated; use strscpy() instead. No functional changes intended. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250519105628.491675-1-thorsten.blum@linux.dev Signed-off-by: Takashi Iwai --- sound/mips/snd-n64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sound/mips/snd-n64.c b/sound/mips/snd-n64.c index bff6d85b8fe267..e1b2ff65d8508a 100644 --- a/sound/mips/snd-n64.c +++ b/sound/mips/snd-n64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -327,14 +328,14 @@ static int __init n64audio_probe(struct platform_device *pdev) goto fail_dma_alloc; pcm->private_data = priv; - strcpy(pcm->name, "N64 Audio"); + strscpy(pcm->name, "N64 Audio"); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &n64audio_pcm_ops); snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_VMALLOC, card->dev, 0, 0); - strcpy(card->driver, "N64 Audio"); - strcpy(card->shortname, "N64 Audio"); - strcpy(card->longname, "N64 Audio"); + strscpy(card->driver, "N64 Audio"); + strscpy(card->shortname, "N64 Audio"); + strscpy(card->longname, "N64 Audio"); irq = platform_get_irq(pdev, 0); if (irq < 0) { From e018e5dc363ed0e3e5527c9c24d41f45d07a292e Mon Sep 17 00:00:00 2001 From: Siddarth Gundu Date: Tue, 20 May 2025 01:18:33 +0530 Subject: [PATCH 032/353] ALSA: dbri: replace strcpy() with strscpy() strcpy() is deprecated; use strscpy() instead. Both the destination and source buffer are of fixed length so strscpy with 2-arguments is used. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Siddarth Gundu Link: https://patch.msgid.link/20250519194833.106463-1-siddarthsgml@gmail.com Signed-off-by: Takashi Iwai --- sound/sparc/dbri.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c index 69f1c9e37f4b97..93cbe158009f20 100644 --- a/sound/sparc/dbri.c +++ b/sound/sparc/dbri.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -2239,7 +2240,7 @@ static int snd_dbri_pcm(struct snd_card *card) pcm->private_data = card->private_data; pcm->info_flags = 0; - strcpy(pcm->name, card->shortname); + strscpy(pcm->name, card->shortname); snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_CONTINUOUS, NULL, 64 * 1024, 64 * 1024); @@ -2446,7 +2447,7 @@ static int snd_dbri_mixer(struct snd_card *card) return -EINVAL; dbri = card->private_data; - strcpy(card->mixername, card->shortname); + strscpy(card->mixername, card->shortname); for (idx = 0; idx < ARRAY_SIZE(dbri_controls); idx++) { err = snd_ctl_add(card, @@ -2613,8 +2614,8 @@ static int dbri_probe(struct platform_device *op) if (err < 0) return err; - strcpy(card->driver, "DBRI"); - strcpy(card->shortname, "Sun DBRI"); + strscpy(card->driver, "DBRI"); + strscpy(card->shortname, "Sun DBRI"); rp = &op->resource[0]; sprintf(card->longname, "%s at 0x%02lx:0x%016llx, irq %d", card->shortname, From 0b5b32c086a723d6d220e7b86bc1ea9faa7d465c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 19 May 2025 23:20:30 +0200 Subject: [PATCH 033/353] ALSA: usb-audio: Kill timer properly at removal The USB-audio MIDI code initializes the timer, but in a rare case, the driver might be freed without the disconnect call. This leaves the timer in an active state while the assigned object is released via snd_usbmidi_free(), which ends up with a kernel warning when the debug configuration is enabled, as spotted by fuzzer. For avoiding the problem, put timer_shutdown_sync() at snd_usbmidi_free(), so that the timer can be killed properly. While we're at it, replace the existing timer_delete_sync() at the disconnect callback with timer_shutdown_sync(), too. Reported-by: syzbot+d8f72178ab6783a7daea@syzkaller.appspotmail.com Closes: https://lore.kernel.org/681c70d7.050a0220.a19a9.00c6.GAE@google.com Cc: Link: https://patch.msgid.link/20250519212031.14436-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 9ab746ba367d5f..3a8a977ed359c9 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1530,6 +1530,7 @@ static void snd_usbmidi_free(struct snd_usb_midi *umidi) snd_usbmidi_in_endpoint_delete(ep->in); } mutex_destroy(&umidi->mutex); + timer_shutdown_sync(&umidi->error_timer); kfree(umidi); } @@ -1553,7 +1554,7 @@ void snd_usbmidi_disconnect(struct list_head *p) spin_unlock_irq(&umidi->disc_lock); up_write(&umidi->disc_rwsem); - timer_delete_sync(&umidi->error_timer); + timer_shutdown_sync(&umidi->error_timer); for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; From 28f0f0cc993bc8f1145d51ecd3b44a03c1a1732d Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 29 Apr 2025 11:18:07 +0100 Subject: [PATCH 034/353] soundwire: bus: Simplify sdw_assign_device_num() Simplify the code in sdw_assign_device_num(). Remove the new_device flag which can be simply handled inline and do a bit less shuffling of dev_num in and out of various variables. This patch should cause no functional changes. Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20250429101808.348462-2-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 6f8a20014e76d4..6cfec4a2af33a3 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -750,41 +750,36 @@ static int sdw_get_device_num(struct sdw_slave *slave) static int sdw_assign_device_num(struct sdw_slave *slave) { struct sdw_bus *bus = slave->bus; - int ret, dev_num; - bool new_device = false; + struct device *dev = bus->dev; + int ret; /* check first if device number is assigned, if so reuse that */ if (!slave->dev_num) { if (!slave->dev_num_sticky) { + int dev_num; + mutex_lock(&slave->bus->bus_lock); dev_num = sdw_get_device_num(slave); mutex_unlock(&slave->bus->bus_lock); if (dev_num < 0) { - dev_err(bus->dev, "Get dev_num failed: %d\n", - dev_num); + dev_err(dev, "Get dev_num failed: %d\n", dev_num); return dev_num; } - slave->dev_num = dev_num; + slave->dev_num_sticky = dev_num; - new_device = true; } else { - slave->dev_num = slave->dev_num_sticky; + dev_dbg(dev, "Slave already registered, reusing dev_num: %d\n", + slave->dev_num_sticky); } } - if (!new_device) - dev_dbg(bus->dev, - "Slave already registered, reusing dev_num:%d\n", - slave->dev_num); - /* Clear the slave->dev_num to transfer message on device 0 */ - dev_num = slave->dev_num; slave->dev_num = 0; - ret = sdw_write_no_pm(slave, SDW_SCP_DEVNUMBER, dev_num); + ret = sdw_write_no_pm(slave, SDW_SCP_DEVNUMBER, slave->dev_num_sticky); if (ret < 0) { - dev_err(bus->dev, "Program device_num %d failed: %d\n", - dev_num, ret); + dev_err(dev, "Program device_num %d failed: %d\n", + slave->dev_num_sticky, ret); return ret; } @@ -792,7 +787,7 @@ static int sdw_assign_device_num(struct sdw_slave *slave) slave->dev_num = slave->dev_num_sticky; if (bus->ops && bus->ops->new_peripheral_assigned) - bus->ops->new_peripheral_assigned(bus, slave, dev_num); + bus->ops->new_peripheral_assigned(bus, slave, slave->dev_num); return 0; } From e9336ced769387b130567d18a6e608d7d6f3bdac Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 29 Apr 2025 11:18:08 +0100 Subject: [PATCH 035/353] soundwire: bus: Add internal slave ID and use for IRQs Currently the SoundWire IRQ code uses the dev_num to create an IRQ mapping for each slave. However, there is an issue there, the dev_num is only allocated when the slave enumerates on the bus and enumeration may happen before or after probe of the slave driver. In the case enumeration happens after probe of the slave driver then the IRQ mapping will use dev_num before it is set. This could cause multiple slaves to use zero as their IRQ mapping. It is very desirable to have the IRQ mapped before the slave probe is called, so drivers can do resource allocation in probe as normal. To solve these issues add an internal ID created for each slave when it is probed and use that for mapping the IRQ. Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20250429101808.348462-3-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 2 ++ drivers/soundwire/bus_type.c | 10 ++++++++++ drivers/soundwire/irq.c | 6 +++--- include/linux/soundwire/sdw.h | 6 ++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 6cfec4a2af33a3..377ab21c9cf0fb 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -56,6 +56,8 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, return ret; } + ida_init(&bus->slave_ida); + ret = sdw_master_device_add(bus, parent, fwnode); if (ret < 0) { dev_err(parent, "Failed to add master device at link %d\n", diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c index e98d5db81b1ced..75d6f16efcedb4 100644 --- a/drivers/soundwire/bus_type.c +++ b/drivers/soundwire/bus_type.c @@ -105,9 +105,17 @@ static int sdw_drv_probe(struct device *dev) if (ret) return ret; + ret = ida_alloc_max(&slave->bus->slave_ida, SDW_FW_MAX_DEVICES, GFP_KERNEL); + if (ret < 0) { + dev_err(dev, "Failed to allocated ID: %d\n", ret); + return ret; + } + slave->index = ret; + ret = drv->probe(slave, id); if (ret) { dev_pm_domain_detach(dev, false); + ida_free(&slave->bus->slave_ida, slave->index); return ret; } @@ -174,6 +182,8 @@ static int sdw_drv_remove(struct device *dev) dev_pm_domain_detach(dev, false); + ida_free(&slave->bus->slave_ida, slave->index); + return ret; } diff --git a/drivers/soundwire/irq.c b/drivers/soundwire/irq.c index c237e6d0766b33..f18be37efef8e0 100644 --- a/drivers/soundwire/irq.c +++ b/drivers/soundwire/irq.c @@ -31,7 +31,7 @@ int sdw_irq_create(struct sdw_bus *bus, { bus->irq_chip.name = dev_name(bus->dev); - bus->domain = irq_domain_create_linear(fwnode, SDW_MAX_DEVICES, + bus->domain = irq_domain_create_linear(fwnode, SDW_FW_MAX_DEVICES, &sdw_domain_ops, bus); if (!bus->domain) { dev_err(bus->dev, "Failed to add IRQ domain\n"); @@ -50,12 +50,12 @@ static void sdw_irq_dispose_mapping(void *data) { struct sdw_slave *slave = data; - irq_dispose_mapping(irq_find_mapping(slave->bus->domain, slave->dev_num)); + irq_dispose_mapping(slave->irq); } void sdw_irq_create_mapping(struct sdw_slave *slave) { - slave->irq = irq_create_mapping(slave->bus->domain, slave->dev_num); + slave->irq = irq_create_mapping(slave->bus->domain, slave->index); if (!slave->irq) dev_warn(&slave->dev, "Failed to map IRQ\n"); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2362f621d94c26..0832776262ac3b 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ struct sdw_slave; #define SDW_FRAME_CTRL_BITS 48 #define SDW_MAX_DEVICES 11 +#define SDW_FW_MAX_DEVICES 16 #define SDW_MAX_PORTS 15 #define SDW_VALID_PORT_RANGE(n) ((n) < SDW_MAX_PORTS && (n) >= 1) @@ -630,6 +632,7 @@ struct sdw_slave_ops { * struct sdw_slave - SoundWire Slave * @id: MIPI device ID * @dev: Linux device + * @index: internal ID for this slave * @irq: IRQ number * @status: Status reported by the Slave * @bus: Bus handle @@ -661,6 +664,7 @@ struct sdw_slave_ops { struct sdw_slave { struct sdw_slave_id id; struct device dev; + int index; int irq; enum sdw_slave_status status; struct sdw_bus *bus; @@ -968,6 +972,7 @@ struct sdw_stream_runtime { * @md: Master device * @bus_lock_key: bus lock key associated to @bus_lock * @bus_lock: bus lock + * @slave_ida: IDA for allocating internal slave IDs * @slaves: list of Slaves on this bus * @msg_lock_key: message lock key associated to @msg_lock * @msg_lock: message lock @@ -1010,6 +1015,7 @@ struct sdw_bus { struct sdw_master_device *md; struct lock_class_key bus_lock_key; struct mutex bus_lock; + struct ida slave_ida; struct list_head slaves; struct lock_class_key msg_lock_key; struct mutex msg_lock; From 5730f3d1ac26ede14acf2e64b19fdc7717a2f3dc Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 036/353] x86/bpf: Call branch history clearing sequence on exit Classic BPF programs have been identified as potential vectors for intra-mode Branch Target Injection (BTI) attacks. Classic BPF programs can be run by unprivileged users. They allow unprivileged code to execute inside the kernel. Attackers can use unprivileged cBPF to craft branch history in kernel mode that can influence the target of indirect branches. Introduce a branch history buffer (BHB) clearing sequence during the JIT compilation of classic BPF programs. The clearing sequence is the same as is used in previous mitigations to protect syscalls. Since eBPF programs already have their own mitigations in place, only insert the call on classic programs that aren't run by privileged users. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Daniel Borkmann Reviewed-by: Alexandre Chartre --- arch/x86/net/bpf_jit_comp.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f08..6fb786c8b2aa38 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1502,6 +1502,30 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr) #define PRIV_STACK_GUARD_SZ 8 #define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL +static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + struct bpf_prog *bpf_prog) +{ + u8 *prog = *pprog; + u8 *func; + + if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { + /* The clearing sequence clobbers eax and ecx. */ + EMIT1(0x50); /* push rax */ + EMIT1(0x51); /* push rcx */ + ip += 2; + + func = (u8 *)clear_bhb_loop; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + + if (emit_call(&prog, func, ip)) + return -EINVAL; + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } + *pprog = prog; + return 0; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -2544,6 +2568,13 @@ st: if (is_imm8(insn->off)) seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; + if (bpf_prog_was_classic(bpf_prog) && + !capable(CAP_SYS_ADMIN)) { + u8 *ip = image + addrs[i - 1]; + + if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) + return -EINVAL; + } if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); From 7e4ee28a37929c144f2a113992f4de1c9b5e6fe4 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 037/353] x86/bpf: Add IBHF call at end of classic BPF Classic BPF programs can be run by unprivileged users, allowing unprivileged code to execute inside the kernel. Attackers can use this to craft branch history in kernel mode that can influence the target of indirect branches. BHI_DIS_S provides user-kernel isolation of branch history, but cBPF can be used to bypass this protection by crafting branch history in kernel mode. To stop intra-mode attacks via cBPF programs, Intel created a new instruction Indirect Branch History Fence (IBHF). IBHF prevents the predicted targets of subsequent indirect branches from being influenced by branch history prior to the IBHF. IBHF is only effective while BHI_DIS_S is enabled. Add the IBHF instruction to cBPF jitted code's exit path. Add the new fence when the hardware mitigation is enabled (i.e., X86_FEATURE_CLEAR_BHB_HW is set) or after the software sequence (X86_FEATURE_CLEAR_BHB_LOOP) is being used in a virtual machine. Note that X86_FEATURE_CLEAR_BHB_HW and X86_FEATURE_CLEAR_BHB_LOOP are mutually exclusive, so the JIT compiler will only emit the new fence, not the SW sequence, when X86_FEATURE_CLEAR_BHB_HW is set. Hardware that enumerates BHI_NO basically has BHI_DIS_S protections always enabled, regardless of the value of BHI_DIS_S. Since BHI_DIS_S doesn't protect against intra-mode attacks, enumerate BHI bug on BHI_NO hardware as well. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Daniel Borkmann Reviewed-by: Alexandre Chartre --- arch/x86/kernel/cpu/common.c | 9 ++++++--- arch/x86/net/bpf_jit_comp.c | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12126adbc3a9a7..5ab13d9241c0ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1439,9 +1439,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6fb786c8b2aa38..e472572392ef6c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -41,6 +41,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT5(b1, b2, b3, b4, b5) \ + do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0) #define EMIT1_off32(b1, off) \ do { EMIT1(b1); EMIT(off, 4); } while (0) @@ -1522,6 +1524,23 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, EMIT1(0x59); /* pop rcx */ EMIT1(0x58); /* pop rax */ } + /* Insert IBHF instruction */ + if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && + cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || + (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW) && + IS_ENABLED(CONFIG_X86_64))) { + /* + * Add an Indirect Branch History Fence (IBHF). IBHF acts as a + * fence preventing branch history from before the fence from + * affecting indirect branches after the fence. This is + * specifically used in cBPF jitted code to prevent Intra-mode + * BHI attacks. The IBHF instruction is designed to be a NOP on + * hardware that doesn't need or support it. The REP and REX.W + * prefixes are required by the microcode, and they also ensure + * that the NOP is unlikely to be used in existing code. + */ + EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ + } *pprog = prog; return 0; } From 20f25403809e2393b6346cf4f4999d67787f0e04 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 038/353] x86/bhi: Do not set BHI_DIS_S in 32-bit mode With the possibility of intra-mode BHI via cBPF, complete mitigation for BHI is to use IBHF (history fence) instruction with BHI_DIS_S set. Since this new instruction is only available in 64-bit mode, setting BHI_DIS_S in 32-bit mode is only a partial mitigation. Do not set BHI_DIS_S in 32-bit mode so as to avoid reporting misleading mitigated status. With this change IBHF won't be used in 32-bit mode, also remove the CONFIG_X86_64 check from emit_spectre_bhb_barrier(). Suggested-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- arch/x86/kernel/cpu/bugs.c | 6 +++--- arch/x86/net/bpf_jit_comp.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 362602b705cc43..f219f0f4f2d11d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1697,11 +1697,11 @@ static void __init bhi_select_mitigation(void) return; } - /* Mitigate in hardware if supported */ - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e472572392ef6c..8a0fabb850b772 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1527,8 +1527,7 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, /* Insert IBHF instruction */ if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || - (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW) && - IS_ENABLED(CONFIG_X86_64))) { + cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { /* * Add an Indirect Branch History Fence (IBHF). IBHF acts as a * fence preventing branch history from before the fence from @@ -1538,6 +1537,8 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, * hardware that doesn't need or support it. The REP and REX.W * prefixes are required by the microcode, and they also ensure * that the NOP is unlikely to be used in existing code. + * + * IBHF is not a valid instruction in 32-bit mode. */ EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ } From 41bd98b936201872514d4a7358384f11df5574c5 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 11 Apr 2025 15:36:38 -0700 Subject: [PATCH 039/353] Documentation: x86/bugs/its: Add ITS documentation Add the admin-guide for Indirect Target Selection (ITS). Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../hw-vuln/indirect-target-selection.rst | 168 ++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/indirect-target-selection.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 451874b8135d8e..ce296b8430fc98 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -23,3 +23,4 @@ are configurable at compile, boot or run time. gather_data_sampling reg-file-data-sampling rsb + indirect-target-selection diff --git a/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst b/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst new file mode 100644 index 00000000000000..d9ca64108d2332 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst @@ -0,0 +1,168 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Indirect Target Selection (ITS) +=============================== + +ITS is a vulnerability in some Intel CPUs that support Enhanced IBRS and were +released before Alder Lake. ITS may allow an attacker to control the prediction +of indirect branches and RETs located in the lower half of a cacheline. + +ITS is assigned CVE-2024-28956 with a CVSS score of 4.7 (Medium). + +Scope of Impact +--------------- +- **eIBRS Guest/Host Isolation**: Indirect branches in KVM/kernel may still be + predicted with unintended target corresponding to a branch in the guest. + +- **Intra-Mode BTI**: In-kernel training such as through cBPF or other native + gadgets. + +- **Indirect Branch Prediction Barrier (IBPB)**: After an IBPB, indirect + branches may still be predicted with targets corresponding to direct branches + executed prior to the IBPB. This is fixed by the IPU 2025.1 microcode, which + should be available via distro updates. Alternatively microcode can be + obtained from Intel's github repository [#f1]_. + +Affected CPUs +------------- +Below is the list of ITS affected CPUs [#f2]_ [#f3]_: + + ======================== ============ ==================== =============== + Common name Family_Model eIBRS Intra-mode BTI + Guest/Host Isolation + ======================== ============ ==================== =============== + SKYLAKE_X (step >= 6) 06_55H Affected Affected + ICELAKE_X 06_6AH Not affected Affected + ICELAKE_D 06_6CH Not affected Affected + ICELAKE_L 06_7EH Not affected Affected + TIGERLAKE_L 06_8CH Not affected Affected + TIGERLAKE 06_8DH Not affected Affected + KABYLAKE_L (step >= 12) 06_8EH Affected Affected + KABYLAKE (step >= 13) 06_9EH Affected Affected + COMETLAKE 06_A5H Affected Affected + COMETLAKE_L 06_A6H Affected Affected + ROCKETLAKE 06_A7H Not affected Affected + ======================== ============ ==================== =============== + +- All affected CPUs enumerate Enhanced IBRS feature. +- IBPB isolation is affected on all ITS affected CPUs, and need a microcode + update for mitigation. +- None of the affected CPUs enumerate BHI_CTRL which was introduced in Golden + Cove (Alder Lake and Sapphire Rapids). This can help guests to determine the + host's affected status. +- Intel Atom CPUs are not affected by ITS. + +Mitigation +---------- +As only the indirect branches and RETs that have their last byte of instruction +in the lower half of the cacheline are vulnerable to ITS, the basic idea behind +the mitigation is to not allow indirect branches in the lower half. + +This is achieved by relying on existing retpoline support in the kernel, and in +compilers. ITS-vulnerable retpoline sites are runtime patched to point to newly +added ITS-safe thunks. These safe thunks consists of indirect branch in the +second half of the cacheline. Not all retpoline sites are patched to thunks, if +a retpoline site is evaluated to be ITS-safe, it is replaced with an inline +indirect branch. + +Dynamic thunks +~~~~~~~~~~~~~~ +From a dynamically allocated pool of safe-thunks, each vulnerable site is +replaced with a new thunk, such that they get a unique address. This could +improve the branch prediction accuracy. Also, it is a defense-in-depth measure +against aliasing. + +Note, for simplicity, indirect branches in eBPF programs are always replaced +with a jump to a static thunk in __x86_indirect_its_thunk_array. If required, +in future this can be changed to use dynamic thunks. + +All vulnerable RETs are replaced with a static thunk, they do not use dynamic +thunks. This is because RETs get their prediction from RSB mostly that does not +depend on source address. RETs that underflow RSB may benefit from dynamic +thunks. But, RETs significantly outnumber indirect branches, and any benefit +from a unique source address could be outweighed by the increased icache +footprint and iTLB pressure. + +Retpoline +~~~~~~~~~ +Retpoline sequence also mitigates ITS-unsafe indirect branches. For this +reason, when retpoline is enabled, ITS mitigation only relocates the RETs to +safe thunks. Unless user requested the RSB-stuffing mitigation. + +RSB Stuffing +~~~~~~~~~~~~ +RSB-stuffing via Call Depth Tracking is a mitigation for Retbleed RSB-underflow +attacks. And it also mitigates RETs that are vulnerable to ITS. + +Mitigation in guests +^^^^^^^^^^^^^^^^^^^^ +All guests deploy ITS mitigation by default, irrespective of eIBRS enumeration +and Family/Model of the guest. This is because eIBRS feature could be hidden +from a guest. One exception to this is when a guest enumerates BHI_DIS_S, which +indicates that the guest is running on an unaffected host. + +To prevent guests from unnecessarily deploying the mitigation on unaffected +platforms, Intel has defined ITS_NO bit(62) in MSR IA32_ARCH_CAPABILITIES. When +a guest sees this bit set, it should not enumerate the ITS bug. Note, this bit +is not set by any hardware, but is **intended for VMMs to synthesize** it for +guests as per the host's affected status. + +Mitigation options +^^^^^^^^^^^^^^^^^^ +The ITS mitigation can be controlled using the "indirect_target_selection" +kernel parameter. The available options are: + + ======== =================================================================== + on (default) Deploy the "Aligned branch/return thunks" mitigation. + If spectre_v2 mitigation enables retpoline, aligned-thunks are only + deployed for the affected RET instructions. Retpoline mitigates + indirect branches. + + off Disable ITS mitigation. + + vmexit Equivalent to "=on" if the CPU is affected by guest/host isolation + part of ITS. Otherwise, mitigation is not deployed. This option is + useful when host userspace is not in the threat model, and only + attacks from guest to host are considered. + + stuff Deploy RSB-fill mitigation when retpoline is also deployed. + Otherwise, deploy the default mitigation. When retpoline mitigation + is enabled, RSB-stuffing via Call-Depth-Tracking also mitigates + ITS. + + force Force the ITS bug and deploy the default mitigation. + ======== =================================================================== + +Sysfs reporting +--------------- + +The sysfs file showing ITS mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/indirect_target_selection + +Note, microcode mitigation status is not reported in this file. + +The possible values in this file are: + +.. list-table:: + + * - Not affected + - The processor is not vulnerable. + * - Vulnerable + - System is vulnerable and no mitigation has been applied. + * - Vulnerable, KVM: Not affected + - System is vulnerable to intra-mode BTI, but not affected by eIBRS + guest/host isolation. + * - Mitigation: Aligned branch/return thunks + - The mitigation is enabled, affected indirect branches and RETs are + relocated to safe thunks. + * - Mitigation: Retpolines, Stuffing RSB + - The mitigation is enabled using retpoline and RSB stuffing. + +References +---------- +.. [#f1] Microcode repository - https://github.com/intel/Intel-Linux-Processor-Microcode-Data-Files + +.. [#f2] Affected Processors list - https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html + +.. [#f3] Affected Processors list (machine readable) - https://github.com/intel/Intel-affected-processor-list From 500a5758288ba44d4fdf550d20266170183659bd Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 17:40:41 -0700 Subject: [PATCH 040/353] x86/its: Enumerate Indirect Target Selection (ITS) bug ITS bug in some pre-Alderlake Intel CPUs may allow indirect branches in the first half of a cache line get predicted to a target of a branch located in the second half of the cache line. Set X86_BUG_ITS on affected CPUs. Mitigation to follow in later commits. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 8 +++++ arch/x86/kernel/cpu/common.c | 58 +++++++++++++++++++++++------- arch/x86/kvm/x86.c | 4 ++- 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67b9..2ff978fa414b5a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -533,4 +533,5 @@ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ +#define X86_BUG_ITS X86_BUG(1*32 + 6) /* "its" CPU is affected by Indirect Target Selection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d50..e7d2f460fcc699 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -211,6 +211,14 @@ * VERW clears CPU Register * File. */ +#define ARCH_CAP_ITS_NO BIT_ULL(62) /* + * Not susceptible to + * Indirect Target Selection. + * This bit is not set by + * HW, but is synthesized by + * VMMs for guests to know + * their affected status. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5ab13d9241c0ad..619cfdb73760d0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1227,6 +1227,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1238,22 +1240,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), @@ -1318,6 +1323,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); @@ -1452,6 +1483,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9896fd574bfc9d..be7bb6d20129db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1584,7 +1584,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1618,6 +1618,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* From e5c0ed755b32351a2413baa1611951bb650cb83e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 21:17:21 -0700 Subject: [PATCH 041/353] x86/its: Add support for ITS-safe indirect thunk Due to ITS, indirect branches in the lower half of a cacheline may be vulnerable to branch target injection attack. Introduce ITS-safe thunks to patch indirect branches in the lower half of cacheline with the thunk. Also thunk any eBPF generated indirect branches in emit_indirect_jump(). Below category of indirect branches are not mitigated: - Indirect branches in the .init section are not mitigated because they are discarded after boot. - Indirect branches that are explicitly marked retpoline-safe. Note that retpoline also mitigates the indirect branches against ITS. This is because the retpoline sequence fills an RSB entry before RET, and it does not suffer from RSB-underflow part of the ITS. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- arch/x86/Kconfig | 11 +++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 4 +++ arch/x86/kernel/alternative.c | 45 ++++++++++++++++++++++++++-- arch/x86/kernel/vmlinux.lds.S | 6 ++++ arch/x86/lib/retpoline.S | 28 +++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 5 +++- 7 files changed, 96 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c0d36e3326aabb..cb28dea201cca2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2712,6 +2712,17 @@ config MITIGATION_SSB of speculative execution in a similar way to the Meltdown and Spectre security vulnerabilities. +config MITIGATION_ITS + bool "Enable Indirect Target Selection mitigation" + depends on CPU_SUP_INTEL && X86_64 + depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK + default y + help + Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in + BPU on some Intel CPUs that may allow Spectre V2 style attacks. If + disabled, mitigation cannot be enabled via cmdline. + See + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2ff978fa414b5a..03af8191073a21 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -481,6 +481,7 @@ #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ #define X86_FEATURE_PREFER_YMM (21*32 + 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 9) /* Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5c43f145454ddd..a5eb0cd93c9273 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -336,10 +336,14 @@ #else /* __ASSEMBLER__ */ +#define ITS_THUNK_SIZE 64 + typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +typedef u8 its_thunk_t[ITS_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; +extern its_thunk_t __x86_indirect_its_thunk_array[]; #ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d69055..333c0295b0a522 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -581,7 +581,8 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } -static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; @@ -602,7 +603,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_call_thunk_array[reg], + call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; @@ -610,7 +611,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_jump_thunk_array[reg], + jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; @@ -625,6 +626,35 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 return i; } +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_call_thunk_array[reg], + __x86_indirect_jump_thunk_array[reg]); +} + +#ifdef CONFIG_MITIGATION_ITS +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_its_thunk_array[reg], + __x86_indirect_its_thunk_array[reg]); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#endif + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -699,6 +729,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index aa4d0221583c23..574b4e7aeac78a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -505,6 +505,12 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); "SRSO function pair won't alias"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline"); +. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart"); +. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); +#endif + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a26c43abd47d8d..a06891892853f9 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -367,6 +367,34 @@ SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32, 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(__x86_indirect_its_thunk_array) + +#define GEN(reg) ITS_THUNK reg +#include +#undef GEN + + .align 64, 0xcc +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif + /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8a0fabb850b772..15a87bc505d3a8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -663,7 +663,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { + OPTIMIZER_HIDE_VAR(reg); + emit_jump(&prog, &__x86_indirect_its_thunk_array[reg], ip); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { From 0b368bc8b74775c689e1362ab4c9f0fbe7e55f5a Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 21:17:21 -0700 Subject: [PATCH 042/353] x86/its: Add support for ITS-safe return thunk RETs in the lower half of cacheline may be affected by ITS bug, specifically when the RSB-underflows. Use ITS-safe return thunk for such RETs. RETs that are not patched: - RET in retpoline sequence does not need to be patched, because the sequence itself fills an RSB before RET. - RET in Call Depth Tracking (CDT) thunks __x86_indirect_{call|jump}_thunk and call_depth_return_thunk are not patched because CDT by design prevents RSB-underflow. - RETs in .init section are not reachable after init. - RETs that are explicitly marked safe with ANNOTATE_UNRET_SAFE. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- arch/x86/include/asm/alternative.h | 14 ++++++++++++++ arch/x86/include/asm/nospec-branch.h | 6 ++++++ arch/x86/kernel/alternative.c | 19 +++++++++++++++++-- arch/x86/kernel/ftrace.c | 2 +- arch/x86/kernel/static_call.c | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 4 ++++ arch/x86/lib/retpoline.S | 13 ++++++++++++- arch/x86/net/bpf_jit_comp.c | 2 +- 8 files changed, 57 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fdfa..4037c611537c85 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -124,6 +124,20 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, } #endif +#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) +extern bool cpu_wants_rethunk(void); +extern bool cpu_wants_rethunk_at(void *addr); +#else +static __always_inline bool cpu_wants_rethunk(void) +{ + return false; +} +static __always_inline bool cpu_wants_rethunk_at(void *addr) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index a5eb0cd93c9273..7d04ade3354115 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -367,6 +367,12 @@ static inline void srso_return_thunk(void) {} static inline void srso_alias_return_thunk(void) {} #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_return_thunk(void); +#else +static inline void its_return_thunk(void) {} +#endif + extern void retbleed_return_thunk(void); extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 333c0295b0a522..3e67e72bffdc98 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -814,6 +814,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_MITIGATION_RETHUNK +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -830,7 +845,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { @@ -847,7 +862,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc77a..5eb1514af5593e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -354,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(ip)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e7764522..c3d7ff44b29adc 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, break; case RET: - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; @@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case JCC: if (!func) { func = __static_call_return; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) func = x86_return_thunk; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 574b4e7aeac78a..cda5f8362e9da5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -511,6 +511,10 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); . = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline"); +#endif + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a06891892853f9..ebca28fe7e3133 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -393,7 +393,18 @@ SYM_CODE_START(__x86_indirect_its_thunk_array) .align 64, 0xcc SYM_CODE_END(__x86_indirect_its_thunk_array) -#endif +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(its_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(its_return_thunk) +EXPORT_SYMBOL(its_return_thunk) + +#endif /* CONFIG_MITIGATION_ITS */ /* * This function name is magical and is used by -mfunction-return=thunk-extern diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 15a87bc505d3a8..34ba61770b56fe 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -688,7 +688,7 @@ static void emit_return(u8 **pprog, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk()) { emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ From fc8289060a837edd532789882b0c33ca8b00d714 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 20:23:23 -0700 Subject: [PATCH 043/353] x86/its: Enable Indirect Target Selection mitigation Indirect Target Selection (ITS) is a bug in some pre-ADL Intel CPUs with eIBRS. It affects prediction of indirect branch and RETs in the lower half of cacheline. Due to ITS such branches may get wrongly predicted to a target of (direct or indirect) branch that is located in the upper half of the cacheline. Scope of impact =============== Guest/host isolation -------------------- When eIBRS is used for guest/host isolation, the indirect branches in the VMM may still be predicted with targets corresponding to branches in the guest. Intra-mode ---------- cBPF or other native gadgets can be used for intra-mode training and disclosure using ITS. User/kernel isolation --------------------- When eIBRS is enabled user/kernel isolation is not impacted. Indirect Branch Prediction Barrier (IBPB) ----------------------------------------- After an IBPB, indirect branches may be predicted with targets corresponding to direct branches which were executed prior to IBPB. This is mitigated by a microcode update. Add cmdline parameter indirect_target_selection=off|on|force to control the mitigation to relocate the affected branches to an ITS-safe thunk i.e. located in the upper half of cacheline. Also add the sysfs reporting. When retpoline mitigation is deployed, ITS safe-thunks are not needed, because retpoline sequence is already ITS-safe. Similarly, when call depth tracking (CDT) mitigation is deployed (retbleed=stuff), ITS safe return thunk is not used, as CDT prevents RSB-underflow. To not overcomplicate things, ITS mitigation is not supported with spectre-v2 lfence;jmp mitigation. Moreover, it is less practical to deploy lfence;jmp mitigation on ITS affected parts anyways. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- .../ABI/testing/sysfs-devices-system-cpu | 1 + .../admin-guide/kernel-parameters.txt | 13 ++ arch/x86/kernel/cpu/bugs.c | 140 +++++++++++++++++- drivers/base/cpu.c | 3 + include/linux/cpu.h | 2 + 5 files changed, 155 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 6c4934d3f4db3e..badfc5127da5a7 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -516,6 +516,7 @@ Description: information about CPUs heterogeneity. What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/gather_data_sampling + /sys/devices/system/cpu/vulnerabilities/indirect_target_selection /sys/devices/system/cpu/vulnerabilities/itlb_multihit /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a110cbb37f2033..11ea266edaa8a7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2209,6 +2209,18 @@ different crypto accelerators. This option can be used to achieve best performance for particular HW. + indirect_target_selection= [X86,Intel] Mitigation control for Indirect + Target Selection(ITS) bug in Intel CPUs. Updated + microcode is also required for a fix in IBPB. + + on: Enable mitigation (default). + off: Disable mitigation. + force: Force the ITS bug and deploy default + mitigation. + + For details see: + Documentation/admin-guide/hw-vuln/indirect-target-selection.rst + init= [KNL] Format: Run specified binary instead of /sbin/init as init @@ -3700,6 +3712,7 @@ expose users to several CPU vulnerabilities. Equivalent to: if nokaslr then kpti=0 [ARM64] gather_data_sampling=off [X86] + indirect_target_selection=off [X86] kvm.nx_huge_pages=off [X86] l1tf=off [X86] mds=off [X86] diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f219f0f4f2d11d..6500a3ddd260df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -49,6 +49,7 @@ static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); static void __init gds_select_mitigation(void); +static void __init its_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -66,6 +67,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { @@ -178,6 +187,7 @@ void __init cpu_select_mitigations(void) */ srso_select_mitigation(); gds_select_mitigation(); + its_select_mitigation(); } /* @@ -1118,7 +1128,7 @@ static void __init retbleed_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1153,7 +1163,7 @@ static void __init retbleed_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1187,6 +1197,115 @@ static void __init retbleed_select_mitigation(void) pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +enum its_mitigation_cmd { + ITS_CMD_OFF, + ITS_CMD_ON, +}; + +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS; + +static enum its_mitigation_cmd its_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_cmd = ITS_CMD_OFF; + } else if (!strcmp(str, "on")) { + its_cmd = ITS_CMD_ON; + } else if (!strcmp(str, "force")) { + its_cmd = ITS_CMD_ON; + setup_force_cpu_bug(X86_BUG_ITS); + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + enum its_mitigation_cmd cmd = its_cmd; + + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + /* Retpoline+CDT mitigates ITS, bail out */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + boot_cpu_has(X86_FEATURE_CALL_DEPTH)) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + goto out; + } + + /* Exit early to avoid irrelevant warnings */ + if (cmd == ITS_CMD_OFF) { + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (spectre_v2_enabled == SPECTRE_V2_NONE) { + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + + switch (cmd) { + case ITS_CMD_OFF: + its_mitigation = ITS_MITIGATION_OFF; + break; + case ITS_CMD_ON: + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + } +out: + pr_info("%s\n", its_strings[its_mitigation]); +} + #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt @@ -2607,10 +2726,10 @@ static void __init srso_select_mitigation(void) if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; + set_return_thunk(srso_alias_return_thunk); } else { setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; + set_return_thunk(srso_return_thunk); } if (has_microcode) srso_mitigation = SRSO_MITIGATION_SAFE_RET; @@ -2800,6 +2919,11 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2982,6 +3106,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3061,6 +3188,11 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index a7e5118498758e..50651435577c8f 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -600,6 +600,7 @@ CPU_SHOW_VULN_FALLBACK(spec_rstack_overflow); CPU_SHOW_VULN_FALLBACK(gds); CPU_SHOW_VULN_FALLBACK(reg_file_data_sampling); CPU_SHOW_VULN_FALLBACK(ghostwrite); +CPU_SHOW_VULN_FALLBACK(indirect_target_selection); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -616,6 +617,7 @@ static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NU static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL); static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling, NULL); static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); +static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -633,6 +635,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_gather_data_sampling.attr, &dev_attr_reg_file_data_sampling.attr, &dev_attr_ghostwrite.attr, + &dev_attr_indirect_target_selection.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e3049543008b9c..3aa955102b349a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,6 +78,8 @@ extern ssize_t cpu_show_gds(struct device *dev, extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_indirect_target_selection(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, From bd3f9970e06c2c3399a7797ea90ca520be72647e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 18 Nov 2024 09:53:12 -0800 Subject: [PATCH 044/353] x86/its: Add "vmexit" option to skip mitigation on some CPUs Ice Lake generation CPUs are not affected by guest/host isolation part of ITS. If a user is only concerned about KVM guests, they can now choose a new cmdline option "vmexit" that will not deploy the ITS mitigation when CPU is not affected by guest/host isolation. This saves the performance overhead of ITS mitigation on Ice Lake gen CPUs. When "vmexit" option selected, if the CPU is affected by ITS guest/host isolation, the default ITS mitigation is deployed. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 11 +++++++++++ arch/x86/kernel/cpu/common.c | 19 ++++++++++++------- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 11ea266edaa8a7..05ddffa02f9dcb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2217,6 +2217,8 @@ off: Disable mitigation. force: Force the ITS bug and deploy default mitigation. + vmexit: Only deploy mitigation if CPU is affected by + guest/host isolation part of ITS. For details see: Documentation/admin-guide/hw-vuln/indirect-target-selection.rst diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 03af8191073a21..39e61212ac9a91 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -535,4 +535,5 @@ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ #define X86_BUG_ITS X86_BUG(1*32 + 6) /* "its" CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG(1*32 + 7) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6500a3ddd260df..0dd885c0b609d4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1203,16 +1203,19 @@ static void __init retbleed_select_mitigation(void) enum its_mitigation_cmd { ITS_CMD_OFF, ITS_CMD_ON, + ITS_CMD_VMEXIT, }; enum its_mitigation { ITS_MITIGATION_OFF, + ITS_MITIGATION_VMEXIT_ONLY, ITS_MITIGATION_ALIGNED_THUNKS, ITS_MITIGATION_RETPOLINE_STUFF, }; static const char * const its_strings[] = { [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", }; @@ -1239,6 +1242,8 @@ static int __init its_parse_cmdline(char *str) } else if (!strcmp(str, "force")) { its_cmd = ITS_CMD_ON; setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_cmd = ITS_CMD_VMEXIT; } else { pr_err("Ignoring unknown indirect_target_selection option (%s).", str); } @@ -1294,6 +1299,12 @@ static void __init its_select_mitigation(void) case ITS_CMD_OFF: its_mitigation = ITS_MITIGATION_OFF; break; + case ITS_CMD_VMEXIT: + if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + goto out; + } + fallthrough; case ITS_CMD_ON: its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 619cfdb73760d0..0ff057ff11ce93 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1229,6 +1229,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RFDS BIT(7) /* CPU is affected by Indirect Target Selection */ #define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1249,16 +1251,16 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), @@ -1483,8 +1485,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); - if (vulnerable_to_its(x86_arch_cap_msr)) + if (vulnerable_to_its(x86_arch_cap_msr)) { setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; From 49109c15a76336d169e65b1506ef4c9fb70077fb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 2 Dec 2024 12:07:08 -0800 Subject: [PATCH 045/353] x86/its: Add support for RSB stuffing mitigation When retpoline mitigation is enabled for spectre-v2, enabling call-depth-tracking and RSB stuffing also mitigates ITS. Add cmdline option indirect_target_selection=stuff to allow enabling RSB stuffing mitigation. When retpoline mitigation is not enabled, =stuff option is ignored, and default mitigation for ITS is deployed. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- .../admin-guide/kernel-parameters.txt | 3 +++ arch/x86/kernel/cpu/bugs.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05ddffa02f9dcb..17fe2ab4536833 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2219,6 +2219,9 @@ mitigation. vmexit: Only deploy mitigation if CPU is affected by guest/host isolation part of ITS. + stuff: Deploy RSB-fill mitigation when retpoline is + also deployed. Otherwise, deploy the default + mitigation. For details see: Documentation/admin-guide/hw-vuln/indirect-target-selection.rst diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0dd885c0b609d4..8596ce85026c0d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1204,6 +1204,7 @@ enum its_mitigation_cmd { ITS_CMD_OFF, ITS_CMD_ON, ITS_CMD_VMEXIT, + ITS_CMD_RSB_STUFF, }; enum its_mitigation { @@ -1244,6 +1245,8 @@ static int __init its_parse_cmdline(char *str) setup_force_cpu_bug(X86_BUG_ITS); } else if (!strcmp(str, "vmexit")) { its_cmd = ITS_CMD_VMEXIT; + } else if (!strcmp(str, "stuff")) { + its_cmd = ITS_CMD_RSB_STUFF; } else { pr_err("Ignoring unknown indirect_target_selection option (%s).", str); } @@ -1295,6 +1298,12 @@ static void __init its_select_mitigation(void) goto out; } + if (cmd == ITS_CMD_RSB_STUFF && + (!boot_cpu_has(X86_FEATURE_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))) { + pr_err("RSB stuff mitigation not supported, using default\n"); + cmd = ITS_CMD_ON; + } + switch (cmd) { case ITS_CMD_OFF: its_mitigation = ITS_MITIGATION_OFF; @@ -1312,6 +1321,16 @@ static void __init its_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RETHUNK); set_return_thunk(its_return_thunk); break; + case ITS_CMD_RSB_STUFF: + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + set_return_thunk(call_depth_return_thunk); + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + pr_info("Retbleed mitigation updated to stuffing\n"); + } + break; } out: pr_info("%s\n", its_strings[its_mitigation]); From 3bd02710a0c8580b342024dadf213bd78ca9b76a Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 2 May 2025 06:25:19 -0700 Subject: [PATCH 046/353] x86/its: Align RETs in BHB clear sequence to avoid thunking The software mitigation for BHI is to execute BHB clear sequence at syscall entry, and possibly after a cBPF program. ITS mitigation thunks RETs in the lower half of the cacheline. This causes the RETs in the BHB clear sequence to be thunked as well, adding unnecessary branches to the BHB clear sequence. Since the sequence is in hot path, align the RET instructions in the sequence to avoid thunking. This is how disassembly clear_bhb_loop() looks like after this change: 0x44 <+4>: mov $0x5,%ecx 0x49 <+9>: call 0xffffffff81001d9b 0x4e <+14>: jmp 0xffffffff81001de5 0x53 <+19>: int3 ... 0x9b <+91>: call 0xffffffff81001dce 0xa0 <+96>: ret 0xa1 <+97>: int3 ... 0xce <+142>: mov $0x5,%eax 0xd3 <+147>: jmp 0xffffffff81001dd6 0xd5 <+149>: nop 0xd6 <+150>: sub $0x1,%eax 0xd9 <+153>: jne 0xffffffff81001dd3 0xdb <+155>: sub $0x1,%ecx 0xde <+158>: jne 0xffffffff81001d9b 0xe0 <+160>: ret 0xe1 <+161>: int3 0xe2 <+162>: int3 0xe3 <+163>: int3 0xe4 <+164>: int3 0xe5 <+165>: lfence 0xe8 <+168>: pop %rbp 0xe9 <+169>: ret Suggested-by: Andrew Cooper Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/entry/entry_64.S | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f40bdf97d390a7..ed04a968cc7d00 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1525,7 +1525,9 @@ SYM_CODE_END(rewind_stack_and_make_dead) * ORC to unwind properly. * * The alignment is for performance and not for safety, and may be safely - * refactored in the future if needed. + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. */ SYM_FUNC_START(clear_bhb_loop) ANNOTATE_NOENDBR @@ -1536,10 +1538,22 @@ SYM_FUNC_START(clear_bhb_loop) call 1f jmp 5f .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc ANNOTATE_INTRA_FUNCTION_CALL 1: call 2f - RET +.Lret1: RET .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc 2: movl $5, %eax 3: jmp 4f nop @@ -1547,7 +1561,7 @@ SYM_FUNC_START(clear_bhb_loop) jnz 3b sub $1, %ecx jnz 1b - RET +.Lret2: RET 5: lfence pop %rbp RET From 9a360eeccd649e228060ae156e1659f7c48185af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2025 16:30:11 +0200 Subject: [PATCH 047/353] mm/execmem: Unify early execmem_cache behaviour Early kernel memory is RWX, only at the end of early boot (before SMP) do we mark things ROX. Have execmem_cache mirror this behaviour for early users. This avoids having to remember what code is execmem and what is not -- we can poke everything with impunity ;-) Also performance for not having to do endless text_poke_mm switches. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/mm/init_32.c | 3 +++ arch/x86/mm/init_64.c | 3 +++ include/linux/execmem.h | 8 +++++++- mm/execmem.c | 40 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c69..7fa3965dcef158 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -755,6 +756,8 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b24..949a447f75ec7e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1391,6 +1392,8 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; /* diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 65655a5d1be283..089c71cc8ddf17 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -53,7 +53,7 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; -#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM) /** * execmem_fill_trapping_insns - set memory to contain instructions that * will trap @@ -93,9 +93,15 @@ int execmem_make_temp_rw(void *ptr, size_t size); * Return: 0 on success or negative error code on failure. */ int execmem_restore_rox(void *ptr, size_t size); + +/* + * Called from mark_readonly(), where the system transitions to ROX. + */ +void execmem_cache_make_ro(void); #else static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } +static inline void execmem_cache_make_ro(void) { } #endif /** diff --git a/mm/execmem.c b/mm/execmem.c index e6c4f5076ca8d8..6f7a2653b280ed 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -254,6 +254,34 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) return ptr; } +static bool execmem_cache_rox = false; + +void execmem_cache_make_ro(void) +{ + struct maple_tree *free_areas = &execmem_cache.free_areas; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas_free, free_areas, 0, ULONG_MAX); + MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); + struct mutex *mutex = &execmem_cache.mutex; + void *area; + + execmem_cache_rox = true; + + mutex_lock(mutex); + + mas_for_each(&mas_free, area, ULONG_MAX) { + unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; + set_memory_ro(mas_free.index, pages); + } + + mas_for_each(&mas_busy, area, ULONG_MAX) { + unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; + set_memory_ro(mas_busy.index, pages); + } + + mutex_unlock(mutex); +} + static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; @@ -274,9 +302,15 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - err = set_memory_rox((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; + if (execmem_cache_rox) { + err = set_memory_rox((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; + } else { + err = set_memory_x((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; + } err = execmem_cache_add(p, alloc_size); if (err) From 52ec5913e8093e09e11a89f42b3288565c5c5097 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Sat, 3 May 2025 09:46:31 -0700 Subject: [PATCH 048/353] x86/ibt: Keep IBT disabled during alternative patching cfi_rewrite_callers() updates the fineIBT hash matching at the caller side, but except for paranoid-mode it relies on apply_retpoline() and friends for any ENDBR relocation. This could temporarily cause an indirect branch to land on a poisoned ENDBR. For instance, with para-virtualization enabled, a simple wrmsrl() could have an indirect branch pointing to native_write_msr() who's ENDBR has been relocated due to fineIBT: : push %rbp mov %rsp,%rbp mov %esi,%eax mov %rsi,%rdx shr $0x20,%rdx mov %edi,%edi mov %rax,%rsi call *0x21e65d0(%rip) # ^^^^^^^^^^^^^^^^^^^^^^^ Such an indirect call during the alternative patching could #CP if the caller is not *yet* adjusted for the new target ENDBR. To prevent a false #CP, keep CET-IBT disabled until all callers are patched. Patching during the module load does not need to be guarded by IBT-disable because the module code is not executed until the patching is complete. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/kernel/alternative.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3e67e72bffdc98..6e0b09b2c97201 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -31,6 +31,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -2085,6 +2086,8 @@ static noinline void __init alt_reloc_selftest(void) void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -2111,6 +2114,9 @@ void __init alternative_instructions(void) */ paravirt_set_cap(); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(/*disable*/ true); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -2134,6 +2140,8 @@ void __init alternative_instructions(void) */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + ibt_restore(ibt); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { From 73e1543130ded5f66ba2962a4ebf0cd4fbdec012 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Oct 2024 10:05:48 -0700 Subject: [PATCH 049/353] x86/its: Use dynamic thunks for indirect branches ITS mitigation moves the unsafe indirect branches to a safe thunk. This could degrade the prediction accuracy as the source address of indirect branches becomes same for different execution paths. To improve the predictions, and hence the performance, assign a separate thunk for each indirect callsite. This is also a defense-in-depth measure to avoid indirect branches aliasing with each other. As an example, 5000 dynamic thunks would utilize around 16 bits of the address space, thereby gaining entropy. For a BTB that uses 32 bits for indexing, dynamic thunks could provide better prediction accuracy over fixed thunks. Have ITS thunks be variable sized and use EXECMEM_MODULE_TEXT such that they are both more flexible (got to extend them later) and live in 2M TLBs, just like kernel code, avoiding undue TLB pressure. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/Kconfig | 1 + arch/x86/include/asm/alternative.h | 10 +++ arch/x86/kernel/alternative.c | 127 ++++++++++++++++++++++++++++- arch/x86/kernel/module.c | 6 ++ include/linux/execmem.h | 3 + include/linux/module.h | 5 ++ 6 files changed, 149 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb28dea201cca2..a7b3fdadde69c9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2716,6 +2716,7 @@ config MITIGATION_ITS bool "Enable Indirect Target Selection mitigation" depends on CPU_SUP_INTEL && X86_64 depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK + select EXECMEM default y help Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4037c611537c85..47948ebbb40969 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -124,6 +124,16 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, } #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_init_mod(struct module *mod); +extern void its_fini_mod(struct module *mod); +extern void its_free_mod(struct module *mod); +#else /* CONFIG_MITIGATION_ITS */ +static inline void its_init_mod(struct module *mod) { } +static inline void its_fini_mod(struct module *mod) { } +static inline void its_free_mod(struct module *mod) { } +#endif + #if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) extern bool cpu_wants_rethunk(void); extern bool cpu_wants_rethunk_at(void *addr); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6e0b09b2c97201..7a10e3ed5d0b5b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -125,6 +127,121 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_MITIGATION_ITS + +static struct module *its_mod; +static void *its_page; +static unsigned int its_offset; + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int i = 0; + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk; +} + +void its_init_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + WARN_ON_ONCE(its_mod != mod); + + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_free(page); + } + kfree(mod->its_page_array); +} + +static void *its_alloc(void) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + + if (!page) + return NULL; + + if (its_mod) { + void *tmp = krealloc(its_mod->its_page_array, + (its_mod->its_num_pages+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + its_mod->its_page_array = tmp; + its_mod->its_page_array[its_mod->its_num_pages++] = page; + + execmem_make_temp_rw(page, PAGE_SIZE); + } + + return no_free_ptr(page); +} + +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } + + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; + + thunk = its_page + its_offset; + its_offset += size; + + return its_init_thunk(thunk, reg); +} + +#endif + /* * Nomenclature for variable names to simplify and clarify this code and ease * any potential staring at it: @@ -637,9 +754,13 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 #ifdef CONFIG_MITIGATION_ITS static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { - return __emit_trampoline(addr, insn, bytes, - __x86_indirect_its_thunk_array[reg], - __x86_indirect_its_thunk_array[reg]); + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); } /* Check if an indirect branch is at ITS-unsafe address */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f35170175..ff07558b7ebc65 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -266,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } + its_init_mod(me); + if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -286,6 +288,9 @@ int module_finalize(const Elf_Ehdr *hdr, void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + + its_fini_mod(me); + if (returns) { void *rseg = (void *)returns->sh_addr; apply_returns(rseg, rseg + returns->sh_size); @@ -326,4 +331,5 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + its_free_mod(mod); } diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 089c71cc8ddf17..ca42d5e46ccc6b 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -4,6 +4,7 @@ #include #include +#include #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) @@ -176,6 +177,8 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T)); + #ifdef CONFIG_MMU /** * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory diff --git a/include/linux/module.h b/include/linux/module.h index b3329110d6686f..8050f77c3b64f8 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -586,6 +586,11 @@ struct module { atomic_t refcnt; #endif +#ifdef CONFIG_MITIGATION_ITS + int its_num_pages; + void **its_page_array; +#endif + #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; From a3f42b7fa0f9b7ff7152d93a36b8ac6855598390 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2025 09:57:31 +0200 Subject: [PATCH 050/353] x86/its: FineIBT-paranoid vs ITS FineIBT-paranoid was using the retpoline bytes for the paranoid check, disabling retpolines, because all parts that have IBT also have eIBRS and thus don't need no stinking retpolines. Except... ITS needs the retpolines for indirect calls must not be in the first half of a cacheline :-/ So what was the paranoid call sequence: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b lea -0x10(%r11), %r11 e: 75 fd jne d 10: 41 ff d3 call *%r11 13: 90 nop Now becomes: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b f0 lea -0x10(%r11), %r11 e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 Where the paranoid_thunk looks like: 1d: (bad) __x86_indirect_paranoid_thunk_r11: 1e: 75 fd jne 1d __x86_indirect_its_thunk_r11: 20: 41 ff eb jmp *%r11 23: cc int3 [ dhansen: remove initialization to false ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/include/asm/alternative.h | 8 ++ arch/x86/kernel/alternative.c | 145 +++++++++++++++++++++++++---- arch/x86/lib/retpoline.S | 15 ++- arch/x86/net/bpf_jit_comp.c | 2 +- tools/objtool/arch/x86/decode.c | 9 ++ 5 files changed, 159 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 47948ebbb40969..f2294784babc30 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -6,6 +6,7 @@ #include #include #include +#include #define ALT_FLAGS_SHIFT 16 @@ -128,10 +129,17 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, extern void its_init_mod(struct module *mod); extern void its_fini_mod(struct module *mod); extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); #else /* CONFIG_MITIGATION_ITS */ static inline void its_init_mod(struct module *mod) { } static inline void its_fini_mod(struct module *mod) { } static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} #endif #if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7a10e3ed5d0b5b..48fd04e9011483 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -127,6 +127,10 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + #ifdef CONFIG_MITIGATION_ITS static struct module *its_mod; @@ -137,8 +141,25 @@ static unsigned int its_offset; static void *its_init_thunk(void *thunk, int reg) { u8 *bytes = thunk; + int offset = 0; int i = 0; +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence ( + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; @@ -147,7 +168,7 @@ static void *its_init_thunk(void *thunk, int reg) bytes[i++] = 0xe0 + reg; /* jmp *reg */ bytes[i++] = 0xcc; - return thunk; + return thunk + offset; } void its_init_mod(struct module *mod) @@ -217,6 +238,17 @@ static void *its_allocate_thunk(int reg) int size = 3 + (reg / 8); void *thunk; +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { its_page = its_alloc(); if (!its_page) { @@ -240,6 +272,18 @@ static void *its_allocate_thunk(int reg) return its_init_thunk(thunk, reg); } +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + #endif /* @@ -775,8 +819,17 @@ static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) /* Lower-half of the cacheline? */ return !(addr & 0x20); } +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} #endif +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -893,6 +946,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -909,6 +963,12 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -1198,8 +1258,6 @@ int cfi_get_func_arity(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; -static bool cfi_paranoid __ro_after_init = false; - /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1612,6 +1670,19 @@ static int cfi_rand_callers(s32 *start, s32 *end) return 0; } +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -1653,9 +1724,14 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); - ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); - if (WARN_ON_ONCE(ret != 3)) - continue; + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } text_poke_early(addr, bytes, fineibt_paranoid_size); } @@ -1882,29 +1958,66 @@ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 return false; } +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] - * sequence. + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; - u32 hash; - if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + if (!cfi_paranoid) return false; - __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); - *target = regs->r11 + fineibt_preamble_size; - *type = regs->r10; + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } /* - * Since the trapping instruction is the exact, but LOCK prefixed, - * Jcc.d8 that got us here, the normal fixup will work. + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * */ - return true; + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } -Efault: return false; } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ebca28fe7e3133..39374949daa2f6 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -371,6 +371,15 @@ SYM_FUNC_END(call_depth_return_thunk) .macro ITS_THUNK reg +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR @@ -378,19 +387,19 @@ SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) jmp *%\reg int3 .align 32, 0xcc /* fill to the end of the line */ - .skip 32, 0xcc /* skip to the next upper half */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ .endm /* ITS mitigation requires thunks be aligned to upper half of cacheline */ .align 64, 0xcc -.skip 32, 0xcc -SYM_CODE_START(__x86_indirect_its_thunk_array) +.skip 29, 0xcc #define GEN(reg) ITS_THUNK reg #include #undef GEN .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) SYM_CODE_END(__x86_indirect_its_thunk_array) .align 64, 0xcc diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 34ba61770b56fe..ea4dd5b393aaf8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -665,7 +665,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_its_thunk_array[reg], ip); + emit_jump(&prog, its_static_thunk(reg), ip); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 3ce7b54003c229..687c5eafb49a7e 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -189,6 +189,15 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec op2 = ins.opcode.bytes[1]; op3 = ins.opcode.bytes[2]; + /* + * XXX hack, decoder is buggered and thinks 0xea is 7 bytes long. + */ + if (op1 == 0xea) { + insn->len = 1; + insn->type = INSN_BUG; + return 0; + } + if (ins.rex_prefix.nbytes) { rex = ins.rex_prefix.bytes[0]; rex_w = X86_REX_W(rex) >> 3; From 821e5a620f8da055310de96a50487528ef9100ab Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 24 Dec 2024 16:09:28 -0800 Subject: [PATCH 051/353] selftest/x86/bugs: Add selftests for ITS Below are the tests added for Indirect Target Selection (ITS): - its_sysfs.py - Check if sysfs reflects the correct mitigation status for the mitigation selected via the kernel cmdline. - its_permutations.py - tests mitigation selection with cmdline permutations with other bugs like spectre_v2 and retbleed. - its_indirect_alignment.py - verifies that for addresses in .retpoline_sites section that belong to lower half of cacheline are patched to ITS-safe thunk. Typical output looks like below: Site 49: function symbol: __x64_sys_restart_syscall+0x1f <0xffffffffbb1509af> # vmlinux: 0xffffffff813509af: jmp 0xffffffff81f5a8e0 # kcore: 0xffffffffbb1509af: jmpq *%rax # ITS thunk NOT expected for site 49 # PASSED: Found *%rax # Site 50: function symbol: __resched_curr+0xb0 <0xffffffffbb181910> # vmlinux: 0xffffffff81381910: jmp 0xffffffff81f5a8e0 # kcore: 0xffffffffbb181910: jmp 0xffffffffc02000fc # ITS thunk expected for site 50 # PASSED: Found 0xffffffffc02000fc -> jmpq *%rax - its_ret_alignment.py - verifies that for addresses in .return_sites section that belong to lower half of cacheline are patched to its_return_thunk. Typical output looks like below: Site 97: function symbol: collect_event+0x48 <0xffffffffbb007f18> # vmlinux: 0xffffffff81207f18: jmp 0xffffffff81f5b500 # kcore: 0xffffffffbb007f18: jmp 0xffffffffbbd5b560 # PASSED: Found jmp 0xffffffffbbd5b560 # Site 98: function symbol: collect_event+0xa4 <0xffffffffbb007f74> # vmlinux: 0xffffffff81207f74: jmp 0xffffffff81f5b500 # kcore: 0xffffffffbb007f74: retq # PASSED: Found retq Some of these tests have dependency on tools like virtme-ng[1] and drgn[2]. When the dependencies are not met, the test will be skipped. [1] https://github.com/arighi/virtme-ng [2] https://github.com/osandov/drgn Co-developed-by: Tao Zhang Signed-off-by: Tao Zhang Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/x86/bugs/Makefile | 3 + tools/testing/selftests/x86/bugs/common.py | 164 ++++++++++++++++++ .../x86/bugs/its_indirect_alignment.py | 150 ++++++++++++++++ .../selftests/x86/bugs/its_permutations.py | 109 ++++++++++++ .../selftests/x86/bugs/its_ret_alignment.py | 139 +++++++++++++++ tools/testing/selftests/x86/bugs/its_sysfs.py | 65 +++++++ 7 files changed, 631 insertions(+) create mode 100644 tools/testing/selftests/x86/bugs/Makefile create mode 100755 tools/testing/selftests/x86/bugs/common.py create mode 100755 tools/testing/selftests/x86/bugs/its_indirect_alignment.py create mode 100755 tools/testing/selftests/x86/bugs/its_permutations.py create mode 100755 tools/testing/selftests/x86/bugs/its_ret_alignment.py create mode 100755 tools/testing/selftests/x86/bugs/its_sysfs.py diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c77c8c8e3d9bdd..80fb84fa3cfcbd 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -121,6 +121,7 @@ TARGETS += user_events TARGETS += vDSO TARGETS += mm TARGETS += x86 +TARGETS += x86/bugs TARGETS += zram #Please keep the TARGETS list alphabetically sorted # Run "make quicktest=1 run_tests" or diff --git a/tools/testing/selftests/x86/bugs/Makefile b/tools/testing/selftests/x86/bugs/Makefile new file mode 100644 index 00000000000000..8ff2d7226c7f3f --- /dev/null +++ b/tools/testing/selftests/x86/bugs/Makefile @@ -0,0 +1,3 @@ +TEST_PROGS := its_sysfs.py its_permutations.py its_indirect_alignment.py its_ret_alignment.py +TEST_FILES := common.py +include ../../lib.mk diff --git a/tools/testing/selftests/x86/bugs/common.py b/tools/testing/selftests/x86/bugs/common.py new file mode 100755 index 00000000000000..2f9664a80617a6 --- /dev/null +++ b/tools/testing/selftests/x86/bugs/common.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025 Intel Corporation +# +# This contains kselftest framework adapted common functions for testing +# mitigation for x86 bugs. + +import os, sys, re, shutil + +sys.path.insert(0, '../../kselftest') +import ksft + +def read_file(path): + if not os.path.exists(path): + return None + with open(path, 'r') as file: + return file.read().strip() + +def cpuinfo_has(arg): + cpuinfo = read_file('/proc/cpuinfo') + if arg in cpuinfo: + return True + return False + +def cmdline_has(arg): + cmdline = read_file('/proc/cmdline') + if arg in cmdline: + return True + return False + +def cmdline_has_either(args): + cmdline = read_file('/proc/cmdline') + for arg in args: + if arg in cmdline: + return True + return False + +def cmdline_has_none(args): + return not cmdline_has_either(args) + +def cmdline_has_all(args): + cmdline = read_file('/proc/cmdline') + for arg in args: + if arg not in cmdline: + return False + return True + +def get_sysfs(bug): + return read_file("/sys/devices/system/cpu/vulnerabilities/" + bug) + +def sysfs_has(bug, mitigation): + status = get_sysfs(bug) + if mitigation in status: + return True + return False + +def sysfs_has_either(bugs, mitigations): + for bug in bugs: + for mitigation in mitigations: + if sysfs_has(bug, mitigation): + return True + return False + +def sysfs_has_none(bugs, mitigations): + return not sysfs_has_either(bugs, mitigations) + +def sysfs_has_all(bugs, mitigations): + for bug in bugs: + for mitigation in mitigations: + if not sysfs_has(bug, mitigation): + return False + return True + +def bug_check_pass(bug, found): + ksft.print_msg(f"\nFound: {found}") + # ksft.print_msg(f"\ncmdline: {read_file('/proc/cmdline')}") + ksft.test_result_pass(f'{bug}: {found}') + +def bug_check_fail(bug, found, expected): + ksft.print_msg(f'\nFound:\t {found}') + ksft.print_msg(f'Expected:\t {expected}') + ksft.print_msg(f"\ncmdline: {read_file('/proc/cmdline')}") + ksft.test_result_fail(f'{bug}: {found}') + +def bug_status_unknown(bug, found): + ksft.print_msg(f'\nUnknown status: {found}') + ksft.print_msg(f"\ncmdline: {read_file('/proc/cmdline')}") + ksft.test_result_fail(f'{bug}: {found}') + +def basic_checks_sufficient(bug, mitigation): + if not mitigation: + bug_status_unknown(bug, "None") + return True + elif mitigation == "Not affected": + ksft.test_result_pass(bug) + return True + elif mitigation == "Vulnerable": + if cmdline_has_either([f'{bug}=off', 'mitigations=off']): + bug_check_pass(bug, mitigation) + return True + return False + +def get_section_info(vmlinux, section_name): + from elftools.elf.elffile import ELFFile + with open(vmlinux, 'rb') as f: + elffile = ELFFile(f) + section = elffile.get_section_by_name(section_name) + if section is None: + ksft.print_msg("Available sections in vmlinux:") + for sec in elffile.iter_sections(): + ksft.print_msg(sec.name) + raise ValueError(f"Section {section_name} not found in {vmlinux}") + return section['sh_addr'], section['sh_offset'], section['sh_size'] + +def get_patch_sites(vmlinux, offset, size): + import struct + output = [] + with open(vmlinux, 'rb') as f: + f.seek(offset) + i = 0 + while i < size: + data = f.read(4) # s32 + if not data: + break + sym_offset = struct.unpack(' 1: + arg_vmlinux = os.sys.argv[1] + if not os.path.exists(arg_vmlinux): + ksft.test_result_fail(f"its_indirect_alignment.py: vmlinux not found at argument path: {arg_vmlinux}") + ksft.exit_fail() + os.makedirs(f"/usr/lib/debug/lib/modules/{os.uname().release}", exist_ok=True) + os.system(f'cp {arg_vmlinux} /usr/lib/debug/lib/modules/$(uname -r)/vmlinux') + +vmlinux = f"/usr/lib/debug/lib/modules/{os.uname().release}/vmlinux" +if not os.path.exists(vmlinux): + ksft.test_result_fail(f"its_indirect_alignment.py: vmlinux not found at {vmlinux}") + ksft.exit_fail() + +ksft.print_msg(f"Using vmlinux: {vmlinux}") + +retpolines_start_vmlinux, retpolines_sec_offset, size = c.get_section_info(vmlinux, '.retpoline_sites') +ksft.print_msg(f"vmlinux: Section .retpoline_sites (0x{retpolines_start_vmlinux:x}) found at 0x{retpolines_sec_offset:x} with size 0x{size:x}") + +sites_offset = c.get_patch_sites(vmlinux, retpolines_sec_offset, size) +total_retpoline_tests = len(sites_offset) +ksft.print_msg(f"Found {total_retpoline_tests} retpoline sites") + +prog = c.get_runtime_kernel() +retpolines_start_kcore = prog.symbol('__retpoline_sites').address +ksft.print_msg(f'kcore: __retpoline_sites: 0x{retpolines_start_kcore:x}') + +x86_indirect_its_thunk_r15 = prog.symbol('__x86_indirect_its_thunk_r15').address +ksft.print_msg(f'kcore: __x86_indirect_its_thunk_r15: 0x{x86_indirect_its_thunk_r15:x}') + +tests_passed = 0 +tests_failed = 0 +tests_unknown = 0 + +with open(vmlinux, 'rb') as f: + elffile = ELFFile(f) + text_section = elffile.get_section_by_name('.text') + + for i in range(0, len(sites_offset)): + site = retpolines_start_kcore + sites_offset[i] + vmlinux_site = retpolines_start_vmlinux + sites_offset[i] + passed = unknown = failed = False + try: + vmlinux_insn = c.get_instruction_from_vmlinux(elffile, text_section, text_section['sh_addr'], vmlinux_site) + kcore_insn = list(cap.disasm(prog.read(site, 16), site))[0] + operand = kcore_insn.op_str + insn_end = site + kcore_insn.size - 1 # TODO handle Jcc.32 __x86_indirect_thunk_\reg + safe_site = insn_end & 0x20 + site_status = "" if safe_site else "(unsafe)" + + ksft.print_msg(f"\nSite {i}: {identify_address(prog, site)} <0x{site:x}> {site_status}") + ksft.print_msg(f"\tvmlinux: 0x{vmlinux_insn.address:x}:\t{vmlinux_insn.mnemonic}\t{vmlinux_insn.op_str}") + ksft.print_msg(f"\tkcore: 0x{kcore_insn.address:x}:\t{kcore_insn.mnemonic}\t{kcore_insn.op_str}") + + if (site & 0x20) ^ (insn_end & 0x20): + ksft.print_msg(f"\tSite at safe/unsafe boundary: {str(kcore_insn.bytes)} {kcore_insn.mnemonic} {operand}") + if safe_site: + tests_passed += 1 + passed = True + ksft.print_msg(f"\tPASSED: At safe address") + continue + + if operand.startswith('0xffffffff'): + thunk = int(operand, 16) + if thunk > x86_indirect_its_thunk_r15: + insn_at_thunk = list(cap.disasm(prog.read(thunk, 16), thunk))[0] + operand += ' -> ' + insn_at_thunk.mnemonic + ' ' + insn_at_thunk.op_str + ' ' + if 'jmp' in insn_at_thunk.mnemonic and thunk & 0x20: + ksft.print_msg(f"\tPASSED: Found {operand} at safe address") + passed = True + if not passed: + if kcore_insn.operands[0].type == capstone.CS_OP_IMM: + operand += ' <' + prog.symbol(int(operand, 16)) + '>' + if '__x86_indirect_its_thunk_' in operand: + ksft.print_msg(f"\tPASSED: Found {operand}") + else: + ksft.print_msg(f"\tPASSED: Found direct branch: {kcore_insn}, ITS thunk not required.") + passed = True + else: + unknown = True + if passed: + tests_passed += 1 + elif unknown: + ksft.print_msg(f"UNKNOWN: unexpected operand: {kcore_insn}") + tests_unknown += 1 + else: + ksft.print_msg(f'\t************* FAILED *************') + ksft.print_msg(f"\tFound {kcore_insn.bytes} {kcore_insn.mnemonic} {operand}") + ksft.print_msg(f'\t**********************************') + tests_failed += 1 + except Exception as e: + ksft.print_msg(f"UNKNOWN: An unexpected error occurred: {e}") + tests_unknown += 1 + +ksft.print_msg(f"\n\nSummary:") +ksft.print_msg(f"PASS: \t{tests_passed} \t/ {total_retpoline_tests}") +ksft.print_msg(f"FAIL: \t{tests_failed} \t/ {total_retpoline_tests}") +ksft.print_msg(f"UNKNOWN: \t{tests_unknown} \t/ {total_retpoline_tests}") + +if tests_failed == 0: + ksft.test_result_pass("All ITS return thunk sites passed") +else: + ksft.test_result_fail(f"{tests_failed} ITS return thunk sites failed") +ksft.finished() diff --git a/tools/testing/selftests/x86/bugs/its_permutations.py b/tools/testing/selftests/x86/bugs/its_permutations.py new file mode 100755 index 00000000000000..3204f4728c62cc --- /dev/null +++ b/tools/testing/selftests/x86/bugs/its_permutations.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025 Intel Corporation +# +# Test for indirect target selection (ITS) cmdline permutations with other bugs +# like spectre_v2 and retbleed. + +import os, sys, subprocess, itertools, re, shutil + +test_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, test_dir + '/../../kselftest') +import ksft +import common as c + +bug = "indirect_target_selection" +mitigation = c.get_sysfs(bug) + +if not mitigation or "Not affected" in mitigation: + ksft.test_result_skip("Skipping its_permutations.py: not applicable") + ksft.finished() + +if shutil.which('vng') is None: + ksft.test_result_skip("Skipping its_permutations.py: virtme-ng ('vng') not found in PATH.") + ksft.finished() + +TEST = f"{test_dir}/its_sysfs.py" +default_kparam = ['clearcpuid=hypervisor', 'panic=5', 'panic_on_warn=1', 'oops=panic', 'nmi_watchdog=1', 'hung_task_panic=1'] + +DEBUG = " -v " + +# Install dependencies +# https://github.com/arighi/virtme-ng +# apt install virtme-ng +BOOT_CMD = f"vng --run {test_dir}/../../../../../arch/x86/boot/bzImage " +#BOOT_CMD += DEBUG + +bug = "indirect_target_selection" + +input_options = { + 'indirect_target_selection' : ['off', 'on', 'stuff', 'vmexit'], + 'retbleed' : ['off', 'stuff', 'auto'], + 'spectre_v2' : ['off', 'on', 'eibrs', 'retpoline', 'ibrs', 'eibrs,retpoline'], +} + +def pretty_print(output): + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + + # Define patterns and their corresponding colors + patterns = { + r"^ok \d+": OKGREEN, + r"^not ok \d+": FAIL, + r"^# Testing .*": OKBLUE, + r"^# Found: .*": WARNING, + r"^# Totals: .*": BOLD, + r"pass:([1-9]\d*)": OKGREEN, + r"fail:([1-9]\d*)": FAIL, + r"skip:([1-9]\d*)": WARNING, + } + + # Apply colors based on patterns + for pattern, color in patterns.items(): + output = re.sub(pattern, lambda match: f"{color}{match.group(0)}{ENDC}", output, flags=re.MULTILINE) + + print(output) + +combinations = list(itertools.product(*input_options.values())) +ksft.print_header() +ksft.set_plan(len(combinations)) + +logs = "" + +for combination in combinations: + append = "" + log = "" + for p in default_kparam: + append += f' --append={p}' + command = BOOT_CMD + append + test_params = "" + for i, key in enumerate(input_options.keys()): + param = f'{key}={combination[i]}' + test_params += f' {param}' + command += f" --append={param}" + command += f" -- {TEST}" + test_name = f"{bug} {test_params}" + pretty_print(f'# Testing {test_name}') + t = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + t.wait() + output, _ = t.communicate() + if t.returncode == 0: + ksft.test_result_pass(test_name) + else: + ksft.test_result_fail(test_name) + output = output.decode() + log += f" {output}" + pretty_print(log) + logs += output + "\n" + +# Optionally use tappy to parse the output +# apt install python3-tappy +with open("logs.txt", "w") as f: + f.write(logs) + +ksft.finished() diff --git a/tools/testing/selftests/x86/bugs/its_ret_alignment.py b/tools/testing/selftests/x86/bugs/its_ret_alignment.py new file mode 100755 index 00000000000000..f40078d9f6ffc1 --- /dev/null +++ b/tools/testing/selftests/x86/bugs/its_ret_alignment.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025 Intel Corporation +# +# Test for indirect target selection (ITS) mitigation. +# +# Tests if the RETs are correctly patched by evaluating the +# vmlinux .return_sites in /proc/kcore. +# +# Install dependencies +# add-apt-repository ppa:michel-slm/kernel-utils +# apt update +# apt install -y python3-drgn python3-pyelftools python3-capstone +# +# Run on target machine +# mkdir -p /usr/lib/debug/lib/modules/$(uname -r) +# cp $VMLINUX /usr/lib/debug/lib/modules/$(uname -r)/vmlinux +# +# Usage: ./its_ret_alignment.py + +import os, sys, argparse +from pathlib import Path + +this_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, this_dir + '/../../kselftest') +import ksft +import common as c + +bug = "indirect_target_selection" +mitigation = c.get_sysfs(bug) +if not mitigation or "Aligned branch/return thunks" not in mitigation: + ksft.test_result_skip("Skipping its_ret_alignment.py: Aligned branch/return thunks not enabled") + ksft.finished() + +c.check_dependencies_or_skip(['drgn', 'elftools', 'capstone'], script_name="its_ret_alignment.py") + +from elftools.elf.elffile import ELFFile +from drgn.helpers.common.memory import identify_address + +cap = c.init_capstone() + +if len(os.sys.argv) > 1: + arg_vmlinux = os.sys.argv[1] + if not os.path.exists(arg_vmlinux): + ksft.test_result_fail(f"its_ret_alignment.py: vmlinux not found at user-supplied path: {arg_vmlinux}") + ksft.exit_fail() + os.makedirs(f"/usr/lib/debug/lib/modules/{os.uname().release}", exist_ok=True) + os.system(f'cp {arg_vmlinux} /usr/lib/debug/lib/modules/$(uname -r)/vmlinux') + +vmlinux = f"/usr/lib/debug/lib/modules/{os.uname().release}/vmlinux" +if not os.path.exists(vmlinux): + ksft.test_result_fail(f"its_ret_alignment.py: vmlinux not found at {vmlinux}") + ksft.exit_fail() + +ksft.print_msg(f"Using vmlinux: {vmlinux}") + +rethunks_start_vmlinux, rethunks_sec_offset, size = c.get_section_info(vmlinux, '.return_sites') +ksft.print_msg(f"vmlinux: Section .return_sites (0x{rethunks_start_vmlinux:x}) found at 0x{rethunks_sec_offset:x} with size 0x{size:x}") + +sites_offset = c.get_patch_sites(vmlinux, rethunks_sec_offset, size) +total_rethunk_tests = len(sites_offset) +ksft.print_msg(f"Found {total_rethunk_tests} rethunk sites") + +prog = c.get_runtime_kernel() +rethunks_start_kcore = prog.symbol('__return_sites').address +ksft.print_msg(f'kcore: __rethunk_sites: 0x{rethunks_start_kcore:x}') + +its_return_thunk = prog.symbol('its_return_thunk').address +ksft.print_msg(f'kcore: its_return_thunk: 0x{its_return_thunk:x}') + +tests_passed = 0 +tests_failed = 0 +tests_unknown = 0 +tests_skipped = 0 + +with open(vmlinux, 'rb') as f: + elffile = ELFFile(f) + text_section = elffile.get_section_by_name('.text') + + for i in range(len(sites_offset)): + site = rethunks_start_kcore + sites_offset[i] + vmlinux_site = rethunks_start_vmlinux + sites_offset[i] + try: + passed = unknown = failed = skipped = False + + symbol = identify_address(prog, site) + vmlinux_insn = c.get_instruction_from_vmlinux(elffile, text_section, text_section['sh_addr'], vmlinux_site) + kcore_insn = list(cap.disasm(prog.read(site, 16), site))[0] + + insn_end = site + kcore_insn.size - 1 + + safe_site = insn_end & 0x20 + site_status = "" if safe_site else "(unsafe)" + + ksft.print_msg(f"\nSite {i}: {symbol} <0x{site:x}> {site_status}") + ksft.print_msg(f"\tvmlinux: 0x{vmlinux_insn.address:x}:\t{vmlinux_insn.mnemonic}\t{vmlinux_insn.op_str}") + ksft.print_msg(f"\tkcore: 0x{kcore_insn.address:x}:\t{kcore_insn.mnemonic}\t{kcore_insn.op_str}") + + if safe_site: + tests_passed += 1 + passed = True + ksft.print_msg(f"\tPASSED: At safe address") + continue + + if "jmp" in kcore_insn.mnemonic: + passed = True + elif "ret" not in kcore_insn.mnemonic: + skipped = True + + if passed: + ksft.print_msg(f"\tPASSED: Found {kcore_insn.mnemonic} {kcore_insn.op_str}") + tests_passed += 1 + elif skipped: + ksft.print_msg(f"\tSKIPPED: Found '{kcore_insn.mnemonic}'") + tests_skipped += 1 + elif unknown: + ksft.print_msg(f"UNKNOWN: An unknown instruction: {kcore_insn}") + tests_unknown += 1 + else: + ksft.print_msg(f'\t************* FAILED *************') + ksft.print_msg(f"\tFound {kcore_insn.mnemonic} {kcore_insn.op_str}") + ksft.print_msg(f'\t**********************************') + tests_failed += 1 + except Exception as e: + ksft.print_msg(f"UNKNOWN: An unexpected error occurred: {e}") + tests_unknown += 1 + +ksft.print_msg(f"\n\nSummary:") +ksft.print_msg(f"PASSED: \t{tests_passed} \t/ {total_rethunk_tests}") +ksft.print_msg(f"FAILED: \t{tests_failed} \t/ {total_rethunk_tests}") +ksft.print_msg(f"SKIPPED: \t{tests_skipped} \t/ {total_rethunk_tests}") +ksft.print_msg(f"UNKNOWN: \t{tests_unknown} \t/ {total_rethunk_tests}") + +if tests_failed == 0: + ksft.test_result_pass("All ITS return thunk sites passed.") +else: + ksft.test_result_fail(f"{tests_failed} failed sites need ITS return thunks.") +ksft.finished() diff --git a/tools/testing/selftests/x86/bugs/its_sysfs.py b/tools/testing/selftests/x86/bugs/its_sysfs.py new file mode 100755 index 00000000000000..7bca81f2f6065b --- /dev/null +++ b/tools/testing/selftests/x86/bugs/its_sysfs.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025 Intel Corporation +# +# Test for Indirect Target Selection(ITS) mitigation sysfs status. + +import sys, os, re +this_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, this_dir + '/../../kselftest') +import ksft + +from common import * + +bug = "indirect_target_selection" +mitigation = get_sysfs(bug) + +ITS_MITIGATION_ALIGNED_THUNKS = "Mitigation: Aligned branch/return thunks" +ITS_MITIGATION_RETPOLINE_STUFF = "Mitigation: Retpolines, Stuffing RSB" +ITS_MITIGATION_VMEXIT_ONLY = "Mitigation: Vulnerable, KVM: Not affected" +ITS_MITIGATION_VULNERABLE = "Vulnerable" + +def check_mitigation(): + if mitigation == ITS_MITIGATION_ALIGNED_THUNKS: + if cmdline_has(f'{bug}=stuff') and sysfs_has("spectre_v2", "Retpolines"): + bug_check_fail(bug, ITS_MITIGATION_ALIGNED_THUNKS, ITS_MITIGATION_RETPOLINE_STUFF) + return + if cmdline_has(f'{bug}=vmexit') and cpuinfo_has('its_native_only'): + bug_check_fail(bug, ITS_MITIGATION_ALIGNED_THUNKS, ITS_MITIGATION_VMEXIT_ONLY) + return + bug_check_pass(bug, ITS_MITIGATION_ALIGNED_THUNKS) + return + + if mitigation == ITS_MITIGATION_RETPOLINE_STUFF: + if cmdline_has(f'{bug}=stuff') and sysfs_has("spectre_v2", "Retpolines"): + bug_check_pass(bug, ITS_MITIGATION_RETPOLINE_STUFF) + return + if sysfs_has('retbleed', 'Stuffing'): + bug_check_pass(bug, ITS_MITIGATION_RETPOLINE_STUFF) + return + bug_check_fail(bug, ITS_MITIGATION_RETPOLINE_STUFF, ITS_MITIGATION_ALIGNED_THUNKS) + + if mitigation == ITS_MITIGATION_VMEXIT_ONLY: + if cmdline_has(f'{bug}=vmexit') and cpuinfo_has('its_native_only'): + bug_check_pass(bug, ITS_MITIGATION_VMEXIT_ONLY) + return + bug_check_fail(bug, ITS_MITIGATION_VMEXIT_ONLY, ITS_MITIGATION_ALIGNED_THUNKS) + + if mitigation == ITS_MITIGATION_VULNERABLE: + if sysfs_has("spectre_v2", "Vulnerable"): + bug_check_pass(bug, ITS_MITIGATION_VULNERABLE) + else: + bug_check_fail(bug, "Mitigation", ITS_MITIGATION_VULNERABLE) + + bug_status_unknown(bug, mitigation) + return + +ksft.print_header() +ksft.set_plan(1) +ksft.print_msg(f'{bug}: {mitigation} ...') + +if not basic_checks_sufficient(bug, mitigation): + check_mitigation() + +ksft.finished() From 0835b4cce5f5bcd50a06113383943920678b6850 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 9 Dec 2021 15:12:19 +0000 Subject: [PATCH 052/353] arm64: insn: Add support for encoding DSB To generate code in the eBPF epilogue that uses the DSB instruction, insn.c needs a heler to encode the type and domain. Re-use the crm encoding logic from the DMB instruction. Signed-off-by: James Morse Reviewed-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/lib/insn.c | 60 +++++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 39577f1d079a98..18c7811774d301 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -706,6 +706,7 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, } #endif u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); +u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type); u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, enum aarch64_insn_system_register sysreg); diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 9bef696e2230be..4e298baddc2e56 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -5,6 +5,7 @@ * * Copyright (C) 2014-2016 Zi Shen Lim */ +#include #include #include #include @@ -1500,43 +1501,41 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); } -u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) +static u32 __get_barrier_crm_val(enum aarch64_insn_mb_type type) { - u32 opt; - u32 insn; - switch (type) { case AARCH64_INSN_MB_SY: - opt = 0xf; - break; + return 0xf; case AARCH64_INSN_MB_ST: - opt = 0xe; - break; + return 0xe; case AARCH64_INSN_MB_LD: - opt = 0xd; - break; + return 0xd; case AARCH64_INSN_MB_ISH: - opt = 0xb; - break; + return 0xb; case AARCH64_INSN_MB_ISHST: - opt = 0xa; - break; + return 0xa; case AARCH64_INSN_MB_ISHLD: - opt = 0x9; - break; + return 0x9; case AARCH64_INSN_MB_NSH: - opt = 0x7; - break; + return 0x7; case AARCH64_INSN_MB_NSHST: - opt = 0x6; - break; + return 0x6; case AARCH64_INSN_MB_NSHLD: - opt = 0x5; - break; + return 0x5; default: - pr_err("%s: unknown dmb type %d\n", __func__, type); + pr_err("%s: unknown barrier type %d\n", __func__, type); return AARCH64_BREAK_FAULT; } +} + +u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) +{ + u32 opt; + u32 insn; + + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; insn = aarch64_insn_get_dmb_value(); insn &= ~GENMASK(11, 8); @@ -1545,6 +1544,21 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) return insn; } +u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type) +{ + u32 opt, insn; + + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_get_dsb_base_value(); + insn &= ~GENMASK(11, 8); + insn |= (opt << 8); + + return insn; +} + u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, enum aarch64_insn_system_register sysreg) { From 9013dccd3cc9876fd7f27ae8f2dd86a16f41c9e3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 19 Aug 2024 14:15:53 +0100 Subject: [PATCH 053/353] arm64: proton-pack: Expose whether the platform is mitigated by firmware is_spectre_bhb_fw_affected() allows the caller to determine if the CPU is known to need a firmware mitigation. CPUs are either on the list of CPUs we know about, or firmware has been queried and reported that the platform is affected - and mitigated by firmware. This helper is not useful to determine if the platform is mitigated by firmware. A CPU could be on the know list, but the firmware may not be implemented. Its affected but not mitigated. spectre_bhb_enable_mitigation() handles this distinction by checking the firmware state before enabling the mitigation. Add a helper to expose this state. This will be used by the BPF JIT to determine if calling firmware for a mitigation is necessary and supported. Signed-off-by: James Morse Reviewed-by: Catalin Marinas --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index f1524cdeacf1c4..78fb720987fb45 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -97,6 +97,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index b607f6dfc5e6f6..355f1f3098a24d 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1094,6 +1094,11 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) update_mitigation_state(&spectre_bhb_state, state); } +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, From 7e0525e18c209068ecef81ba86b2ec593901b569 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 29 Apr 2025 13:55:17 +0100 Subject: [PATCH 054/353] arm64: proton-pack: Expose whether the branchy loop k value Add a helper to expose the k value of the branchy loop. This is needed by the BPF JIT to generate the mitigation sequence in BPF programs. Signed-off-by: James Morse Reviewed-by: Catalin Marinas --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 78fb720987fb45..bca12134245c6a 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -97,6 +97,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 get_spectre_bhb_loop_value(void); bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 355f1f3098a24d..3154094a9e33d4 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -999,6 +999,11 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, return true; } +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; +} + static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) { const char *v = arm64_get_bp_hardening_vector(slot); From 4b2d3e33a00a775018c75e6edd79818c574cbd16 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 9 Dec 2021 15:13:24 +0000 Subject: [PATCH 055/353] arm64: bpf: Add BHB mitigation to the epilogue for cBPF programs A malicious BPF program may manipulate the branch history to influence what the hardware speculates will happen next. On exit from a BPF program, emit the BHB mititgation sequence. This is only applied for 'classic' cBPF programs that are loaded by seccomp. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Acked-by: Daniel Borkmann --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 54 +++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index bca12134245c6a..8fef1262609011 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -97,6 +97,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +extern bool __nospectre_bhb; u8 get_spectre_bhb_loop_value(void); bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 3154094a9e33d4..4459b613077e93 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1021,7 +1021,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) isb(); } -static bool __read_mostly __nospectre_bhb; +bool __read_mostly __nospectre_bhb; static int __init parse_spectre_bhb_param(char *str) { __nospectre_bhb = true; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 70d7c89d3ac907..0ab8e47062d9a1 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "bpf_jit: " fmt +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -939,7 +941,48 @@ static void build_plt(struct jit_ctx *ctx) plt->target = (u64)&dummy_tramp; } -static void build_epilogue(struct jit_ctx *ctx) +/* Clobbers BPF registers 1-4, aka x0-x3 */ +static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) +{ + const u8 r1 = bpf2a64[BPF_REG_1]; /* aka x0 */ + u8 k = get_spectre_bhb_loop_value(); + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + cpu_mitigations_off() || __nospectre_bhb || + arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) + return; + + if (supports_clearbhb(SCOPE_SYSTEM)) { + emit(aarch64_insn_gen_hint(AARCH64_INSN_HINT_CLEARBHB), ctx); + return; + } + + if (k) { + emit_a64_mov_i64(r1, k, ctx); + emit(A64_B(1), ctx); + emit(A64_SUBS_I(true, r1, r1, 1), ctx); + emit(A64_B_(A64_COND_NE, -2), ctx); + emit(aarch64_insn_gen_dsb(AARCH64_INSN_MB_ISH), ctx); + emit(aarch64_insn_get_isb_value(), ctx); + } + + if (is_spectre_bhb_fw_mitigated()) { + emit(A64_ORR_I(false, r1, AARCH64_INSN_REG_ZR, + ARM_SMCCC_ARCH_WORKAROUND_3), ctx); + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + emit(aarch64_insn_get_hvc_value(), ctx); + break; + case SMCCC_CONDUIT_SMC: + emit(aarch64_insn_get_smc_value(), ctx); + break; + default: + pr_err_once("Firmware mitigation enabled with unknown conduit\n"); + } + } +} + +static void build_epilogue(struct jit_ctx *ctx, bool was_classic) { const u8 r0 = bpf2a64[BPF_REG_0]; const u8 ptr = bpf2a64[TCCNT_PTR]; @@ -952,10 +995,13 @@ static void build_epilogue(struct jit_ctx *ctx) emit(A64_POP(A64_ZR, ptr, A64_SP), ctx); + if (was_classic) + build_bhb_mitigation(ctx); + /* Restore FP/LR registers */ emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx); - /* Set return value */ + /* Move the return value from bpf:r0 (aka x7) to x0 */ emit(A64_MOV(1, A64_R(0), r0), ctx); /* Authenticate lr */ @@ -1898,7 +1944,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.epilogue_offset = ctx.idx; - build_epilogue(&ctx); + build_epilogue(&ctx, was_classic); build_plt(&ctx); extable_align = __alignof__(struct exception_table_entry); @@ -1961,7 +2007,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_free_hdr; } - build_epilogue(&ctx); + build_epilogue(&ctx, was_classic); build_plt(&ctx); /* Extra pass to validate JITed code. */ From 162365e268a4ee4da718c2b38109b63131cee483 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 29 Apr 2025 16:03:38 +0100 Subject: [PATCH 056/353] arm64: bpf: Only mitigate cBPF programs loaded by unprivileged users Support for eBPF programs loaded by unprivileged users is typically disabled. This means only cBPF programs need to be mitigated for BHB. In addition, only mitigate cBPF programs that were loaded by an unprivileged user. Privileged users can also load the same program via eBPF, making the mitigation pointless. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Acked-by: Daniel Borkmann --- arch/arm64/net/bpf_jit_comp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0ab8e47062d9a1..634d78422adb27 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -952,6 +952,9 @@ static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) return; + if (capable(CAP_SYS_ADMIN)) + return; + if (supports_clearbhb(SCOPE_SYSTEM)) { emit(aarch64_insn_gen_hint(AARCH64_INSN_HINT_CLEARBHB), ctx); return; From 55fe85dc41942070ed94d75ec3d8c7f649996137 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 12 Aug 2024 17:50:22 +0100 Subject: [PATCH 057/353] arm64: proton-pack: Add new CPUs 'k' values for branch mitigation Update the list of 'k' values for the branch mitigation from arm's website. Add the values for Cortex-X1C. The MIDR_EL1 value can be found here: https://developer.arm.com/documentation/101968/0002/Register-descriptions/AArch> Link: https://developer.arm.com/documentation/110280/2-0/?lang=en Signed-off-by: James Morse Reviewed-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index d1cc0571798bfa..dffff6763812f9 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,6 +81,7 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -168,6 +169,7 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 4459b613077e93..edf1783ffc8174 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -891,6 +891,7 @@ static u8 spectre_bhb_loop_affected(void) MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), From 2bf2383ffe3ca8ffffc4cb44f9c311e1803c26d0 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 24 Apr 2025 11:28:20 -0400 Subject: [PATCH 058/353] fs/xattr.c: fix simple_xattr_list to always include security.* xattrs The vfs has long had a fallback to obtain the security.* xattrs from the LSM when the filesystem does not implement its own listxattr, but shmem/tmpfs and kernfs later gained their own xattr handlers to support other xattrs. Unfortunately, as a side effect, tmpfs and kernfs-based filesystems like sysfs no longer return the synthetic security.* xattr names via listxattr unless they are explicitly set by userspace or initially set upon inode creation after policy load. coreutils has recently switched from unconditionally invoking getxattr for security.* for ls -Z via libselinux to only doing so if listxattr returns the xattr name, breaking ls -Z of such inodes. Before: $ getfattr -m.* /run/initramfs $ getfattr -m.* /sys/kernel/fscaps $ setfattr -n user.foo /run/initramfs $ getfattr -m.* /run/initramfs user.foo After: $ getfattr -m.* /run/initramfs security.selinux $ getfattr -m.* /sys/kernel/fscaps security.selinux $ setfattr -n user.foo /run/initramfs $ getfattr -m.* /run/initramfs security.selinux user.foo Link: https://lore.kernel.org/selinux/CAFqZXNtF8wDyQajPCdGn=iOawX4y77ph0EcfcqcUUj+T87FKyA@mail.gmail.com/ Link: https://lore.kernel.org/selinux/20250423175728.3185-2-stephen.smalley.work@gmail.com/ Signed-off-by: Stephen Smalley Link: https://lore.kernel.org/20250424152822.2719-1-stephen.smalley.work@gmail.com Fixes: b09e0fa4b4ea66266058ee ("tmpfs: implement generic xattr support") Signed-off-by: Christian Brauner --- fs/xattr.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/xattr.c b/fs/xattr.c index fabb2a04501ee7..8ec5b0204bfdc5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1428,6 +1428,15 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } +static bool xattr_is_maclabel(const char *name) +{ + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(suffix); +} + /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1460,6 +1469,17 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) return err; + err = security_inode_listsecurity(inode, buffer, remaining_size); + if (err < 0) + return err; + + if (buffer) { + if (remaining_size < err) + return -ERANGE; + buffer += err; + } + remaining_size -= err; + read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); @@ -1468,6 +1488,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!trusted && xattr_is_trusted(xattr->name)) continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; From fc6cd3a3be8146e9c360dd37c600e4ece983b1cd Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 29 Apr 2025 20:58:27 +0200 Subject: [PATCH 059/353] fs/eventpoll: fix endless busy loop after timeout has expired After commit 0a65bc27bd64 ("eventpoll: Set epoll timeout if it's in the future"), the following program would immediately enter a busy loop in the kernel: ``` int main() { int e = epoll_create1(0); struct epoll_event event = {.events = EPOLLIN}; epoll_ctl(e, EPOLL_CTL_ADD, 0, &event); const struct timespec timeout = {.tv_nsec = 1}; epoll_pwait2(e, &event, 1, &timeout, 0); } ``` This happens because the given (non-zero) timeout of 1 nanosecond usually expires before ep_poll() is entered and then ep_schedule_timeout() returns false, but `timed_out` is never set because the code line that sets it is skipped. This quickly turns into a soft lockup, RCU stalls and deadlocks, inflicting severe headaches to the whole system. When the timeout has expired, we don't need to schedule a hrtimer, but we should set the `timed_out` variable. Therefore, I suggest moving the ep_schedule_timeout() check into the `timed_out` expression instead of skipping it. brauner: Note that there was an earlier fix by Joe Damato in response to my bug report in [1]. Fixes: 0a65bc27bd64 ("eventpoll: Set epoll timeout if it's in the future") Cc: Joe Damato Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Link: https://lore.kernel.org/20250429153419.94723-1-jdamato@fastly.com [1] Link: https://lore.kernel.org/20250429185827.3564438-1-max.kellermann@ionos.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventpoll.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4bc264b854c436..d4dbffdedd08e3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2111,9 +2111,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); - if (!eavail && ep_schedule_timeout(to)) - timed_out = !schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS); + if (!eavail) + timed_out = !ep_schedule_timeout(to) || + !schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* From 2896e5b3d376aa4e1cc9ff909e4ee306ca73b269 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 5 May 2025 12:35:19 -0700 Subject: [PATCH 060/353] swapfile: disable swapon for bs > ps devices Devices which have a requirement for bs > ps cannot be supported for swap as swap still needs work. Now that the block device cache sets the min order for block devices we need this stop gap otherwise all swap operations are rejected. Without this you'll end up with errors on these devices as the swap code still needs much love to support min order. With this we at least now put a stop gap of its use, until the swap subsystem completes its major overhaul: mkswap: /dev/nvme3n1: warning: wiping old swap signature. Setting up swapspace version 1, size = 100 GiB (107374178304 bytes) no label, UUID=6af76b5c-7e7b-4902-b7f7-4c24dde6fa36 swapon: /dev/nvme3n1: swapon failed: Invalid argument Reviewed-by: Davidlohr Bueso Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/aBkS926thy9zvdZb@bombadil.infradead.org Signed-off-by: Christian Brauner --- mm/swapfile.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index f214843612dc32..412ccd6543b34d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3331,6 +3331,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } + /* + * The swap subsystem needs a major overhaul to support this. + * It doesn't work yet so just disable it for now. + */ + if (mapping_min_folio_order(mapping) > 0) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + /* * Read the swap header. */ From 3b03f834147814bcbeccf182294a5714e27aab43 Mon Sep 17 00:00:00 2001 From: Jeremy Bongio Date: Wed, 7 May 2025 12:30:10 +0000 Subject: [PATCH 061/353] fs: Remove redundant errseq_set call in mark_buffer_write_io_error. mark_buffer_write_io_error sets sb->s_wb_err to -EIO twice. Once in mapping_set_error and once in errseq_set. Only mapping_set_error checks if bh->b_assoc_map->host is NULL. Discovered during null pointer dereference during writeback to a failing device: [] ? mark_buffer_write_io_error+0x98/0xc0 [] ? mark_buffer_write_io_error+0x8e/0xc0 [] end_buffer_async_write+0x90/0xd0 [] end_bio_bh_io_sync+0x2b/0x40 [] blk_update_request+0x1b6/0x480 [] blk_mq_end_request+0x18/0x30 [] blk_mq_dispatch_rq_list+0x4da/0x8e0 [] __blk_mq_sched_dispatch_requests+0x218/0x6a0 [] blk_mq_sched_dispatch_requests+0x3a/0x80 [] blk_mq_run_hw_queue+0x108/0x330 [] blk_mq_flush_plug_list+0x178/0x5f0 [] __blk_flush_plug+0x41/0x120 [] blk_finish_plug+0x22/0x40 [] wb_writeback+0x150/0x280 [] ? set_worker_desc+0x9f/0xc0 [] wb_workfn+0x24e/0x4a0 Fixes: 485e9605c0573 ("fs/buffer.c: record blockdev write errors in super_block that it backs") Signed-off-by: Jeremy Bongio Link: https://lore.kernel.org/20250507123010.1228243-1-jbongio@google.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 7be23ff20b2733..7ba1807145aa8b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1220,10 +1220,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) { + if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); - errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); - } } EXPORT_SYMBOL(mark_buffer_write_io_error); From ba845c45bc8bd6c480c3f6422335ba6c15880906 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 May 2025 11:49:41 +0200 Subject: [PATCH 062/353] udf: Make sure i_lenExtents is uptodate on inode eviction UDF maintains total length of all extents in i_lenExtents. Generally we keep extent lengths (and thus i_lenExtents) block aligned because it makes the file appending logic simpler. However the standard mandates that the inode size must match the length of all extents and thus we trim the last extent when closing the file. To catch possible bugs we also verify that i_lenExtents matches i_size when evicting inode from memory. Commit b405c1e58b73 ("udf: refactor udf_next_aext() to handle error") however broke the code updating i_lenExtents and thus udf_evict_inode() ended up spewing lots of errors about incorrectly sized extents although the extents were actually sized properly. Fix the updating of i_lenExtents to silence the errors. Fixes: b405c1e58b73 ("udf: refactor udf_next_aext() to handle error") CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4f33a4a4888613..b4071c9cf8c951 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode) } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ - if (ret == 0) + if (ret >= 0) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } From d4e0e0e530716a6d0d7dd7bad1d4b6bd0d7c68cc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 23 Apr 2025 08:18:44 -0500 Subject: [PATCH 063/353] drivers/platform/x86/amd: pmf: Check for invalid sideloaded Smart PC Policies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a policy is passed into amd_pmf_get_pb_data() that causes the engine to fail to start there is a memory leak. Free the memory in this failure path. Fixes: 10817f28e5337 ("platform/x86/amd/pmf: Add capability to sideload of policy binary") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250423132002.3984997-2-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/tee-if.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 14b99d8b63d2fc..e008ac079fade3 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -364,9 +364,14 @@ static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, amd_pmf_hex_dump_pb(dev); ret = amd_pmf_start_policy_engine(dev); if (ret < 0) - return ret; + goto cleanup; return length; + +cleanup: + kfree(dev->policy_buf); + dev->policy_buf = NULL; + return ret; } static const struct file_operations pb_fops = { From 39387f15387da9cb6a377ea2a8f6675da11a172c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 23 Apr 2025 08:18:45 -0500 Subject: [PATCH 064/353] drivers/platform/x86/amd: pmf: Check for invalid Smart PC Policies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 376a8c2a14439 ("platform/x86/amd/pmf: Update PMF Driver for Compatibility with new PMF-TA") added support for platforms that support an updated TA, however it also exposed a number of platforms that although they have support for the updated TA don't actually populate a policy binary. Add an explicit check that the policy binary isn't empty before initializing the TA. Reported-by: Christian Heusel Closes: https://lore.kernel.org/platform-driver-x86/ae644428-5bf2-4b30-81ba-0b259ed3449b@heusel.eu/ Fixes: 376a8c2a14439 ("platform/x86/amd/pmf: Update PMF Driver for Compatibility with new PMF-TA") Signed-off-by: Mario Limonciello Tested-by: Christian Heusel Link: https://lore.kernel.org/r/20250423132002.3984997-3-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/tee-if.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index e008ac079fade3..d3bd12ad036ae0 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -334,6 +334,11 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) return 0; } +static inline bool amd_pmf_pb_valid(struct amd_pmf_dev *dev) +{ + return memchr_inv(dev->policy_buf, 0xff, dev->policy_sz); +} + #ifdef CONFIG_AMD_PMF_DEBUG static void amd_pmf_hex_dump_pb(struct amd_pmf_dev *dev) { @@ -361,6 +366,11 @@ static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, dev->policy_buf = new_policy_buf; dev->policy_sz = length; + if (!amd_pmf_pb_valid(dev)) { + ret = -EINVAL; + goto cleanup; + } + amd_pmf_hex_dump_pb(dev); ret = amd_pmf_start_policy_engine(dev); if (ret < 0) @@ -533,6 +543,12 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) memcpy_fromio(dev->policy_buf, dev->policy_base, dev->policy_sz); + if (!amd_pmf_pb_valid(dev)) { + dev_info(dev->dev, "No Smart PC policy present\n"); + ret = -EINVAL; + goto err_free_policy; + } + amd_pmf_hex_dump_pb(dev); dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL); From 638d0db3df9df61660b2da56445eade029ea09ca Mon Sep 17 00:00:00 2001 From: Suma Hegde Date: Fri, 25 Apr 2025 10:23:57 +0000 Subject: [PATCH 065/353] platform/x86/amd/hsmp: Make amd_hsmp and hsmp_acpi as mutually exclusive drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit amd_hsmp and hsmp_acpi are intended to be mutually exclusive drivers and amd_hsmp is for legacy platforms. To achieve this, it is essential to check for the presence of the ACPI device in plat.c. If the hsmp ACPI device entry is found, allow the hsmp_acpi driver to manage the hsmp and return an error from plat.c. Additionally, rename the driver from amd_hsmp to hsmp_acpi to prevent "Driver 'amd_hsmp' is already registered, aborting..." error in case both drivers are loaded simultaneously. Also, support both platform device based and ACPI based probing for family 0x1A models 0x00 to 0x0F, implement only ACPI based probing for family 0x1A, models 0x10 to 0x1F. Return false from legacy_hsmp_support() for this platform. This aligns with the condition check in is_f1a_m0h(). Link: https://lore.kernel.org/platform-driver-x86/aALZxvHWmphNL1wa@gourry-fedora-PF4VCD3F/ Fixes: 7d3135d16356 ("platform/x86/amd/hsmp: Create separate ACPI, plat and common drivers") Reviewed-by: Naveen Krishna Chatradhi Co-developed-by: Gregory Price Signed-off-by: Gregory Price Signed-off-by: Suma Hegde Link: https://lore.kernel.org/r/20250425102357.266790-1-suma.hegde@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/hsmp/acpi.c | 3 +-- drivers/platform/x86/amd/hsmp/hsmp.h | 1 + drivers/platform/x86/amd/hsmp/plat.c | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/amd/hsmp/acpi.c b/drivers/platform/x86/amd/hsmp/acpi.c index c1eccb3c80c5c9..eaae044e4f824c 100644 --- a/drivers/platform/x86/amd/hsmp/acpi.c +++ b/drivers/platform/x86/amd/hsmp/acpi.c @@ -27,9 +27,8 @@ #include "hsmp.h" -#define DRIVER_NAME "amd_hsmp" +#define DRIVER_NAME "hsmp_acpi" #define DRIVER_VERSION "2.3" -#define ACPI_HSMP_DEVICE_HID "AMDI0097" /* These are the strings specified in ACPI table */ #define MSG_IDOFF_STR "MsgIdOffset" diff --git a/drivers/platform/x86/amd/hsmp/hsmp.h b/drivers/platform/x86/amd/hsmp/hsmp.h index af8b21f821d668..d58d4f0c20d552 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.h +++ b/drivers/platform/x86/amd/hsmp/hsmp.h @@ -23,6 +23,7 @@ #define HSMP_CDEV_NAME "hsmp_cdev" #define HSMP_DEVNODE_NAME "hsmp" +#define ACPI_HSMP_DEVICE_HID "AMDI0097" struct hsmp_mbaddr_info { u32 base_addr; diff --git a/drivers/platform/x86/amd/hsmp/plat.c b/drivers/platform/x86/amd/hsmp/plat.c index b9782a078dbd2f..81931e808bbc81 100644 --- a/drivers/platform/x86/amd/hsmp/plat.c +++ b/drivers/platform/x86/amd/hsmp/plat.c @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -266,7 +267,7 @@ static bool legacy_hsmp_support(void) } case 0x1A: switch (boot_cpu_data.x86_model) { - case 0x00 ... 0x1F: + case 0x00 ... 0x0F: return true; default: return false; @@ -288,6 +289,9 @@ static int __init hsmp_plt_init(void) return ret; } + if (acpi_dev_present(ACPI_HSMP_DEVICE_HID, NULL, -1)) + return -ENODEV; + hsmp_pdev = get_hsmp_pdev(); if (!hsmp_pdev) return -ENOMEM; From 0ec93bd8baaa755144370ff8fb80516388d8218e Mon Sep 17 00:00:00 2001 From: John Chau Date: Mon, 5 May 2025 01:55:13 +0900 Subject: [PATCH 066/353] platform/x86: thinkpad_acpi: Support also NEC Lavie X1475JAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change get_thinkpad_model_data() to check for additional vendor name "NEC" in order to support NEC Lavie X1475JAS notebook (and perhaps more). The reason of this works with minimal changes is because NEC Lavie X1475JAS is a Thinkpad inside. ACPI dumps reveals its OEM ID to be "LENOVO", BIOS version "R2PET30W" matches typical Lenovo BIOS version, the existence of HKEY of LEN0268, with DMI fw string is "R2PHT24W". I compiled and tested with my own machine, attached the dmesg below as proof of work: [ 6.288932] thinkpad_acpi: ThinkPad ACPI Extras v0.26 [ 6.288937] thinkpad_acpi: http://ibm-acpi.sf.net/ [ 6.288938] thinkpad_acpi: ThinkPad BIOS R2PET30W (1.11 ), EC R2PHT24W [ 6.307000] thinkpad_acpi: radio switch found; radios are enabled [ 6.307030] thinkpad_acpi: This ThinkPad has standard ACPI backlight brightness control, supported by the ACPI video driver [ 6.307033] thinkpad_acpi: Disabling thinkpad-acpi brightness events by default... [ 6.320322] thinkpad_acpi: rfkill switch tpacpi_bluetooth_sw: radio is unblocked [ 6.371963] thinkpad_acpi: secondary fan control detected & enabled [ 6.391922] thinkpad_acpi: battery 1 registered (start 0, stop 85, behaviours: 0x7) [ 6.398375] input: ThinkPad Extra Buttons as /devices/platform/thinkpad_acpi/input/input13 Signed-off-by: John Chau Link: https://lore.kernel.org/r/20250504165513.295135-1-johnchau@0atlas.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/thinkpad_acpi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 5790095c175e6f..92b21e49faf62b 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -11478,6 +11478,8 @@ static int __must_check __init get_thinkpad_model_data( tp->vendor = PCI_VENDOR_ID_IBM; else if (dmi_name_in_vendors("LENOVO")) tp->vendor = PCI_VENDOR_ID_LENOVO; + else if (dmi_name_in_vendors("NEC")) + tp->vendor = PCI_VENDOR_ID_LENOVO; else return 0; From 3d5f8c64cf830b0b5d785564039a12b746509c38 Mon Sep 17 00:00:00 2001 From: Runhua He Date: Wed, 7 May 2025 18:01:03 +0800 Subject: [PATCH 067/353] platform/x86/amd/pmc: Declare quirk_spurious_8042 for MECHREVO Wujie 14XA (GX4HRXL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MECHREVO Wujie 14XA (GX4HRXL) wakes up immediately after s2idle entry. This happens regardless of whether the laptop is plugged into AC power, or whether any peripheral is plugged into the laptop. Similar to commit a55bdad5dfd1 ("platform/x86/amd/pmc: Disable keyboard wakeup on AMD Framework 13"), the MECHREVO Wujie 14XA wakes up almost instantly after s2idle suspend entry (IRQ1 is the keyboard): 2025-04-18 17:23:57,588 DEBUG: PM: Triggering wakeup from IRQ 9 2025-04-18 17:23:57,588 DEBUG: PM: Triggering wakeup from IRQ 1 Add this model to the spurious_8042 quirk to workaround this. This patch does not affect the wake-up function of the built-in keyboard. Because the firmware of this machine adds an insurance for keyboard wake-up events, as it always triggers an additional IRQ 9 to wake up the system. Suggested-by: Mingcong Bai Suggested-by: Xinhui Yang Suggested-by: Rong Zhang Fixes: a55bdad5dfd1 ("platform/x86/amd/pmc: Disable keyboard wakeup on AMD Framework 13") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4166 Cc: Mario Limonciello Link: https://zhuanldan.zhihu.com/p/730538041 Tested-by: Yemu Lu Signed-off-by: Runhua He Link: https://lore.kernel.org/r/20250507100103.995395-1-hua@aosc.io Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index b4f49720c87f62..2e3f6fc67c568d 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -217,6 +217,13 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BIOS_VERSION, "03.05"), } }, + { + .ident = "MECHREVO Wujie 14X (GX4HRXL)", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"), + } + }, {} }; From 9a509cabe2b2490465a2a224f505dabaab8ba517 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 1 May 2025 15:17:02 +0200 Subject: [PATCH 068/353] platform/x86: asus-wmi: Fix wlan_ctrl_by_user detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wlan_ctrl_by_user detection was introduced by commit a50bd128f28c ("asus-wmi: record wlan status while controlled by userapp"). Quoting from that commit's commit message: """ When you call WMIMethod(DSTS, 0x00010011) to get WLAN status, it may return (1) 0x00050001 (On) (2) 0x00050000 (Off) (3) 0x00030001 (On) (4) 0x00030000 (Off) (5) 0x00000002 (Unknown) (1), (2) means that the model has hardware GPIO for WLAN, you can call WMIMethod(DEVS, 0x00010011, 1 or 0) to turn WLAN on/off. (3), (4) means that the model doesn’t have hardware GPIO, you need to use API or driver library to turn WLAN on/off, and call WMIMethod(DEVS, 0x00010012, 1 or 0) to set WLAN LED status. After you set WLAN LED status, you can see the WLAN status is changed with WMIMethod(DSTS, 0x00010011). Because the status is recorded lastly (ex: Windows), you can use it for synchronization. (5) means that the model doesn’t have WLAN device. WLAN is the ONLY special case with upper rule. """ The wlan_ctrl_by_user flag should be set on 0x0003000? ((3), (4) above) return values, but the flag mistakenly also gets set on laptops with 0x0005000? ((1), (2)) return values. This is causing rfkill problems on laptops where 0x0005000? is returned. Fix the check to only set the wlan_ctrl_by_user flag for 0x0003000? return values. Fixes: a50bd128f28c ("asus-wmi: record wlan status while controlled by userapp") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219786 Signed-off-by: Hans de Goede Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20250501131702.103360-2-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 0c697b46f436a3..47cc766624d7bb 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -4779,7 +4779,8 @@ static int asus_wmi_add(struct platform_device *pdev) goto fail_leds; asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_WLAN, &result); - if (result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) + if ((result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) == + (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) asus->driver->wlan_ctrl_by_user = 1; if (!(asus->driver->wlan_ctrl_by_user && ashs_present())) { From 8a48385f9c0b70a39e9a07acefa4d482004edb54 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 May 2025 15:24:13 -0400 Subject: [PATCH 069/353] cgroup/cpuset: Extend kthread_is_per_cpu() check to all PF_NO_SETAFFINITY tasks Commit ec5fbdfb99d1 ("cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset") enabled us to pull CPUs dedicated to child partitions from tasks in top_cpuset by ignoring per cpu kthreads. However, there can be other kthreads that are not per cpu but have PF_NO_SETAFFINITY flag set to indicate that we shouldn't mess with their CPU affinity. For other kthreads, their affinity will be changed to skip CPUs dedicated to child partitions whether it is an isolating or a scheduling one. As all the per cpu kthreads have PF_NO_SETAFFINITY set, the PF_NO_SETAFFINITY tasks are essentially a superset of per cpu kthreads. Fix this issue by dropping the kthread_is_per_cpu() check and checking the PF_NO_SETAFFINITY flag instead. Fixes: ec5fbdfb99d1 ("cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset") Signed-off-by: Waiman Long Acked-by: Frederic Weisbecker Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 306b6043009145..24b70ea3e6ce97 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1116,9 +1116,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). */ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { From 7d7f2a91dfbf00886f73f1549a96ed94d1b20e16 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 22 Apr 2025 10:26:32 +0200 Subject: [PATCH 070/353] sched_ext: Track currently locked rq Some kfuncs provided by sched_ext may need to operate on a struct rq, but they can be invoked from various contexts, specifically, different scx callbacks. While some of these callbacks are invoked with a particular rq already locked, others are not. This makes it impossible for a kfunc to reliably determine whether it's safe to access a given rq, triggering potential bugs or unsafe behaviors, see for example [1]. To address this, track the currently locked rq whenever a sched_ext callback is invoked via SCX_CALL_OP*(). This allows kfuncs that need to operate on an arbitrary rq to retrieve the currently locked one and apply the appropriate action as needed. [1] https://lore.kernel.org/lkml/20250325140021.73570-1-arighi@nvidia.com/ Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 152 +++++++++++++++++++++++++--------------- kernel/sched/ext_idle.c | 2 +- 2 files changed, 95 insertions(+), 59 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fdbf249d1c68ec..585bf6d8238b66 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1118,8 +1118,38 @@ static void scx_kf_disallow(u32 mask) current->scx.kf_mask &= ~mask; } -#define SCX_CALL_OP(mask, op, args...) \ +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +static DEFINE_PER_CPU(struct rq *, locked_rq); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. + */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(locked_rq, rq); +} + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(locked_rq); +} + +#define SCX_CALL_OP(mask, op, rq, args...) \ do { \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ scx_ops.op(args); \ @@ -1127,11 +1157,14 @@ do { \ } else { \ scx_ops.op(args); \ } \ + update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(mask, op, args...) \ +#define SCX_CALL_OP_RET(mask, op, rq, args...) \ ({ \ __typeof__(scx_ops.op(args)) __ret; \ + \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ __ret = scx_ops.op(args); \ @@ -1139,6 +1172,7 @@ do { \ } else { \ __ret = scx_ops.op(args); \ } \ + update_locked_rq(NULL); \ __ret; \ }) @@ -1153,31 +1187,31 @@ do { \ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on * the specific task. */ -#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \ do { \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP(mask, op, task, ##args); \ + SCX_CALL_OP(mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \ ({ \ __typeof__(scx_ops.op(task, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \ ({ \ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -2172,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2269,7 +2303,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags add_nr_running(rq, 1); if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -2283,7 +2317,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } -static void ops_dequeue(struct task_struct *p, u64 deq_flags) +static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { unsigned long opss; @@ -2304,7 +2338,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) BUG(); case SCX_OPSS_QUEUED: if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -2337,7 +2371,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags return true; } - ops_dequeue(p, deq_flags); + ops_dequeue(rq, p, deq_flags); /* * A currently running task which is going off @rq first gets dequeued @@ -2353,11 +2387,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags */ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false); } if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -2377,7 +2411,7 @@ static void yield_task_scx(struct rq *rq) struct task_struct *p = rq->curr; if (SCX_HAS_OP(yield)) - SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL); else p->scx.slice = 0; } @@ -2387,7 +2421,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) struct task_struct *from = rq->curr; if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to); else return false; } @@ -2945,7 +2979,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * emitted in switch_class(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2990,7 +3024,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq), prev_on_scx ? prev : NULL); flush_dispatch_buf(rq); @@ -3104,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler. */ - ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } @@ -3112,7 +3146,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p); clr_task_runnable(p, true); @@ -3193,8 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) .task = next, }; - SCX_CALL_OP(SCX_KF_CPU_RELEASE, - cpu_release, cpu_of(rq), &args); + SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -3207,7 +3240,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -3348,7 +3381,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, * verifier. */ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL, (struct task_struct *)a, (struct task_struct *)b); else @@ -3385,7 +3418,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag *ddsp_taskp = p; cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, p, prev_cpu, wake_flags); + select_cpu, NULL, p, prev_cpu, wake_flags); p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) @@ -3430,8 +3463,8 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) @@ -3444,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, rq, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, rq, cpu); else scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "cpu %d going %s, exiting scheduler", cpu, @@ -3550,7 +3583,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr); } if (!curr->scx.slice) @@ -3627,7 +3660,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool .fork = fork, }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args); if (unlikely(ret)) { ret = ops_sanitize_err("init_task", ret); return ret; @@ -3668,9 +3701,10 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool static void scx_ops_enable_task(struct task_struct *p) { + struct rq *rq = task_rq(p); u32 weight; - lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_rq_held(rq); /* * Set the weight before calling ops.enable() so that the scheduler @@ -3684,20 +3718,22 @@ static void scx_ops_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); if (SCX_HAS_OP(enable)) - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p); scx_set_task_state(p, SCX_TASK_ENABLED); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); } static void scx_ops_disable_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -3726,7 +3762,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3835,7 +3871,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) @@ -3851,8 +3887,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) * different scheduler class. Keep the BPF scheduler up-to-date. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -3913,7 +3949,7 @@ int scx_tg_online(struct task_group *tg) struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); if (ret) ret = ops_sanitize_err("cgroup_init", ret); @@ -3935,7 +3971,7 @@ void scx_tg_offline(struct task_group *tg) percpu_down_read(&scx_cgroup_rwsem); if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); @@ -3968,7 +4004,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) continue; if (SCX_HAS_OP(cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -3982,8 +4018,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) err: cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -4001,8 +4037,8 @@ void scx_cgroup_move_task(struct task_struct *p) * cgrp_moving_from set. */ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, - p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; } @@ -4021,8 +4057,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } out_unlock: @@ -4035,7 +4071,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); tg->scx_weight = weight; } @@ -4224,7 +4260,7 @@ static void scx_cgroup_exit(void) continue; rcu_read_unlock(); - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); rcu_read_lock(); css_put(css); @@ -4261,7 +4297,7 @@ static int scx_cgroup_init(void) continue; rcu_read_unlock(); - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { css_put(css); @@ -4758,7 +4794,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } if (scx_ops.exit) - SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei); cancel_delayed_work_sync(&scx_watchdog_work); @@ -4965,7 +5001,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, if (SCX_HAS_OP(dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -5012,7 +5048,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (SCX_HAS_OP(dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx); ops_dump_exit(); } @@ -5069,7 +5105,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) used = seq_buf_used(&ns); if (SCX_HAS_OP(dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle); ops_dump_exit(); } @@ -5328,7 +5364,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_idle_enable(ops); if (scx_ops.init) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL); if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index cb343ca889e025..e67a19a071c114 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -674,7 +674,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) * managed by put_prev_task_idle()/set_next_task_idle(). */ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); /* * Update the idle masks: From 81e365c67f505c0bb63818f628fe741411686383 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 22 Apr 2025 10:26:33 +0200 Subject: [PATCH 071/353] sched_ext: Fix missing rq lock in scx_bpf_cpuperf_set() scx_bpf_cpuperf_set() can be used to set a performance target level on any CPU. However, it doesn't correctly acquire the corresponding rq lock, which may lead to unsafe behavior and trigger the following warning, due to the lockdep_assert_rq_held() check: [ 51.713737] WARNING: CPU: 3 PID: 3899 at kernel/sched/sched.h:1512 scx_bpf_cpuperf_set+0x1a0/0x1e0 ... [ 51.713836] Call trace: [ 51.713837] scx_bpf_cpuperf_set+0x1a0/0x1e0 (P) [ 51.713839] bpf_prog_62d35beb9301601f_bpfland_init+0x168/0x440 [ 51.713841] bpf__sched_ext_ops_init+0x54/0x8c [ 51.713843] scx_ops_enable.constprop.0+0x2c0/0x10f0 [ 51.713845] bpf_scx_reg+0x18/0x30 [ 51.713847] bpf_struct_ops_link_create+0x154/0x1b0 [ 51.713849] __sys_bpf+0x1934/0x22a0 Fix by properly acquiring the rq lock when possible or raising an error if we try to operate on a CPU that is not the one currently locked. Fixes: d86adb4fc0655 ("sched_ext: Add cpuperf support") Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 585bf6d8238b66..ac79067dc87e65 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7113,13 +7113,32 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } if (ops_cpu_valid(cpu, NULL)) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_ops_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } } From 144e285d509ca51f5ed42b9e6e3fc438c84d1161 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Apr 2025 19:57:02 +0200 Subject: [PATCH 072/353] sched_ext: Remove duplicate BTF_ID_FLAGS definitions Some kfuncs specific to the idle CPU selection policy are registered in both the scx_kfunc_ids_any and scx_kfunc_ids_idle blocks, even though they should only be defined in the latter. Remove the duplicates from scx_kfunc_ids_any. Fixes: 337d1b354a297 ("sched_ext: Move built-in idle CPU selection policy to a separate file") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ac79067dc87e65..dddb0af36f8daf 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7369,12 +7369,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) From d716ff789bef3ef6914b61b8b2b5adbea1b65820 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 28 Apr 2025 23:43:20 +0200 Subject: [PATCH 073/353] sched_ext: Fix rq lock state in hotplug ops The ops.cpu_online() and ops.cpu_offline() callbacks incorrectly assume that the rq involved in the operation is locked, which is not the case during hotplug, triggering the following warning: WARNING: CPU: 1 PID: 20 at kernel/sched/sched.h:1504 handle_hotplug+0x280/0x340 Fix by not tracking the target rq as locked in the context of ops.cpu_online() and ops.cpu_offline(). Fixes: 18853ba782bef ("sched_ext: Track currently locked rq") Reported-by: Tejun Heo Signed-off-by: Andrea Righi Tested-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dddb0af36f8daf..4e37b40ce280cb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3477,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, rq, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, rq, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); else scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "cpu %d going %s, exiting scheduler", cpu, From 6f1b02d7415b08b00fbfaf8b55d30ec198d3e43e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 May 2025 11:30:39 -1000 Subject: [PATCH 074/353] sched_ext: bpf_iter_scx_dsq_new() should always initialize iterator BPF programs may call next() and destroy() on BPF iterators even after new() returns an error value (e.g. bpf_for_each() macro ignores error returns from new()). bpf_iter_scx_dsq_new() could leave the iterator in an uninitialized state after an error return causing bpf_iter_scx_dsq_next() to dereference garbage data. Make bpf_iter_scx_dsq_new() always clear $kit->dsq so that next() and destroy() become noops. Signed-off-by: Tejun Heo Fixes: 650ba21b131e ("sched_ext: Implement DSQ iterator") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Andrea Righi --- kernel/sched/ext.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4e37b40ce280cb..f5133249fd4d92 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6827,6 +6827,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. + */ + kit->dsq = NULL; + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; From 70e699c80bc6d95ef5d2f419dfb3945bd2d659d3 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 10 Apr 2025 05:22:21 -0700 Subject: [PATCH 075/353] tracing: fprobe: Fix RCU warning message in list traversal When CONFIG_PROVE_RCU_LIST is enabled, fprobe triggers the following warning: WARNING: suspicious RCU usage kernel/trace/fprobe.c:457 RCU-list traversed in non-reader section!! other info that might help us debug this: #1: ffffffff863c4e08 (fprobe_mutex){+.+.}-{4:4}, at: fprobe_module_callback+0x7b/0x8c0 Call Trace: fprobe_module_callback notifier_call_chain blocking_notifier_call_chain This warning occurs because fprobe_remove_node_in_module() traverses an RCU list using RCU primitives without holding an RCU read lock. However, the function is only called from fprobe_module_callback(), which holds the fprobe_mutex lock that provides sufficient protection for safely traversing the list. Fix the warning by specifying the locking design to the CONFIG_PROVE_RCU_LIST mechanism. Add the lockdep_is_held() argument to hlist_for_each_entry_rcu() to inform the RCU checker that fprobe_mutex provides the required protection. Link: https://lore.kernel.org/all/20250410-fprobe-v1-1-068ef5f41436@debian.org/ Fixes: a3dc2983ca7b90 ("tracing: fprobe: Cleanup fprobe hash when module unloading") Signed-off-by: Breno Leitao Tested-by: Antonio Quartulli Tested-by: Matthieu Baerts (NGI0) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76b5..ba7ff14f5339b5 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -454,7 +454,8 @@ static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head * struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist) { + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { if (!within_module(node->addr, mod)) continue; if (delete_fprobe_node(node)) From be657a6e8392f87a65aa671744cf64cab3e4a443 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Sun, 4 May 2025 20:27:52 +0200 Subject: [PATCH 076/353] tracing: add missing trace_probe_log_clear for eprobes Make sure trace_probe_log_clear is called in the tracing eprobe code path, matching the trace_probe_log_init call. Link: https://lore.kernel.org/all/20250504-fix-trace-probe-log-race-v3-1-9e99fec7eddc@gmail.com/ Signed-off-by: Paul Cacheux Acked-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32b4..916555f0de811f 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } From e27ba0bc31408b615edce5bef000968d5505a5d5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 10 May 2025 12:44:41 +0900 Subject: [PATCH 077/353] tracing: probes: Fix a possible race in trace_probe_log APIs Since the shared trace_probe_log variable can be accessed and modified via probe event create operation of kprobe_events, uprobe_events, and dynamic_events, it should be protected. In the dynamic_events, all operations are serialized by `dyn_event_ops_mutex`. But kprobe_events and uprobe_events interfaces are not serialized. To solve this issue, introduces dyn_event_create(), which runs create() operation under the mutex, for kprobe_events and uprobe_events. This also uses lockdep to check the mutex is held when using trace_probe_log* APIs. Link: https://lore.kernel.org/all/174684868120.551552.3068655787654268804.stgit@devnote2/ Reported-by: Paul Cacheux Closes: https://lore.kernel.org/all/20250510074456.805a16872b591e2971a4d221@kernel.org/ Fixes: ab105a4fb894 ("tracing: Use tracing error_log with probe events") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_dynevent.c | 16 +++++++++++++++- kernel/trace/trace_dynevent.h | 1 + kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 9 +++++++++ kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a503..5d64a18cacacc6 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3e7..beee3f8d754444 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2703b96d899064..3e5c47b6d7b290 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2eeecb6c95eea5..424751cdf31f9f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ static const struct fetch_type *find_fetch_type(const char *type, unsigned long } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3386439ec9f674..35cf76c75dd766 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } From f89e6267eb28294a49b1b36c47958a8b92701c13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 19:58:39 -0700 Subject: [PATCH 078/353] x86/its: Fix build errors when CONFIG_MODULES=n Fix several build errors when CONFIG_MODULES=n, including the following: ../arch/x86/kernel/alternative.c:195:25: error: incomplete definition of type 'struct module' 195 | for (int i = 0; i < mod->its_num_pages; i++) { Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Acked-by: Dave Hansen Tested-by: Steven Rostedt (Google) Reviewed-by: Alexandre Chartre Signed-off-by: Linus Torvalds --- arch/x86/kernel/alternative.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 48fd04e9011483..45bcff181cbae9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -133,7 +133,9 @@ static bool cfi_paranoid __ro_after_init; #ifdef CONFIG_MITIGATION_ITS +#ifdef CONFIG_MODULES static struct module *its_mod; +#endif static void *its_page; static unsigned int its_offset; @@ -171,6 +173,7 @@ static void *its_init_thunk(void *thunk, int reg) return thunk + offset; } +#ifdef CONFIG_MODULES void its_init_mod(struct module *mod) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) @@ -209,6 +212,7 @@ void its_free_mod(struct module *mod) } kfree(mod->its_page_array); } +#endif /* CONFIG_MODULES */ static void *its_alloc(void) { @@ -217,6 +221,7 @@ static void *its_alloc(void) if (!page) return NULL; +#ifdef CONFIG_MODULES if (its_mod) { void *tmp = krealloc(its_mod->its_page_array, (its_mod->its_num_pages+1) * sizeof(void *), @@ -229,6 +234,7 @@ static void *its_alloc(void) execmem_make_temp_rw(page, PAGE_SIZE); } +#endif /* CONFIG_MODULES */ return no_free_ptr(page); } From d687af4385e8c0b56f77d7dafcc990b545e48290 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Sat, 29 Mar 2025 09:50:17 -0700 Subject: [PATCH 079/353] arm64: dts: rockchip: Allow Turing RK1 cooling fan to spin down The RK3588 thermal sensor driver only receives interrupts when a higher-temperature threshold is crossed; it cannot notify when the sensor cools back off. As a result, the driver must poll for temperature changes to detect when the conditions for a thermal trip are no longer met. However, it only does so if the DT enables polling. Before this patch, the RK1 DT did not enable polling, causing the fan to continue running at the speed corresponding to the highest temperature reached. Follow suit with similar RK3588 boards by setting a polling-delay of 1000ms, enabling the driver to detect when the sensor cools back off, allowing the fan speed to decrease as appropriate. Fixes: 7c8ec5e6b9d6 ("arm64: dts: rockchip: Enable automatic fan control on Turing RK1") Cc: stable@kernel.org # v6.13+ Signed-off-by: Sam Edwards Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20250329165017.3885-1-CFSworks@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi index 711ac4f2c7cb66..60ad272982ad51 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi @@ -214,6 +214,8 @@ }; &package_thermal { + polling-delay = <1000>; + trips { package_active1: trip-active1 { temperature = <45000>; From 5db37f674d1d4756960c945bccc4fa33d5baa8ae Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Mon, 24 Mar 2025 12:00:43 +0100 Subject: [PATCH 080/353] arm64: dts: rockchip: Remove overdrive-mode OPPs from RK3588J SoC dtsi The differences in the vendor-approved CPU and GPU OPPs for the standard Rockchip RK3588 variant [1] and the industrial Rockchip RK3588J variant [2] come from the latter, presumably, supporting an extended temperature range that's usually associated with industrial applications, despite the two SoC variant datasheets specifying the same upper limit for the allowed ambient temperature for both variants. However, the lower temperature limit is specified much lower for the RK3588J variant. [1][2] To be on the safe side and to ensure maximum longevity of the RK3588J SoCs, only the CPU and GPU OPPs that are declared by the vendor to be always safe for this SoC variant may be provided. As explained by the vendor [3] and according to the RK3588J datasheet, [2] higher-frequency/higher-voltage CPU and GPU OPPs can be used as well, but at the risk of reducing the SoC lifetime expectancy. Presumably, using the higher OPPs may be safe only when not enjoying the assumed extended temperature range that the RK3588J, as an SoC variant targeted specifically at higher-temperature, industrial applications, is made (or binned) for. Anyone able to keep their RK3588J-based board outside the above-presumed extended temperature range at all times, and willing to take the associated risk of possibly reducing the SoC lifetime expectancy, is free to apply a DT overlay that adds the higher CPU and GPU OPPs. With all this and the downstream RK3588(J) DT definitions [4][5] in mind, let's delete the RK3588J CPU and GPU OPPs that are not considered belonging to the normal operation mode for this SoC variant. To quote the RK3588J datasheet [2], "normal mode means the chipset works under safety voltage and frequency; for the industrial environment, highly recommend to keep in normal mode, the lifetime is reasonably guaranteed", while "overdrive mode brings higher frequency, and the voltage will increase accordingly; under the overdrive mode for a long time, the chipset may shorten the lifetime, especially in high-temperature condition". To sum the RK3588J datasheet [2] and the vendor-provided DTs up, [4][5] the maximum allowed CPU core, GPU and NPU frequencies are as follows: IP core | Normal mode | Overdrive mode ------------+-------------+---------------- Cortex-A55 | 1,296 MHz | 1,704 MHz Cortex-A76 | 1,608 MHz | 2,016 MHz GPU | 700 MHz | 850 MHz NPU | 800 MHz | 950 MHz Unfortunately, when it comes to the actual voltages for the RK3588J CPU and GPU OPPs, there's a discrepancy between the RK3588J datasheet [2] and the downstream kernel code. [4][5] The RK3588J datasheet states that "the max. working voltage of CPU/GPU/NPU is 0.75 V under the normal mode", while the downstream kernel code actually allows voltage ranges that go up to 0.95 V, which is still within the voltage range allowed by the datasheet. However, the RK3588J datasheet also tells us to "strictly refer to the software configuration of SDK and the hardware reference design", so let's embrace the voltage ranges provided by the downstream kernel code, which also prevents the undesirable theoretical outcome of ending up with no usable OPPs on a particular board, as a result of the board's voltage regulator(s) being unable to deliver the exact voltages, for whatever reason. The above-described voltage ranges for the RK3588J CPU OPPs remain taken from the downstream kernel code [4][5] by picking the highest, worst-bin values, which ensure that all RK3588J bins will work reliably. Yes, with some power inevitably wasted as unnecessarily generated heat, but the reliability is paramount, together with the longevity. This deficiency may be revisited separately at some point in the future. The provided RK3588J CPU OPPs follow the slightly debatable "provide only the highest-frequency OPP from the same-voltage group" approach that's been established earlier, [6] as a result of the "same-voltage, lower-frequency" OPPs being considered inefficient from the IPA governor's standpoint, which may also be revisited separately at some point in the future. [1] https://wiki.friendlyelec.com/wiki/images/e/ee/Rockchip_RK3588_Datasheet_V1.6-20231016.pdf [2] https://wmsc.lcsc.com/wmsc/upload/file/pdf/v2/lcsc/2403201054_Rockchip-RK3588J_C22364189.pdf [3] https://lore.kernel.org/linux-rockchip/e55125ed-64fb-455e-b1e4-cebe2cf006e4@cherry.de/T/#u [4] https://raw.githubusercontent.com/rockchip-linux/kernel/604cec4004abe5a96c734f2fab7b74809d2d742f/arch/arm64/boot/dts/rockchip/rk3588s.dtsi [5] https://raw.githubusercontent.com/rockchip-linux/kernel/604cec4004abe5a96c734f2fab7b74809d2d742f/arch/arm64/boot/dts/rockchip/rk3588j.dtsi [6] https://lore.kernel.org/all/20240229-rk-dts-additions-v3-5-6afe8473a631@gmail.com/ Fixes: 667885a68658 ("arm64: dts: rockchip: Add OPP data for CPU cores on RK3588j") Fixes: a7b2070505a2 ("arm64: dts: rockchip: Split GPU OPPs of RK3588 and RK3588j") Cc: stable@vger.kernel.org Cc: Heiko Stuebner Cc: Alexey Charkov Helped-by: Quentin Schulz Reviewed-by: Quentin Schulz Signed-off-by: Dragan Simic Link: https://lore.kernel.org/r/eeec0d30d79b019d111b3f0aa2456e69896b2caa.1742813866.git.dsimic@manjaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588j.dtsi | 53 ++++++++--------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi index bce72bac4503b5..3045cb3bd68c63 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi @@ -11,20 +11,15 @@ compatible = "operating-points-v2"; opp-shared; - opp-1416000000 { - opp-hz = /bits/ 64 <1416000000>; + opp-1200000000 { + opp-hz = /bits/ 64 <1200000000>; opp-microvolt = <750000 750000 950000>; clock-latency-ns = <40000>; opp-suspend; }; - opp-1608000000 { - opp-hz = /bits/ 64 <1608000000>; - opp-microvolt = <887500 887500 950000>; - clock-latency-ns = <40000>; - }; - opp-1704000000 { - opp-hz = /bits/ 64 <1704000000>; - opp-microvolt = <937500 937500 950000>; + opp-1296000000 { + opp-hz = /bits/ 64 <1296000000>; + opp-microvolt = <775000 775000 950000>; clock-latency-ns = <40000>; }; }; @@ -33,9 +28,14 @@ compatible = "operating-points-v2"; opp-shared; + opp-1200000000{ + opp-hz = /bits/ 64 <1200000000>; + opp-microvolt = <750000 750000 950000>; + clock-latency-ns = <40000>; + }; opp-1416000000 { opp-hz = /bits/ 64 <1416000000>; - opp-microvolt = <750000 750000 950000>; + opp-microvolt = <762500 762500 950000>; clock-latency-ns = <40000>; }; opp-1608000000 { @@ -43,25 +43,20 @@ opp-microvolt = <787500 787500 950000>; clock-latency-ns = <40000>; }; - opp-1800000000 { - opp-hz = /bits/ 64 <1800000000>; - opp-microvolt = <875000 875000 950000>; - clock-latency-ns = <40000>; - }; - opp-2016000000 { - opp-hz = /bits/ 64 <2016000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; cluster2_opp_table: opp-table-cluster2 { compatible = "operating-points-v2"; opp-shared; + opp-1200000000{ + opp-hz = /bits/ 64 <1200000000>; + opp-microvolt = <750000 750000 950000>; + clock-latency-ns = <40000>; + }; opp-1416000000 { opp-hz = /bits/ 64 <1416000000>; - opp-microvolt = <750000 750000 950000>; + opp-microvolt = <762500 762500 950000>; clock-latency-ns = <40000>; }; opp-1608000000 { @@ -69,16 +64,6 @@ opp-microvolt = <787500 787500 950000>; clock-latency-ns = <40000>; }; - opp-1800000000 { - opp-hz = /bits/ 64 <1800000000>; - opp-microvolt = <875000 875000 950000>; - clock-latency-ns = <40000>; - }; - opp-2016000000 { - opp-hz = /bits/ 64 <2016000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; gpu_opp_table: opp-table { @@ -104,10 +89,6 @@ opp-hz = /bits/ 64 <700000000>; opp-microvolt = <750000 750000 850000>; }; - opp-850000000 { - opp-hz = /bits/ 64 <800000000>; - opp-microvolt = <787500 787500 850000>; - }; }; }; From 3f09eb1d9ece7c41d03950e1aca27c9c60771199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 19 Mar 2025 12:31:38 +0100 Subject: [PATCH 081/353] arm64: dts: rockchip: Add pinmuxing for eMMC on QNAP TS433 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now the emmc worked when booting because the bootrom set up the pin config correctly to load the initial bootloader from it. So when the kernel started it "just" reused this setup but never made sure it was actually correct. This then breaks when the system is started via some other means, like downloading the initial bootloader via the bootrom usb download. With actual emmc pin-config added, barebox is able to access the eMMC even when booted via USB. Fixes: 9da1c0327d58 ("arm64: dts: rockchip: Add basic support for QNAP TS-433") Signed-off-by: Uwe Kleine-König [refined commit message to explain that we're currently just running on bootom-goodwill] Link: https://lore.kernel.org/r/20250319113138.125192-2-uwe@kleine-koenig.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts b/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts index 7bd32d230ad2f1..b80d628c426b71 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts @@ -619,6 +619,8 @@ bus-width = <8>; max-frequency = <200000000>; non-removable; + pinctrl-names = "default"; + pinctrl-0 = <&emmc_bus8 &emmc_clk &emmc_cmd &emmc_datastrobe>; status = "okay"; }; From 7f9efbb3de6bd697eb3428c00e78712f22a246c5 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 9 Apr 2025 15:50:46 -0500 Subject: [PATCH 082/353] arm64: dts: rockchip: Use "regulator-fixed" for btreg on px30-engicam for vcc3v3-btreg The vcc3v3-btreg regulator only has 1 state and no state gpios defined, so "regulator-gpio" is not the correct binding to use. "regulator-fixed" is the correct binding to use. It supports an enable GPIO which is needed in this case. Signed-off-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250409205047.1522943-1-robh@kernel.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi | 3 +-- arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi | 2 +- .../boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi b/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi index 1edfd643b25ae8..a334ef0629d1bb 100644 --- a/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi @@ -31,7 +31,7 @@ }; vcc3v3_btreg: vcc3v3-btreg { - compatible = "regulator-gpio"; + compatible = "regulator-fixed"; enable-active-high; pinctrl-names = "default"; pinctrl-0 = <&bt_enable_h>; @@ -39,7 +39,6 @@ regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; regulator-always-on; - states = <3300000 0x0>; }; vcc3v3_rf_aux_mod: regulator-vcc3v3-rf-aux-mod { diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi b/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi index 80db778c968483..b60e68faa83aa9 100644 --- a/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi @@ -26,5 +26,5 @@ }; &vcc3v3_btreg { - enable-gpios = <&gpio1 RK_PC3 GPIO_ACTIVE_HIGH>; + gpios = <&gpio1 RK_PC3 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts index 165d09ccb94244..5886b802c5202b 100644 --- a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts +++ b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts @@ -39,5 +39,5 @@ }; &vcc3v3_btreg { - enable-gpios = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>; + gpios = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>; }; From 518909013764c16a8e69da1e737cd77894476b71 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 9 Apr 2025 15:50:39 -0500 Subject: [PATCH 083/353] arm64: dts: rockchip: Fix mmc-pwrseq clock name on rock-pi-4 The defined name for "mmc-pwrseq-simple" clock is "ext_clock". Signed-off-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250409205040.1522754-1-robh@kernel.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi index 541dca12bf1a1f..046dbe32901786 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi @@ -43,7 +43,7 @@ sdio_pwrseq: sdio-pwrseq { compatible = "mmc-pwrseq-simple"; clocks = <&rk808 1>; - clock-names = "lpo"; + clock-names = "ext_clock"; pinctrl-names = "default"; pinctrl-0 = <&wifi_enable_h>; reset-gpios = <&gpio0 RK_PB2 GPIO_ACTIVE_LOW>; From 96ae78f97434e5f892bb66d26ee7904eb41a1ede Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 24 Apr 2025 10:47:29 +0200 Subject: [PATCH 084/353] arm64: dts: rockchip: Align wifi node name with bindings in CB2 Since commit 3c3606793f7e ("dt-bindings: wireless: bcm4329-fmac: Use wireless-controller.yaml schema"), bindings expect 'wifi' as node name: rk3566-bigtreetech-cb2-manta.dtb: sdio-wifi@1: $nodename:0: 'sdio-wifi@1' does not match '^wifi(@.*)?$' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250424084729.105182-1-krzysztof.kozlowski@linaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi index a483514717640f..e7ba477e75f9bf 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi @@ -775,7 +775,7 @@ rockchip,default-sample-phase = <90>; status = "okay"; - sdio-wifi@1 { + wifi@1 { compatible = "brcm,bcm4329-fmac"; reg = <1>; interrupt-parent = <&gpio2>; From ae73c9647195cf3a80b6958fedc78572700e6815 Mon Sep 17 00:00:00 2001 From: Tom Vincent Date: Thu, 17 Apr 2025 09:17:53 +0100 Subject: [PATCH 085/353] arm64: dts: rockchip: Assign RT5616 MCLK rate on rk3588-friendlyelec-cm3588 The Realtek RT5616 audio codec on the FriendlyElec CM3588 module fails to probe correctly due to the missing clock properties. This results in distorted analogue audio output. Assign MCLK to 12.288 MHz, which allows the codec to advertise most of the standard sample rates per other RK3588 devices. Fixes: e23819cf273c ("arm64: dts: rockchip: Add FriendlyElec CM3588 NAS board") Signed-off-by: Tom Vincent Link: https://lore.kernel.org/r/20250417081753.644950-1-linux@tlvince.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi index 1af0a30866f619..af431fdcbea7a6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi @@ -222,6 +222,10 @@ compatible = "realtek,rt5616"; reg = <0x1b>; #sound-dai-cells = <0>; + assigned-clocks = <&cru I2S0_8CH_MCLKOUT>; + assigned-clock-rates = <12288000>; + clocks = <&cru I2S0_8CH_MCLKOUT>; + clock-names = "mclk"; }; }; From 7f44a94c669be7e04fda520de1a8a6d3cbf4fb39 Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Tue, 29 Apr 2025 18:51:55 +0200 Subject: [PATCH 086/353] arm64: dts: rockchip: fix Sige5 RTC interrupt pin Someone made a typo when they added the RTC to the Sige5 DTS, which resulted in it using interrupts from GPIO0 B0 instead of GPIO0 A0. The pinctrl entry for it wasn't typoed though, curiously enough. The Sige5 v1.1 schematic was used to verify that GPIO0 A0 is the correct pin for the RTC wakeup interrupt, so let's change it to that. Fixes: 40f742b07ab2 ("arm64: dts: rockchip: Add rk3576-armsom-sige5 board") Signed-off-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/20250429-sige5-rtc-oopsie-v1-1-8686767d0f1f@collabora.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts index 828bde7fab68dc..314067ba6f3c4f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts +++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts @@ -610,7 +610,7 @@ reg = <0x51>; clock-output-names = "hym8563"; interrupt-parent = <&gpio0>; - interrupts = ; + interrupts = ; pinctrl-names = "default"; pinctrl-0 = <&hym8563_int>; wakeup-source; From 67dbede20854e88968d8f78dcc8d2b091785b554 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 20 Apr 2025 18:47:58 +0200 Subject: [PATCH 087/353] ARM: dts: amlogic: meson8: fix reference to unknown/untested PWM clock Device-tree expects absent clocks to be specified as <0> (instead of using <>). This fixes using the FCLK4/FCLK3 clocks as they are now seen at their correct index (while before they were recognized, but at the correct index - resulting in the hardware using a different clock than what the kernel sees). Fixes: 802cff460aab ("ARM: dts: amlogic: meson8: switch to the new PWM controller binding") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250420164801.330505-2-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong --- arch/arm/boot/dts/amlogic/meson8.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/amlogic/meson8.dtsi b/arch/arm/boot/dts/amlogic/meson8.dtsi index 847f7b1f1e9617..f785e0de0847b5 100644 --- a/arch/arm/boot/dts/amlogic/meson8.dtsi +++ b/arch/arm/boot/dts/amlogic/meson8.dtsi @@ -451,7 +451,7 @@ pwm_ef: pwm@86c0 { compatible = "amlogic,meson8-pwm-v2"; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; reg = <0x86c0 0x10>; @@ -705,7 +705,7 @@ &pwm_ab { compatible = "amlogic,meson8-pwm-v2"; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; @@ -713,7 +713,7 @@ &pwm_cd { compatible = "amlogic,meson8-pwm-v2"; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; From 6c7e7d88ab7272a74d94096ad67b08a66ce27d6c Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 20 Apr 2025 18:47:59 +0200 Subject: [PATCH 088/353] ARM: dts: amlogic: meson8b: fix reference to unknown/untested PWM clock Device-tree expects absent clocks to be specified as <0> (instead of using <>). This fixes using the FCLK4/FCLK3 clocks as they are now seen at their correct index (while before they were recognized, but at the correct index - resulting in the hardware using a different clock than what the kernel sees). Fixes: dbf921861985 ("ARM: dts: amlogic: meson8b: switch to the new PWM controller binding") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250420164801.330505-3-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong --- arch/arm/boot/dts/amlogic/meson8b.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/amlogic/meson8b.dtsi b/arch/arm/boot/dts/amlogic/meson8b.dtsi index 0876611ce26a8c..fdb0abe23a0c8b 100644 --- a/arch/arm/boot/dts/amlogic/meson8b.dtsi +++ b/arch/arm/boot/dts/amlogic/meson8b.dtsi @@ -406,7 +406,7 @@ compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2"; reg = <0x86c0 0x10>; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; #pwm-cells = <3>; @@ -680,7 +680,7 @@ &pwm_ab { compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2"; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; @@ -688,7 +688,7 @@ &pwm_cd { compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2"; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "Video PLL" */ + <0>, /* unknown/untested, the datasheet calls it "Video PLL" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; From 1353540b6e974fa27f9db9bcce3875ba57fca581 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 20 Apr 2025 18:48:00 +0200 Subject: [PATCH 089/353] arm64: dts: amlogic: gx: fix reference to unknown/untested PWM clock Device-tree expects absent clocks to be specified as <0> (instead of using <>). This fixes using the FCLK4/FCLK3 clocks as they are now seen at their correct index (while before they were recognized, but at the correct index - resulting in the hardware using a different clock than what the kernel sees). Fixes: a526eeef9a81 ("arm64: dts: amlogic: gx: switch to the new PWM controller binding") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250420164801.330505-4-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 6 +++--- arch/arm64/boot/dts/amlogic/meson-gxl.dtsi | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index 8ebce7114a60b7..6c134592c7bb81 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -741,7 +741,7 @@ &pwm_ab { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; @@ -752,14 +752,14 @@ &pwm_cd { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; &pwm_ef { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index 2dc2fdaecf9ff5..19b8a39de6a033 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -811,7 +811,7 @@ &pwm_ab { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; @@ -822,14 +822,14 @@ &pwm_cd { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; &pwm_ef { clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; }; From b7f02477995a18fade8960cc40c0ff55397b28b3 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 20 Apr 2025 18:48:01 +0200 Subject: [PATCH 090/353] arm64: dts: amlogic: g12: fix reference to unknown/untested PWM clock Device-tree expects absent clocks to be specified as <0> (instead of using <>). This fixes using the FCLK4/FCLK3 clocks as they are now seen at their correct index (while before they were recognized, but at the correct index - resulting in the hardware using a different clock than what the kernel sees). Fixes: e6884f2e4129 ("arm64: dts: amlogic: g12: switch to the new PWM controller binding") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250420164801.330505-5-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index ab2b3f15ef1946..69834b49673d40 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -2313,7 +2313,7 @@ "amlogic,meson8-pwm-v2"; reg = <0x0 0x19000 0x0 0x20>; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; #pwm-cells = <3>; @@ -2325,7 +2325,7 @@ "amlogic,meson8-pwm-v2"; reg = <0x0 0x1a000 0x0 0x20>; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; #pwm-cells = <3>; @@ -2337,7 +2337,7 @@ "amlogic,meson8-pwm-v2"; reg = <0x0 0x1b000 0x0 0x20>; clocks = <&xtal>, - <>, /* unknown/untested, the datasheet calls it "vid_pll" */ + <0>, /* unknown/untested, the datasheet calls it "vid_pll" */ <&clkc CLKID_FCLK_DIV4>, <&clkc CLKID_FCLK_DIV3>; #pwm-cells = <3>; From 741a830477e8530ba8bd3d77e66aa8a44a1bb975 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Sat, 3 May 2025 08:44:43 +0000 Subject: [PATCH 091/353] arm64: dts: amlogic: dreambox: fix missing clkc_audio node Add the clkc_audio node to fix audio support on Dreambox One/Two. Fixes: 83a6f4c62cb1 ("arm64: dts: meson: add initial support for Dreambox One/Two") CC: stable@vger.kernel.org Suggested-by: Emanuel Strobel Signed-off-by: Christian Hewitt Reviewed-by: Martin Blumenstingl Link: https://lore.kernel.org/r/20250503084443.3704866-1-christianshewitt@gmail.com Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi index de35fa2d7a6de3..8e3e3354ed67a9 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi @@ -116,6 +116,10 @@ status = "okay"; }; +&clkc_audio { + status = "okay"; +}; + &frddr_a { status = "okay"; }; From 9fce4392a4903c053e5ad148d8a59aed9c2ea076 Mon Sep 17 00:00:00 2001 From: Ze Huang Date: Mon, 28 Apr 2025 17:24:36 +0800 Subject: [PATCH 092/353] riscv: dts: sophgo: fix DMA data-width configuration for CV18xx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "snps,data-width" property[1] defines the AXI data width of the DMA controller as: width = 8 × (2^n) bits (0 = 8 bits, 1 = 16 bits, 2 = 32 bits, ..., 6 = 512 bits) where "n" is the value of "snps,data-width". For the CV18xx DMA controller, the correct AXI data width is 32 bits, corresponding to "snps,data-width = 2". Test results on Milkv Duo S can be found here [2]. Link: https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/dma/snps%2Cdw-axi-dmac.yaml#L74 [1] Link: https://gist.github.com/Sutter099/4fa99bb2d89e5af975983124704b3861 [2] Fixes: 514951a81a5e ("riscv: dts: sophgo: cv18xx: add DMA controller") Co-developed-by: Yu Yuan Signed-off-by: Yu Yuan Signed-off-by: Ze Huang Link: https://lore.kernel.org/r/20250428-duo-dma-config-v1-1-eb6ad836ca42@whut.edu.cn Signed-off-by: Inochi Amaoto Signed-off-by: Chen Wang Signed-off-by: Chen Wang --- arch/riscv/boot/dts/sophgo/cv18xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/sophgo/cv18xx.dtsi b/arch/riscv/boot/dts/sophgo/cv18xx.dtsi index c18822ec849f35..58cd546392e056 100644 --- a/arch/riscv/boot/dts/sophgo/cv18xx.dtsi +++ b/arch/riscv/boot/dts/sophgo/cv18xx.dtsi @@ -341,7 +341,7 @@ 1024 1024 1024 1024>; snps,priority = <0 1 2 3 4 5 6 7>; snps,dma-masters = <2>; - snps,data-width = <4>; + snps,data-width = <2>; status = "disabled"; }; From b7b960ab4b5416e77c536a2a33c7209a24c289f2 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Wed, 9 Apr 2025 18:11:21 +0900 Subject: [PATCH 093/353] mailmap: Update email for Asahi Lina Add an alias so I can more easily filter kernel-related emails. Signed-off-by: Asahi Lina Link: https://lore.kernel.org/r/20250409-mailmap-lina-email-v1-1-265d05848ae3@asahilina.net Signed-off-by: Sven Peter --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 1c70e51c789d34..9531546fcac9e0 100644 --- a/.mailmap +++ b/.mailmap @@ -102,6 +102,7 @@ Ard Biesheuvel Arnaud Patard Arnd Bergmann Arun Kumar Neelakantam +Asahi Lina Ashok Raj Nagarajan Ashwin Chaugule Asutosh Das From d8d05f1bbd5083cfaa294b205a6a26835878bca9 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 16 Apr 2025 20:06:18 +0200 Subject: [PATCH 094/353] arm64: dts: apple: touchbar: Mark ps_dispdfr_be as always-on The driver depends on boot loader initialized state which resets when the ps_dispdfr_be power-domain is powered off. This happens on suspend or when the driver is missing during boot. Mark the domain as always on until the driver can handle this. Fixes: 7275e795e520 ("arm64: dts: apple: Add touchbar screen nodes") Signed-off-by: Janne Grunau Reviewed-by: Alyssa Rosenzweig Link: https://lore.kernel.org/r/20250416-arm64_dts_apple_touchbar-v1-1-e1c0b53b9125@jannau.net Signed-off-by: Sven Peter --- arch/arm64/boot/dts/apple/t8103-j293.dts | 10 ++++++++++ arch/arm64/boot/dts/apple/t8112-j493.dts | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/arm64/boot/dts/apple/t8103-j293.dts b/arch/arm64/boot/dts/apple/t8103-j293.dts index 2dfe7b895b2bc0..e2d9439397f71a 100644 --- a/arch/arm64/boot/dts/apple/t8103-j293.dts +++ b/arch/arm64/boot/dts/apple/t8103-j293.dts @@ -77,6 +77,16 @@ }; }; +/* + * The driver depends on boot loader initialized state which resets when this + * power-domain is powered off. This happens on suspend or when the driver is + * missing during boot. Mark the domain as always on until the driver can + * handle this. + */ +&ps_dispdfr_be { + apple,always-on; +}; + &display_dfr { status = "okay"; }; diff --git a/arch/arm64/boot/dts/apple/t8112-j493.dts b/arch/arm64/boot/dts/apple/t8112-j493.dts index 3d73f9ee2f46a3..be86d34c6696cb 100644 --- a/arch/arm64/boot/dts/apple/t8112-j493.dts +++ b/arch/arm64/boot/dts/apple/t8112-j493.dts @@ -40,6 +40,16 @@ }; }; +/* + * The driver depends on boot loader initialized state which resets when this + * power-domain is powered off. This happens on suspend or when the driver is + * missing during boot. Mark the domain as always on until the driver can + * handle this. + */ +&ps_dispdfr_be { + apple,always-on; +}; + &display_dfr { status = "okay"; }; From fe07b24e80648b5224fdc6926e6b986fa4e0170f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 31 Mar 2025 21:06:42 +0200 Subject: [PATCH 095/353] MAINTAINERS: delete email for Shiraz Hashim The email address bounced. I couldn't find a newer one in recent git history (last activity 9 years ago), so delete this email entry. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250331190731.5094-2-wsa+renesas@sang-engineering.com Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d212429efa9eb3..d5b0fcf9d0853b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22923,7 +22923,6 @@ F: drivers/accessibility/speakup/ SPEAR PLATFORM/CLOCK/PINCTRL SUPPORT M: Viresh Kumar -M: Shiraz Hashim L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: soc@lists.linux.dev S: Maintained From 17230377d357d6c5c4e7484476e65a1f66c3411c Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 9 Apr 2025 16:02:54 -0500 Subject: [PATCH 096/353] arm64: dts: amazon: Fix simple-bus node name schema warnings Fix a couple of node name warnings from the schema checks: arch/arm64/boot/dts/amazon/alpine-v2-evp.dt.yaml: io-fabric: $nodename:0: 'io-fabric' does not match '^(bus|soc|axi|ahb|apb)(@[0-9a-f]+)?$' arch/arm64/boot/dts/amazon/alpine-v3-evp.dt.yaml: io-fabric: $nodename:0: 'io-fabric' does not match '^(bus|soc|axi|ahb|apb)(@[0-9a-f]+)?$' Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250409210255.1541298-1-robh@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/amazon/alpine-v2.dtsi | 2 +- arch/arm64/boot/dts/amazon/alpine-v3.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi index da9de4986660f2..5a72f0b64247d5 100644 --- a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi +++ b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi @@ -151,7 +151,7 @@ al,msi-num-spis = <160>; }; - io-fabric@fc000000 { + io-bus@fc000000 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi index 8b6156b5af659f..dea60d136c2e3d 100644 --- a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi +++ b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi @@ -361,7 +361,7 @@ interrupt-parent = <&gic>; }; - io-fabric@fc000000 { + io-bus@fc000000 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; From 8b51ef9d31cf8fa6e5ed17959fc2ea045492d169 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 22 Apr 2025 09:12:35 +0200 Subject: [PATCH 097/353] arm64: dts: imx8mp: use 800MHz NoC OPP for nominal drive mode When running in nominal drive mode, the maximum allowed frequency for the NoC is 800MHz, but the OPP table for the i.MX8MP interconnect device listed the 1GHz operating point for the NoC, regardless of the active mode. The newly introduced imx8mp-nominal.dtsi header reconfigures the clock controller to observe nominal drive mode limits, so have it modify the maximum NoC OPP as well. Fixes: 255fbd9eabe7 ("arm64: dts: imx8mp: Add optional nominal drive mode DTSI") Signed-off-by: Ahmad Fatoum Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi | 2 ++ arch/arm64/boot/dts/freescale/imx8mp.dtsi | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi index dc0ccd723c6d92..2ce1860b244d5e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi @@ -88,3 +88,5 @@ <0>, <0>, <400000000>, <1039500000>; }; + +/delete-node/ &{noc_opp_table/opp-1000000000}; diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index ce6793b2d57eef..7c1c87eab54cc6 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1645,6 +1645,12 @@ opp-hz = /bits/ 64 <200000000>; }; + /* Nominal drive mode maximum */ + opp-800000000 { + opp-hz = /bits/ 64 <800000000>; + }; + + /* Overdrive mode maximum */ opp-1000000000 { opp-hz = /bits/ 64 <1000000000>; }; From 16d8390bc4c2818ffc6d1c81a0a728e06e82ef13 Mon Sep 17 00:00:00 2001 From: Himanshu Bhavani Date: Mon, 5 May 2025 11:28:27 +0530 Subject: [PATCH 098/353] arm64: dts: imx8mp-var-som: Fix LDO5 shutdown causing SD card timeout Fix SD card timeout issue caused by LDO5 regulator getting disabled after boot. The kernel log shows LDO5 being disabled, which leads to a timeout on USDHC2: [ 33.760561] LDO5: disabling [ 81.119861] mmc1: Timeout waiting for hardware interrupt. To prevent this, set regulator-boot-on and regulator-always-on for LDO5. Also add the vqmmc regulator to properly support 1.8V/3.3V signaling for USDHC2 using a GPIO-controlled regulator. Fixes: 6c2a1f4f71258 ("arm64: dts: imx8mp-var-som-symphony: Add Variscite Symphony board and VAR-SOM-MX8MP SoM") Signed-off-by: Himanshu Bhavani Acked-by: Tarang Raval Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi index b2ac2583a59292..b59da91fdd041f 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi @@ -35,7 +35,6 @@ <0x1 0x00000000 0 0xc0000000>; }; - reg_usdhc2_vmmc: regulator-usdhc2-vmmc { compatible = "regulator-fixed"; regulator-name = "VSD_3V3"; @@ -46,6 +45,16 @@ startup-delay-us = <100>; off-on-delay-us = <12000>; }; + + reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc { + compatible = "regulator-gpio"; + regulator-name = "VSD_VSEL"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + gpios = <&gpio2 12 GPIO_ACTIVE_HIGH>; + states = <3300000 0x0 1800000 0x1>; + vin-supply = <&ldo5>; + }; }; &A53_0 { @@ -205,6 +214,7 @@ pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_gpio>; cd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>; vmmc-supply = <®_usdhc2_vmmc>; + vqmmc-supply = <®_usdhc2_vqmmc>; bus-width = <4>; status = "okay"; }; From 4fe74ea04cf5a283d4b197d2d4baddf12ea538a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 15:45:06 -0700 Subject: [PATCH 099/353] binfmt_elf: Move brk for static PIE even if ASLR disabled In commit bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec"), the brk was moved out of the mmap region when loading static PIE binaries (ET_DYN without INTERP). The common case for these binaries was testing new ELF loaders, so the brk needed to be away from mmap to avoid colliding with stack, future mmaps (of the loader-loaded binary), etc. But this was only done when ASLR was enabled, in an attempt to minimize changes to memory layouts. After adding support to respect alignment requirements for static PIE binaries in commit 3545deff0ec7 ("binfmt_elf: Honor PT_LOAD alignment for static PIE"), it became possible to have a large gap after the final PT_LOAD segment and the top of the mmap region. This means that future mmap allocations might go after the last PT_LOAD segment (where brk might be if ASLR was disabled) instead of before them (where they traditionally ended up). On arm64, running with ASLR disabled, Ubuntu 22.04's "ldconfig" binary, a static PIE, has alignment requirements that leaves a gap large enough after the last PT_LOAD segment to fit the vdso and vvar, but still leave enough space for the brk (which immediately follows the last PT_LOAD segment) to be allocated by the binary. fffff7f20000-fffff7fde000 r-xp 00000000 fe:02 8110426 /sbin/ldconfig.real fffff7fee000-fffff7ff5000 rw-p 000be000 fe:02 8110426 /sbin/ldconfig.real fffff7ff5000-fffff7ffa000 rw-p 00000000 00:00 0 ***[brk will go here at fffff7ffa000]*** fffff7ffc000-fffff7ffe000 r--p 00000000 00:00 0 [vvar] fffff7ffe000-fffff8000000 r-xp 00000000 00:00 0 [vdso] fffffffdf000-1000000000000 rw-p 00000000 00:00 0 [stack] After commit 0b3bc3354eb9 ("arm64: vdso: Switch to generic storage implementation"), the arm64 vvar grew slightly, and suddenly the brk collided with the allocation. fffff7f20000-fffff7fde000 r-xp 00000000 fe:02 8110426 /sbin/ldconfig.real fffff7fee000-fffff7ff5000 rw-p 000be000 fe:02 8110426 /sbin/ldconfig.real fffff7ff5000-fffff7ffa000 rw-p 00000000 00:00 0 ***[oops, no room any more, vvar is at fffff7ffa000!]*** fffff7ffa000-fffff7ffe000 r--p 00000000 00:00 0 [vvar] fffff7ffe000-fffff8000000 r-xp 00000000 00:00 0 [vdso] fffffffdf000-1000000000000 rw-p 00000000 00:00 0 [stack] The solution is to unconditionally move the brk out of the mmap region for static PIE binaries. Whether ASLR is enabled or not does not change if there may be future mmap allocation collisions with a growing brk region. Update memory layout comments (with kernel-doc headings), consolidate the setting of mm->brk to later (it isn't needed early), move static PIE brk out of mmap unconditionally, and make sure brk(2) knows to base brk position off of mm->start_brk not mm->end_data no matter what the cause of moving it is (via current->brk_randomized). For the CONFIG_COMPAT_BRK case, though, leave the logic unchanged, as we can never safely move the brk. These systems, however, are not using specially aligned static PIE binaries. Reported-by: Ryan Roberts Closes: https://lore.kernel.org/lkml/f93db308-4a0e-4806-9faf-98f890f5a5e6@arm.com/ Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Link: https://lore.kernel.org/r/20250425224502.work.520-kees@kernel.org Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 71 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 584fa89bc8776e..4c1ea6b52a53de 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -830,6 +830,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; + bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1097,15 +1098,19 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - /* - * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) - * and loaders (ET_DYN without PT_INTERP, since they - * _are_ the ELF interpreter). The loaders must - * be loaded away from programs since the program - * may otherwise collide with the loader (especially - * for ET_EXEC which does not have a randomized - * position). For example to handle invocations of + /** + * DOC: PIE handling + * + * There are effectively two types of ET_DYN ELF + * binaries: programs (i.e. PIE: ET_DYN with + * PT_INTERP) and loaders (i.e. static PIE: ET_DYN + * without PT_INTERP, usually the ELF interpreter + * itself). Loaders must be loaded away from programs + * since the program may otherwise collide with the + * loader (especially for ET_EXEC which does not have + * a randomized position). + * + * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so @@ -1118,6 +1123,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). + * + * See below for "brk" handling details, which is + * also affected by program vs loader and ASLR. */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ @@ -1234,8 +1242,6 @@ static int load_elf_binary(struct linux_binprm *bprm) start_data += load_bias; end_data += load_bias; - current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); - if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, @@ -1291,27 +1297,44 @@ static int load_elf_binary(struct linux_binprm *bprm) mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { + /** + * DOC: "brk" handling + * + * For architectures with ELF randomization, when executing a + * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), + * move the brk area out of the mmap region and into the unused + * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide + * early with the stack growing down or other regions being put + * into the mmap region by the kernel (e.g. vdso). + * + * In the CONFIG_COMPAT_BRK case, though, everything is turned + * off because we're not allowed to move the brk at all. + */ + if (!IS_ENABLED(CONFIG_COMPAT_BRK) && + IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + elf_ex->e_type == ET_DYN && !interpreter) { + elf_brk = ELF_ET_DYN_BASE; + /* This counts as moving the brk, so let brk(2) know. */ + brk_moved = true; + } + mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + + if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* - * For architectures with ELF randomization, when executing - * a loader directly (i.e. no interpreter listed in ELF - * headers), move the brk area out of the mmap region - * (since it grows up, and may collide early with the stack - * growing down), and into the unused ELF_ET_DYN_BASE region. + * If we didn't move the brk to ELF_ET_DYN_BASE (above), + * leave a gap between .bss and brk. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - elf_ex->e_type == ET_DYN && !interpreter) { - mm->brk = mm->start_brk = ELF_ET_DYN_BASE; - } else { - /* Otherwise leave a gap between .bss and brk. */ + if (!brk_moved) mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; - } mm->brk = mm->start_brk = arch_randomize_brk(mm); + brk_moved = true; + } + #ifdef compat_brk_randomized + if (brk_moved) current->brk_randomized = 1; #endif - } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, From 358b784672385b681982df28fc3e5bd12890a5d0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 May 2025 15:26:57 -0400 Subject: [PATCH 100/353] tracing: samples: Initialize trace_array_printk() with the correct function When using trace_array_printk() on a created instance, the correct function to use to initialize it is: trace_array_init_printk() Not trace_printk_init_buffer() The former is a proper function to use, the latter is for initializing trace_printk() and causes the NOTICE banner to be displayed. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Divya Indi Link: https://lore.kernel.org/20250509152657.0f6744d9@gandalf.local.home Fixes: 89ed42495ef4a ("tracing: Sample module to demonstrate kernel access to Ftrace instances.") Fixes: 38ce2a9e33db6 ("tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers") Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/sample-trace-array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c index dac67c3674576d..4147616102f906 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -112,7 +112,7 @@ static int __init sample_trace_array_init(void) /* * If context specific per-cpu buffers havent already been allocated. */ - trace_printk_init_buffers(); + trace_array_init_printk(tr); simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); if (IS_ERR(simple_tsk)) { From 1996b808a275fe104c15b38ff60f0cea8d4d08b8 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Mon, 12 May 2025 17:42:45 +0800 Subject: [PATCH 101/353] ftrace: Fix preemption accounting for stacktrace trigger command When using the stacktrace trigger command to trace syscalls, the preemption count was consistently reported as 1 when the system call event itself had 0 ("."). For example: root@ubuntu22-vm:/sys/kernel/tracing/events/syscalls/sys_enter_read $ echo stacktrace > trigger $ echo 1 > enable sshd-416 [002] ..... 232.864910: sys_read(fd: a, buf: 556b1f3221d0, count: 8000) sshd-416 [002] ...1. 232.864913: => ftrace_syscall_enter => syscall_trace_enter => do_syscall_64 => entry_SYSCALL_64_after_hwframe The root cause is that the trace framework disables preemption in __DO_TRACE before invoking the trigger callback. Use the tracing_gen_ctx_dec() that will accommodate for the increase of the preemption count in __DO_TRACE when calling the callback. The result is the accurate reporting of: sshd-410 [004] ..... 210.117660: sys_read(fd: 4, buf: 559b725ba130, count: 40000) sshd-410 [004] ..... 210.117662: => ftrace_syscall_enter => syscall_trace_enter => do_syscall_64 => entry_SYSCALL_64_after_hwframe Cc: stable@vger.kernel.org Fixes: ce33c845b030c ("tracing: Dump stacktrace trigger to the corresponding instance") Link: https://lore.kernel.org/20250512094246.1167956-1-dolinux.peng@gmail.com Signed-off-by: pengdonglin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b66b6d235d9130..6e87ae2a1a66bf 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } From fa71ac1574ef376798d9b2821e47fcaf114fefe0 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Mon, 12 May 2025 17:42:46 +0800 Subject: [PATCH 102/353] ftrace: Fix preemption accounting for stacktrace filter command The preemption count of the stacktrace filter command to trace ksys_read is consistently incorrect: $ echo ksys_read:stacktrace > set_ftrace_filter <...>-453 [004] ...1. 38.308956: => ksys_read => do_syscall_64 => entry_SYSCALL_64_after_hwframe The root cause is that the trace framework disables preemption when invoking the filter command callback in function_trace_probe_call: preempt_disable_notrace(); probe_ops->func(ip, parent_ip, probe_opsbe->tr, probe_ops, probe->data); preempt_enable_notrace(); Use tracing_gen_ctx_dec() to account for the preempt_disable_notrace(), which will output the correct preemption count: $ echo ksys_read:stacktrace > set_ftrace_filter <...>-410 [006] ..... 31.420396: => ksys_read => do_syscall_64 => entry_SYSCALL_64_after_hwframe Cc: stable@vger.kernel.org Fixes: 36590c50b2d07 ("tracing: Merge irqflags + preempt counter.") Link: https://lore.kernel.org/20250512094246.1167956-2-dolinux.peng@gmail.com Signed-off-by: pengdonglin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 98ccf3f00c519d..4e37a0f6aaa384 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -633,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void From ccffc6fcfe68b75acdda137f1a91f61b7a18ded7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 May 2025 11:50:32 -0400 Subject: [PATCH 103/353] ring-buffer: Fix persistent buffer when commit page is the reader page The ring buffer is made up of sub buffers (sometimes called pages as they are by default PAGE_SIZE). It has the following "pages": "tail page" - this is the page that the next write will write to "head page" - this is the page that the reader will swap the reader page with. "reader page" - This belongs to the reader, where it will swap the head page from the ring buffer so that the reader does not race with the writer. The writer may end up on the "reader page" if the ring buffer hasn't written more than one page, where the "tail page" and the "head page" are the same. The persistent ring buffer has meta data that points to where these pages exist so on reboot it can re-create the pointers to the cpu_buffer descriptor. But when the commit page is on the reader page, the logic is incorrect. The check to see if the commit page is on the reader page checked if the head page was the reader page, which would never happen, as the head page is always in the ring buffer. The correct check would be to test if the commit page is on the reader page. If that's the case, then it can exit out early as the commit page is only on the reader page when there's only one page of data in the buffer. There's no reason to iterate the ring buffer pages to find the "commit page" as it is already found. To trigger this bug: # echo 1 > /sys/kernel/tracing/instances/boot_mapped/events/syscalls/sys_enter_fchownat/enable # touch /tmp/x # chown sshd /tmp/x # reboot On boot up, the dmesg will have: Ring buffer meta [0] is from previous boot! Ring buffer meta [1] is from previous boot! Ring buffer meta [2] is from previous boot! Ring buffer meta [3] is from previous boot! Ring buffer meta [4] commit page not found Ring buffer meta [5] is from previous boot! Ring buffer meta [6] is from previous boot! Ring buffer meta [7] is from previous boot! Where the buffer on CPU 4 had a "commit page not found" error and that buffer is cleared and reset causing the output to be empty and the data lost. When it works correctly, it has: # cat /sys/kernel/tracing/instances/boot_mapped/trace_pipe <...>-1137 [004] ..... 998.205323: sys_enter_fchownat: __syscall_nr=0x104 (260) dfd=0xffffff9c (4294967196) filename=(0xffffc90000a0002c) user=0x3e8 (1000) group=0xffffffff (4294967295) flag=0x0 (0 Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250513115032.3e0b97f7@gandalf.local.home Fixes: 5f3b6e839f3ce ("ring-buffer: Validate boot range memory events") Reported-by: Tasos Sahanidis Tested-by: Tasos Sahanidis Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0f877d39a241d..3f9bf562beea23 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { From 9d212cf983acfab34d239b3b1f82c622b7cb3410 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 5 May 2025 16:03:16 +0100 Subject: [PATCH 104/353] btrfs: fix discard worker infinite loop after disabling discard If the discard worker is running and there's currently only one block group, that block group is a data block group, it's in the unused block groups discard list and is being used (it got an extent allocated from it after becoming unused), the worker can end up in an infinite loop if a transaction abort happens or the async discard is disabled (during remount or unmount for example). This happens like this: 1) Task A, the discard worker, is at peek_discard_list() and find_next_block_group() returns block group X; 2) Block group X is in the unused block groups discard list (its discard index is BTRFS_DISCARD_INDEX_UNUSED) since at some point in the past it become an unused block group and was added to that list, but then later it got an extent allocated from it, so its ->used counter is not zero anymore; 3) The current transaction is aborted by task B and we end up at __btrfs_handle_fs_error() in the transaction abort path, where we call btrfs_discard_stop(), which clears BTRFS_FS_DISCARD_RUNNING from fs_info, and then at __btrfs_handle_fs_error() we set the fs to RO mode (setting SB_RDONLY in the super block's s_flags field); 4) Task A calls __add_to_discard_list() with the goal of moving the block group from the unused block groups discard list into another discard list, but at __add_to_discard_list() we end up doing nothing because btrfs_run_discard_work() returns false, since the super block has SB_RDONLY set in its flags and BTRFS_FS_DISCARD_RUNNING is not set anymore in fs_info->flags. So block group X remains in the unused block groups discard list; 5) Task A then does a goto into the 'again' label, calls find_next_block_group() again we gets block group X again. Then it repeats the previous steps over and over since there are not other block groups in the discard lists and block group X is never moved out of the unused block groups discard list since btrfs_run_discard_work() keeps returning false and therefore __add_to_discard_list() doesn't move block group X out of that discard list. When this happens we can get a soft lockup report like this: [71.957] watchdog: BUG: soft lockup - CPU#0 stuck for 27s! [kworker/u4:3:97] [71.957] Modules linked in: xfs af_packet rfkill (...) [71.957] CPU: 0 UID: 0 PID: 97 Comm: kworker/u4:3 Tainted: G W 6.14.2-1-default #1 openSUSE Tumbleweed 968795ef2b1407352128b466fe887416c33af6fa [71.957] Tainted: [W]=WARN [71.957] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [71.957] Workqueue: btrfs_discard btrfs_discard_workfn [btrfs] [71.957] RIP: 0010:btrfs_discard_workfn+0xc4/0x400 [btrfs] [71.957] Code: c1 01 48 83 (...) [71.957] RSP: 0018:ffffafaec03efe08 EFLAGS: 00000246 [71.957] RAX: ffff897045500000 RBX: ffff8970413ed8d0 RCX: 0000000000000000 [71.957] RDX: 0000000000000001 RSI: ffff8970413ed8d0 RDI: 0000000a8f1272ad [71.957] RBP: 0000000a9d61c60e R08: ffff897045500140 R09: 8080808080808080 [71.957] R10: ffff897040276800 R11: fefefefefefefeff R12: ffff8970413ed860 [71.957] R13: ffff897045500000 R14: ffff8970413ed868 R15: 0000000000000000 [71.957] FS: 0000000000000000(0000) GS:ffff89707bc00000(0000) knlGS:0000000000000000 [71.957] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [71.957] CR2: 00005605bcc8d2f0 CR3: 000000010376a001 CR4: 0000000000770ef0 [71.957] PKRU: 55555554 [71.957] Call Trace: [71.957] [71.957] process_one_work+0x17e/0x330 [71.957] worker_thread+0x2ce/0x3f0 [71.957] ? __pfx_worker_thread+0x10/0x10 [71.957] kthread+0xef/0x220 [71.957] ? __pfx_kthread+0x10/0x10 [71.957] ret_from_fork+0x34/0x50 [71.957] ? __pfx_kthread+0x10/0x10 [71.957] ret_from_fork_asm+0x1a/0x30 [71.957] [71.957] Kernel panic - not syncing: softlockup: hung tasks [71.987] CPU: 0 UID: 0 PID: 97 Comm: kworker/u4:3 Tainted: G W L 6.14.2-1-default #1 openSUSE Tumbleweed 968795ef2b1407352128b466fe887416c33af6fa [71.989] Tainted: [W]=WARN, [L]=SOFTLOCKUP [71.989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [71.991] Workqueue: btrfs_discard btrfs_discard_workfn [btrfs] [71.992] Call Trace: [71.993] [71.994] dump_stack_lvl+0x5a/0x80 [71.994] panic+0x10b/0x2da [71.995] watchdog_timer_fn.cold+0x9a/0xa1 [71.996] ? __pfx_watchdog_timer_fn+0x10/0x10 [71.997] __hrtimer_run_queues+0x132/0x2a0 [71.997] hrtimer_interrupt+0xff/0x230 [71.998] __sysvec_apic_timer_interrupt+0x55/0x100 [71.999] sysvec_apic_timer_interrupt+0x6c/0x90 [72.000] [72.000] [72.001] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [72.002] RIP: 0010:btrfs_discard_workfn+0xc4/0x400 [btrfs] [72.002] Code: c1 01 48 83 (...) [72.005] RSP: 0018:ffffafaec03efe08 EFLAGS: 00000246 [72.006] RAX: ffff897045500000 RBX: ffff8970413ed8d0 RCX: 0000000000000000 [72.006] RDX: 0000000000000001 RSI: ffff8970413ed8d0 RDI: 0000000a8f1272ad [72.007] RBP: 0000000a9d61c60e R08: ffff897045500140 R09: 8080808080808080 [72.008] R10: ffff897040276800 R11: fefefefefefefeff R12: ffff8970413ed860 [72.009] R13: ffff897045500000 R14: ffff8970413ed868 R15: 0000000000000000 [72.010] ? btrfs_discard_workfn+0x51/0x400 [btrfs 23b01089228eb964071fb7ca156eee8cd3bf996f] [72.011] process_one_work+0x17e/0x330 [72.012] worker_thread+0x2ce/0x3f0 [72.013] ? __pfx_worker_thread+0x10/0x10 [72.014] kthread+0xef/0x220 [72.014] ? __pfx_kthread+0x10/0x10 [72.015] ret_from_fork+0x34/0x50 [72.015] ? __pfx_kthread+0x10/0x10 [72.016] ret_from_fork_asm+0x1a/0x30 [72.017] [72.017] Kernel Offset: 0x15000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [72.019] Rebooting in 90 seconds.. So fix this by making sure we move a block group out of the unused block groups discard list when calling __add_to_discard_list(). Fixes: 2bee7eb8bb81 ("btrfs: discard one region at a time in async discard") Link: https://bugzilla.suse.com/show_bug.cgi?id=1242012 CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Boris Burkov Reviewed-by: Daniel Vacek Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/discard.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index d6eef4bd9e9d45..de23c4b3515e58 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -244,6 +245,18 @@ static struct btrfs_block_group *peek_discard_list( block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. + */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); From 4cc9180dd710fb2e138b8502ae7d9f4d076a7ceb Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 7 May 2025 12:42:24 -0700 Subject: [PATCH 105/353] btrfs: fix folio leak in submit_one_async_extent() If btrfs_reserve_extent() fails while submitting an async_extent for a compressed write, then we fail to call free_async_extent_pages() on the async_extent and leak its folios. A likely cause for such a failure would be btrfs_reserve_extent() failing to find a large enough contiguous free extent for the compressed extent. I was able to reproduce this by: 1. mount with compress-force=zstd:3 2. fallocating most of a filesystem to a big file 3. fragmenting the remaining free space 4. trying to copy in a file which zstd would generate large compressed extents for (vmlinux worked well for this) Step 4. hits the memory leak and can be repeated ad nauseam to eventually exhaust the system memory. Fix this by detecting the case where we fallback to uncompressed submission for a compressed async_extent and ensuring that we call free_async_extent_pages(). Fixes: 131a821a243f ("btrfs: fallback if compressed IO fails for ENOSPC") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Co-developed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bdafe4d4c4a59c..90f5da3c520ac3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1109,6 +1109,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1129,7 +1130,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1145,6 +1149,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1186,6 +1191,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; From bdfa05a5c25605ad8221e1dc8d3a975739b2ec7d Mon Sep 17 00:00:00 2001 From: Kyoji Ogasawara Date: Fri, 9 May 2025 19:26:31 +0900 Subject: [PATCH 106/353] btrfs: add back warning for mount option commit values exceeding 300 The Btrfs documentation states that if the commit value is greater than 300 a warning should be issued. The warning was accidentally lost in the new mount API update. Fixes: 6941823cc878 ("btrfs: remove old mount API code") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Kyoji Ogasawara Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 1 + fs/btrfs/super.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bcca43046064b8..7baa2ed45198f9 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -300,6 +300,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7121d8c7a318ec..7310e2fa852621 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -569,6 +569,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; From d7caff43f16a3c3a3056867e03721f418304da43 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 7 Apr 2025 15:28:05 +0300 Subject: [PATCH 107/353] tpm: Mask TPM RC in tpm2_start_auth_session() tpm2_start_auth_session() does not mask TPM RC correctly from the callers: [ 28.766528] tpm tpm0: A TPM error (2307) occurred start auth session Process TPM RCs inside tpm2_start_auth_session(), and map them to POSIX error codes. Cc: stable@vger.kernel.org # v6.10+ Fixes: 699e3efd6c64 ("tpm: Add HMAC session start and end functions") Reported-by: Herbert Xu Closes: https://lore.kernel.org/linux-integrity/Z_NgdRHuTKP6JK--@gondor.apana.org.au/ Reviewed-by: Stefano Garzarella Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-sessions.c | 20 ++++++-------------- include/linux/tpm.h | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index 3f89635ba5e855..7b5049b3d476ef 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -40,11 +40,6 @@ * * These are the usage functions: * - * tpm2_start_auth_session() which allocates the opaque auth structure - * and gets a session from the TPM. This must be called before - * any of the following functions. The session is protected by a - * session_key which is derived from a random salt value - * encrypted to the NULL seed. * tpm2_end_auth_session() kills the session and frees the resources. * Under normal operation this function is done by * tpm_buf_check_hmac_response(), so this is only to be used on @@ -963,16 +958,13 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key) } /** - * tpm2_start_auth_session() - create a HMAC authentication session with the TPM - * @chip: the TPM chip structure to create the session with + * tpm2_start_auth_session() - Create an a HMAC authentication session + * @chip: A TPM chip * - * This function loads the NULL seed from its saved context and starts - * an authentication session on the null seed, fills in the - * @chip->auth structure to contain all the session details necessary - * for performing the HMAC, encrypt and decrypt operations and - * returns. The NULL seed is flushed before this function returns. + * Loads the ephemeral key (null seed), and starts an HMAC authenticated + * session. The null seed is flushed before the return. * - * Return: zero on success or actual error encountered. + * Returns zero on success, or a POSIX error code. */ int tpm2_start_auth_session(struct tpm_chip *chip) { @@ -1024,7 +1016,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip) /* hash algorithm for session */ tpm_buf_append_u16(&buf, TPM_ALG_SHA256); - rc = tpm_transmit_cmd(chip, &buf, 0, "start auth session"); + rc = tpm_ret_to_err(tpm_transmit_cmd(chip, &buf, 0, "StartAuthSession")); tpm2_flush_context(chip, null_key); if (rc == TPM2_RC_SUCCESS) diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 6c3125300c009a..9ac9768cc8f7f9 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -257,6 +257,7 @@ enum tpm2_return_codes { TPM2_RC_TESTING = 0x090A, /* RC_WARN */ TPM2_RC_REFERENCE_H0 = 0x0910, TPM2_RC_RETRY = 0x0922, + TPM2_RC_SESSION_MEMORY = 0x0903, }; enum tpm2_command_codes { @@ -437,6 +438,24 @@ static inline u32 tpm2_rc_value(u32 rc) return (rc & BIT(7)) ? rc & 0xbf : rc; } +/* + * Convert a return value from tpm_transmit_cmd() to POSIX error code. + */ +static inline ssize_t tpm_ret_to_err(ssize_t ret) +{ + if (ret < 0) + return ret; + + switch (tpm2_rc_value(ret)) { + case TPM2_RC_SUCCESS: + return 0; + case TPM2_RC_SESSION_MEMORY: + return -ENOMEM; + default: + return -EFAULT; + } +} + #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) extern int tpm_is_tpm2(struct tpm_chip *chip); From 80042332bc7d69dd30a97359282ef3ba6b4eaabd Mon Sep 17 00:00:00 2001 From: Purva Yeshi Date: Thu, 10 Apr 2025 16:04:42 +0530 Subject: [PATCH 108/353] char: tpm: tpm-buf: Add sanity check fallback in read helpers Fix Smatch-detected issue: drivers/char/tpm/tpm-buf.c:208 tpm_buf_read_u8() error: uninitialized symbol 'value'. drivers/char/tpm/tpm-buf.c:225 tpm_buf_read_u16() error: uninitialized symbol 'value'. drivers/char/tpm/tpm-buf.c:242 tpm_buf_read_u32() error: uninitialized symbol 'value'. Zero-initialize the return values in tpm_buf_read_u8(), tpm_buf_read_u16(), and tpm_buf_read_u32() to guard against uninitialized data in case of a boundary overflow. Add defensive initialization ensures the return values are always defined, preventing undefined behavior if the unexpected happens. Signed-off-by: Purva Yeshi Reviewed-by: Stefano Garzarella Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c index e49a19fea3bdf6..dc882fc9fa9efc 100644 --- a/drivers/char/tpm/tpm-buf.c +++ b/drivers/char/tpm/tpm-buf.c @@ -201,7 +201,7 @@ static void tpm_buf_read(struct tpm_buf *buf, off_t *offset, size_t count, void */ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset) { - u8 value; + u8 value = 0; tpm_buf_read(buf, offset, sizeof(value), &value); @@ -218,7 +218,7 @@ EXPORT_SYMBOL_GPL(tpm_buf_read_u8); */ u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset) { - u16 value; + u16 value = 0; tpm_buf_read(buf, offset, sizeof(value), &value); @@ -235,7 +235,7 @@ EXPORT_SYMBOL_GPL(tpm_buf_read_u16); */ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset) { - u32 value; + u32 value = 0; tpm_buf_read(buf, offset, sizeof(value), &value); From cdda64b47d619d60f4e4be770b10e52e6b2050ec Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 4 Apr 2025 10:23:14 +0200 Subject: [PATCH 109/353] tpm: tis: Double the timeout B to 4s With some Infineon chips the timeouts in tpm_tis_send_data (both B and C) can reach up to about 2250 ms. Timeout C is retried since commit de9e33df7762 ("tpm, tpm_tis: Workaround failed command reception on Infineon devices") Timeout B still needs to be extended. The problem is most commonly encountered with context related operation such as load context/save context. These are issued directly by the kernel, and there is no retry logic for them. When a filesystem is set up to use the TPM for unlocking the boot fails, and restarting the userspace service is ineffective. This is likely because ignoring a load context/save context result puts the real TPM state and the TPM state expected by the kernel out of sync. Chips known to be affected: tpm_tis IFX1522:00: 2.0 TPM (device-id 0x1D, rev-id 54) Description: SLB9672 Firmware Revision: 15.22 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1B, rev-id 22) Firmware Revision: 7.83 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1A, rev-id 16) Firmware Revision: 5.63 Link: https://lore.kernel.org/linux-integrity/Z5pI07m0Muapyu9w@kitsune.suse.cz/ Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.h | 2 +- include/linux/tpm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index 970d02c337c7f1..6c3aa480396b64 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -54,7 +54,7 @@ enum tis_int_flags { enum tis_defaults { TIS_MEM_LEN = 0x5000, TIS_SHORT_TIMEOUT = 750, /* ms */ - TIS_LONG_TIMEOUT = 2000, /* 2 sec */ + TIS_LONG_TIMEOUT = 4000, /* 4 secs */ TIS_TIMEOUT_MIN_ATML = 14700, /* usecs */ TIS_TIMEOUT_MAX_ATML = 15000, /* usecs */ }; diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 9ac9768cc8f7f9..a3d8305e88a51e 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -224,7 +224,7 @@ enum tpm2_const { enum tpm2_timeouts { TPM2_TIMEOUT_A = 750, - TPM2_TIMEOUT_B = 2000, + TPM2_TIMEOUT_B = 4000, TPM2_TIMEOUT_C = 200, TPM2_TIMEOUT_D = 30, TPM2_DURATION_SHORT = 20, From 2c024e0f9c7104d0a36c04dd05e52babfe0c784f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 7 Apr 2025 23:08:44 +0000 Subject: [PATCH 110/353] kbuild: Require pahole v1.29 with GENDWARFKSYMS on X86 With CONFIG_GENDWARFKSYMS, __gendwarfksyms_ptr variables are added to the kernel in EXPORT_SYMBOL() to ensure DWARF type information is available for exported symbols in the TUs where they're actually exported. These symbols are dropped when linking vmlinux, but dangling references to them remain in DWARF. With CONFIG_DEBUG_INFO_BTF enabled on X86, pahole versions after commit 47dcb534e253 ("btf_encoder: Stop indexing symbols for VARs") and before commit 9810758003ce ("btf_encoder: Verify 0 address DWARF variables are in ELF section") place these symbols in the .data..percpu section, which results in an "Invalid offset" error in btf_datasec_check_meta() during boot, as all the variables are at zero offset and have non-zero size. If CONFIG_DEBUG_INFO_BTF_MODULES is enabled, this also results in a failure to load modules with: failed to validate module [$module] BTF: -22 As the issue occurs in pahole v1.28 and the fix was merged after v1.29 was released, require pahole v1.29 when GENDWARFKSYMS is enabled with DEBUG_INFO_BTF on X86. Reported-by: Paolo Pisati Signed-off-by: Sami Tolvanen Signed-off-by: Masahiro Yamada --- kernel/module/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index d7762ef5949a2c..39278737bb68fd 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -192,6 +192,11 @@ config GENDWARFKSYMS depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT # Requires ELF object files. depends on !LTO + # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on + # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop + # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder: + # Verify 0 address DWARF variables are in ELF section"). + depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129 help Calculate symbol versions from DWARF debugging information using gendwarfksyms. Requires DEBUG_INFO to be enabled. From 07e6efdf336302becaee6864c4d76d962783fc62 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 25 Apr 2025 20:08:15 -0700 Subject: [PATCH 111/353] usr/include: openrisc: don't HDRTEST bpf_perf_event.h Since openrisc does not support PERF_EVENTS, omit the HDRTEST of bpf_perf_event.h for arch/openrisc/. Fixes a build error: usr/include/linux/bpf_perf_event.h:14:28: error: field 'regs' has incomplete type Signed-off-by: Randy Dunlap Acked-by: Stafford Horne Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/usr/include/Makefile b/usr/include/Makefile index e3d6b03527fecc..f02f41941b60c8 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -59,6 +59,10 @@ ifeq ($(SRCARCH),arc) no-header-test += linux/bpf_perf_event.h endif +ifeq ($(SRCARCH),openrisc) +no-header-test += linux/bpf_perf_event.h +endif + ifeq ($(SRCARCH),powerpc) no-header-test += linux/bpf_perf_event.h endif From 271bf6f2082881c607f52a58e6d0dca347fc8575 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Tue, 22 Apr 2025 18:54:01 +0800 Subject: [PATCH 112/353] kbuild: deb-pkg: Add libdw-dev:native to Build-Depends-Arch The dwarf.h header, which is included by scripts/gendwarfksyms/gendwarfksyms.h, resides within the libdw-dev package. This portion of the code is compiled under the condition that CONFIG_GENDWARFKSYMS is enabled. Consequently, add libdw-dev to Build-Depends-Arch to prevent unforeseen compilation failures. Fix follow possible error: In file included from scripts/gendwarfksyms/symbols.c:6: scripts/gendwarfksyms/gendwarfksyms.h:6:10: fatal error: 'dwarf.h' file not found 6 | #include | ^~~~~~~~~ Fixes: f28568841ae0 ("tools: Add gendwarfksyms") Reviewed-by: Sami Tolvanen Signed-off-by: WangYuli Reviewed-by: Nicolas Schier Tested-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/package/mkdebian | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 744ddba01d93f9..d4b007b38a4759 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -210,7 +210,7 @@ Rules-Requires-Root: no Build-Depends: debhelper-compat (= 12) Build-Depends-Arch: bc, bison, flex, gcc-${host_gnu} , - kmod, libelf-dev:native, + kmod, libdw-dev:native, libelf-dev:native, libssl-dev:native, libssl-dev , python3:native, rsync Homepage: https://www.kernel.org/ From c2d759476803a9ae1430d1af2941bde0c5ad14ed Mon Sep 17 00:00:00 2001 From: WangYuli Date: Tue, 22 Apr 2025 18:54:02 +0800 Subject: [PATCH 113/353] kbuild: rpm-pkg: Add (elfutils-devel or libdw-devel) to BuildRequires The dwarf.h header, which is included by scripts/gendwarfksyms/gendwarfksyms.h, resides within elfutils-devel or libdw-devel package. This portion of the code is compiled under the condition that CONFIG_GENDWARFKSYMS is enabled. Consequently, add (elfutils-devel or libdw-devel) to BuildRequires to prevent unforeseen compilation failures. Fix follow possible error: In file included from scripts/gendwarfksyms/cache.c:6: scripts/gendwarfksyms/gendwarfksyms.h:6:10: fatal error: 'dwarf.h' file not found 6 | #include | ^~~~~~~~~ Link: https://lore.kernel.org/all/3e52d80d-0c60-4df5-8cb5-21d4b1fce7b7@suse.com/ Fixes: f28568841ae0 ("tools: Add gendwarfksyms") Suggested-by: Petr Pavlu Signed-off-by: WangYuli Reviewed-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/package/kernel.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index 726f34e1196018..98f206cb7c6079 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -16,6 +16,7 @@ Source1: config Source2: diff.patch Provides: kernel-%{KERNELRELEASE} BuildRequires: bc binutils bison dwarves +BuildRequires: (elfutils-devel or libdw-devel) BuildRequires: (elfutils-libelf-devel or libelf-devel) flex BuildRequires: gcc make openssl openssl-devel perl python3 rsync From 23748adbac4ae74c24ba0260612a4e252458cc17 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 6 May 2025 14:02:01 -0700 Subject: [PATCH 114/353] kbuild: Disable -Wdefault-const-init-unsafe A new on by default warning in clang [1] aims to flags instances where const variables without static or thread local storage or const members in aggregate types are not initialized because it can lead to an indeterminate value. This is quite noisy for the kernel due to instances originating from header files such as: drivers/gpu/drm/i915/gt/intel_ring.h:62:2: error: default initialization of an object of type 'typeof (ring->size)' (aka 'const unsigned int') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe] 62 | typecheck(typeof(ring->size), next); | ^ include/linux/typecheck.h:10:9: note: expanded from macro 'typecheck' 10 | ({ type __dummy; \ | ^ include/net/ip.h:478:14: error: default initialization of an object of type 'typeof (rt->dst.expires)' (aka 'const unsigned long') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe] 478 | if (mtu && time_before(jiffies, rt->dst.expires)) | ^ include/linux/jiffies.h:138:26: note: expanded from macro 'time_before' 138 | #define time_before(a,b) time_after(b,a) | ^ include/linux/jiffies.h:128:3: note: expanded from macro 'time_after' 128 | (typecheck(unsigned long, a) && \ | ^ include/linux/typecheck.h:11:12: note: expanded from macro 'typecheck' 11 | typeof(x) __dummy2; \ | ^ include/linux/list.h:409:27: warning: default initialization of an object of type 'union (unnamed union at include/linux/list.h:409:27)' with const member leaves the object uninitialized [-Wdefault-const-init-field-unsafe] 409 | struct list_head *next = smp_load_acquire(&head->next); | ^ include/asm-generic/barrier.h:176:29: note: expanded from macro 'smp_load_acquire' 176 | #define smp_load_acquire(p) __smp_load_acquire(p) | ^ arch/arm64/include/asm/barrier.h:164:59: note: expanded from macro '__smp_load_acquire' 164 | union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u; \ | ^ include/linux/list.h:409:27: note: member '__val' declared 'const' here crypto/scatterwalk.c:66:22: error: default initialization of an object of type 'struct scatter_walk' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe] 66 | struct scatter_walk walk; | ^ include/crypto/algapi.h:112:15: note: member 'addr' declared 'const' here 112 | void *const addr; | ^ fs/hugetlbfs/inode.c:733:24: error: default initialization of an object of type 'struct vm_area_struct' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe] 733 | struct vm_area_struct pseudo_vma; | ^ include/linux/mm_types.h:803:20: note: member 'vm_flags' declared 'const' here 803 | const vm_flags_t vm_flags; | ^ Silencing the instances from typecheck.h is difficult because '= {}' is not available in older but supported compilers and '= {0}' would cause warnings about a literal 0 being treated as NULL. While it might be possible to come up with a local hack to silence the warning for clang-21+, it may not be worth it since -Wuninitialized will still trigger if an uninitialized const variable is actually used. In all audited cases of the "field" variant of the warning, the members are either not used in the particular call path, modified through other means such as memset() / memcpy() because the containing object is not const, or are within a union with other non-const members. Since this warning does not appear to have a high signal to noise ratio, just disable it. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/commit/576161cb6069e2c7656a8ef530727a0f4aefff30 [1] Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/CA+G9fYuNjKcxFKS_MKPRuga32XbndkLGcY-PVuoSwzv6VWbY=w@mail.gmail.com/ Reported-by: Marcus Seyfarth Closes: https://github.com/ClangBuiltLinux/linux/issues/2088 Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/Makefile.extrawarn | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index d88acdf4085524..fd649c68e198ba 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -37,6 +37,18 @@ KBUILD_CFLAGS += -Wno-gnu # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf) + +# Clang may emit a warning when a const variable, such as the dummy variables +# in typecheck(), or const member of an aggregate type are not initialized, +# which can result in unexpected behavior. However, in many audited cases of +# the "field" variant of the warning, this is intentional because the field is +# never used within a particular call path, the field is within a union with +# other non-const members, or the containing object is not const so the field +# can be modified via memcpy() / memset(). While the variable warning also gets +# disabled with this same switch, there should not be too much coverage lost +# because -Wuninitialized will still flag when an uninitialized const variable +# is used. +KBUILD_CFLAGS += $(call cc-disable-warning, default-const-init-unsafe) else # gcc inanely warns about local variables called 'main' From a4eab2c2696dee5ce04cebde968e4ce557409122 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 7 May 2025 16:49:33 +0900 Subject: [PATCH 115/353] um: let 'make clean' properly clean underlying SUBARCH as well Building the kernel with O= is affected by stale in-tree build artifacts. So, if the source tree is not clean, Kbuild displays the following: $ make ARCH=um O=build defconfig make[1]: Entering directory '/.../linux/build' *** *** The source tree is not clean, please run 'make ARCH=um mrproper' *** in /.../linux *** make[2]: *** [/.../linux/Makefile:673: outputmakefile] Error 1 make[1]: *** [/.../linux/Makefile:248: __sub-make] Error 2 make[1]: Leaving directory '/.../linux/build' make: *** [Makefile:248: __sub-make] Error 2 Usually, running 'make mrproper' is sufficient for cleaning the source tree for out-of-tree builds. However, building UML generates build artifacts not only in arch/um/, but also in the SUBARCH directory (i.e., arch/x86/). If in-tree stale files remain under arch/x86/, Kbuild will reuse them instead of creating new ones under the specified build directory. This commit makes 'make ARCH=um clean' recurse into the SUBARCH directory. Reported-by: Shuah Khan Closes: https://lore.kernel.org/lkml/20250502172459.14175-1-skhan@linuxfoundation.org/ Signed-off-by: Masahiro Yamada Acked-by: Johannes Berg Reviewed-by: David Gow Reviewed-by: Shuah Khan --- arch/um/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/Makefile b/arch/um/Makefile index 1d36a613aad83d..9ed792e565c917 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -154,5 +154,6 @@ MRPROPER_FILES += $(HOST_DIR)/include/generated archclean: @find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '*.gcov' \) -type f -print | xargs rm -f + $(Q)$(MAKE) -f $(srctree)/Makefile ARCH=$(HEADER_ARCH) clean export HEADER_ARCH SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING DEV_NULL_PATH From a6dea6fe96ec7ebbcb5f5fae0917a6c8b569b5d8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 9 May 2025 22:23:59 +0900 Subject: [PATCH 116/353] init: remove unused CONFIG_CC_CAN_LINK_STATIC This is a leftover from commit 98e20e5e13d2 ("bpfilter: remove bpfilter"). Signed-off-by: Masahiro Yamada --- init/Kconfig | 5 ----- 1 file changed, 5 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 4cdd1049283c15..bf3a920064bec1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -87,11 +87,6 @@ config CC_CAN_LINK default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag)) if 64BIT default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag)) -config CC_CAN_LINK_STATIC - bool - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) - # Fixed in GCC 14, 13.3, 12.4 and 11.5 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 config GCC_ASM_GOTO_OUTPUT_BROKEN From 2c520a5652fa402deca10b1047cc1df7bbda43fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 11 May 2025 12:55:19 +0900 Subject: [PATCH 117/353] kbuild: fix dependency on sorttable Commit ac4f06789b4f ("kbuild: Create intermediate vmlinux build with relocations preserved") missed replacing one occurrence of "vmlinux" that was added during the same development cycle. Fixes: ac4f06789b4f ("kbuild: Create intermediate vmlinux build with relocations preserved") Signed-off-by: Masahiro Yamada Acked-by: Ard Biesheuvel --- scripts/Makefile.vmlinux | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 85d60d98640169..0c4bb1274a08fb 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -94,7 +94,7 @@ $(vmlinux-final): $(RESOLVE_BTFIDS) endif ifdef CONFIG_BUILDTIME_TABLE_SORT -vmlinux: scripts/sorttable +$(vmlinux-final): scripts/sorttable endif # module.builtin.ranges From f41d6c0997690fa72a8ad5cdb9b2f8b5bcc0105d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 11 May 2025 08:02:27 +0200 Subject: [PATCH 118/353] Revert "kbuild: make all file references relative to source root" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit cacd22ce69585a91c386243cd662ada962431e63. -ffile-prefix-map breaks the ability of debuggers to find the source file corresponding to object files. As there is no simple or uniform way to specify the source directory explicitly, this breaks developers workflows. Revert the unconditional usage of -ffile-prefix-map. Reported-by: Matthieu Baerts Closes: https://lore.kernel.org/lkml/edc50aa7-0740-4942-8c15-96f12f2acc7e@kernel.org/ Reported-by: Ville Syrjälä Closes: https://lore.kernel.org/lkml/aBEttQH4kimHFScx@intel.com/ Fixes: cacd22ce6958 ("kbuild: make all file references relative to source root") Signed-off-by: Thomas Weißschuh Signed-off-by: Masahiro Yamada --- Documentation/kbuild/reproducible-builds.rst | 17 +++++++++++++++++ Makefile | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/kbuild/reproducible-builds.rst b/Documentation/kbuild/reproducible-builds.rst index a7762486c93fcd..f2dcc39044e66d 100644 --- a/Documentation/kbuild/reproducible-builds.rst +++ b/Documentation/kbuild/reproducible-builds.rst @@ -46,6 +46,21 @@ The kernel embeds the building user and host names in `KBUILD_BUILD_USER and KBUILD_BUILD_HOST`_ variables. If you are building from a git commit, you could use its committer address. +Absolute filenames +------------------ + +When the kernel is built out-of-tree, debug information may include +absolute filenames for the source files. This must be overridden by +including the ``-fdebug-prefix-map`` option in the `KCFLAGS`_ variable. + +Depending on the compiler used, the ``__FILE__`` macro may also expand +to an absolute filename in an out-of-tree build. Kbuild automatically +uses the ``-fmacro-prefix-map`` option to prevent this, if it is +supported. + +The Reproducible Builds web site has more information about these +`prefix-map options`_. + Generated files in source packages ---------------------------------- @@ -116,5 +131,7 @@ See ``scripts/setlocalversion`` for details. .. _KBUILD_BUILD_TIMESTAMP: kbuild.html#kbuild-build-timestamp .. _KBUILD_BUILD_USER and KBUILD_BUILD_HOST: kbuild.html#kbuild-build-user-kbuild-build-host +.. _KCFLAGS: kbuild.html#kcflags +.. _prefix-map options: https://reproducible-builds.org/docs/build-path/ .. _Reproducible Builds project: https://reproducible-builds.org/ .. _SOURCE_DATE_EPOCH: https://reproducible-builds.org/docs/source-date-epoch/ diff --git a/Makefile b/Makefile index 64c514f4bc1939..52f84aaf102525 100644 --- a/Makefile +++ b/Makefile @@ -1068,7 +1068,7 @@ KBUILD_CFLAGS += -fno-builtin-wcslen # change __FILE__ to the relative path to the source directory ifdef building_out_of_srctree -KBUILD_CPPFLAGS += $(call cc-option,-ffile-prefix-map=$(srcroot)/=) +KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=) KBUILD_RUSTFLAGS += --remap-path-prefix=$(srcroot)/= endif From 7c879d426a2c39b0fdbb526ab042a2204545ae12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 11 May 2025 08:02:28 +0200 Subject: [PATCH 119/353] Revert "kbuild, rust: use -fremap-path-prefix to make paths relative" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit dbdffaf50ff9cee3259a7cef8a7bd9e0f0ba9f13. --remap-path-prefix breaks the ability of debuggers to find the source file corresponding to object files. As there is no simple or uniform way to specify the source directory explicitly, this breaks developers workflows. Revert the unconditional usage of --remap-path-prefix, equivalent to the same change for -ffile-prefix-map in KBUILD_CPPFLAGS. Fixes: dbdffaf50ff9 ("kbuild, rust: use -fremap-path-prefix to make paths relative") Signed-off-by: Thomas Weißschuh Acked-by: Miguel Ojeda Signed-off-by: Masahiro Yamada --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 52f84aaf102525..0eb9e6c49b32aa 100644 --- a/Makefile +++ b/Makefile @@ -1069,7 +1069,6 @@ KBUILD_CFLAGS += -fno-builtin-wcslen # change __FILE__ to the relative path to the source directory ifdef building_out_of_srctree KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=) -KBUILD_RUSTFLAGS += --remap-path-prefix=$(srcroot)/= endif # include additional Makefiles when needed From 8b00744cf71d96c50ad075aed50ceefcec8554fb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 12 May 2025 14:36:58 +0900 Subject: [PATCH 120/353] kbuild: fix typos "module.builtin" to "modules.builtin" The filenames in the comments do not match the actual generated files. Signed-off-by: Masahiro Yamada --- scripts/Makefile.vmlinux | 2 +- scripts/Makefile.vmlinux_o | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 0c4bb1274a08fb..b64862dc6f08d4 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -97,7 +97,7 @@ ifdef CONFIG_BUILDTIME_TABLE_SORT $(vmlinux-final): scripts/sorttable endif -# module.builtin.ranges +# modules.builtin.ranges # --------------------------------------------------------------------------- ifdef CONFIG_BUILTIN_MODULE_RANGES __default: modules.builtin.ranges diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 938c7457717ea0..b024ffb3e20187 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -73,7 +73,7 @@ vmlinux.o: $(initcalls-lds) vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE targets += vmlinux.o -# module.builtin.modinfo +# modules.builtin.modinfo # --------------------------------------------------------------------------- OBJCOPYFLAGS_modules.builtin.modinfo := -j .modinfo -O binary @@ -82,7 +82,7 @@ targets += modules.builtin.modinfo modules.builtin.modinfo: vmlinux.o FORCE $(call if_changed,objcopy) -# module.builtin +# modules.builtin # --------------------------------------------------------------------------- # The second line aids cases where multiple modules share the same object. From 15f23871eac3a943680fbac27f2e81acf04cdfa7 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 29 Apr 2025 15:05:59 -0400 Subject: [PATCH 121/353] Bluetooth: MGMT: Fix MGMT_OP_ADD_DEVICE invalid device flags Device flags could be updated in the meantime while MGMT_OP_ADD_DEVICE is pending on hci_update_passive_scan_sync so instead of setting the current_flags as cmd->user_data just do a lookup using hci_conn_params_lookup and use the latest stored flags. Fixes: a182d9c84f9c ("Bluetooth: MGMT: Fix Add Device to responding before completing") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c1e1e529e26cc2..46b22708dfbd2d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7506,11 +7506,16 @@ static void add_device_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_cp_add_device *cp = cmd->param; if (!err) { + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, hdev->conn_flags, - PTR_UINT(cmd->user_data)); + params ? params->flags : 0); } mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, @@ -7613,8 +7618,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd->user_data = UINT_PTR(current_flags); - err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, add_device_complete); if (err < 0) { From bbe36846f7a023c56e8e2a300e99eaec0a2e79c8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 30 Apr 2025 15:07:03 -0400 Subject: [PATCH 122/353] Bluetooth: hci_event: Fix not using key encryption size when its known This fixes the regression introduced by 50c1241e6a8a ("Bluetooth: l2cap: Check encryption key size on incoming connection") introduced a check for l2cap_check_enc_key_size which checks for hcon->enc_key_size which may not be initialized if HCI_OP_READ_ENC_KEY_SIZE is still pending. If the key encryption size is known, due previously reading it using HCI_OP_READ_ENC_KEY_SIZE, then store it as part of link_key/smp_ltk structures so the next time the encryption is changed their values are used as conn->enc_key_size thus avoiding the racing against HCI_OP_READ_ENC_KEY_SIZE. Now that the enc_size is stored as part of key the information the code then attempts to check that there is no downgrade of security if HCI_OP_READ_ENC_KEY_SIZE returns a value smaller than what has been previously stored. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220061 Link: https://bugzilla.kernel.org/show_bug.cgi?id=220063 Fixes: 522e9ed157e3 ("Bluetooth: l2cap: Check encryption key size on incoming connection") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 24 +++++++++++ net/bluetooth/hci_event.c | 73 ++++++++++++++++++-------------- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 522d837a23fa46..54bfeeaa09959b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1798,6 +1798,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); +u8 *hci_conn_key_enc_size(struct hci_conn *conn); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6533e281ada333..946d2ae551f86c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -3023,3 +3023,27 @@ void hci_conn_tx_dequeue(struct hci_conn *conn) kfree_skb(skb); } + +u8 *hci_conn_key_enc_size(struct hci_conn *conn) +{ + if (conn->type == ACL_LINK) { + struct link_key *key; + + key = hci_find_link_key(conn->hdev, &conn->dst); + if (!key) + return NULL; + + return &key->pin_len; + } else if (conn->type == LE_LINK) { + struct smp_ltk *ltk; + + ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type, + conn->role); + if (!ltk) + return NULL; + + return <k->enc_size; + } + + return NULL; +} diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6d6061111ac567..c38ada69c3d7f2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -739,10 +739,17 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data, handle); conn->enc_key_size = 0; } else { + u8 *key_enc_size = hci_conn_key_enc_size(conn); + conn->enc_key_size = rp->key_size; status = 0; - if (conn->enc_key_size < hdev->min_enc_key_size) { + /* Attempt to check if the key size is too small or if it has + * been downgraded from the last time it was stored as part of + * the link_key. + */ + if (conn->enc_key_size < hdev->min_enc_key_size || + (key_enc_size && conn->enc_key_size < *key_enc_size)) { /* As slave role, the conn->state has been set to * BT_CONNECTED and l2cap conn req might not be received * yet, at this moment the l2cap layer almost does @@ -755,6 +762,10 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data, clear_bit(HCI_CONN_ENCRYPT, &conn->flags); clear_bit(HCI_CONN_AES_CCM, &conn->flags); } + + /* Update the key encryption size with the connection one */ + if (key_enc_size && *key_enc_size != conn->enc_key_size) + *key_enc_size = conn->enc_key_size; } hci_encrypt_cfm(conn, status); @@ -3065,6 +3076,34 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata, hci_dev_unlock(hdev); } +static int hci_read_enc_key_size(struct hci_dev *hdev, struct hci_conn *conn) +{ + struct hci_cp_read_enc_key_size cp; + u8 *key_enc_size = hci_conn_key_enc_size(conn); + + if (!read_key_size_capable(hdev)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + return -EOPNOTSUPP; + } + + bt_dev_dbg(hdev, "hcon %p", conn); + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + + /* If the key enc_size is already known, use it as conn->enc_key_size, + * otherwise use hdev->min_enc_key_size so the likes of + * l2cap_check_enc_key_size don't fail while waiting for + * HCI_OP_READ_ENC_KEY_SIZE response. + */ + if (key_enc_size && *key_enc_size) + conn->enc_key_size = *key_enc_size; + else + conn->enc_key_size = hdev->min_enc_key_size; + + return hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); +} + static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3157,23 +3196,11 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && ev->link_type == ACL_LINK) { struct link_key *key; - struct hci_cp_read_enc_key_size cp; key = hci_find_link_key(hdev, &ev->bdaddr); if (key) { set_bit(HCI_CONN_ENCRYPT, &conn->flags); - - if (!read_key_size_capable(hdev)) { - conn->enc_key_size = HCI_LINK_KEY_SIZE; - } else { - cp.handle = cpu_to_le16(conn->handle); - if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, - sizeof(cp), &cp)) { - bt_dev_err(hdev, "sending read key size failed"); - conn->enc_key_size = HCI_LINK_KEY_SIZE; - } - } - + hci_read_enc_key_size(hdev, conn); hci_encrypt_cfm(conn, ev->status); } } @@ -3612,24 +3639,8 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { - struct hci_cp_read_enc_key_size cp; - - /* Only send HCI_Read_Encryption_Key_Size if the - * controller really supports it. If it doesn't, assume - * the default size (16). - */ - if (!read_key_size_capable(hdev)) { - conn->enc_key_size = HCI_LINK_KEY_SIZE; - goto notify; - } - - cp.handle = cpu_to_le16(conn->handle); - if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, - sizeof(cp), &cp)) { - bt_dev_err(hdev, "sending read key size failed"); - conn->enc_key_size = HCI_LINK_KEY_SIZE; + if (hci_read_enc_key_size(hdev, conn)) goto notify; - } goto unlock; } From 8f6385267c3f609ad884ec4e4f2b50e6046b4024 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 7 May 2025 21:47:45 +0100 Subject: [PATCH 123/353] net: qede: Initialize qede_ll_ops with designated initializer After a recent change [1] in clang's randstruct implementation to randomize structures that only contain function pointers, there is an error because qede_ll_ops get randomized but does not use a designated initializer for the first member: drivers/net/ethernet/qlogic/qede/qede_main.c:206:2: error: a randomized struct can only be initialized with a designated initializer 206 | { | ^ Explicitly initialize the common member using a designated initializer to fix the build. Cc: stable@vger.kernel.org Fixes: 035f7f87b729 ("randstruct: Enable Clang support") Link: https://github.com/llvm/llvm-project/commit/04364fb888eea6db9811510607bed4b200bcb082 [1] Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20250507-qede-fix-clang-randstruct-v1-1-5ccc15626fba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 99df00c30b8c6c..b5d744d2586f72 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -203,7 +203,7 @@ static struct pci_driver qede_pci_driver = { }; static struct qed_eth_cb_ops qede_ll_ops = { - { + .common = { #ifdef CONFIG_RFS_ACCEL .arfs_filter_op = qede_arfs_filter_op, #endif From 61066ec45e8c49fca7b4276b64b6c210a313334f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 May 2025 21:35:58 -0700 Subject: [PATCH 124/353] net_sched: Flush gso_skb list too during ->change() Previously, when reducing a qdisc's limit via the ->change() operation, only the main skb queue was trimmed, potentially leaving packets in the gso_skb list. This could result in NULL pointer dereference when we only check sch->limit against sch->q.qlen. This patch introduces a new helper, qdisc_dequeue_internal(), which ensures both the gso_skb list and the main queue are properly flushed when trimming excess packets. All relevant qdiscs (codel, fq, fq_codel, fq_pie, hhf, pie) are updated to use this helper in their ->change() routines. Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler") Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Fixes: d4b36210c2e6 ("net: pkt_sched: PIE AQM scheme") Reported-by: Will Reported-by: Savy Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 15 +++++++++++++++ net/sched/sch_codel.c | 2 +- net/sched/sch_fq.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_fq_pie.c | 2 +- net/sched/sch_hhf.c | 2 +- net/sched/sch_pie.c | 2 +- 7 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd01..1c05fed05f2bc2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1031,6 +1031,21 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + return skb; + } + if (direct) + return __qdisc_dequeue_head(&sch->q); + else + return sch->dequeue(sch); +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 12dd71139da396..c93761040c6e77 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -144,7 +144,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 2ca5332cfcc5c5..902ff54706072b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1136,7 +1136,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); } while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6c9029f71e88d3..2a0f3a513bfaa1 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -441,7 +441,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { - struct sk_buff *skb = fq_codel_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index f3b8203d3e855b..df7fac95ab1513 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -366,7 +366,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 44d9efe1a96a89..5aa434b4670738 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -564,7 +564,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = hhf_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); rtnl_kfree_skbs(skb, skb); } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 3771d000b30d08..ff49a6c97033aa 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -195,7 +195,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); From 862cba8604664011a70e3867be0693e948f8e263 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 May 2025 21:35:59 -0700 Subject: [PATCH 125/353] selftests/tc-testing: Add qdisc limit trimming tests Added new test cases for FQ, FQ_CODEL, FQ_PIE, and HHF qdiscs to verify queue trimming behavior when the qdisc limit is dynamically reduced. Each test injects packets, reduces the qdisc limit, and checks that the new limit is enforced. This is still best effort since timing qdisc backlog is not easy. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/qdiscs/codel.json | 24 +++++++++++++++++++ .../tc-testing/tc-tests/qdiscs/fq.json | 22 +++++++++++++++++ .../tc-testing/tc-tests/qdiscs/fq_codel.json | 22 +++++++++++++++++ .../tc-testing/tc-tests/qdiscs/fq_pie.json | 22 +++++++++++++++++ .../tc-testing/tc-tests/qdiscs/hhf.json | 22 +++++++++++++++++ .../tc-testing/tc-tests/qdiscs/pie.json | 24 +++++++++++++++++++ 6 files changed, 136 insertions(+) create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json index e9469ee71e6fc7..6d515d0e5ed696 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json @@ -189,5 +189,29 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "deb1", + "name": "CODEL test qdisc limit trimming", + "category": ["qdisc", "codel"], + "plugins": { + "requires": ["nsPlugin", "scapyPlugin"] + }, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root codel limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root codel limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc codel 1: root refcnt [0-9]+ limit 1p target 5ms interval 100ms", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json index 3a537b2ec4c972..24faf4e12dfa05 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json @@ -377,5 +377,27 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "9479", + "name": "FQ test qdisc limit trimming", + "category": ["qdisc", "fq"], + "plugins": {"requires": ["nsPlugin", "scapyPlugin"]}, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root fq limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc fq 1: root refcnt [0-9]+ limit 1p", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json index 9774b1e8801bba..4ce62b857fd7ab 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json @@ -294,5 +294,27 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "0436", + "name": "FQ_CODEL test qdisc limit trimming", + "category": ["qdisc", "fq_codel"], + "plugins": {"requires": ["nsPlugin", "scapyPlugin"]}, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root fq_codel limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq_codel limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc fq_codel 1: root refcnt [0-9]+ limit 1p flows 1024 quantum.*target 5ms interval 100ms memory_limit 32Mb ecn drop_batch 64", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json index d012d88d67fee6..229fe1bf4a9062 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json @@ -18,5 +18,27 @@ "matchCount": "1", "teardown": [ ] + }, + { + "id": "83bf", + "name": "FQ_PIE test qdisc limit trimming", + "category": ["qdisc", "fq_pie"], + "plugins": {"requires": ["nsPlugin", "scapyPlugin"]}, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root fq_pie limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq_pie limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc fq_pie 1: root refcnt [0-9]+ limit 1p", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json index dbef5474b26bdc..0ca19fac54a57d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json @@ -188,5 +188,27 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "385f", + "name": "HHF test qdisc limit trimming", + "category": ["qdisc", "hhf"], + "plugins": {"requires": ["nsPlugin", "scapyPlugin"]}, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root hhf limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root hhf limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc hhf 1: root refcnt [0-9]+ limit 1p.*hh_limit 2048 reset_timeout 40ms admit_bytes 128Kb evict_timeout 1s non_hh_weight 2", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json new file mode 100644 index 00000000000000..1a98b66e803071 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json @@ -0,0 +1,24 @@ +[ + { + "id": "6158", + "name": "PIE test qdisc limit trimming", + "category": ["qdisc", "pie"], + "plugins": {"requires": ["nsPlugin", "scapyPlugin"]}, + "setup": [ + "$TC qdisc add dev $DEV1 handle 1: root pie limit 10" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 10, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)" + } + ], + "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root pie limit 1", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DEV1", + "matchPattern": "qdisc pie 1: root refcnt [0-9]+ limit 1p", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"] + } +] From 2a792e38c81debacdf2bc5abbbf6190062bc3d1d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 8 May 2025 03:54:14 +0000 Subject: [PATCH 126/353] tools/net/ynl: ethtool: fix crash when Hardware Clock info is missing Fix a crash in the ethtool YNL implementation when Hardware Clock information is not present in the response. This ensures graceful handling of devices or drivers that do not provide this optional field. e.g. Traceback (most recent call last): File "/net/tools/net/ynl/pyynl/./ethtool.py", line 438, in main() ~~~~^^ File "/net/tools/net/ynl/pyynl/./ethtool.py", line 341, in main print(f'PTP Hardware Clock: {tsinfo["phc-index"]}') ~~~~~~^^^^^^^^^^^^^ KeyError: 'phc-index' Fixes: f3d07b02b2b8 ("tools: ynl: ethtool testing tool") Signed-off-by: Hangbin Liu Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508035414.82974-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ethtool.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py index af7fddd7b085bf..cab6b576c8762e 100755 --- a/tools/net/ynl/pyynl/ethtool.py +++ b/tools/net/ynl/pyynl/ethtool.py @@ -338,16 +338,24 @@ def main(): print('Capabilities:') [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])] - print(f'PTP Hardware Clock: {tsinfo["phc-index"]}') + print(f'PTP Hardware Clock: {tsinfo.get("phc-index", "none")}') - print('Hardware Transmit Timestamp Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] + if 'tx-types' in tsinfo: + print('Hardware Transmit Timestamp Modes:') + [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] + else: + print('Hardware Transmit Timestamp Modes: none') + + if 'rx-filters' in tsinfo: + print('Hardware Receive Filter Modes:') + [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + else: + print('Hardware Receive Filter Modes: none') - print('Hardware Receive Filter Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + if 'stats' in tsinfo and tsinfo['stats']: + print('Statistics:') + [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] - print('Statistics:') - [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return print(f'Settings for {args.device}:') From 0fb820985fae57af271381bf82cdfbabd2bb67ac Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 8 May 2025 13:18:32 +0800 Subject: [PATCH 127/353] net: mctp: Don't access ifa_index when missing In mctp_dump_addrinfo, ifa_index can be used to filter interfaces, but only when the struct ifaddrmsg is provided. Otherwise it will be comparing to uninitialised memory - reproducible in the syzkaller case from dhcpd, or busybox "ip addr show". The kernel MCTP implementation has always filtered by ifa_index, so existing userspace programs expecting to dump MCTP addresses must already be passing a valid ifa_index value (either 0 or a real index). BUG: KMSAN: uninit-value in mctp_dump_addrinfo+0x208/0xac0 net/mctp/device.c:128 mctp_dump_addrinfo+0x208/0xac0 net/mctp/device.c:128 rtnl_dump_all+0x3ec/0x5b0 net/core/rtnetlink.c:4380 rtnl_dumpit+0xd5/0x2f0 net/core/rtnetlink.c:6824 netlink_dump+0x97b/0x1690 net/netlink/af_netlink.c:2309 Fixes: 583be982d934 ("mctp: Add device handling and netlink interface") Reported-by: syzbot+e76d52dadc089b9d197f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68135815.050a0220.3a872c.000e.GAE@google.com/ Reported-by: syzbot+1065a199625a388fce60@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/681357d6.050a0220.14dd7d.000d.GAE@google.com/ Signed-off-by: Matt Johnston Link: https://patch.msgid.link/20250508-mctp-addr-dump-v2-1-c8a53fd2dd66@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/device.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/mctp/device.c b/net/mctp/device.c index 8e0724c56723de..7c0dcf3df31962 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -117,11 +117,18 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct ifaddrmsg *hdr; struct mctp_dev *mdev; - int ifindex, rc; - - hdr = nlmsg_data(cb->nlh); - // filter by ifindex if requested - ifindex = hdr->ifa_index; + int ifindex = 0, rc; + + /* Filter by ifindex if a header is provided */ + if (cb->nlh->nlmsg_len >= nlmsg_msg_size(sizeof(*hdr))) { + hdr = nlmsg_data(cb->nlh); + ifindex = hdr->ifa_index; + } else { + if (cb->strict_check) { + NL_SET_ERR_MSG(cb->extack, "mctp: Invalid header for addr dump request"); + return -EINVAL; + } + } rcu_read_lock(); for_each_netdev_dump(net, dev, mcb->ifindex) { From 361f86d374e0f09f50a2c3851956dd44f99688de Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 8 May 2025 11:44:34 +0300 Subject: [PATCH 128/353] tests/ncdevmem: Fix double-free of queue array netdev_bind_rx takes ownership of the queue array passed as parameter and frees it, so a queue array buffer cannot be reused across multiple netdev_bind_rx calls. This commit fixes that by always passing in a newly created queue array to all netdev_bind_rx calls in ncdevmem. Fixes: 85585b4bc8d8 ("selftests: add ncdevmem, netcat for devmem TCP") Signed-off-by: Cosmin Ratiu Acked-by: Stanislav Fomichev Reviewed-by: Joe Damato Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250508084434.1933069-1-cratiu@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/ncdevmem.c | 55 ++++++++----------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 2bf14ac2b8c624..9d48004ff1a178 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -431,6 +431,22 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) return 0; } +static struct netdev_queue_id *create_queues(void) +{ + struct netdev_queue_id *queues; + size_t i = 0; + + queues = calloc(num_queues, sizeof(*queues)); + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + return queues; +} + int do_server(struct memory_buffer *mem) { char ctrl_data[sizeof(int) * 20000]; @@ -448,7 +464,6 @@ int do_server(struct memory_buffer *mem) char buffer[256]; int socket_fd; int client_fd; - size_t i = 0; int ret; ret = parse_address(server_ip, atoi(port), &server_sin); @@ -471,16 +486,7 @@ int do_server(struct memory_buffer *mem) sleep(1); - queues = malloc(sizeof(*queues) * num_queues); - - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - - if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys)) error(1, 0, "Failed to bind\n"); tmp_mem = malloc(mem->size); @@ -545,7 +551,6 @@ int do_server(struct memory_buffer *mem) goto cleanup; } - i++; for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { if (cm->cmsg_level != SOL_SOCKET || (cm->cmsg_type != SCM_DEVMEM_DMABUF && @@ -630,10 +635,8 @@ int do_server(struct memory_buffer *mem) void run_devmem_tests(void) { - struct netdev_queue_id *queues; struct memory_buffer *mem; struct ynl_sock *ys; - size_t i = 0; mem = provider->alloc(getpagesize() * NUM_PAGES); @@ -641,38 +644,24 @@ void run_devmem_tests(void) if (configure_rss()) error(1, 0, "rss error\n"); - queues = calloc(num_queues, sizeof(*queues)); - if (configure_headersplit(1)) error(1, 0, "Failed to configure header split\n"); - if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + if (!bind_rx_queue(ifindex, mem->fd, + calloc(num_queues, sizeof(struct netdev_queue_id)), + num_queues, &ys)) error(1, 0, "Binding empty queues array should have failed\n"); - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - if (configure_headersplit(0)) error(1, 0, "Failed to configure header split\n"); - if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + if (!bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys)) error(1, 0, "Configure dmabuf with header split off should have failed\n"); if (configure_headersplit(1)) error(1, 0, "Failed to configure header split\n"); - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - - if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys)) error(1, 0, "Failed to bind\n"); /* Deactivating a bound queue should not be legal */ From 7bfa3f2a5d28f287b7aa6196459770b3afdb1721 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Thu, 8 May 2025 14:16:00 +0930 Subject: [PATCH 129/353] net: mctp: Ensure keys maintain only one ref to corresponding dev mctp_flow_prepare_output() is called in mctp_route_output(), which places outbound packets onto a given interface. The packet may represent a message fragment, in which case we provoke an unbalanced reference count to the underlying device. This causes trouble if we ever attempt to remove the interface: [ 48.702195] usb 1-1: USB disconnect, device number 2 [ 58.883056] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 [ 69.022548] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 [ 79.172568] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 ... Predicate the invocation of mctp_dev_set_key() in mctp_flow_prepare_output() on not already having associated the device with the key. It's not yet realistic to uphold the property that the key maintains only one device reference earlier in the transmission sequence as the route (and therefore the device) may not be known at the time the key is associated with the socket. Fixes: 67737c457281 ("mctp: Pass flow data & flow release events to drivers") Acked-by: Jeremy Kerr Signed-off-by: Andrew Jeffery Link: https://patch.msgid.link/20250508-mctp-dev-refcount-v1-1-d4f965c67bb5@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index 4c460160914f01..d9c8e5a5f9ce9a 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -313,8 +313,10 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (WARN_ON(key->dev && key->dev != dev)) + if (key->dev) { + WARN_ON(key->dev != dev); return; + } mctp_dev_set_key(dev, key); } From 550c306c50eb45a9406ebc8f6a3df7acca994f6d Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 16 Apr 2025 20:37:56 +0200 Subject: [PATCH 130/353] batman-adv: fix duplicate MAC address check batadv_check_known_mac_addr() is both too lenient and too strict: - It is called from batadv_hardif_add_interface(), which means that it checked interfaces that are not used for batman-adv at all. Move it to batadv_hardif_enable_interface(). Also, restrict it to hardifs of the same mesh interface; different mesh interfaces should not interact at all. The batadv_check_known_mac_addr() argument is changed from `struct net_device` to `struct batadv_hard_iface` to achieve this. - The check only cares about hardifs in BATADV_IF_ACTIVE and BATADV_IF_TO_BE_ACTIVATED states, but interfaces in BATADV_IF_INACTIVE state should be checked as well, or the following steps will not result in a warning then they should: - Add two interfaces in down state with different MAC addresses to a mesh as hardifs - Change the MAC addresses so they conflict - Set interfaces to up state Now there will be two active hardifs with the same MAC address, but no warning. Fix by only ignoring hardifs in BATADV_IF_NOT_IN_USE state. The RCU lock can be dropped, as we're holding RTNL anyways when the function is called. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Matthias Schiffer Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 7cd4bdcee43935..558d39dffc2330 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -506,28 +506,32 @@ batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) return false; } -static void batadv_check_known_mac_addr(const struct net_device *net_dev) +static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface) { - const struct batadv_hard_iface *hard_iface; + const struct net_device *mesh_iface = hard_iface->mesh_iface; + const struct batadv_hard_iface *tmp_hard_iface; - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->if_status != BATADV_IF_ACTIVE && - hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) + if (!mesh_iface) + return; + + list_for_each_entry(tmp_hard_iface, &batadv_hardif_list, list) { + if (tmp_hard_iface == hard_iface) + continue; + + if (tmp_hard_iface->mesh_iface != mesh_iface) continue; - if (hard_iface->net_dev == net_dev) + if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; - if (!batadv_compare_eth(hard_iface->net_dev->dev_addr, - net_dev->dev_addr)) + if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr, + hard_iface->net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", - net_dev->dev_addr, hard_iface->net_dev->name); + hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } - rcu_read_unlock(); } /** @@ -763,6 +767,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->net_dev->name, hardif_mtu, required_mtu); + batadv_check_known_mac_addr(hard_iface); + if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); else @@ -901,7 +907,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_v_hardif_init(hard_iface); - batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); batadv_hardif_generation++; @@ -988,7 +993,7 @@ static int batadv_hard_if_event(struct notifier_block *this, if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) goto hardif_put; - batadv_check_known_mac_addr(hard_iface->net_dev); + batadv_check_known_mac_addr(hard_iface); bat_priv = netdev_priv(hard_iface->mesh_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); From 12a4549808b647f91784a565e641cbdddba1cc33 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Fri, 9 May 2025 14:19:35 +0200 Subject: [PATCH 131/353] net: cadence: macb: Fix a possible deadlock in macb_halt_tx. There is a situation where after THALT is set high, TGO stays high as well. Because jiffies are never updated, as we are in a context with interrupts disabled, we never exit that loop and have a deadlock. That deadlock was noticed on a sama5d4 device that stayed locked for days. Use retries instead of jiffies so that the timeout really works and we do not have a deadlock anymore. Fixes: e86cd53afc590 ("net/macb: better manage tx errors") Signed-off-by: Mathieu Othacehe Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250509121935.16282-1-othacehe@gnu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 1fe8ec37491b19..e1e8bd2ec155b8 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -997,22 +997,15 @@ static void macb_update_stats(struct macb *bp) static int macb_halt_tx(struct macb *bp) { - unsigned long halt_time, timeout; - u32 status; + u32 status; macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(THALT)); - timeout = jiffies + usecs_to_jiffies(MACB_HALT_TIMEOUT); - do { - halt_time = jiffies; - status = macb_readl(bp, TSR); - if (!(status & MACB_BIT(TGO))) - return 0; - - udelay(250); - } while (time_before(halt_time, timeout)); - - return -ETIMEDOUT; + /* Poll TSR until TGO is cleared or timeout. */ + return read_poll_timeout_atomic(macb_readl, status, + !(status & MACB_BIT(TGO)), + 250, MACB_HALT_TIMEOUT, false, + bp, TSR); } static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb, int budget) From cc102e91e7071182721d183b28b0ed10d3cd540d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 9 May 2025 10:28:50 +0300 Subject: [PATCH 132/353] net: Lock lower level devices when updating features __netdev_update_features() expects the netdevice to be ops-locked, but it gets called recursively on the lower level netdevices to sync their features, and nothing locks those. This commit fixes that, with the assumption that it shouldn't be possible for both higher-level and lover-level netdevices to require the instance lock, because that would lead to lock dependency warnings. Without this, playing with higher level (e.g. vxlan) netdevices on top of netdevices with instance locking enabled can run into issues: WARNING: CPU: 59 PID: 206496 at ./include/net/netdev_lock.h:17 netif_napi_add_weight_locked+0x753/0xa60 [...] Call Trace: mlx5e_open_channel+0xc09/0x3740 [mlx5_core] mlx5e_open_channels+0x1f0/0x770 [mlx5_core] mlx5e_safe_switch_params+0x1b5/0x2e0 [mlx5_core] set_feature_lro+0x1c2/0x330 [mlx5_core] mlx5e_handle_feature+0xc8/0x140 [mlx5_core] mlx5e_set_features+0x233/0x2e0 [mlx5_core] __netdev_update_features+0x5be/0x1670 __netdev_update_features+0x71f/0x1670 dev_ethtool+0x21c5/0x4aa0 dev_ioctl+0x438/0xae0 sock_ioctl+0x2ba/0x690 __x64_sys_ioctl+0xa78/0x1700 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 7e4d784f5810 ("net: hold netdev instance lock during rtnetlink operations") Signed-off-by: Cosmin Ratiu Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250509072850.2002821-1-cratiu@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 11da1e272ec205..0d891634c69270 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10441,6 +10441,7 @@ static void netdev_sync_lower_features(struct net_device *upper, if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); + netdev_lock_ops(lower); lower->wanted_features &= ~feature; __netdev_update_features(lower); @@ -10449,6 +10450,7 @@ static void netdev_sync_lower_features(struct net_device *upper, &feature, lower->name); else netdev_features_change(lower); + netdev_unlock_ops(lower); } } } From 687933109d7157f2c63d05e7ea0bed07d909eef3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 9 May 2025 14:38:16 +0300 Subject: [PATCH 133/353] net: dsa: sja1105: discard incoming frames in BR_STATE_LISTENING It has been reported that when under a bridge with stp_state=1, the logs get spammed with this message: [ 251.734607] fsl_dpaa2_eth dpni.5 eth0: Couldn't decode source port Further debugging shows the following info associated with packets: source_port=-1, switch_id=-1, vid=-1, vbid=1 In other words, they are data plane packets which are supposed to be decoded by dsa_tag_8021q_find_port_by_vbid(), but the latter (correctly) refuses to do so, because no switch port is currently in BR_STATE_LEARNING or BR_STATE_FORWARDING - so the packet is effectively unexpected. The error goes away after the port progresses to BR_STATE_LEARNING in 15 seconds (the default forward_time of the bridge), because then, dsa_tag_8021q_find_port_by_vbid() can correctly associate the data plane packets with a plausible bridge port in a plausible STP state. Re-reading IEEE 802.1D-1990, I see the following: "4.4.2 Learning: (...) The Forwarding Process shall discard received frames." IEEE 802.1D-2004 further clarifies: "DISABLED, BLOCKING, LISTENING, and BROKEN all correspond to the DISCARDING port state. While those dot1dStpPortStates serve to distinguish reasons for discarding frames, the operation of the Forwarding and Learning processes is the same for all of them. (...) LISTENING represents a port that the spanning tree algorithm has selected to be part of the active topology (computing a Root Port or Designated Port role) but is temporarily discarding frames to guard against loops or incorrect learning." Well, this is not what the driver does - instead it sets mac[port].ingress = true. To get rid of the log spam, prevent unexpected data plane packets to be received by software by discarding them on ingress in the LISTENING state. In terms of blame attribution: the prints only date back to commit d7f9787a763f ("net: dsa: tag_8021q: add support for imprecise RX based on the VBID"). However, the settings would permit a LISTENING port to forward to a FORWARDING port, and the standard suggests that's not OK. Fixes: 640f763f98c2 ("net: dsa: sja1105: Add support for Spanning Tree Protocol") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250509113816.2221992-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index f8454f3b6f9c5d..f674c400f05b29 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2081,6 +2081,7 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port, switch (state) { case BR_STATE_DISABLED: case BR_STATE_BLOCKING: + case BR_STATE_LISTENING: /* From UM10944 description of DRPDTAG (why put this there?): * "Management traffic flows to the port regardless of the state * of the INGRESS flag". So BPDUs are still be allowed to pass. @@ -2090,11 +2091,6 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port, mac[port].egress = false; mac[port].dyn_learn = false; break; - case BR_STATE_LISTENING: - mac[port].ingress = true; - mac[port].egress = false; - mac[port].dyn_learn = false; - break; case BR_STATE_LEARNING: mac[port].ingress = true; mac[port].egress = false; From 8f75c7b2780cdffa6f5e98d668b6bee421cd9148 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 4 May 2025 10:14:33 +0200 Subject: [PATCH 134/353] net: dsa: microchip: let phylink manage PHY EEE configuration on KSZ switches Phylink expects MAC drivers to provide LPI callbacks to properly manage Energy Efficient Ethernet (EEE) configuration. On KSZ switches with integrated PHYs, LPI is internally handled by hardware, while ports without integrated PHYs have no documented MAC-level LPI support. Provide dummy mac_disable_tx_lpi() and mac_enable_tx_lpi() callbacks to satisfy phylink requirements. Also, set default EEE capabilities during phylink initialization where applicable. Since phylink can now gracefully handle optional EEE configuration, remove the need for the MICREL_NO_EEE PHY flag. This change addresses issues caused by incomplete EEE refactoring introduced in commit fe0d4fd9285e ("net: phy: Keep track of EEE configuration"). It is not easily possible to fix all older kernels, but this patch ensures proper behavior on latest kernels and can be considered for backporting to stable kernels starting from v6.14. Fixes: fe0d4fd9285e ("net: phy: Keep track of EEE configuration") Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org # v6.14+ Link: https://patch.msgid.link/20250504081434.424489-2-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/dsa/microchip/ksz_common.c | 135 ++++++++++++++++++++----- 1 file changed, 107 insertions(+), 28 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 89f0796894af66..f95a9aac56ee1b 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -265,16 +265,70 @@ static void ksz_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface); +/** + * ksz_phylink_mac_disable_tx_lpi() - Callback to signal LPI support (Dummy) + * @config: phylink config structure + * + * This function is a dummy handler. See ksz_phylink_mac_enable_tx_lpi() for + * a detailed explanation of EEE/LPI handling in KSZ switches. + */ +static void ksz_phylink_mac_disable_tx_lpi(struct phylink_config *config) +{ +} + +/** + * ksz_phylink_mac_enable_tx_lpi() - Callback to signal LPI support (Dummy) + * @config: phylink config structure + * @timer: timer value before entering LPI (unused) + * @tx_clock_stop: whether to stop the TX clock in LPI mode (unused) + * + * This function signals to phylink that the driver architecture supports + * LPI management, enabling phylink to control EEE advertisement during + * negotiation according to IEEE Std 802.3 (Clause 78). + * + * Hardware Management of EEE/LPI State: + * For KSZ switch ports with integrated PHYs (e.g., KSZ9893R ports 1-2), + * observation and testing suggest that the actual EEE / Low Power Idle (LPI) + * state transitions are managed autonomously by the hardware based on + * the auto-negotiation results. (Note: While the datasheet describes EEE + * operation based on negotiation, it doesn't explicitly detail the internal + * MAC/PHY interaction, so autonomous hardware management of the MAC state + * for LPI is inferred from observed behavior). + * This hardware control, consistent with the switch's ability to operate + * autonomously via strapping, means MAC-level software intervention is not + * required or exposed for managing the LPI state once EEE is negotiated. + * (Ref: KSZ9893R Data Sheet DS00002420D, primarily Section 4.7.5 explaining + * EEE, also Sections 4.1.7 on Auto-Negotiation and 3.2.1 on Configuration + * Straps). + * + * Additionally, ports configured as MAC interfaces (e.g., KSZ9893R port 3) + * lack documented MAC-level LPI control. + * + * Therefore, this callback performs no action and serves primarily to inform + * phylink of LPI awareness and to document the inferred hardware behavior. + * + * Returns: 0 (Always success) + */ +static int ksz_phylink_mac_enable_tx_lpi(struct phylink_config *config, + u32 timer, bool tx_clock_stop) +{ + return 0; +} + static const struct phylink_mac_ops ksz88x3_phylink_mac_ops = { .mac_config = ksz88x3_phylink_mac_config, .mac_link_down = ksz_phylink_mac_link_down, .mac_link_up = ksz8_phylink_mac_link_up, + .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi, + .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi, }; static const struct phylink_mac_ops ksz8_phylink_mac_ops = { .mac_config = ksz_phylink_mac_config, .mac_link_down = ksz_phylink_mac_link_down, .mac_link_up = ksz8_phylink_mac_link_up, + .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi, + .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi, }; static const struct ksz_dev_ops ksz88xx_dev_ops = { @@ -358,6 +412,8 @@ static const struct phylink_mac_ops ksz9477_phylink_mac_ops = { .mac_config = ksz_phylink_mac_config, .mac_link_down = ksz_phylink_mac_link_down, .mac_link_up = ksz9477_phylink_mac_link_up, + .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi, + .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi, }; static const struct ksz_dev_ops ksz9477_dev_ops = { @@ -401,6 +457,8 @@ static const struct phylink_mac_ops lan937x_phylink_mac_ops = { .mac_config = ksz_phylink_mac_config, .mac_link_down = ksz_phylink_mac_link_down, .mac_link_up = ksz9477_phylink_mac_link_up, + .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi, + .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi, }; static const struct ksz_dev_ops lan937x_dev_ops = { @@ -2016,6 +2074,18 @@ static void ksz_phylink_get_caps(struct dsa_switch *ds, int port, if (dev->dev_ops->get_caps) dev->dev_ops->get_caps(dev, port, config); + + if (ds->ops->support_eee && ds->ops->support_eee(ds, port)) { + memcpy(config->lpi_interfaces, config->supported_interfaces, + sizeof(config->lpi_interfaces)); + + config->lpi_capabilities = MAC_100FD; + if (dev->info->gbit_capable[port]) + config->lpi_capabilities |= MAC_1000FD; + + /* EEE is fully operational */ + config->eee_enabled_default = true; + } } void ksz_r_mib_stats64(struct ksz_device *dev, int port) @@ -3008,31 +3078,6 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port) if (!port) return MICREL_KSZ8_P1_ERRATA; break; - case KSZ8567_CHIP_ID: - /* KSZ8567R Errata DS80000752C Module 4 */ - case KSZ8765_CHIP_ID: - case KSZ8794_CHIP_ID: - case KSZ8795_CHIP_ID: - /* KSZ879x/KSZ877x/KSZ876x Errata DS80000687C Module 2 */ - case KSZ9477_CHIP_ID: - /* KSZ9477S Errata DS80000754A Module 4 */ - case KSZ9567_CHIP_ID: - /* KSZ9567S Errata DS80000756A Module 4 */ - case KSZ9896_CHIP_ID: - /* KSZ9896C Errata DS80000757A Module 3 */ - case KSZ9897_CHIP_ID: - case LAN9646_CHIP_ID: - /* KSZ9897R Errata DS80000758C Module 4 */ - /* Energy Efficient Ethernet (EEE) feature select must be manually disabled - * The EEE feature is enabled by default, but it is not fully - * operational. It must be manually disabled through register - * controls. If not disabled, the PHY ports can auto-negotiate - * to enable EEE, and this feature can cause link drops when - * linked to another device supporting EEE. - * - * The same item appears in the errata for all switches above. - */ - return MICREL_NO_EEE; } return 0; @@ -3466,6 +3511,20 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port) return -EOPNOTSUPP; } +/** + * ksz_support_eee - Determine Energy Efficient Ethernet (EEE) support for a + * port + * @ds: Pointer to the DSA switch structure + * @port: Port number to check + * + * This function also documents devices where EEE was initially advertised but + * later withdrawn due to reliability issues, as described in official errata + * documents. These devices are explicitly listed to record known limitations, + * even if there is no technical necessity for runtime checks. + * + * Returns: true if the internal PHY on the given port supports fully + * operational EEE, false otherwise. + */ static bool ksz_support_eee(struct dsa_switch *ds, int port) { struct ksz_device *dev = ds->priv; @@ -3475,15 +3534,35 @@ static bool ksz_support_eee(struct dsa_switch *ds, int port) switch (dev->chip_id) { case KSZ8563_CHIP_ID: + case KSZ9563_CHIP_ID: + case KSZ9893_CHIP_ID: + return true; case KSZ8567_CHIP_ID: + /* KSZ8567R Errata DS80000752C Module 4 */ + case KSZ8765_CHIP_ID: + case KSZ8794_CHIP_ID: + case KSZ8795_CHIP_ID: + /* KSZ879x/KSZ877x/KSZ876x Errata DS80000687C Module 2 */ case KSZ9477_CHIP_ID: - case KSZ9563_CHIP_ID: + /* KSZ9477S Errata DS80000754A Module 4 */ case KSZ9567_CHIP_ID: - case KSZ9893_CHIP_ID: + /* KSZ9567S Errata DS80000756A Module 4 */ case KSZ9896_CHIP_ID: + /* KSZ9896C Errata DS80000757A Module 3 */ case KSZ9897_CHIP_ID: case LAN9646_CHIP_ID: - return true; + /* KSZ9897R Errata DS80000758C Module 4 */ + /* Energy Efficient Ethernet (EEE) feature select must be + * manually disabled + * The EEE feature is enabled by default, but it is not fully + * operational. It must be manually disabled through register + * controls. If not disabled, the PHY ports can auto-negotiate + * to enable EEE, and this feature can cause link drops when + * linked to another device supporting EEE. + * + * The same item appears in the errata for all switches above. + */ + break; } return false; From 8ad7da0ed4d5b088e7c4957002f1ba579d127d5a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 4 May 2025 10:14:34 +0200 Subject: [PATCH 135/353] net: phy: micrel: remove KSZ9477 EEE quirks now handled by phylink The KSZ9477 PHY driver contained workarounds for broken EEE capability advertisements by manually masking supported EEE modes and forcibly disabling EEE if MICREL_NO_EEE was set. With proper MAC-side EEE handling implemented via phylink, these quirks are no longer necessary. Remove MICREL_NO_EEE handling and the use of ksz9477_get_features(). This simplifies the PHY driver and avoids duplicated EEE management logic. Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org # v6.14+ Link: https://patch.msgid.link/20250504081434.424489-3-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/phy/micrel.c | 7 ------- include/linux/micrel_phy.h | 1 - 2 files changed, 8 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 24882d30f68589..e2c6569d8c45ca 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2027,12 +2027,6 @@ static int ksz9477_config_init(struct phy_device *phydev) return err; } - /* According to KSZ9477 Errata DS80000754C (Module 4) all EEE modes - * in this switch shall be regarded as broken. - */ - if (phydev->dev_flags & MICREL_NO_EEE) - phy_disable_eee(phydev); - return kszphy_config_init(phydev); } @@ -5705,7 +5699,6 @@ static struct phy_driver ksphy_driver[] = { .handle_interrupt = kszphy_handle_interrupt, .suspend = genphy_suspend, .resume = ksz9477_resume, - .get_features = ksz9477_get_features, } }; module_phy_driver(ksphy_driver); diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 591bf5b5e8dc22..9af01bdd86d26d 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -44,7 +44,6 @@ #define MICREL_PHY_50MHZ_CLK BIT(0) #define MICREL_PHY_FXEN BIT(1) #define MICREL_KSZ8_P1_ERRATA BIT(2) -#define MICREL_NO_EEE BIT(3) #define MICREL_KSZ9021_EXTREG_CTRL 0xB #define MICREL_KSZ9021_EXTREG_DATA_WRITE 0xC From 5bf2987ba05719cfc6f03d683ab299a4c0d64a6e Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 8 May 2025 11:14:24 +0200 Subject: [PATCH 136/353] net: dsa: b53: prevent standalone from trying to forward to other ports When bridged ports and standalone ports share a VLAN, e.g. via VLAN uppers, or untagged traffic with a vlan unaware bridge, the ASIC will still try to forward traffic to known FDB entries on standalone ports. But since the port VLAN masks prevent forwarding to bridged ports, this traffic will be dropped. This e.g. can be observed in the bridge_vlan_unaware ping tests, where this breaks pinging with learning on. Work around this by enabling the simplified EAP mode on switches supporting it for standalone ports, which causes the ASIC to redirect traffic of unknown source MAC addresses to the CPU port. Since standalone ports do not learn, there are no known source MAC addresses, so effectively this redirects all incoming traffic to the CPU port. Fixes: ff39c2d68679 ("net: dsa: b53: Add bridge support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250508091424.26870-1-jonas.gorski@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/b53/b53_common.c | 33 ++++++++++++++++++++++++++++++++ drivers/net/dsa/b53/b53_regs.h | 14 ++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9eb39cfa5fb275..7216eb8f949367 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -326,6 +326,26 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 vid, } } +static void b53_set_eap_mode(struct b53_device *dev, int port, int mode) +{ + u64 eap_conf; + + if (is5325(dev) || is5365(dev) || dev->chip_id == BCM5389_DEVICE_ID) + return; + + b53_read64(dev, B53_EAP_PAGE, B53_PORT_EAP_CONF(port), &eap_conf); + + if (is63xx(dev)) { + eap_conf &= ~EAP_MODE_MASK_63XX; + eap_conf |= (u64)mode << EAP_MODE_SHIFT_63XX; + } else { + eap_conf &= ~EAP_MODE_MASK; + eap_conf |= (u64)mode << EAP_MODE_SHIFT; + } + + b53_write64(dev, B53_EAP_PAGE, B53_PORT_EAP_CONF(port), eap_conf); +} + static void b53_set_forwarding(struct b53_device *dev, int enable) { u8 mgmt; @@ -586,6 +606,13 @@ int b53_setup_port(struct dsa_switch *ds, int port) b53_port_set_mcast_flood(dev, port, true); b53_port_set_learning(dev, port, false); + /* Force all traffic to go to the CPU port to prevent the ASIC from + * trying to forward to bridged ports on matching FDB entries, then + * dropping frames because it isn't allowed to forward there. + */ + if (dsa_is_user_port(ds, port)) + b53_set_eap_mode(dev, port, EAP_MODE_SIMPLIFIED); + return 0; } EXPORT_SYMBOL(b53_setup_port); @@ -2042,6 +2069,9 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge, pvlan |= BIT(i); } + /* Disable redirection of unknown SA to the CPU port */ + b53_set_eap_mode(dev, port, EAP_MODE_BASIC); + /* Configure the local port VLAN control membership to include * remote ports and update the local port bitmask */ @@ -2077,6 +2107,9 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) pvlan &= ~BIT(i); } + /* Enable redirection of unknown SA to the CPU port */ + b53_set_eap_mode(dev, port, EAP_MODE_SIMPLIFIED); + b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), pvlan); dev->ports[port].vlan_ctl_mask = pvlan; diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index bfbcb66bef6626..5f7a0e5c5709d3 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -50,6 +50,9 @@ /* Jumbo Frame Registers */ #define B53_JUMBO_PAGE 0x40 +/* EAP Registers */ +#define B53_EAP_PAGE 0x42 + /* EEE Control Registers Page */ #define B53_EEE_PAGE 0x92 @@ -480,6 +483,17 @@ #define JMS_MIN_SIZE 1518 #define JMS_MAX_SIZE 9724 +/************************************************************************* + * EAP Page Registers + *************************************************************************/ +#define B53_PORT_EAP_CONF(i) (0x20 + 8 * (i)) +#define EAP_MODE_SHIFT 51 +#define EAP_MODE_SHIFT_63XX 50 +#define EAP_MODE_MASK (0x3ull << EAP_MODE_SHIFT) +#define EAP_MODE_MASK_63XX (0x3ull << EAP_MODE_SHIFT_63XX) +#define EAP_MODE_BASIC 0 +#define EAP_MODE_SIMPLIFIED 3 + /************************************************************************* * EEE Configuration Page Registers *************************************************************************/ From b1b37c8a3e35ff6f0338a81929b5f30f9ae82ad7 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 11 May 2025 07:12:00 +0200 Subject: [PATCH 137/353] tools: ynl-gen: Allow multi-attr without nested-attributes again Since commit ce6cb8113c84 ("tools: ynl-gen: individually free previous values on double set"), specifying the "multi-attr" property raises an error unless the "nested-attributes" property is specified as well: File "tools/net/ynl/./pyynl/ynl_gen_c.py", line 1147, in _load_nested_sets child = self.pure_nested_structs.get(nested) ^^^^^^ UnboundLocalError: cannot access local variable 'nested' where it is not associated with a value This appears to be a bug since there are existing specs which omit "nested-attributes" on "multi-attr" attributes. Also, according to Documentation/userspace-api/netlink/specs.rst, multi-attr "is the recommended way of implementing arrays (no extra nesting)", suggesting that nesting should even be avoided in favor of multi-attr. Fix the indentation of the if-block introduced by the commit to avoid the error. Fixes: ce6cb8113c84 ("tools: ynl-gen: individually free previous values on double set") Signed-off-by: Lukas Wunner Link: https://patch.msgid.link/d6b58684b7e5bfb628f7313e6893d0097904e1d1.1746940107.git.lukas@wunner.de Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 30c0a34b2784ec..a7f08edbc23598 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1143,10 +1143,9 @@ def _load_nested_sets(self): self.pure_nested_structs[nested].request = True if attr in rs_members['reply']: self.pure_nested_structs[nested].reply = True - - if spec.is_multi_val(): - child = self.pure_nested_structs.get(nested) - child.in_multi_val = True + if spec.is_multi_val(): + child = self.pure_nested_structs.get(nested) + child.in_multi_val = True self._sort_pure_types() From 9906d852f09cc5bfca82c4e1b6faa4683072e514 Mon Sep 17 00:00:00 2001 From: Konstantin Shkolnyy Date: Wed, 7 May 2025 10:14:56 -0500 Subject: [PATCH 138/353] vsock/test: Fix occasional failure in SIOCOUTQ tests These tests: "SOCK_STREAM ioctl(SIOCOUTQ) 0 unsent bytes" "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes" output: "Unexpected 'SIOCOUTQ' value, expected 0, got 64 (CLIENT)". They test that the SIOCOUTQ ioctl reports 0 unsent bytes after the data have been received by the other side. However, sometimes there is a delay in updating this "unsent bytes" counter, and the test fails even though the counter properly goes to 0 several milliseconds later. The delay occurs in the kernel because the used buffer notification callback virtio_vsock_tx_done(), called upon receipt of the data by the other side, doesn't update the counter itself. It delegates that to a kernel thread (via vsock->tx_work). Sometimes that thread is delayed more than the test expects. Change the test to poll SIOCOUTQ until it returns 0 or a timeout occurs. Signed-off-by: Konstantin Shkolnyy Reviewed-by: Stefano Garzarella Fixes: 18ee44ce97c1 ("test/vsock: add ioctl unsent bytes test") Link: https://patch.msgid.link/20250507151456.2577061-1-kshk@linux.ibm.com Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index d0f6d253ac72d0..613551132a9663 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1264,21 +1264,25 @@ static void test_unsent_bytes_client(const struct test_opts *opts, int type) send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); control_expectln("RECEIVED"); - ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); - if (ret < 0) { - if (errno == EOPNOTSUPP) { - fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); - } else { + /* SIOCOUTQ isn't guaranteed to instantly track sent data. Even though + * the "RECEIVED" message means that the other side has received the + * data, there can be a delay in our kernel before updating the "unsent + * bytes" counter. Repeat SIOCOUTQ until it returns 0. + */ + timeout_begin(TIMEOUT); + do { + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); + break; + } perror("ioctl"); exit(EXIT_FAILURE); } - } else if (ret == 0 && sock_bytes_unsent != 0) { - fprintf(stderr, - "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", - sock_bytes_unsent); - exit(EXIT_FAILURE); - } - + timeout_check("SIOCOUTQ"); + } while (sock_bytes_unsent != 0); + timeout_end(); close(fd); } From 1886cb1604522f503a700174de74547c5246496c Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 11 May 2025 13:15:52 +0300 Subject: [PATCH 139/353] net/mlx5e: Disable MACsec offload for uplink representor profile MACsec offload is not supported in switchdev mode for uplink representors. When switching to the uplink representor profile, the MACsec offload feature must be cleared from the netdevice's features. If left enabled, attempts to add offloads result in a null pointer dereference, as the uplink representor does not support MACsec offload even though the feature bit remains set. Clear NETIF_F_HW_MACSEC in mlx5e_fix_uplink_rep_features(). Kernel log: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000f: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000078-0x000000000000007f] CPU: 29 UID: 0 PID: 4714 Comm: ip Not tainted 6.14.0-rc4_for_upstream_debug_2025_03_02_17_35 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__mutex_lock+0x128/0x1dd0 Code: d0 7c 08 84 d2 0f 85 ad 15 00 00 8b 35 91 5c fe 03 85 f6 75 29 49 8d 7e 60 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 a6 15 00 00 4d 3b 76 60 0f 85 fd 0b 00 00 65 ff RSP: 0018:ffff888147a4f160 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 000000000000000f RSI: 0000000000000000 RDI: 0000000000000078 RBP: ffff888147a4f2e0 R08: ffffffffa05d2c19 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: dffffc0000000000 R14: 0000000000000018 R15: ffff888152de0000 FS: 00007f855e27d800(0000) GS:ffff88881ee80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004e5768 CR3: 000000013ae7c005 CR4: 0000000000372eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? __mutex_lock+0x128/0x1dd0 ? lockdep_set_lock_cmp_fn+0x190/0x190 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? mutex_lock_io_nested+0x1ae0/0x1ae0 ? lock_acquire+0x1c2/0x530 ? macsec_upd_offload+0x145/0x380 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? kasan_save_stack+0x30/0x40 ? kasan_save_stack+0x20/0x40 ? kasan_save_track+0x10/0x30 ? __kasan_kmalloc+0x77/0x90 ? __kmalloc_noprof+0x249/0x6b0 ? genl_family_rcv_msg_attrs_parse.constprop.0+0xb5/0x240 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? mlx5e_macsec_add_rxsa+0x11a0/0x11a0 [mlx5_core] macsec_update_offload+0x26c/0x820 ? macsec_set_mac_address+0x4b0/0x4b0 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? _raw_spin_unlock_irqrestore+0x47/0x50 macsec_upd_offload+0x2c8/0x380 ? macsec_update_offload+0x820/0x820 ? __nla_parse+0x22/0x30 ? genl_family_rcv_msg_attrs_parse.constprop.0+0x15e/0x240 genl_family_rcv_msg_doit+0x1cc/0x2a0 ? genl_family_rcv_msg_attrs_parse.constprop.0+0x240/0x240 ? cap_capable+0xd4/0x330 genl_rcv_msg+0x3ea/0x670 ? genl_family_rcv_msg_dumpit+0x2a0/0x2a0 ? lockdep_set_lock_cmp_fn+0x190/0x190 ? macsec_update_offload+0x820/0x820 netlink_rcv_skb+0x12b/0x390 ? genl_family_rcv_msg_dumpit+0x2a0/0x2a0 ? netlink_ack+0xd80/0xd80 ? rwsem_down_read_slowpath+0xf90/0xf90 ? netlink_deliver_tap+0xcd/0xac0 ? netlink_deliver_tap+0x155/0xac0 ? _copy_from_iter+0x1bb/0x12c0 genl_rcv+0x24/0x40 netlink_unicast+0x440/0x700 ? netlink_attachskb+0x760/0x760 ? lock_acquire+0x1c2/0x530 ? __might_fault+0xbb/0x170 netlink_sendmsg+0x749/0xc10 ? netlink_unicast+0x700/0x700 ? __might_fault+0xbb/0x170 ? netlink_unicast+0x700/0x700 __sock_sendmsg+0xc5/0x190 ____sys_sendmsg+0x53f/0x760 ? import_iovec+0x7/0x10 ? kernel_sendmsg+0x30/0x30 ? __copy_msghdr+0x3c0/0x3c0 ? filter_irq_stacks+0x90/0x90 ? stack_depot_save_flags+0x28/0xa30 ___sys_sendmsg+0xeb/0x170 ? kasan_save_stack+0x30/0x40 ? copy_msghdr_from_user+0x110/0x110 ? do_syscall_64+0x6d/0x140 ? lock_acquire+0x1c2/0x530 ? __virt_addr_valid+0x116/0x3b0 ? __virt_addr_valid+0x1da/0x3b0 ? lock_downgrade+0x680/0x680 ? __delete_object+0x21/0x50 __sys_sendmsg+0xf7/0x180 ? __sys_sendmsg_sock+0x20/0x20 ? kmem_cache_free+0x14c/0x4e0 ? __x64_sys_close+0x78/0xd0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f855e113367 Code: 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RSP: 002b:00007ffd15e90c88 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f855e113367 RDX: 0000000000000000 RSI: 00007ffd15e90cf0 RDI: 0000000000000004 RBP: 00007ffd15e90dbc R08: 0000000000000028 R09: 000000000045d100 R10: 00007f855e011dd8 R11: 0000000000000246 R12: 0000000000000019 R13: 0000000067c6b785 R14: 00000000004a1e80 R15: 0000000000000000 Modules linked in: 8021q garp mrp sch_ingress openvswitch nsh mlx5_ib mlx5_fwctl mlx5_dpll mlx5_core rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: mlx5_core] ---[ end trace 0000000000000000 ]--- Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Carolina Jubran Reviewed-by: Shahar Shitrit Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Link: https://patch.msgid.link/1746958552-561295-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3506024c245391..9bd166f489e7ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4349,6 +4349,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev if (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER) netdev_warn(netdev, "Disabling HW_VLAN CTAG FILTERING, not supported in switchdev mode\n"); + features &= ~NETIF_F_HW_MACSEC; + if (netdev->features & NETIF_F_HW_MACSEC) + netdev_warn(netdev, "Disabling HW MACsec offload, not supported in switchdev mode\n"); + return features; } From 2cbd0680714ed7cc0f1b10421f700832a7262e00 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 12 May 2025 16:17:51 +0300 Subject: [PATCH 140/353] docs: networking: timestamping: improve stacked PHC sentence The first paragraph makes no grammatical sense. I suppose a portion of the intended sentece is missing: "[The challenge with ] stacked PHCs (...) is that they uncover bugs". Rephrase, and at the same time simplify the structure of the sentence a little bit, it is not easy to follow. Fixes: 94d9f78f4d64 ("docs: networking: timestamping: add section for stacked PHC devices") Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Link: https://patch.msgid.link/20250512131751.320283-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index b8fef8101176dc..7aabead906487e 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -811,11 +811,9 @@ Documentation/devicetree/bindings/ptp/timestamper.txt for more details. 3.2.4 Other caveats for MAC drivers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Stacked PHCs, especially DSA (but not only) - since that doesn't require any -modification to MAC drivers, so it is more difficult to ensure correctness of -all possible code paths - is that they uncover bugs which were impossible to -trigger before the existence of stacked PTP clocks. One example has to do with -this line of code, already presented earlier:: +The use of stacked PHCs may uncover MAC driver bugs which were impossible to +trigger without them. One example has to do with this line of code, already +presented earlier:: skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; From 79367e45d97c24ac5da1294d0b277abe30768879 Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Mon, 12 May 2025 10:18:27 +0530 Subject: [PATCH 141/353] qlcnic: fix memory leak in qlcnic_sriov_channel_cfg_cmd() In one of the error paths in qlcnic_sriov_channel_cfg_cmd(), the memory allocated in qlcnic_sriov_alloc_bc_mbx_args() for mailbox arguments is not freed. Fix that by jumping to the error path that frees them, by calling qlcnic_free_mbx_args(). This was found using static analysis. Fixes: f197a7aa6288 ("qlcnic: VF-PF communication channel implementation") Signed-off-by: Abdun Nihaal Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250512044829.36400-1-abdun.nihaal@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index 28d24d59efb84f..d57b976b904095 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -1484,8 +1484,11 @@ static int qlcnic_sriov_channel_cfg_cmd(struct qlcnic_adapter *adapter, u8 cmd_o } cmd_op = (cmd.rsp.arg[0] & 0xff); - if (cmd.rsp.arg[0] >> 25 == 2) - return 2; + if (cmd.rsp.arg[0] >> 25 == 2) { + ret = 2; + goto out; + } + if (cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) set_bit(QLC_BC_VF_STATE, &vf->state); else From 4ed7120f522f4bb1c7f6768157102f3c24c07a64 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Mon, 12 May 2025 11:59:01 +0530 Subject: [PATCH 142/353] octeontx2-pf: Fix ethtool support for SDP representors The hardware supports multiple MAC types, including RPM, SDP, and LBK. However, features such as link settings and pause frames are only available on RPM MAC, and not supported on SDP or LBK. This patch updates the ethtool operations logic accordingly to reflect this behavior. Fixes: 2f7f33a09516 ("octeontx2-pf: Add representors for sdp MAC") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 010385b2998869..45b8c9230184d8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -315,7 +315,7 @@ static void otx2_get_pauseparam(struct net_device *netdev, struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_pause_frm_cfg *req, *rsp; - if (is_otx2_lbkvf(pfvf->pdev)) + if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev)) return; mutex_lock(&pfvf->mbox.lock); @@ -347,7 +347,7 @@ static int otx2_set_pauseparam(struct net_device *netdev, if (pause->autoneg) return -EOPNOTSUPP; - if (is_otx2_lbkvf(pfvf->pdev)) + if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev)) return -EOPNOTSUPP; if (pause->rx_pause) @@ -941,8 +941,8 @@ static u32 otx2_get_link(struct net_device *netdev) { struct otx2_nic *pfvf = netdev_priv(netdev); - /* LBK link is internal and always UP */ - if (is_otx2_lbkvf(pfvf->pdev)) + /* LBK and SDP links are internal and always UP */ + if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev)) return 1; return pfvf->linfo.link_up; } @@ -1413,7 +1413,7 @@ static int otx2vf_get_link_ksettings(struct net_device *netdev, { struct otx2_nic *pfvf = netdev_priv(netdev); - if (is_otx2_lbkvf(pfvf->pdev)) { + if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev)) { cmd->base.duplex = DUPLEX_FULL; cmd->base.speed = SPEED_100000; } else { From a20db20b5b6a724b12aebcc14bd166fcde9b50fb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 13 May 2025 15:13:16 -0700 Subject: [PATCH 143/353] netlink: specs: tc: fix a couple of attribute names Fix up spelling of two attribute names. These are clearly typoes and will prevent C codegen from working. Let's treat this as a fix to get the correction into users' hands ASAP, and prevent anyone depending on the wrong names. Fixes: a1bcfde83669 ("doc/netlink/specs: Add a spec for tc") Link: https://patch.msgid.link/20250513221316.841700-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/tc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index aacccea5dfe42a..5e1ff04f51f267 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -2745,7 +2745,7 @@ attribute-sets: type: u16 byte-order: big-endian - - name: key-l2-tpv3-sid + name: key-l2tpv3-sid type: u32 byte-order: big-endian - @@ -3504,7 +3504,7 @@ attribute-sets: name: rate64 type: u64 - - name: prate4 + name: prate64 type: u64 - name: burst From 1560c876e347882f965b4bb983e0f82cc1055e0e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 13 May 2025 15:16:38 -0700 Subject: [PATCH 144/353] netlink: specs: tc: all actions are indexed arrays Some TC filters have actions listed as indexed arrays of nests and some as just nests. They are all indexed arrays, the handling is common across filters. Fixes: 2267672a6190 ("doc/netlink/specs: Update the tc spec") Link: https://patch.msgid.link/20250513221638.842532-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/tc.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index 5e1ff04f51f267..953aa837958b3f 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -2017,7 +2017,8 @@ attribute-sets: attributes: - name: act - type: nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: police @@ -2250,7 +2251,8 @@ attribute-sets: attributes: - name: act - type: nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: police From 551039dd3bae53cecd71cad90841f097c70b9d0b Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 12 May 2025 18:12:36 +0530 Subject: [PATCH 145/353] octeontx2-pf: macsec: Fix incorrect max transmit size in TX secy MASCEC hardware block has a field called maximum transmit size for TX secy. Max packet size going out of MCS block has be programmed taking into account full packet size which has L2 header,SecTag and ICV. MACSEC offload driver is configuring max transmit size as macsec interface MTU which is incorrect. Say with 1500 MTU of real device, macsec interface created on top of real device will have MTU of 1468(1500 - (SecTag + ICV)). This is causing packets from macsec interface of size greater than or equal to 1468 are not getting transmitted out because driver programmed max transmit size as 1468 instead of 1514(1500 + ETH_HDR_LEN). Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Subbaraya Sundeep Reviewed-by: Simon Horman Link: https://patch.msgid.link/1747053756-4529-1-git-send-email-sbhatta@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index f3b9daffaec3c2..4c7e0f345cb5ba 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -531,7 +531,8 @@ static int cn10k_mcs_write_tx_secy(struct otx2_nic *pfvf, if (sw_tx_sc->encrypt) sectag_tci |= (MCS_TCI_E | MCS_TCI_C); - policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, secy->netdev->mtu); + policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, + pfvf->netdev->mtu + OTX2_ETH_HLEN); /* Write SecTag excluding AN bits(1..0) */ policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_TCI, sectag_tci >> 2); policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_OFFSET, tag_offset); From 5c497765d9ae60f70c58a392449a82ddbc4ceb8e Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Tue, 13 May 2025 10:10:07 +0800 Subject: [PATCH 146/353] net: txgbe: Fix to calculate EEPROM checksum for AML devices In the new firmware version, the shadow ram reserves some space to store I2C information, so the checksum calculation needs to skip this section. Otherwise, the driver will fail to probe because the invalid EEPROM checksum. Fixes: 2e5af6b2ae85 ("net: txgbe: Add basic support for new AML devices") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/1C6BF7A937237F5A+20250513021009.145708-2-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c | 8 +++++++- drivers/net/ethernet/wangxun/txgbe/txgbe_type.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c index 4b9921b7bb1125..a054b259d435dd 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c @@ -99,9 +99,15 @@ static int txgbe_calc_eeprom_checksum(struct wx *wx, u16 *checksum) } local_buffer = eeprom_ptrs; - for (i = 0; i < TXGBE_EEPROM_LAST_WORD; i++) + for (i = 0; i < TXGBE_EEPROM_LAST_WORD; i++) { + if (wx->mac.type == wx_mac_aml) { + if (i >= TXGBE_EEPROM_I2C_SRART_PTR && + i < TXGBE_EEPROM_I2C_END_PTR) + local_buffer[i] = 0xffff; + } if (i != wx->eeprom.sw_region_offset + TXGBE_EEPROM_CHECKSUM) *checksum += local_buffer[i]; + } kvfree(eeprom_ptrs); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h index 9c1c26234cad95..f423012dec2256 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h @@ -158,6 +158,8 @@ #define TXGBE_EEPROM_VERSION_L 0x1D #define TXGBE_EEPROM_VERSION_H 0x1E #define TXGBE_ISCSI_BOOT_CONFIG 0x07 +#define TXGBE_EEPROM_I2C_SRART_PTR 0x580 +#define TXGBE_EEPROM_I2C_END_PTR 0x800 #define TXGBE_MAX_MSIX_VECTORS 64 #define TXGBE_MAX_FDIR_INDICES 63 From bb399bdd09ae6dbfc96817ed4804002ec664ddfe Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Tue, 13 May 2025 10:10:08 +0800 Subject: [PATCH 147/353] net: libwx: Fix FW mailbox reply timeout For the new SW-FW interaction, the timeout waiting for the firmware to return is too short. So that some mailbox commands cannot be completed. Use the 'timeout' parameter instead of fixed timeout value for flexible configuration. Fixes: 2e5af6b2ae85 ("net: txgbe: Add basic support for new AML devices") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/5D5BDE3EA501BDB8+20250513021009.145708-3-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index aed45abafb1b79..ccdc57e9b0d56c 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -434,8 +434,8 @@ static int wx_host_interface_command_r(struct wx *wx, u32 *buffer, wr32m(wx, WX_SW2FW_MBOX_CMD, WX_SW2FW_MBOX_CMD_VLD, WX_SW2FW_MBOX_CMD_VLD); /* polling reply from FW */ - err = read_poll_timeout(wx_poll_fw_reply, reply, reply, 1000, 50000, - true, wx, buffer, send_cmd); + err = read_poll_timeout(wx_poll_fw_reply, reply, reply, 2000, + timeout * 1000, true, wx, buffer, send_cmd); if (err) { wx_err(wx, "Polling from FW messages timeout, cmd: 0x%x, index: %d\n", send_cmd, wx->swfw_index); From 6b0ad8c0ae00e18babdc56195f18482eb84bf9f4 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Tue, 13 May 2025 10:10:09 +0800 Subject: [PATCH 148/353] net: libwx: Fix FW mailbox unknown command For the new SW-FW interaction, missing the error return if there is an unknown command. It causes the driver to mistakenly believe that the interaction is complete. This problem occurs when new driver is paired with old firmware, which does not support the new mailbox commands. Fixes: 2e5af6b2ae85 ("net: txgbe: Add basic support for new AML devices") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/64DBB705D35A0016+20250513021009.145708-4-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index ccdc57e9b0d56c..490d34233d38c5 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -442,6 +442,12 @@ static int wx_host_interface_command_r(struct wx *wx, u32 *buffer, goto rel_out; } + if (hdr->cmd_or_resp.ret_status == 0x80) { + wx_err(wx, "Unknown FW command: 0x%x\n", send_cmd); + err = -EINVAL; + goto rel_out; + } + /* expect no reply from FW then return */ if (!return_data) goto rel_out; From 3d6dfa1c2910280967f4834318e002624295953e Mon Sep 17 00:00:00 2001 From: Bo-Cun Chen Date: Tue, 13 May 2025 05:27:30 +0100 Subject: [PATCH 149/353] net: ethernet: mtk_eth_soc: fix typo for declaration MT7988 ESW capability Since MTK_ESW_BIT is a bit number rather than a bitmap, it causes MTK_HAS_CAPS to produce incorrect results. This leads to the ETH driver not declaring MAC capabilities correctly for the MT7988 ESW. Fixes: 445eb6448ed3 ("net: ethernet: mtk_eth_soc: add basic support for MT7988 SoC") Signed-off-by: Bo-Cun Chen Signed-off-by: Daniel Golle Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/b8b37f409d1280fad9c4d32521e6207f63cd3213.1747110258.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 22a532695fb0dd..6c92072b4c2808 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -4748,7 +4748,7 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) } if (mtk_is_netsys_v3_or_greater(mac->hw) && - MTK_HAS_CAPS(mac->hw->soc->caps, MTK_ESW_BIT) && + MTK_HAS_CAPS(mac->hw->soc->caps, MTK_ESW) && id == MTK_GMAC1_ID) { mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | From e98039616a1126acda3b892b928c503de8db124c Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 13 May 2025 12:45:54 +0530 Subject: [PATCH 150/353] octeontx2-af: Fix CGX Receive counters Each CGX block supports 4 logical MACs (LMACS). Receive counters CGX_CMR_RX_STAT0-8 are per LMAC and CGX_CMR_RX_STAT9-12 are per CGX. Due a bug in previous patch, stale Per CGX counters values observed. Fixes: 66208910e57a ("octeontx2-af: Support to retrieve CGX LMAC stats") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250513071554.728922-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 0b27a695008bdb..971993586fb49d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -717,6 +717,11 @@ int cgx_get_rx_stats(void *cgxd, int lmac_id, int idx, u64 *rx_stat) if (!is_lmac_valid(cgx, lmac_id)) return -ENODEV; + + /* pass lmac as 0 for CGX_CMR_RX_STAT9-12 */ + if (idx >= CGX_RX_STAT_GLOBAL_INDEX) + lmac_id = 0; + *rx_stat = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_STAT0 + (idx * 8)); return 0; } From 68d18b3b5c85981d9be851cb85f925c0c5bc0900 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:00 -0700 Subject: [PATCH 151/353] Drivers: hv: Allow vmbus_sendpacket_mpb_desc() to create multiple ranges vmbus_sendpacket_mpb_desc() is currently used only by the storvsc driver and is hardcoded to create a single GPA range. To allow it to also be used by the netvsc driver to create multiple GPA ranges, no longer hardcode as having a single GPA range. Allow the calling driver to specify the rangecount in the supplied descriptor. Update the storvsc driver to reflect this new approach. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-2-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/hv/channel.c | 6 +++--- drivers/scsi/storvsc_drv.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fb8cd8469328ee..4ffd5eaa781725 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1136,9 +1136,10 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. */ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1160,7 +1161,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 35db061ae3ecde..2e6b2412d2c946 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1819,6 +1819,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) return SCSI_MLQUEUE_DEVICE_BUSY; } + payload->rangecount = 1; payload->range.len = length; payload->range.offset = offset_in_hvpg; From b5b08f2a12eda209858db3856cdb0bc759e91fe6 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:01 -0700 Subject: [PATCH 152/353] hv_netvsc: Use vmbus_sendpacket_mpb_desc() to send VMBus messages netvsc currently uses vmbus_sendpacket_pagebuffer() to send VMBus messages. This function creates a series of GPA ranges, each of which contains a single PFN. However, if the rndis header in the VMBus message crosses a page boundary, the netvsc protocol with the host requires that both PFNs for the rndis header must be in a single "GPA range" data structure, which isn't possible with vmbus_sendpacket_pagebuffer(). As the first step in fixing this, add a new function netvsc_build_mpb_array() to build a VMBus message with multiple GPA ranges, each of which may contain multiple PFNs. Use vmbus_sendpacket_mpb_desc() to send this VMBus message to the host. There's no functional change since higher levels of netvsc don't maintain or propagate knowledge of contiguous PFNs. Based on its input, netvsc_build_mpb_array() still produces a separate GPA range for each PFN and the behavior is the same as with vmbus_sendpacket_pagebuffer(). But the groundwork is laid for a subsequent patch to provide the necessary grouping. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-3-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc.c | 50 +++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index d6f5b9ea3109d2..5882d6670200ae 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1055,6 +1055,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev, return 0; } +/* Build an "array" of mpb entries describing the data to be transferred + * over VMBus. After the desc header fields, each "array" entry is variable + * size, and each entry starts after the end of the previous entry. The + * "offset" and "len" fields for each entry imply the size of the entry. + * + * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V + * uses that granularity, even if the system page size of the guest is larger. + * Each entry in the input "pb" array must describe a contiguous range of + * guest physical memory so that the pfns are sequential if the range crosses + * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE. + */ +static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb, + u32 page_buffer_count, + struct vmbus_packet_mpb_array *desc, + u32 *desc_size) +{ + struct hv_mpb_array *mpb_entry = &desc->range; + int i, j; + + for (i = 0; i < page_buffer_count; i++) { + u32 offset = pb[i].offset; + u32 len = pb[i].len; + + mpb_entry->offset = offset; + mpb_entry->len = len; + + for (j = 0; j < HVPFN_UP(offset + len); j++) + mpb_entry->pfn_array[j] = pb[i].pfn + j; + + mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j]; + } + + desc->rangecount = page_buffer_count; + *desc_size = (char *)mpb_entry - (char *)desc; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -1097,6 +1133,9 @@ static inline int netvsc_send_pkt( packet->dma_range = NULL; if (packet->page_buf_cnt) { + struct vmbus_channel_packet_page_buffer desc; + u32 desc_size; + if (packet->cp_partial) pb += packet->rmsg_pgcnt; @@ -1106,11 +1145,12 @@ static inline int netvsc_send_pkt( goto exit; } - ret = vmbus_sendpacket_pagebuffer(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, sizeof(nvmsg), - req_id); - + netvsc_build_mpb_array(pb, packet->page_buf_cnt, + (struct vmbus_packet_mpb_array *)&desc, + &desc_size); + ret = vmbus_sendpacket_mpb_desc(out_channel, + (struct vmbus_packet_mpb_array *)&desc, + desc_size, &nvmsg, sizeof(nvmsg), req_id); if (ret) netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { From 449fefc85b829182a5f396c8378cc443be72e091 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:02 -0700 Subject: [PATCH 153/353] hv_netvsc: Preserve contiguous PFN grouping in the page buffer array Starting with commit dca5161f9bd0 ("hv_netvsc: Check status in SEND_RNDIS_PKT completion message") in the 6.3 kernel, the Linux driver for Hyper-V synthetic networking (netvsc) occasionally reports "nvsp_rndis_pkt_complete error status: 2".[1] This error indicates that Hyper-V has rejected a network packet transmit request from the guest, and the outgoing network packet is dropped. Higher level network protocols presumably recover and resend the packet so there is no functional error, but performance is slightly impacted. Commit dca5161f9bd0 is not the cause of the error -- it only added reporting of an error that was already happening without any notice. The error has presumably been present since the netvsc driver was originally introduced into Linux. The root cause of the problem is that the netvsc driver in Linux may send an incorrectly formatted VMBus message to Hyper-V when transmitting the network packet. The incorrect formatting occurs when the rndis header of the VMBus message crosses a page boundary due to how the Linux skb head memory is aligned. In such a case, two PFNs are required to describe the location of the rndis header, even though they are contiguous in guest physical address (GPA) space. Hyper-V requires that two rndis header PFNs be in a single "GPA range" data struture, but current netvsc code puts each PFN in its own GPA range, which Hyper-V rejects as an error. The incorrect formatting occurs only for larger packets that netvsc must transmit via a VMBus "GPA Direct" message. There's no problem when netvsc transmits a smaller packet by copying it into a pre- allocated send buffer slot because the pre-allocated slots don't have page crossing issues. After commit 14ad6ed30a10 ("net: allow small head cache usage with large MAX_SKB_FRAGS values") in the 6.14-rc4 kernel, the error occurs much more frequently in VMs with 16 or more vCPUs. It may occur every few seconds, or even more frequently, in an ssh session that outputs a lot of text. Commit 14ad6ed30a10 subtly changes how skb head memory is allocated, making it much more likely that the rndis header will cross a page boundary when the vCPU count is 16 or more. The changes in commit 14ad6ed30a10 are perfectly valid -- they just had the side effect of making the netvsc bug more prominent. Current code in init_page_array() creates a separate page buffer array entry for each PFN required to identify the data to be transmitted. Contiguous PFNs get separate entries in the page buffer array, and any information about contiguity is lost. Fix the core issue by having init_page_array() construct the page buffer array to represent contiguous ranges rather than individual pages. When these ranges are subsequently passed to netvsc_build_mpb_array(), it can build GPA ranges that contain multiple PFNs, as required to avoid the error "nvsp_rndis_pkt_complete error status: 2". If instead the network packet is sent by copying into a pre-allocated send buffer slot, the copy proceeds using the contiguous ranges rather than individual pages, but the result of the copying is the same. Also fix rndis_filter_send_request() to construct a contiguous range, since it has its own page buffer array. This change has a side benefit in CoCo VMs in that netvsc_dma_map() calls dma_map_single() on each contiguous range instead of on each page. This results in fewer calls to dma_map_single() but on larger chunks of memory, which should reduce contention on the swiotlb. Since the page buffer array now contains one entry for each contiguous range instead of for each individual page, the number of entries in the array can be reduced, saving 208 bytes of stack space in netvsc_xmit() when MAX_SKG_FRAGS has the default value of 17. [1] https://bugzilla.kernel.org/show_bug.cgi?id=217503 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217503 Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-4-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/hyperv_net.h | 12 ++++++ drivers/net/hyperv/netvsc_drv.c | 63 ++++++++----------------------- drivers/net/hyperv/rndis_filter.c | 24 +++--------- 3 files changed, 32 insertions(+), 67 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 70f7cb383228ea..76725f25abd52d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -893,6 +893,18 @@ struct nvsp_message { sizeof(struct nvsp_message)) #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) +/* Maximum # of contiguous data ranges that can make up a trasmitted packet. + * Typically it's the max SKB fragments plus 2 for the rndis packet and the + * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may + * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries + * in a GPA direct packet sent to netvsp over VMBus. + */ +#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT +#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2) +#else +#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT +#endif + /* Estimated requestor size: * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c51b318b8a72e4..929f6b3de76837 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -326,43 +326,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len, - struct hv_page_buffer *pb) -{ - int j = 0; - - hvpfn += offset >> HV_HYP_PAGE_SHIFT; - offset = offset & ~HV_HYP_PAGE_MASK; - - while (len > 0) { - unsigned long bytes; - - bytes = HV_HYP_PAGE_SIZE - offset; - if (bytes > len) - bytes = len; - pb[j].pfn = hvpfn; - pb[j].offset = offset; - pb[j].len = bytes; - - offset += bytes; - len -= bytes; - - if (offset == HV_HYP_PAGE_SIZE && len) { - hvpfn++; - offset = 0; - j++; - } - } - - return j + 1; -} - static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 slots_used = 0; - char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; @@ -371,28 +338,28 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - slots_used += fill_pg_buf(virt_to_hvpfn(hdr), - offset_in_hvpage(hdr), - len, - &pb[slots_used]); + pb[0].offset = offset_in_hvpage(hdr); + pb[0].len = len; + pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = slots_used; + packet->rmsg_pgcnt = 1; - slots_used += fill_pg_buf(virt_to_hvpfn(data), - offset_in_hvpage(data), - skb_headlen(skb), - &pb[slots_used]); + pb[1].offset = offset_in_hvpage(skb->data); + pb[1].len = skb_headlen(skb); + pb[1].pfn = virt_to_hvpfn(skb->data); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; + struct hv_page_buffer *cur_pb = &pb[i + 2]; + u64 pfn = page_to_hvpfn(skb_frag_page(frag)); + u32 offset = skb_frag_off(frag); - slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), - skb_frag_off(frag), - skb_frag_size(frag), - &pb[slots_used]); + cur_pb->offset = offset_in_hvpage(offset); + cur_pb->len = skb_frag_size(frag); + cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT); } - return slots_used; + return frags + 2; } static int count_skb_frag_slots(struct sk_buff *skb) @@ -483,7 +450,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) struct net_device *vf_netdev; u32 rndis_msg_size; u32 hash; - struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT]; + struct hv_page_buffer pb[MAX_DATA_RANGES]; /* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 82747dfacd70f0..9e73959e61ee0b 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { struct hv_netvsc_packet *packet; - struct hv_page_buffer page_buf[2]; - struct hv_page_buffer *pb = page_buf; + struct hv_page_buffer pb; int ret; /* Setup the packet to send it */ @@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->total_data_buflen = req->request_msg.msg_len; packet->page_buf_cnt = 1; - pb[0].pfn = virt_to_phys(&req->request_msg) >> - HV_HYP_PAGE_SHIFT; - pb[0].len = req->request_msg.msg_len; - pb[0].offset = offset_in_hvpage(&req->request_msg); - - /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { - packet->page_buf_cnt++; - pb[0].len = HV_HYP_PAGE_SIZE - - pb[0].offset; - pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> HV_HYP_PAGE_SHIFT; - pb[1].offset = 0; - pb[1].len = req->request_msg.msg_len - - pb[0].len; - } + pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT; + pb.len = req->request_msg.msg_len; + pb.offset = offset_in_hvpage(&req->request_msg); trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); + ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false); rcu_read_unlock_bh(); return ret; From 192c1f8b170a195032c6c2744824f2c2216560e5 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:03 -0700 Subject: [PATCH 154/353] hv_netvsc: Remove rmsg_pgcnt init_page_array() now always creates a single page buffer array entry for the rndis message, even if the rndis message crosses a page boundary. As such, the number of page buffer array entries used for the rndis message must no longer be tracked -- it is always just 1. Remove the rmsg_pgcnt field and use "1" where the value is needed. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-5-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/hyperv_net.h | 1 - drivers/net/hyperv/netvsc.c | 7 +++---- drivers/net/hyperv/netvsc_drv.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 76725f25abd52d..cb6f5482d203e1 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -158,7 +158,6 @@ struct hv_netvsc_packet { u8 cp_partial; /* partial copy into send buffer */ u8 rmsg_size; /* RNDIS header and PPI size */ - u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */ u8 page_buf_cnt; u16 q_idx; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 5882d6670200ae..720104661d7f24 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -953,8 +953,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, + pend_size; int i; u32 padding = 0; - u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt; u32 remain; /* Add padding */ @@ -1137,7 +1136,7 @@ static inline int netvsc_send_pkt( u32 desc_size; if (packet->cp_partial) - pb += packet->rmsg_pgcnt; + pb++; ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); if (ret) { @@ -1299,7 +1298,7 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = section_index; if (packet->cp_partial) { - packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->page_buf_cnt--; packet->total_data_buflen = msd_len + packet->rmsg_size; } else { packet->page_buf_cnt = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 929f6b3de76837..d8b169ac0343c5 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -343,7 +343,6 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, pb[0].len = len; pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = 1; pb[1].offset = offset_in_hvpage(skb->data); pb[1].len = skb_headlen(skb); From 27ee2857f3825bc98c819b76052e9a299feab6ac Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:04 -0700 Subject: [PATCH 155/353] Drivers: hv: vmbus: Remove vmbus_sendpacket_pagebuffer() With the netvsc driver changed to use vmbus_sendpacket_mpb_desc() instead of vmbus_sendpacket_pagebuffer(), the latter has no remaining callers. Remove it. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-6-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/hv/channel.c | 59 ------------------------------------------ include/linux/hyperv.h | 7 ----- 2 files changed, 66 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 4ffd5eaa781725..35f26fa1ffe76e 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1076,65 +1076,6 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, } EXPORT_SYMBOL(vmbus_sendpacket); -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. - */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - /* * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index d6ffe01962c2f8..b52ac40d583095 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1167,13 +1167,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, From f63a2a920d256b44d8d98497807f76dec1720cc0 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 12 May 2025 18:22:37 +0530 Subject: [PATCH 156/353] octeontx2-pf: Do not reallocate all ntuple filters If ntuple filters count is modified followed by unicast filters count using devlink then the ntuple count set by user is ignored and all the ntuple filters are being reallocated. Fix this by storing the ntuple count set by user. Without this patch, say if user tries to modify ntuple count as 8 followed by ucast filter count as 4 using devlink commands then ntuple count is being reverted to default value 16 i.e, not retaining user set value 8. Fixes: 39c469188b6d ("octeontx2-pf: Add ucast filter count configurability via devlink.") Signed-off-by: Subbaraya Sundeep Reviewed-by: Simon Horman Link: https://patch.msgid.link/1747054357-5850-1-git-send-email-sbhatta@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 1e88422825be7a..d6b4b74e4002b1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -356,6 +356,7 @@ struct otx2_flow_config { struct list_head flow_list_tc; u8 ucast_flt_cnt; bool ntuple; + u16 ntuple_cnt; }; struct dev_hw_ops { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c index 33ec9a7f7c0339..e13ae5484c19cb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c @@ -41,6 +41,7 @@ static int otx2_dl_mcam_count_set(struct devlink *devlink, u32 id, if (!pfvf->flow_cfg) return 0; + pfvf->flow_cfg->ntuple_cnt = ctx->val.vu16; otx2_alloc_mcam_entries(pfvf, ctx->val.vu16); return 0; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 47bfd1fb37d4bc..64c6d9162ef644 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -247,7 +247,7 @@ int otx2_mcam_entry_init(struct otx2_nic *pfvf) mutex_unlock(&pfvf->mbox.lock); /* Allocate entries for Ntuple filters */ - count = otx2_alloc_mcam_entries(pfvf, OTX2_DEFAULT_FLOWCOUNT); + count = otx2_alloc_mcam_entries(pfvf, flow_cfg->ntuple_cnt); if (count <= 0) { otx2_clear_ntuple_flow_info(pfvf, flow_cfg); return 0; @@ -307,6 +307,7 @@ int otx2_mcam_flow_init(struct otx2_nic *pf) INIT_LIST_HEAD(&pf->flow_cfg->flow_list_tc); pf->flow_cfg->ucast_flt_cnt = OTX2_DEFAULT_UNICAST_FLOWS; + pf->flow_cfg->ntuple_cnt = OTX2_DEFAULT_FLOWCOUNT; /* Allocate bare minimum number of MCAM entries needed for * unicast and ntuple filters. From 4043b038199aebe8f2239f016688c0efaf82bdc6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 May 2025 14:48:05 +0200 Subject: [PATCH 157/353] mlxsw: spectrum_router: Fix use-after-free when deleting GRE net devices The driver only offloads neighbors that are constructed on top of net devices registered by it or their uppers (which are all Ethernet). The device supports GRE encapsulation and decapsulation of forwarded traffic, but the driver will not offload dummy neighbors constructed on top of GRE net devices as they are not uppers of its net devices: # ip link add name gre1 up type gre tos inherit local 192.0.2.1 remote 198.51.100.1 # ip neigh add 0.0.0.0 lladdr 0.0.0.0 nud noarp dev gre1 $ ip neigh show dev gre1 nud noarp 0.0.0.0 lladdr 0.0.0.0 NOARP (Note that the neighbor is not marked with 'offload') When the driver is reloaded and the existing configuration is replayed, the driver does not perform the same check regarding existing neighbors and offloads the previously added one: # devlink dev reload pci/0000:01:00.0 $ ip neigh show dev gre1 nud noarp 0.0.0.0 lladdr 0.0.0.0 offload NOARP If the neighbor is later deleted, the driver will ignore the notification (given the GRE net device is not its upper) and will therefore keep referencing freed memory, resulting in a use-after-free [1] when the net device is deleted: # ip neigh del 0.0.0.0 lladdr 0.0.0.0 dev gre1 # ip link del dev gre1 Fix by skipping neighbor replay if the net device for which the replay is performed is not our upper. [1] BUG: KASAN: slab-use-after-free in mlxsw_sp_neigh_entry_update+0x1ea/0x200 Read of size 8 at addr ffff888155b0e420 by task ip/2282 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 mlxsw_sp_neigh_entry_update+0x1ea/0x200 mlxsw_sp_router_rif_gone_sync+0x2a8/0x440 mlxsw_sp_rif_destroy+0x1e9/0x750 mlxsw_sp_netdevice_ipip_ol_event+0x3c9/0xdc0 mlxsw_sp_router_netdevice_event+0x3ac/0x15e0 notifier_call_chain+0xca/0x150 call_netdevice_notifiers_info+0x7f/0x100 unregister_netdevice_many_notify+0xc8c/0x1d90 rtnl_dellink+0x34e/0xa50 rtnetlink_rcv_msg+0x6fb/0xb70 netlink_rcv_skb+0x131/0x360 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x121/0x1b0 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 8fdb09a7674c ("mlxsw: spectrum_router: Replay neighbours when RIF is made") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://patch.msgid.link/c53c02c904fde32dad484657be3b1477884e9ad6.1747225701.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 464821dd492dae..a2033837182e86 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3014,6 +3014,9 @@ static int mlxsw_sp_neigh_rif_made_sync(struct mlxsw_sp *mlxsw_sp, .rif = rif, }; + if (!mlxsw_sp_dev_lower_is_port(mlxsw_sp_rif_dev(rif))) + return 0; + neigh_for_each(&arp_tbl, mlxsw_sp_neigh_rif_made_sync_each, &rms); if (rms.err) goto err_arp; From b3bade8b459cd7cc1a72395d62e6c74483cd6231 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 13 May 2025 23:29:08 -0700 Subject: [PATCH 158/353] bnxt_en: bring back rtnl_lock() in the bnxt_open() path Error recovery, PCIe AER, resume, and TX timeout will invoke bnxt_open() with netdev_lock only. This will cause RTNL assert failure in netif_set_real_num_tx_queues(), netif_set_real_num_tx_queues(), and netif_set_real_num_tx_queues(). Example error recovery assert: RTNL: assertion failed at net/core/dev.c (3178) WARNING: CPU: 3 PID: 3392 at net/core/dev.c:3178 netif_set_real_num_tx_queues+0x1fd/0x210 Call Trace: ? __pfx_bnxt_msix+0x10/0x10 [bnxt_en] __bnxt_open_nic+0x1ef/0xb20 [bnxt_en] bnxt_open+0xda/0x130 [bnxt_en] bnxt_fw_reset_task+0x21f/0x780 [bnxt_en] process_scheduled_works+0x9d/0x400 For now, bring back rtnl_lock() in all these code paths that can invoke bnxt_open(). In the bnxt_queue_start() error path, we don't have rtnl_lock held so we just change it to call netif_close() instead of bnxt_reset_task() for simplicity. This error path is unlikely so it should be fine. Fixes: 004b5008016a ("eth: bnxt: remove most dependencies on RTNL") Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250514062908.2766677-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 36 ++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 86a5de44b6f3fe..6afc2ab6fad228 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -14013,13 +14013,28 @@ static void bnxt_unlock_sp(struct bnxt *bp) netdev_unlock(bp->dev); } +/* Same as bnxt_lock_sp() with additional rtnl_lock */ +static void bnxt_rtnl_lock_sp(struct bnxt *bp) +{ + clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + rtnl_lock(); + netdev_lock(bp->dev); +} + +static void bnxt_rtnl_unlock_sp(struct bnxt *bp) +{ + set_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + netdev_unlock(bp->dev); + rtnl_unlock(); +} + /* Only called from bnxt_sp_task() */ static void bnxt_reset(struct bnxt *bp, bool silent) { - bnxt_lock_sp(bp); + bnxt_rtnl_lock_sp(bp); if (test_bit(BNXT_STATE_OPEN, &bp->state)) bnxt_reset_task(bp, silent); - bnxt_unlock_sp(bp); + bnxt_rtnl_unlock_sp(bp); } /* Only called from bnxt_sp_task() */ @@ -14027,9 +14042,9 @@ static void bnxt_rx_ring_reset(struct bnxt *bp) { int i; - bnxt_lock_sp(bp); + bnxt_rtnl_lock_sp(bp); if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { - bnxt_unlock_sp(bp); + bnxt_rtnl_unlock_sp(bp); return; } /* Disable and flush TPA before resetting the RX ring */ @@ -14068,7 +14083,7 @@ static void bnxt_rx_ring_reset(struct bnxt *bp) } if (bp->flags & BNXT_FLAG_TPA) bnxt_set_tpa(bp, true); - bnxt_unlock_sp(bp); + bnxt_rtnl_unlock_sp(bp); } static void bnxt_fw_fatal_close(struct bnxt *bp) @@ -14960,15 +14975,17 @@ static void bnxt_fw_reset_task(struct work_struct *work) bp->fw_reset_state = BNXT_FW_RESET_STATE_OPENING; fallthrough; case BNXT_FW_RESET_STATE_OPENING: - while (!netdev_trylock(bp->dev)) { + while (!rtnl_trylock()) { bnxt_queue_fw_reset_work(bp, HZ / 10); return; } + netdev_lock(bp->dev); rc = bnxt_open(bp->dev); if (rc) { netdev_err(bp->dev, "bnxt_open() failed during FW reset\n"); bnxt_fw_reset_abort(bp, rc); netdev_unlock(bp->dev); + rtnl_unlock(); goto ulp_start; } @@ -14988,6 +15005,7 @@ static void bnxt_fw_reset_task(struct work_struct *work) bnxt_dl_health_fw_status_update(bp, true); } netdev_unlock(bp->dev); + rtnl_unlock(); bnxt_ulp_start(bp, 0); bnxt_reenable_sriov(bp); netdev_lock(bp->dev); @@ -15936,7 +15954,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) rc); napi_enable_locked(&bnapi->napi); bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); - bnxt_reset_task(bp, true); + netif_close(dev); return rc; } @@ -16752,6 +16770,7 @@ static int bnxt_resume(struct device *device) struct bnxt *bp = netdev_priv(dev); int rc = 0; + rtnl_lock(); netdev_lock(dev); rc = pci_enable_device(bp->pdev); if (rc) { @@ -16796,6 +16815,7 @@ static int bnxt_resume(struct device *device) resume_exit: netdev_unlock(bp->dev); + rtnl_unlock(); bnxt_ulp_start(bp, rc); if (!rc) bnxt_reenable_sriov(bp); @@ -16961,6 +16981,7 @@ static void bnxt_io_resume(struct pci_dev *pdev) int err; netdev_info(bp->dev, "PCI Slot Resume\n"); + rtnl_lock(); netdev_lock(netdev); err = bnxt_hwrm_func_qcaps(bp); @@ -16978,6 +16999,7 @@ static void bnxt_io_resume(struct pci_dev *pdev) netif_device_attach(netdev); netdev_unlock(netdev); + rtnl_unlock(); bnxt_ulp_start(bp, err); if (!err) bnxt_reenable_sriov(bp); From 3ea441c0c540cf1df37f98087b4529fde1846881 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 9 May 2025 11:46:45 -0700 Subject: [PATCH 159/353] wifi: mac80211: Set n_channels after allocating struct cfg80211_scan_request Make sure that n_channels is set after allocating the struct cfg80211_registered_device::int_scan_req member. Seen with syzkaller: UBSAN: array-index-out-of-bounds in net/mac80211/scan.c:1208:5 index 0 is out of range for type 'struct ieee80211_channel *[] __counted_by(n_channels)' (aka 'struct ieee80211_channel *[]') This was missed in the initial conversions because I failed to locate the allocation likely due to the "sizeof(void *)" not matching the "channels" array type. Reported-by: syzbot+4bcdddd48bb6f0be0da1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/680fd171.050a0220.2b69d1.045e.GAE@google.com/ Fixes: e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20250509184641.work.542-kees@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 741e6c7edcb7c7..6b6de43d9420ac 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1354,10 +1354,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR); - local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + - sizeof(void *) * channels, GFP_KERNEL); + local->int_scan_req = kzalloc(struct_size(local->int_scan_req, + channels, channels), + GFP_KERNEL); if (!local->int_scan_req) return -ENOMEM; + local->int_scan_req->n_channels = channels; eth_broadcast_addr(local->int_scan_req->bssid); From 5fdfa3cea5e6a3732bb9ebba90150d947cd665c4 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Tue, 6 May 2025 14:55:39 +0300 Subject: [PATCH 160/353] wifi: mt76: disable napi on driver removal A warning on driver removal started occurring after commit 9dd05df8403b ("net: warn if NAPI instance wasn't shut down"). Disable tx napi before deleting it in mt76_dma_cleanup(). WARNING: CPU: 4 PID: 18828 at net/core/dev.c:7288 __netif_napi_del_locked+0xf0/0x100 CPU: 4 UID: 0 PID: 18828 Comm: modprobe Not tainted 6.15.0-rc4 #4 PREEMPT(lazy) Hardware name: ASUS System Product Name/PRIME X670E-PRO WIFI, BIOS 3035 09/05/2024 RIP: 0010:__netif_napi_del_locked+0xf0/0x100 Call Trace: mt76_dma_cleanup+0x54/0x2f0 [mt76] mt7921_pci_remove+0xd5/0x190 [mt7921e] pci_device_remove+0x47/0xc0 device_release_driver_internal+0x19e/0x200 driver_detach+0x48/0x90 bus_remove_driver+0x6d/0xf0 pci_unregister_driver+0x2e/0xb0 __do_sys_delete_module.isra.0+0x197/0x2e0 do_syscall_64+0x7b/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Tested with mt7921e but the same pattern can be actually applied to other mt76 drivers calling mt76_dma_cleanup() during removal. Tx napi is enabled in their *_dma_init() functions and only toggled off and on again inside their suspend/resume/reset paths. So it should be okay to disable tx napi in such a generic way. Found by Linux Verification Center (linuxtesting.org). Fixes: 2ac515a5d74f ("mt76: mt76x02: use napi polling for tx cleanup") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Tested-by: Ming Yen Hsieh Link: https://patch.msgid.link/20250506115540.19045-1-pchelkin@ispras.ru Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 844af16ee55131..35b4ec91979e6a 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -1011,6 +1011,7 @@ void mt76_dma_cleanup(struct mt76_dev *dev) int i; mt76_worker_disable(&dev->tx_worker); + napi_disable(&dev->tx_napi); netif_napi_del(&dev->tx_napi); for (i = 0; i < ARRAY_SIZE(dev->phys); i++) { From 5dedd54df4dd9667724be93330007def21dc5046 Mon Sep 17 00:00:00 2001 From: Ming Yen Hsieh Date: Fri, 9 May 2025 09:04:20 +0800 Subject: [PATCH 161/353] wifi: mt76: mt7925: fix missing hdr_trans_tlv command for broadcast wtbl Ensure that the hdr_trans_tlv command is included in the broadcast wtbl to prevent the IPv6 and multicast packet from being dropped by the chip. Cc: stable@vger.kernel.org Fixes: cb1353ef3473 ("wifi: mt76: mt7925: integrate *mlo_sta_cmd and *sta_cmd") Reported-by: Benjamin Xiao Tested-by: Niklas Schnelle Signed-off-by: Ming Yen Hsieh Link: https://lore.kernel.org/lkml/EmWnO5b-acRH1TXbGnkx41eJw654vmCR-8_xMBaPMwexCnfkvKCdlU5u19CGbaapJ3KRu-l3B-tSUhf8CCQwL0odjo6Cd5YG5lvNeB-vfdg=@pm.me/ Link: https://patch.msgid.link/20250509010421.403022-1-mingyen.hsieh@mediatek.com Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7925/mcu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c index e61da76b2097bf..14b1f603fb6224 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c @@ -1924,14 +1924,14 @@ mt7925_mcu_sta_cmd(struct mt76_phy *phy, mt7925_mcu_sta_mld_tlv(skb, info->vif, info->link_sta->sta); mt7925_mcu_sta_eht_mld_tlv(skb, info->vif, info->link_sta->sta); } - - mt7925_mcu_sta_hdr_trans_tlv(skb, info->vif, info->link_sta); } if (!info->enable) { mt7925_mcu_sta_remove_tlv(skb); mt76_connac_mcu_add_tlv(skb, STA_REC_MLD_OFF, sizeof(struct tlv)); + } else { + mt7925_mcu_sta_hdr_trans_tlv(skb, info->vif, info->link_sta); } return mt76_mcu_skb_send_msg(dev, skb, info->cmd, true); From 7d2ed39a1b3172e57d521ece24dac9f84214a146 Mon Sep 17 00:00:00 2001 From: Pengtao He Date: Wed, 14 May 2025 21:20:13 +0800 Subject: [PATCH 162/353] net/tls: fix kernel panic when alloc_page failed We cannot set frag_list to NULL pointer when alloc_page failed. It will be used in tls_strp_check_queue_ok when the next time tls_strp_read_sock is called. This is because we don't reset full_len in tls_strp_flush_anchor_copy() so the recv path will try to continue handling the partial record on the next call but we dettached the rcvq from the frag list. Alternative fix would be to reset full_len. Unable to handle kernel NULL pointer dereference at virtual address 0000000000000028 Call trace: tls_strp_check_rcv+0x128/0x27c tls_strp_data_ready+0x34/0x44 tls_data_ready+0x3c/0x1f0 tcp_data_ready+0x9c/0xe4 tcp_data_queue+0xf6c/0x12d0 tcp_rcv_established+0x52c/0x798 Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Signed-off-by: Pengtao He Link: https://patch.msgid.link/20250514132013.17274-1-hept.hept.hept@gmail.com Signed-off-by: Jakub Kicinski --- net/tls/tls_strp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 77e33e1e340e31..65b0da6fdf6a79 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -396,7 +396,6 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; shinfo = skb_shinfo(strp->anchor); - shinfo->frag_list = NULL; /* If we don't know the length go max plus page for cipher overhead */ need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; @@ -412,6 +411,8 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) page, 0, 0); } + shinfo->frag_list = NULL; + strp->copy_mode = 1; strp->stm.offset = 0; From ce253eca41c69c21febfd4df340c44a897042716 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Wed, 14 May 2025 21:56:57 +0200 Subject: [PATCH 163/353] tsnep: fix timestamping with a stacked DSA driver This driver is susceptible to a form of the bug explained in commit c26a2c2ddc01 ("gianfar: Fix TX timestamping with a stacked DSA driver") and in Documentation/networking/timestamping.rst section "Other caveats for MAC drivers", specifically it timestamps any skb which has SKBTX_HW_TSTAMP, and does not consider if timestamping has been enabled in adapter->hwtstamp_config.tx_type. Evaluate the proper TX timestamping condition only once on the TX path (in tsnep_xmit_frame_ring()) and store the result in an additional TX entry flag. Evaluate the new TX entry flag in the TX confirmation path (in tsnep_tx_poll()). This way SKBTX_IN_PROGRESS is set by the driver as required, but never evaluated. SKBTX_IN_PROGRESS shall not be evaluated as it can be set by a stacked DSA driver and evaluating it would lead to unwanted timestamps. Fixes: 403f69bbdbad ("tsnep: Add TSN endpoint Ethernet MAC driver") Suggested-by: Vladimir Oltean Signed-off-by: Gerhard Engleder Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250514195657.25874-1-gerhard@engleder-embedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/engleder/tsnep_main.c | 30 ++++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 625245b0845c7a..eba73246f98664 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -67,6 +67,8 @@ #define TSNEP_TX_TYPE_XDP_NDO_MAP_PAGE (TSNEP_TX_TYPE_XDP_NDO | TSNEP_TX_TYPE_MAP_PAGE) #define TSNEP_TX_TYPE_XDP (TSNEP_TX_TYPE_XDP_TX | TSNEP_TX_TYPE_XDP_NDO) #define TSNEP_TX_TYPE_XSK BIT(12) +#define TSNEP_TX_TYPE_TSTAMP BIT(13) +#define TSNEP_TX_TYPE_SKB_TSTAMP (TSNEP_TX_TYPE_SKB | TSNEP_TX_TYPE_TSTAMP) #define TSNEP_XDP_TX BIT(0) #define TSNEP_XDP_REDIRECT BIT(1) @@ -386,8 +388,7 @@ static void tsnep_tx_activate(struct tsnep_tx *tx, int index, int length, if (entry->skb) { entry->properties = length & TSNEP_DESC_LENGTH_MASK; entry->properties |= TSNEP_DESC_INTERRUPT_FLAG; - if ((entry->type & TSNEP_TX_TYPE_SKB) && - (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS)) + if ((entry->type & TSNEP_TX_TYPE_SKB_TSTAMP) == TSNEP_TX_TYPE_SKB_TSTAMP) entry->properties |= TSNEP_DESC_EXTENDED_WRITEBACK_FLAG; /* toggle user flag to prevent false acknowledge @@ -479,7 +480,8 @@ static int tsnep_tx_map_frag(skb_frag_t *frag, struct tsnep_tx_entry *entry, return mapped; } -static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count) +static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count, + bool do_tstamp) { struct device *dmadev = tx->adapter->dmadev; struct tsnep_tx_entry *entry; @@ -505,6 +507,9 @@ static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count) entry->type = TSNEP_TX_TYPE_SKB_INLINE; mapped = 0; } + + if (do_tstamp) + entry->type |= TSNEP_TX_TYPE_TSTAMP; } else { skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; @@ -558,11 +563,12 @@ static int tsnep_tx_unmap(struct tsnep_tx *tx, int index, int count) static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, struct tsnep_tx *tx) { - int count = 1; struct tsnep_tx_entry *entry; + bool do_tstamp = false; + int count = 1; int length; - int i; int retval; + int i; if (skb_shinfo(skb)->nr_frags > 0) count += skb_shinfo(skb)->nr_frags; @@ -579,7 +585,13 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, entry = &tx->entry[tx->write]; entry->skb = skb; - retval = tsnep_tx_map(skb, tx, count); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && + tx->adapter->hwtstamp_config.tx_type == HWTSTAMP_TX_ON) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + do_tstamp = true; + } + + retval = tsnep_tx_map(skb, tx, count, do_tstamp); if (retval < 0) { tsnep_tx_unmap(tx, tx->write, count); dev_kfree_skb_any(entry->skb); @@ -591,9 +603,6 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb, } length = retval; - if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - for (i = 0; i < count; i++) tsnep_tx_activate(tx, (tx->write + i) & TSNEP_RING_MASK, length, i == count - 1); @@ -844,8 +853,7 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget) length = tsnep_tx_unmap(tx, tx->read, count); - if ((entry->type & TSNEP_TX_TYPE_SKB) && - (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS) && + if (((entry->type & TSNEP_TX_TYPE_SKB_TSTAMP) == TSNEP_TX_TYPE_SKB_TSTAMP) && (__le32_to_cpu(entry->desc_wb->properties) & TSNEP_DESC_EXTENDED_WRITEBACK_FLAG)) { struct skb_shared_hwtstamps hwtstamps; From 1ea35d5a92a89dc9bbd5d2ac3d06e16752690ad8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 14 May 2025 15:40:28 +0000 Subject: [PATCH 164/353] net: devmem: fix kernel panic when netlink socket close after module unload Kernel panic occurs when a devmem TCP socket is closed after NIC module is unloaded. This is Devmem TCP unregistration scenarios. number is an order. (a)netlink socket close (b)pp destroy (c)uninstall result 1 2 3 OK 1 3 2 (d)Impossible 2 1 3 OK 3 1 2 (e)Kernel panic 2 3 1 (d)Impossible 3 2 1 (d)Impossible (a) netdev_nl_sock_priv_destroy() is called when devmem TCP socket is closed. (b) page_pool_destroy() is called when the interface is down. (c) mp_ops->uninstall() is called when an interface is unregistered. (d) There is no scenario in mp_ops->uninstall() is called before page_pool_destroy(). Because unregister_netdevice_many_notify() closes interfaces first and then calls mp_ops->uninstall(). (e) netdev_nl_sock_priv_destroy() accesses struct net_device to acquire netdev_lock(). But if the interface module has already been removed, net_device pointer is invalid, so it causes kernel panic. In summary, there are only 3 possible scenarios. A. sk close -> pp destroy -> uninstall. B. pp destroy -> sk close -> uninstall. C. pp destroy -> uninstall -> sk close. Case C is a kernel panic scenario. In order to fix this problem, It makes mp_dmabuf_devmem_uninstall() set binding->dev to NULL. It indicates an bound net_device was unregistered. It makes netdev_nl_sock_priv_destroy() do not acquire netdev_lock() if binding->dev is NULL. A new binding->lock is added to protect a dev of a binding. So, lock ordering is like below. priv->lock netdev_lock(dev) binding->lock Tests: Scenario A: ./ncdevmem -s 192.168.1.4 -c 192.168.1.2 -f $interface -l -p 8000 \ -v 7 -t 1 -q 1 & pid=$! sleep 10 kill $pid ip link set $interface down modprobe -rv $module Scenario B: ./ncdevmem -s 192.168.1.4 -c 192.168.1.2 -f $interface -l -p 8000 \ -v 7 -t 1 -q 1 & pid=$! sleep 10 ip link set $interface down kill $pid modprobe -rv $module Scenario C: ./ncdevmem -s 192.168.1.4 -c 192.168.1.2 -f $interface -l -p 8000 \ -v 7 -t 1 -q 1 & pid=$! sleep 10 modprobe -rv $module sleep 5 kill $pid Splat looks like: Oops: general protection fault, probably for non-canonical address 0xdffffc001fffa9f7: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI KASAN: probably user-memory-access in range [0x00000000fffd4fb8-0x00000000fffd4fbf] CPU: 0 UID: 0 PID: 2041 Comm: ncdevmem Tainted: G B W 6.15.0-rc1+ #2 PREEMPT(undef) 0947ec89efa0fd68838b78e36aa1617e97ff5d7f Tainted: [B]=BAD_PAGE, [W]=WARN RIP: 0010:__mutex_lock (./include/linux/sched.h:2244 kernel/locking/mutex.c:400 kernel/locking/mutex.c:443 kernel/locking/mutex.c:605 kernel/locking/mutex.c:746) Code: ea 03 80 3c 02 00 0f 85 4f 13 00 00 49 8b 1e 48 83 e3 f8 74 6a 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 34 48 89 fa 48 c1 ea 03 <0f> b6 f RSP: 0018:ffff88826f7ef730 EFLAGS: 00010203 RAX: dffffc0000000000 RBX: 00000000fffd4f88 RCX: ffffffffaa9bc811 RDX: 000000001fffa9f7 RSI: 0000000000000008 RDI: 00000000fffd4fbc RBP: ffff88826f7ef8b0 R08: 0000000000000000 R09: ffffed103e6aa1a4 R10: 0000000000000007 R11: ffff88826f7ef442 R12: fffffbfff669f65e R13: ffff88812a830040 R14: ffff8881f3550d20 R15: 00000000fffd4f88 FS: 0000000000000000(0000) GS:ffff888866c05000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000563bed0cb288 CR3: 00000001a7c98000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ... netdev_nl_sock_priv_destroy (net/core/netdev-genl.c:953 (discriminator 3)) genl_release (net/netlink/genetlink.c:653 net/netlink/genetlink.c:694 net/netlink/genetlink.c:705) ... netlink_release (net/netlink/af_netlink.c:737) ... __sock_release (net/socket.c:647) sock_close (net/socket.c:1393) Fixes: 1d22d3060b9b ("net: drop rtnl_lock for queue_mgmt operations") Signed-off-by: Taehee Yoo Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250514154028.1062909-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 7 +++++++ net/core/devmem.h | 2 ++ net/core/netdev-genl.c | 11 +++++++++++ 3 files changed, 20 insertions(+) diff --git a/net/core/devmem.c b/net/core/devmem.c index 6e27a47d049354..2db428ab6b8be4 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -200,6 +200,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, refcount_set(&binding->ref, 1); + mutex_init(&binding->lock); + binding->dmabuf = dmabuf; binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); @@ -379,6 +381,11 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv, xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { if (bound_rxq == rxq) { xa_erase(&binding->bound_rxqs, xa_idx); + if (xa_empty(&binding->bound_rxqs)) { + mutex_lock(&binding->lock); + binding->dev = NULL; + mutex_unlock(&binding->lock); + } break; } } diff --git a/net/core/devmem.h b/net/core/devmem.h index 7fc158d527293b..a1aabc9685cc67 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding { struct sg_table *sgt; struct net_device *dev; struct gen_pool *chunk_pool; + /* Protect dev */ + struct mutex lock; /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index dae9f0d432fb34..a877693fecd65a 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -979,14 +979,25 @@ void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; + netdevice_tracker dev_tracker; struct net_device *dev; mutex_lock(&priv->lock); list_for_each_entry_safe(binding, temp, &priv->bindings, list) { + mutex_lock(&binding->lock); dev = binding->dev; + if (!dev) { + mutex_unlock(&binding->lock); + net_devmem_unbind_dmabuf(binding); + continue; + } + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + mutex_unlock(&binding->lock); + netdev_lock(dev); net_devmem_unbind_dmabuf(binding); netdev_unlock(dev); + netdev_put(dev, &dev_tracker); } mutex_unlock(&priv->lock); } From b4092cd2cf9a7f15ff749f31ce778b6d825601f8 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Mon, 24 Mar 2025 15:36:25 +1000 Subject: [PATCH 165/353] HID: bpf: fix BTN_STYLUS for the XP Pen ACK05 remote Usage_Dig_BarrelSwitch was applied in the UsagePage_Button which incorrectly mapped to BTN_TOOL_PENCIL Link: https://gitlab.freedesktop.org/libevdev/udev-hid-bpf/-/merge_requests/193 Fixes: 834da375 ("bpf: add a v6.11+ compatible BPF fixup for the XPPen ACK05 remote") Link: https://patchwork.kernel.org/project/linux-input/patch/20250207-bpf-import-2025-02-07-v1-7-6048fdd5a206@kernel.org/ Signed-off-by: Peter Hutterer Signed-off-by: Jiri Kosina --- drivers/hid/bpf/progs/XPPen__ACK05.bpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c b/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c index 1a0aeea6a081cd..a754710fc90b8d 100644 --- a/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c +++ b/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c @@ -157,6 +157,7 @@ static const __u8 fixed_rdesc_vendor[] = { ReportCount(5) // padding Input(Const) // Byte 4 in report - just exists so we get to be a tablet pad + UsagePage_Digitizers Usage_Dig_BarrelSwitch // BTN_STYLUS ReportCount(1) ReportSize(1) From db92a13a01ce8908c094c22da5485fd6281e0fd6 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 9 Apr 2025 18:42:51 +0530 Subject: [PATCH 166/353] HID: hid-appletb-kbd: Fix wrong date and kernel version in sysfs interface docs The driver hid-appletb-kbd was upstreamed in kernel 6.15. But, due to an oversight on my part, I didn't change the kernel version and expected date while upstreaming the driver, thus it remained as 6.5, the original kernel version when the driver was developed for downstream. This commit should fix this. Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd index 2a19584d091e48..8c9718d83e9d71 100644 --- a/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd +++ b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd @@ -1,6 +1,6 @@ What: /sys/bus/hid/drivers/hid-appletb-kbd//mode -Date: September, 2023 -KernelVersion: 6.5 +Date: March, 2025 +KernelVersion: 6.15 Contact: linux-input@vger.kernel.org Description: The set of keys displayed on the Touch Bar. From 3127ebfe83b5d9dce9ca1eb26ab9a66331e92b23 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Thu, 27 Mar 2025 23:11:46 +0000 Subject: [PATCH 167/353] HID: thrustmaster: fix memory leak in thrustmaster_interrupts() In thrustmaster_interrupts(), the allocated send_buf is not freed if the usb_check_int_endpoints() check fails, leading to a memory leak. Fix this by ensuring send_buf is freed before returning in the error path. Fixes: 50420d7c79c3 ("HID: hid-thrustmaster: Fix warning in thrustmaster_probe by adding endpoint check") Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 3b81468a1df297..0bf70664c35ee1 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -174,6 +174,7 @@ static void thrustmaster_interrupts(struct hid_device *hdev) u8 ep_addr[2] = {b_ep, 0}; if (!usb_check_int_endpoints(usbif, ep_addr)) { + kfree(send_buf); hid_err(hdev, "Unexpected non-int endpoint\n"); return; } From d80fd089a4980a10c596841d35e6465a433ff18f Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Sat, 29 Mar 2025 00:20:03 +0000 Subject: [PATCH 168/353] HID: wacom: handle kzalloc() allocation failure in wacom_wac_queue_flush() During wacom_wac_queue_flush() the code calls kzalloc() to allocate a zero initialised buffer which it uses as a storage buffer to get data from the fifo via kfifo_out(). However it does not check kzalloc() for allocation failure which returns NULL and could potentially lead to a NULL deref. Fix this by checking for kzalloc() failure and skipping the current entry if allocation failure occurs. Fixes: 5e013ad20689 ("HID: wacom: Remove static WACOM_PKGLEN_MAX limit") Signed-off-by: Qasim Ijaz Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 1556d4287fa506..893a8c3e6bff92 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -70,10 +70,16 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, { while (!kfifo_is_empty(fifo)) { int size = kfifo_peek_len(fifo); - u8 *buf = kzalloc(size, GFP_KERNEL); + u8 *buf; unsigned int count; int err; + buf = kzalloc(size, GFP_KERNEL); + if (!buf) { + kfifo_skip(fifo); + continue; + } + count = kfifo_out(fifo, buf, size); if (count != size) { // Hard to say what is the "right" action in this From 7a16f3d71e135448a8f7f15b37d2b728be438f68 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Mon, 14 Apr 2025 19:33:09 +0100 Subject: [PATCH 169/353] HID: wacom: fix memory leak on size mismatch in wacom_wac_queue_flush() In wacom_wac_queue_flush() the code allocates zero initialised buffer which it uses as a storage buffer for copying data from a fifo via kfifo_out(). The kfifo_out() function returns the number of elements it has copied. The code checks if the number of copied elements does not equal the size of the fifo record, if it does not it simply skips the entry and continues to the next iteration. However it does not release the storage buffer leading to a memory leak. Fix the memory leak by freeing the buffer on size mismatch. Fixes: 5e013ad20689 ("HID: wacom: Remove static WACOM_PKGLEN_MAX limit") Reviewed-by: Jason Gerecke Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 893a8c3e6bff92..7ccdb1f615f9d0 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -87,6 +87,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, // to flush seems reasonable enough, however. hid_warn(hdev, "%s: removed fifo entry with unexpected size\n", __func__); + kfree(buf); continue; } err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false); From e7c7e1c7dcda8bef8d696af51286fc892cd4437c Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Tue, 1 Apr 2025 17:48:53 +0800 Subject: [PATCH 170/353] HID: uclogic: Add NULL check in uclogic_input_configured() devm_kasprintf() returns NULL when memory allocation fails. Currently, uclogic_input_configured() does not check for this case, which results in a NULL pointer dereference. Add NULL check after devm_kasprintf() to prevent this issue. Fixes: dd613a4e45f8 ("HID: uclogic: Correct devm device reference for hidinput input_dev name") Signed-off-by: Henry Martin Signed-off-by: Jiri Kosina --- drivers/hid/hid-uclogic-core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-uclogic-core.c b/drivers/hid/hid-uclogic-core.c index a367df6ea01f3d..61a4019ddc7430 100644 --- a/drivers/hid/hid-uclogic-core.c +++ b/drivers/hid/hid-uclogic-core.c @@ -142,11 +142,12 @@ static int uclogic_input_configured(struct hid_device *hdev, suffix = "System Control"; break; } - } - - if (suffix) + } else { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); + if (!hi->input->name) + return -ENOMEM; + } return 0; } From b5a9adcc60b9c0372bbe03145eca1dc9c25b0d8a Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Mon, 14 Apr 2025 19:33:17 +0100 Subject: [PATCH 171/353] HID: wacom: fix shift OOB in kfifo allocation for zero pktlen During wacom_parse_and_register() the code calls wacom_devm_kfifo_alloc to allocate a fifo. During this operation it passes kfifo_alloc a fifo_size of 0. Kfifo attempts to round the size passed to it to the next power of 2 via roundup_pow_of_two (queue-type data structures do this to maintain efficiency of operations). However during this phase a problem arises when the roundup_pow_of_two() function utilises a shift exponent of fls_long(n-1), where n is the fifo_size. Since n is 0 in this case and n is also an unsigned long, doing n-1 causes unsigned integer wrap-around to occur making the fifo_size 4294967295. So the code effectively does fls_long(4294967295) which results in 64. Returning back to roundup_pow_of_two(), the code utilises a shift exponent of 64. When a shift exponent of 64 is used on a 64-bit type such as 1UL it results in a shift-out-of-bounds. The root cause of the issue seems to stem from insufficient validation of wacom_compute_pktlen(), since in this case the fifo_size comes from wacom_wac->features.pktlen. During wacom_parse_and_register() the wacom_compute_pktlen() function sets the pktlen as 0. To fix this, we should handle cases where wacom_compute_pktlen() results in 0. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=d5204cbbdd921f1f7cad Fixes: 5e013ad20689 ("HID: wacom: Remove static WACOM_PKGLEN_MAX limit") Tested-by: Qasim Ijaz Reviewed-by: Jason Gerecke Cc: stable@vger.kernel.org Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 7ccdb1f615f9d0..eaf099b2efdb0a 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2368,6 +2368,8 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless) unsigned int connect_mask = HID_CONNECT_HIDRAW; features->pktlen = wacom_compute_pktlen(hdev); + if (!features->pktlen) + return -ENODEV; if (!devres_open_group(&hdev->dev, wacom, GFP_KERNEL)) return -ENOMEM; From 6dbe4550b07e5229f88357cd8b3bb76a8b18dbd2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 21 Apr 2025 16:32:09 -0500 Subject: [PATCH 172/353] HID: amd_sfh: Fix SRA sensor when it's the only sensor On systems that only have an SRA sensor connected to SFH the sensor doesn't get enabled due to a bad optimization condition of breaking the sensor walk loop. This optimization is unnecessary in the first place because if there is only one device then the loop only runs once. Drop the condition and explicitly mark sensor as enabled. Reported-by: Yijun Shen Tested-By: Yijun Shen Fixes: d1c444b47100d ("HID: amd_sfh: Add support to export device operating states") Cc: stable@vger.kernel.org Signed-off-by: Mario Limonciello Acked-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c index 25f0ebfcbd5f56..c1bdf1e0d44af9 100644 --- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c +++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c @@ -134,9 +134,6 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata) for (i = 0; i < cl_data->num_hid_devices; i++) { cl_data->sensor_sts[i] = SENSOR_DISABLED; - if (cl_data->num_hid_devices == 1 && cl_data->sensor_idx[0] == SRA_IDX) - break; - if (cl_data->sensor_idx[i] == SRA_IDX) { info.sensor_idx = cl_data->sensor_idx[i]; writel(0, privdata->mmio + amd_get_p2c_val(privdata, 0)); @@ -145,8 +142,10 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata) (privdata, cl_data->sensor_idx[i], ENABLE_SENSOR); cl_data->sensor_sts[i] = (status == 0) ? SENSOR_ENABLED : SENSOR_DISABLED; - if (cl_data->sensor_sts[i] == SENSOR_ENABLED) + if (cl_data->sensor_sts[i] == SENSOR_ENABLED) { + cl_data->is_any_sensor_enabled = true; privdata->dev_en.is_sra_present = true; + } continue; } From 8aeaa3f9db42148dd1dc3b17d22be5ebdba7200e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 21 Apr 2025 16:32:10 -0500 Subject: [PATCH 173/353] HID: amd_sfh: Avoid clearing reports for SRA sensor SRA sensor doesn't allocate any memory for reports. Skip trying to clear memory for that sensor in cleanup path. Suggested-by: Basavaraj Natikar Signed-off-by: Mario Limonciello Acked-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c index c1bdf1e0d44af9..0a9b44ce4904e4 100644 --- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c +++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c @@ -83,6 +83,9 @@ static int amd_sfh_hid_client_deinit(struct amd_mp2_dev *privdata) case ALS_IDX: privdata->dev_en.is_als_present = false; break; + case SRA_IDX: + privdata->dev_en.is_sra_present = false; + break; } if (cl_data->sensor_sts[i] == SENSOR_ENABLED) { @@ -237,6 +240,8 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata) cleanup: amd_sfh_hid_client_deinit(privdata); for (i = 0; i < cl_data->num_hid_devices; i++) { + if (cl_data->sensor_idx[i] == SRA_IDX) + continue; devm_kfree(dev, cl_data->feature_report[i]); devm_kfree(dev, in_data->input_report[i]); devm_kfree(dev, cl_data->report_descr[i]); From ffed6a0485b9371202acd401f901bc051b1e295d Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 16 Apr 2025 10:58:03 +0800 Subject: [PATCH 174/353] HID: hid-steam: Remove the unused variable connected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable connected is not effectively used, so delete it. drivers/hid/hid-steam.c:1153:7: warning: variable ‘connected’ set but not used. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=20462 Signed-off-by: Jiapeng Chong Signed-off-by: Jiri Kosina --- drivers/hid/hid-steam.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index dfd9d22ed559c8..949d307c66a806 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -1150,11 +1150,9 @@ static void steam_client_ll_close(struct hid_device *hdev) struct steam_device *steam = hdev->driver_data; unsigned long flags; - bool connected; spin_lock_irqsave(&steam->lock, flags); steam->client_opened--; - connected = steam->connected && !steam->client_opened; spin_unlock_irqrestore(&steam->lock, flags); schedule_work(&steam->unregister_work); From 90c2240e6f3d66aa6ec32c813cde6372502fccf6 Mon Sep 17 00:00:00 2001 From: Milton Barrera Date: Wed, 9 Apr 2025 00:04:28 -0600 Subject: [PATCH 175/353] HID: quirks: Add ADATA XPG alpha wireless mouse support This patch adds HID_QUIRK_ALWAYS_POLL for the ADATA XPG wireless gaming mouse (USB ID 125f:7505) and its USB dongle (USB ID 125f:7506). Without this quirk, the device does not generate input events properly. Signed-off-by: Milton Barrera Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 4 ++++ drivers/hid/hid-quirks.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 288a2b864cc41d..1062731315a2a5 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -41,6 +41,10 @@ #define USB_VENDOR_ID_ACTIONSTAR 0x2101 #define USB_DEVICE_ID_ACTIONSTAR_1011 0x1011 +#define USB_VENDOR_ID_ADATA_XPG 0x125f +#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE 0x7505 +#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE 0x7506 + #define USB_VENDOR_ID_ADS_TECH 0x06e1 #define USB_DEVICE_ID_ADS_TECH_RADIO_SI470X 0xa155 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 646171598e4132..0731473cc9b1ad 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -27,6 +27,8 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_GAMEPAD), HID_QUIRK_BADPAD }, { HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_PREDATOR), HID_QUIRK_BADPAD }, + { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_AFATECH, USB_DEVICE_ID_AFATECH_AF9016), HID_QUIRK_FULLSPEED_INTERVAL }, { HID_USB_DEVICE(USB_VENDOR_ID_AIREN, USB_DEVICE_ID_AIREN_SLIMPLUS), HID_QUIRK_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_AKAI_09E8, USB_DEVICE_ID_AKAI_09E8_MIDIMIX), HID_QUIRK_NO_INIT_REPORTS }, From c9372edb9f6783066a376b6291a3011e60cf2316 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Mon, 12 May 2025 23:24:19 +0800 Subject: [PATCH 176/353] HID: bpf: abort dispatch if device destroyed The current HID bpf implementation assumes no output report/request will go through it after hid_bpf_destroy_device() has been called. This leads to a bug that unplugging certain types of HID devices causes a cleaned- up SRCU to be accessed. The bug was previously a hidden failure until a recent x86 percpu change [1] made it access not-present pages. The bug will be triggered if the conditions below are met: A) a device under the driver has some LEDs on B) hid_ll_driver->request() is uninplemented (e.g., logitech-djreceiver) If condition A is met, hidinput_led_worker() is always scheduled *after* hid_bpf_destroy_device(). hid_destroy_device ` hid_bpf_destroy_device ` cleanup_srcu_struct(&hdev->bpf.srcu) ` hid_remove_device ` ... ` led_classdev_unregister ` led_trigger_set(led_cdev, NULL) ` led_set_brightness(led_cdev, LED_OFF) ` ... ` input_inject_event ` input_event_dispose ` hidinput_input_event ` schedule_work(&hid->led_work) [hidinput_led_worker] This is fine when condition B is not met, where hidinput_led_worker() calls hid_ll_driver->request(). This is the case for most HID drivers, which implement it or use the generic one from usbhid. The driver itself or an underlying driver will then abort processing the request. Otherwise, hidinput_led_worker() tries hid_hw_output_report() and leads to the bug. hidinput_led_worker ` hid_hw_output_report ` dispatch_hid_bpf_output_report ` srcu_read_lock(&hdev->bpf.srcu) ` srcu_read_unlock(&hdev->bpf.srcu, idx) The bug has existed since the introduction [2] of dispatch_hid_bpf_output_report(). However, the same bug also exists in dispatch_hid_bpf_raw_requests(), and I've reproduced (no visible effect because of the lack of [1], but confirmed bpf.destroyed == 1) the bug against the commit (i.e., the Fixes:) introducing the function. This is because hidinput_led_worker() falls back to hid_hw_raw_request() when hid_ll_driver->output_report() is uninplemented (e.g., logitech- djreceiver). hidinput_led_worker ` hid_hw_output_report: -ENOSYS ` hid_hw_raw_request ` dispatch_hid_bpf_raw_requests ` srcu_read_lock(&hdev->bpf.srcu) ` srcu_read_unlock(&hdev->bpf.srcu, idx) Fix the issue by returning early in the two mentioned functions if hid_bpf has been marked as destroyed. Though dispatch_hid_bpf_device_event() handles input events, and there is no evidence that it may be called after the destruction, the same check, as a safety net, is also added to it to maintain the consistency among all dispatch functions. The impact of the bug on other architectures is unclear. Even if it acts as a hidden failure, this is still dangerous because it corrupts whatever is on the address calculated by SRCU. Thus, CC'ing the stable list. [1]: commit 9d7de2aa8b41 ("x86/percpu/64: Use relative percpu offsets") [2]: commit 9286675a2aed ("HID: bpf: add HID-BPF hooks for hid_hw_output_report") Closes: https://lore.kernel.org/all/20250506145548.GGaBoi9Jzp3aeJizTR@fat_crate.local/ Fixes: 8bd0488b5ea5 ("HID: bpf: add HID-BPF hooks for hid_hw_raw_requests") Cc: stable@vger.kernel.org Signed-off-by: Rong Zhang Tested-by: Petr Tesarik Link: https://patch.msgid.link/20250512152420.87441-1-i@rong.moe Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/hid_bpf_dispatch.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 2e96ec6a3073da..9a06f9b0e4ef33 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -38,6 +38,9 @@ dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type struct hid_bpf_ops *e; int ret; + if (unlikely(hdev->bpf.destroyed)) + return ERR_PTR(-ENODEV); + if (type >= HID_REPORT_TYPES) return ERR_PTR(-EINVAL); @@ -93,6 +96,9 @@ int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, struct hid_bpf_ops *e; int ret, idx; + if (unlikely(hdev->bpf.destroyed)) + return -ENODEV; + if (rtype >= HID_REPORT_TYPES) return -EINVAL; @@ -130,6 +136,9 @@ int dispatch_hid_bpf_output_report(struct hid_device *hdev, struct hid_bpf_ops *e; int ret, idx; + if (unlikely(hdev->bpf.destroyed)) + return -ENODEV; + idx = srcu_read_lock(&hdev->bpf.srcu); list_for_each_entry_srcu(e, &hdev->bpf.prog_list, list, srcu_read_lock_held(&hdev->bpf.srcu)) { From dd91d125a9f54c38dff91a25aacbc98bd1327493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Sat, 3 May 2025 08:53:58 +0200 Subject: [PATCH 177/353] landlock: Remove KUnit test that triggers a warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A KUnit test checking boundaries triggers a canary warning, which may be disturbing. Let's remove this test for now. Hopefully, KUnit will soon get support for suppressing warning backtraces [1]. Cc: Alessandro Carminati Cc: Andrew Morton Cc: Günther Noack Reported-by: Tingmao Wang Closes: https://lore.kernel.org/r/20250327213807.12964-1-m@maowtm.org Link: https://lore.kernel.org/r/20250425193249.78b45d2589575c15f483c3d8@linux-foundation.org [1] Link: https://lore.kernel.org/r/20250503065359.3625407-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/landlock/audit.c b/security/landlock/audit.c index 7e5e0ed0e4e5fe..58d5c40d4d0e1d 100644 --- a/security/landlock/audit.c +++ b/security/landlock/audit.c @@ -175,7 +175,7 @@ static void test_get_hierarchy(struct kunit *const test) KUNIT_EXPECT_EQ(test, 10, get_hierarchy(&dom2, 0)->id); KUNIT_EXPECT_EQ(test, 20, get_hierarchy(&dom2, 1)->id); KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, 2)->id); - KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, -1)->id); + /* KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, -1)->id); */ } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ From 719fce76d1957f5e4903c4ba11c757d1b3e5c48a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 12 May 2025 11:37:30 +0200 Subject: [PATCH 178/353] landlock: Improve bit operations in audit code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the BIT() and BIT_ULL() macros in the new audit code instead of explicit shifts to improve readability. Use bitmask instead of modulo operation to simplify code. Add test_range1_rand15() and test_range2_rand15() KUnit tests to improve get_id_range() coverage. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20250512093732.1408485-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/audit.c | 2 +- security/landlock/id.c | 33 +++++++++++++++++++++++++++++++-- security/landlock/syscalls.c | 3 ++- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/security/landlock/audit.c b/security/landlock/audit.c index 58d5c40d4d0e1d..c52d079cdb77ba 100644 --- a/security/landlock/audit.c +++ b/security/landlock/audit.c @@ -437,7 +437,7 @@ void landlock_log_denial(const struct landlock_cred_security *const subject, return; /* Checks if the current exec was restricting itself. */ - if (subject->domain_exec & (1 << youngest_layer)) { + if (subject->domain_exec & BIT(youngest_layer)) { /* Ignores denials for the same execution. */ if (!youngest_denied->log_same_exec) return; diff --git a/security/landlock/id.c b/security/landlock/id.c index 11fab9259c157f..56f7cc0fc7440f 100644 --- a/security/landlock/id.c +++ b/security/landlock/id.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -25,7 +26,7 @@ static void __init init_id(atomic64_t *const counter, const u32 random_32bits) * Ensures sure 64-bit values are always used by user space (or may * fail with -EOVERFLOW), and makes this testable. */ - init = 1ULL << 32; + init = BIT_ULL(32); /* * Makes a large (2^32) boot-time value to limit ID collision in logs @@ -105,7 +106,7 @@ static u64 get_id_range(size_t number_of_ids, atomic64_t *const counter, * to get a new ID (e.g. a full landlock_restrict_self() call), and the * cost of draining all available IDs during the system's uptime. */ - random_4bits = random_4bits % (1 << 4); + random_4bits &= 0b1111; step = number_of_ids + random_4bits; /* It is safe to cast a signed atomic to an unsigned value. */ @@ -144,6 +145,19 @@ static void test_range1_rand1(struct kunit *const test) init + 2); } +static void test_range1_rand15(struct kunit *const test) +{ + atomic64_t counter; + u64 init; + + init = get_random_u32(); + atomic64_set(&counter, init); + KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 15), init); + KUNIT_EXPECT_EQ( + test, get_id_range(get_random_u8(), &counter, get_random_u8()), + init + 16); +} + static void test_range1_rand16(struct kunit *const test) { atomic64_t counter; @@ -196,6 +210,19 @@ static void test_range2_rand2(struct kunit *const test) init + 4); } +static void test_range2_rand15(struct kunit *const test) +{ + atomic64_t counter; + u64 init; + + init = get_random_u32(); + atomic64_set(&counter, init); + KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 15), init); + KUNIT_EXPECT_EQ( + test, get_id_range(get_random_u8(), &counter, get_random_u8()), + init + 17); +} + static void test_range2_rand16(struct kunit *const test) { atomic64_t counter; @@ -232,10 +259,12 @@ static struct kunit_case __refdata test_cases[] = { KUNIT_CASE(test_init_once), KUNIT_CASE(test_range1_rand0), KUNIT_CASE(test_range1_rand1), + KUNIT_CASE(test_range1_rand15), KUNIT_CASE(test_range1_rand16), KUNIT_CASE(test_range2_rand0), KUNIT_CASE(test_range2_rand1), KUNIT_CASE(test_range2_rand2), + KUNIT_CASE(test_range2_rand15), KUNIT_CASE(test_range2_rand16), {} /* clang-format on */ diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index b9561e3417aecb..33eafb71e4f31b 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -563,7 +564,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, new_llcred->domain = new_dom; #ifdef CONFIG_AUDIT - new_llcred->domain_exec |= 1 << (new_dom->num_layers - 1); + new_llcred->domain_exec |= BIT(new_dom->num_layers - 1); #endif /* CONFIG_AUDIT */ return commit_creds(new_cred); From a3c603434f027ddc1af1f5d517beac9dcf795ec0 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 12 Apr 2025 09:57:14 +0200 Subject: [PATCH 179/353] RDMA/rxe: Fix slab-use-after-free Read in rxe_queue_cleanup bug Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x7d/0xa0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xcf/0x610 mm/kasan/report.c:489 kasan_report+0xb5/0xe0 mm/kasan/report.c:602 rxe_queue_cleanup+0xd0/0xe0 drivers/infiniband/sw/rxe/rxe_queue.c:195 rxe_cq_cleanup+0x3f/0x50 drivers/infiniband/sw/rxe/rxe_cq.c:132 __rxe_cleanup+0x168/0x300 drivers/infiniband/sw/rxe/rxe_pool.c:232 rxe_create_cq+0x22e/0x3a0 drivers/infiniband/sw/rxe/rxe_verbs.c:1109 create_cq+0x658/0xb90 drivers/infiniband/core/uverbs_cmd.c:1052 ib_uverbs_create_cq+0xc7/0x120 drivers/infiniband/core/uverbs_cmd.c:1095 ib_uverbs_write+0x969/0xc90 drivers/infiniband/core/uverbs_main.c:679 vfs_write fs/read_write.c:677 [inline] vfs_write+0x26a/0xcc0 fs/read_write.c:659 ksys_write+0x1b8/0x200 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xaa/0x1b0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f In the function rxe_create_cq, when rxe_cq_from_init fails, the function rxe_cleanup will be called to handle the allocated resources. In fact, some memory resources have already been freed in the function rxe_cq_from_init. Thus, this problem will occur. The solution is to let rxe_cleanup do all the work. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://paste.ubuntu.com/p/tJgC42wDf6/ Tested-by: liuyi Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250412075714.3257358-1-yanjun.zhu@linux.dev Reviewed-by: Daisuke Matsuda Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_cq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index fec87c9030abdc..fffd144d509eb0 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); - if (err) { - vfree(cq->queue->buf); - kfree(cq->queue); + if (err) return err; - } cq->is_user = uresp; From f8d7d9c00f53ba67f2a9dbd39b63d6d4d0df1d3c Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Mon, 14 Apr 2025 18:42:30 -0500 Subject: [PATCH 180/353] irdma: free iwdev->rf after removing MSI-X Currently iwdev->rf is allocated in irdma_probe(), but free in irdma_ib_dealloc_device(). It can be misleading. Move the free to irdma_remove() to be more obvious. Freeing in irdma_ib_dealloc_device() leads to KASAN use-after-free issue. Which can also lead to NULL pointer dereference. Fix this. irdma_deinit_interrupts() can't be moved before freeing iwdef->rf, because in this case deinit interrupts will be done before freeing irqs. The simplest solution is to move kfree(iwdev->rf) to irdma_remove(). Reproducer: sudo rmmod irdma Minified splat(s): BUG: KASAN: use-after-free in irdma_remove+0x257/0x2d0 [irdma] Call Trace: ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? kfree+0x253/0x450 ? irdma_remove+0x257/0x2d0 [irdma] kasan_report+0xed/0x120 ? irdma_remove+0x257/0x2d0 [irdma] irdma_remove+0x257/0x2d0 [irdma] auxiliary_bus_remove+0x56/0x80 device_release_driver_internal+0x371/0x530 ? kernfs_put.part.0+0x147/0x310 driver_detach+0xbf/0x180 bus_remove_driver+0x11b/0x2a0 auxiliary_driver_unregister+0x1a/0x50 irdma_exit_module+0x40/0x4c [irdma] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:ice_free_rdma_qvector+0x2a/0xa0 [ice] Call Trace: ? ice_free_rdma_qvector+0x2a/0xa0 [ice] irdma_remove+0x179/0x2d0 [irdma] auxiliary_bus_remove+0x56/0x80 device_release_driver_internal+0x371/0x530 ? kobject_put+0x61/0x4b0 driver_detach+0xbf/0x180 bus_remove_driver+0x11b/0x2a0 auxiliary_driver_unregister+0x1a/0x50 irdma_exit_module+0x40/0x4c [irdma] Reported-by: Marcin Szycik Closes: https://lore.kernel.org/netdev/8e533834-4564-472f-b29b-4f1cb7730053@linux.intel.com/ Fixes: 3e0d3cb3fbe0 ("ice, irdma: move interrupts code to irdma") Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20250414234231.523-1-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.c | 2 ++ drivers/infiniband/hw/irdma/verbs.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 1ee8969595d3d6..d10fd16dcec3d8 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -255,6 +255,8 @@ static void irdma_remove(struct auxiliary_device *aux_dev) ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); irdma_deinit_interrupts(iwdev->rf, pf); + kfree(iwdev->rf); + pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn)); } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index eeb932e5873036..1e8c92826de221 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4871,5 +4871,4 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) irdma_rt_deinit_hw(iwdev); irdma_ctrl_deinit_hw(iwdev->rf); - kfree(iwdev->rf); } From dc11db59bdda63f7a116022b750f65fb11aa1231 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Apr 2025 18:42:31 -0500 Subject: [PATCH 181/353] ice, irdma: fix an off by one in error handling code If we don't allocate the MIN number of IRQs then we need to free what we have and return -ENOMEM. The problem is this loop is off by one so it frees an entry that wasn't allocated and it doesn't free the first entry where i == 0. Fixes: 3e0d3cb3fbe0 ("ice, irdma: move interrupts code to irdma") Signed-off-by: Dan Carpenter Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20250414234231.523-2-tatyana.e.nikolova@intel.com Reviewed-by: Michal Swiatkowski Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index d10fd16dcec3d8..7599e31b574369 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -221,7 +221,7 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) break; if (i < IRDMA_MIN_MSIX) { - for (; i > 0; i--) + while (--i >= 0) ice_free_rdma_qvector(pf, &rf->msix_entries[i]); kfree(rf->msix_entries); From dfbef56dfce6bd3ba0969b0d6b116f074830f03b Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 6 May 2025 17:10:08 +0200 Subject: [PATCH 182/353] RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xc3/0x670 mm/kasan/report.c:521 kasan_report+0xe0/0x110 mm/kasan/report.c:634 strlen+0x93/0xa0 lib/string.c:420 __fortify_strlen include/linux/fortify-string.h:268 [inline] get_kobj_path_length lib/kobject.c:118 [inline] kobject_get_path+0x3f/0x2a0 lib/kobject.c:158 kobject_uevent_env+0x289/0x1870 lib/kobject_uevent.c:545 ib_register_device drivers/infiniband/core/device.c:1472 [inline] ib_register_device+0x8cf/0xe00 drivers/infiniband/core/device.c:1393 rxe_register_device+0x275/0x320 drivers/infiniband/sw/rxe/rxe_verbs.c:1552 rxe_net_add+0x8e/0xe0 drivers/infiniband/sw/rxe/rxe_net.c:550 rxe_newlink+0x70/0x190 drivers/infiniband/sw/rxe/rxe.c:225 nldev_newlink+0x3a3/0x680 drivers/infiniband/core/nldev.c:1796 rdma_nl_rcv_msg+0x387/0x6e0 drivers/infiniband/core/netlink.c:195 rdma_nl_rcv_skb.constprop.0.isra.0+0x2e5/0x450 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg net/socket.c:727 [inline] ____sys_sendmsg+0xa95/0xc70 net/socket.c:2566 ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620 __sys_sendmsg+0x16d/0x220 net/socket.c:2652 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x260 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f This problem is similar to the problem that the commit 1d6a9e7449e2 ("RDMA/core: Fix use-after-free when rename device name") fixes. The root cause is: the function ib_device_rename() renames the name with lock. But in the function kobject_uevent(), this name is accessed without lock protection at the same time. The solution is to add the lock protection when this name is accessed in the function kobject_uevent(). Fixes: 779e0bf47632 ("RDMA/core: Do not indicate device ready when device enablement fails") Link: https://patch.msgid.link/r/20250506151008.75701-1-yanjun.zhu@linux.dev Reported-by: syzbot+e2ce9e275ecc70a30b72@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e2ce9e275ecc70a30b72 Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b4e3e4beb7f455..d4263385850a7a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1352,6 +1352,9 @@ static void ib_device_notify_register(struct ib_device *device) down_read(&devices_rwsem); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; @@ -1468,10 +1471,9 @@ int ib_register_device(struct ib_device *device, const char *name, return ret; } dev_set_uevent_suppress(&device->dev, false); - /* Mark for userspace that device is ready */ - kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_notify_register(device); + ib_device_put(device); return 0; From 955a411809189c1b43e097155d743f04ce99816e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 11:30:21 -0400 Subject: [PATCH 183/353] bcachefs: Don't strip rebalance_opts from indirect extents Fix bch2_bkey_clear_needs_rebalance(): indirect extents are never supposed to have bch_extent_rebalance stripped off, because that's how we get the IO path options when we don't have the original inode it belonged to. Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4ccdfc1f34aa35..623273556aa978 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { - if (!bch2_bkey_rebalance_opts(k)) + if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) return 0; struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); From ec74f74b4cb785898b0b73c8e527803e51f5d3f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 14:42:37 -0400 Subject: [PATCH 184/353] bcachefs: Fix broken btree_path lock invariants in next_node() This fixes btree locking assert pops users were seeing during evacuate: https://github.com/koverstreet/bcachefs/issues/878 May 09 22:45:02 sharon kernel: bcachefs (68116e25-fa2d-4c6f-86c7-e8b431d792ae): bch2_btree_insert_node(): node not locked at level 1 May 09 22:45:02 sharon kernel: bch2_btree_node_rewrite [bcachefs]: watermark=btree no_check_rw alloc l=0-1 mode=none nodes_written=0 cl.remaining=2 journal_seq=0 May 09 22:45:02 sharon kernel: path: idx 1 ref 1:0 S B btree=alloc level=0 pos 0:3699637:0 0:3698012:1-0:3699637:0 bch2_move_btree.isra.0+0x1db/0x490 [bcachefs] uptodate 0 locks_want 2 May 09 22:45:02 sharon kernel: l=0 locks intent seq 4 node ffff8bd700c93600 May 09 22:45:02 sharon kernel: l=1 locks unlocked seq 1712 node ffff8bd6fd5e7a00 May 09 22:45:02 sharon kernel: l=2 locks unlocked seq 2295 node ffff8bd6cc725400 May 09 22:45:02 sharon kernel: l=3 locks unlocked seq 0 node 0000000000000000 Evacuate walks btree nodes with bch2_btree_iter_next_node() and rewrites them, bch2_btree_update_start() upgrades the path to take intent locks as far as it needs to. But next_node() does low level unlock/relock calls on individual nodes, and didn't handle the case where a path is supposed to be holding multiple intent locks. If a path has locks_want > 1, it needs to be either holding locks on all the btree nodes (at each level) requested, or none of them. Fix this with a bch2_btree_path_downgrade(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 59fa527ac68548..9c9bc7b6c71279 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ return NULL; } + /* + * We don't correctly handle nodes with extra intent locks here: + * downgrade so we don't violate locking invariants + */ + bch2_btree_path_downgrade(trans, path); + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); From 39eb17e46dcfb413bc9d1329bb4d9506c0a2213a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 12:55:44 -0400 Subject: [PATCH 185/353] bcachefs: Fix livelock in journal_entry_open() When the journal is low on space, we might do discards from journal_res_get() -> journal_entry_open(). Make sure we set j->can_discard correctly, so that if we're low on space but not because discards aren't keeping up we don't livelock. Fixes: 8e4d28036c29 ("bcachefs: Don't aggressively discard the journal") Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 976464d8a695ae..cc00b0fc40d8e3 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@ #include #include +static bool __should_discard_bucket(struct journal *, struct journal_device *); + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; + can_discard |= __should_discard_bucket(j, ja); max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; @@ -264,13 +265,19 @@ void bch2_journal_space_available(struct journal *j) /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) { - spin_lock(&j->lock); unsigned min_free = max(4, ja->nr / 8); - bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free && + return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < + min_free && ja->discard_idx != ja->dirty_idx_ondisk; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + spin_lock(&j->lock); + bool ret = __should_discard_bucket(j, ja); spin_unlock(&j->lock); return ret; From 610bcf374841ec6d7da83e8ff440c50e96faa113 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 14:27:01 -0400 Subject: [PATCH 186/353] bcachefs: Don't set btree nodes as accessed on fill Prevent jobs that do lots of scanning (i.e. evacuatee, scrub) from causing OOMs. The shrinker code seems to be having issues when it doesn't do any freeing because it's just flipping off the acccessed bit - and the accessed bit shouldn't be set on first use anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b80201c7982f5..89989129579728 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -852,7 +852,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); - set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -1286,6 +1285,10 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, six_unlock_read(&b->c.lock); goto retry; } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks held: */ @@ -1301,10 +1304,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); From 6fa1ea6af0ea5e7e8c7dc28e98c52811300bd4a7 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 13 May 2025 18:54:26 +0800 Subject: [PATCH 187/353] bcachefs: Fix self deadlock Before invoking bch2_accounting_mem_mod_locked in bch2_gc_accounting_done, we already write locked mark_lock, in bch2_accounting_mem_insert, we lock mark_lock again. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 17 +++++++++++++++-- fs/bcachefs/disk_accounting.h | 16 +++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b007319b72e91c..1f0422bfae359f 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, return ret; } +int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) +{ + struct bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + return __bch2_accounting_mem_insert(c, a); +} + static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) { for (unsigned i = 0; i < e->nr_counters; i++) @@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) accounting_key_init(&k_i.k, &acc_k, src_v, nr); bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal); + BCH_ACCOUNTING_normal, true); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) percpu_down_read(&c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read); + BCH_ACCOUNTING_read, false); percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index abb1f6206fe928..d557b99b3c0ae6 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -136,6 +136,7 @@ enum bch_accounting_mode { }; int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); +int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) @@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) */ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) + enum bch_accounting_mode mode, + bool write_locked) { struct bch_fs *c = trans->c; struct bch_accounting_mem *acc = &c->accounting; @@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, mode); + int ret = 0; + if (unlikely(write_locked)) + ret = bch2_accounting_mem_insert_locked(c, a, mode); + else + ret = bch2_accounting_mem_insert(c, a, mode); if (ret) return ret; } @@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); percpu_up_read(&trans->c->mark_lock); return ret; } @@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, EBUG_ON(bversion_zero(a->k.bversion)); return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) - ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false) : 0; } @@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans struct bkey_s_accounting a = accounting_i_to_s(a_i); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false); bch2_accounting_neg(a); } } From 50462b2d6ebcaafe8824dea9d40e73e3a53bbff9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 21:14:17 -0400 Subject: [PATCH 188/353] bcachefs: Fix set_should_be_locked() call in peek_slot() set_should_be_locked() needs to be called before peek_key_cache(), which traverses other paths and may do a trans unlock/relock. This fixes an assertion pop in path_peek_slot(), when the path we're using is unexpectedly not uptodate. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9c9bc7b6c71279..a873ec1baf581c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2749,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre ret = trans_maybe_inject_restart(trans, _RET_IP_); if (unlikely(ret)) { k = bkey_s_c_err(ret); - goto out_no_locked; + goto out; } /* extents can't span inode numbers: */ @@ -2769,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { k = bkey_s_c_err(ret); - goto out_no_locked; + goto out; } struct btree_path *path = btree_iter_path(trans, iter); if (unlikely(!btree_path_node(path, path->level))) return bkey_s_c_null; + btree_path_set_should_be_locked(trans, path); + if ((iter->flags & BTREE_ITER_cached) || !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; @@ -2796,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ - goto out_no_locked; + goto out; } - k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); if (unlikely(!k.k)) - goto out_no_locked; + goto out; if (unlikely(k.k->type == KEY_TYPE_whiteout && (iter->flags & BTREE_ITER_filter_snapshots) && @@ -2839,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } if (unlikely(bkey_err(k))) - goto out_no_locked; + goto out; next = k.k ? bkey_start_pos(k.k) : POS_MAX; @@ -2861,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } } out: - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(trans, iter); ret = bch2_btree_iter_verify_ret(trans, iter, k); From 69bcff08c31a7caca58fd33963cad3052263c046 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 13:40:47 -0400 Subject: [PATCH 189/353] bcachefs: Fix accidental O(n^2) in fiemap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since bch2_seek_pagecache_data() searches for dirty data, we only want to call it for holes in the extents btree - otherwise we have an accidental O(n^2), as we repeatedly search the same range. Reported-by: Marcin Mirosław Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b6801861c66f1c..4b742e62255bb7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1429,7 +1429,9 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans, if (ret) goto err; - ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur); + u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; + + ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); if (ret) goto err; From 15c812dda70e73bdde4ad5347db9d3b77b9fcb09 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 15:05:19 -0400 Subject: [PATCH 190/353] bcachefs: Fix missing commit in backpointer to missing target Fsck wants to do transaction commits from an outer context; it may have other repair to do (i.e. duplicate backpointers). But when calling backpointer_not_found() from runtime code, i.e. runtime self healing, we should be doing the commit - the outer context expects to just be doing lookups. This fixes bugs where we get stuck spinning, reported as "RCU lock hold time warnings. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 117 +++++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 38 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ff26bb51515004..5f195d2280a4ea 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, static int backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k, - struct bkey_buf *last_flushed) + struct bkey_buf *last_flushed, + bool commit) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans, } if (fsck_err(trans, backpointer_to_missing_ptr, - "%s", buf.buf)) + "%s", buf.buf)) { ret = bch2_backpointer_del(trans, bp.k->p); + if (ret || !commit) + goto out; + + /* + * Normally, on transaction commit from inside a transaction, + * we'll return -BCH_ERR_transaction_restart_nested, since a + * transaction commit invalidates pointers given out by peek(). + * + * However, since we're updating a write buffer btree, if we + * return a transaction restart and loop we won't see that the + * backpointer has been deleted without an additional write + * buffer flush - and those are expensive. + * + * So we're relying on the caller immediately advancing to the + * next backpointer and starting a new transaction immediately + * after backpointer_get_key() returns NULL: + */ + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } +out: fsck_err: printbuf_exit(&buf); return ret; } -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed) +static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + struct bkey_buf *last_flushed, + bool commit) +{ + struct bch_fs *c = trans->c; + + BUG_ON(!bp.v->level); + + bch2_trans_node_iter_init(trans, iter, + bp.v->btree_id, + bp.v->pos, + 0, + bp.v->level - 1, + 0); + struct btree *b = bch2_btree_iter_peek_node(trans, iter); + if (IS_ERR_OR_NULL(b)) + goto err; + + BUG_ON(b->c.level != bp.v->level - 1); + + if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, + bkey_i_to_s_c(&b->key), bp)) + return b; + + if (btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), + last_flushed, commit); + b = ret ? ERR_PTR(ret) : NULL; + } +err: + bch2_trans_iter_exit(trans, iter); + return b; +} + +static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + unsigned iter_flags, + struct bkey_buf *last_flushed, + bool commit) { struct bch_fs *c = trans->c; @@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, iter); if (!bp.v->level) { - int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) @@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct btree_iter *iter, struct bkey_buf *last_flushed) { - struct bch_fs *c = trans->c; - - BUG_ON(!bp.v->level); - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level - 1, - 0); - struct btree *b = bch2_btree_iter_peek_node(trans, iter); - if (IS_ERR_OR_NULL(b)) - goto err; - - BUG_ON(b->c.level != bp.v->level - 1); - - if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), bp)) - return b; + return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); +} - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); - } else { - int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); - b = ret ? ERR_PTR(ret) : NULL; - } -err: - bch2_trans_iter_exit(trans, iter); - return b; +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + unsigned iter_flags, + struct bkey_buf *last_flushed) +{ + return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); } static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, @@ -521,7 +562,7 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); + __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; From 5e9da07ea33c61f29efb69c6f9402cb0e96c1b50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 18:53:48 -0400 Subject: [PATCH 191/353] bcachefs: fix wrong arg to fsck_err() fsck_err() needs the btree transaction passed to it if there is one - so that it can unlock/relock around prompting userspace for fixing the error. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7b25cedd3e40b5..71d428f376a5f8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2446,7 +2446,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { - if (fsck_err(c, subvol_loop, "subvolume loop")) + if (fsck_err(trans, subvol_loop, "subvolume loop")) ret = reattach_subvol(trans, s); break; } From 6acaed78ee3038da543a0b1fb4175b9eb8e3c5b7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 24 Apr 2025 23:22:23 +0200 Subject: [PATCH 192/353] MAINTAINERS: add me as maintainer for the gpio sloppy logic analyzer This was forgotten when the analyzer went upstream. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250424212234.5313-2-wsa+renesas@sang-engineering.com Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d5b0fcf9d0853b..144ed02146e3ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10146,6 +10146,13 @@ F: drivers/gpio/gpio-regmap.c F: include/linux/gpio/regmap.h K: (devm_)?gpio_regmap_(un)?register +GPIO SLOPPY LOGIC ANALYZER +M: Wolfram Sang +S: Supported +F: Documentation/dev-tools/gpio-sloppy-logic-analyzer.rst +F: drivers/gpio/gpio-sloppy-logic-analyzer.c +F: tools/gpio/gpio-sloppy-logic-analyzer.sh + GPIO SUBSYSTEM M: Linus Walleij M: Bartosz Golaszewski From 3fe4ffd5d2e45450674de44b5eb9e99b09aedd14 Mon Sep 17 00:00:00 2001 From: Emanuele Ghidoli Date: Mon, 12 May 2025 11:54:41 +0200 Subject: [PATCH 193/353] gpio: pca953x: fix IRQ storm on system wake up If an input changes state during wake-up and is used as an interrupt source, the IRQ handler reads the volatile input register to clear the interrupt mask and deassert the IRQ line. However, the IRQ handler is triggered before access to the register is granted, causing the read operation to fail. As a result, the IRQ handler enters a loop, repeatedly printing the "failed reading register" message, until `pca953x_resume()` is eventually called, which restores the driver context and enables access to registers. Fix by disabling the IRQ line before entering suspend mode, and re-enabling it after the driver context is restored in `pca953x_resume()`. An IRQ can be disabled with disable_irq() and still wake the system as long as the IRQ has wake enabled, so the wake-up functionality is preserved. Fixes: b76574300504 ("gpio: pca953x: Restore registers after suspend/resume cycle") Cc: stable@vger.kernel.org Signed-off-by: Emanuele Ghidoli Signed-off-by: Francesco Dolcini Reviewed-by: Andy Shevchenko Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250512095441.31645-1-francesco@dolcini.it Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 442435ded020ac..13cc120cf11f14 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -1204,6 +1204,8 @@ static int pca953x_restore_context(struct pca953x_chip *chip) guard(mutex)(&chip->i2c_lock); + if (chip->client->irq > 0) + enable_irq(chip->client->irq); regcache_cache_only(chip->regmap, false); regcache_mark_dirty(chip->regmap); ret = pca953x_regcache_sync(chip); @@ -1216,6 +1218,10 @@ static int pca953x_restore_context(struct pca953x_chip *chip) static void pca953x_save_context(struct pca953x_chip *chip) { guard(mutex)(&chip->i2c_lock); + + /* Disable IRQ to prevent early triggering while regmap "cache only" is on */ + if (chip->client->irq > 0) + disable_irq(chip->client->irq); regcache_cache_only(chip->regmap, true); } From 05b4542d0e48e735cb788a4bdcb998747b1bef92 Mon Sep 17 00:00:00 2001 From: Markus Burri Date: Fri, 9 May 2025 17:04:59 +0200 Subject: [PATCH 194/353] gpio: virtuser: fix potential out-of-bound write If the caller wrote more characters, count is truncated to the max available space in "simple_write_to_buffer". Check that the input size does not exceed the buffer size. Write a zero termination afterwards. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505091754.285hHbr2-lkp@intel.com/ Signed-off-by: Markus Burri Link: https://lore.kernel.org/r/20250509150459.115489-1-markus.burri@mt.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtuser.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c index 13407fd4f0ebe8..eab6726953b411 100644 --- a/drivers/gpio/gpio-virtuser.c +++ b/drivers/gpio/gpio-virtuser.c @@ -401,10 +401,15 @@ static ssize_t gpio_virtuser_direction_do_write(struct file *file, char buf[32], *trimmed; int ret, dir, val = 0; - ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (count >= sizeof(buf)) + return -EINVAL; + + ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count); if (ret < 0) return ret; + buf[ret] = '\0'; + trimmed = strim(buf); if (strcmp(trimmed, "input") == 0) { @@ -623,12 +628,15 @@ static ssize_t gpio_virtuser_consumer_write(struct file *file, char buf[GPIO_VIRTUSER_NAME_BUF_LEN + 2]; int ret; + if (count >= sizeof(buf)) + return -EINVAL; + ret = simple_write_to_buffer(buf, GPIO_VIRTUSER_NAME_BUF_LEN, ppos, user_buf, count); if (ret < 0) return ret; - buf[strlen(buf) - 1] = '\0'; + buf[ret] = '\0'; ret = gpiod_set_consumer_name(data->ad.desc, buf); if (ret) From d6d3f6257ce5791b556fffd39666edd849541b5d Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Thu, 8 May 2025 09:49:43 +0300 Subject: [PATCH 195/353] regulator: max20086: fix invalid memory access max20086_parse_regulators_dt() calls of_regulator_match() using an array of struct of_regulator_match allocated on the stack for the matches argument. of_regulator_match() calls devm_of_regulator_put_matches(), which calls devres_alloc() to allocate a struct devm_of_regulator_matches which will be de-allocated using devm_of_regulator_put_matches(). struct devm_of_regulator_matches is populated with the stack allocated matches array. If the device fails to probe, devm_of_regulator_put_matches() will be called and will try to call of_node_put() on that stack pointer, generating the following dmesg entries: max20086 6-0028: Failed to read DEVICE_ID reg: -121 kobject: '\xc0$\xa5\x03' (000000002cebcb7a): is not initialized, yet kobject_put() is being called. Followed by a stack trace matching the call flow described above. Switch to allocating the matches array using devm_kcalloc() to avoid accessing the stack pointer long after it's out of scope. This also has the advantage of allowing multiple max20086 to probe without overriding the data stored inside the global of_regulator_match. Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20250508064947.2567255-1-demonsingur@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index 59eb23d467ec05..198d45f8e88493 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -132,7 +132,7 @@ static int max20086_regulators_register(struct max20086 *chip) static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) { - struct of_regulator_match matches[MAX20086_MAX_REGULATORS] = { }; + struct of_regulator_match *matches; struct device_node *node; unsigned int i; int ret; @@ -143,6 +143,11 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) return -ENODEV; } + matches = devm_kcalloc(chip->dev, chip->info->num_outputs, + sizeof(*matches), GFP_KERNEL); + if (!matches) + return -ENOMEM; + for (i = 0; i < chip->info->num_outputs; ++i) matches[i].name = max20086_output_names[i]; From 82eb774de8fc40dfa326f01a540d2945f308b45f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 May 2025 13:10:35 +0200 Subject: [PATCH 196/353] spi: loopback-test: Do not split 1024-byte hexdumps spi_test_print_hex_dump() prints buffers holding less than 1024 bytes in full. Larger buffers are truncated: only the first 512 and the last 512 bytes are printed, separated by a truncation message. The latter is confusing in case the buffer holds exactly 1024 bytes, as all data is printed anyway. Fix this by printing buffers holding up to and including 1024 bytes in full. Fixes: 84e0c4e5e2c4ef42 ("spi: add loopback test driver to allow for spi_master regression tests") Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/37ee1bc90c6554c9347040adabf04188c8f704aa.1746184171.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- drivers/spi/spi-loopback-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c index 31a878d9458d95..7740f94847a883 100644 --- a/drivers/spi/spi-loopback-test.c +++ b/drivers/spi/spi-loopback-test.c @@ -420,7 +420,7 @@ MODULE_LICENSE("GPL"); static void spi_test_print_hex_dump(char *pre, const void *ptr, size_t len) { /* limit the hex_dump */ - if (len < 1024) { + if (len <= 1024) { print_hex_dump(KERN_INFO, pre, DUMP_PREFIX_OFFSET, 16, 1, ptr, len, 0); From 4213b026d6f6d111fff2a44e46aa410fb2cdc443 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Tue, 6 May 2025 13:36:59 -0500 Subject: [PATCH 197/353] spi: tegra114: Use value to check for invalid delays A delay unit of 0 is a valid entry, thus it is not valid to check for unused delays. Instead, check the value field; if that is zero, the given delay is unset. Fixes: 4426e6b4ecf6 ("spi: tegra114: Don't fail set_cs_timing when delays are zero") Cc: stable@vger.kernel.org Signed-off-by: Aaron Kling Reviewed-by: Jon Hunter Link: https://patch.msgid.link/20250506-spi-tegra114-fixup-v1-1-136dc2f732f3@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra114.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c index 2a8bb798e95b95..795a8482c2c700 100644 --- a/drivers/spi/spi-tegra114.c +++ b/drivers/spi/spi-tegra114.c @@ -728,9 +728,9 @@ static int tegra_spi_set_hw_cs_timing(struct spi_device *spi) u32 inactive_cycles; u8 cs_state; - if ((setup->unit && setup->unit != SPI_DELAY_UNIT_SCK) || - (hold->unit && hold->unit != SPI_DELAY_UNIT_SCK) || - (inactive->unit && inactive->unit != SPI_DELAY_UNIT_SCK)) { + if ((setup->value && setup->unit != SPI_DELAY_UNIT_SCK) || + (hold->value && hold->unit != SPI_DELAY_UNIT_SCK) || + (inactive->value && inactive->unit != SPI_DELAY_UNIT_SCK)) { dev_err(&spi->dev, "Invalid delay unit %d, should be SPI_DELAY_UNIT_SCK\n", SPI_DELAY_UNIT_SCK); From a9a7bbe9c8787c66066d631fd77685108606fbdc Mon Sep 17 00:00:00 2001 From: Alessandro Grassi Date: Fri, 2 May 2025 11:55:20 +0200 Subject: [PATCH 198/353] spi: spi-sun4i: fix early activation The SPI interface is activated before the CPOL setting is applied. In that moment, the clock idles high and CS goes low. After a short delay, CPOL and other settings are applied, which may cause the clock to change state and idle low. This transition is not part of a clock cycle, and it can confuse the receiving device. To prevent this unexpected transition, activate the interface while CPOL and the other settings are being applied. Signed-off-by: Alessandro Grassi Link: https://patch.msgid.link/20250502095520.13825-1-alessandro.grassi@mailbox.org Signed-off-by: Mark Brown --- drivers/spi/spi-sun4i.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index f89826d7dc4964..aa92fd5a35a98f 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -264,6 +264,9 @@ static int sun4i_spi_transfer_one(struct spi_controller *host, else reg |= SUN4I_CTL_DHB; + /* Now that the settings are correct, enable the interface */ + reg |= SUN4I_CTL_ENABLE; + sun4i_spi_write(sspi, SUN4I_CTL_REG, reg); /* Ensure that we have a parent clock fast enough */ @@ -404,7 +407,7 @@ static int sun4i_spi_runtime_resume(struct device *dev) } sun4i_spi_write(sspi, SUN4I_CTL_REG, - SUN4I_CTL_ENABLE | SUN4I_CTL_MASTER | SUN4I_CTL_TP); + SUN4I_CTL_MASTER | SUN4I_CTL_TP); return 0; From dd1474234a1cac607b8313f745163f1ccc417135 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 5 May 2025 16:35:49 -0700 Subject: [PATCH 199/353] xfs: free up mp->m_free[0].count in error case In xfs_init_percpu_counters(), memory for mp->m_free[0].count wasn't freed in error case. Free it up in this patch. Signed-off-by: Wengang Wang Fixes: 712bae96631852 ("xfs: generalize the freespace and reserved blocks handling") Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b2dd0c0bf50979..3be041647ec17e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1149,7 +1149,7 @@ xfs_init_percpu_counters( return 0; free_freecounters: - while (--i > 0) + while (--i >= 0) percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: From c640fae956b42a7c1692f9a8e21e33d168732604 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 May 2025 16:43:05 +0200 Subject: [PATCH 200/353] xfs: fix zoned GC data corruption due to wrong bv_offset xfs_zone_gc_write_chunk writes out the data buffer read in earlier using the same bio, and currenly looks at bv_offset for the offset into the scratch folio for that. But commit 26064d3e2b4d ("block: fix adding folio to bio") changed how bv_page and bv_offset are calculated for adding larger folios, breaking this fragile logic. Switch to extracting the full physical address from the old bio_vec, and calculate the offset into the folio from that instead. This fixes data corruption during garbage collection with heavy rockdsb workloads. Thanks to Hans for tracking down the culprit commit during long bisection sessions. Fixes: 26064d3e2b4d ("block: fix adding folio to bio") Fixes: 080d01c41d44 ("xfs: implement zoned garbage collection") Reported-by: Hans Holmberg Signed-off-by: Christoph Hellwig Reviewed-by: Hans Holmberg Tested-by: Hans Holmberg Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 81c94dd1d5966b..d613a4094db65c 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -807,7 +807,8 @@ xfs_zone_gc_write_chunk( { struct xfs_zone_gc_data *data = chunk->data; struct xfs_mount *mp = chunk->ip->i_mount; - unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; + phys_addr_t bvec_paddr = + bvec_phys(bio_first_bvec_all(&chunk->bio)); struct xfs_gc_bio *split_chunk; if (chunk->bio.bi_status) @@ -822,7 +823,7 @@ xfs_zone_gc_write_chunk( bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, - folio_offset); + offset_in_folio(chunk->scratch->folio, bvec_paddr)); while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) xfs_zone_gc_submit_write(data, split_chunk); From c84d529f3e30f4cfab4670f6f1d058af8139bf32 Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Mon, 12 May 2025 22:00:32 +0530 Subject: [PATCH 201/353] xfs: Fail remount with noattr2 on a v5 with v4 enabled Bug: When we compile the kernel with CONFIG_XFS_SUPPORT_V4=y, remount with "-o remount,noattr2" on a v5 XFS does not fail explicitly. Reproduction: mkfs.xfs -f /dev/loop0 mount /dev/loop0 /mnt/scratch mount -o remount,noattr2 /dev/loop0 /mnt/scratch However, with CONFIG_XFS_SUPPORT_V4=n, the remount correctly fails explicitly. This is because the way the following 2 functions are defined: static inline bool xfs_has_attr2 (struct xfs_mount *mp) { return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || (mp->m_features & XFS_FEAT_ATTR2); } static inline bool xfs_has_noattr2 (const struct xfs_mount *mp) { return mp->m_features & XFS_FEAT_NOATTR2; } xfs_has_attr2() returns true when CONFIG_XFS_SUPPORT_V4=n and hence, the following if condition in xfs_fs_validate_params() succeeds and returns -EINVAL: /* * We have not read the superblock at this point, so only the attr2 * mount option can set the attr2 feature by this stage. */ if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); return -EINVAL; } With CONFIG_XFS_SUPPORT_V4=y, xfs_has_attr2() always return false and hence no error is returned. Fix: Check if the existing mount has crc enabled(i.e, of type v5 and has attr2 enabled) and the remount has noattr2, if yes, return -EINVAL. I have tested xfs/{189,539} in fstests with v4 and v5 XFS with both CONFIG_XFS_SUPPORT_V4=y/n and they both behave as expected. This patch also fixes remount from noattr2 -> attr2 (on a v4 xfs). Related discussion in [1] [1] https://lore.kernel.org/all/Z65o6nWxT00MaUrW@dread.disaster.area/ Signed-off-by: Nirjhar Roy (IBM) Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3be041647ec17e..4a11ddccc563ad 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2114,6 +2114,21 @@ xfs_fs_reconfigure( if (error) return error; + /* attr2 -> noattr2 */ + if (xfs_has_noattr2(new_mp)) { + if (xfs_has_crc(mp)) { + xfs_warn(mp, + "attr2 is always enabled for a V5 filesystem - can't be changed."); + return -EINVAL; + } + mp->m_features &= ~XFS_FEAT_ATTR2; + mp->m_features |= XFS_FEAT_NOATTR2; + } else if (xfs_has_attr2(new_mp)) { + /* noattr2 -> attr2 */ + mp->m_features &= ~XFS_FEAT_NOATTR2; + mp->m_features |= XFS_FEAT_ATTR2; + } + /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; @@ -2126,6 +2141,17 @@ xfs_fs_reconfigure( mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } + /* + * Now that mp has been modified according to the remount options, we + * do a final option validation with xfs_finish_flags() just like it is + * just like it is done during mount. We cannot use + * done during mount. We cannot use xfs_finish_flags() on new_mp as it + * contains only the user given options. + */ + error = xfs_finish_flags(mp); + if (error) + return error; + /* ro -> rw */ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); From a0c14d4e5fc230b19afc6e2fbf8d0def4124e304 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 12 May 2025 13:42:55 +0200 Subject: [PATCH 202/353] xfs: Fix a comment on xfs_ail_delete It doesn't return anything. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino Reviewed-by: Chandan Babu R Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans_ail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 85a649fec6acf6..7d327a3e5a736a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -315,7 +315,7 @@ xfs_ail_splice( } /* - * Delete the given item from the AIL. Return a pointer to the item. + * Delete the given item from the AIL. */ static void xfs_ail_delete( From 7da24d550621af28e3802ac2e840c4bb1683a145 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 12 May 2025 13:42:56 +0200 Subject: [PATCH 203/353] xfs: Fix comment on xfs_trans_ail_update_bulk() This function doesn't take the AIL lock, but should be called with AIL lock held. Also (hopefuly) simplify the comment. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans_ail.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d327a3e5a736a..67c328d23e4ae9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -777,26 +777,28 @@ xfs_ail_update_finish( } /* - * xfs_trans_ail_update - bulk AIL insertion operation. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. * - * @xfs_trans_ail_update takes an array of log items that all need to be + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added. Otherwise, it will be repositioned by removing it and re-adding - * it to the AIL. If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * If we move the first item in the AIL, update the log tail to match the new + * minimum LSN in the AIL. * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. + * This function should be called with the AIL lock held. * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * To optimise the insert operation, we add all items to a temporary list, then + * splice this list into the correct position in the AIL. + * + * Items that are already in the AIL are first deleted from their current + * location before being added to the temporary list. + * + * This avoids needing to do an insert operation on every item. + * + * The AIL lock is dropped by xfs_ail_update_finish() before returning to + * the caller. */ void xfs_trans_ail_update_bulk( From 2cf5381f497663c290222ee14b624c3631af227f Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 12 May 2025 13:20:25 +0800 Subject: [PATCH 204/353] io_uring/uring_cmd: fix hybrid polling initialization issue Modify the check for whether the timer is initialized during IO transfer when passthrough is used with hybrid polling, to ensure that it's always setup correctly. Cc: stable@vger.kernel.org Fixes: 01ee194d1aba ("io_uring: add support for hybrid IOPOLL") Signed-off-by: hexue Link: https://lore.kernel.org/r/20250512052025.293031-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd978..430ed620ddfe01 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -254,6 +254,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); From 1458e54c84e08844b52910bf0936573a7eaea492 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 May 2025 09:06:06 -0600 Subject: [PATCH 205/353] io_uring/memmap: don't use page_address() on a highmem page For older/32-bit systems with highmem, don't assume that the pages in a mapped region are always going to be mapped. If io_region_init_ptr() finds that the pages are coalescable, also check if the first page is a HighMem page or not. If it is, fall through to the usual vmap() mapping rather than attempt to get the unmapped page address. Cc: stable@vger.kernel.org Fixes: c4d0ac1c1567 ("io_uring/memmap: optimise single folio regions") Link: https://lore.kernel.org/all/681fe2fb.050a0220.f2294.001a.GAE@google.com/ Reported-by: syzbot+5b8c4abafcb1d791ccfc@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/681fed0a.050a0220.f2294.001c.GAE@google.com/ Reported-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Tested-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b002..07f8a5cbd37ec7 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -116,7 +116,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { - if (ifd.nr_folios == 1) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } From 0d83d365edf2c93d1e9ccfc4cd5daff63b81293d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 May 2025 15:02:23 -0600 Subject: [PATCH 206/353] io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo() Not everything requires locking in there, which is why the 'has_lock' variable exists. But enough does that it's a bit unwieldy to manage. Wrap the whole thing in a ->uring_lock trylock, and just return with no output if we fail to grab it. The existing trylock() will already have greatly diminished utility/output for the failure case. This fixes an issue with reading the SQE fields, if the ring is being actively resized at the same time. Reported-by: Jann Horn Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 9414ca6d101c0e..e0d6a59a89fa1b 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* @@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: \n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { + if (!xa_empty(&ctx->personalities)) { unsigned long index; const struct cred *cred; @@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} #endif From 4900f46e0b5a6a45c357113b345dd0a29cb14936 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 May 2025 08:38:02 -0700 Subject: [PATCH 207/353] block: always allocate integrity buffer when required Many nvme metadata formats can not strip or generate the metadata on the controller side. For these formats, a host provided integrity buffer is mandatory even if it isn't checked. The block integrity read_verify and write_generate attributes prevent allocating the metadata buffer, but we need it when the format requires it, otherwise reads and writes will be rejected by the driver with IO errors. Assume the integrity buffer can be offloaded to the controller if the metadata size is the same as the protection information size. Otherwise provide an unchecked host buffer when the read verify or write generation attributes are disabled. This fixes the following nvme warning: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 371 at drivers/nvme/host/core.c:1036 nvme_setup_rw+0x122/0x210 ... RIP: 0010:nvme_setup_rw+0x122/0x210 ... Call Trace: nvme_setup_cmd+0x1b4/0x280 nvme_queue_rqs+0xc4/0x1f0 [nvme] blk_mq_dispatch_queue_requests+0x24a/0x430 blk_mq_flush_plug_list+0x50/0x140 __blk_flush_plug+0xc1/0x100 __submit_bio+0x1c1/0x360 ? submit_bio_noacct_nocheck+0x2d6/0x3c0 submit_bio_noacct_nocheck+0x2d6/0x3c0 ? submit_bio_noacct+0x47/0x4c0 submit_bio_wait+0x48/0xa0 __blkdev_direct_IO_simple+0xee/0x210 ? current_time+0x1d/0x100 ? current_time+0x1d/0x100 ? __bio_clone+0xb0/0xb0 blkdev_read_iter+0xbb/0x140 vfs_read+0x239/0x310 ksys_read+0x58/0xc0 do_syscall_64+0x6c/0x180 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250509153802.3482493-1-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 62 +++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index e524c609be5066..9c665766479201 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -9,6 +9,7 @@ * not aware of PI. */ #include +#include #include #include "blk.h" @@ -43,6 +44,29 @@ static void bio_integrity_verify_fn(struct work_struct *work) bio_endio(bio); } +#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) +static bool bip_should_check(struct bio_integrity_payload *bip) +{ + return bip->bip_flags & BIP_CHECK_FLAGS; +} + +static bool bi_offload_capable(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return bi->tuple_size == sizeof(struct crc64_pi_tuple); + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return bi->tuple_size == sizeof(struct t10_pi_tuple); + default: + pr_warn_once("%s: unknown integrity checksum type:%d\n", + __func__, bi->csum_type); + fallthrough; + case BLK_INTEGRITY_CSUM_NONE: + return false; + } +} + /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio @@ -54,12 +78,12 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_data *bid = container_of(bip, struct bio_integrity_data, bip); - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && + bip_should_check(bip)) { INIT_WORK(&bid->work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bid->work); return false; @@ -84,6 +108,7 @@ bool bio_integrity_prep(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; + bool set_flags = true; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; @@ -100,19 +125,24 @@ bool bio_integrity_prep(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; + if (bi->flags & BLK_INTEGRITY_NOVERIFY) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + } break; case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk for non-integrity metadata where nothing else * initializes the memory. */ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + if (bi->flags & BLK_INTEGRITY_NOGENERATE) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + gfp |= __GFP_ZERO; + } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; break; default: @@ -137,19 +167,21 @@ bool bio_integrity_prep(struct bio *bio) bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bid->bip.bip_flags |= BIP_IP_CHECKSUM; - if (bi->csum_type) - bid->bip.bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bid->bip.bip_flags |= BIP_CHECK_REFTAG; + if (set_flags) { + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bid->bip.bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bid->bip.bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bid->bip.bip_flags |= BIP_CHECK_REFTAG; + } if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) goto err_end_io; /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) + if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; From 548151814e754b42e0b945fa1cb562f0b97d2ea5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 May 2025 20:35:40 -0700 Subject: [PATCH 208/353] nvme-pci: make nvme_pci_npages_prp() __always_inline The only reason nvme_pci_npages_prp() could be used as a compile-time known result in BUILD_BUG_ON() is because the compiler was always choosing to inline the function. Under special circumstances (sanitizer coverage functions disabled for __init functions on ARCH=um), the compiler decided to stop inlining it: drivers/nvme/host/pci.c: In function 'nvme_init': include/linux/compiler_types.h:557:45: error: call to '__compiletime_assert_678' declared with attribute error: BUILD_BUG_ON failed: nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS 557 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/compiler_types.h:538:25: note: in definition of macro '__compiletime_assert' 538 | prefix ## suffix(); \ | ^~~~~~ include/linux/compiler_types.h:557:9: note: in expansion of macro '_compiletime_assert' 557 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert' 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) | ^~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:50:9: note: in expansion of macro 'BUILD_BUG_ON_MSG' 50 | BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) | ^~~~~~~~~~~~~~~~ drivers/nvme/host/pci.c:3804:9: note: in expansion of macro 'BUILD_BUG_ON' 3804 | BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); | ^~~~~~~~~~~~ Force it to be __always_inline to make sure it is always available for use with BUILD_BUG_ON(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505061846.12FMyRjj-lkp@intel.com/ Fixes: c372cdd1efdf ("nvme-pci: iod npages fits in s8") Signed-off-by: Kees Cook Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2e30e9be7408cb..de084ece213611 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -390,7 +390,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_pci_npages_prp(void) +static __always_inline int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); From 4aac493f256f74eb47c92968e3a0e55be9c2a618 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 8 May 2025 16:57:06 +0200 Subject: [PATCH 209/353] nvme-pci: acquire cq_poll_lock in nvme_poll_irqdisable We need to lock this queue for that condition because the timeout work executes per-namespace and can poll the poll CQ. Reported-by: Hannes Reinecke Closes: https://lore.kernel.org/all/20240902130728.1999-1-hare@kernel.org/ Fixes: a0fa9647a54e ("NVMe: add blk polling support") Signed-off-by: Keith Busch Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index de084ece213611..a9390ac7211ea7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1202,7 +1202,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + spin_lock(&nvmeq->cq_poll_lock); nvme_poll_cq(nvmeq, NULL); + spin_unlock(&nvmeq->cq_poll_lock); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } From 1d4756ca5bc38ad77fd509b860875727fc2323bb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 May 2025 08:25:00 +0900 Subject: [PATCH 210/353] nvmet: pci-epf: clear completion queue IRQ flag on delete The function nvmet_pci_epf_delete_cq() unconditionally calls nvmet_pci_epf_remove_irq_vector() even for completion queues that do not have interrupts enabled. Furthermore, for completion queues that do have IRQ enabled, deleting and re-creating the completion queue leaves the flag NVMET_PCI_EPF_Q_IRQ_ENABLED set, even if the completion queue is being re-created with IRQ disabled. Fix these issues by calling nvmet_pci_epf_remove_irq_vector() only if NVMET_PCI_EPF_Q_IRQ_ENABLED is set and make sure to always clear that flag. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/pci-epf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 7fab7f3d79b743..d5442991f2fbd4 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1344,7 +1344,8 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid) cancel_delayed_work_sync(&cq->work); nvmet_pci_epf_drain_queue(cq); - nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); + if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); return NVME_SC_SUCCESS; From 26f1ee226e2c5b6e8d4e0d1154e124b5e662a216 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 May 2025 08:25:01 +0900 Subject: [PATCH 211/353] nvmet: pci-epf: do not fall back to using INTX if not supported Some endpoint PCIe controllers do not support raising legacy INTX interrupts. This support is indicated by the intx_capable field of struct pci_epc_features. Modify nvmet_pci_epf_raise_irq() to not automatically fallback to trying raising an INTX interrupt after an MSI or MSI-X error if the controller does not support INTX. Signed-off-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/pci-epf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index d5442991f2fbd4..bde7818c673d8c 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -636,14 +636,16 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, switch (nvme_epf->irq_type) { case PCI_IRQ_MSIX: case PCI_IRQ_MSI: + /* + * If we fail to raise an MSI or MSI-X interrupt, it is likely + * because the host is using legacy INTX IRQs (e.g. BIOS, + * grub), but we can fallback to the INTX type only if the + * endpoint controller supports this type. + */ ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, nvme_epf->irq_type, cq->vector + 1); - if (!ret) + if (!ret || !nvme_epf->epc_features->intx_capable) break; - /* - * If we got an error, it is likely because the host is using - * legacy IRQs (e.g. BIOS, grub). - */ fallthrough; case PCI_IRQ_INTX: ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, From 43f92c4e45fa233f8f4152a90198c07c0f4ebbc4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 May 2025 08:25:02 +0900 Subject: [PATCH 212/353] nvmet: pci-epf: cleanup nvmet_pci_epf_raise_irq() There is no point in taking the controller irq_lock and calling nvmet_pci_epf_should_raise_irq() for a completion queue which does not have IRQ enabled (NVMET_PCI_EPF_Q_IRQ_ENABLED flag is not set). Move the test for the NVMET_PCI_EPF_Q_IRQ_ENABLED flag out of nvmet_pci_epf_should_raise_irq() to the top of nvmet_pci_epf_raise_irq() to return early when no IRQ should be raised. Also, use dev_err_ratelimited() to avoid a message storm under load when raising IRQs is failing. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/pci-epf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index bde7818c673d8c..a6ccf3fcccc239 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -596,9 +596,6 @@ static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, struct nvmet_pci_epf_irq_vector *iv = cq->iv; bool ret; - if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) - return false; - /* IRQ coalescing for the admin queue is not allowed. */ if (!cq->qid) return true; @@ -625,7 +622,8 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, struct pci_epf *epf = nvme_epf->epf; int ret = 0; - if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) || + !test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) return; mutex_lock(&ctrl->irq_lock); @@ -658,7 +656,9 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, } if (ret) - dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret); + dev_err_ratelimited(ctrl->dev, + "CQ[%u]: Failed to raise IRQ (err=%d)\n", + cq->qid, ret); unlock: mutex_unlock(&ctrl->irq_lock); From 52b41b66eac8009889f37e3facf75a5a5e91516d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 May 2025 08:25:03 +0900 Subject: [PATCH 213/353] nvmet: pci-epf: improve debug message Improve the debug message of nvmet_pci_epf_create_cq() to indicate if a completion queue IRQ is disabled. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/pci-epf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index a6ccf3fcccc239..94a908b2340ee7 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1321,8 +1321,14 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); - dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", - cqid, qsize, cq->qes, cq->vector); + if (test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + dev_dbg(ctrl->dev, + "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", + cqid, qsize, cq->qes, cq->vector); + else + dev_dbg(ctrl->dev, + "CQ[%u]: %u entries of %zu B, IRQ disabled\n", + cqid, qsize, cq->qes); return NVME_SC_SUCCESS; From 6bf65bf643c472c6a5e505c2e0aecd639981cee7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 May 2025 08:25:04 +0900 Subject: [PATCH 214/353] nvmet: pci-epf: remove NVMET_PCI_EPF_Q_IS_SQ The flag NVMET_PCI_EPF_Q_IS_SQ is set but never used. Remove it. Signed-off-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/pci-epf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 94a908b2340ee7..7123c855b5a67c 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -62,8 +62,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); #define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1) enum nvmet_pci_epf_queue_flags { - NVMET_PCI_EPF_Q_IS_SQ = 0, /* The queue is a submission queue */ - NVMET_PCI_EPF_Q_LIVE, /* The queue is live */ + NVMET_PCI_EPF_Q_LIVE = 0, /* The queue is live */ NVMET_PCI_EPF_Q_IRQ_ENABLED, /* IRQ is enabled for this queue */ }; @@ -1542,7 +1541,6 @@ static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl, if (sq) { queue = &ctrl->sq[qid]; - set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags); } else { queue = &ctrl->cq[qid]; INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work); From 37211dbfb24d81fdbd356f8267078b93de13ec53 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Thu, 8 May 2025 15:38:00 -0700 Subject: [PATCH 215/353] nvme: multipath: enable BLK_FEAT_ATOMIC_WRITES for multipathing A change to QEMU resulted in all nvme controllers (single and multi-controller subsystems) to have its CMIC.MCTRS bit set which indicates the subsystem supports multiple controllers and it is possible a namespace can be shared between those multiple controllers in a multipath configuration. When a namespace of a CMIC.MCTRS enabled subsystem is allocated, a multipath node is created. The queue limits for this node are inherited from the namespace being allocated. When inheriting queue limits, the features being inherited need to be specified. The atomic write feature (BLK_FEAT_ATOMIC_WRITES) was not specified so the atomic queue limits were not inherited by the multipath disk node which resulted in the sysfs atomic write attributes being zeroed. The fix is to include BLK_FEAT_ATOMIC_WRITES in the list of features to be inherited. Signed-off-by: Alan Adamson Reviewed-by: John Garry Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 250f3da67cc9f7..cf0ef4745564c3 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -638,7 +638,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; - lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | + BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES; if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; From 6b0a6c6ff74f5a861dcb0c2f6242bdff4ecd99c8 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Thu, 8 May 2025 15:38:01 -0700 Subject: [PATCH 216/353] nvme: all namespaces in a subsystem must adhere to a common atomic write size The first namespace configured in a subsystem sets the subsystem's atomic write size based on its AWUPF or NAWUPF. Subsequent namespaces must have an atomic write size (per their AWUPF or NAWUPF) less than or equal to the subsystem's atomic write size, or their probing will be rejected. Signed-off-by: Alan Adamson [hch: fold in review comments from John Garry] Signed-off-by: Christoph Hellwig Reviewed-by: John Garry --- drivers/nvme/host/core.c | 30 +++++++++++++++++++++++++++--- drivers/nvme/host/nvme.h | 3 ++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ac53629fce68d4..6b04473c0ab73c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2059,7 +2059,21 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else - atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + atomic_bs = (1 + ns->ctrl->awupf) * bs; + + /* + * Set subsystem atomic bs. + */ + if (ns->ctrl->subsys->atomic_bs) { + if (atomic_bs != ns->ctrl->subsys->atomic_bs) { + dev_err_ratelimited(ns->ctrl->device, + "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", + ns->disk ? ns->disk->disk_name : "?", + ns->ctrl->subsys->atomic_bs, + atomic_bs); + } + } else + ns->ctrl->subsys->atomic_bs = atomic_bs; nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); } @@ -2201,6 +2215,17 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, nvme_set_chunk_sectors(ns, id, &lim); if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; + + /* + * Validate the max atomic write size fits within the subsystem's + * atomic write capabilities. + */ + if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) { + blk_mq_unfreeze_queue(ns->disk->queue, memflags); + ret = -ENXIO; + goto out; + } + nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) @@ -3031,7 +3056,6 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) kfree(subsys); return -EINVAL; } - subsys->awupf = le16_to_cpu(id->awupf); nvme_mpath_default_iopolicy(subsys); subsys->dev.class = &nvme_subsys_class; @@ -3441,7 +3465,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); - + ctrl->awupf = le16_to_cpu(id->awupf); out_free: kfree(id); return ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 51e07864212710..8fc4683418a3a6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -410,6 +410,7 @@ struct nvme_ctrl { enum nvme_ctrl_type cntrltype; enum nvme_dctype dctype; + u16 awupf; /* 0's based value. */ }; static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) @@ -442,11 +443,11 @@ struct nvme_subsystem { u8 cmic; enum nvme_subsys_type subtype; u16 vendor_id; - u16 awupf; /* 0's based awupf value. */ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; #endif + u32 atomic_bs; }; /* From 956843bb0e1b00ed68f23259949bedccf98192bc Mon Sep 17 00:00:00 2001 From: Ilya Guterman Date: Sat, 10 May 2025 19:21:30 +0900 Subject: [PATCH 217/353] nvme-pci: add NVME_QUIRK_NO_DEEPEST_PS quirk for SOLIDIGM P44 Pro This commit adds the NVME_QUIRK_NO_DEEPEST_PS quirk for device [126f:2262], which belongs to device SOLIDIGM P44 Pro SSDPFKKW020X7 The device frequently have trouble exiting the deepest power state (5), resulting in the entire disk being unresponsive. Verified by setting nvme_core.default_ps_max_latency_us=10000 and observing the expected behavior. Signed-off-by: Ilya Guterman Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a9390ac7211ea7..f1dd804151b1c9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3739,6 +3739,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ From bbc525496386e0ccb799854ca9f728fadaa19f4f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 May 2025 00:26:01 +0800 Subject: [PATCH 218/353] ublk: fix dead loop when canceling io command Commit: f40139fde527 ("ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd") adds a request state check in ublk_cancel_cmd(), and if the request is started, skips canceling this uring_cmd. However, the current uring_cmd may be in ACTIVE state, without block request coming to the uring command. Meantime, if the cached request in tag_set.tags[tag] has been delivered to ublk server and reycycled, then this uring_cmd can't be canceled. ublk requests are aborted in ublk char device release handler, which depends on canceling all ACTIVE uring_cmd. So it causes a dead loop. Fix this issue by not taking a stale request into account when canceling uring_cmd in ublk_cancel_cmd(). Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-block/mruqwpf4tqenkbtgezv5oxwq7ngyq24jzeyqy4ixzvivatbbxv@4oh2wzz4e6qn/ Fixes: f40139fde527 ("ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250515162601.77346-1-ming.lei@redhat.com [axboe: rewording of commit message] Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f9032076bc0615..dc104c025cd568 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1708,7 +1708,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, * that ublk_dispatch_req() is always called */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); - if (req && blk_mq_request_started(req)) + if (req && blk_mq_request_started(req) && req->tag == tag) return; spin_lock(&ubq->cancel_lock); From 3b308526c220e1a766358e5ce717d08d6cab511a Mon Sep 17 00:00:00 2001 From: Steve Siwinski Date: Thu, 8 May 2025 16:01:22 -0400 Subject: [PATCH 219/353] scsi: sd_zbc: block: Respect bio vector limits for REPORT ZONES buffer The REPORT ZONES buffer size is currently limited by the HBA's maximum segment count to ensure the buffer can be mapped. However, the block layer further limits the number of iovec entries to 1024 when allocating a bio. To avoid allocation of buffers too large to be mapped, further restrict the maximum buffer size to BIO_MAX_INLINE_VECS. Replace the UIO_MAXIOV symbolic name with the more contextually appropriate BIO_MAX_INLINE_VECS. Fixes: b091ac616846 ("sd_zbc: Fix report zones buffer allocation") Cc: stable@vger.kernel.org Signed-off-by: Steve Siwinski Link: https://lore.kernel.org/r/20250508200122.243129-1-ssiwinski@atto.com Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- block/bio.c | 2 +- drivers/scsi/sd_zbc.c | 6 +++++- include/linux/bio.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4e6c85a33d74db..4be592d37fb666 100644 --- a/block/bio.c +++ b/block/bio.c @@ -611,7 +611,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_vecs > UIO_MAXIOV) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 7a447ff600d276..a8db66428f80d4 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -169,6 +169,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, unsigned int nr_zones, size_t *buflen) { struct request_queue *q = sdkp->disk->queue; + unsigned int max_segments; size_t bufsize; void *buf; @@ -180,12 +181,15 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, * Furthermore, since the report zone command cannot be split, make * sure that the allocated buffer can always be mapped by limiting the * number of pages allocated to the HBA max segments limit. + * Since max segments can be larger than the max inline bio vectors, + * further limit the allocated buffer to BIO_MAX_INLINE_VECS. */ nr_zones = min(nr_zones, sdkp->zone_info.nr_zones); bufsize = roundup((nr_zones + 1) * 64, SECTOR_SIZE); bufsize = min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); - bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + max_segments = min(BIO_MAX_INLINE_VECS, queue_max_segments(q)); + bufsize = min_t(size_t, bufsize, max_segments << PAGE_SHIFT); while (bufsize >= SECTOR_SIZE) { buf = kvzalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); diff --git a/include/linux/bio.h b/include/linux/bio.h index cafc7c215de8be..b786ec5bcc81d0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -11,6 +11,7 @@ #include #define BIO_MAX_VECS 256U +#define BIO_MAX_INLINE_VECS UIO_MAXIOV struct queue_limits; From dcad943c9e479b703f29e295539b3da42f3a2239 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Apr 2025 10:38:52 +0200 Subject: [PATCH 220/353] NFSv4: Handle fatal ENETDOWN and ENETUNREACH errors Ensure that the NFSv4 error handling code recognises the RPC_TASK_NETUNREACH_FATAL flag, and handles the ENETDOWN and ENETUNREACH errors accordingly. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/nfs4proc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 970f28dbf2539e..1f7cc260b007ef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -671,6 +671,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, struct nfs_client *clp = server->nfs_client; int ret; + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + exception->delay = 0; + exception->recovering = 0; + exception->retry = 0; + return -EIO; + } + ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { int ret2 = nfs4_exception_should_retrans(server, exception); From d0776385b88828aeff9551f93ffc25fb2f8e7c9a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Apr 2025 11:05:27 +0200 Subject: [PATCH 221/353] NFSv4/pnfs: Layoutreturn on close must handle fatal networking errors If we have a fatal ENETDOWN or ENETUNREACH error, then the layoutreturn on close code should also handle that as fatal, and free the layouts. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/pnfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5f582713bf05eb..10fdd065a61c23 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1661,6 +1661,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, /* Was there an RPC level error? If not, retry */ if (task->tk_rpc_status == 0) break; + /* + * Is there a fatal network level error? + * If so release the layout, but flag the error. + */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } /* If the call was not sent, let caller handle it */ if (!RPC_WAS_SENT(task)) return 0; From 2baa6d31bb518c6ca2a78e163a0de6bda5e6b40a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Apr 2025 14:36:41 +0200 Subject: [PATCH 222/353] pNFS/flexfiles: Record the RPC errors in the I/O tracepoints When debugging I/O issues, we want to see not just the NFS level errors, but also the RPC level problems, so record both in the tracepoints. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 6 ++--- fs/nfs/nfs4trace.h | 34 +++++++++++++++++--------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 61ad269c825ff0..e6909cafab6864 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1329,7 +1329,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); - trace_ff_layout_read_error(hdr); + trace_ff_layout_read_error(hdr, task->tk_status); } err = ff_layout_async_handle_error(task, hdr->args.context->state, @@ -1502,7 +1502,7 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); - trace_ff_layout_write_error(hdr); + trace_ff_layout_write_error(hdr, task->tk_status); } err = ff_layout_async_handle_error(task, hdr->args.context->state, @@ -1551,7 +1551,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, task->tk_status); - trace_ff_layout_commit_error(data); + trace_ff_layout_commit_error(data, task->tk_status); } err = ff_layout_async_handle_error(task, NULL, data->ds_clp, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index bc67fe6801b138..deab4c0e21a064 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2051,13 +2051,15 @@ TRACE_EVENT(fl_getdevinfo, DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( - const struct nfs_pgio_header *hdr + const struct nfs_pgio_header *hdr, + int error ), - TP_ARGS(hdr), + TP_ARGS(hdr, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2073,7 +2075,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_fast_assign( const struct inode *inode = hdr->inode; - __entry->error = hdr->res.op_status; + __entry->error = -error; + __entry->nfs_error = hdr->res.op_status; __entry->fhandle = nfs_fhandle_hash(hdr->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; @@ -2088,7 +2091,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s", + "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s " + "nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2096,28 +2100,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, __entry->fhandle, __entry->offset, __entry->count, __entry->stateid_seq, __entry->stateid_hash, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) ) ); #define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \ DEFINE_EVENT(nfs4_flexfiles_io_event, name, \ TP_PROTO( \ - const struct nfs_pgio_header *hdr \ + const struct nfs_pgio_header *hdr, \ + int error \ ), \ - TP_ARGS(hdr)) + TP_ARGS(hdr, error)) DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error); DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error); TRACE_EVENT(ff_layout_commit_error, TP_PROTO( - const struct nfs_commit_data *data + const struct nfs_commit_data *data, + int error ), - TP_ARGS(data), + TP_ARGS(data, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2131,7 +2139,8 @@ TRACE_EVENT(ff_layout_commit_error, TP_fast_assign( const struct inode *inode = data->inode; - __entry->error = data->res.op_status; + __entry->error = -error; + __entry->nfs_error = data->res.op_status; __entry->fhandle = nfs_fhandle_hash(data->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; @@ -2142,14 +2151,15 @@ TRACE_EVENT(ff_layout_commit_error, TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u dstaddr=%s", + "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) ) ); From c3e4c261b67171a22c87238e79a2ae484c0f06ee Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 17 Apr 2025 15:25:08 +0800 Subject: [PATCH 223/353] nfs: handle failure of nfs_get_lock_context in unlock path When memory is insufficient, the allocation of nfs_lock_context in nfs_get_lock_context() fails and returns -ENOMEM. If we mistakenly treat an nfs4_unlockdata structure (whose l_ctx member has been set to -ENOMEM) as valid and proceed to execute rpc_run_task(), this will trigger a NULL pointer dereference in nfs4_locku_prepare. For example: BUG: kernel NULL pointer dereference, address: 000000000000000c PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 15 UID: 0 PID: 12 Comm: kworker/u64:0 Not tainted 6.15.0-rc2-dirty #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 Workqueue: rpciod rpc_async_schedule RIP: 0010:nfs4_locku_prepare+0x35/0xc2 Code: 89 f2 48 89 fd 48 c7 c7 68 69 ef b5 53 48 8b 8e 90 00 00 00 48 89 f3 RSP: 0018:ffffbbafc006bdb8 EFLAGS: 00010246 RAX: 000000000000004b RBX: ffff9b964fc1fa00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: fffffffffffffff4 RDI: ffff9ba53fddbf40 RBP: ffff9ba539934000 R08: 0000000000000000 R09: ffffbbafc006bc38 R10: ffffffffb6b689c8 R11: 0000000000000003 R12: ffff9ba539934030 R13: 0000000000000001 R14: 0000000004248060 R15: ffffffffb56d1c30 FS: 0000000000000000(0000) GS:ffff9ba5881f0000(0000) knlGS:00000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000000c CR3: 000000093f244000 CR4: 00000000000006f0 Call Trace: __rpc_execute+0xbc/0x480 rpc_async_schedule+0x2f/0x40 process_one_work+0x232/0x5d0 worker_thread+0x1da/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0x10d/0x240 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Modules linked in: CR2: 000000000000000c ---[ end trace 0000000000000000 ]--- Free the allocated nfs4_unlockdata when nfs_get_lock_context() fails and return NULL to terminate subsequent rpc_run_task, preventing NULL pointer dereference. Fixes: f30cb757f680 ("NFS: Always wait for I/O completion before unlock") Signed-off-by: Li Lingfeng Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20250417072508.3850532-1-lilingfeng3@huawei.com Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1f7cc260b007ef..b1d2122bd5a749 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7083,10 +7083,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -7094,7 +7102,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); From 11157b3492f9ddca641fab79dc390979495a68be Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 10 Apr 2025 16:42:03 -0400 Subject: [PATCH 224/353] nfs: don't share pNFS DS connections between net namespaces Currently, different NFS clients can share the same DS connections, even when they are in different net namespaces. If a containerized client creates a DS connection, another container can find and use it. When the first client exits, the connection will close which can lead to stalls in other clients. Add a net namespace pointer to struct nfs4_pnfs_ds, and compare those value to the caller's netns in _data_server_lookup_locked() when searching for a nfs4_pnfs_ds to match. Reported-by: Omar Sandoval Reported-by: Sargun Dillon Closes: https://lore.kernel.org/linux-nfs/Z_ArpQC_vREh_hEA@telecaster/ Tested-by: Sargun Dillon Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250410-nfs-ds-netns-v2-1-f80b7979ba80@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayoutdev.c | 6 +++--- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 +++--- fs/nfs/pnfs.h | 4 +++- fs/nfs/pnfs_nfs.c | 9 +++++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4fa304fa5bc4b2..29d9234d5c085f 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct page *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e58bedfb1dcc14..4a304cf17c4b07 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 30d2613e912b88..91ff877185c8af 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -60,6 +60,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index dbef837e871ad4..2ee20a0f0b36d3 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -604,12 +604,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } @@ -716,7 +716,7 @@ nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -734,13 +734,14 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + tmp_ds = _data_server_lookup_locked(net, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, From 6b5431503aee883fa57bd2d373d079a7d81ddc4a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 10 Apr 2025 16:42:04 -0400 Subject: [PATCH 225/353] nfs: move the nfs4_data_server_cache into struct nfs_net Since struct nfs4_pnfs_ds should not be shared between net namespaces, move from a global list of objects to a per-netns list and spinlock. Tested-by: Sargun Dillon Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250410-nfs-ds-netns-v2-2-f80b7979ba80@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 7 +++++++ fs/nfs/netns.h | 6 +++++- fs/nfs/pnfs_nfs.c | 31 +++++++++++++++++-------------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 02c916a550205f..2115c1189c2df1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1199,6 +1199,10 @@ void nfs_clients_init(struct net *net) INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); +#endif +#if IS_ENABLED(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&nn->nfs4_data_server_cache); + spin_lock_init(&nn->nfs4_data_server_lock); #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); @@ -1216,6 +1220,9 @@ void nfs_clients_exit(struct net *net) nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); +#if IS_ENABLED(CONFIG_NFS_V4_1) + WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); +#endif } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index a68b21603ea9a8..6ba3ea39e928c0 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -31,7 +31,11 @@ struct nfs_net { unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; int cb_users[NFS4_MAX_MINOR_VERSION + 1]; -#endif +#endif /* CONFIG_NFS_V4 */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + struct list_head nfs4_data_server_cache; + spinlock_t nfs4_data_server_lock; +#endif /* CONFIG_NFS_V4_1 */ struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 2ee20a0f0b36d3..91ef486f40b943 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -16,6 +16,7 @@ #include "nfs4session.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -504,14 +505,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); /* * Data server cache * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting: * - set to 1 on allocation * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ static void @@ -604,12 +605,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) + if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } @@ -653,10 +654,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (refcount_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { + struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + + if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) { list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); destroy_ds(ds); } } @@ -718,6 +720,7 @@ nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) struct nfs4_pnfs_ds * nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { + struct nfs_net *nn = net_generic(net, nfs_net_id); struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -733,8 +736,8 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_fla /* this is only used for debugging, so it's ok if its NULL */ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(net, dsaddrs); + spin_lock(&nn->nfs4_data_server_lock); + tmp_ds = _data_server_lookup_locked(nn, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); @@ -743,7 +746,7 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_fla INIT_LIST_HEAD(&ds->ds_node); ds->ds_net = net; ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); + list_add(&ds->ds_node, &nn->nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { @@ -755,7 +758,7 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_fla refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); out: return ds; } From 7df7c77ddc8e1cea013b8ccc788fae85687b47c7 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Tue, 8 Apr 2025 23:53:42 +0300 Subject: [PATCH 226/353] nfs: direct: drop useless initializer in nfs_direct_write_completion() In nfs_direct_write_completion(), the local variable req isn't used outside the *while* loop and is assigned to right at the start of that loop's body, so its initializer appears useless -- drop it; then move the declaration to the loop body (which happens to have a pointless empty line anyway)... Found by Linux Verification Center (linuxtesting.org) with the Svace static analysis tool. Signed-off-by: Sergey Shtylyov Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/416219f5-7983-484b-b5a7-5fb7da9561f7@omp.ru Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f32f8d7c9122bf..48d89716193a7e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -757,7 +757,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct inode *inode = dreq->inode; int flags = NFS_ODIRECT_DONE; @@ -786,6 +785,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&inode->i_lock); while (!list_empty(&hdr->pages)) { + struct nfs_page *req; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); From 5d58e3fe759ed96920b7204d99cbc88110a967fa Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Wed, 9 Apr 2025 23:36:33 +0300 Subject: [PATCH 227/353] nfs: nfs3acl: drop useless assignment in nfs3_get_acl() In nfs3_get_acl(), the local variable status is assigned the result of nfs_refresh_inode() inside the *switch* statement, but that value gets overwritten in the next *if* statement's true branch and is completely ignored if that branch isn't taken... Found by Linux Verification Center (linuxtesting.org) with the Svace static analysis tool. Signed-off-by: Sergey Shtylyov Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/c32dced7-a4fa-43c0-aafe-ef6c819c2f91@omp.ru Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 18d8f6529f615e..a126eb31f62fae 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) switch (status) { case 0: - status = nfs_refresh_inode(inode, res.fattr); + nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: From 0b61becec6ca686840de647724359ba45a891fb3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Apr 2025 14:43:34 -0400 Subject: [PATCH 228/353] NFS/localio: Fix a race in nfs_local_open_fh() Once the clp->cl_uuid.lock has been dropped, another CPU could come in and free the struct nfsd_file that was just added. To prevent that from happening, take the RCU read lock before dropping the spin lock. Fixes: 86e00412254a ("nfs: cache all open LOCALIO nfsd_file(s) in client") Signed-off-by: Trond Myklebust Reviewed-by: Mike Snitzer --- fs/nfs/localio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 5c21caeae075c7..4ec952f9f47dde 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -278,6 +278,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); if (IS_ERR(new)) return NULL; + rcu_read_lock(); /* try to swap in the pointer */ spin_lock(&clp->cl_uuid.lock); nf = rcu_dereference_protected(*pnf, 1); @@ -287,7 +288,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, rcu_assign_pointer(*pnf, nf); } spin_unlock(&clp->cl_uuid.lock); - rcu_read_lock(); } nf = nfs_local_file_get(nf); rcu_read_unlock(); From ef023afff9ad0baea063c2f1205f00f8c3ee9739 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 May 2025 10:50:13 -0400 Subject: [PATCH 229/353] NFSv4/pnfs: Reset the layout state after a layoutreturn If there are still layout segments in the layout plh_return_lsegs list after a layout return, we should be resetting the state to ensure they eventually get returned as well. Fixes: 68f744797edd ("pNFS: Do not free layout segments that are marked for return") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 10fdd065a61c23..fc7c5fb1019806 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -745,6 +745,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -1292,6 +1300,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: From abf99b0ecc217131cd0d0c5330f093599529047b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 May 2025 11:05:36 -0400 Subject: [PATCH 230/353] NFS/pnfs: Fix the error path in pnfs_layoutreturn_retry_later_locked() If there isn't a valid layout, or the layout stateid has changed, the cleanup after a layout return should clear out the old data. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fc7c5fb1019806..3adb7d0dbec7ac 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1254,21 +1254,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode, static void pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + struct list_head *freeme) { - const struct pnfs_layout_segment *lseg; - u32 seq = be32_to_cpu(arg_stateid->seqid); - if (pnfs_layout_is_valid(lo) && - nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) { - list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) { - if (pnfs_seqid_is_newer(lseg->pls_seq, seq) || - !pnfs_should_free_range(&lseg->pls_range, range)) - continue; - pnfs_set_plh_return_info(lo, range->iomode, seq); - break; - } - } + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); } void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, @@ -1276,11 +1270,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, const struct pnfs_layout_range *range) { struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); spin_lock(&inode->i_lock); - pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range); - pnfs_clear_layoutreturn_waitbit(lo); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); } void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, @@ -1716,6 +1711,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); switch (ret) { case -NFS4ERR_BADSESSION: @@ -1724,9 +1720,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case -NFS4ERR_NOMATCHING_LAYOUT: spin_lock(&inode->i_lock); pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, - &args->range); - pnfs_clear_layoutreturn_waitbit(lo); + &args->range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); break; case 0: if (res->lrs_present) From cd1b655df70a23296a639fa572104005237db672 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Apr 2025 18:21:06 -0400 Subject: [PATCH 231/353] NFS: Avoid flushing data while holding directory locks in nfs_rename() The Linux client assumes that all filehandles are non-volatile for renames within the same directory (otherwise sillyrename cannot work). However, the existence of the Linux 'subtree_check' export option has meant that nfs_rename() has always assumed it needs to flush writes before attempting to rename. Since NFSv4 does allow the client to query whether or not the server exhibits this behaviour, and since knfsd does actually set the appropriate flag when 'subtree_check' is enabled on an export, it should be OK to optimise away the write flushing behaviour in the cases where it is clearly not needed. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/client.c | 2 ++ fs/nfs/dir.c | 15 ++++++++++++++- include/linux/nfs_fs_sb.h | 12 +++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2115c1189c2df1..6d63b958c4bb13 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1105,6 +1105,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bd23fc736b3947..d0e0b435a84316 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2676,6 +2676,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2763,7 +2775,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 71319637a84e61..ee03f3cef30ca5 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -213,6 +213,15 @@ struct nfs_server { char *fscache_uniq; /* Uniquifier (or NULL) */ #endif + /* The following #defines numerically match the NFSv4 equivalents */ +#define NFS_FH_NOEXPIRE_WITH_OPEN (0x1) +#define NFS_FH_VOLATILE_ANY (0x2) +#define NFS_FH_VOL_MIGRATION (0x4) +#define NFS_FH_VOL_RENAME (0x8) +#define NFS_FH_RENAME_UNSAFE (NFS_FH_VOLATILE_ANY | NFS_FH_VOL_RENAME) + u32 fh_expire_type; /* V4 bitmask representing file + handle volatility type for + this filesystem */ u32 pnfs_blksize; /* layout_blksize attr */ #if IS_ENABLED(CONFIG_NFS_V4) u32 attr_bitmask[3];/* V4 bitmask representing the set @@ -236,9 +245,6 @@ struct nfs_server { u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ - u32 fh_expire_type; /* V4 bitmask representing file - handle volatility type for - this filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ struct rpc_wait_queue roc_rpcwaitq; void *pnfs_ld_data; /* per mount point data */ From 7a4af779c08e7a4a6dabbccdd8a18e0100259b5f Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 7 May 2025 11:04:32 -0400 Subject: [PATCH 232/353] drm/amdgpu: csa unmap use uninterruptible lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After process exit to unmap csa and free GPU vm, if signal is accepted and then waiting to take vm lock is interrupted and return, it causes memory leaking and below warning backtrace. Change to use uninterruptible wait lock fix the issue. WARNING: CPU: 69 PID: 167800 at amd/amdgpu/amdgpu_kms.c:1525 amdgpu_driver_postclose_kms+0x294/0x2a0 [amdgpu] Call Trace: drm_file_free.part.0+0x1da/0x230 [drm] drm_close_helper.isra.0+0x65/0x70 [drm] drm_release+0x6a/0x120 [drm] amdgpu_drm_release+0x51/0x60 [amdgpu] __fput+0x9f/0x280 ____fput+0xe/0x20 task_work_run+0x67/0xa0 do_exit+0x217/0x3c0 do_group_exit+0x3b/0xb0 get_signal+0x14a/0x8d0 arch_do_signal_or_restart+0xde/0x100 exit_to_user_mode_loop+0xc1/0x1a0 exit_to_user_mode_prepare+0xf4/0x100 syscall_exit_to_user_mode+0x17/0x40 do_syscall_64+0x69/0xc0 Signed-off-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 7dbbfb3c171a6f63b01165958629c9c26abf38ab) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c index cfdf558b48b648..02138aa557935e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c @@ -109,7 +109,7 @@ int amdgpu_unmap_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct drm_exec exec; int r; - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + drm_exec_init(&exec, 0, 0); drm_exec_until_all_locked(&exec) { r = amdgpu_vm_lock_pd(vm, &exec, 0); if (likely(!r)) From 62fb75ff78ac6d9f8eb2c2fde15425c842c553a2 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Thu, 8 May 2025 13:37:35 +0800 Subject: [PATCH 233/353] drm/amdgpu: fix incorrect MALL size for GFX1151 On GFX1151, the reported MALL cache size reflects only half of its actual size; this adjustment corrects the discrepancy. Signed-off-by: Tim Huang Acked-by: Alex Deucher Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher (cherry picked from commit 0a5c060b593ad152318f89e5564bfdfcff8a6ac0) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e74e26b6a4f23c..fec9a007533acc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -752,6 +752,18 @@ static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->gmc.vram_type = vram_type; adev->gmc.vram_vendor = vram_vendor; + /* The mall_size is already calculated as mall_size_per_umc * num_umc. + * However, for gfx1151, which features a 2-to-1 UMC mapping, + * the result must be multiplied by 2 to determine the actual mall size. + */ + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(11, 5, 1): + adev->gmc.mall_size *= 2; + break; + default: + break; + } + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 1): From dc79b66c5e28eee6a79ca7c7b6283fd21bf41e60 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Fri, 25 Apr 2025 14:44:02 +0800 Subject: [PATCH 234/353] drm/amd/display: Correct the reply value when AUX write incomplete [Why] Now forcing aux->transfer to return 0 when incomplete AUX write is inappropriate. It should return bytes have been transferred. [How] aux->transfer is asked not to change original msg except reply field of drm_dp_aux_msg structure. Copy the msg->buffer when it's write request, and overwrite the first byte when sink reply 1 byte indicating partially written byte number. Then we can return the correct value without changing the original msg. Fixes: 3637e457eb00 ("drm/amd/display: Fix wrong handling for AUX_DEFER case") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 7ac37f0dcd2e0b729fa7b5513908dc8ab802b540) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- .../drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 64df8ca448b3b8..1525c408d45203 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -12763,7 +12763,8 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( /* The reply is stored in the top nibble of the command. */ payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; - if (!payload->write && p_notify->aux_reply.length) + /*write req may receive a byte indicating partially written number as well*/ + if (p_notify->aux_reply.length) memcpy(payload->data, p_notify->aux_reply.data, p_notify->aux_reply.length); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 074b79fd5822e7..c93eef6f7a6b98 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -62,6 +62,7 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, enum aux_return_code_type operation_result; struct amdgpu_device *adev; struct ddc_service *ddc; + uint8_t copy[16]; if (WARN_ON(msg->size > 16)) return -E2BIG; @@ -77,6 +78,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, (msg->request & DP_AUX_I2C_WRITE_STATUS_UPDATE) != 0; payload.defer_delay = 0; + if (payload.write) { + memcpy(copy, msg->buffer, msg->size); + payload.data = copy; + } + result = dc_link_aux_transfer_raw(TO_DM_AUX(aux)->ddc_service, &payload, &operation_result); @@ -100,9 +106,9 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, */ if (payload.write && result >= 0) { if (result) { - /*one byte indicating partially written bytes. Force 0 to retry*/ + /*one byte indicating partially written bytes*/ drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); - result = 0; + result = payload.data[0]; } else if (!payload.reply[0]) /*I2C_ACK|AUX_ACK*/ result = msg->size; From 1e6a0dd297bdd7390b71cd5f9c6d61aeddaff3cc Mon Sep 17 00:00:00 2001 From: Gabe Teeger Date: Fri, 25 Apr 2025 10:31:39 -0400 Subject: [PATCH 235/353] Revert: "drm/amd/display: Enable urgent latency adjustment on DCN35" This reverts commit 756c85e4d0dd ("drm/amd/display: Enable urgent latency adjustment on DCN35") Reason for revert: Negative power impact. Reviewed-by: Nicholas Kazlauskas Signed-off-by: Gabe Teeger Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 9334c491cd8f388232b9a187bf0ddb728482bd6f) --- drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index d9159ca5541249..92f0a099d089ac 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -195,9 +195,9 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .dcn_downspread_percent = 0.5, .gpuvm_min_page_size_bytes = 4096, .hostvm_min_page_size_bytes = 4096, - .do_urgent_latency_adjustment = 1, + .do_urgent_latency_adjustment = 0, .urgent_latency_adjustment_fabric_clock_component_us = 0, - .urgent_latency_adjustment_fabric_clock_reference_mhz = 3000, + .urgent_latency_adjustment_fabric_clock_reference_mhz = 0, }; void dcn35_build_wm_range_table_fpu(struct clk_mgr *clk_mgr) From 1b449b97e11bff19a6fa077e78270a5f4b97c28d Mon Sep 17 00:00:00 2001 From: John Olender Date: Wed, 16 Apr 2025 02:54:26 -0400 Subject: [PATCH 236/353] drm/amd/display: Defer BW-optimization-blocked DRR adjustments [Why & How] Instead of dropping DRR updates, defer them. This fixes issues where monitor continues to see incorrect refresh rate after VRR was turned off by userspace. Fixes: 32953485c558 ("drm/amd/display: Do not update DRR while BW optimizations pending") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3546 Reviewed-by: Sun peng Li Signed-off-by: John Olender Signed-off-by: Aurabindo Pillai Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 53761b7ecd83e6fbb9f2206f8c980a6aa308c844) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 ++ drivers/gpu/drm/amd/display/dc/core/dc.c | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 1525c408d45203..cc01b9c68b479e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -372,6 +372,8 @@ get_crtc_by_otg_inst(struct amdgpu_device *adev, static inline bool is_dc_timing_adjust_needed(struct dm_crtc_state *old_state, struct dm_crtc_state *new_state) { + if (new_state->stream->adjust.timing_adjust_pending) + return true; if (new_state->freesync_config.state == VRR_STATE_ACTIVE_FIXED) return true; else if (amdgpu_dm_crtc_vrr_active(old_state) != amdgpu_dm_crtc_vrr_active(new_state)) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 28d1353f403dfb..ba4ce8a63158bb 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -439,9 +439,12 @@ bool dc_stream_adjust_vmin_vmax(struct dc *dc, * Don't adjust DRR while there's bandwidth optimizations pending to * avoid conflicting with firmware updates. */ - if (dc->ctx->dce_version > DCE_VERSION_MAX) - if (dc->optimized_required || dc->wm_optimized_required) + if (dc->ctx->dce_version > DCE_VERSION_MAX) { + if (dc->optimized_required || dc->wm_optimized_required) { + stream->adjust.timing_adjust_pending = true; return false; + } + } dc_exit_ips_for_hw_access(dc); @@ -3168,7 +3171,8 @@ static void copy_stream_update_to_stream(struct dc *dc, if (update->crtc_timing_adjust) { if (stream->adjust.v_total_min != update->crtc_timing_adjust->v_total_min || - stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max) + stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max || + stream->adjust.timing_adjust_pending) update->crtc_timing_adjust->timing_adjust_pending = true; stream->adjust = *update->crtc_timing_adjust; update->crtc_timing_adjust->timing_adjust_pending = false; From 2f4ccd4eaaa59a3e7bd1a6842c69a64acb02d592 Mon Sep 17 00:00:00 2001 From: George Shen Date: Thu, 24 Apr 2025 10:02:59 -0400 Subject: [PATCH 237/353] drm/amd/display: fix link_set_dpms_off multi-display MST corner case [Why & How] When MST config is unplugged/replugged too quickly, it can potentially result in a scenario where previous DC state has not been reset before the HPD link detection sequence begins. In this case, driver will disable the streams/link prior to re-enabling the link for link training. There is a bug in the current logic that does not account for the fact that current_state can be released and cleared prior to swapping to a new state (resulting in the pipe_ctx stream pointers to be cleared) in between disabling streams. To resolve this, cache the original streams prior to committing any stream updates. Reviewed-by: Wenjing Liu Signed-off-by: George Shen Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1561782686ccc36af844d55d31b44c938dd412dc) --- drivers/gpu/drm/amd/display/dc/link/link_dpms.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c index 268626e73c543d..53c961f86d43c0 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c @@ -148,6 +148,7 @@ void link_blank_dp_stream(struct dc_link *link, bool hw_init) void link_set_all_streams_dpms_off_for_link(struct dc_link *link) { struct pipe_ctx *pipes[MAX_PIPES]; + struct dc_stream_state *streams[MAX_PIPES]; struct dc_state *state = link->dc->current_state; uint8_t count; int i; @@ -160,10 +161,18 @@ void link_set_all_streams_dpms_off_for_link(struct dc_link *link) link_get_master_pipes_with_dpms_on(link, state, &count, pipes); + /* The subsequent call to dc_commit_updates_for_stream for a full update + * will release the current state and swap to a new state. Releasing the + * current state results in the stream pointers in the pipe_ctx structs + * to be zero'd. Hence, cache all streams prior to dc_commit_updates_for_stream. + */ + for (i = 0; i < count; i++) + streams[i] = pipes[i]->stream; + for (i = 0; i < count; i++) { - stream_update.stream = pipes[i]->stream; + stream_update.stream = streams[i]; dc_commit_updates_for_stream(link->ctx->dc, NULL, 0, - pipes[i]->stream, &stream_update, + streams[i], &stream_update, state); } From 5fab08d33614554e665937c43a986561f24cde79 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Tue, 29 Apr 2025 05:03:30 +0800 Subject: [PATCH 238/353] drm/amd/display: check stream id dml21 wrapper to get plane_id [Why & How] Fix a false positive warning which occurs due to lack of correct checks when querying plane_id in DML21. This fixes the warning when performing a mode1 reset (cat /sys/kernel/debug/dri/1/amdgpu_gpu_recover): [ 35.751250] WARNING: CPU: 11 PID: 326 at /tmp/amd.PHpyAl7v/amd/amdgpu/../display/dc/dml2/dml2_dc_resource_mgmt.c:91 dml2_map_dc_pipes+0x243d/0x3f40 [amdgpu] [ 35.751434] Modules linked in: amdgpu(OE) amddrm_ttm_helper(OE) amdttm(OE) amddrm_buddy(OE) amdxcp(OE) amddrm_exec(OE) amd_sched(OE) amdkcl(OE) drm_suballoc_helper drm_ttm_helper ttm drm_display_helper cec rc_core i2c_algo_bit rfcomm qrtr cmac algif_hash algif_skcipher af_alg bnep amd_atl intel_rapl_msr intel_rapl_common snd_hda_codec_hdmi snd_hda_intel edac_mce_amd snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec kvm_amd snd_hda_core snd_hwdep snd_pcm kvm snd_seq_midi snd_seq_midi_event snd_rawmidi crct10dif_pclmul polyval_clmulni polyval_generic btusb ghash_clmulni_intel sha256_ssse3 btrtl sha1_ssse3 snd_seq btintel aesni_intel btbcm btmtk snd_seq_device crypto_simd sunrpc cryptd bluetooth snd_timer ccp binfmt_misc rapl snd i2c_piix4 wmi_bmof gigabyte_wmi k10temp i2c_smbus soundcore gpio_amdpt mac_hid sch_fq_codel msr parport_pc ppdev lp parport efi_pstore nfnetlink dmi_sysfs ip_tables x_tables autofs4 hid_generic usbhid hid crc32_pclmul igc ahci xhci_pci libahci xhci_pci_renesas video wmi [ 35.751501] CPU: 11 UID: 0 PID: 326 Comm: kworker/u64:9 Tainted: G OE 6.11.0-21-generic #21~24.04.1-Ubuntu [ 35.751504] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 35.751505] Hardware name: Gigabyte Technology Co., Ltd. X670E AORUS PRO X/X670E AORUS PRO X, BIOS F30 05/22/2024 [ 35.751506] Workqueue: amdgpu-reset-dev amdgpu_debugfs_reset_work [amdgpu] [ 35.751638] RIP: 0010:dml2_map_dc_pipes+0x243d/0x3f40 [amdgpu] [ 35.751794] Code: 6d 0c 00 00 8b 84 24 88 00 00 00 41 3b 44 9c 20 0f 84 fc 07 00 00 48 83 c3 01 48 83 fb 06 75 b3 4c 8b 64 24 68 4c 8b 6c 24 40 <0f> 0b b8 06 00 00 00 49 8b 94 24 a0 49 00 00 89 c3 83 f8 07 0f 87 [ 35.751796] RSP: 0018:ffffbfa3805d7680 EFLAGS: 00010246 [ 35.751798] RAX: 0000000000010000 RBX: 0000000000000006 RCX: 0000000000000000 [ 35.751799] RDX: 0000000000000000 RSI: 0000000000000005 RDI: 0000000000000000 [ 35.751800] RBP: ffffbfa3805d78f0 R08: 0000000000000000 R09: 0000000000000000 [ 35.751801] R10: 0000000000000000 R11: 0000000000000000 R12: ffffbfa383249000 [ 35.751802] R13: ffffa0e68f280000 R14: ffffbfa383249658 R15: 0000000000000000 [ 35.751803] FS: 0000000000000000(0000) GS:ffffa0edbe580000(0000) knlGS:0000000000000000 [ 35.751804] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 35.751805] CR2: 00005d847ef96c58 CR3: 000000041de3e000 CR4: 0000000000f50ef0 [ 35.751806] PKRU: 55555554 [ 35.751807] Call Trace: [ 35.751810] [ 35.751816] ? show_regs+0x6c/0x80 [ 35.751820] ? __warn+0x88/0x140 [ 35.751822] ? dml2_map_dc_pipes+0x243d/0x3f40 [amdgpu] [ 35.751964] ? report_bug+0x182/0x1b0 [ 35.751969] ? handle_bug+0x6e/0xb0 [ 35.751972] ? exc_invalid_op+0x18/0x80 [ 35.751974] ? asm_exc_invalid_op+0x1b/0x20 [ 35.751978] ? dml2_map_dc_pipes+0x243d/0x3f40 [amdgpu] [ 35.752117] ? math_pow+0x48/0xa0 [amdgpu] [ 35.752256] ? srso_alias_return_thunk+0x5/0xfbef5 [ 35.752260] ? math_pow+0x48/0xa0 [amdgpu] [ 35.752400] ? srso_alias_return_thunk+0x5/0xfbef5 [ 35.752403] ? math_pow+0x11/0xa0 [amdgpu] [ 35.752524] ? srso_alias_return_thunk+0x5/0xfbef5 [ 35.752526] ? core_dcn4_mode_programming+0xe4d/0x20d0 [amdgpu] [ 35.752663] ? srso_alias_return_thunk+0x5/0xfbef5 [ 35.752669] dml21_validate+0x3d4/0x980 [amdgpu] Reviewed-by: Austin Zheng Signed-off-by: Aurabindo Pillai Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit f8ad62c0a93e5dd94243e10f1b742232e4d6411e) --- .../dc/dml2/dml21/dml21_translation_helper.c | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c index 0c8ec30ea67268..731fbd4bc600b4 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c @@ -910,7 +910,7 @@ static void populate_dml21_plane_config_from_plane_state(struct dml2_context *dm } //TODO : Could be possibly moved to a common helper layer. -static bool dml21_wrapper_get_plane_id(const struct dc_state *context, const struct dc_plane_state *plane, unsigned int *plane_id) +static bool dml21_wrapper_get_plane_id(const struct dc_state *context, unsigned int stream_id, const struct dc_plane_state *plane, unsigned int *plane_id) { int i, j; @@ -918,10 +918,12 @@ static bool dml21_wrapper_get_plane_id(const struct dc_state *context, const str return false; for (i = 0; i < context->stream_count; i++) { - for (j = 0; j < context->stream_status[i].plane_count; j++) { - if (context->stream_status[i].plane_states[j] == plane) { - *plane_id = (i << 16) | j; - return true; + if (context->streams[i]->stream_id == stream_id) { + for (j = 0; j < context->stream_status[i].plane_count; j++) { + if (context->stream_status[i].plane_states[j] == plane) { + *plane_id = (i << 16) | j; + return true; + } } } } @@ -944,14 +946,14 @@ static unsigned int map_stream_to_dml21_display_cfg(const struct dml2_context *d return location; } -static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx, +static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx, unsigned int stream_id, const struct dc_plane_state *plane, const struct dc_state *context) { unsigned int plane_id; int i = 0; int location = -1; - if (!dml21_wrapper_get_plane_id(context, plane, &plane_id)) { + if (!dml21_wrapper_get_plane_id(context, stream_id, plane, &plane_id)) { ASSERT(false); return -1; } @@ -1037,7 +1039,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location; } else { for (plane_index = 0; plane_index < context->stream_status[stream_index].plane_count; plane_index++) { - disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->stream_status[stream_index].plane_states[plane_index], context); + disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], context); if (disp_cfg_plane_location < 0) disp_cfg_plane_location = dml_dispcfg->num_planes++; @@ -1048,7 +1050,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s populate_dml21_plane_config_from_plane_state(dml_ctx, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location], context->stream_status[stream_index].plane_states[plane_index], context, stream_index); dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location; - if (dml21_wrapper_get_plane_id(context, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location])) + if (dml21_wrapper_get_plane_id(context, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location])) dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id_valid[disp_cfg_plane_location] = true; /* apply forced pstate policy */ From b9c99c7d2798685c5d45b8435416329a6ecd7837 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Wed, 30 Apr 2025 11:11:47 -0300 Subject: [PATCH 239/353] drm/amd/display: Fix null check of pipe_ctx->plane_state for update_dchubp_dpp Similar to commit 6a057072ddd1 ("drm/amd/display: Fix null check for pipe_ctx->plane_state in dcn20_program_pipe") that addresses a null pointer dereference on dcn20_update_dchubp_dpp. This is the same function hooked for update_dchubp_dpp in dcn401, with the same issue. Fix possible null pointer deference on dcn401_program_pipe too. Fixes: 63ab80d9ac0a ("drm/amd/display: DML2.1 Post-Si Cleanup") Signed-off-by: Melissa Wen Reviewed-by: Alex Hung Signed-off-by: Alex Deucher (cherry picked from commit d8d47f739752227957d8efc0cb894761bfe1d879) --- drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c index 5489f3d431f64d..3af6a3402b8949 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c @@ -1980,9 +1980,9 @@ void dcn401_program_pipe( dc->res_pool->hubbub, pipe_ctx->plane_res.hubp->inst, pipe_ctx->hubp_regs.det_size); } - if (pipe_ctx->update_flags.raw || - (pipe_ctx->plane_state && pipe_ctx->plane_state->update_flags.raw) || - pipe_ctx->stream->update_flags.raw) + if (pipe_ctx->plane_state && (pipe_ctx->update_flags.raw || + pipe_ctx->plane_state->update_flags.raw || + pipe_ctx->stream->update_flags.raw)) dc->hwss.update_dchubp_dpp(dc, pipe_ctx, context); if (pipe_ctx->plane_state && (pipe_ctx->update_flags.bits.enable || From 8aa0dd330d62785b5448d823026e1bd4f51ad1ba Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Tue, 13 May 2025 11:20:24 +0800 Subject: [PATCH 240/353] drm/amd/display: Avoid flooding unnecessary info messages It's expected that we'll encounter temporary exceptions during aux transactions. Adjust logging from drm_info to drm_dbg_dp to prevent flooding with unnecessary log messages. Fixes: 3637e457eb00 ("drm/amd/display: Fix wrong handling for AUX_DEFER case") Cc: Mario Limonciello Cc: Alex Deucher Signed-off-by: Wayne Lin Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250513032026.838036-1-Wayne.Lin@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 9a9c3e1fe5256da14a0a307dff0478f90c55fc8c) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index c93eef6f7a6b98..5cdbc86ef8f5a9 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -107,7 +107,7 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, if (payload.write && result >= 0) { if (result) { /*one byte indicating partially written bytes*/ - drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX partially written\n"); result = payload.data[0]; } else if (!payload.reply[0]) /*I2C_ACK|AUX_ACK*/ @@ -133,11 +133,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, break; } - drm_info(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); + drm_dbg_dp(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); } if (payload.reply[0]) - drm_info(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", payload.reply[0]); return result; From 28a8f1f7ddc74d4811c66a01714d3686dbfc0320 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 22 Apr 2025 11:58:11 -0300 Subject: [PATCH 241/353] Revert "drm/amd/display: Hardware cursor changes color when switched to software cursor" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 272e6aab14bbf98d7a06b2b1cd6308a02d4a10a1. Applying degamma curve to the cursor by default breaks Linux userspace expectation. On Linux, AMD display manager enables cursor degamma ROM just for implict sRGB on HW versions where degamma is split into two blocks: degamma ROM for pre-defined TFs and `gamma correction` for user/custom curves, and degamma ROM settings doesn't apply to cursor plane. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1513 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2803 Reported-by: Michel Dänzer Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4144 Signed-off-by: Melissa Wen Reviewed-by: Alex Hung Signed-off-by: Alex Deucher (cherry picked from commit f6a305d4748801a6c799ae9375b2ecff3aed094b) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c b/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c index 1236e0f9a2560c..712aff7e17f7a0 100644 --- a/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c +++ b/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c @@ -120,10 +120,11 @@ void dpp401_set_cursor_attributes( enum dc_cursor_color_format color_format = cursor_attributes->color_format; int cur_rom_en = 0; - // DCN4 should always do Cursor degamma for Cursor Color modes if (color_format == CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA || color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA) { - cur_rom_en = 1; + if (cursor_attributes->attribute_flags.bits.ENABLE_CURSOR_DEGAMMA) { + cur_rom_en = 1; + } } REG_UPDATE_3(CURSOR0_CONTROL, From f1dcaa56ad77a7d90551be0fb9ea11cd9c99531b Mon Sep 17 00:00:00 2001 From: "David (Ming Qiang) Wu" Date: Mon, 12 May 2025 15:14:43 -0400 Subject: [PATCH 242/353] drm/amdgpu: read back register after written for VCN v4.0.5 On VCN v4.0.5 there is a race condition where the WPTR is not updated after starting from idle when doorbell is used. Adding register read-back after written at function end is to ensure all register writes are done before they can be used. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12528 Signed-off-by: David (Ming Qiang) Wu Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello Reviewed-by: Alex Deucher Reviewed-by: Ruijing Dong Signed-off-by: Alex Deucher (cherry picked from commit 07c9db090b86e5211188e1b351303fbc673378cf) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index a1171e6152ed2d..f11df9c2ec1318 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -1023,6 +1023,10 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT | VCN_RB1_DB_CTRL__EN_MASK); + /* Keeping one read-back to ensure all register writes are done, otherwise + * it may introduce race conditions */ + RREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL); + return 0; } @@ -1205,6 +1209,10 @@ static int vcn_v4_0_5_start(struct amdgpu_vcn_inst *vinst) WREG32_SOC15(VCN, i, regVCN_RB_ENABLE, tmp); fw_shared->sq.queue_mode &= ~(FW_QUEUE_RING_RESET | FW_QUEUE_DPG_HOLD_OFF); + /* Keeping one read-back to ensure all register writes are done, otherwise + * it may introduce race conditions */ + RREG32_SOC15(VCN, i, regVCN_RB_ENABLE); + return 0; } From 49ba5758a4d3153ec8a68379237c71ba30d3f6d9 Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 6 May 2025 02:43:38 +0800 Subject: [PATCH 243/353] drm/meson: Use 1000ULL when operating with mode->clock Coverity scan reported the usage of "mode->clock * 1000" may lead to integer overflow. Use "1000ULL" instead of "1000" when utilizing it to avoid potential integer overflow issue. Link: https://scan5.scan.coverity.com/#/project-view/10074/10063?selectedIssue=1646759 Signed-off-by: I Hsin Cheng Reviewed-by: Martin Blumenstingl Fixes: 1017560164b6 ("drm/meson: use unsigned long long / Hz for frequency types") Link: https://lore.kernel.org/r/20250505184338.678540-1-richard120310@gmail.com Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_encoder_hdmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c index 7752d8ac85f0e5..c08fa93e50a30e 100644 --- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c +++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c @@ -75,7 +75,7 @@ static void meson_encoder_hdmi_set_vclk(struct meson_encoder_hdmi *encoder_hdmi, unsigned long long venc_freq; unsigned long long hdmi_freq; - vclk_freq = mode->clock * 1000; + vclk_freq = mode->clock * 1000ULL; /* For 420, pixel clock is half unlike venc clock */ if (encoder_hdmi->output_bus_fmt == MEDIA_BUS_FMT_UYYVYY8_0_5X24) @@ -123,7 +123,7 @@ static enum drm_mode_status meson_encoder_hdmi_mode_valid(struct drm_bridge *bri struct meson_encoder_hdmi *encoder_hdmi = bridge_to_meson_encoder_hdmi(bridge); struct meson_drm *priv = encoder_hdmi->priv; bool is_hdmi2_sink = display_info->hdmi.scdc.supported; - unsigned long long clock = mode->clock * 1000; + unsigned long long clock = mode->clock * 1000ULL; unsigned long long phy_freq; unsigned long long vclk_freq; unsigned long long venc_freq; From c4db2d12beef46737e73e9f7e3cf3e855479d317 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 17 Apr 2025 07:34:58 -0300 Subject: [PATCH 244/353] drm/tiny: panel-mipi-dbi: Use drm_client_setup_with_fourcc() Since commit 559358282e5b ("drm/fb-helper: Don't use the preferred depth for the BPP default"), RGB565 displays such as the CFAF240320X no longer render correctly: colors are distorted and the content is shown twice horizontally. This regression is due to the fbdev emulation layer defaulting to 32 bits per pixel, whereas the display expects 16 bpp (RGB565). As a result, the framebuffer data is incorrectly interpreted by the panel. Fix the issue by calling drm_client_setup_with_fourcc() with a format explicitly selected based on the display's bits-per-pixel value. For 16 bpp, use DRM_FORMAT_RGB565; for other values, fall back to the previous behavior. This ensures that the allocated framebuffer format matches the hardware expectations, avoiding color and layout corruption. Tested on a CFAF240320X display with an RGB565 configuration, confirming correct colors and layout after applying this patch. Cc: stable@vger.kernel.org Fixes: 559358282e5b ("drm/fb-helper: Don't use the preferred depth for the BPP default") Signed-off-by: Fabio Estevam Reviewed-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250417103458.2496790-1-festevam@gmail.com --- drivers/gpu/drm/tiny/panel-mipi-dbi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tiny/panel-mipi-dbi.c b/drivers/gpu/drm/tiny/panel-mipi-dbi.c index 0460ecaef4bd98..23914a9f7fd376 100644 --- a/drivers/gpu/drm/tiny/panel-mipi-dbi.c +++ b/drivers/gpu/drm/tiny/panel-mipi-dbi.c @@ -390,7 +390,10 @@ static int panel_mipi_dbi_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, drm); - drm_client_setup(drm, NULL); + if (bpp == 16) + drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB565); + else + drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB888); return 0; } From 523ea32a244e1ee00c8bc5033aae5f10b9a14cb9 Mon Sep 17 00:00:00 2001 From: Markus Burri Date: Thu, 8 May 2025 15:06:08 +0200 Subject: [PATCH 245/353] accel/ivpu: Use effective buffer size for zero terminator Use the effective written size instead of original size as index for zero termination. If the input from user-space is to larger and the input is truncated, the original size is out-of-bound. Since there is an upfront size check here, the change is for consistency. Signed-off-by: Markus Burri Reviewed-by: Jacek Lawrynowicz Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250508130612.82270-3-markus.burri@mt.com --- drivers/accel/ivpu/ivpu_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c index f0dad0c9ce3381..cd24ccd20ba6cc 100644 --- a/drivers/accel/ivpu/ivpu_debugfs.c +++ b/drivers/accel/ivpu/ivpu_debugfs.c @@ -455,7 +455,7 @@ priority_bands_fops_write(struct file *file, const char __user *user_buf, size_t if (ret < 0) return ret; - buf[size] = '\0'; + buf[ret] = '\0'; ret = sscanf(buf, "%u %u %u %u", &band, &grace_period, &process_grace_period, &process_quantum); if (ret != 4) From 47c857fc174f981d91b2eae513a47219f4fe91aa Mon Sep 17 00:00:00 2001 From: Hyejeong Choi Date: Mon, 12 May 2025 21:06:38 -0500 Subject: [PATCH 246/353] dma-buf: insert memory barrier before updating num_fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_store_mb() inserts memory barrier after storing operation. It is different with what the comment is originally aiming so Null pointer dereference can be happened if memory update is reordered. Signed-off-by: Hyejeong Choi Fixes: a590d0fdbaa5 ("dma-buf: Update reservation shared_count after adding the new fence") CC: stable@vger.kernel.org Reviewed-by: Christian König Link: https://lore.kernel.org/r/20250513020638.GA2329653@au1-maretx-p37.eng.sarc.samsung.com Signed-off-by: Christian König --- drivers/dma-buf/dma-resv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index 5f8d010516f07f..b1ef4546346d44 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -320,8 +320,9 @@ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence, count++; dma_resv_list_set(fobj, i, fence, usage); - /* pointer update must be visible before we extend the num_fences */ - smp_store_mb(fobj->num_fences, count); + /* fence update must be visible before we extend the num_fences */ + smp_wmb(); + fobj->num_fences = count; } EXPORT_SYMBOL(dma_resv_add_fence); From 94d7ae523c53d3694e2f653279bbf4b2e33e70fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 8 May 2025 13:29:31 +0200 Subject: [PATCH 247/353] drm/xe: Fix the gem shrinker name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xe buffer object shrinker name is visible in the /shrinker directory and most if not all other shinkers follow a naming convention that looks like -_: Follow the same convention for xe, changing the name to drm-xe_gem:. Other shrinkers typically use the device node for but since drm drivers typically don't have a single unique device- node, instead use the unique name in the drm device. Fixes: 00c8efc3180f ("drm/xe: Add a shrinker for xe bos") Cc: Matthew Brost Signed-off-by: Thomas Hellström Reviewed-by: Francois Dugast Link: https://lore.kernel.org/r/20250508112931.3347-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 243bf99e2fe75edf8df1711c1377b6fc020b806c) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_shrinker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_shrinker.c b/drivers/gpu/drm/xe/xe_shrinker.c index 8184390f9c7b9c..86d47aaf035892 100644 --- a/drivers/gpu/drm/xe/xe_shrinker.c +++ b/drivers/gpu/drm/xe/xe_shrinker.c @@ -227,7 +227,7 @@ struct xe_shrinker *xe_shrinker_create(struct xe_device *xe) if (!shrinker) return ERR_PTR(-ENOMEM); - shrinker->shrink = shrinker_alloc(0, "xe system shrinker"); + shrinker->shrink = shrinker_alloc(0, "drm-xe_gem:%s", xe->drm.unique); if (!shrinker->shrink) { kfree(shrinker); return ERR_PTR(-ENOMEM); From 92217b23cbe23ae6ea063935e4cad480b94272a1 Mon Sep 17 00:00:00 2001 From: Aradhya Bhatia Date: Mon, 12 May 2025 06:50:04 +0000 Subject: [PATCH 248/353] drm/xe/xe2hpg: Add Wa_22021007897 Add Wa_22021007897 for the Xe2_HPG (graphics version: 20.01) IP. It is a permanent workaround, and applicable on all the steppings. Reviewed-by: Gustavo Sousa Reviewed-by: Tejas Upadhyay Signed-off-by: Aradhya Bhatia Link: https://lore.kernel.org/r/20250512065004.2576-1-aradhya.bhatia@intel.com Signed-off-by: Matt Roper (cherry picked from commit e5c13e2c505b73a8667ef9a0fd5cbd4227e483e6) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 + drivers/gpu/drm/xe/xe_wa.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index da1f198ac107cc..181913967ac9b7 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -157,6 +157,7 @@ #define XEHPG_SC_INSTDONE_EXTRA2 XE_REG_MCR(0x7108) #define COMMON_SLICE_CHICKEN4 XE_REG(0x7300, XE_REG_OPTION_MASKED) +#define SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE REG_BIT(12) #define DISABLE_TDC_LOAD_BALANCING_CALC REG_BIT(6) #define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index 24f644c0a67365..2f833f0d575f24 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -815,6 +815,10 @@ static const struct xe_rtp_entry_sr lrc_was[] = { XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)), XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX)) }, + { XE_RTP_NAME("22021007897"), + XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE)) + }, /* Xe3_LPG */ { XE_RTP_NAME("14021490052"), From f73daeb2e6f48abcee5358ad3715bbd7d52c9d88 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Mon, 12 May 2025 06:54:55 -0700 Subject: [PATCH 249/353] drm/gpusvm: Introduce devmem_only flag for allocation This commit adds a new flag, devmem_only, to the drm_gpusvm structure. The purpose of this flag is to ensure that the get_pages function allocates memory exclusively from the device's memory. If the allocation from device memory fails, the function will return an -EFAULT error. Required for shared CPU and GPU atomics on certain devices. v3: - s/vram_only/devmem_only/ Fixes: 99624bdff867 ("drm/gpusvm: Add support for GPU Shared Virtual Memory") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Signed-off-by: Himal Prasad Ghimiray Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250512135500.1405019-2-matthew.brost@intel.com (cherry picked from commit 8a9b978ebd47df9e0694c34748c2d6fa0c31eb4d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/drm_gpusvm.c | 5 +++++ include/drm/drm_gpusvm.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index de424e670995cf..a58d03e6cac277 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1454,6 +1454,11 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm, goto err_unmap; } + if (ctx->devmem_only) { + err = -EFAULT; + goto err_unmap; + } + addr = dma_map_page(gpusvm->drm->dev, page, 0, PAGE_SIZE << order, diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h index df120b4d1f836c..9fd25fc880a492 100644 --- a/include/drm/drm_gpusvm.h +++ b/include/drm/drm_gpusvm.h @@ -286,6 +286,7 @@ struct drm_gpusvm { * @in_notifier: entering from a MMU notifier * @read_only: operating on read-only memory * @devmem_possible: possible to use device memory + * @devmem_only: use only device memory * * Context that is DRM GPUSVM is operating in (i.e. user arguments). */ @@ -294,6 +295,7 @@ struct drm_gpusvm_ctx { unsigned int in_notifier :1; unsigned int read_only :1; unsigned int devmem_possible :1; + unsigned int devmem_only :1; }; int drm_gpusvm_init(struct drm_gpusvm *gpusvm, From 83830aabff109d6348f361e96819e30c106e028f Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 12 May 2025 06:54:56 -0700 Subject: [PATCH 250/353] drm/xe: Strict migration policy for atomic SVM faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mixing GPU and CPU atomics does not work unless a strict migration policy of GPU atomics must be device memory. Enforce a policy of must be in VRAM with a retry loop of 3 attempts, if retry loop fails abort fault. Removing always_migrate_to_vram modparam as we now have real migration policy. v2: - Only retry migration on atomics - Drop alway migrate modparam v3: - Only set vram_only on DGFX (Himal) - Bail on get_pages failure if vram_only and retry count exceeded (Himal) - s/vram_only/devmem_only - Update xe_svm_range_is_valid to accept devmem_only argument v4: - Fix logic bug get_pages failure v5: - Fix commit message (Himal) - Mention removing always_migrate_to_vram in commit message (Lucas) - Fix xe_svm_range_is_valid to check for devmem pages - Bail on devmem_only && !migrate_devmem (Thomas) v6: - Add READ_ONCE barriers for opportunistic checks (Thomas) - Pair READ_ONCE with WRITE_ONCE (Thomas) v7: - Adjust comments (Thomas) Fixes: 2f118c949160 ("drm/xe: Add SVM VRAM migration") Cc: stable@vger.kernel.org Signed-off-by: Himal Prasad Ghimiray Signed-off-by: Matthew Brost Acked-by: Himal Prasad Ghimiray Reviewed-by: Thomas Hellström Link: https://lore.kernel.org/r/20250512135500.1405019-3-matthew.brost@intel.com (cherry picked from commit a9ac0fa455b050d03e3032501368048fb284d318) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/drm_gpusvm.c | 23 +++++-- drivers/gpu/drm/xe/xe_module.c | 3 - drivers/gpu/drm/xe/xe_module.h | 1 - drivers/gpu/drm/xe/xe_pt.c | 14 ++++- drivers/gpu/drm/xe/xe_svm.c | 111 +++++++++++++++++++++++++-------- drivers/gpu/drm/xe/xe_svm.h | 5 -- include/drm/drm_gpusvm.h | 40 +++++++----- 7 files changed, 140 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index a58d03e6cac277..41f6616bcf76fa 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1118,6 +1118,10 @@ static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm, lockdep_assert_held(&gpusvm->notifier_lock); if (range->flags.has_dma_mapping) { + struct drm_gpusvm_range_flags flags = { + .__flags = range->flags.__flags, + }; + for (i = 0, j = 0; i < npages; j++) { struct drm_pagemap_device_addr *addr = &range->dma_addr[j]; @@ -1131,8 +1135,12 @@ static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm, dev, *addr); i += 1 << addr->order; } - range->flags.has_devmem_pages = false; - range->flags.has_dma_mapping = false; + + /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */ + flags.has_devmem_pages = false; + flags.has_dma_mapping = false; + WRITE_ONCE(range->flags.__flags, flags.__flags); + range->dpagemap = NULL; } } @@ -1334,6 +1342,7 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm, int err = 0; struct dev_pagemap *pagemap; struct drm_pagemap *dpagemap; + struct drm_gpusvm_range_flags flags; retry: hmm_range.notifier_seq = mmu_interval_read_begin(notifier); @@ -1378,7 +1387,8 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm, */ drm_gpusvm_notifier_lock(gpusvm); - if (range->flags.unmapped) { + flags.__flags = range->flags.__flags; + if (flags.unmapped) { drm_gpusvm_notifier_unlock(gpusvm); err = -EFAULT; goto err_free; @@ -1474,14 +1484,17 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm, } i += 1 << order; num_dma_mapped = i; - range->flags.has_dma_mapping = true; + flags.has_dma_mapping = true; } if (zdd) { - range->flags.has_devmem_pages = true; + flags.has_devmem_pages = true; range->dpagemap = dpagemap; } + /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */ + WRITE_ONCE(range->flags.__flags, flags.__flags); + drm_gpusvm_notifier_unlock(gpusvm); kvfree(pfns); set_seqno: diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c index 9f4632e39a1ad5..e861c694f3361b 100644 --- a/drivers/gpu/drm/xe/xe_module.c +++ b/drivers/gpu/drm/xe/xe_module.c @@ -29,9 +29,6 @@ struct xe_modparam xe_modparam = { module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600); MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2"); -module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444); -MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault"); - module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444); MODULE_PARM_DESC(force_execlist, "Force Execlist submission"); diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h index 84339e509c80d6..5a3bfea8b7b4c4 100644 --- a/drivers/gpu/drm/xe/xe_module.h +++ b/drivers/gpu/drm/xe/xe_module.h @@ -12,7 +12,6 @@ struct xe_modparam { bool force_execlist; bool probe_display; - bool always_migrate_to_vram; u32 force_vram_bar_size; int guc_log_level; char *guc_firmware_path; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index ffaf0d02dc7de9..856038553b812a 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -2232,11 +2232,19 @@ static void op_commit(struct xe_vm *vm, } case DRM_GPUVA_OP_DRIVER: { + /* WRITE_ONCE pairs with READ_ONCE in xe_svm.c */ + if (op->subop == XE_VMA_SUBOP_MAP_RANGE) { - op->map_range.range->tile_present |= BIT(tile->id); - op->map_range.range->tile_invalidated &= ~BIT(tile->id); + WRITE_ONCE(op->map_range.range->tile_present, + op->map_range.range->tile_present | + BIT(tile->id)); + WRITE_ONCE(op->map_range.range->tile_invalidated, + op->map_range.range->tile_invalidated & + ~BIT(tile->id)); } else if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE) { - op->unmap_range.range->tile_present &= ~BIT(tile->id); + WRITE_ONCE(op->unmap_range.range->tile_present, + op->unmap_range.range->tile_present & + ~BIT(tile->id)); } break; } diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 24c578e1170e40..e65d3b820ee024 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -15,8 +15,17 @@ static bool xe_svm_range_in_vram(struct xe_svm_range *range) { - /* Not reliable without notifier lock */ - return range->base.flags.has_devmem_pages; + /* + * Advisory only check whether the range is currently backed by VRAM + * memory. + */ + + struct drm_gpusvm_range_flags flags = { + /* Pairs with WRITE_ONCE in drm_gpusvm.c */ + .__flags = READ_ONCE(range->base.flags.__flags), + }; + + return flags.has_devmem_pages; } static bool xe_svm_range_has_vram_binding(struct xe_svm_range *range) @@ -645,9 +654,16 @@ void xe_svm_fini(struct xe_vm *vm) } static bool xe_svm_range_is_valid(struct xe_svm_range *range, - struct xe_tile *tile) + struct xe_tile *tile, + bool devmem_only) { - return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id); + /* + * Advisory only check whether the range currently has a valid mapping, + * READ_ONCE pairs with WRITE_ONCE in xe_pt.c + */ + return ((READ_ONCE(range->tile_present) & + ~READ_ONCE(range->tile_invalidated)) & BIT(tile->id)) && + (!devmem_only || xe_svm_range_in_vram(range)); } static struct xe_vram_region *tile_to_vr(struct xe_tile *tile) @@ -712,6 +728,36 @@ static int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile, return err; } +static bool supports_4K_migration(struct xe_device *xe) +{ + if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) + return false; + + return true; +} + +static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range, + struct xe_vma *vma) +{ + struct xe_vm *vm = range_to_vm(&range->base); + u64 range_size = xe_svm_range_size(range); + + if (!range->base.flags.migrate_devmem) + return false; + + if (xe_svm_range_in_vram(range)) { + drm_dbg(&vm->xe->drm, "Range is already in VRAM\n"); + return false; + } + + if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) { + drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n"); + return false; + } + + return true; +} + /** * xe_svm_handle_pagefault() - SVM handle page fault * @vm: The VM. @@ -735,11 +781,14 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR), .check_pages_threshold = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0, + .devmem_only = atomic && IS_DGFX(vm->xe) && + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR), }; struct xe_svm_range *range; struct drm_gpusvm_range *r; struct drm_exec exec; struct dma_fence *fence; + int migrate_try_count = ctx.devmem_only ? 3 : 1; ktime_t end = 0; int err; @@ -758,24 +807,30 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, if (IS_ERR(r)) return PTR_ERR(r); + if (ctx.devmem_only && !r->flags.migrate_devmem) + return -EACCES; + range = to_xe_range(r); - if (xe_svm_range_is_valid(range, tile)) + if (xe_svm_range_is_valid(range, tile, ctx.devmem_only)) return 0; range_debug(range, "PAGE FAULT"); - /* XXX: Add migration policy, for now migrate range once */ - if (!range->skip_migrate && range->base.flags.migrate_devmem && - xe_svm_range_size(range) >= SZ_64K) { - range->skip_migrate = true; - + if (--migrate_try_count >= 0 && + xe_svm_range_needs_migrate_to_vram(range, vma)) { err = xe_svm_alloc_vram(vm, tile, range, &ctx); if (err) { - drm_dbg(&vm->xe->drm, - "VRAM allocation failed, falling back to " - "retrying fault, asid=%u, errno=%pe\n", - vm->usm.asid, ERR_PTR(err)); - goto retry; + if (migrate_try_count || !ctx.devmem_only) { + drm_dbg(&vm->xe->drm, + "VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n", + vm->usm.asid, ERR_PTR(err)); + goto retry; + } else { + drm_err(&vm->xe->drm, + "VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n", + vm->usm.asid, ERR_PTR(err)); + return err; + } } } @@ -783,15 +838,22 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx); /* Corner where CPU mappings have changed */ if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { - if (err == -EOPNOTSUPP) { - range_debug(range, "PAGE FAULT - EVICT PAGES"); - drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base); + if (migrate_try_count > 0 || !ctx.devmem_only) { + if (err == -EOPNOTSUPP) { + range_debug(range, "PAGE FAULT - EVICT PAGES"); + drm_gpusvm_range_evict(&vm->svm.gpusvm, + &range->base); + } + drm_dbg(&vm->xe->drm, + "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n", + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); + range_debug(range, "PAGE FAULT - RETRY PAGES"); + goto retry; + } else { + drm_err(&vm->xe->drm, + "Get pages failed, retry count exceeded, asid=%u, gpusvm=%p, errno=%pe\n", + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); } - drm_dbg(&vm->xe->drm, - "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n", - vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); - range_debug(range, "PAGE FAULT - RETRY PAGES"); - goto retry; } if (err) { range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT"); @@ -825,9 +887,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, } drm_exec_fini(&exec); - if (xe_modparam.always_migrate_to_vram) - range->skip_migrate = false; - dma_fence_wait(fence, false); dma_fence_put(fence); diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h index be306fe7aaa4b3..fe58ac2f4baaca 100644 --- a/drivers/gpu/drm/xe/xe_svm.h +++ b/drivers/gpu/drm/xe/xe_svm.h @@ -36,11 +36,6 @@ struct xe_svm_range { * range. Protected by GPU SVM notifier lock. */ u8 tile_invalidated; - /** - * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler - * locking. - */ - u8 skip_migrate :1; }; #if IS_ENABLED(CONFIG_DRM_GPUSVM) diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h index 9fd25fc880a492..653d48dbe1c11a 100644 --- a/include/drm/drm_gpusvm.h +++ b/include/drm/drm_gpusvm.h @@ -185,6 +185,31 @@ struct drm_gpusvm_notifier { } flags; }; +/** + * struct drm_gpusvm_range_flags - Structure representing a GPU SVM range flags + * + * @migrate_devmem: Flag indicating whether the range can be migrated to device memory + * @unmapped: Flag indicating if the range has been unmapped + * @partial_unmap: Flag indicating if the range has been partially unmapped + * @has_devmem_pages: Flag indicating if the range has devmem pages + * @has_dma_mapping: Flag indicating if the range has a DMA mapping + * @__flags: Flags for range in u16 form (used for READ_ONCE) + */ +struct drm_gpusvm_range_flags { + union { + struct { + /* All flags below must be set upon creation */ + u16 migrate_devmem : 1; + /* All flags below must be set / cleared under notifier lock */ + u16 unmapped : 1; + u16 partial_unmap : 1; + u16 has_devmem_pages : 1; + u16 has_dma_mapping : 1; + }; + u16 __flags; + }; +}; + /** * struct drm_gpusvm_range - Structure representing a GPU SVM range * @@ -198,11 +223,6 @@ struct drm_gpusvm_notifier { * @dpagemap: The struct drm_pagemap of the device pages we're dma-mapping. * Note this is assuming only one drm_pagemap per range is allowed. * @flags: Flags for range - * @flags.migrate_devmem: Flag indicating whether the range can be migrated to device memory - * @flags.unmapped: Flag indicating if the range has been unmapped - * @flags.partial_unmap: Flag indicating if the range has been partially unmapped - * @flags.has_devmem_pages: Flag indicating if the range has devmem pages - * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping * * This structure represents a GPU SVM range used for tracking memory ranges * mapped in a DRM device. @@ -216,15 +236,7 @@ struct drm_gpusvm_range { unsigned long notifier_seq; struct drm_pagemap_device_addr *dma_addr; struct drm_pagemap *dpagemap; - struct { - /* All flags below must be set upon creation */ - u16 migrate_devmem : 1; - /* All flags below must be set / cleared under notifier lock */ - u16 unmapped : 1; - u16 partial_unmap : 1; - u16 has_devmem_pages : 1; - u16 has_dma_mapping : 1; - } flags; + struct drm_gpusvm_range_flags flags; }; /** From 8cd7812652891f25859c596c583a937e110f1929 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 12 May 2025 06:54:57 -0700 Subject: [PATCH 251/353] drm/gpusvm: Add timeslicing support to GPU SVM Add timeslicing support to GPU SVM which will guarantee the GPU a minimum execution time on piece of physical memory before migration back to CPU. Intended to implement strict migration policies which require memory to be in a certain placement for correct execution. Required for shared CPU and GPU atomics on certain devices. Fixes: 99624bdff867 ("drm/gpusvm: Add support for GPU Shared Virtual Memory") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Himal Prasad Ghimiray Link: https://lore.kernel.org/r/20250512135500.1405019-4-matthew.brost@intel.com (cherry picked from commit 8dc1812b5b3a42311d28eb385eed88e2053ad3cb) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/drm_gpusvm.c | 9 +++++++++ include/drm/drm_gpusvm.h | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index 41f6616bcf76fa..4b2f32889f00f8 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1783,6 +1783,8 @@ int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm, goto err_finalize; /* Upon success bind devmem allocation to range and zdd */ + devmem_allocation->timeslice_expiration = get_jiffies_64() + + msecs_to_jiffies(ctx->timeslice_ms); zdd->devmem_allocation = devmem_allocation; /* Owns ref */ err_finalize: @@ -2003,6 +2005,13 @@ static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas, void *buf; int i, err = 0; + if (page) { + zdd = page->zone_device_data; + if (time_before64(get_jiffies_64(), + zdd->devmem_allocation->timeslice_expiration)) + return 0; + } + start = ALIGN_DOWN(fault_addr, size); end = ALIGN(fault_addr + 1, size); diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h index 653d48dbe1c11a..eaf704d3d05e8b 100644 --- a/include/drm/drm_gpusvm.h +++ b/include/drm/drm_gpusvm.h @@ -89,6 +89,7 @@ struct drm_gpusvm_devmem_ops { * @ops: Pointer to the operations structure for GPU SVM device memory * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to. * @size: Size of device memory allocation + * @timeslice_expiration: Timeslice expiration in jiffies */ struct drm_gpusvm_devmem { struct device *dev; @@ -97,6 +98,7 @@ struct drm_gpusvm_devmem { const struct drm_gpusvm_devmem_ops *ops; struct drm_pagemap *dpagemap; size_t size; + u64 timeslice_expiration; }; /** @@ -295,6 +297,8 @@ struct drm_gpusvm { * @check_pages_threshold: Check CPU pages for present if chunk is less than or * equal to threshold. If not present, reduce chunk * size. + * @timeslice_ms: The timeslice MS which in minimum time a piece of memory + * remains with either exclusive GPU or CPU access. * @in_notifier: entering from a MMU notifier * @read_only: operating on read-only memory * @devmem_possible: possible to use device memory @@ -304,6 +308,7 @@ struct drm_gpusvm { */ struct drm_gpusvm_ctx { unsigned long check_pages_threshold; + unsigned long timeslice_ms; unsigned int in_notifier :1; unsigned int read_only :1; unsigned int devmem_possible :1; From 08212e22c4b52c1190a89f4a7adf9b71ecc39030 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 12 May 2025 06:54:58 -0700 Subject: [PATCH 252/353] drm/xe: Timeslice GPU on atomic SVM fault Ensure GPU can make forward progress on an atomic SVM GPU fault by giving the GPU a timeslice of 5ms v2: - Reduce timeslice to 5ms - Double timeslice on retry - Split out GPU SVM changes into independent patch v5: - Double timeslice in a few more places Fixes: 2f118c949160 ("drm/xe: Add SVM VRAM migration") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Himal Prasad Ghimiray Link: https://lore.kernel.org/r/20250512135500.1405019-5-matthew.brost@intel.com (cherry picked from commit a5d8d3be1dea8154edbbea481081469627665659) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index e65d3b820ee024..975094c1a58279 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -783,6 +783,8 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0, .devmem_only = atomic && IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR), + .timeslice_ms = atomic && IS_DGFX(vm->xe) && + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? 5 : 0, }; struct xe_svm_range *range; struct drm_gpusvm_range *r; @@ -819,6 +821,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, if (--migrate_try_count >= 0 && xe_svm_range_needs_migrate_to_vram(range, vma)) { err = xe_svm_alloc_vram(vm, tile, range, &ctx); + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ if (err) { if (migrate_try_count || !ctx.devmem_only) { drm_dbg(&vm->xe->drm, @@ -838,6 +841,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx); /* Corner where CPU mappings have changed */ if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ if (migrate_try_count > 0 || !ctx.devmem_only) { if (err == -EOPNOTSUPP) { range_debug(range, "PAGE FAULT - EVICT PAGES"); @@ -877,6 +881,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, drm_exec_fini(&exec); err = PTR_ERR(fence); if (err == -EAGAIN) { + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ range_debug(range, "PAGE FAULT - RETRY BIND"); goto retry; } From d8b1a644a3002f868e9530794c68048be63f71a4 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 9 May 2025 09:12:01 -0700 Subject: [PATCH 253/353] drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value For determining actual job execution time, save the current value of the CTX_TIMESTAMP register rather than the value saved in LRC since the current register value is the closest to the start time of the job. v2: Define MI_STORE_REGISTER_MEM to fix compile error v3: Place MI_STORE_REGISTER_MEM sorted by MI_INSTR (Lucas) Fixes: 65921374c48f ("drm/xe: Emit ctx timestamp copy in ring ops") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250509161159.2173069-6-umesh.nerlige.ramappa@intel.com (cherry picked from commit 38b14233e5deff51db8faec287b4acd227152246) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/instructions/xe_mi_commands.h | 4 ++++ drivers/gpu/drm/xe/xe_lrc.c | 2 +- drivers/gpu/drm/xe/xe_ring_ops.c | 7 ++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h index 167fb0f742de7b..5a47991b4b81fc 100644 --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h @@ -47,6 +47,10 @@ #define MI_LRI_FORCE_POSTED REG_BIT(12) #define MI_LRI_LEN(x) (((x) & 0xff) + 1) +#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4)) +#define MI_SRM_USE_GGTT REG_BIT(22) +#define MI_SRM_ADD_CS_OFFSET REG_BIT(19) + #define MI_FLUSH_DW __MI_INSTR(0x26) #define MI_FLUSH_DW_PROTECTED_MEM_EN REG_BIT(22) #define MI_FLUSH_DW_STORE_INDEX REG_BIT(21) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index df3ceddede070a..48e8fd85eabe35 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -684,7 +684,7 @@ static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) { - /* The start seqno is stored in the driver-defined portion of PPHWSP */ + /* This is stored in the driver-defined portion of PPHWSP */ return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; } diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index a7582b097ae678..bc1689db4cd716 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -234,13 +234,10 @@ static u32 get_ppgtt_flag(struct xe_sched_job *job) static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) { - dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT | - MI_COPY_MEM_MEM_DST_GGTT; + dw[i++] = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; + dw[i++] = RING_CTX_TIMESTAMP(0).addr; dw[i++] = xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); dw[i++] = 0; - dw[i++] = xe_lrc_ctx_timestamp_ggtt_addr(lrc); - dw[i++] = 0; - dw[i++] = MI_NOOP; return i; } From fd770e45d22f0756dfff2196d0ff99581e5d0aaf Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 9 May 2025 09:12:02 -0700 Subject: [PATCH 254/353] drm/xe: Save the gt pointer in lrc and drop the tile Save the gt pointer in the lrc so that it can used for gt based helpers. Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250509161159.2173069-7-umesh.nerlige.ramappa@intel.com (cherry picked from commit 741d3ef8b8b88fab2729ca89de1180e49bc9cef0) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_lrc.c | 4 ++-- drivers/gpu/drm/xe/xe_lrc_types.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 48e8fd85eabe35..79b43fec4f79fc 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -864,7 +864,7 @@ static void *empty_lrc_data(struct xe_hw_engine *hwe) static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) { - u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile); + u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt)); xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc)); xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); @@ -896,6 +896,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, int err; kref_init(&lrc->refcount); + lrc->gt = gt; lrc->flags = 0; lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class); if (xe_gt_has_indirect_ring_state(gt)) @@ -914,7 +915,6 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, return PTR_ERR(lrc->bo); lrc->size = lrc_size; - lrc->tile = gt_to_tile(hwe->gt); lrc->ring.size = ring_size; lrc->ring.tail = 0; lrc->ctx_timestamp = 0; diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h index 71ecb453f811a4..cd38586ae98932 100644 --- a/drivers/gpu/drm/xe/xe_lrc_types.h +++ b/drivers/gpu/drm/xe/xe_lrc_types.h @@ -25,8 +25,8 @@ struct xe_lrc { /** @size: size of lrc including any indirect ring state page */ u32 size; - /** @tile: tile which this LRC belongs to */ - struct xe_tile *tile; + /** @gt: gt which this LRC belongs to */ + struct xe_gt *gt; /** @flags: LRC flags */ #define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1 From e61ed0e71e992102dddd244bf6ef39b5526342ad Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 9 May 2025 09:12:03 -0700 Subject: [PATCH 255/353] drm/xe: Add WA BB to capture active context utilization Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the context, but only gets updated when the context switches out. In order to check how long a context has been active before it switches out, two things are required: (1) Determine if the context is running: To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in the LRC. The value chosen is 1 since 0 is the initial value when the LRC is initialized. During a query, we just check for this value to determine if the context is active. If the context switched out, it would overwrite this location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as the last part of the context restore, so reusing this LRC location will not clobber anything. (2) Calculate the time that the context has been active for: The CTX_TIMESTAMP ticks only when the context is active. If a context is active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific engine instance. Since we do not know which instance the context is running on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and store it in the PPHSWP. Using the above 2 instructions in a WA BB, capture active context utilization. v2: (Matt Brost) - This breaks TDR, fix it by saving the CTX_TIMESTAMP register "drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value" - Drop tile from LRC if using gt "drm/xe: Save the gt pointer in LRC and drop the tile" v3: - Remove helpers for bb_per_ctx_ptr (Matt) - Add define for context active value (Matt) - Use 64 bit CTX TIMESTAMP for platforms that support it. For platforms that don't, live with the rare race. (Matt, Lucas) - Convert engine id to hwe and get the MMIO value (Lucas) - Correct commit message on when WA BB runs (Lucas) v4: - s/GRAPHICS_VER(...)/xe->info.has_64bit_timestamp/ (Matt) - Drop support for active utilization on a VF (CI failure) - In xe_lrc_init ensure the lrc value is 0 to begin with (CI regression) v5: - Minor checkpatch fix - Squash into previous commit and make TDR use 32-bit time - Update code comment to match commit msg Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4532 Cc: # v6.13+ Suggested-by: Lucas De Marchi Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250509161159.2173069-8-umesh.nerlige.ramappa@intel.com (cherry picked from commit 82b98cadb01f63cdb159e596ec06866d00f8e8c7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 5 + drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 2 + drivers/gpu/drm/xe/xe_device_types.h | 2 + drivers/gpu/drm/xe/xe_exec_queue.c | 2 +- drivers/gpu/drm/xe/xe_guc_submit.c | 2 +- drivers/gpu/drm/xe/xe_lrc.c | 193 +++++++++++++++++++++-- drivers/gpu/drm/xe/xe_lrc.h | 5 +- drivers/gpu/drm/xe/xe_lrc_types.h | 5 +- drivers/gpu/drm/xe/xe_pci.c | 2 + drivers/gpu/drm/xe/xe_pci_types.h | 1 + drivers/gpu/drm/xe/xe_trace_lrc.h | 8 +- 11 files changed, 208 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index fb8ec317b6ee6c..891f928d80ce82 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -43,6 +43,10 @@ #define XEHPC_BCS8_RING_BASE 0x3ee000 #define GSCCS_RING_BASE 0x11a000 +#define ENGINE_ID(base) XE_REG((base) + 0x8c) +#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4) +#define ENGINE_CLASS_ID REG_GENMASK(2, 0) + #define RING_TAIL(base) XE_REG((base) + 0x30) #define TAIL_ADDR REG_GENMASK(20, 3) @@ -154,6 +158,7 @@ #define STOP_RING REG_BIT(8) #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8) +#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac) #define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc) #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4) diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h index 57944f90bbf6e9..994af591a2e85e 100644 --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h @@ -11,7 +11,9 @@ #define CTX_RING_TAIL (0x06 + 1) #define CTX_RING_START (0x08 + 1) #define CTX_RING_CTL (0x0a + 1) +#define CTX_BB_PER_CTX_PTR (0x12 + 1) #define CTX_TIMESTAMP (0x22 + 1) +#define CTX_TIMESTAMP_UDW (0x24 + 1) #define CTX_INDIRECT_RING_STATE (0x26 + 1) #define CTX_PDP0_UDW (0x30 + 1) #define CTX_PDP0_LDW (0x32 + 1) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 9f8667ebba853e..0482f26aa48033 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -330,6 +330,8 @@ struct xe_device { u8 has_sriov:1; /** @info.has_usm: Device has unified shared memory support */ u8 has_usm:1; + /** @info.has_64bit_timestamp: Device supports 64-bit timestamps */ + u8 has_64bit_timestamp:1; /** @info.is_dgfx: is discrete device */ u8 is_dgfx:1; /** diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 606922d9dd7302..cd9b1c32f30f80 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -830,7 +830,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { struct xe_device *xe = gt_to_xe(q->gt); struct xe_lrc *lrc; - u32 old_ts, new_ts; + u64 old_ts, new_ts; int idx; /* diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 31bc2022bfc2d8..769781d577df60 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -941,7 +941,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job) return xe_sched_invalidate_job(job, 2); } - ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]); + ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(q->lrc[0])); ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]); /* diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 79b43fec4f79fc..03bfba696b3789 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -24,6 +24,7 @@ #include "xe_hw_fence.h" #include "xe_map.h" #include "xe_memirq.h" +#include "xe_mmio.h" #include "xe_sriov.h" #include "xe_trace_lrc.h" #include "xe_vm.h" @@ -650,6 +651,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc) #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8) #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8) #define LRC_PARALLEL_PPHWSP_OFFSET 2048 +#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096 #define LRC_PPHWSP_SIZE SZ_4K u32 xe_lrc_regs_offset(struct xe_lrc *lrc) @@ -694,11 +696,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; } +static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) +{ + return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; +} + static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) { return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); } +static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) +{ + return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); +} + static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) { /* Indirect ring state page is at the very end of LRC */ @@ -726,8 +738,10 @@ DECL_MAP_ADDR_HELPERS(regs) DECL_MAP_ADDR_HELPERS(start_seqno) DECL_MAP_ADDR_HELPERS(ctx_job_timestamp) DECL_MAP_ADDR_HELPERS(ctx_timestamp) +DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw) DECL_MAP_ADDR_HELPERS(parallel) DECL_MAP_ADDR_HELPERS(indirect_ring) +DECL_MAP_ADDR_HELPERS(engine_id) #undef DECL_MAP_ADDR_HELPERS @@ -742,19 +756,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc) return __xe_lrc_ctx_timestamp_ggtt_addr(lrc); } +/** + * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address + * @lrc: Pointer to the lrc. + * + * Returns: ctx timestamp udw GGTT address + */ +u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) +{ + return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); +} + /** * xe_lrc_ctx_timestamp() - Read ctx timestamp value * @lrc: Pointer to the lrc. * * Returns: ctx timestamp value */ -u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) +u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) { struct xe_device *xe = lrc_to_xe(lrc); struct iosys_map map; + u32 ldw, udw = 0; map = __xe_lrc_ctx_timestamp_map(lrc); - return xe_map_read32(xe, &map); + ldw = xe_map_read32(xe, &map); + + if (xe->info.has_64bit_timestamp) { + map = __xe_lrc_ctx_timestamp_udw_map(lrc); + udw = xe_map_read32(xe, &map); + } + + return (u64)udw << 32 | ldw; } /** @@ -877,6 +910,65 @@ static void xe_lrc_finish(struct xe_lrc *lrc) xe_bo_unpin(lrc->bo); xe_bo_unlock(lrc->bo); xe_bo_put(lrc->bo); + xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo); +} + +/* + * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active + * context run ticks. + * @lrc: Pointer to the lrc. + * + * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the + * context, but only gets updated when the context switches out. In order to + * check how long a context has been active before it switches out, two things + * are required: + * + * (1) Determine if the context is running: + * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in + * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is + * initialized. During a query, we just check for this value to determine if the + * context is active. If the context switched out, it would overwrite this + * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as + * the last part of context restore, so reusing this LRC location will not + * clobber anything. + * + * (2) Calculate the time that the context has been active for: + * The CTX_TIMESTAMP ticks only when the context is active. If a context is + * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. + * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific + * engine instance. Since we do not know which instance the context is running + * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and + * store it in the PPHSWP. + */ +#define CONTEXT_ACTIVE 1ULL +static void xe_lrc_setup_utilization(struct xe_lrc *lrc) +{ + u32 *cmd; + + cmd = lrc->bb_per_ctx_bo->vmap.vaddr; + + *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; + *cmd++ = ENGINE_ID(0).addr; + *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); + *cmd++ = 0; + + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); + *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); + *cmd++ = 0; + *cmd++ = lower_32_bits(CONTEXT_ACTIVE); + + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); + *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); + *cmd++ = 0; + *cmd++ = upper_32_bits(CONTEXT_ACTIVE); + } + + *cmd++ = MI_BATCH_BUFFER_END; + + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, + xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1); + } #define PVC_CTX_ASID (0x2e + 1) @@ -893,6 +985,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, void *init_data = NULL; u32 arb_enable; u32 lrc_size; + u32 bo_flags; int err; kref_init(&lrc->refcount); @@ -902,22 +995,30 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, if (xe_gt_has_indirect_ring_state(gt)) lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE; + bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT | + XE_BO_FLAG_GGTT_INVALIDATE; + /* * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address * via VM bind calls. */ lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size, ttm_bo_type_kernel, - XE_BO_FLAG_VRAM_IF_DGFX(tile) | - XE_BO_FLAG_GGTT | - XE_BO_FLAG_GGTT_INVALIDATE); + bo_flags); if (IS_ERR(lrc->bo)) return PTR_ERR(lrc->bo); + lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K, + ttm_bo_type_kernel, + bo_flags); + if (IS_ERR(lrc->bb_per_ctx_bo)) { + err = PTR_ERR(lrc->bb_per_ctx_bo); + goto err_lrc_finish; + } + lrc->size = lrc_size; lrc->ring.size = ring_size; lrc->ring.tail = 0; - lrc->ctx_timestamp = 0; xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, hwe->fence_irq, hwe->name); @@ -990,7 +1091,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE)); + lrc->ctx_timestamp = 0; xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0); + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) + xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0); if (xe->info.has_asid && vm) xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid); @@ -1019,6 +1123,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, map = __xe_lrc_start_seqno_map(lrc); xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); + xe_lrc_setup_utilization(lrc); + return 0; err_lrc_finish: @@ -1238,6 +1344,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc) return __xe_lrc_parallel_map(lrc); } +/** + * xe_lrc_engine_id() - Read engine id value + * @lrc: Pointer to the lrc. + * + * Returns: context id value + */ +static u32 xe_lrc_engine_id(struct xe_lrc *lrc) +{ + struct xe_device *xe = lrc_to_xe(lrc); + struct iosys_map map; + + map = __xe_lrc_engine_id_map(lrc); + return xe_map_read32(xe, &map); +} + static int instr_dw(u32 cmd_header) { /* GFXPIPE "SINGLE_DW" opcodes are a single dword */ @@ -1684,7 +1805,7 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset; snapshot->lrc_snapshot = NULL; - snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc); + snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); return snapshot; } @@ -1784,22 +1905,74 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) kfree(snapshot); } +static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) +{ + u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); + u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); + struct xe_hw_engine *hwe; + u64 val; + + hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); + if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), + "Unexpected engine class:instance %d:%d for context utilization\n", + class, instance)) + return -1; + + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) + val = xe_mmio_read64_2x32(&hwe->gt->mmio, + RING_CTX_TIMESTAMP(hwe->mmio_base)); + else + val = xe_mmio_read32(&hwe->gt->mmio, + RING_CTX_TIMESTAMP(hwe->mmio_base)); + + *reg_ctx_ts = val; + + return 0; +} + /** * xe_lrc_update_timestamp() - Update ctx timestamp * @lrc: Pointer to the lrc. * @old_ts: Old timestamp value * * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and - * update saved value. + * update saved value. With support for active contexts, the calculation may be + * slightly racy, so follow a read-again logic to ensure that the context is + * still active before returning the right timestamp. * * Returns: New ctx timestamp value */ -u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts) +u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) { + u64 lrc_ts, reg_ts; + u32 engine_id; + *old_ts = lrc->ctx_timestamp; - lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc); + lrc_ts = xe_lrc_ctx_timestamp(lrc); + /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ + if (IS_SRIOV_VF(lrc_to_xe(lrc))) { + lrc->ctx_timestamp = lrc_ts; + goto done; + } + + if (lrc_ts == CONTEXT_ACTIVE) { + engine_id = xe_lrc_engine_id(lrc); + if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) + lrc->ctx_timestamp = reg_ts; + + /* read lrc again to ensure context is still active */ + lrc_ts = xe_lrc_ctx_timestamp(lrc); + } + + /* + * If context switched out, just use the lrc_ts. Note that this needs to + * be a separate if condition. + */ + if (lrc_ts != CONTEXT_ACTIVE) + lrc->ctx_timestamp = lrc_ts; +done: trace_xe_lrc_update_timestamp(lrc, *old_ts); return lrc->ctx_timestamp; diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h index 0b40f349ab95d0..eb6e8de8c939e9 100644 --- a/drivers/gpu/drm/xe/xe_lrc.h +++ b/drivers/gpu/drm/xe/xe_lrc.h @@ -120,7 +120,8 @@ void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot); u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc); -u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc); +u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc); +u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc); u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc); u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc); @@ -136,6 +137,6 @@ u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc); * * Returns the current LRC timestamp */ -u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts); +u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts); #endif diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h index cd38586ae98932..ae24cf6f8dd998 100644 --- a/drivers/gpu/drm/xe/xe_lrc_types.h +++ b/drivers/gpu/drm/xe/xe_lrc_types.h @@ -52,7 +52,10 @@ struct xe_lrc { struct xe_hw_fence_ctx fence_ctx; /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */ - u32 ctx_timestamp; + u64 ctx_timestamp; + + /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */ + struct xe_bo *bb_per_ctx_bo; }; struct xe_lrc_snapshot; diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 818f023166d5d3..f4d108dc49b1b1 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -140,6 +140,7 @@ static const struct xe_graphics_desc graphics_xelpg = { .has_indirect_ring_state = 1, \ .has_range_tlb_invalidation = 1, \ .has_usm = 1, \ + .has_64bit_timestamp = 1, \ .va_bits = 48, \ .vm_max_level = 4, \ .hw_engine_mask = \ @@ -668,6 +669,7 @@ static int xe_info_init(struct xe_device *xe, xe->info.has_range_tlb_invalidation = graphics_desc->has_range_tlb_invalidation; xe->info.has_usm = graphics_desc->has_usm; + xe->info.has_64bit_timestamp = graphics_desc->has_64bit_timestamp; for_each_remote_tile(tile, xe, id) { int err; diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h index e9b9bbc138d377..ca6b10d3557349 100644 --- a/drivers/gpu/drm/xe/xe_pci_types.h +++ b/drivers/gpu/drm/xe/xe_pci_types.h @@ -21,6 +21,7 @@ struct xe_graphics_desc { u8 has_indirect_ring_state:1; u8 has_range_tlb_invalidation:1; u8 has_usm:1; + u8 has_64bit_timestamp:1; }; struct xe_media_desc { diff --git a/drivers/gpu/drm/xe/xe_trace_lrc.h b/drivers/gpu/drm/xe/xe_trace_lrc.h index 5c669a0b21808e..d525cbee1e3411 100644 --- a/drivers/gpu/drm/xe/xe_trace_lrc.h +++ b/drivers/gpu/drm/xe/xe_trace_lrc.h @@ -19,12 +19,12 @@ #define __dev_name_lrc(lrc) dev_name(gt_to_xe((lrc)->fence_ctx.gt)->drm.dev) TRACE_EVENT(xe_lrc_update_timestamp, - TP_PROTO(struct xe_lrc *lrc, uint32_t old), + TP_PROTO(struct xe_lrc *lrc, uint64_t old), TP_ARGS(lrc, old), TP_STRUCT__entry( __field(struct xe_lrc *, lrc) - __field(u32, old) - __field(u32, new) + __field(u64, old) + __field(u64, new) __string(name, lrc->fence_ctx.name) __string(device_id, __dev_name_lrc(lrc)) ), @@ -36,7 +36,7 @@ TRACE_EVENT(xe_lrc_update_timestamp, __assign_str(name); __assign_str(device_id); ), - TP_printk("lrc=:%p lrc->name=%s old=%u new=%u device_id:%s", + TP_printk("lrc=:%p lrc->name=%s old=%llu new=%llu device_id:%s", __entry->lrc, __get_str(name), __entry->old, __entry->new, __get_str(device_id)) From f121f7bb072b3824b10cb686c0c330771a60ef86 Mon Sep 17 00:00:00 2001 From: Jethro Donaldson Date: Thu, 15 May 2025 01:23:23 +1200 Subject: [PATCH 256/353] smb: client: fix memory leak during error handling for POSIX mkdir The response buffer for the CREATE request handled by smb311_posix_mkdir() is leaked on the error path (goto err_free_rsp_buf) because the structure pointer *rsp passed to free_rsp_buf() is not assigned until *after* the error condition is checked. As *rsp is initialised to NULL, free_rsp_buf() becomes a no-op and the leak is instead reported by __kmem_cache_shutdown() upon subsequent rmmod of cifs.ko if (and only if) the error path has been hit. Pass rsp_iov.iov_base to free_rsp_buf() instead, similar to the code in other functions in smb2pdu.c for which *rsp is assigned late. Cc: stable@vger.kernel.org Signed-off-by: Jethro Donaldson Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 0b35816d551f7a..4e28632b5fd661 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2968,7 +2968,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* Eventually save off posix specific response info and timestamps */ err_free_rsp_buf: - free_rsp_buf(resp_buftype, rsp); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); kfree(pc_buf); err_free_req: cifs_small_buf_release(req); From 55f3d5911a2bc88d0223d9a67f5ec620bc04bf51 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 12 May 2025 14:58:36 -0300 Subject: [PATCH 257/353] smb: client: fix zero rsize error messages cifs_prepare_read() might be called with a disconnected channel, where TCP_Server_Info::max_read is set to zero due to reconnect, so calling ->negotiate_rize() will set @rsize to default min IO size (64KiB) and then logging CIFS: VFS: SMB: Zero rsize calculated, using minimum value 65536 If the reconnect happens in cifsd thread, cifs_renegotiate_iosize() will end up being called and then @rsize set to the expected value. Since we can't rely on the value of @server->max_read by the time we call cifs_prepare_read(), try to ->negotiate_rize() only if @cifs_sb->ctx->rsize is zero. Reported-by: Steve French Fixes: c59f7c9661b9 ("smb: client: ensure aligned IO sizes") Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 851b74f557c10e..950aa4f912f5cd 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -160,8 +160,10 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; - cifs_negotiate_rsize(server, cifs_sb->ctx, - tlink_tcon(req->cfile->tlink)); + if (cifs_sb->ctx->rsize == 0) { + cifs_negotiate_rsize(server, cifs_sb->ctx, + tlink_tcon(req->cfile->tlink)); + } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &size, &rdata->credits); From afc896c20144e3d4e27f05580262c6770c10e00a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 13 May 2025 19:56:41 +0200 Subject: [PATCH 258/353] i2c: designware: Fix an error handling path in i2c_dw_pci_probe() If navi_amd_register_client() fails, the previous i2c_dw_probe() call should be undone by a corresponding i2c_del_adapter() call, as already done in the remove function. Fixes: 17631e8ca2d3 ("i2c: designware: Add driver support for AMD NAVI GPU") Signed-off-by: Christophe JAILLET Cc: # v5.13+ Acked-by: Jarkko Nikula Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/fcd9651835a32979df8802b2db9504c523a8ebbb.1747158983.git.christophe.jaillet@wanadoo.fr --- drivers/i2c/busses/i2c-designware-pcidrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 8e0267c7cc294e..f21f9877c04047 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -278,9 +278,11 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, if ((dev->flags & MODEL_MASK) == MODEL_AMD_NAVI_GPU) { dev->slave = i2c_new_ccgx_ucsi(&dev->adapter, dev->irq, &dgpu_node); - if (IS_ERR(dev->slave)) + if (IS_ERR(dev->slave)) { + i2c_del_adapter(&dev->adapter); return dev_err_probe(device, PTR_ERR(dev->slave), "register UCSI failed\n"); + } } pm_runtime_set_autosuspend_delay(device, 1000); From 1801ef6658f593cd26ea93d880e0bf63af2ea90b Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Wed, 14 May 2025 22:17:43 +0800 Subject: [PATCH 259/353] LoongArch: Prevent cond_resched() occurring within kernel-fpu When CONFIG_PREEMPT_COUNT is not configured (i.e. CONFIG_PREEMPT_NONE/ CONFIG_PREEMPT_VOLUNTARY), preempt_disable() / preempt_enable() merely acts as a barrier(). However, in these cases cond_resched() can still trigger a context switch and modify the CSR.EUEN, resulting in do_fpu() exception being activated within the kernel-fpu critical sections, as demonstrated in the following path: dcn32_calculate_wm_and_dlg() DC_FP_START() dcn32_calculate_wm_and_dlg_fpu() dcn32_find_dummy_latency_index_for_fw_based_mclk_switch() dcn32_internal_validate_bw() dcn32_enable_phantom_stream() dc_create_stream_for_sink() kzalloc(GFP_KERNEL) __kmem_cache_alloc_node() __cond_resched() DC_FP_END() This patch is similar to commit d02198550423a0b (x86/fpu: Improve crypto performance by making kernel-mode FPU reliably usable in softirqs). It uses local_bh_disable() instead of preempt_disable() for non-RT kernels so it can avoid the cond_resched() issue, and also extend the kernel-fpu application scenarios to the softirq context. Cc: stable@vger.kernel.org Signed-off-by: Tianyang Zhang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/kfpu.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c index ec5b28e570c963..4c476904227f95 100644 --- a/arch/loongarch/kernel/kfpu.c +++ b/arch/loongarch/kernel/kfpu.c @@ -18,11 +18,28 @@ static unsigned int euen_mask = CSR_EUEN_FPEN; static DEFINE_PER_CPU(bool, in_kernel_fpu); static DEFINE_PER_CPU(unsigned int, euen_current); +static inline void fpregs_lock(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_bh_disable(); +} + +static inline void fpregs_unlock(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_bh_enable(); +} + void kernel_fpu_begin(void) { unsigned int *euen_curr; - preempt_disable(); + if (!irqs_disabled()) + fpregs_lock(); WARN_ON(this_cpu_read(in_kernel_fpu)); @@ -73,7 +90,8 @@ void kernel_fpu_end(void) this_cpu_write(in_kernel_fpu, false); - preempt_enable(); + if (!irqs_disabled()) + fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); From ef1e4bdfa96ad92d287f8cc87dbd9e461973a7f6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 14 May 2025 22:17:43 +0800 Subject: [PATCH 260/353] LoongArch: Fix MAX_REG_OFFSET calculation Fix MAX_REG_OFFSET calculation, make it point to the last register in 'struct pt_regs' and not to the marker itself, which could allow regs_get_register() to return an invalid offset. Cc: stable@vger.kernel.org Fixes: 803b0fc5c3f2baa6e5 ("LoongArch: Add process management") Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index a5b63c84f8541a..e5d21e836d993c 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -55,7 +55,7 @@ static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long v /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last)) +#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last) - sizeof(unsigned long)) /** * regs_get_register() - get register value from its offset From 6b381205720c53702e50854d80cbb7ba1f3a3c12 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 14 May 2025 22:17:52 +0800 Subject: [PATCH 261/353] LoongArch: Move __arch_cpu_idle() to .cpuidle.text section Now arch_cpu_idle() is annotated with __cpuidle which means it is in the .cpuidle.text section, but __arch_cpu_idle() isn't. Thus, fix the missing .cpuidle.text section assignment for __arch_cpu_idle() in order to correct backtracing with nmi_backtrace(). The principle is similar to the commit 97c8580e85cf81c ("MIPS: Annotate cpu_wait implementations with __cpuidle") Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/kernel/genex.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 4f09121417818d..733a7665e434dc 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -16,6 +16,7 @@ #include #include + .section .cpuidle.text, "ax" .align 5 SYM_FUNC_START(__arch_cpu_idle) /* start of idle interrupt region */ @@ -31,14 +32,16 @@ SYM_FUNC_START(__arch_cpu_idle) */ idle 0 /* end of idle interrupt region */ -1: jr ra +idle_exit: + jr ra SYM_FUNC_END(__arch_cpu_idle) + .previous SYM_CODE_START(handle_vint) UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL - la_abs t1, 1b + la_abs t1, idle_exit LONG_L t0, sp, PT_ERA /* 3 instructions idle interrupt region */ ori t0, t0, 0b1100 From e0d7d1181a3b48f424ed2260d716a00186c18ab3 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 14 May 2025 22:17:52 +0800 Subject: [PATCH 262/353] LoongArch: Save and restore CSR.CNTC for hibernation Save and restore CSR.CNTC for hibernation which is similar to suspend. For host this is unnecessary because sched clock is ensured continuous, but for kvm guest sched clock isn't enough because rdtime.d should also be continuous. Host::rdtime.d = Host::CSR.CNTC + counter Guest::rdtime.d = Host::CSR.CNTC + Host::CSR.GCNTC + Guest::CSR.CNTC + counter so, Guest::rdtime.d = Host::rdtime.d + Host::CSR.GCNTC + Guest::CSR.CNTC To ensure Guest::rdtime.d continuous, Host::rdtime.d should be at first continuous, while Host::CSR.GCNTC / Guest::CSR.CNTC is maintained by KVM. Cc: stable@vger.kernel.org Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/time.c | 2 +- arch/loongarch/power/hibernate.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index e2d3bfeb636643..bc75a3a69fc8d5 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -111,7 +111,7 @@ static unsigned long __init get_loops_per_jiffy(void) return lpj; } -static long init_offset __nosavedata; +static long init_offset; void save_counter(void) { diff --git a/arch/loongarch/power/hibernate.c b/arch/loongarch/power/hibernate.c index 1e0590542f987c..e7b7346592cb2a 100644 --- a/arch/loongarch/power/hibernate.c +++ b/arch/loongarch/power/hibernate.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,7 @@ struct pt_regs saved_regs; void save_processor_state(void) { + save_counter(); saved_crmd = csr_read32(LOONGARCH_CSR_CRMD); saved_prmd = csr_read32(LOONGARCH_CSR_PRMD); saved_euen = csr_read32(LOONGARCH_CSR_EUEN); @@ -26,6 +28,7 @@ void save_processor_state(void) void restore_processor_state(void) { + sync_counter(); csr_write32(saved_crmd, LOONGARCH_CSR_CRMD); csr_write32(saved_prmd, LOONGARCH_CSR_PRMD); csr_write32(saved_euen, LOONGARCH_CSR_EUEN); From 98d1c69cb0e6bfb5d1233799cc39cb4d5eea7bad Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 14 May 2025 22:18:10 +0800 Subject: [PATCH 263/353] LoongArch: uprobes: Remove user_{en,dis}able_single_step() When executing the "perf probe" and "perf stat" test cases about some cryptographic algorithm, the output shows that "Trace/breakpoint trap". This is because it uses the software singlestep breakpoint for uprobes on LoongArch, and no need to use the hardware singlestep. So just remove the related function call to user_{en,dis}able_single_step() for uprobes on LoongArch. How to reproduce: Please make sure CONFIG_UPROBE_EVENTS is set and openssl supports sm2 algorithm, then execute the following command. cd tools/perf && make ./perf probe -x /usr/lib64/libcrypto.so BN_mod_mul_montgomery ./perf stat -e probe_libcrypto:BN_mod_mul_montgomery openssl speed sm2 Cc: stable@vger.kernel.org Fixes: 19bc6cb64092 ("LoongArch: Add uprobes support") Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/uprobes.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/loongarch/kernel/uprobes.c b/arch/loongarch/kernel/uprobes.c index 87abc7137b738e..0ab9d8d631c413 100644 --- a/arch/loongarch/kernel/uprobes.c +++ b/arch/loongarch/kernel/uprobes.c @@ -42,7 +42,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; instruction_pointer_set(regs, utask->xol_vaddr); - user_enable_single_step(current); return 0; } @@ -59,8 +58,6 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE); - user_disable_single_step(current); - return 0; } @@ -70,7 +67,6 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) current->thread.trap_nr = utask->autask.saved_trap_nr; instruction_pointer_set(regs, utask->vaddr); - user_disable_single_step(current); } bool arch_uprobe_xol_was_trapped(struct task_struct *t) From ee8f9f432fe4041e23da42ba57f3f190ddf2b870 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 14 May 2025 22:18:10 +0800 Subject: [PATCH 264/353] LoongArch: uprobes: Remove redundant code about resume_era arch_uprobe_skip_sstep() returns true if instruction was emulated, that is to say, there is no need to single step for the emulated instructions. regs->csr_era will point to the destination address directly after the exception, so the resume_era related code is redundant, just remove them. Cc: stable@vger.kernel.org Fixes: 19bc6cb64092 ("LoongArch: Add uprobes support") Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/uprobes.h | 1 - arch/loongarch/kernel/uprobes.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/loongarch/include/asm/uprobes.h b/arch/loongarch/include/asm/uprobes.h index 99a0d198927f8b..025fc3f0a1028d 100644 --- a/arch/loongarch/include/asm/uprobes.h +++ b/arch/loongarch/include/asm/uprobes.h @@ -15,7 +15,6 @@ typedef u32 uprobe_opcode_t; #define UPROBE_XOLBP_INSN __emit_break(BRK_UPROBE_XOLBP) struct arch_uprobe { - unsigned long resume_era; u32 insn[2]; u32 ixol[2]; bool simulate; diff --git a/arch/loongarch/kernel/uprobes.c b/arch/loongarch/kernel/uprobes.c index 0ab9d8d631c413..6022eb0f71dbce 100644 --- a/arch/loongarch/kernel/uprobes.c +++ b/arch/loongarch/kernel/uprobes.c @@ -52,11 +52,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); current->thread.trap_nr = utask->autask.saved_trap_nr; - - if (auprobe->simulate) - instruction_pointer_set(regs, auprobe->resume_era); - else - instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE); + instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE); return 0; } @@ -86,7 +82,6 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) insn.word = auprobe->insn[0]; arch_simulate_insn(insn, regs); - auprobe->resume_era = regs->csr_era; return true; } From 9238ea7c40964d4654d56d87d99403e2f94dd0e3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 8 May 2025 16:44:52 +0300 Subject: [PATCH 265/353] perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq Currently, using PEBS-via-PT with a sample frequency instead of a sample period, causes a segfault. For example: BUG: kernel NULL pointer dereference, address: 0000000000000195 ? __die_body.cold+0x19/0x27 ? page_fault_oops+0xca/0x290 ? exc_page_fault+0x7e/0x1b0 ? asm_exc_page_fault+0x26/0x30 ? intel_pmu_pebs_event_update_no_drain+0x40/0x60 ? intel_pmu_pebs_event_update_no_drain+0x32/0x60 intel_pmu_drain_pebs_icl+0x333/0x350 handle_pmi_common+0x272/0x3c0 intel_pmu_handle_irq+0x10a/0x2e0 perf_event_nmi_handler+0x2a/0x50 That happens because intel_pmu_pebs_event_update_no_drain() assumes all the pebs_enabled bits represent counter indexes, which is not always the case. In this particular case, bits 60 and 61 are set for PEBS-via-PT purposes. The behaviour of PEBS-via-PT with sample frequency is questionable because although a PMI is generated (PEBS_PMI_AFTER_EACH_RECORD), the period is not adjusted anyway. Putting that aside, fix intel_pmu_pebs_event_update_no_drain() by passing the mask of counter bits instead of 'size'. Note, prior to the Fixes commit, 'size' would be limited to the maximum counter index, so the issue was not hit. Fixes: 722e42e45c2f1 ("perf/x86: Support counter mask") Signed-off-by: Adrian Hunter Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250508134452.73960-1-adrian.hunter@intel.com --- arch/x86/events/intel/ds.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9b20acc0e932e2..8d86e91bd5e5c3 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2465,8 +2465,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ setup_pebs_fixed_sample_data); } -static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask) { + u64 pebs_enabled = cpuc->pebs_enabled & mask; struct perf_event *event; int bit; @@ -2477,7 +2478,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int * It needs to call intel_pmu_save_and_restart_reload() to * update the event->count for this case. */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -2512,7 +2513,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2626,7 +2627,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } From b243d0586885f6f6671643dd52620b3e21212de8 Mon Sep 17 00:00:00 2001 From: Seongman Lee Date: Sun, 11 May 2025 18:23:28 +0900 Subject: [PATCH 266/353] x86/sev: Fix operator precedence in GHCB_MSR_VMPL_REQ_LEVEL macro The GHCB_MSR_VMPL_REQ_LEVEL macro lacked parentheses around the bitmask expression, causing the shift operation to bind too early. As a result, when requesting VMPL1 (e.g., GHCB_MSR_VMPL_REQ_LEVEL(1)), incorrect values such as 0x000000016 were generated instead of the intended 0x100000016 (the requested VMPL level is specified in GHCBData[39:32]). Fix the precedence issue by grouping the masked value before applying the shift. [ bp: Massage commit message. ] Fixes: 34ff65901735 ("x86/sev: Use kernel provided SVSM Calling Areas") Signed-off-by: Seongman Lee Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250511092329.12680-1-cloudlee1719@gmail.com --- arch/x86/include/asm/sev-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index acb85b9346d84b..0020d77a080001 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -116,7 +116,7 @@ enum psc_op { #define GHCB_MSR_VMPL_REQ 0x016 #define GHCB_MSR_VMPL_REQ_LEVEL(v) \ /* GHCBData[39:32] */ \ - (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \ + ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \ /* GHCBDdata[11:0] */ \ GHCB_MSR_VMPL_REQ) From 1c843ebc484540085d4ef2dc425af8e717f438f3 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Tue, 18 Mar 2025 00:40:31 +0000 Subject: [PATCH 267/353] MAINTAINERS: Update Alexey Makhalov's email address Fix a typo in an email address. Closes: https://lore.kernel.org/all/20240925-rational-succinct-vulture-cca9fb@lemur/T/ Reported-by: Konstantin Ryabitsev Reported-by: Juergen Gross Signed-off-by: Alexey Makhalov Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250318004031.2703923-1-alexey.makhalov@broadcom.com --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 144ed02146e3ab..de9b08096f624c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18439,7 +18439,7 @@ F: include/uapi/linux/ppdev.h PARAVIRT_OPS INTERFACE M: Juergen Gross R: Ajay Kaher -R: Alexey Makhalov +R: Alexey Makhalov R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org @@ -25930,7 +25930,7 @@ F: drivers/misc/vmw_balloon.c VMWARE HYPERVISOR INTERFACE M: Ajay Kaher -M: Alexey Makhalov +M: Alexey Makhalov R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org @@ -25958,7 +25958,7 @@ F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER M: Nick Shi R: Ajay Kaher -R: Alexey Makhalov +R: Alexey Makhalov R: Broadcom internal kernel review list L: netdev@vger.kernel.org S: Supported From 1da9ba06ac654419450e26f673c73e215d75b52f Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 28 Apr 2025 21:41:51 +0000 Subject: [PATCH 268/353] x86/sev: Do not touch VMSA pages during SNP guest memory kdump When kdump is running makedumpfile to generate vmcore and dump SNP guest memory it touches the VMSA page of the vCPU executing kdump. It then results in unrecoverable #NPF/RMP faults as the VMSA page is marked busy/in-use when the vCPU is running and subsequently a causes guest softlockup/hang. Additionally, other APs may be halted in guest mode and their VMSA pages are marked busy and touching these VMSA pages during guest memory dump will also cause #NPF. Issue AP_DESTROY GHCB calls on other APs to ensure they are kicked out of guest mode and then clear the VMSA bit on their VMSA pages. If the vCPU running kdump is an AP, mark it's VMSA page as offline to ensure that makedumpfile excludes that page while dumping guest memory. Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec") Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pankaj Gupta Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250428214151.155464-1-Ashish.Kalra@amd.com --- arch/x86/coco/sev/core.c | 244 +++++++++++++++++++++++++-------------- 1 file changed, 158 insertions(+), 86 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b0c1a7a574974a..41060ba41b5c43 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -959,6 +959,102 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end) set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } +static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id) +{ + bool create = event != SVM_VMGEXIT_AP_DESTROY; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret = 0; + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + + if (create) + ghcb_set_rax(ghcb, vmsa->sev_features); + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, + ((u64)apic_id << 32) | + ((u64)snp_vmpl << 16) | + event); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY")); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) +{ + int ret; + + if (snp_vmpl) { + struct svsm_call call = {}; + unsigned long flags; + + local_irq_save(flags); + + call.caa = this_cpu_read(svsm_caa); + call.rcx = __pa(va); + + if (make_vmsa) { + /* Protocol 0, Call ID 2 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); + call.rdx = __pa(caa); + call.r8 = apic_id; + } else { + /* Protocol 0, Call ID 3 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); + } + + ret = svsm_perform_call_protocol(&call); + + local_irq_restore(flags); + } else { + /* + * If the kernel runs at VMPL0, it can change the VMSA + * bit for a page using the RMPADJUST instruction. + * However, for the instruction to succeed it must + * target the permissions of a lesser privileged (higher + * numbered) VMPL level, so use VMPL1. + */ + u64 attrs = 1; + + if (make_vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); + } + + return ret; +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) +{ + int err; + + err = snp_set_vmsa(vmsa, NULL, apic_id, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + static void set_pte_enc(pte_t *kpte, int level, void *va) { struct pte_enc_desc d = { @@ -1055,6 +1151,65 @@ void snp_kexec_begin(void) pr_warn("Failed to stop shared<->private conversions\n"); } +/* + * Shutdown all APs except the one handling kexec/kdump and clearing + * the VMSA tag on AP's VMSA pages as they are not being used as + * VMSA page anymore. + */ +static void shutdown_all_aps(void) +{ + struct sev_es_save_area *vmsa; + int apic_id, this_cpu, cpu; + + this_cpu = get_cpu(); + + /* + * APs are already in HLT loop when enc_kexec_finish() callback + * is invoked. + */ + for_each_present_cpu(cpu) { + vmsa = per_cpu(sev_vmsa, cpu); + + /* + * The BSP or offlined APs do not have guest allocated VMSA + * and there is no need to clear the VMSA tag for this page. + */ + if (!vmsa) + continue; + + /* + * Cannot clear the VMSA tag for the currently running vCPU. + */ + if (this_cpu == cpu) { + unsigned long pa; + struct page *p; + + pa = __pa(vmsa); + /* + * Mark the VMSA page of the running vCPU as offline + * so that is excluded and not touched by makedumpfile + * while generating vmcore during kdump. + */ + p = pfn_to_online_page(pa >> PAGE_SHIFT); + if (p) + __SetPageOffline(p); + continue; + } + + apic_id = cpuid_to_apicid[cpu]; + + /* + * Issue AP destroy to ensure AP gets kicked out of guest mode + * to allow using RMPADJUST to remove the VMSA tag on it's + * VMSA page. + */ + vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id); + snp_cleanup_vmsa(vmsa, apic_id); + } + + put_cpu(); +} + void snp_kexec_finish(void) { struct sev_es_runtime_data *data; @@ -1069,6 +1224,8 @@ void snp_kexec_finish(void) if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; + shutdown_all_aps(); + unshare_all_memory(); /* @@ -1090,51 +1247,6 @@ void snp_kexec_finish(void) } } -static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) -{ - int ret; - - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - - local_irq_save(flags); - - call.caa = this_cpu_read(svsm_caa); - call.rcx = __pa(va); - - if (make_vmsa) { - /* Protocol 0, Call ID 2 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); - call.rdx = __pa(caa); - call.r8 = apic_id; - } else { - /* Protocol 0, Call ID 3 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); - } - - ret = svsm_perform_call_protocol(&call); - - local_irq_restore(flags); - } else { - /* - * If the kernel runs at VMPL0, it can change the VMSA - * bit for a page using the RMPADJUST instruction. - * However, for the instruction to succeed it must - * target the permissions of a lesser privileged (higher - * numbered) VMPL level, so use VMPL1. - */ - u64 attrs = 1; - - if (make_vmsa) - attrs |= RMPADJUST_VMSA_PAGE_BIT; - - ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); - } - - return ret; -} - #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) #define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) #define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) @@ -1166,24 +1278,10 @@ static void *snp_alloc_vmsa_page(int cpu) return page_address(p + 1); } -static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) -{ - int err; - - err = snp_set_vmsa(vmsa, NULL, apic_id, false); - if (err) - pr_err("clear VMSA page failed (%u), leaking page\n", err); - else - free_page((unsigned long)vmsa); -} - static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) { struct sev_es_save_area *cur_vmsa, *vmsa; - struct ghcb_state state; struct svsm_ca *caa; - unsigned long flags; - struct ghcb *ghcb; u8 sipi_vector; int cpu, ret; u64 cr4; @@ -1297,33 +1395,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) } /* Issue VMGEXIT AP Creation NAE event */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_rax(ghcb, vmsa->sev_features); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); - ghcb_set_sw_exit_info_1(ghcb, - ((u64)apic_id << 32) | - ((u64)snp_vmpl << 16) | - SVM_VMGEXIT_AP_CREATE); - ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (!ghcb_sw_exit_info_1_is_valid(ghcb) || - lower_32_bits(ghcb->save.sw_exit_info_1)) { - pr_err("SNP AP Creation error\n"); - ret = -EINVAL; - } - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Perform cleanup if there was an error */ + ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id); if (ret) { snp_cleanup_vmsa(vmsa, apic_id); vmsa = NULL; From c08a1004366470dc95b93b5ff563ccb539379f05 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 6 May 2025 18:35:29 +0000 Subject: [PATCH 269/353] x86/sev: Make sure pages are not skipped during kdump When shared pages are being converted to private during kdump, additional checks are performed. They include handling the case of a GHCB page being contained within a huge page. Currently, this check incorrectly skips a page just below the GHCB page from being transitioned back to private during kdump preparation. This skipped page causes a 0x404 #VC exception when it is accessed later while dumping guest memory for vmcore generation. Correct the range to be checked for GHCB contained in a huge page. Also, ensure that the skipped huge page containing the GHCB page is transitioned back to private by applying the correct address mask later when changing GHCBs to private at end of kdump preparation. [ bp: Massage commit message. ] Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec") Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250506183529.289549-1-Ashish.Kalra@amd.com --- arch/x86/coco/sev/core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 41060ba41b5c43..36beaac713c128 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1101,7 +1101,8 @@ static void unshare_all_memory(void) data = per_cpu(runtime_data, cpu); ghcb = (unsigned long)&data->ghcb_page; - if (addr <= ghcb && ghcb <= addr + size) { + /* Handle the case of a huge page containing the GHCB page */ + if (addr <= ghcb && ghcb < addr + size) { skipped_addr = true; break; } @@ -1213,8 +1214,8 @@ static void shutdown_all_aps(void) void snp_kexec_finish(void) { struct sev_es_runtime_data *data; + unsigned long size, addr; unsigned int level, cpu; - unsigned long size; struct ghcb *ghcb; pte_t *pte; @@ -1242,8 +1243,10 @@ void snp_kexec_finish(void) ghcb = &data->ghcb_page; pte = lookup_address((unsigned long)ghcb, &level); size = page_level_size(level); - set_pte_enc(pte, level, (void *)ghcb); - snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE)); + /* Handle the case of a huge page containing the GHCB page */ + addr = (unsigned long)ghcb & page_level_mask(level); + set_pte_enc(pte, level, (void *)addr); + snp_set_memory_private(addr, (size / PAGE_SIZE)); } } From 577bfd78655d0051c628e22fa67dbb74b896d54e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 May 2025 20:48:57 +0000 Subject: [PATCH 270/353] x86/CPU/AMD: Add X86_FEATURE_ZEN6 Add a synthetic feature flag for Zen6. [ bp: Move the feature flag to a free slot and avoid future merge conflicts from incoming stuff. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250513204857.3376577-1-yazen.ghannam@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/amd.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 39e61212ac9a91..30144ef9ef02fb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -75,7 +75,7 @@ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ #define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ #define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ -/* Free ( 3*32+ 6) */ +#define X86_FEATURE_ZEN6 ( 3*32+ 6) /* CPU based on Zen6 microarchitecture */ /* Free ( 3*32+ 7) */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2b36379ff675dc..4e06baab40bb3f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -472,6 +472,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; + case 0x50 ... 0x5f: + case 0x90 ... 0xaf: + case 0xc0 ... 0xcf: + setup_force_cpu_cap(X86_FEATURE_ZEN6); + break; default: goto warn; } From 6c5de3f78fc8f2f3bd36270c70ab77b108f05265 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 16 May 2025 11:08:10 +0200 Subject: [PATCH 271/353] x86/mm: Remove duplicated word in warning message Commit bbeb69ce3013 ("x86/mm: Remove CONFIG_HIGHMEM64G support") introduces a new warning message MSG_HIGHMEM_TRIMMED, which accidentally introduces a duplicated 'for for' in the warning message. Remove this duplicated word. This was noticed while reviewing for references to obsolete kernel build config options. Fixes: bbeb69ce3013 ("x86/mm: Remove CONFIG_HIGHMEM64G support") Signed-off-by: Lukas Bulwahn Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: https://lore.kernel.org/r/20250516090810.556623-1-lukas.bulwahn@redhat.com --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7fa3965dcef158..bb8d99e717b9e7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -566,7 +566,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n" + "Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: From 1abed418b9b24e27a367a71fd6492f18e82ec9a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 May 2025 18:28:15 +0100 Subject: [PATCH 272/353] irqchip: Drop MSI_CHIP_FLAG_SET_ACK from unsuspecting MSI drivers Commit 1c000dcaad2be ("irqchip/irq-msi-lib: Optionally set default irq_eoi()/irq_ack()") added blanket MSI_CHIP_FLAG_SET_ACK flags, irrespective of whether the underlying irqchip required it or not. Drop it from a number of drivers that do not require it. Fixes: 1c000dcaad2be ("irqchip/irq-msi-lib: Optionally set default irq_eoi()/irq_ack()") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250513172819.2216709-6-maz@kernel.org --- drivers/irqchip/irq-gic-v2m.c | 2 +- drivers/irqchip/irq-gic-v3-its-msi-parent.c | 2 +- drivers/irqchip/irq-gic-v3-mbi.c | 2 +- drivers/irqchip/irq-mvebu-gicp.c | 2 +- drivers/irqchip/irq-mvebu-odmi.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index dc98c39d2b2034..cc6a6c1585d20b 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -252,7 +252,7 @@ static void __init gicv2m_teardown(void) static struct msi_parent_ops gicv2m_msi_parent_ops = { .supported_flags = GICV2M_MSI_FLAGS_SUPPORTED, .required_flags = GICV2M_MSI_FLAGS_REQUIRED, - .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "GICv2m-", diff --git a/drivers/irqchip/irq-gic-v3-its-msi-parent.c b/drivers/irqchip/irq-gic-v3-its-msi-parent.c index bdb04c8081480d..c5a7eb1c041959 100644 --- a/drivers/irqchip/irq-gic-v3-its-msi-parent.c +++ b/drivers/irqchip/irq-gic-v3-its-msi-parent.c @@ -203,7 +203,7 @@ static bool its_init_dev_msi_info(struct device *dev, struct irq_domain *domain, const struct msi_parent_ops gic_v3_its_msi_parent_ops = { .supported_flags = ITS_MSI_FLAGS_SUPPORTED, .required_flags = ITS_MSI_FLAGS_REQUIRED, - .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "ITS-", diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 34e9ca77a8c368..647b18e24e0c2d 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -197,7 +197,7 @@ static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, static const struct msi_parent_ops gic_v3_mbi_msi_parent_ops = { .supported_flags = MBI_MSI_FLAGS_SUPPORTED, .required_flags = MBI_MSI_FLAGS_REQUIRED, - .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "MBI-", diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c index d67f93f6d75056..60b976286636f2 100644 --- a/drivers/irqchip/irq-mvebu-gicp.c +++ b/drivers/irqchip/irq-mvebu-gicp.c @@ -161,7 +161,7 @@ static const struct irq_domain_ops gicp_domain_ops = { static const struct msi_parent_ops gicp_msi_parent_ops = { .supported_flags = GICP_MSI_FLAGS_SUPPORTED, .required_flags = GICP_MSI_FLAGS_REQUIRED, - .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "GICP-", diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index 28f7e81df94f0a..54f6f081157339 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -157,7 +157,7 @@ static const struct irq_domain_ops odmi_domain_ops = { static const struct msi_parent_ops odmi_msi_parent_ops = { .supported_flags = ODMI_MSI_FLAGS_SUPPORTED, .required_flags = ODMI_MSI_FLAGS_REQUIRED, - .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "ODMI-", From c6bfbfe5c41f13256b2d02987f3cca4fbd9d66da Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 14 May 2025 10:13:20 -0700 Subject: [PATCH 273/353] irqchip/riscv-imsic: Start local sync timer on correct CPU When starting the local sync timer to synchronize the state of a remote CPU it should be added on the CPU to be synchronized, not the initiating CPU. This results in interrupt delivery being delayed until the timer eventually runs (due to another mask/unmask/migrate operation) on the target CPU. Fixes: 0f67911e821c ("irqchip/riscv-imsic: Separate next and previous pointers in IMSIC vector") Signed-off-by: Andrew Bresticker Signed-off-by: Thomas Gleixner Reviewed-by: Anup Patel Link: https://lore.kernel.org/all/20250514171320.3494917-1-abrestic@rivosinc.com --- drivers/irqchip/irq-riscv-imsic-state.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index bdf5cd2037f289..62f76950a113b5 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -208,17 +208,17 @@ static bool __imsic_local_sync(struct imsic_local_priv *lpriv) } #ifdef CONFIG_SMP -static void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +static void __imsic_local_timer_start(struct imsic_local_priv *lpriv, unsigned int cpu) { lockdep_assert_held(&lpriv->lock); if (!timer_pending(&lpriv->timer)) { lpriv->timer.expires = jiffies + 1; - add_timer_on(&lpriv->timer, smp_processor_id()); + add_timer_on(&lpriv->timer, cpu); } } #else -static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv, unsigned int cpu) { } #endif @@ -233,7 +233,7 @@ void imsic_local_sync_all(bool force_all) if (force_all) bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1); if (!__imsic_local_sync(lpriv)) - __imsic_local_timer_start(lpriv); + __imsic_local_timer_start(lpriv, smp_processor_id()); raw_spin_unlock_irqrestore(&lpriv->lock, flags); } @@ -278,7 +278,7 @@ static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu return; } - __imsic_local_timer_start(lpriv); + __imsic_local_timer_start(lpriv, cpu); } } #else From ec83ead7afe6eab451a39ad53f18dfc841cb9be7 Mon Sep 17 00:00:00 2001 From: Wupeng Ma Date: Thu, 10 Apr 2025 14:26:33 +0800 Subject: [PATCH 274/353] mm: hugetlb: fix incorrect fallback for subpool During our testing with hugetlb subpool enabled, we observe that hstate->resv_huge_pages may underflow into negative values. Root cause analysis reveals a race condition in subpool reservation fallback handling as follow: hugetlb_reserve_pages() /* Attempt subpool reservation */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); /* Global reservation may fail after subpool allocation */ if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; out_put_pages: /* This incorrectly restores reservation to subpool */ hugepage_subpool_put_pages(spool, chg); When hugetlb_acct_memory() fails after subpool allocation, the current implementation over-commits subpool reservations by returning the full 'chg' value instead of the actual allocated 'gbl_reserve' amount. This discrepancy propagates to global reservations during subsequent releases, eventually causing resv_huge_pages underflow. This problem can be trigger easily with the following steps: 1. reverse hugepage for hugeltb allocation 2. mount hugetlbfs with min_size to enable hugetlb subpool 3. alloc hugepages with two task(make sure the second will fail due to insufficient amount of hugepages) 4. with for a few seconds and repeat step 3 which will make hstate->resv_huge_pages to go below zero. To fix this problem, return corrent amount of pages to subpool during the fallback after hugepage_subpool_get_pages is called. Link: https://lkml.kernel.org/r/20250410062633.3102457-1-mawupeng1@huawei.com Fixes: 1c5ecae3a93f ("hugetlbfs: add minimum size accounting to subpools") Signed-off-by: Wupeng Ma Tested-by: Joshua Hahn Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Ma Wupeng Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6ea1be71aa429e..7ae38bfb9096c4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3010,7 +3010,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3163,16 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -7239,7 +7247,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7374,8 +7382,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); From 5958308aa3056656074e89b84e8e160378d7164d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 22 Apr 2025 16:49:42 +0200 Subject: [PATCH 275/353] kernel/fork: only call untrack_pfn_clear() on VMAs duplicated for fork() Not intuitive, but vm_area_dup() located in kernel/fork.c is not only used for duplicating VMAs during fork(), but also for duplicating VMAs when splitting VMAs or when mremap()'ing them. VM_PFNMAP mappings can at least get ordinarily mremap()'ed (no change in size) and apparently also shrunk during mremap(), which implies duplicating the VMA in __split_vma() first. In case of ordinary mremap() (no change in size), we first duplicate the VMA in copy_vma_and_data()->copy_vma() to then call untrack_pfn_clear() on the old VMA: we effectively move the VM_PAT reservation. So the untrack_pfn_clear() call on the new VMA duplicating is wrong in that context. Splitting of VMAs seems problematic, because we don't duplicate/adjust the reservation when splitting the VMA. Instead, in memtype_erase() -- called during zapping/munmap -- we shrink a reservation in case only the end address matches: Assume we split a VMA into A and B, both would share a reservation until B is unmapped. So when unmapping B, the reservation would be updated to cover only A. When unmapping A, we would properly remove the now-shrunk reservation. That scenario describes the mremap() shrinking (old_size > new_size), where we split + unmap B, and the untrack_pfn_clear() on the new VMA when is wrong. What if we manage to split a VM_PFNMAP VMA into A and B and unmap A first? It would be broken because we would never free the reservation. Likely, there are ways to trigger such a VMA split outside of mremap(). Affecting other VMA duplication was not intended, vm_area_dup() being used outside of kernel/fork.c was an oversight. So let's fix that for; how to handle VMA splits better should be investigated separately. With a simple reproducer that uses mprotect() to split such a VMA I can trigger x86/PAT: pat_mremap:26448 freeing invalid memtype [mem 0x00000000-0x00000fff] Link: https://lkml.kernel.org/r/20250422144942.2871395-1-david@redhat.com Fixes: dc84bc2aba85 ("x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()") Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Signed-off-by: Andrew Morton --- kernel/fork.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b8e..168681fc4b25a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -498,10 +498,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(new->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(new); - return new; } @@ -672,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; + + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(tmp->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(tmp); + retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; From 9378f47a5ccd970a5764c4e15b8cfaf108dfcf2a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 25 Apr 2025 15:43:25 +0800 Subject: [PATCH 276/353] mm/memory: fix mapcount / refcount sanity check for mTHP reuse The following WARNING was triggered during swap stress test with mTHP enabled: [ 6609.335758] ------------[ cut here ]------------ [ 6609.337758] WARNING: CPU: 82 PID: 755116 at mm/memory.c:3794 do_wp_page+0x1084/0x10e0 [ 6609.340922] Modules linked in: zram virtiofs [ 6609.342699] CPU: 82 UID: 0 PID: 755116 Comm: sh Kdump: loaded Not tainted 6.15.0-rc1+ #1429 PREEMPT(voluntary) [ 6609.347620] Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015 [ 6609.349909] RIP: 0010:do_wp_page+0x1084/0x10e0 [ 6609.351532] Code: ff ff 48 c7 c6 80 ba 49 82 4c 89 ef e8 95 fd fe ff 0f 0b bd f5 ff ff ff e9 43 fb ff ff 41 83 a9 bc 12 00 00 01 e9 5c fb ff ff <0f> 0b e9 a6 fc ff ff 65 ff 00 f0 48 0f b a 6d 00 1f 0f 83 82 fc ff [ 6609.357959] RSP: 0000:ffffc90002273d40 EFLAGS: 00010287 [ 6609.359915] RAX: 000000000000000f RBX: 0000000000000000 RCX: 000fffffffe00000 [ 6609.362606] RDX: 0000000000000010 RSI: 000055a119ac1000 RDI: ffffea000ae6ec00 [ 6609.365143] RBP: ffffea000ae6ec68 R08: 84000002b9bb1025 R09: 000055a119ab6000 [ 6609.367569] R10: ffff8881caa2ad80 R11: 0000000000000000 R12: ffff8881caa2ad80 [ 6609.370255] R13: ffffea000ae6ec00 R14: 000055a119ac1c9c R15: ffffc90002273dd8 [ 6609.373007] FS: 00007f08e467f740(0000) GS:ffff88a07c214000(0000) knlGS:0000000000000000 [ 6609.375999] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6609.377946] CR2: 000055a119ac1c9c CR3: 00000001adfd6005 CR4: 0000000000770eb0 [ 6609.380376] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 6609.382853] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 6609.385216] PKRU: 55555554 [ 6609.386141] Call Trace: [ 6609.387017] [ 6609.387718] ? ___pte_offset_map+0x1b/0x110 [ 6609.389056] __handle_mm_fault+0xa51/0xf00 [ 6609.390363] ? exc_page_fault+0x6a/0x140 [ 6609.391629] handle_mm_fault+0x13d/0x360 [ 6609.392856] do_user_addr_fault+0x2f2/0x7f0 [ 6609.394160] ? sigprocmask+0x77/0xa0 [ 6609.395375] exc_page_fault+0x6a/0x140 [ 6609.396735] asm_exc_page_fault+0x26/0x30 [ 6609.398224] RIP: 0033:0x55a1050bc18b [ 6609.399567] Code: 8b 3f 4d 85 ff 74 40 41 39 5f 18 75 f2 49 8b 7f 08 44 38 27 75 e9 4c 89 c6 4c 89 45 c8 e8 bd 83 fa ff 4c 8b 45 c8 85 c0 75 d5 <41> 83 47 1c 01 48 83 c4 28 4c 89 f8 5b 4 1 5c 41 5d 41 5e 41 5f 5d [ 6609.405971] RSP: 002b:00007ffcf5f37d90 EFLAGS: 00010246 [ 6609.407737] RAX: 0000000000000000 RBX: 00000000182768fa RCX: 0000000000000000 [ 6609.410151] RDX: 00000000000000fa RSI: 000055a105175c7b RDI: 000055a119ac1c60 [ 6609.412606] RBP: 00007ffcf5f37de0 R08: 000055a105175c7b R09: 0000000000000000 [ 6609.414998] R10: 000000004d2dfb5a R11: 0000000000000246 R12: 0000000000000050 [ 6609.417193] R13: 00000000000000fa R14: 000055a119abaf60 R15: 000055a119ac1c80 [ 6609.419268] [ 6609.419928] ---[ end trace 0000000000000000 ]--- The WARN_ON here is simply incorrect. The refcount here must be at least the mapcount, not the opposite. Each mapcount must have a corresponding refcount, but the refcount may increase if other components grab the folio, which is acceptable. Meanwhile, having a mapcount larger than refcount is a real problem. So fix the WARN_ON condition. Link: https://lkml.kernel.org/r/20250425074325.61833-1-ryncsn@gmail.com Fixes: 1da190f4d0a6 ("mm: Copy-on-Write (COW) reuse support for PTE-mapped THP") Signed-off-by: Kairui Song Reported-by: Kairui Song Closes: https://lore.kernel.org/all/CAMgjq7D+ea3eg9gRCVvRnto3Sv3_H3WVhupX4e=k8T5QAfBHbw@mail.gmail.com/ Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index ba3ea0a82f7f77..49199410805cd0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3751,7 +3751,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); - VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); if (folio_test_large_maybe_mapped_shared(folio)) goto unlock; From f246613c3e3a322b3641ff3f8e7291fe2aed35e4 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Tue, 6 May 2025 03:30:34 +0800 Subject: [PATCH 277/353] mm/codetag: move tag retrieval back upfront in __free_pages() Commit 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") introduces a possible use-after-free scenario, when page is non-compound, page[0] could be released by other thread right after put_page_testzero failed in current thread, pgalloc_tag_sub_pages afterwards would manipulate an invalid page for accounting remaining pages: [timeline] [thread1] [thread2] | alloc_page non-compound V | get_page, rf counter inc V | in ___free_pages | put_page_testzero fails V | put_page, page released V | in ___free_pages, | pgalloc_tag_sub_pages | manipulate an invalid page V Restore __free_pages() to its state before, retrieve alloc tag beforehand. Link: https://lkml.kernel.org/r/20250505193034.91682-1-00107082@163.com Fixes: 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") Signed-off-by: David Wang <00107082@163.com> Acked-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 8 ++++++++ mm/page_alloc.c | 15 ++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index c740779778304a..8a7f4f802c5748 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -188,6 +188,13 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } +static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +{ + if (mem_alloc_profiling_enabled()) + return __pgalloc_tag_get(page); + return NULL; +} + void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -199,6 +206,7 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} +static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5669baf2a6fea7..1b00e14a97803f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1151,14 +1151,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) __pgalloc_tag_sub(page, nr); } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) +/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) { - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -1168,7 +1163,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ @@ -5065,11 +5060,13 @@ static void ___free_pages(struct page *page, unsigned int order, { /* get PageHead before we drop reference */ int head = PageHead(page); + /* get alloc tag in case the page is released by others */ + struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) __free_frozen_pages(page, order, fpi_flags); else if (!head) { - pgalloc_tag_sub_pages(page, (1 << order) - 1); + pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) __free_frozen_pages(page + (1 << order), order, fpi_flags); From 3035848c6c8c2364544fc9253b33aa726499ddbd Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 6 May 2025 18:36:01 +0100 Subject: [PATCH 278/353] MAINTAINERS: add mm GUP section As part of the ongoing efforts to sub-divide memory management maintainership and reviewership, establish a section for GUP (Get User Pages) support and add appropriate maintainers and reviewers. Link: https://lkml.kernel.org/r/20250506173601.97562-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: John Hubbard Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index de9b08096f624c..ea6af33219b558 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15548,6 +15548,18 @@ S: Maintained F: include/linux/execmem.h F: mm/execmem.c +MEMORY MANAGEMENT - GUP (GET USER PAGES) +M: Andrew Morton +M: David Hildenbrand +R: Jason Gunthorpe +R: John Hubbard +R: Peter Xu +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/gup.c + MEMORY MANAGEMENT - NUMA MEMBLOCKS AND NUMA EMULATION M: Andrew Morton M: Mike Rapoport From c0f123262ee249263a81da9fd680027e233a0318 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 May 2025 14:25:08 +0300 Subject: [PATCH 279/353] mm/page_alloc: ensure try_alloc_pages() plays well with unaccepted memory try_alloc_pages() will not attempt to allocate memory if the system has *any* unaccepted memory. Memory is accepted as needed and can remain in the system indefinitely, causing the interface to always fail. Rather than immediately giving up, attempt to use already accepted memory on free lists. Pass 'alloc_flags' to cond_accept_memory() and do not accept new memory for ALLOC_TRYLOCK requests. Found via code inspection - only BPF uses this at present and the runtime effects are unclear. Link: https://lkml.kernel.org/r/20250506112509.905147-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation") Cc: Alexei Starovoitov Cc: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b00e14a97803f..7248e300d36efb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static bool cond_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -3611,7 +3612,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } - cond_accept_memory(zone, order); + cond_accept_memory(zone, order, alloc_flags); /* * Detect whether the number of free pages is below high @@ -3638,7 +3639,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask)) { int ret; - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* @@ -3691,7 +3692,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, return page; } else { - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* Try again if zone has deferred pages */ @@ -4844,7 +4845,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto failed; } - cond_accept_memory(zone, 0); + cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, @@ -4853,7 +4854,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, break; } - if (cond_accept_memory(zone, 0)) + if (cond_accept_memory(zone, 0, alloc_flags)) goto retry_this_zone; /* Try again if zone has deferred pages */ @@ -7281,7 +7282,8 @@ static inline bool has_unaccepted_memory(void) return static_branch_unlikely(&zones_with_unaccepted_pages); } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { long to_accept, wmark; bool ret = false; @@ -7292,6 +7294,10 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (alloc_flags & ALLOC_TRYLOCK) + return false; + wmark = promo_wmark_pages(zone); /* @@ -7348,7 +7354,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { return false; } @@ -7419,11 +7426,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) if (!pcp_allowed_order(order)) return NULL; -#ifdef CONFIG_UNACCEPTED_MEMORY - /* Bailout, since try_to_accept_memory_one() needs to take a lock */ - if (has_unaccepted_memory()) - return NULL; -#endif /* Bailout, since _deferred_grow_zone() needs to take a lock */ if (deferred_pages_enabled()) return NULL; From 8188cb7c1e4f8f34a1404eefb65ee8c50613b308 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 May 2025 16:32:07 +0300 Subject: [PATCH 280/353] mm/page_alloc: fix race condition in unaccepted memory handling The page allocator tracks the number of zones that have unaccepted memory using static_branch_enc/dec() and uses that static branch in hot paths to determine if it needs to deal with unaccepted memory. Borislav and Thomas pointed out that the tracking is racy: operations on static_branch are not serialized against adding/removing unaccepted pages to/from the zone. Sanity checks inside static_branch machinery detects it: WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0 The comment around the WARN() explains the problem: /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ The effect of this static_branch optimization is only visible on microbenchmark. Instead of adding more complexity around it, remove it altogether. Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.local Reported-by: Borislav Petkov Tested-by: Borislav Petkov (AMD) Reported-by: Thomas Gleixner Cc: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/mm_init.c | 1 - mm/page_alloc.c | 47 ----------------------------------------------- 3 files changed, 49 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 25a29872c634ba..5c7a2b43ad7620 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1590,7 +1590,6 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); -void unaccepted_cleanup_work(struct work_struct *work); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 327764ca0ee4e2..eedce9321e13dc 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1441,7 +1441,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); - INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work); #endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7248e300d36efb..8258349e49acb7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7172,16 +7172,8 @@ bool has_managed_dma(void) #ifdef CONFIG_UNACCEPTED_MEMORY -/* Counts number of zones with unaccepted pages. */ -static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); - static bool lazy_accept = true; -void unaccepted_cleanup_work(struct work_struct *work) -{ - static_branch_dec(&zones_with_unaccepted_pages); -} - static int __init accept_memory_parse(char *p) { if (!strcmp(p, "lazy")) { @@ -7206,11 +7198,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) static void __accept_page(struct zone *zone, unsigned long *flags, struct page *page) { - bool last; - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); __ClearPageUnaccepted(page); @@ -7219,28 +7207,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags, accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) { - /* - * There are two corner cases: - * - * - If allocation occurs during the CPU bring up, - * static_branch_dec() cannot be used directly as - * it causes a deadlock on cpu_hotplug_lock. - * - * Instead, use schedule_work() to prevent deadlock. - * - * - If allocation occurs before workqueues are initialized, - * static_branch_dec() should be called directly. - * - * Workqueues are initialized before CPU bring up, so this - * will not conflict with the first scenario. - */ - if (system_wq) - schedule_work(&zone->unaccepted_cleanup); - else - unaccepted_cleanup_work(&zone->unaccepted_cleanup); - } } void accept_page(struct page *page) @@ -7277,20 +7243,12 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - static bool cond_accept_memory(struct zone *zone, unsigned int order, int alloc_flags) { long to_accept, wmark; bool ret = false; - if (!has_unaccepted_memory()) - return false; - if (list_empty(&zone->unaccepted_pages)) return false; @@ -7328,22 +7286,17 @@ static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); unsigned long flags; - bool first = false; if (!lazy_accept) return false; spin_lock_irqsave(&zone->lock, flags); - first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - if (first) - static_branch_inc(&zones_with_unaccepted_pages); - return true; } From 847e5ef672353ac146d31d52b62f2803f8285080 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 7 May 2025 14:42:24 +0900 Subject: [PATCH 281/353] zsmalloc: don't underflow size calculation in zs_obj_write() Do not mix class->size and object size during offsets/sizes calculation in zs_obj_write(). Size classes can merge into clusters, based on objects-per-zspage and pages-per-zspage characteristics, so some size classes can store objects smaller than class->size. This becomes problematic when object size is much smaller than class->size. zsmalloc can falsely decide that object spans two physical pages, because a larger class->size value is used for that check, while the actual object is much smaller and fits the free space of the first physical page, so there is nothing to write to the second page and memcpy() size calculation underflows. Unable to handle kernel paging request at virtual address ffffc00081ff4000 pc : __memcpy+0x10/0x24 lr : zs_obj_write+0x1b0/0x1d0 [zsmalloc] Call trace: __memcpy+0x10/0x24 (P) zram_write_page+0x150/0x4fc [zram] zram_submit_bio+0x5e0/0x6a4 [zram] __submit_bio+0x168/0x220 submit_bio_noacct_nocheck+0x128/0x2c8 submit_bio_noacct+0x19c/0x2f8 This is mostly seen on system with larger page-sizes, because size class cluters of such systems hold wider size ranges than on 4K PAGE_SIZE systems. Assume a 16K PAGE_SIZE system, a write of 820 bytes object to a 864-bytes size class at offset 15560. 15560 + 864 is more than 16384 so zsmalloc attempts to memcpy() it to two physical pages. However, 16384 - 15560 = 824 which is more than 820, so the object in fact doesn't span two physical pages, and there is no data to write to the second physical page. We always know the exact size in bytes of the object that we are about to write (store), so use it instead of class->size. Link: https://lkml.kernel.org/r/20250507054312.4135983-1-senozhatsky@chromium.org Fixes: 44f76413496e ("zsmalloc: introduce new object mapping API") Signed-off-by: Sergey Senozhatsky Reported-by: Igor Belousov Tested-by: Igor Belousov Acked-by: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 961b270f023c23..d14a7e317ac8bf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1243,19 +1243,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ void *dst = kmap_local_zpdesc(zpdesc); - if (!ZsHugePage(zspage)) - off += ZS_HANDLE_SIZE; memcpy(dst + off, handle_mem, mem_len); kunmap_local(dst); } else { /* this object spans two pages */ size_t sizes[2]; - off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = mem_len - sizes[0]; From ad8c20ff6de80df355f4ffd61feff7a19986efcd Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 9 May 2025 10:09:12 +1200 Subject: [PATCH 282/353] mm: userfaultfd: correct dirty flags set for both present and swap pte MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As David pointed out, what truly matters for mremap and userfaultfd move operations is the soft dirty bit. The current comment and implementation—which always sets the dirty bit for present PTEs and fails to set the soft dirty bit for swap PTEs—are incorrect. This could break features like Checkpoint-Restore in Userspace (CRIU). This patch updates the behavior to correctly set the soft dirty bit for both present and swap PTEs in accordance with mremap. Link: https://lkml.kernel.org/r/20250508220912.7275-1-21cnbao@gmail.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Barry Song Reported-by: David Hildenbrand Closes: https://lore.kernel.org/linux-mm/02f14ee1-923f-47e3-a994-4950afb9afcc@redhat.com/ Acked-by: Peter Xu Reviewed-by: Suren Baghdasaryan Cc: Lokesh Gidra Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7d5d709cc838ed..e0db855c89b41a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1064,8 +1064,13 @@ static int move_present_pte(struct mm_struct *mm, src_folio->index = linear_page_index(dst_vma, dst_addr); orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); + /* Set soft dirty bit so userspace can notice the pte was moved */ +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); +#endif + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); out: @@ -1100,6 +1105,9 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); +#endif set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); From 708d39b515e54c10672833c1e10fc4d2ebc86d44 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 9 Apr 2025 13:22:39 +0100 Subject: [PATCH 283/353] soundwire: bus: Fix race on the creation of the IRQ domain The SoundWire IRQ domain needs to be created before any slaves are added to the bus, such that the domain is always available when needed. Move the call to sdw_irq_create() before the calls to sdw_acpi_find_slaves() and sdw_of_find_slaves(). Fixes: 12a95123bfe1 ("soundwire: bus: Allow SoundWire peripherals to register IRQ handlers") Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20250409122239.1396489-1-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 377ab21c9cf0fb..68db4b67a86ff9 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -124,6 +124,10 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, set_bit(SDW_GROUP13_DEV_NUM, bus->assigned); set_bit(SDW_MASTER_DEV_NUM, bus->assigned); + ret = sdw_irq_create(bus, fwnode); + if (ret) + return ret; + /* * SDW is an enumerable bus, but devices can be powered off. So, * they won't be able to report as present. @@ -140,6 +144,7 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, if (ret < 0) { dev_err(bus->dev, "Finding slaves failed:%d\n", ret); + sdw_irq_delete(bus); return ret; } @@ -158,10 +163,6 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, bus->params.curr_bank = SDW_BANK0; bus->params.next_bank = SDW_BANK1; - ret = sdw_irq_create(bus, fwnode); - if (ret) - return ret; - return 0; } EXPORT_SYMBOL(sdw_bus_master_add); From 7b953a4b302286c048d41791bd69a6a62add5f3f Mon Sep 17 00:00:00 2001 From: Wayne Chang Date: Tue, 8 Apr 2025 11:09:05 +0800 Subject: [PATCH 284/353] phy: tegra: xusb: Use a bitmask for UTMI pad power state tracking The current implementation uses bias_pad_enable as a reference count to manage the shared bias pad for all UTMI PHYs. However, during system suspension with connected USB devices, multiple power-down requests for the UTMI pad result in a mismatch in the reference count, which in turn produces warnings such as: [ 237.762967] WARNING: CPU: 10 PID: 1618 at tegra186_utmi_pad_power_down+0x160/0x170 [ 237.763103] Call trace: [ 237.763104] tegra186_utmi_pad_power_down+0x160/0x170 [ 237.763107] tegra186_utmi_phy_power_off+0x10/0x30 [ 237.763110] phy_power_off+0x48/0x100 [ 237.763113] tegra_xusb_enter_elpg+0x204/0x500 [ 237.763119] tegra_xusb_suspend+0x48/0x140 [ 237.763122] platform_pm_suspend+0x2c/0xb0 [ 237.763125] dpm_run_callback.isra.0+0x20/0xa0 [ 237.763127] __device_suspend+0x118/0x330 [ 237.763129] dpm_suspend+0x10c/0x1f0 [ 237.763130] dpm_suspend_start+0x88/0xb0 [ 237.763132] suspend_devices_and_enter+0x120/0x500 [ 237.763135] pm_suspend+0x1ec/0x270 The root cause was traced back to the dynamic power-down changes introduced in commit a30951d31b25 ("xhci: tegra: USB2 pad power controls"), where the UTMI pad was being powered down without verifying its current state. This unbalanced behavior led to discrepancies in the reference count. To rectify this issue, this patch replaces the single reference counter with a bitmask, renamed to utmi_pad_enabled. Each bit in the mask corresponds to one of the four USB2 PHYs, allowing us to track each pad's enablement status individually. With this change: - The bias pad is powered on only when the mask is clear. - Each UTMI pad is powered on or down based on its corresponding bit in the mask, preventing redundant operations. - The overall power state of the shared bias pad is maintained correctly during suspend/resume cycles. The mutex used to prevent race conditions during UTMI pad enable/disable operations has been moved from the tegra186_utmi_bias_pad_power_on/off functions to the parent functions tegra186_utmi_pad_power_on/down. This change ensures that there are no race conditions when updating the bitmask. Cc: stable@vger.kernel.org Fixes: a30951d31b25 ("xhci: tegra: USB2 pad power controls") Signed-off-by: Wayne Chang Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://lore.kernel.org/r/20250408030905.990474-1-waynec@nvidia.com Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb-tegra186.c | 44 +++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/drivers/phy/tegra/xusb-tegra186.c b/drivers/phy/tegra/xusb-tegra186.c index fae6242aa730e0..cc7b8a6a999f74 100644 --- a/drivers/phy/tegra/xusb-tegra186.c +++ b/drivers/phy/tegra/xusb-tegra186.c @@ -237,6 +237,8 @@ #define DATA0_VAL_PD BIT(1) #define USE_XUSB_AO BIT(4) +#define TEGRA_UTMI_PAD_MAX 4 + #define TEGRA186_LANE(_name, _offset, _shift, _mask, _type) \ { \ .name = _name, \ @@ -269,7 +271,7 @@ struct tegra186_xusb_padctl { /* UTMI bias and tracking */ struct clk *usb2_trk_clk; - unsigned int bias_pad_enable; + DECLARE_BITMAP(utmi_pad_enabled, TEGRA_UTMI_PAD_MAX); /* padctl context */ struct tegra186_xusb_padctl_context context; @@ -603,12 +605,8 @@ static void tegra186_utmi_bias_pad_power_on(struct tegra_xusb_padctl *padctl) u32 value; int err; - mutex_lock(&padctl->lock); - - if (priv->bias_pad_enable++ > 0) { - mutex_unlock(&padctl->lock); + if (!bitmap_empty(priv->utmi_pad_enabled, TEGRA_UTMI_PAD_MAX)) return; - } err = clk_prepare_enable(priv->usb2_trk_clk); if (err < 0) @@ -667,17 +665,8 @@ static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl) struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl); u32 value; - mutex_lock(&padctl->lock); - - if (WARN_ON(priv->bias_pad_enable == 0)) { - mutex_unlock(&padctl->lock); - return; - } - - if (--priv->bias_pad_enable > 0) { - mutex_unlock(&padctl->lock); + if (!bitmap_empty(priv->utmi_pad_enabled, TEGRA_UTMI_PAD_MAX)) return; - } value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL1); value |= USB2_PD_TRK; @@ -690,13 +679,13 @@ static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl) clk_disable_unprepare(priv->usb2_trk_clk); } - mutex_unlock(&padctl->lock); } static void tegra186_utmi_pad_power_on(struct phy *phy) { struct tegra_xusb_lane *lane = phy_get_drvdata(phy); struct tegra_xusb_padctl *padctl = lane->pad->padctl; + struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl); struct tegra_xusb_usb2_port *port; struct device *dev = padctl->dev; unsigned int index = lane->index; @@ -705,9 +694,16 @@ static void tegra186_utmi_pad_power_on(struct phy *phy) if (!phy) return; + mutex_lock(&padctl->lock); + if (test_bit(index, priv->utmi_pad_enabled)) { + mutex_unlock(&padctl->lock); + return; + } + port = tegra_xusb_find_usb2_port(padctl, index); if (!port) { dev_err(dev, "no port found for USB2 lane %u\n", index); + mutex_unlock(&padctl->lock); return; } @@ -724,18 +720,28 @@ static void tegra186_utmi_pad_power_on(struct phy *phy) value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index)); value &= ~USB2_OTG_PD_DR; padctl_writel(padctl, value, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index)); + + set_bit(index, priv->utmi_pad_enabled); + mutex_unlock(&padctl->lock); } static void tegra186_utmi_pad_power_down(struct phy *phy) { struct tegra_xusb_lane *lane = phy_get_drvdata(phy); struct tegra_xusb_padctl *padctl = lane->pad->padctl; + struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl); unsigned int index = lane->index; u32 value; if (!phy) return; + mutex_lock(&padctl->lock); + if (!test_bit(index, priv->utmi_pad_enabled)) { + mutex_unlock(&padctl->lock); + return; + } + dev_dbg(padctl->dev, "power down UTMI pad %u\n", index); value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL0(index)); @@ -748,7 +754,11 @@ static void tegra186_utmi_pad_power_down(struct phy *phy) udelay(2); + clear_bit(index, priv->utmi_pad_enabled); + tegra186_utmi_bias_pad_power_off(padctl); + + mutex_unlock(&padctl->lock); } static int tegra186_xusb_padctl_vbus_override(struct tegra_xusb_padctl *padctl, From 8ad3ae52e0675c10b28477fa4880648a637eaeb0 Mon Sep 17 00:00:00 2001 From: Nitin Rawat Date: Mon, 7 Apr 2025 17:40:08 +0530 Subject: [PATCH 285/353] phy: qcom-qmp-ufs: check for mode type for phy setting Generally all target supports Rate B but for very few like SM8550, two sets of UFS PHY settings are provided, one set is to support HS-G5 Rate A and another set is to support HS-G4 and lower gears with Rate B. Commit b02cc9a17679("phy: qcom-qmp-ufs: Add PHY Configuration support for sm8750") apply Rate B setting for SM8550 gear 5 without checking for mode value (Rate A or Rate B) from Controller driver which caused issue as SM8550 support rate A for Gear 5. Fix this by adding mode check before applying Rat B phy setting. Fixes: b02cc9a17679 ("phy: qcom-qmp-ufs: Add PHY Configuration support for sm8750") Reported-by: Neil Armstrong Closes: https://lore.kernel.org/all/430ed11c-0490-45be-897b-27cad9682371@quicinc.com/ Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Nitin Rawat Link: https://lore.kernel.org/r/20250407121008.22230-1-quic_nitirawa@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c index 45b3b792696e9e..b33e2e2b5014d3 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c @@ -1754,7 +1754,8 @@ static void qmp_ufs_init_registers(struct qmp_ufs *qmp, const struct qmp_phy_cfg qmp_ufs_init_all(qmp, &cfg->tbls_hs_overlay[i]); } - qmp_ufs_init_all(qmp, &cfg->tbls_hs_b); + if (qmp->mode == PHY_MODE_UFS_HS_B) + qmp_ufs_init_all(qmp, &cfg->tbls_hs_b); } static int qmp_ufs_com_init(struct qmp_ufs *qmp) From 72f2d242f328717d296e104804c17d3a562892ed Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Mar 2025 16:15:42 +0100 Subject: [PATCH 286/353] phy: can-transceiver: Re-instate "mux-states" property presence check On the Renesas Gray Hawk Single development board: can-transceiver-phy can-phy0: /can-phy0: failed to get mux-state (0) "mux-states" is an optional property for CAN transceivers. However, mux_get() always prints an error message in case of an error, including when the property is not present, confusing the user. Fix this by re-instating the property presence check (this time using the proper API) in a wrapper around devm_mux_state_get(). When the multiplexer subsystem gains support for optional muxes, the wrapper can just be removed. In addition, propagate all real errors upstream, instead of ignoring them. Fixes: d02dfd4ceb2e9f34 ("phy: can-transceiver: Drop unnecessary "mux-states" property presence check") Signed-off-by: Geert Uytterhoeven Tested-by: Biju Das Reviewed-by: Biju Das Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/3d7e0d723908284e8cf06ad1f7950c03173178f3.1742483710.git.geert+renesas@glider.be Signed-off-by: Vinod Koul --- drivers/phy/phy-can-transceiver.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/phy/phy-can-transceiver.c b/drivers/phy/phy-can-transceiver.c index 2bec70615449f9..f59caff4b3d4c2 100644 --- a/drivers/phy/phy-can-transceiver.c +++ b/drivers/phy/phy-can-transceiver.c @@ -93,6 +93,16 @@ static const struct of_device_id can_transceiver_phy_ids[] = { }; MODULE_DEVICE_TABLE(of, can_transceiver_phy_ids); +/* Temporary wrapper until the multiplexer subsystem supports optional muxes */ +static inline struct mux_state * +devm_mux_state_get_optional(struct device *dev, const char *mux_name) +{ + if (!of_property_present(dev->of_node, "mux-states")) + return NULL; + + return devm_mux_state_get(dev, mux_name); +} + static int can_transceiver_phy_probe(struct platform_device *pdev) { struct phy_provider *phy_provider; @@ -114,13 +124,11 @@ static int can_transceiver_phy_probe(struct platform_device *pdev) match = of_match_node(can_transceiver_phy_ids, pdev->dev.of_node); drvdata = match->data; - mux_state = devm_mux_state_get(dev, NULL); - if (IS_ERR(mux_state)) { - if (PTR_ERR(mux_state) == -EPROBE_DEFER) - return PTR_ERR(mux_state); - } else { - can_transceiver_phy->mux_state = mux_state; - } + mux_state = devm_mux_state_get_optional(dev, NULL); + if (IS_ERR(mux_state)) + return PTR_ERR(mux_state); + + can_transceiver_phy->mux_state = mux_state; phy = devm_phy_create(dev, dev->of_node, &can_transceiver_phy_ops); From 36334ea8bec5d997adaa1ac5cb13760317974f0a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Mar 2025 17:36:14 +0300 Subject: [PATCH 287/353] phy: rockchip-samsung-dcphy: Add missing assignment The "ret = " was accidentally dropped so the error handling doesn't work. Fixes: b2a1a2ae7818 ("phy: rockchip: Add Samsung MIPI D-/C-PHY driver") Signed-off-by: Dan Carpenter Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/e64265a4-9543-4728-a49f-ea910fccef7c@stanley.mountain Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c b/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c index 08c78c1bafc9a2..28a052e1736651 100644 --- a/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c +++ b/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c @@ -1653,7 +1653,7 @@ static __maybe_unused int samsung_mipi_dcphy_runtime_resume(struct device *dev) return ret; } - clk_prepare_enable(samsung->ref_clk); + ret = clk_prepare_enable(samsung->ref_clk); if (ret) { dev_err(samsung->dev, "Failed to enable reference clock, %d\n", ret); clk_disable_unprepare(samsung->pclk); From caf8cc30537d3746744fc126edd4850d142ff2ca Mon Sep 17 00:00:00 2001 From: Hal Feng Date: Tue, 22 Apr 2025 18:12:44 +0800 Subject: [PATCH 288/353] phy: starfive: jh7110-usb: Fix USB 2.0 host occasional detection failure JH7110 USB 2.0 host fails to detect USB 2.0 devices occasionally. With a long time of debugging and testing, we found that setting Rx clock gating control signal to normal power consumption mode can solve this problem. Signed-off-by: Hal Feng Link: https://lore.kernel.org/r/20250422101244.51686-1-hal.feng@starfivetech.com Signed-off-by: Vinod Koul --- drivers/phy/starfive/phy-jh7110-usb.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/phy/starfive/phy-jh7110-usb.c b/drivers/phy/starfive/phy-jh7110-usb.c index cb5454fbe2c8fa..b505d89860b439 100644 --- a/drivers/phy/starfive/phy-jh7110-usb.c +++ b/drivers/phy/starfive/phy-jh7110-usb.c @@ -18,6 +18,8 @@ #include #define USB_125M_CLK_RATE 125000000 +#define USB_CLK_MODE_OFF 0x0 +#define USB_CLK_MODE_RX_NORMAL_PWR BIT(1) #define USB_LS_KEEPALIVE_OFF 0x4 #define USB_LS_KEEPALIVE_ENABLE BIT(4) @@ -78,6 +80,7 @@ static int jh7110_usb2_phy_init(struct phy *_phy) { struct jh7110_usb2_phy *phy = phy_get_drvdata(_phy); int ret; + unsigned int val; ret = clk_set_rate(phy->usb_125m_clk, USB_125M_CLK_RATE); if (ret) @@ -87,6 +90,10 @@ static int jh7110_usb2_phy_init(struct phy *_phy) if (ret) return ret; + val = readl(phy->regs + USB_CLK_MODE_OFF); + val |= USB_CLK_MODE_RX_NORMAL_PWR; + writel(val, phy->regs + USB_CLK_MODE_OFF); + return 0; } From 5e511b7a7f9396d132a5f64e105bbaa4aa7b96aa Mon Sep 17 00:00:00 2001 From: Algea Cao Date: Sun, 27 Apr 2025 17:51:24 +0800 Subject: [PATCH 289/353] phy: phy-rockchip-samsung-hdptx: Fix PHY PLL output 50.25MHz error When using HDMI PLL frequency division coefficient at 50.25MHz that is calculated by rk_hdptx_phy_clk_pll_calc(), it fails to get PHY LANE lock. Although the calculated values are within the allowable range of PHY PLL configuration. In order to fix the PHY LANE lock error and provide the expected 50.25MHz output, manually compute the required PHY PLL frequency division coefficient and add it to ropll_tmds_cfg configuration table. Signed-off-by: Algea Cao Reviewed-by: Cristian Ciocaltea Acked-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250427095124.3354439-1-algea.cao@rock-chips.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c index fe7c0574835636..77236f012a1f75 100644 --- a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c +++ b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c @@ -476,6 +476,8 @@ static const struct ropll_config ropll_tmds_cfg[] = { 1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, }, { 650000, 162, 162, 1, 1, 11, 1, 1, 1, 1, 1, 1, 1, 54, 0, 16, 4, 1, 1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, }, + { 502500, 84, 84, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 11, 1, 4, 5, + 4, 11, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, }, { 337500, 0x70, 0x70, 1, 1, 0xf, 1, 1, 1, 1, 1, 1, 1, 0x2, 0, 0x01, 5, 1, 1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, }, { 400000, 100, 100, 1, 1, 11, 1, 1, 0, 1, 0, 1, 1, 0x9, 0, 0x05, 0, From 6e0d8c0fbee2126c2b3236fb03c49b9895062658 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Apr 2025 16:08:23 +0300 Subject: [PATCH 290/353] phy: tegra: xusb: remove a stray unlock We used to take a lock in tegra186_utmi_bias_pad_power_on() but now we have moved the lock into the caller. Unfortunately, when we moved the lock this unlock was left behind and it results in a double unlock. Delete it now. Fixes: b47158fb4295 ("phy: tegra: xusb: Use a bitmask for UTMI pad power state tracking") Signed-off-by: Dan Carpenter Reviewed-by: Jon Hunter Link: https://lore.kernel.org/r/aAjmR6To4EnvRl4G@stanley.mountain Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb-tegra186.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/phy/tegra/xusb-tegra186.c b/drivers/phy/tegra/xusb-tegra186.c index cc7b8a6a999f74..23a23f2d64e586 100644 --- a/drivers/phy/tegra/xusb-tegra186.c +++ b/drivers/phy/tegra/xusb-tegra186.c @@ -656,8 +656,6 @@ static void tegra186_utmi_bias_pad_power_on(struct tegra_xusb_padctl *padctl) } else { clk_disable_unprepare(priv->usb2_trk_clk); } - - mutex_unlock(&padctl->lock); } static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl) From 3ed6b6fc9806937f03b15e229026824a78edee6d Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:28 +0300 Subject: [PATCH 291/353] phy: renesas: rcar-gen3-usb2: Fix role detection on unbind/bind It has been observed on the Renesas RZ/G3S SoC that unbinding and binding the PHY driver leads to role autodetection failures. This issue occurs when PHY 3 is the first initialized PHY. PHY 3 does not have an interrupt associated with the USB2_INT_ENABLE register (as rcar_gen3_int_enable[3] = 0). As a result, rcar_gen3_init_otg() is called to initialize OTG without enabling PHY interrupts. To resolve this, add rcar_gen3_is_any_otg_rphy_initialized() and call it in role_store(), role_show(), and rcar_gen3_init_otg(). At the same time, rcar_gen3_init_otg() is only called when initialization for a PHY with interrupt bits is in progress. As a result, the struct rcar_gen3_phy::otg_initialized is no longer needed. Fixes: 549b6b55b005 ("phy: renesas: rcar-gen3-usb2: enable/disable independent irqs") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-2-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 33 ++++++++++-------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 775f4f973a6cc2..46afba2fe0dc69 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -107,7 +107,6 @@ struct rcar_gen3_phy { struct rcar_gen3_chan *ch; u32 int_enable_bits; bool initialized; - bool otg_initialized; bool powered; }; @@ -320,16 +319,15 @@ static bool rcar_gen3_is_any_rphy_initialized(struct rcar_gen3_chan *ch) return false; } -static bool rcar_gen3_needs_init_otg(struct rcar_gen3_chan *ch) +static bool rcar_gen3_is_any_otg_rphy_initialized(struct rcar_gen3_chan *ch) { - int i; - - for (i = 0; i < NUM_OF_PHYS; i++) { - if (ch->rphys[i].otg_initialized) - return false; + for (enum rcar_gen3_phy_index i = PHY_INDEX_BOTH_HC; i <= PHY_INDEX_EHCI; + i++) { + if (ch->rphys[i].initialized) + return true; } - return true; + return false; } static bool rcar_gen3_are_all_rphys_power_off(struct rcar_gen3_chan *ch) @@ -351,7 +349,7 @@ static ssize_t role_store(struct device *dev, struct device_attribute *attr, bool is_b_device; enum phy_mode cur_mode, new_mode; - if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch)) + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; if (sysfs_streq(buf, "host")) @@ -389,7 +387,7 @@ static ssize_t role_show(struct device *dev, struct device_attribute *attr, { struct rcar_gen3_chan *ch = dev_get_drvdata(dev); - if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch)) + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; return sprintf(buf, "%s\n", rcar_gen3_is_host(ch) ? "host" : @@ -402,6 +400,9 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch) void __iomem *usb2_base = ch->base; u32 val; + if (!ch->is_otg_channel || rcar_gen3_is_any_otg_rphy_initialized(ch)) + return; + /* Should not use functions of read-modify-write a register */ val = readl(usb2_base + USB2_LINECTRL1); val = (val & ~USB2_LINECTRL1_DP_RPD) | USB2_LINECTRL1_DPRPD_EN | @@ -465,12 +466,9 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); - /* Initialize otg part */ - if (channel->is_otg_channel) { - if (rcar_gen3_needs_init_otg(channel)) - rcar_gen3_init_otg(channel); - rphy->otg_initialized = true; - } + /* Initialize otg part (only if we initialize a PHY with IRQs). */ + if (rphy->int_enable_bits) + rcar_gen3_init_otg(channel); rphy->initialized = true; @@ -486,9 +484,6 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) rphy->initialized = false; - if (channel->is_otg_channel) - rphy->otg_initialized = false; - val = readl(usb2_base + USB2_INT_ENABLE); val &= ~rphy->int_enable_bits; if (!rcar_gen3_is_any_rphy_initialized(channel)) From 227bbd97c86b76d4dbdd3cc1efe38befda6f5961 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:29 +0300 Subject: [PATCH 292/353] phy: renesas: rcar-gen3-usb2: Move IRQ request in probe Commit 08b0ad375ca6 ("phy: renesas: rcar-gen3-usb2: move IRQ registration to init") moved the IRQ request operation from probe to struct phy_ops::phy_init API to avoid triggering interrupts (which lead to register accesses) while the PHY clocks (enabled through runtime PM APIs) are not active. If this happens, it results in a synchronous abort. One way to reproduce this issue is by enabling CONFIG_DEBUG_SHIRQ, which calls free_irq() on driver removal. Move the IRQ request and free operations back to probe, and take the runtime PM state into account in IRQ handler. This commit is preparatory for the subsequent fixes in this series. Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 46afba2fe0dc69..bb05fd26eb7f29 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -120,7 +120,6 @@ struct rcar_gen3_chan { struct work_struct work; struct mutex lock; /* protects rphys[...].powered */ enum usb_dr_mode dr_mode; - int irq; u32 obint_enable_bits; bool extcon_host; bool is_otg_channel; @@ -428,16 +427,25 @@ static irqreturn_t rcar_gen3_phy_usb2_irq(int irq, void *_ch) { struct rcar_gen3_chan *ch = _ch; void __iomem *usb2_base = ch->base; - u32 status = readl(usb2_base + USB2_OBINTSTA); + struct device *dev = ch->dev; irqreturn_t ret = IRQ_NONE; + u32 status; + pm_runtime_get_noresume(dev); + + if (pm_runtime_suspended(dev)) + goto rpm_put; + + status = readl(usb2_base + USB2_OBINTSTA); if (status & ch->obint_enable_bits) { - dev_vdbg(ch->dev, "%s: %08x\n", __func__, status); + dev_vdbg(dev, "%s: %08x\n", __func__, status); writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); rcar_gen3_device_recognition(ch); ret = IRQ_HANDLED; } +rpm_put: + pm_runtime_put_noidle(dev); return ret; } @@ -447,17 +455,6 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; void __iomem *usb2_base = channel->base; u32 val; - int ret; - - if (!rcar_gen3_is_any_rphy_initialized(channel) && channel->irq >= 0) { - INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work); - ret = request_irq(channel->irq, rcar_gen3_phy_usb2_irq, - IRQF_SHARED, dev_name(channel->dev), channel); - if (ret < 0) { - dev_err(channel->dev, "No irq handler (%d)\n", channel->irq); - return ret; - } - } /* Initialize USB2 part */ val = readl(usb2_base + USB2_INT_ENABLE); @@ -490,9 +487,6 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) val &= ~USB2_INT_ENABLE_UCOM_INTEN; writel(val, usb2_base + USB2_INT_ENABLE); - if (channel->irq >= 0 && !rcar_gen3_is_any_rphy_initialized(channel)) - free_irq(channel->irq, channel); - return 0; } @@ -698,7 +692,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct rcar_gen3_chan *channel; struct phy_provider *provider; - int ret = 0, i; + int ret = 0, i, irq; if (!dev->of_node) { dev_err(dev, "This driver needs device tree\n"); @@ -714,8 +708,6 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) return PTR_ERR(channel->base); channel->obint_enable_bits = USB2_OBINT_BITS; - /* get irq number here and request_irq for OTG in phy_init */ - channel->irq = platform_get_irq_optional(pdev, 0); channel->dr_mode = rcar_gen3_get_dr_mode(dev->of_node); if (channel->dr_mode != USB_DR_MODE_UNKNOWN) { channel->is_otg_channel = true; @@ -784,6 +776,20 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) channel->vbus = NULL; } + irq = platform_get_irq_optional(pdev, 0); + if (irq < 0 && irq != -ENXIO) { + ret = irq; + goto error; + } else if (irq > 0) { + INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work); + ret = devm_request_irq(dev, irq, rcar_gen3_phy_usb2_irq, + IRQF_SHARED, dev_name(dev), channel); + if (ret < 0) { + dev_err(dev, "Failed to request irq (%d)\n", irq); + goto error; + } + } + provider = devm_of_phy_provider_register(dev, rcar_gen3_phy_usb2_xlate); if (IS_ERR(provider)) { dev_err(dev, "Failed to register PHY provider\n"); From 53f1dc430a3d59b3c40c4d5fbdee17f81b04c6ad Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:30 +0300 Subject: [PATCH 293/353] phy: renesas: rcar-gen3-usb2: Lock around hardware registers and driver data The phy-rcar-gen3-usb2 driver exposes four individual PHYs that are requested and configured by PHY users. The struct phy_ops APIs access the same set of registers to configure all PHYs. Additionally, PHY settings can be modified through sysfs or an IRQ handler. While some struct phy_ops APIs are protected by a driver-wide mutex, others rely on individual PHY-specific mutexes. This approach can lead to various issues, including: 1/ the IRQ handler may interrupt PHY settings in progress, racing with hardware configuration protected by a mutex lock 2/ due to msleep(20) in rcar_gen3_init_otg(), while a configuration thread suspends to wait for the delay, another thread may try to configure another PHY (with phy_init() + phy_power_on()); re-running the phy_init() goes to the exact same configuration code, re-running the same hardware configuration on the same set of registers (and bits) which might impact the result of the msleep for the 1st configuring thread 3/ sysfs can configure the hardware (though role_store()) and it can still race with the phy_init()/phy_power_on() APIs calling into the drivers struct phy_ops To address these issues, add a spinlock to protect hardware register access and driver private data structures (e.g., calls to rcar_gen3_is_any_rphy_initialized()). Checking driver-specific data remains necessary as all PHY instances share common settings. With this change, the existing mutex protection is removed and the cleanup.h helpers are used. While at it, to keep the code simpler, do not skip regulator_enable()/regulator_disable() APIs in rcar_gen3_phy_usb2_power_on()/rcar_gen3_phy_usb2_power_off() as the regulators enable/disable operations are reference counted anyway. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-4-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 49 +++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index bb05fd26eb7f29..00ce564463de88 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -9,6 +9,7 @@ * Copyright (C) 2014 Cogent Embedded, Inc. */ +#include #include #include #include @@ -118,7 +119,7 @@ struct rcar_gen3_chan { struct regulator *vbus; struct reset_control *rstc; struct work_struct work; - struct mutex lock; /* protects rphys[...].powered */ + spinlock_t lock; /* protects access to hardware and driver data structure. */ enum usb_dr_mode dr_mode; u32 obint_enable_bits; bool extcon_host; @@ -348,6 +349,8 @@ static ssize_t role_store(struct device *dev, struct device_attribute *attr, bool is_b_device; enum phy_mode cur_mode, new_mode; + guard(spinlock_irqsave)(&ch->lock); + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; @@ -415,7 +418,7 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch) val = readl(usb2_base + USB2_ADPCTRL); writel(val | USB2_ADPCTRL_IDPULLUP, usb2_base + USB2_ADPCTRL); } - msleep(20); + mdelay(20); writel(0xffffffff, usb2_base + USB2_OBINTSTA); writel(ch->obint_enable_bits, usb2_base + USB2_OBINTEN); @@ -436,12 +439,14 @@ static irqreturn_t rcar_gen3_phy_usb2_irq(int irq, void *_ch) if (pm_runtime_suspended(dev)) goto rpm_put; - status = readl(usb2_base + USB2_OBINTSTA); - if (status & ch->obint_enable_bits) { - dev_vdbg(dev, "%s: %08x\n", __func__, status); - writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); - rcar_gen3_device_recognition(ch); - ret = IRQ_HANDLED; + scoped_guard(spinlock, &ch->lock) { + status = readl(usb2_base + USB2_OBINTSTA); + if (status & ch->obint_enable_bits) { + dev_vdbg(dev, "%s: %08x\n", __func__, status); + writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); + rcar_gen3_device_recognition(ch); + ret = IRQ_HANDLED; + } } rpm_put: @@ -456,6 +461,8 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) void __iomem *usb2_base = channel->base; u32 val; + guard(spinlock_irqsave)(&channel->lock); + /* Initialize USB2 part */ val = readl(usb2_base + USB2_INT_ENABLE); val |= USB2_INT_ENABLE_UCOM_INTEN | rphy->int_enable_bits; @@ -479,6 +486,8 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) void __iomem *usb2_base = channel->base; u32 val; + guard(spinlock_irqsave)(&channel->lock); + rphy->initialized = false; val = readl(usb2_base + USB2_INT_ENABLE); @@ -498,16 +507,17 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p) u32 val; int ret = 0; - mutex_lock(&channel->lock); - if (!rcar_gen3_are_all_rphys_power_off(channel)) - goto out; - if (channel->vbus) { ret = regulator_enable(channel->vbus); if (ret) - goto out; + return ret; } + guard(spinlock_irqsave)(&channel->lock); + + if (!rcar_gen3_are_all_rphys_power_off(channel)) + goto out; + val = readl(usb2_base + USB2_USBCTR); val |= USB2_USBCTR_PLL_RST; writel(val, usb2_base + USB2_USBCTR); @@ -517,7 +527,6 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p) out: /* The powered flag should be set for any other phys anyway */ rphy->powered = true; - mutex_unlock(&channel->lock); return 0; } @@ -528,18 +537,12 @@ static int rcar_gen3_phy_usb2_power_off(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; int ret = 0; - mutex_lock(&channel->lock); - rphy->powered = false; - - if (!rcar_gen3_are_all_rphys_power_off(channel)) - goto out; + scoped_guard(spinlock_irqsave, &channel->lock) + rphy->powered = false; if (channel->vbus) ret = regulator_disable(channel->vbus); -out: - mutex_unlock(&channel->lock); - return ret; } @@ -750,7 +753,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) if (phy_data->no_adp_ctrl) channel->obint_enable_bits = USB2_OBINT_IDCHG_EN; - mutex_init(&channel->lock); + spin_lock_init(&channel->lock); for (i = 0; i < NUM_OF_PHYS; i++) { channel->rphys[i].phy = devm_phy_create(dev, NULL, phy_data->phy_usb2_ops); From afc4c5d06cc37fc49f05920f0a963a3384e0d751 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:31 +0300 Subject: [PATCH 294/353] phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off Assert PLL reset on PHY power off. This saves power. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-5-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 00ce564463de88..118899efda70b3 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -537,9 +537,17 @@ static int rcar_gen3_phy_usb2_power_off(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; int ret = 0; - scoped_guard(spinlock_irqsave, &channel->lock) + scoped_guard(spinlock_irqsave, &channel->lock) { rphy->powered = false; + if (rcar_gen3_are_all_rphys_power_off(channel)) { + u32 val = readl(channel->base + USB2_USBCTR); + + val |= USB2_USBCTR_PLL_RST; + writel(val, channel->base + USB2_USBCTR); + } + } + if (channel->vbus) ret = regulator_disable(channel->vbus); From 6a6d1fee1c1f44278722a1028fb19256ef61fa65 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:32 +0300 Subject: [PATCH 295/353] phy: renesas: rcar-gen3-usb2: Set timing registers only once phy-rcar-gen3-usb2 driver exports 4 PHYs. The timing registers are common to all PHYs. There is no need to set them every time a PHY is initialized. Set timing register only when the 1st PHY is initialized. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-6-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 118899efda70b3..9fdf17e0848a28 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -467,8 +467,11 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) val = readl(usb2_base + USB2_INT_ENABLE); val |= USB2_INT_ENABLE_UCOM_INTEN | rphy->int_enable_bits; writel(val, usb2_base + USB2_INT_ENABLE); - writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); - writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); + + if (!rcar_gen3_is_any_rphy_initialized(channel)) { + writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); + writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); + } /* Initialize otg part (only if we initialize a PHY with IRQs). */ if (rphy->int_enable_bits) From 0b91c26775cfd8f409e80825cf039e8f5fcf1a94 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Mon, 3 Mar 2025 15:27:39 +0800 Subject: [PATCH 296/353] phy: Fix error handling in tegra_xusb_port_init If device_add() fails, do not use device_unregister() for error handling. device_unregister() consists two functions: device_del() and put_device(). device_unregister() should only be called after device_add() succeeded because device_del() undoes what device_add() does if successful. Change device_unregister() to put_device() call before returning from the function. As comment of device_add() says, 'if device_add() succeeds, you should call device_del() when you want to get rid of it. If device_add() has not succeeded, use only put_device() to drop the reference count'. Found by code review. Cc: stable@vger.kernel.org Fixes: 53d2a715c240 ("phy: Add Tegra XUSB pad controller support") Signed-off-by: Ma Ke Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20250303072739.3874987-1-make24@iscas.ac.cn Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c index 79d4814d758d5e..c89df95aa6ca98 100644 --- a/drivers/phy/tegra/xusb.c +++ b/drivers/phy/tegra/xusb.c @@ -548,16 +548,16 @@ static int tegra_xusb_port_init(struct tegra_xusb_port *port, err = dev_set_name(&port->dev, "%s-%u", name, index); if (err < 0) - goto unregister; + goto put_device; err = device_add(&port->dev); if (err < 0) - goto unregister; + goto put_device; return 0; -unregister: - device_unregister(&port->dev); +put_device: + put_device(&port->dev); return err; } From fccff4fff0cfa44562d96484f49e648d32b69167 Mon Sep 17 00:00:00 2001 From: Purva Yeshi Date: Thu, 10 Apr 2025 16:32:16 +0530 Subject: [PATCH 297/353] dmaengine: idxd: cdev: Fix uninitialized use of sva in idxd_cdev_open Fix Smatch-detected issue: drivers/dma/idxd/cdev.c:321 idxd_cdev_open() error: uninitialized symbol 'sva'. 'sva' pointer may be used uninitialized in error handling paths. Specifically, if PASID support is enabled and iommu_sva_bind_device() returns an error, the code jumps to the cleanup label and attempts to call iommu_sva_unbind_device(sva) without ensuring that sva was successfully assigned. This triggers a Smatch warning about an uninitialized symbol. Initialize sva to NULL at declaration and add a check using IS_ERR_OR_NULL() before unbinding the device. This ensures the function does not use an invalid or uninitialized pointer during cleanup. Signed-off-by: Purva Yeshi Reviewed-by: Dave Jiang Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20250410110216.21592-1-purvayeshi550@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index ff94ee892339d5..7bd031a608943c 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -222,7 +222,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) struct idxd_wq *wq; struct device *dev, *fdev; int rc = 0; - struct iommu_sva *sva; + struct iommu_sva *sva = NULL; unsigned int pasid; struct idxd_cdev *idxd_cdev; @@ -317,7 +317,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) if (device_user_pasid_enabled(idxd)) idxd_xa_pasid_remove(ctx); failed_get_pasid: - if (device_user_pasid_enabled(idxd)) + if (device_user_pasid_enabled(idxd) && !IS_ERR_OR_NULL(sva)) iommu_sva_unbind_device(sva); failed: mutex_unlock(&wq->wq_lock); From ff49d7e492cda700b41c9753ddbbb9cfe6e460ca Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 3 Apr 2025 11:24:19 -0500 Subject: [PATCH 298/353] dmaengine: Revert "dmaengine: dmatest: Fix dmatest waiting less when interrupted" Several issues with this change: * The analysis is flawed and it's unclear what problem is being fixed. There is no difference between wait_event_freezable_timeout() and wait_event_timeout() with respect to device interrupts. And of course "the interrupt notifying the finish of an operation happens during wait_event_freezable_timeout()" -- that's how it's supposed to work. * The link at the "Closes:" tag appears to be an unrelated use-after-free in idxd. * It introduces a regression: dmatest threads are meant to be freezable and this change breaks that. See discussion here: https://lore.kernel.org/dmaengine/878qpa13fe.fsf@AUSNATLYNCH.amd.com/ Fixes: e87ca16e9911 ("dmaengine: dmatest: Fix dmatest waiting less when interrupted") Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250403-dmaengine-dmatest-revert-waiting-less-v1-1-8227c5a3d7c8@amd.com Signed-off-by: Vinod Koul --- drivers/dma/dmatest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index d891dfca358e20..91b2fbc0b86471 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -841,9 +841,9 @@ static int dmatest_func(void *data) } else { dma_async_issue_pending(chan); - wait_event_timeout(thread->done_wait, - done->done, - msecs_to_jiffies(params->timeout)); + wait_event_freezable_timeout(thread->done_wait, + done->done, + msecs_to_jiffies(params->timeout)); status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); From 4249084648f13be9a0f3bc5b3e52fd008a200836 Mon Sep 17 00:00:00 2001 From: Yemike Abhilash Chandra Date: Thu, 17 Apr 2025 13:25:21 +0530 Subject: [PATCH 299/353] dmaengine: ti: k3-udma: Use cap_mask directly from dma_device structure instead of a local copy Currently, a local dma_cap_mask_t variable is used to store device cap_mask within udma_of_xlate(). However, the DMA_PRIVATE flag in the device cap_mask can get cleared when the last channel is released. This can happen right after storing the cap_mask locally in udma_of_xlate(), and subsequent dma_request_channel() can fail due to mismatch in the cap_mask. Fix this by removing the local dma_cap_mask_t variable and directly using the one from the dma_device structure. Fixes: 25dcb5dd7b7c ("dmaengine: ti: New driver for K3 UDMA") Cc: stable@vger.kernel.org Signed-off-by: Vaishnav Achath Acked-by: Peter Ujfalusi Reviewed-by: Udit Kumar Signed-off-by: Yemike Abhilash Chandra Link: https://lore.kernel.org/r/20250417075521.623651-1-y-abhilashchandra@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index b223a7aacb0cf1..08ed8cd7f1dd03 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -4246,7 +4246,6 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { struct udma_dev *ud = ofdma->of_dma_data; - dma_cap_mask_t mask = ud->ddev.cap_mask; struct udma_filter_param filter_param; struct dma_chan *chan; @@ -4278,7 +4277,7 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, } } - chan = __dma_request_channel(&mask, udma_dma_filter_fn, &filter_param, + chan = __dma_request_channel(&ud->ddev.cap_mask, udma_dma_filter_fn, &filter_param, ofdma->of_node); if (!chan) { dev_err(ud->dev, "get channel fail in %s.\n", __func__); From 72d8f0935b1d51184d9f9e1adf99c62320b8fe89 Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Mon, 14 Apr 2025 19:31:13 +0200 Subject: [PATCH 300/353] dmaengine: ti: k3-udma: Add missing locking Recent kernels complain about a missing lock in k3-udma.c when the lock validator is enabled: [ 4.128073] WARNING: CPU: 0 PID: 746 at drivers/dma/ti/../virt-dma.h:169 udma_start.isra.0+0x34/0x238 [ 4.137352] CPU: 0 UID: 0 PID: 746 Comm: kworker/0:3 Not tainted 6.12.9-arm64 #28 [ 4.144867] Hardware name: pp-v12 (DT) [ 4.148648] Workqueue: events udma_check_tx_completion [ 4.153841] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.160834] pc : udma_start.isra.0+0x34/0x238 [ 4.165227] lr : udma_start.isra.0+0x30/0x238 [ 4.169618] sp : ffffffc083cabcf0 [ 4.172963] x29: ffffffc083cabcf0 x28: 0000000000000000 x27: ffffff800001b005 [ 4.180167] x26: ffffffc0812f0000 x25: 0000000000000000 x24: 0000000000000000 [ 4.187370] x23: 0000000000000001 x22: 00000000e21eabe9 x21: ffffff8000fa0670 [ 4.194571] x20: ffffff8001b6bf00 x19: ffffff8000fa0430 x18: ffffffc083b95030 [ 4.201773] x17: 0000000000000000 x16: 00000000f0000000 x15: 0000000000000048 [ 4.208976] x14: 0000000000000048 x13: 0000000000000000 x12: 0000000000000001 [ 4.216179] x11: ffffffc08151a240 x10: 0000000000003ea1 x9 : ffffffc08046ab68 [ 4.223381] x8 : ffffffc083cabac0 x7 : ffffffc081df3718 x6 : 0000000000029fc8 [ 4.230583] x5 : ffffffc0817ee6d8 x4 : 0000000000000bc0 x3 : 0000000000000000 [ 4.237784] x2 : 0000000000000000 x1 : 00000000001fffff x0 : 0000000000000000 [ 4.244986] Call trace: [ 4.247463] udma_start.isra.0+0x34/0x238 [ 4.251509] udma_check_tx_completion+0xd0/0xdc [ 4.256076] process_one_work+0x244/0x3fc [ 4.260129] process_scheduled_works+0x6c/0x74 [ 4.264610] worker_thread+0x150/0x1dc [ 4.268398] kthread+0xd8/0xe8 [ 4.271492] ret_from_fork+0x10/0x20 [ 4.275107] irq event stamp: 220 [ 4.278363] hardirqs last enabled at (219): [] _raw_spin_unlock_irq+0x38/0x50 [ 4.287183] hardirqs last disabled at (220): [] el1_dbg+0x24/0x50 [ 4.294879] softirqs last enabled at (182): [] handle_softirqs+0x1c0/0x3cc [ 4.303437] softirqs last disabled at (177): [] __do_softirq+0x1c/0x28 [ 4.311559] ---[ end trace 0000000000000000 ]--- This commit adds the missing locking. Fixes: 25dcb5dd7b7c ("dmaengine: ti: New driver for K3 UDMA") Cc: Peter Ujfalusi Cc: Vignesh Raghavendra Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ronald Wahl Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20250414173113.80677-1-rwahl@gmx.de Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 08ed8cd7f1dd03..b6255c0601bb29 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -1091,8 +1091,11 @@ static void udma_check_tx_completion(struct work_struct *work) u32 residue_diff; ktime_t time_diff; unsigned long delay; + unsigned long flags; while (1) { + spin_lock_irqsave(&uc->vc.lock, flags); + if (uc->desc) { /* Get previous residue and time stamp */ residue_diff = uc->tx_drain.residue; @@ -1127,6 +1130,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + spin_unlock_irqrestore(&uc->vc.lock, flags); + usleep_range(ktime_to_us(delay), ktime_to_us(delay) + 10); continue; @@ -1143,6 +1148,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + + spin_unlock_irqrestore(&uc->vc.lock, flags); } static irqreturn_t udma_ring_irq_handler(int irq, void *data) From 64b5dd60b595f0e4e9eca583647dccc81a72690d Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 21 Apr 2025 10:03:37 -0700 Subject: [PATCH 301/353] dmaengine: idxd: Fix allowing write() from different address spaces Check if the process submitting the descriptor belongs to the same address space as the one that opened the file, reject otherwise. Fixes: 6827738dc684 ("dmaengine: idxd: add a write() method for applications to submit work") Signed-off-by: Vinicius Costa Gomes Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20250421170337.3008875-1-dave.jiang@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 7bd031a608943c..a2fb2b84775263 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -407,6 +407,9 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO)) return -EPERM; + if (current->mm != ctx->mm) + return -EPERM; + rc = check_vma(wq, vma, __func__); if (rc < 0) return rc; @@ -473,6 +476,9 @@ static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t ssize_t written = 0; int i; + if (current->mm != ctx->mm) + return -EPERM; + for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) { int rc = idxd_submit_user_descriptor(ctx, udesc + i); @@ -493,6 +499,9 @@ static __poll_t idxd_cdev_poll(struct file *filp, struct idxd_device *idxd = wq->idxd; __poll_t out = 0; + if (current->mm != ctx->mm) + return -EPERM; + poll_wait(filp, &wq->err_queue, wait); spin_lock(&idxd->dev_lock); if (idxd->sw_err.valid) From 586d9373443dcd6383c222d84fc828cb0461087f Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Mon, 21 Apr 2025 17:12:15 +0530 Subject: [PATCH 302/353] dmaengine: ptdma: Move variable condition check to the first place and remove redundancy The variable is dereferenced without first checking if it's null, leading to the following warning: 'Variable dereferenced before check: desc.' drivers/dma/amd/ptdma/ptdma-dmaengine.c: pt_cmd_callback_work() warn: variable dereferenced before check 'desc' Therefore, move the condition check for the 'desc' variable to the first place and remove the redundant extra condition check. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/bfa0a979-ce9f-422d-92c3-34921155d048@stanley.mountain/ Fixes: 656543989457 ("dmaengine: ptdma: Utilize the AE4DMA engine's multi-queue functionality") Signed-off-by: Basavaraj Natikar Link: https://lore.kernel.org/r/20250421114215.1687073-1-Basavaraj.Natikar@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/ptdma/ptdma-dmaengine.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c index 715ac3ae067b85..81339664036f33 100644 --- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c +++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c @@ -342,6 +342,9 @@ static void pt_cmd_callback_work(void *data, int err) struct pt_dma_chan *chan; unsigned long flags; + if (!desc) + return; + dma_chan = desc->vd.tx.chan; chan = to_pt_chan(dma_chan); @@ -355,16 +358,14 @@ static void pt_cmd_callback_work(void *data, int err) desc->status = DMA_ERROR; spin_lock_irqsave(&chan->vc.lock, flags); - if (desc) { - if (desc->status != DMA_COMPLETE) { - if (desc->status != DMA_ERROR) - desc->status = DMA_COMPLETE; + if (desc->status != DMA_COMPLETE) { + if (desc->status != DMA_ERROR) + desc->status = DMA_COMPLETE; - dma_cookie_complete(tx_desc); - dma_descriptor_unmap(tx_desc); - } else { - tx_desc = NULL; - } + dma_cookie_complete(tx_desc); + dma_descriptor_unmap(tx_desc); + } else { + tx_desc = NULL; } spin_unlock_irqrestore(&chan->vc.lock, flags); From 8958b4d66492d6862c551a6ab94a5f9c099a9368 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:09 +0800 Subject: [PATCH 303/353] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_wqs Memory allocated for wqs is not freed if an error occurs during idxd_setup_wqs(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: 7c5dd23e57c1 ("dmaengine: idxd: fix wq conf_dev 'struct device' lifetime") Fixes: 700af3a0a26c ("dmaengine: idxd: add 'struct idxd_dev' as wrapper for conf_dev") Fixes: de5819b99489 ("dmaengine: idxd: track enabled workqueues in bitmap") Fixes: b0325aefd398 ("dmaengine: idxd: add WQ operation cap restriction support") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-2-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index fca1d29249998d..80fb189c9624f6 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -169,8 +169,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); if (!idxd->wq_enable_map) { - kfree(idxd->wqs); - return -ENOMEM; + rc = -ENOMEM; + goto err_bitmap; } for (i = 0; i < idxd->max_wqs; i++) { @@ -189,10 +189,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) conf_dev->bus = &dsa_bus_type; conf_dev->type = &idxd_wq_device_type; rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); - if (rc < 0) { - put_device(conf_dev); + if (rc < 0) goto err; - } mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); @@ -203,7 +201,6 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { - put_device(conf_dev); rc = -ENOMEM; goto err; } @@ -211,9 +208,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) if (idxd->hw.wq_cap.op_config) { wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); if (!wq->opcap_bmap) { - put_device(conf_dev); rc = -ENOMEM; - goto err; + goto err_opcap_bmap; } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } @@ -224,12 +220,28 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; - err: +err_opcap_bmap: + kfree(wq->wqcfg); + +err: + put_device(conf_dev); + kfree(wq); + while (--i >= 0) { wq = idxd->wqs[i]; + if (idxd->hw.wq_cap.op_config) + bitmap_free(wq->opcap_bmap); + kfree(wq->wqcfg); conf_dev = wq_confdev(wq); put_device(conf_dev); + kfree(wq); + } + bitmap_free(idxd->wq_enable_map); + +err_bitmap: + kfree(idxd->wqs); + return rc; } From 317c20f23a14c6566d7fbffb71a9b82f70f6a0d5 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:10 +0800 Subject: [PATCH 304/353] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_engines Memory allocated for engines is not freed if an error occurs during idxd_setup_engines(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: 75b911309060 ("dmaengine: idxd: fix engine conf_dev lifetime") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-3-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 80fb189c9624f6..ff6ec3c0f60456 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -275,6 +275,7 @@ static int idxd_setup_engines(struct idxd_device *idxd) rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id); if (rc < 0) { put_device(conf_dev); + kfree(engine); goto err; } @@ -288,7 +289,10 @@ static int idxd_setup_engines(struct idxd_device *idxd) engine = idxd->engines[i]; conf_dev = engine_confdev(engine); put_device(conf_dev); + kfree(engine); } + kfree(idxd->engines); + return rc; } From 0b5eca66e96208b5df4400c8ab29bf0285862180 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:11 +0800 Subject: [PATCH 305/353] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_groups Memory allocated for groups is not freed if an error occurs during idxd_setup_groups(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: defe49f96012 ("dmaengine: idxd: fix group conf_dev lifetime") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-4-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index ff6ec3c0f60456..7f0a26e2e0a5ce 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -326,6 +326,7 @@ static int idxd_setup_groups(struct idxd_device *idxd) rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id); if (rc < 0) { put_device(conf_dev); + kfree(group); goto err; } @@ -350,7 +351,10 @@ static int idxd_setup_groups(struct idxd_device *idxd) while (--i >= 0) { group = idxd->groups[i]; put_device(group_confdev(group)); + kfree(group); } + kfree(idxd->groups); + return rc; } From 62467eb5a758a090b8690c5dac696c26fe00402e Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:12 +0800 Subject: [PATCH 306/353] dmaengine: idxd: Add missing cleanup for early error out in idxd_setup_internals The idxd_setup_internals() is missing some cleanup when things fail in the middle. Add the appropriate cleanup routines: - cleanup groups - cleanup enginces - cleanup wqs to make sure it exits gracefully. Fixes: defe49f96012 ("dmaengine: idxd: fix group conf_dev lifetime") Cc: stable@vger.kernel.org Suggested-by: Fenghua Yu Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-5-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 58 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 7f0a26e2e0a5ce..a40fb2fd500618 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -155,6 +155,25 @@ static void idxd_cleanup_interrupts(struct idxd_device *idxd) pci_free_irq_vectors(pdev); } +static void idxd_clean_wqs(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + struct device *conf_dev; + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + if (idxd->hw.wq_cap.op_config) + bitmap_free(wq->opcap_bmap); + kfree(wq->wqcfg); + conf_dev = wq_confdev(wq); + put_device(conf_dev); + kfree(wq); + } + bitmap_free(idxd->wq_enable_map); + kfree(idxd->wqs); +} + static int idxd_setup_wqs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -245,6 +264,21 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return rc; } +static void idxd_clean_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + struct device *conf_dev; + int i; + + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + conf_dev = engine_confdev(engine); + put_device(conf_dev); + kfree(engine); + } + kfree(idxd->engines); +} + static int idxd_setup_engines(struct idxd_device *idxd) { struct idxd_engine *engine; @@ -296,6 +330,19 @@ static int idxd_setup_engines(struct idxd_device *idxd) return rc; } +static void idxd_clean_groups(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + kfree(group); + } + kfree(idxd->groups); +} + static int idxd_setup_groups(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -410,7 +457,7 @@ static int idxd_init_evl(struct idxd_device *idxd) static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc, i; + int rc; init_waitqueue_head(&idxd->cmd_waitq); @@ -441,14 +488,11 @@ static int idxd_setup_internals(struct idxd_device *idxd) err_evl: destroy_workqueue(idxd->wq); err_wkq_create: - for (i = 0; i < idxd->max_groups; i++) - put_device(group_confdev(idxd->groups[i])); + idxd_clean_groups(idxd); err_group: - for (i = 0; i < idxd->max_engines; i++) - put_device(engine_confdev(idxd->engines[i])); + idxd_clean_engines(idxd); err_engine: - for (i = 0; i < idxd->max_wqs; i++) - put_device(wq_confdev(idxd->wqs[i])); + idxd_clean_wqs(idxd); err_wqs: return rc; } From 2d03849cfcb4ca864bb01dade3dbb83cc5e1d7d8 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:13 +0800 Subject: [PATCH 307/353] dmaengine: idxd: Add missing cleanups in cleanup internals The idxd_cleanup_internals() function only decreases the reference count of groups, engines, and wqs but is missing the step to release memory resources. To fix this, use the cleanup helper to properly release the memory resources. Fixes: ddf742d4f3f1 ("dmaengine: idxd: Add missing cleanup for early error out in probe call") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-6-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a40fb2fd500618..f8129d2d53f1f1 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -407,14 +407,9 @@ static int idxd_setup_groups(struct idxd_device *idxd) static void idxd_cleanup_internals(struct idxd_device *idxd) { - int i; - - for (i = 0; i < idxd->max_groups; i++) - put_device(group_confdev(idxd->groups[i])); - for (i = 0; i < idxd->max_engines; i++) - put_device(engine_confdev(idxd->engines[i])); - for (i = 0; i < idxd->max_wqs; i++) - put_device(wq_confdev(idxd->wqs[i])); + idxd_clean_groups(idxd); + idxd_clean_engines(idxd); + idxd_clean_wqs(idxd); destroy_workqueue(idxd->wq); } From 6a90ccdf1cc3740e1a78d94b7920488a91ab5b20 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:14 +0800 Subject: [PATCH 308/353] dmaengine: idxd: fix memory leak in error handling path of idxd_alloc Memory allocated for idxd is not freed if an error occurs during idxd_alloc(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: a8563a33a5e2 ("dmanegine: idxd: reformat opcap output to match bitmap_parse() input") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-7-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f8129d2d53f1f1..302d8983ed8c7b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -604,28 +604,34 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type); idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) - return NULL; + goto err_ida; idxd->opcap_bmap = bitmap_zalloc_node(IDXD_MAX_OPCAP_BITS, GFP_KERNEL, dev_to_node(dev)); - if (!idxd->opcap_bmap) { - ida_free(&idxd_ida, idxd->id); - return NULL; - } + if (!idxd->opcap_bmap) + goto err_opcap; device_initialize(conf_dev); conf_dev->parent = dev; conf_dev->bus = &dsa_bus_type; conf_dev->type = idxd->data->dev_type; rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); - if (rc < 0) { - put_device(conf_dev); - return NULL; - } + if (rc < 0) + goto err_name; spin_lock_init(&idxd->dev_lock); spin_lock_init(&idxd->cmd_lock); return idxd; + +err_name: + put_device(conf_dev); + bitmap_free(idxd->opcap_bmap); +err_opcap: + ida_free(&idxd_ida, idxd->id); +err_ida: + kfree(idxd); + + return NULL; } static int idxd_enable_system_pasid(struct idxd_device *idxd) From ed05aa34fc273d74af0d061286401bbb19458e7c Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:15 +0800 Subject: [PATCH 309/353] dmaengine: idxd: fix memory leak in error handling path of idxd_pci_probe Memory allocated for idxd is not freed if an error occurs during idxd_pci_probe(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-8-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 302d8983ed8c7b..f2b5b17538c069 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -587,6 +587,17 @@ static void idxd_read_caps(struct idxd_device *idxd) idxd->hw.iaa_cap.bits = ioread64(idxd->reg_base + IDXD_IAACAP_OFFSET); } +static void idxd_free(struct idxd_device *idxd) +{ + if (!idxd) + return; + + put_device(idxd_confdev(idxd)); + bitmap_free(idxd->opcap_bmap); + ida_free(&idxd_ida, idxd->id); + kfree(idxd); +} + static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) { struct device *dev = &pdev->dev; @@ -1255,7 +1266,7 @@ int idxd_pci_probe_alloc(struct idxd_device *idxd, struct pci_dev *pdev, err: pci_iounmap(pdev, idxd->reg_base); err_iomap: - put_device(idxd_confdev(idxd)); + idxd_free(idxd); err_idxd_alloc: pci_disable_device(pdev); return rc; From 6d1b2d29b1c13c350c897f6ad6f381307a269a7b Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:16 +0800 Subject: [PATCH 310/353] dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call The remove call stack is missing idxd cleanup to free bitmap, ida and the idxd_device. Call idxd_free() helper routines to make sure we exit gracefully. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Cc: stable@vger.kernel.org Suggested-by: Vinicius Costa Gomes Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-9-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f2b5b17538c069..974b926bd93009 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1335,6 +1335,7 @@ static void idxd_remove(struct pci_dev *pdev) destroy_workqueue(idxd->wq); perfmon_pmu_remove(idxd); put_device(idxd_confdev(idxd)); + idxd_free(idxd); } static struct pci_driver idxd_pci_driver = { From 90848488611e684d4beb95d2b594b1b54665dc57 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:17 +0800 Subject: [PATCH 311/353] dmaengine: idxd: Refactor remove call with idxd_cleanup() helper The idxd_cleanup() helper cleans up perfmon, interrupts, internals and so on. Refactor remove call with the idxd_cleanup() helper to avoid code duplication. Note, this also fixes the missing put_device() for idxd groups, enginces and wqs. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Cc: stable@vger.kernel.org Suggested-by: Vinicius Costa Gomes Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-10-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 974b926bd93009..760b7d81fcd846 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1308,7 +1308,6 @@ static void idxd_shutdown(struct pci_dev *pdev) static void idxd_remove(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); - struct idxd_irq_entry *irq_entry; idxd_unregister_devices(idxd); /* @@ -1321,21 +1320,12 @@ static void idxd_remove(struct pci_dev *pdev) get_device(idxd_confdev(idxd)); device_unregister(idxd_confdev(idxd)); idxd_shutdown(pdev); - if (device_pasid_enabled(idxd)) - idxd_disable_system_pasid(idxd); idxd_device_remove_debugfs(idxd); - - irq_entry = idxd_get_ie(idxd, 0); - free_irq(irq_entry->vector, irq_entry); - pci_free_irq_vectors(pdev); + idxd_cleanup(idxd); pci_iounmap(pdev, idxd->reg_base); - if (device_user_pasid_enabled(idxd)) - idxd_disable_sva(pdev); - pci_disable_device(pdev); - destroy_workqueue(idxd->wq); - perfmon_pmu_remove(idxd); put_device(idxd_confdev(idxd)); idxd_free(idxd); + pci_disable_device(pdev); } static struct pci_driver idxd_pci_driver = { From 01ddf006cc14328cfbfa91379f57538744f92b83 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 8 May 2025 10:05:48 -0700 Subject: [PATCH 312/353] dmaengine: idxd: Fix ->poll() return value The fix to block access from different address space did not return a correct value for ->poll() change. kernel test bot reported that a return value of type __poll_t is expected rather than int. Fix to return POLLNVAL to indicate invalid request. Fixes: 8dfa57aabff6 ("dmaengine: idxd: Fix allowing write() from different address spaces") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505081851.rwD7jVxg-lkp@intel.com/ Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20250508170548.2747425-1-dave.jiang@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index a2fb2b84775263..6d12033649f817 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -500,7 +500,7 @@ static __poll_t idxd_cdev_poll(struct file *filp, __poll_t out = 0; if (current->mm != ctx->mm) - return -EPERM; + return POLLNVAL; poll_wait(filp, &wq->err_queue, wait); spin_lock(&idxd->dev_lock); From 995a5b2e44a58e79fae4490ae23c2ab494c72617 Mon Sep 17 00:00:00 2001 From: Qiu-ji Chen Date: Thu, 8 May 2025 15:36:33 +0800 Subject: [PATCH 313/353] dmaengine: mediatek: Fix a possible deadlock error in mtk_cqdma_tx_status() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a potential deadlock bug. Observe that in the mtk-cqdma.c file, functions like mtk_cqdma_issue_pending() and mtk_cqdma_free_active_desc() properly acquire the pc lock before the vc lock when handling pc and vc fields. However, mtk_cqdma_tx_status() violates this order by first acquiring the vc lock before invoking mtk_cqdma_find_active_desc(), which subsequently takes the pc lock. This reversed locking sequence (vc → pc) contradicts the established pc → vc order and creates deadlock risks. Fix the issue by moving the vc lock acquisition code from mtk_cqdma_find_active_desc() to mtk_cqdma_tx_status(). Ensure the pc lock is acquired before the vc lock in the calling function to maintain correct locking hierarchy. Note that since mtk_cqdma_find_active_desc() is a static function with only one caller (mtk_cqdma_tx_status()), this modification safely eliminates the deadlock possibility without affecting other components. This possible bug is found by an experimental static analysis tool developed by our team. This tool analyzes the locking APIs to extract function pairs that can be concurrently executed, and then analyzes the instructions in the paired functions to identify possible concurrency bugs including deadlocks, data races and atomicity violations. Fixes: b1f01e48df5a ("dmaengine: mediatek: Add MediaTek Command-Queue DMA controller for MT6765 SoC") Cc: stable@vger.kernel.org Signed-off-by: Qiu-ji Chen Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250508073634.3719-1-chenqiuji666@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/mediatek/mtk-cqdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mediatek/mtk-cqdma.c b/drivers/dma/mediatek/mtk-cqdma.c index d5ddb4e30e7150..e35271ac1eed2a 100644 --- a/drivers/dma/mediatek/mtk-cqdma.c +++ b/drivers/dma/mediatek/mtk-cqdma.c @@ -422,13 +422,10 @@ static struct virt_dma_desc *mtk_cqdma_find_active_desc(struct dma_chan *c, struct virt_dma_desc *vd; unsigned long flags; - spin_lock_irqsave(&cvc->pc->lock, flags); list_for_each_entry(vd, &cvc->pc->queue, node) if (vd->tx.cookie == cookie) { - spin_unlock_irqrestore(&cvc->pc->lock, flags); return vd; } - spin_unlock_irqrestore(&cvc->pc->lock, flags); list_for_each_entry(vd, &cvc->vc.desc_issued, node) if (vd->tx.cookie == cookie) @@ -452,9 +449,11 @@ static enum dma_status mtk_cqdma_tx_status(struct dma_chan *c, if (ret == DMA_COMPLETE || !txstate) return ret; + spin_lock_irqsave(&cvc->pc->lock, flags); spin_lock_irqsave(&cvc->vc.lock, flags); vd = mtk_cqdma_find_active_desc(c, cookie); spin_unlock_irqrestore(&cvc->vc.lock, flags); + spin_unlock_irqrestore(&cvc->pc->lock, flags); if (vd) { cvd = to_cqdma_vdesc(vd); From c3cff97751098fb33c8cbf8cdda6baf7d06b7200 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 24 Apr 2025 13:48:29 +0200 Subject: [PATCH 314/353] dmaengine: fsl-edma: Fix return code for unhandled interrupts For fsl,imx93-edma4 two DMA channels share the same interrupt. So in case fsl_edma3_tx_handler is called for the "wrong" channel, the return code must be IRQ_NONE. This signalize that the interrupt wasn't handled. Fixes: 72f5801a4e2b ("dmaengine: fsl-edma: integrate v3 support") Signed-off-by: Stefan Wahren Reviewed-by: Joy Zou Link: https://lore.kernel.org/r/20250424114829.9055-1-wahrenst@gmx.net Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 756d67325db526..66bfa28d984e1b 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -57,7 +57,7 @@ static irqreturn_t fsl_edma3_tx_handler(int irq, void *dev_id) intr = edma_readl_chreg(fsl_chan, ch_int); if (!intr) - return IRQ_HANDLED; + return IRQ_NONE; edma_writel_chreg(fsl_chan, 1, ch_int); From 26221b7a67aba8dd47c143d8c6140c8f50a84e3b Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Thu, 15 May 2025 11:42:13 +0100 Subject: [PATCH 315/353] dmaengine: mediatek: drop unused variable Commit 157ae5ffd76a dmaengine: mediatek: Fix a possible deadlock error in mtk_cqdma_tx_status() fixed locks but kept unused varibale leading to warning and build failure (due to warning treated as errors) drivers/dma/mediatek/mtk-cqdma.c: In function 'mtk_cqdma_find_active_desc': drivers/dma/mediatek/mtk-cqdma.c:423:23: error: unused variable 'flags' [-Werror=unused-variable] 423 | unsigned long flags; | ^~~~~ Fix by dropping this unused flag Reported-by: Stephen Rothwell Fixes: 157ae5ffd76a ("dmaengine: mediatek: Fix a possible deadlock error in mtk_cqdma_tx_status()") Signed-off-by: Vinod Koul --- drivers/dma/mediatek/mtk-cqdma.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/mediatek/mtk-cqdma.c b/drivers/dma/mediatek/mtk-cqdma.c index e35271ac1eed2a..47c8adfdc15504 100644 --- a/drivers/dma/mediatek/mtk-cqdma.c +++ b/drivers/dma/mediatek/mtk-cqdma.c @@ -420,7 +420,6 @@ static struct virt_dma_desc *mtk_cqdma_find_active_desc(struct dma_chan *c, { struct mtk_cqdma_vchan *cvc = to_cqdma_vchan(c); struct virt_dma_desc *vd; - unsigned long flags; list_for_each_entry(vd, &cvc->pc->queue, node) if (vd->tx.cookie == cookie) { From dba8aac48374e2cfcb40d5bce409e83b00e52c55 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 May 2025 13:57:29 -0700 Subject: [PATCH 316/353] Linux 6.15-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0eb9e6c49b32aa..a9edd030365372 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* From 1333457a60293a00188ef4a356c5ed6421760680 Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sun, 11 May 2025 12:41:41 +0530 Subject: [PATCH 317/353] PM: EM: Documentation: Fix typos in example driver code Fix the API name to free the allocated table in the example driver code that modifies the EM. Also fix the passing of correct table when updating the cost. Signed-off-by: Atul Kumar Pant Link: https://patch.msgid.link/20250511071141.13237-1-atulpant.linux@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/energy-model.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 490ddd483f46b0..cbdf7520aaa60a 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -381,17 +381,17 @@ up periodically to check the temperature and modify the EM data:: 26 rcu_read_unlock(); 27 28 /* Calculate 'cost' values for EAS */ - 29 ret = em_dev_compute_costs(dev, table, pd->nr_perf_states); + 29 ret = em_dev_compute_costs(dev, new_table, pd->nr_perf_states); 30 if (ret) { 31 dev_warn(dev, "EM: compute costs failed %d\n", ret); - 32 em_free_table(em_table); + 32 em_table_free(em_table); 33 return; 34 } 35 36 ret = em_dev_update_perf_domain(dev, em_table); 37 if (ret) { 38 dev_warn(dev, "EM: update failed %d\n", ret); - 39 em_free_table(em_table); + 39 em_table_free(em_table); 40 return; 41 } 42 From b6f9c3fed25ef5c93075b4a4838661965474f018 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:39:35 +0200 Subject: [PATCH 318/353] PM: EM: Move CPU capacity check to em_adjust_new_capacity() Move the check of the CPU capacity currently stored in the energy model against the arch_scale_cpu_capacity() value to em_adjust_new_capacity() so it will be done regardless of where the latter is called from. This will be useful when a new em_adjust_new_capacity() caller is added subsequently. While at it, move the pd local variable declaration in em_check_capacity_update() into the loop in which it is used. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/7810787.EvYhyI6sBW@rjwysocki.net --- kernel/power/energy_model.c | 40 ++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 41606247c27763..88449d4390cb20 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -725,10 +725,24 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, * Adjustment of CPU performance values after boot, when all CPUs capacites * are correctly calculated. */ -static void em_adjust_new_capacity(struct device *dev, +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, struct em_perf_domain *pd) { + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); em_table = em_table_dup(pd); if (!em_table) { @@ -744,9 +758,6 @@ static void em_adjust_new_capacity(struct device *dev, static void em_check_capacity_update(void) { cpumask_var_t cpu_done_mask; - struct em_perf_state *table; - struct em_perf_domain *pd; - unsigned long cpu_capacity; int cpu; if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { @@ -757,7 +768,7 @@ static void em_check_capacity_update(void) /* Check if CPUs capacity has changed than update EM */ for_each_possible_cpu(cpu) { struct cpufreq_policy *policy; - unsigned long em_max_perf; + struct em_perf_domain *pd; struct device *dev; if (cpumask_test_cpu(cpu, cpu_done_mask)) @@ -780,24 +791,7 @@ static void em_check_capacity_update(void) cpumask_or(cpu_done_mask, cpu_done_mask, em_span_cpus(pd)); - cpu_capacity = arch_scale_cpu_capacity(cpu); - - rcu_read_lock(); - table = em_perf_state_from_pd(pd); - em_max_perf = table[pd->nr_perf_states - 1].performance; - rcu_read_unlock(); - - /* - * Check if the CPU capacity has been adjusted during boot - * and trigger the update for new performance values. - */ - if (em_max_perf == cpu_capacity) - continue; - - pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", - cpu, cpu_capacity, em_max_perf); - - em_adjust_new_capacity(dev, pd); + em_adjust_new_capacity(cpu, dev, pd); } free_cpumask_var(cpu_done_mask); From 9c3da96298b95f1120774927fdd0fa9db3d0aada Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:41:21 +0200 Subject: [PATCH 319/353] PM: EM: Introduce em_adjust_cpu_capacity() Add a function for updating the Energy Model for a CPU after its capacity has changed, which subsequently will be used by the intel_pstate driver. An EM_PERF_DOMAIN_ARTIFICIAL check is added to em_recalc_and_update() to prevent it from calling em_compute_costs() for an "artificial" perf domain with a NULL cb parameter which would cause it to crash. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/3637203.iIbC2pHGDl@rjwysocki.net --- include/linux/energy_model.h | 2 ++ kernel/power/energy_model.c | 28 ++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index d8eabbf86a5b31..7fa1eb3cc82399 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -179,6 +179,7 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int em_dev_update_chip_binning(struct device *dev); int em_update_performance_limits(struct em_perf_domain *pd, unsigned long freq_min_khz, unsigned long freq_max_khz); +void em_adjust_cpu_capacity(unsigned int cpu); void em_rebuild_sched_domains(void); /** @@ -403,6 +404,7 @@ int em_update_performance_limits(struct em_perf_domain *pd, { return -EINVAL; } +static inline void em_adjust_cpu_capacity(unsigned int cpu) {} static inline void em_rebuild_sched_domains(void) {} #endif diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 88449d4390cb20..ea7995a25780f3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -702,10 +702,12 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, { int ret; - ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, - pd->flags); - if (ret) - goto free_em_table; + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } ret = em_dev_update_perf_domain(dev, em_table); if (ret) @@ -755,6 +757,24 @@ static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, em_recalc_and_update(dev, pd, em_table); } +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. + */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + static void em_check_capacity_update(void) { cpumask_var_t cpu_done_mask; From 91ddb3194797881f2d8c880338267d5b77000747 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 May 2025 15:23:30 +0200 Subject: [PATCH 320/353] cpufreq: Drop policy locking from cpufreq_policy_is_good_for_eas() Policy locking was added to cpufreq_policy_is_good_for_eas() by commit 4854649b1fb4 ("cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq") to address a theoretical race condition, but it turned out to introduce a circular locking dependency between the policy rwsem and sched_domains_mutex via cpuset_mutex. This leads to a board lockup on OdroidN2 that is based on the ARM64 Amlogic Meson SoC. Drop the policy locking from cpufreq_policy_is_good_for_eas() to address this issue. Fixes: 4854649b1fb4 ("cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq") Closes: https://lore.kernel.org/linux-pm/1bf3df62-0641-459f-99fc-fd511e564b84@samsung.com/ Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2806514.mvXUDI8C0e@rjwysocki.net --- drivers/cpufreq/cpufreq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 731ecfc178d87b..be727da0be4d18 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -3067,8 +3067,6 @@ static bool cpufreq_policy_is_good_for_eas(unsigned int cpu) return false; } - guard(cpufreq_policy_read)(policy); - return sugov_is_governor(policy); } From b70eb336d01f9ef2d85f9d7adf47456032e7800a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:44:30 +0200 Subject: [PATCH 321/353] cpufreq: intel_pstate: EAS support for hybrid platforms Modify intel_pstate to register EM perf domains for CPUs on hybrid platforms without SMT which causes EAS to be enabled on them when schedutil is used as the cpufreq governor (which requires intel_pstate to operate in the passive mode). This change is targeting platforms (for example, Lunar Lake) where the "little" CPUs (E-cores) are always more energy-efficient than the "big" or "performance" CPUs (P-cores) when run at the same HWP performance level, so it is sufficient to tell EAS that E-cores are always preferred (so long as there is enough spare capacity on one of them to run the given task). However, migrating tasks between CPUs of the same type too often is not desirable because it may hurt both performance and energy efficiency due to leaving warm caches behind. For this reason, register a separate perf domain for each CPU and choose the cost values for them so that the cost mostly depends on the CPU type, but there is also a small component of it depending on the performance level (utilization) which helps to balance the load between CPUs of the same type. The cost component related to the CPU type is computed with the help of the observation that the IPC metric value for a given CPU is inversely proportional to its performance-to-frequency scaling factor and the cost of running code on it can be assumed to be roughly proportional to that IPC ratio (in principle, the higher the IPC ratio, the more resources are utilized when running at a given frequency, so the cost should be higher). For all CPUs that are online at the system initialization time, EM perf domains are registered when the driver starts up, after asymmetric capacity support has been enabled. For the CPUs that become online later, EM perf domains are registered after setting the asymmetric capacity for them. Signed-off-by: Rafael J. Wysocki Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/6057101.MhkbZ0Pkbq@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 115 ++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 531abd63769613..b46fc70548319c 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -221,6 +221,7 @@ struct global_params { * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance * @suspended: Whether or not the driver has been suspended. + * @pd_registered: Set when a perf domain is registered for this CPU. * @hwp_notify_work: workqueue for HWP notifications. * * This structure stores per CPU instance data for all CPUs. @@ -260,6 +261,9 @@ struct cpudata { unsigned int sched_flags; u32 hwp_boost_min; bool suspended; +#ifdef CONFIG_ENERGY_MODEL + bool pd_registered; +#endif struct delayed_work hwp_notify_work; }; @@ -303,6 +307,7 @@ static bool hwp_is_hybrid; static struct cpufreq_driver *intel_pstate_driver __read_mostly; +#define INTEL_PSTATE_CORE_SCALING 100000 #define HYBRID_SCALING_FACTOR_ADL 78741 #define HYBRID_SCALING_FACTOR_MTL 80000 #define HYBRID_SCALING_FACTOR_LNL 86957 @@ -311,7 +316,7 @@ static int hybrid_scaling_factor; static inline int core_get_scaling(void) { - return 100000; + return INTEL_PSTATE_CORE_SCALING; } #ifdef CONFIG_ACPI @@ -948,12 +953,105 @@ static struct cpudata *hybrid_max_perf_cpu __read_mostly; */ static DEFINE_MUTEX(hybrid_capacity_lock); +#ifdef CONFIG_ENERGY_MODEL +#define HYBRID_EM_STATE_COUNT 4 + +static int hybrid_active_power(struct device *dev, unsigned long *power, + unsigned long *freq) +{ + /* + * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100% + * of the maximum capacity such that two CPUs of the same type will be + * regarded as equally attractive if the utilization of each of them + * falls into the same bin, which should prevent tasks from being + * migrated between them too often. + * + * For this purpose, return the "frequency" of 2 for the first + * performance level and otherwise leave the value set by the caller. + */ + if (!*freq) + *freq = 2; + + /* No power information. */ + *power = EM_MAX_POWER; + + return 0; +} + +static int hybrid_get_cost(struct device *dev, unsigned long freq, + unsigned long *cost) +{ + struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; + + /* + * The smaller the perf-to-frequency scaling factor, the larger the IPC + * ratio between the given CPU and the least capable CPU in the system. + * Regard that IPC ratio as the primary cost component and assume that + * the scaling factors for different CPU types will differ by at least + * 5% and they will not be above INTEL_PSTATE_CORE_SCALING. + * + * Add the freq value to the cost, so that the cost of running on CPUs + * of the same type in different "utilization bins" is different. + */ + *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; + + return 0; +} + +static bool hybrid_register_perf_domain(unsigned int cpu) +{ + static const struct em_data_callback cb + = EM_ADV_DATA_CB(hybrid_active_power, hybrid_get_cost); + struct cpudata *cpudata = all_cpu_data[cpu]; + struct device *cpu_dev; + + /* + * Registering EM perf domains without enabling asymmetric CPU capacity + * support is not really useful and one domain should not be registered + * more than once. + */ + if (!hybrid_max_perf_cpu || cpudata->pd_registered) + return false; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return false; + + if (em_dev_register_perf_domain(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, + cpumask_of(cpu), false)) + return false; + + cpudata->pd_registered = true; + + return true; +} + +static void hybrid_register_all_perf_domains(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + hybrid_register_perf_domain(cpu); +} + +static void hybrid_update_perf_domain(struct cpudata *cpu) +{ + if (cpu->pd_registered) + em_adjust_cpu_capacity(cpu->cpu); +} +#else /* !CONFIG_ENERGY_MODEL */ +static inline bool hybrid_register_perf_domain(unsigned int cpu) { return false; } +static inline void hybrid_register_all_perf_domains(void) {} +static inline void hybrid_update_perf_domain(struct cpudata *cpu) {} +#endif /* CONFIG_ENERGY_MODEL */ + static void hybrid_set_cpu_capacity(struct cpudata *cpu) { arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, cpu->capacity_perf, cpu->pstate.max_pstate_physical); + hybrid_update_perf_domain(cpu); topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); @@ -1044,6 +1142,11 @@ static void hybrid_refresh_cpu_capacity_scaling(void) guard(mutex)(&hybrid_capacity_lock); __hybrid_refresh_cpu_capacity_scaling(); + /* + * Perf domains are not registered before setting hybrid_max_perf_cpu, + * so register them all after setting up CPU capacity scaling. + */ + hybrid_register_all_perf_domains(); } static void hybrid_init_cpu_capacity_scaling(bool refresh) @@ -1071,7 +1174,7 @@ static void hybrid_init_cpu_capacity_scaling(bool refresh) hybrid_refresh_cpu_capacity_scaling(); /* * Disabling ITMT causes sched domains to be rebuilt to disable asym - * packing and enable asym capacity. + * packing and enable asym capacity and EAS. */ sched_clear_itmt_support(); } @@ -1149,6 +1252,14 @@ static void hybrid_update_capacity(struct cpudata *cpu) } hybrid_set_cpu_capacity(cpu); + /* + * If the CPU was offline to start with and it is going online for the + * first time, a perf domain needs to be registered for it if hybrid + * capacity scaling has been enabled already. In that case, sched + * domains need to be rebuilt to take the new perf domain into account. + */ + if (hybrid_register_perf_domain(cpu->cpu)) + em_rebuild_sched_domains(); unlock: mutex_unlock(&hybrid_capacity_lock); From d4c50b481a663bd2bb60aad91fc436f23011bec9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:47:53 +0200 Subject: [PATCH 322/353] cpufreq: intel_pstate: EAS: Increase cost for CPUs using L3 cache On some hybrid platforms some efficient CPUs (E-cores) are not connected to the L3 cache, but there are no other differences between them and the other E-cores that use L3. In that case, it is generally more efficient to run "light" workloads on the E-cores that do not use L3 and allow all of the cores using L3, including P-cores, to go into idle states. For this reason, slightly increase the cost for all CPUs sharing the L3 cache to make EAS prefer CPUs that do not use it to the other CPUs of the same type (if any). Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2032776.usQuhbGJ8B@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b46fc70548319c..0102c3e1441f42 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -982,6 +982,7 @@ static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; + struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(dev->id); /* * The smaller the perf-to-frequency scaling factor, the larger the IPC @@ -994,6 +995,22 @@ static int hybrid_get_cost(struct device *dev, unsigned long freq, * of the same type in different "utilization bins" is different. */ *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; + /* + * Increase the cost slightly for CPUs able to access L3 to avoid + * touching it in case some other CPUs of the same type can do the work + * without it. + */ + if (cacheinfo) { + unsigned int i; + + /* Check if L3 cache is there. */ + for (i = 0; i < cacheinfo->num_leaves; i++) { + if (cacheinfo->info_list[i].level == 3) { + *cost += 2; + break; + } + } + } return 0; } From 684d029cd12fe507836198eb0a1b181692887b8d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:48:59 +0200 Subject: [PATCH 323/353] cpufreq: intel_pstate: Document hybrid processor support Describe the support for hybrid processors in intel_pstate, including the CAS and EAS support, in the admin-guide documentation. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/1935040.CQOukoFCf9@rjwysocki.net --- Documentation/admin-guide/pm/intel_pstate.rst | 104 +++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 78fc83ed2a7efb..26e702c7016e57 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -329,6 +329,106 @@ information listed above is the same for all of the processors supporting the HWP feature, which is why ``intel_pstate`` works with all of them.] +Support for Hybrid Processors +============================= + +Some processors supported by ``intel_pstate`` contain two or more types of CPU +cores differing by the maximum turbo P-state, performance vs power characteristics, +cache sizes, and possibly other properties. They are commonly referred to as +hybrid processors. To support them, ``intel_pstate`` requires HWP to be enabled +and it assumes the HWP performance units to be the same for all CPUs in the +system, so a given HWP performance level always represents approximately the +same physical performance regardless of the core (CPU) type. + +Hybrid Processors with SMT +-------------------------- + +On systems where SMT (Simultaneous Multithreading), also referred to as +HyperThreading (HT) in the context of Intel processors, is enabled on at least +one core, ``intel_pstate`` assigns performance-based priorities to CPUs. Namely, +the priority of a given CPU reflects its highest HWP performance level which +causes the CPU scheduler to generally prefer more performant CPUs, so the less +performant CPUs are used when the other ones are fully loaded. However, SMT +siblings (that is, logical CPUs sharing one physical core) are treated in a +special way such that if one of them is in use, the effective priority of the +other ones is lowered below the priorities of the CPUs located in the other +physical cores. + +This approach maximizes performance in the majority of cases, but unfortunately +it also leads to excessive energy usage in some important scenarios, like video +playback, which is not generally desirable. While there is no other viable +choice with SMT enabled because the effective capacity and utilization of SMT +siblings are hard to determine, hybrid processors without SMT can be handled in +more energy-efficient ways. + +.. _CAS: + +Capacity-Aware Scheduling Support +--------------------------------- + +The capacity-aware scheduling (CAS) support in the CPU scheduler is enabled by +``intel_pstate`` by default on hybrid processors without SMT. CAS generally +causes the scheduler to put tasks on a CPU so long as there is a sufficient +amount of spare capacity on it, and if the utilization of a given task is too +high for it, the task will need to go somewhere else. + +Since CAS takes CPU capacities into account, it does not require CPU +prioritization and it allows tasks to be distributed more symmetrically among +the more performant and less performant CPUs. Once placed on a CPU with enough +capacity to accommodate it, a task may just continue to run there regardless of +whether or not the other CPUs are fully loaded, so on average CAS reduces the +utilization of the more performant CPUs which causes the energy usage to be more +balanced because the more performant CPUs are generally less energy-efficient +than the less performant ones. + +In order to use CAS, the scheduler needs to know the capacity of each CPU in +the system and it needs to be able to compute scale-invariant utilization of +CPUs, so ``intel_pstate`` provides it with the requisite information. + +First of all, the capacity of each CPU is represented by the ratio of its highest +HWP performance level, multiplied by 1024, to the highest HWP performance level +of the most performant CPU in the system, which works because the HWP performance +units are the same for all CPUs. Second, the frequency-invariance computations, +carried out by the scheduler to always express CPU utilization in the same units +regardless of the frequency it is currently running at, are adjusted to take the +CPU capacity into account. All of this happens when ``intel_pstate`` has +registered itself with the ``CPUFreq`` core and it has figured out that it is +running on a hybrid processor without SMT. + +Energy-Aware Scheduling Support +------------------------------- + +If ``CONFIG_ENERGY_MODEL`` has been set during kernel configuration and +``intel_pstate`` runs on a hybrid processor without SMT, in addition to enabling +`CAS `_ it registers an Energy Model for the processor. This allows the +Energy-Aware Scheduling (EAS) support to be enabled in the CPU scheduler if +``schedutil`` is used as the ``CPUFreq`` governor which requires ``intel_pstate`` +to operate in the `passive mode `_. + +The Energy Model registered by ``intel_pstate`` is artificial (that is, it is +based on abstract cost values and it does not include any real power numbers) +and it is relatively simple to avoid unnecessary computations in the scheduler. +There is a performance domain in it for every CPU in the system and the cost +values for these performance domains have been chosen so that running a task on +a less performant (small) CPU appears to be always cheaper than running that +task on a more performant (big) CPU. However, for two CPUs of the same type, +the cost difference depends on their current utilization, and the CPU whose +current utilization is higher generally appears to be a more expensive +destination for a given task. This helps to balance the load among CPUs of the +same type. + +Since EAS works on top of CAS, high-utilization tasks are always migrated to +CPUs with enough capacity to accommodate them, but thanks to EAS, low-utilization +tasks tend to be placed on the CPUs that look less expensive to the scheduler. +Effectively, this causes the less performant and less loaded CPUs to be +preferred as long as they have enough spare capacity to run the given task +which generally leads to reduced energy usage. + +The Energy Model created by ``intel_pstate`` can be inspected by looking at +the ``energy_model`` directory in ``debugfs`` (typlically mounted on +``/sys/kernel/debug/``). + + User Space Interface in ``sysfs`` ================================= @@ -697,8 +797,8 @@ of them have to be prepended with the ``intel_pstate=`` prefix. Limits`_ for details). ``no_cas`` - Do not enable capacity-aware scheduling (CAS) which is enabled by - default on hybrid systems. + Do not enable `capacity-aware scheduling `_ which is enabled by + default on hybrid systems without SMT. Diagnostics and Tuning ====================== From 981d2ccf50f6b82c72e85ee7ff44983209f1fd01 Mon Sep 17 00:00:00 2001 From: Seyediman Seyedarab Date: Mon, 12 May 2025 21:57:26 -0400 Subject: [PATCH 324/353] cpufreq: drop redundant cpus_read_lock() from store_local_boost() Lockdep reports a possible circular locking dependency [1] when cpu_hotplug_lock is acquired inside store_local_boost(), after policy->rwsem has already been taken by store(). However, the boost update is strictly per-policy and does not access shared state or iterate over all policies. Since policy->rwsem is already held, this is enough to serialize against concurrent topology changes for the current policy. Remove the cpus_read_lock() to resolve the lockdep warning and avoid unnecessary locking. [1] ====================================================== WARNING: possible circular locking dependency detected 6.15.0-rc6-debug-gb01fc4eca73c #1 Not tainted ------------------------------------------------------ power-profiles-/588 is trying to acquire lock: ffffffffb3a7d910 (cpu_hotplug_lock){++++}-{0:0}, at: store_local_boost+0x56/0xd0 but task is already holding lock: ffff8b6e5a12c380 (&policy->rwsem){++++}-{4:4}, at: store+0x37/0x90 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&policy->rwsem){++++}-{4:4}: down_write+0x29/0xb0 cpufreq_online+0x7e8/0xa40 cpufreq_add_dev+0x82/0xa0 subsys_interface_register+0x148/0x160 cpufreq_register_driver+0x15d/0x260 amd_pstate_register_driver+0x36/0x90 amd_pstate_init+0x1e7/0x270 do_one_initcall+0x68/0x2b0 kernel_init_freeable+0x231/0x270 kernel_init+0x15/0x130 ret_from_fork+0x2c/0x50 ret_from_fork_asm+0x11/0x20 -> #1 (subsys mutex#3){+.+.}-{4:4}: __mutex_lock+0xc2/0x930 subsys_interface_register+0x7f/0x160 cpufreq_register_driver+0x15d/0x260 amd_pstate_register_driver+0x36/0x90 amd_pstate_init+0x1e7/0x270 do_one_initcall+0x68/0x2b0 kernel_init_freeable+0x231/0x270 kernel_init+0x15/0x130 ret_from_fork+0x2c/0x50 ret_from_fork_asm+0x11/0x20 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x10ed/0x1850 lock_acquire.part.0+0x69/0x1b0 cpus_read_lock+0x2a/0xc0 store_local_boost+0x56/0xd0 store+0x50/0x90 kernfs_fop_write_iter+0x132/0x200 vfs_write+0x2b3/0x590 ksys_write+0x74/0xf0 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x56/0x5e Signed-off-by: Seyediman Seyedarab Link: https://patch.msgid.link/20250513015726.1497-1-ImanDevel@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index be727da0be4d18..cb475ac5b618e1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -622,10 +622,7 @@ static ssize_t store_local_boost(struct cpufreq_policy *policy, if (!policy->boost_supported) return -EINVAL; - cpus_read_lock(); ret = policy_set_boost(policy, enable); - cpus_read_unlock(); - if (!ret) return count; From 63e97dd668a76b671edc97979366db1334b13600 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 12 May 2025 23:28:55 +0200 Subject: [PATCH 325/353] cpufreq/amd-pstate: Avoid shadowing ret in amd_pstate_ut_check_driver() Clang warns (or errors with CONFIG_WERROR=y): drivers/cpufreq/amd-pstate-ut.c:262:6: error: variable 'ret' is uninitialized when used here [-Werror,-Wuninitialized] 262 | if (ret) | ^~~ ret is declared at the top of the function and in the for loop so the initialization of ret is local to the loop. Remove the declaration in the for loop so that ret is always used initialized. Fixes: d26d16438bc5 ("amd-pstate-ut: Reset amd-pstate driver mode after running selftests") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505072226.g2QclhaR-lkp@intel.com/ Signed-off-by: Nathan Chancellor Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20250512-amd-pstate-ut-uninit-ret-v1-1-fcb4104f502e@kernel.org Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 30835d0e499403..65f9d2bae2d361 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -246,7 +246,7 @@ static int amd_pstate_ut_check_driver(u32 index) int ret; for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { - int ret = amd_pstate_set_mode(mode1); + ret = amd_pstate_set_mode(mode1); if (ret) return ret; for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) { From 193d4de0391a412d1c0ac98aba47ba66bc881c86 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 15 May 2025 12:11:25 +0530 Subject: [PATCH 326/353] PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn() pm_runtime_put_autosuspend() schedules a hrtimer to expire at "dev->power.timer_expires". If the hrtimer's callback, pm_suspend_timer_fn(), observes that the current time equals "dev->power.timer_expires", it unexpectedly bails out instead of proceeding with runtime suspend. pm_suspend_timer_fn(): if (expires > 0 && expires < ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(..) } Additionally, as ->timer_expires is not cleared, all the future auto suspend requests will not schedule hrtimer to perform auto suspend. rpm_suspend(): if ((rpmflags & RPM_AUTO) &&...) { if (!(dev->power.timer_expires && ...) { <-- this will fail. hrtimer_start_range_ns(&dev->power.suspend_timer,...); } } Fix this by as well checking if current time reaches the set expiration. Co-developed-by: Patrick Daly Signed-off-by: Patrick Daly Signed-off-by: Charan Teja Kalla Link: https://patch.msgid.link/20250515064125.1211561-1-quic_charante@quicinc.com Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 205a4f8828b0ac..c55a7c70bc1a88 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1011,7 +1011,7 @@ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) * If 'expires' is after the current time, we've been called * too early. */ - if (expires > 0 && expires < ktime_get_mono_fast_ns()) { + if (expires > 0 && expires <= ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); From f15c866d32f79abcf97598f39a7e8f42feaeb8ad Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Mon, 12 May 2025 15:26:53 +0200 Subject: [PATCH 327/353] ucsi_ccg: Disable async suspend in ucsi_ccg_probe() Commit aa7a9275ab81 ("PM: sleep: Suspend async parents after suspending children") had triggered a suspend issue on Tegra boards because it had reordered the syspend of devices with async suspend enabled with respect to some other devices. Specifically, the devices with async suspend enabled that have no children are now suspended before any other devices unless there are device links pointing to them as suppliers. The investigation that followed the failure report uncovered that async suspend was enabled for the cypd4226 device that was a Type-C controller with a dependency on USB PHY and it turned out that disabling async suspend for that device made the issue go away. Since async suspend takes dependencies between parents and children into account as well as other dependencies between devices represented by device links, this means that the cypd4226 has a dependency on another device that is not represented in any form in the kernel (a "hidden" dependency), in which case async suspend should not be enabled for it. Accordingly, make ucsi_ccg_probe() disable async suspend for the devices handled by, which covers the cypd4226 device on the Tegra boards as well as other devices likely to have similar "hidden" dependencies. Fixes: aa7a9275ab81 ("PM: sleep: Suspend async parents after suspending children") Closes: https://lore.kernel.org/linux-pm/c6cd714b-b0eb-42fc-b9b5-4f5f396fb4ec@nvidia.com/ Reported-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Jon Hunter Signed-off-by: Rafael J. Wysocki Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/6180608.lOV4Wx5bFT@rjwysocki.net --- drivers/usb/typec/ucsi/ucsi_ccg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index f01e4ef6619d0a..e9a9df1431af3a 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -1483,6 +1483,8 @@ static int ucsi_ccg_probe(struct i2c_client *client) i2c_set_clientdata(client, uc); + device_disable_async_suspend(uc->dev); + pm_runtime_set_active(uc->dev); pm_runtime_enable(uc->dev); pm_runtime_use_autosuspend(uc->dev); From 84a16adb12e3b8f32c684aee5a9db09dbc1ac458 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 May 2025 14:51:47 +0200 Subject: [PATCH 328/353] PM: sleep: Print PM debug messages during hibernation Commit cdb8c100d8a4 ("include/linux/suspend.h: Only show pm_pr_dbg messages at suspend/resume") caused PM debug messages to only be printed during system-wide suspend and resume in progress, but it forgot about hibernation. Address this by adding a check for hibernation in progress to pm_debug_messages_should_print(). Fixes: cdb8c100d8a4 ("include/linux/suspend.h: Only show pm_pr_dbg messages at suspend/resume") Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/4998903.GXAFRqVoOG@rjwysocki.net --- kernel/power/hibernate.c | 5 +++++ kernel/power/main.c | 3 ++- kernel/power/power.h | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4db2e82fd8733..c503baf170aff6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -90,6 +90,11 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && diff --git a/kernel/power/main.c b/kernel/power/main.c index 97746f08b762b4..fb20a6e26fb6ae 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -613,7 +613,8 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; + return pm_debug_messages_on && (hibernation_in_progress() || + pm_suspend_target_state != PM_SUSPEND_ON); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); diff --git a/kernel/power/power.h b/kernel/power/power.h index c352dea2f67b56..f8496f40b54fa5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -71,10 +71,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ From f6b1e2b67503163e7d896980e266cfe846f46d97 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 May 2025 15:02:27 +0200 Subject: [PATCH 329/353] PM: sleep: Introduce pm_suspend_in_progress() Introduce pm_suspend_in_progress() to be used for checking if a system- wide suspend or resume transition is in progress, instead of comparing pm_suspend_target_state directly to PM_SUSPEND_ON, and use it where applicable. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Rodrigo Vivi Acked-by: Rodrigo Vivi Reviewed-by: Raag Jadav Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2020901.PYKUYFuaPT@rjwysocki.net --- arch/x86/pci/fixup.c | 4 ++-- drivers/base/power/wakeup.c | 2 +- drivers/gpu/drm/xe/xe_pm.c | 2 +- include/linux/suspend.h | 5 +++++ kernel/power/main.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index efefeb82ab61d0..e8ee1afa1992a4 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -970,13 +970,13 @@ static void amd_rp_pme_suspend(struct pci_dev *dev) struct pci_dev *rp; /* - * PM_SUSPEND_ON means we're doing runtime suspend, which means + * If system suspend is not in progress, we're doing runtime suspend, so * amd-pmc will not be involved so PMEs during D3 work as advertised. * * The PMEs *do* work if amd-pmc doesn't put the SoC in the hardware * sleep state, but we assume amd-pmc is always present. */ - if (pm_suspend_target_state == PM_SUSPEND_ON) + if (!pm_suspend_in_progress()) return; rp = pcie_find_root_port(dev); diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 63bf914a4d4467..6f6f309817f4b6 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -337,7 +337,7 @@ int device_wakeup_enable(struct device *dev) if (!dev || !dev->power.can_wakeup) return -EINVAL; - if (pm_suspend_target_state != PM_SUSPEND_ON) + if (pm_suspend_in_progress()) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev, dev_name(dev)); diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index 7b6b754ad6eb78..cb7fbf74138e7a 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -641,7 +641,7 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe) return dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || - pm_suspend_target_state != PM_SUSPEND_ON; + pm_suspend_in_progress(); #else return false; #endif diff --git a/include/linux/suspend.h b/include/linux/suspend.h index da6ebca3ff774c..52ea108f945197 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -298,6 +298,11 @@ static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ +static inline bool pm_suspend_in_progress(void) +{ + return pm_suspend_target_state != PM_SUSPEND_ON; +} + /* struct pbe is used for creating lists of pages that should be restored * atomically during the resume from disk, because the page frames they have * occupied before the suspend are in use. diff --git a/kernel/power/main.c b/kernel/power/main.c index fb20a6e26fb6ae..8c26241c672428 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -613,8 +613,8 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && (hibernation_in_progress() || - pm_suspend_target_state != PM_SUSPEND_ON); + return pm_debug_messages_on && (pm_suspend_in_progress() || + hibernation_in_progress()); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); From 0fc488069014fefdf167001c10deb57f8a014cf6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 May 2025 15:03:35 +0200 Subject: [PATCH 330/353] PM: sleep: Introduce pm_sleep_transition_in_progress() The "suspend in progress" check in device_wakeup_enable() does not cover hibernation, but arguably it should do that, so introduce pm_sleep_transition_in_progress() covering transitions during both system suspend and hibernation to use in there and use it also in pm_debug_messages_should_print(). Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/7820474.EvYhyI6sBW@rjwysocki.net [ rjw: Move the new function definition under CONFIG_PM_SLEEP ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 2 +- include/linux/suspend.h | 4 ++++ kernel/power/main.c | 7 +++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 6f6f309817f4b6..7e612977be1b4c 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -337,7 +337,7 @@ int device_wakeup_enable(struct device *dev) if (!dev || !dev->power.can_wakeup) return -EINVAL; - if (pm_suspend_in_progress()) + if (pm_sleep_transition_in_progress()) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev, dev_name(dev)); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 52ea108f945197..b1c76c8f2c8220 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -475,6 +475,8 @@ extern void pm_print_active_wakeup_sources(void); extern unsigned int lock_system_sleep(void); extern void unlock_system_sleep(unsigned int); +extern bool pm_sleep_transition_in_progress(void); + #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) @@ -503,6 +505,8 @@ static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline unsigned int lock_system_sleep(void) { return 0; } static inline void unlock_system_sleep(unsigned int flags) {} +static inline bool pm_sleep_transition_in_progress(void) { return false; } + #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG diff --git a/kernel/power/main.c b/kernel/power/main.c index 8c26241c672428..8d17de9c84162e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -557,6 +557,10 @@ static int __init pm_debugfs_init(void) late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -613,8 +617,7 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && (pm_suspend_in_progress() || - hibernation_in_progress()); + return pm_debug_messages_on && pm_sleep_transition_in_progress(); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); From 879286ecf9ccdd966ce26ccb0b4139b77f14ff4b Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Sun, 11 May 2025 19:46:47 +0200 Subject: [PATCH 331/353] PM: freezer: Rewrite restarting tasks log to remove stray *done.* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `pr_cont()` unfortunately does not work here, as other parts of the Linux kernel log between the two log lines: [18445.295056] r8152-cfgselector 4-1.1.3: USB disconnect, device number 5 [18445.295112] OOM killer enabled. [18445.295115] Restarting tasks ... [18445.295185] usb 3-1: USB disconnect, device number 2 [18445.295193] usb 3-1.1: USB disconnect, device number 3 [18445.296262] usb 3-1.5: USB disconnect, device number 4 [18445.297017] done. [18445.297029] random: crng reseeded on system resumption `pr_cont()` also uses the default log level, normally warning, if the corresponding log line is interrupted. Therefore, replace the `pr_cont()`, and explicitly log it as a separate line with log level info: Restarting tasks: Starting […] Restarting tasks: Done Signed-off-by: Paul Menzel Link: https://patch.msgid.link/20250511174648.950430-1-pmenzel@molgen.mpg.de [ rjw: Rebase on top of an earlier analogous change ] Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 4c674282df0313..dc0dfc349f22b4 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -189,7 +189,7 @@ void thaw_processes(void) oom_killer_enable(); - pr_info("Restarting tasks ...\n"); + pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -208,7 +208,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - pr_info("Done restarting tasks.\n"); + pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } From eef68a9263a9fb2bba7ad12020a843e39e728135 Mon Sep 17 00:00:00 2001 From: "Francesco Poli (wintermute)" Date: Fri, 25 Apr 2025 17:07:31 +0200 Subject: [PATCH 332/353] cpupower: add a systemd service to run cpupower One of the most typical use cases of the 'cpupower' utility works as follows: run 'cpupower' at boot with the desired command-line options and then forget about it. Add a systemd service (disabled by default) that automates this use case (for environments where the initialization system is 'systemd'), by running 'cpupower' at boot with the settings read from a default configuration file. The systemd service, the associated support script and the corresponding default configuration file are derived from what is provided by the Arch Linux package (under "GPL-2.0-or-later" terms), modernized and enhanced in various ways (the script has also been checked with 'shellcheck'). Link: https://gitlab.archlinux.org/archlinux/packaging/packages/linux-tools/-/tree/dd2e2a311e05413d0d87a0346ffce8c7e98d6d2b Signed-off-by: Francesco Poli (wintermute) Reviewed-by: John B. Wyatt IV Reviewed-by: John B. Wyatt IV Tested-by: John B. Wyatt IV Tested-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 15 +++++++++++++ tools/power/cpupower/README | 19 ++++++++++++++++ tools/power/cpupower/cpupower.default | 28 ++++++++++++++++++++++++ tools/power/cpupower/cpupower.service.in | 16 ++++++++++++++ tools/power/cpupower/cpupower.sh | 26 ++++++++++++++++++++++ 5 files changed, 104 insertions(+) create mode 100644 tools/power/cpupower/cpupower.default create mode 100644 tools/power/cpupower/cpupower.service.in create mode 100644 tools/power/cpupower/cpupower.sh diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 835123add0ed1a..9c2b5f71fee1cc 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -2,6 +2,7 @@ # Makefile for cpupower # # Copyright (C) 2005,2006 Dominik Brodowski +# Copyright (C) 2025 Francesco Poli # # Based largely on the Makefile for udev by: # @@ -71,6 +72,7 @@ bindir ?= /usr/bin sbindir ?= /usr/sbin mandir ?= /usr/man libdir ?= /usr/lib +libexecdir ?= /usr/libexec includedir ?= /usr/include localedir ?= /usr/share/locale docdir ?= /usr/share/doc/packages/cpupower @@ -83,6 +85,7 @@ CP = cp -fpR INSTALL = /usr/bin/install -c INSTALL_PROGRAM = ${INSTALL} INSTALL_DATA = ${INSTALL} -m 644 +SETPERM_DATA = chmod 644 #bash completion scripts get sourced and so they should be rw only. INSTALL_SCRIPT = ${INSTALL} -m 644 @@ -302,6 +305,14 @@ install-tools: $(OUTPUT)cpupower $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} $(INSTALL_SCRIPT) cpupower-completion.sh '$(DESTDIR)${bash_completion_dir}/cpupower' + $(INSTALL) -d $(DESTDIR)${confdir}default + $(INSTALL_DATA) cpupower.default '$(DESTDIR)${confdir}default/cpupower' + $(INSTALL) -d $(DESTDIR)${libexecdir} + $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' + $(INSTALL) -d $(DESTDIR)${libdir}/systemd/system + sed 's|___CDIR___|$(DESTDIR)${confdir}|; s|___LDIR___|$(DESTDIR)${libexecdir}|' cpupower.service.in > '$(DESTDIR)${libdir}/systemd/system/cpupower.service' + $(SETPERM_DATA) '$(DESTDIR)${libdir}/systemd/system/cpupower.service' + if test -d /run/systemd/system; then systemctl daemon-reload; fi install-man: $(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1 @@ -336,6 +347,9 @@ uninstall: - rm -f $(DESTDIR)${includedir}/cpufreq.h - rm -f $(DESTDIR)${includedir}/cpuidle.h - rm -f $(DESTDIR)${bindir}/utils/cpupower + - rm -f $(DESTDIR)${confdir}default/cpupower + - rm -f $(DESTDIR)${libexecdir}/cpupower + - rm -f $(DESTDIR)${libdir}/systemd/system/cpupower.service - rm -f $(DESTDIR)${mandir}/man1/cpupower.1 - rm -f $(DESTDIR)${mandir}/man1/cpupower-frequency-set.1 - rm -f $(DESTDIR)${mandir}/man1/cpupower-frequency-info.1 @@ -346,6 +360,7 @@ uninstall: - for HLANG in $(LANGUAGES); do \ rm -f $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo; \ done; + - if test -d /run/systemd/system; then systemctl daemon-reload; fi help: @echo 'Building targets:' diff --git a/tools/power/cpupower/README b/tools/power/cpupower/README index 2678ed81d311f3..e6ae7c1e0a0d7d 100644 --- a/tools/power/cpupower/README +++ b/tools/power/cpupower/README @@ -59,6 +59,10 @@ $ sudo make install ----------------------------------------------------------------------- | man pages | /usr/man | ----------------------------------------------------------------------- +| systemd service | /usr/lib | +----------------------------------------------------------------------- +| systemd support script | /usr/libexec | +----------------------------------------------------------------------- To put it in other words it makes build results available system-wide, enabling any user to simply start using it without any additional steps @@ -109,6 +113,10 @@ The files will be installed to the following dirs: ----------------------------------------------------------------------- | man pages | ${DESTDIR}/usr/man | ----------------------------------------------------------------------- +| systemd service | ${DESTDIR}/usr/lib | +----------------------------------------------------------------------- +| systemd support script | ${DESTDIR}/usr/libexec | +----------------------------------------------------------------------- If you look at the table for the default 'make' output dirs you will notice that the only difference with the non-default case is the @@ -173,6 +181,17 @@ The issue is that binary cannot find the 'libcpupower' library. So, we shall point to the lib dir: sudo LD_LIBRARY_PATH=lib64/ ./bin/cpupower +systemd service +--------------- + +A systemd service is also provided to run the cpupower utility at boot with +settings read from a configuration file. In order to enable this systemd +service, edit '${DESTDIR}/etc/default/cpupower' (uncommenting at least one of +the options, depending on your preferences) and then issue the following +command: + +$ sudo systemctl enable --now cpupower.service + THANKS ------ diff --git a/tools/power/cpupower/cpupower.default b/tools/power/cpupower/cpupower.default new file mode 100644 index 00000000000000..376ca40fe5a6a9 --- /dev/null +++ b/tools/power/cpupower/cpupower.default @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2012, Sébastien Luttringer +# Copyright (C) 2024-2025, Francesco Poli + +# Default file for linux-cpupower + +# --- CPU clock frequency --- + +# Define CPU governor +# Valid governors: ondemand, performance, powersave, conservative, userspace +#GOVERNOR='ondemand' + +# Limit frequency range +# Valid suffixes: Hz, kHz (default), MHz, GHz, THz +#MIN_FREQ="2.25GHz" +#MAX_FREQ="3GHz" + +# Specific frequency to be set. +# Requires userspace governor to be available. +# If this option is set, all the previous frequency options are ignored +#FREQ= + +# --- CPU policy --- + +# Sets a register on supported Intel processore which allows software to convey +# its policy for the relative importance of performance versus energy savings to +# the processor. See man CPUPOWER-SET(1) for additional details +#PERF_BIAS= diff --git a/tools/power/cpupower/cpupower.service.in b/tools/power/cpupower/cpupower.service.in new file mode 100644 index 00000000000000..f91eaed0387208 --- /dev/null +++ b/tools/power/cpupower/cpupower.service.in @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2012-2020, Sébastien Luttringer +# Copyright (C) 2024, Francesco Poli + +[Unit] +Description=Apply cpupower configuration +ConditionVirtualization=!container + +[Service] +Type=oneshot +EnvironmentFile=-___CDIR___default/cpupower +ExecStart=___LDIR___/cpupower +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/tools/power/cpupower/cpupower.sh b/tools/power/cpupower/cpupower.sh new file mode 100644 index 00000000000000..a37dd4cfdb2b9d --- /dev/null +++ b/tools/power/cpupower/cpupower.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2012, Sébastien Luttringer +# Copyright (C) 2024, Francesco Poli + +ESTATUS=0 + +# apply CPU clock frequency options +if test -n "$FREQ" +then + cpupower frequency-set -f "$FREQ" > /dev/null || ESTATUS=1 +elif test -n "${GOVERNOR}${MIN_FREQ}${MAX_FREQ}" +then + cpupower frequency-set \ + ${GOVERNOR:+ -g "$GOVERNOR"} \ + ${MIN_FREQ:+ -d "$MIN_FREQ"} ${MAX_FREQ:+ -u "$MAX_FREQ"} \ + > /dev/null || ESTATUS=1 +fi + +# apply CPU policy options +if test -n "$PERF_BIAS" +then + cpupower set -b "$PERF_BIAS" > /dev/null || ESTATUS=1 +fi + +exit $ESTATUS From 9f29c14adcd7c11794a349e0cdeabbe0dfc7f6db Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Tue, 29 Apr 2025 16:47:10 -0400 Subject: [PATCH 333/353] cpupower: change binding's makefile to use -lcpupower Originally I believed I needed the .o files to make the bindings. The linking failed due to a missing .so link in Fedora or by using make install-lib from the cpupower directory. Amend the makefile and the README. Big thanks to Wander Lairson Costa for the help. Link: https://lore.kernel.org/r/20250429204711.127274-1-jwyatt@redhat.com Signed-off-by: "John B. Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 8 +++----- tools/power/cpupower/bindings/python/README | 13 ++++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index 741f21477432a8..81db39a03efba5 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -1,22 +1,20 @@ # SPDX-License-Identifier: GPL-2.0-only # Makefile for libcpupower's Python bindings # -# This Makefile expects you have already run the makefile for cpupower to build -# the .o files in the lib directory for the bindings to be created. +# This Makefile expects you have already run `make install-lib` in the lib +# directory for the bindings to be created. CC := gcc HAVE_SWIG := $(shell if which swig >/dev/null 2>&1; then echo 1; else echo 0; fi) HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; else echo 0; fi) -LIB_DIR := ../../lib PY_INCLUDE = $(firstword $(shell python-config --includes)) -OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) INSTALL_DIR = $(shell python3 -c "import site; print(site.getsitepackages()[0])") all: _raw_pylibcpupower.so _raw_pylibcpupower.so: raw_pylibcpupower_wrap.o - $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so + $(CC) -shared -lcpupower raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README index 952e2e02fd32bb..2a4896b648b7d9 100644 --- a/tools/power/cpupower/bindings/python/README +++ b/tools/power/cpupower/bindings/python/README @@ -5,18 +5,21 @@ libcpupower (aside from the libcpupower object files). requirements ------------ -* You need the object files in the libcpupower directory compiled by -cpupower's makefile. +* If you are building completely from upstream; please install libcpupower by +running `make install-lib` within the cpupower directory. This installs the +libcpupower.so file and symlinks needed. Otherwise, please make sure a symlink +to libcpupower.so exists in your library path from your distribution's +packages. * The SWIG program must be installed. -* The Python's development libraries installed. +* The Python's development libraries must be installed. Please check that your version of SWIG is compatible with the version of Python installed on your machine by checking the SWIG changelog on their website. https://swig.org/ Note that while SWIG itself is GPL v3+ licensed; the resulting output, -the bindings code: is permissively licensed + the license of libcpupower's .o -files. For these bindings that means GPL v2. +the bindings code: is permissively licensed + the license of libcpupower's +library files. For these bindings that means GPL v2. Please see https://swig.org/legal.html and the discussion [1] for more details. From a13dfe4a6128668e05aab60bb2eaa9821df8bc2d Mon Sep 17 00:00:00 2001 From: "Francesco Poli (wintermute)" Date: Tue, 13 May 2025 18:29:31 +0200 Subject: [PATCH 334/353] cpupower: do not write DESTDIR to cpupower.service Fix the use of DESTDIR variable in the Makefile, as far as the installation of the systemd service unit 'cpupower.service' is concerned. This was caused by a misunderstanding about the purpose of the DESTDIR variable in the Makefile, which is instead meant to support staged installations: its value should not end up into installed file contents. Link: https://lore.kernel.org/linux-pm/20250509002206.bd2519ba52035d47c3c32aa6@paranoici.org/T/#mfbb938f9c0d5a21173acb92a061eb9205fd0abfe Link: https://www.gnu.org/prep/standards/html_node/DESTDIR.html Link: https://lore.kernel.org/r/20250513163937.61062-3-invernomuto@paranoici.org Fixes: 9c70b779ad91 ("cpupower: add a systemd service to run cpupower") Signed-off-by: Francesco Poli (wintermute) Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 9c2b5f71fee1cc..4ad931509eaacc 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -310,7 +310,7 @@ install-tools: $(OUTPUT)cpupower $(INSTALL) -d $(DESTDIR)${libexecdir} $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' $(INSTALL) -d $(DESTDIR)${libdir}/systemd/system - sed 's|___CDIR___|$(DESTDIR)${confdir}|; s|___LDIR___|$(DESTDIR)${libexecdir}|' cpupower.service.in > '$(DESTDIR)${libdir}/systemd/system/cpupower.service' + sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${libdir}/systemd/system/cpupower.service' $(SETPERM_DATA) '$(DESTDIR)${libdir}/systemd/system/cpupower.service' if test -d /run/systemd/system; then systemctl daemon-reload; fi From b71eb8f3f3cc517ad3f23e3d0d1d9fb6b50d0af7 Mon Sep 17 00:00:00 2001 From: "Francesco Poli (wintermute)" Date: Tue, 13 May 2025 18:29:32 +0200 Subject: [PATCH 335/353] cpupower: do not call systemctl at install time Fix the installation procedure for the systemd service unit 'cpupower.service'. Do not call "systemctl daemon-reload" in the Makefile, but explain when this command should be manually issued in the README file. Link: https://lore.kernel.org/linux-pm/20250509002206.bd2519ba52035d47c3c32aa6@paranoici.org/T/#mfbb938f9c0d5a21173acb92a061eb9205fd0abfe Link: https://lore.kernel.org/r/20250513163937.61062-4-invernomuto@paranoici.org Fixes: 9c70b779ad91 ("cpupower: add a systemd service to run cpupower") Signed-off-by: Francesco Poli (wintermute) Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 2 -- tools/power/cpupower/README | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 4ad931509eaacc..7cec2c30f98a97 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -312,7 +312,6 @@ install-tools: $(OUTPUT)cpupower $(INSTALL) -d $(DESTDIR)${libdir}/systemd/system sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${libdir}/systemd/system/cpupower.service' $(SETPERM_DATA) '$(DESTDIR)${libdir}/systemd/system/cpupower.service' - if test -d /run/systemd/system; then systemctl daemon-reload; fi install-man: $(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1 @@ -360,7 +359,6 @@ uninstall: - for HLANG in $(LANGUAGES); do \ rm -f $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo; \ done; - - if test -d /run/systemd/system; then systemctl daemon-reload; fi help: @echo 'Building targets:' diff --git a/tools/power/cpupower/README b/tools/power/cpupower/README index e6ae7c1e0a0d7d..494104de154055 100644 --- a/tools/power/cpupower/README +++ b/tools/power/cpupower/README @@ -59,7 +59,7 @@ $ sudo make install ----------------------------------------------------------------------- | man pages | /usr/man | ----------------------------------------------------------------------- -| systemd service | /usr/lib | +| systemd service | /usr/lib/systemd/system | ----------------------------------------------------------------------- | systemd support script | /usr/libexec | ----------------------------------------------------------------------- @@ -113,7 +113,7 @@ The files will be installed to the following dirs: ----------------------------------------------------------------------- | man pages | ${DESTDIR}/usr/man | ----------------------------------------------------------------------- -| systemd service | ${DESTDIR}/usr/lib | +| systemd service | ${DESTDIR}/usr/lib/systemd/system | ----------------------------------------------------------------------- | systemd support script | ${DESTDIR}/usr/libexec | ----------------------------------------------------------------------- @@ -185,11 +185,20 @@ systemd service --------------- A systemd service is also provided to run the cpupower utility at boot with -settings read from a configuration file. In order to enable this systemd -service, edit '${DESTDIR}/etc/default/cpupower' (uncommenting at least one of -the options, depending on your preferences) and then issue the following +settings read from a configuration file. + +If you want systemd to find the new service after the installation, the service +unit must have been installed in one of the system unit search path directories +(such as '/usr/lib/systemd/system/', which is the default location) and (unless +you are willing to wait for the next reboot) you need to issue the following command: +$ sudo systemctl daemon-reload + +If you want to enable this systemd service, edit +'${DESTDIR}/etc/default/cpupower' (uncommenting at least one of the options, +depending on your preferences) and then issue the following command: + $ sudo systemctl enable --now cpupower.service From 1e8d70bb62d11e51c1132db2c85646b04fe3f6ef Mon Sep 17 00:00:00 2001 From: "Francesco Poli (wintermute)" Date: Tue, 13 May 2025 18:29:33 +0200 Subject: [PATCH 336/353] cpupower: do not install files to /etc/default/ Improve the installation procedure for the systemd service unit 'cpupower.service', to be more distro-agnostic. Do not install the service unit configuration file to /etc/default/ (a directory that is used by Debian and Debian-derivatives and only rarely by other distros). Also, clarify the role of the configuration file in its own comments. Link: https://lore.kernel.org/linux-pm/20250509002206.bd2519ba52035d47c3c32aa6@paranoici.org/T/#ma8a3fa80acc4036af6c754e8ecabacc55b288ad1 Link: https://lore.kernel.org/r/20250513163937.61062-5-invernomuto@paranoici.org Fixes: 9c70b779ad91 ("cpupower: add a systemd service to run cpupower") Signed-off-by: Francesco Poli (wintermute) Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 6 +++--- tools/power/cpupower/README | 6 +++--- .../{cpupower.default => cpupower-service.conf} | 10 +++++++--- tools/power/cpupower/cpupower.service.in | 4 ++-- 4 files changed, 15 insertions(+), 11 deletions(-) rename tools/power/cpupower/{cpupower.default => cpupower-service.conf} (67%) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 7cec2c30f98a97..be8dfac1407635 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -305,8 +305,8 @@ install-tools: $(OUTPUT)cpupower $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} $(INSTALL_SCRIPT) cpupower-completion.sh '$(DESTDIR)${bash_completion_dir}/cpupower' - $(INSTALL) -d $(DESTDIR)${confdir}default - $(INSTALL_DATA) cpupower.default '$(DESTDIR)${confdir}default/cpupower' + $(INSTALL) -d $(DESTDIR)${confdir} + $(INSTALL_DATA) cpupower-service.conf '$(DESTDIR)${confdir}' $(INSTALL) -d $(DESTDIR)${libexecdir} $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' $(INSTALL) -d $(DESTDIR)${libdir}/systemd/system @@ -346,7 +346,7 @@ uninstall: - rm -f $(DESTDIR)${includedir}/cpufreq.h - rm -f $(DESTDIR)${includedir}/cpuidle.h - rm -f $(DESTDIR)${bindir}/utils/cpupower - - rm -f $(DESTDIR)${confdir}default/cpupower + - rm -f $(DESTDIR)${confdir}cpupower-service.conf - rm -f $(DESTDIR)${libexecdir}/cpupower - rm -f $(DESTDIR)${libdir}/systemd/system/cpupower.service - rm -f $(DESTDIR)${mandir}/man1/cpupower.1 diff --git a/tools/power/cpupower/README b/tools/power/cpupower/README index 494104de154055..9de449469568a6 100644 --- a/tools/power/cpupower/README +++ b/tools/power/cpupower/README @@ -195,9 +195,9 @@ command: $ sudo systemctl daemon-reload -If you want to enable this systemd service, edit -'${DESTDIR}/etc/default/cpupower' (uncommenting at least one of the options, -depending on your preferences) and then issue the following command: +If you want to enable this systemd service, edit '/etc/cpupower-service.conf' +(uncommenting at least one of the options, depending on your preferences) +and then issue the following command: $ sudo systemctl enable --now cpupower.service diff --git a/tools/power/cpupower/cpupower.default b/tools/power/cpupower/cpupower-service.conf similarity index 67% rename from tools/power/cpupower/cpupower.default rename to tools/power/cpupower/cpupower-service.conf index 376ca40fe5a6a9..02eabe8e361425 100644 --- a/tools/power/cpupower/cpupower.default +++ b/tools/power/cpupower/cpupower-service.conf @@ -2,7 +2,11 @@ # Copyright (C) 2012, Sébastien Luttringer # Copyright (C) 2024-2025, Francesco Poli -# Default file for linux-cpupower +# Configuration file for cpupower.service systemd service unit +# +# Edit this file (uncommenting at least one of the options, depending on +# your preferences) and then enable cpupower.service, if you want cpupower +# to run at boot with these settings. # --- CPU clock frequency --- @@ -15,14 +19,14 @@ #MIN_FREQ="2.25GHz" #MAX_FREQ="3GHz" -# Specific frequency to be set. +# Set a specific frequency # Requires userspace governor to be available. # If this option is set, all the previous frequency options are ignored #FREQ= # --- CPU policy --- -# Sets a register on supported Intel processore which allows software to convey +# Set a register on supported Intel processore which allows software to convey # its policy for the relative importance of performance versus energy savings to # the processor. See man CPUPOWER-SET(1) for additional details #PERF_BIAS= diff --git a/tools/power/cpupower/cpupower.service.in b/tools/power/cpupower/cpupower.service.in index f91eaed0387208..fbd5b8c1427051 100644 --- a/tools/power/cpupower/cpupower.service.in +++ b/tools/power/cpupower/cpupower.service.in @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later # Copyright (C) 2012-2020, Sébastien Luttringer -# Copyright (C) 2024, Francesco Poli +# Copyright (C) 2024-2025, Francesco Poli [Unit] Description=Apply cpupower configuration @@ -8,7 +8,7 @@ ConditionVirtualization=!container [Service] Type=oneshot -EnvironmentFile=-___CDIR___default/cpupower +EnvironmentFile=-___CDIR___cpupower-service.conf ExecStart=___LDIR___/cpupower RemainAfterExit=yes From a5a12bee35b97c39f9fae8c632f0ee05dbca4874 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Thu, 10 Apr 2025 18:54:54 +0200 Subject: [PATCH 337/353] ACPI: OSI: Stop advertising support for "3.0 _SCP Extensions" As specified in section 5.7.2 of the ACPI specification the feature group string "3.0 _SCP Extensions" implies that the operating system evaluates the _SCP control method with additional parameters. However the ACPI thermal driver evaluates the _SCP control method without those additional parameters, conflicting with the above feature group string advertised to the firmware thru _OSI. Stop advertising support for this feature string to avoid confusing the ACPI firmware. Fixes: e5f660ebef68 ("ACPI / osi: Collect _OSI handling into one single file") Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20250410165456.4173-2-W_Armin@gmx.de Signed-off-by: Rafael J. Wysocki --- drivers/acpi/osi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c index df9328c850bd33..f2c943b934be0a 100644 --- a/drivers/acpi/osi.c +++ b/drivers/acpi/osi.c @@ -42,7 +42,6 @@ static struct acpi_osi_entry osi_setup_entries[OSI_STRING_ENTRIES_MAX] __initdata = { {"Module Device", true}, {"Processor Device", true}, - {"3.0 _SCP Extensions", true}, {"Processor Aggregator Device", true}, }; From bdc70dfbb2b437b87771d78068464be1178b72c2 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Thu, 10 Apr 2025 18:54:55 +0200 Subject: [PATCH 338/353] ACPI: thermal: Execute _SCP before reading trip points As specified in section 11.4.13 of the ACPI specification the operating system is required to evaluate the _ACx and _PSV objects after executing the _SCP control method. Move the execution of the _SCP control method before the invocation of acpi_thermal_get_trip_points() to avoid missing updates to the _ACx and _PSV objects. Fixes: b09872a652d3 ("ACPI: thermal: Fold acpi_thermal_get_info() into its caller") Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20250410165456.4173-3-W_Armin@gmx.de Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 0c874186f8aed4..5c2defe55898f1 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -803,6 +803,12 @@ static int acpi_thermal_add(struct acpi_device *device) acpi_thermal_aml_dependency_fix(tz); + /* + * Set the cooling mode [_SCP] to active cooling. This needs to happen before + * we retrieve the trip point values. + */ + acpi_execute_simple_method(tz->device->handle, "_SCP", ACPI_THERMAL_MODE_ACTIVE); + /* Get trip points [_ACi, _PSV, etc.] (required). */ acpi_thermal_get_trip_points(tz); @@ -814,10 +820,6 @@ static int acpi_thermal_add(struct acpi_device *device) if (result) goto free_memory; - /* Set the cooling mode [_SCP] to active cooling. */ - acpi_execute_simple_method(tz->device->handle, "_SCP", - ACPI_THERMAL_MODE_ACTIVE); - /* Determine the default polling frequency [_TZP]. */ if (tzp) tz->polling_frequency = tzp; From 3937ec4706c1f0ea5e4a6af5e41525ea5eea889a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 17 Apr 2025 09:46:37 +0200 Subject: [PATCH 339/353] thermal/drivers/hisi: Do not enable by default during compile testing Enabling the compile test should not cause automatic enabling of all drivers, but only allow to choose to compile them. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20250417074638.81329-1-krzysztof.kozlowski@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index d3f9686e26e710..510c2b82154699 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -257,7 +257,7 @@ config HISI_THERMAL depends on ARCH_HISI || COMPILE_TEST depends on HAS_IOMEM depends on OF - default y + default ARCH_HISI help Enable this to plug hisilicon's thermal sensor driver into the Linux thermal framework. cpufreq is used as the cooling device to throttle From aae5a3caa55de9e9bf9494227c0e5713ef9fd3b8 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 11 Mar 2025 10:21:22 +0100 Subject: [PATCH 340/353] thermal/drivers/bcm2835: Use %pC instead of %pCn The %pC and %pCn printk format specifiers produce the exact same string. In preparation for removing %pCn, use %pC. Reviewed-by: Geert Uytterhoeven Signed-off-by: Luca Ceresoli Link: https://lore.kernel.org/r/20250311-vsprintf-pcn-v2-1-0af40fc7dee4@bootlin.com Signed-off-by: Daniel Lezcano --- drivers/thermal/broadcom/bcm2835_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c index 7fbba2233c4c12..685a5aee5e0dfb 100644 --- a/drivers/thermal/broadcom/bcm2835_thermal.c +++ b/drivers/thermal/broadcom/bcm2835_thermal.c @@ -192,7 +192,7 @@ static int bcm2835_thermal_probe(struct platform_device *pdev) rate = clk_get_rate(data->clk); if ((rate < 1920000) || (rate > 5000000)) dev_warn(dev, - "Clock %pCn running at %lu Hz is outside of the recommended range: 1.92 to 5MHz\n", + "Clock %pC running at %lu Hz is outside of the recommended range: 1.92 to 5MHz\n", data->clk, rate); /* register of thermal sensor and get info from DT */ From 77fc179db31865382c05bb390847acd97bdb8525 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 11 Mar 2025 10:21:23 +0100 Subject: [PATCH 341/353] vsprintf: remove redundant and unused %pCn format specifier %pC and %pCn print the same string, and commit 900cca294425 ("lib/vsprintf: add %pC{,n,r} format specifiers for clocks") introducing them does not clarify any intended difference. It can be assumed %pC is a default for %pCn as some other specifiers do, but not all are consistent with this policy. Moreover there is now no other suffix other than 'n', which makes a default not really useful. All users in the kernel were using %pC except for one which has been converted. So now remove %pCn and all the unnecessary extra code and documentation. Acked-by: Stephen Boyd Reviewed-by: Geert Uytterhoeven Signed-off-by: Luca Ceresoli Reviewed-by: Petr Mladek Tested-by: Petr Mladek Reviewed-by: Yanteng Si Link: https://lore.kernel.org/r/20250311-vsprintf-pcn-v2-2-0af40fc7dee4@bootlin.com Signed-off-by: Daniel Lezcano --- Documentation/core-api/printk-formats.rst | 3 +-- .../translations/zh_CN/core-api/printk-formats.rst | 3 +-- lib/vsprintf.c | 10 ++-------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 4bdc394e86af4f..c787f72957a402 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -571,9 +571,8 @@ struct clk :: %pC pll1 - %pCn pll1 -For printing struct clk structures. %pC and %pCn print the name of the clock +For printing struct clk structures. %pC prints the name of the clock (Common Clock Framework) or a unique 32-bit ID (legacy clock framework). Passed by reference. diff --git a/Documentation/translations/zh_CN/core-api/printk-formats.rst b/Documentation/translations/zh_CN/core-api/printk-formats.rst index bd36d35eba4eb1..96a917ecc93f2a 100644 --- a/Documentation/translations/zh_CN/core-api/printk-formats.rst +++ b/Documentation/translations/zh_CN/core-api/printk-formats.rst @@ -523,9 +523,8 @@ clk结构体 :: %pC pll1 - %pCn pll1 -用于打印clk结构。%pC 和 %pCn 打印时钟的名称(通用时钟框架)或唯一的32位 +用于打印clk结构。%pC 打印时钟的名称(通用时钟框架)或唯一的32位 ID(传统时钟框架)。 通过引用传递。 diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 01699852f30cdf..e40843cb807ea0 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1981,15 +1981,11 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, if (check_pointer(&buf, end, clk, spec)) return buf; - switch (fmt[1]) { - case 'n': - default: #ifdef CONFIG_COMMON_CLK - return string(buf, end, __clk_get_name(clk), spec); + return string(buf, end, __clk_get_name(clk), spec); #else - return ptr_to_id(buf, end, clk, spec); + return ptr_to_id(buf, end, clk, spec); #endif - } } static @@ -2391,8 +2387,6 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock - * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address - * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long From 617d0d0664785a458ee46aefd89d53dacea21ba6 Mon Sep 17 00:00:00 2001 From: Enrique Isidoro Vazquez Ramos Date: Tue, 25 Mar 2025 13:30:44 -0600 Subject: [PATCH 342/353] thermal/drivers/amlogic: Rename Uptat to uptat to follow kernel coding style The variable Uptat uses CamelCase, which violates the kernel's coding style that mandates snake_case for variable names. This is a purely cosmetic change with no functional impact. Compilation tested with: - checkpatch.pl --strict passed (no new warnings/errors). Signed-off-by: Enrique Isidoro Vazquez Ramos Link: https://lore.kernel.org/r/Z-MEZNMLUmj75uxN@debian.debian Signed-off-by: Daniel Lezcano --- drivers/thermal/amlogic_thermal.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/amlogic_thermal.c b/drivers/thermal/amlogic_thermal.c index 3c5f7dbddf2c1f..5448d772db12ab 100644 --- a/drivers/thermal/amlogic_thermal.c +++ b/drivers/thermal/amlogic_thermal.c @@ -7,10 +7,10 @@ * * Register value to celsius temperature formulas: * Read_Val m * U - * U = ---------, Uptat = --------- + * U = ---------, uptat = --------- * 2^16 1 + n * U * - * Temperature = A * ( Uptat + u_efuse / 2^16 )- B + * Temperature = A * ( uptat + u_efuse / 2^16 )- B * * A B m n : calibration parameters * u_efuse : fused calibration value, it's a signed 16 bits value @@ -112,7 +112,7 @@ static int amlogic_thermal_code_to_millicelsius(struct amlogic_thermal *pdata, const struct amlogic_thermal_soc_calib_data *param = pdata->data->calibration_parameters; int temp; - s64 factor, Uptat, uefuse; + s64 factor, uptat, uefuse; uefuse = pdata->trim_info & TSENSOR_TRIM_SIGN_MASK ? ~(pdata->trim_info & TSENSOR_TRIM_TEMP_MASK) + 1 : @@ -121,12 +121,12 @@ static int amlogic_thermal_code_to_millicelsius(struct amlogic_thermal *pdata, factor = param->n * temp_code; factor = div_s64(factor, 100); - Uptat = temp_code * param->m; - Uptat = div_s64(Uptat, 100); - Uptat = Uptat * BIT(16); - Uptat = div_s64(Uptat, BIT(16) + factor); + uptat = temp_code * param->m; + uptat = div_s64(uptat, 100); + uptat = uptat * BIT(16); + uptat = div_s64(uptat, BIT(16) + factor); - temp = (Uptat + uefuse) * param->A; + temp = (uptat + uefuse) * param->A; temp = div_s64(temp, BIT(16)); temp = (temp - param->B) * 100; From 85c1b47db22fb117e91db5f345241f0f894d4dd1 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 2 Apr 2025 10:38:52 +0200 Subject: [PATCH 343/353] thermal/drivers/mediatek/lvts: Fix debugfs unregister on failure When running the probe function for this driver, the function lvts_debugfs_init() gets called in lvts_domain_init() which, in turn, gets called in lvts_probe() before registering threaded interrupt handlers. Even though it's unlikely, the last call may fail and, if it does, there's nothing removing the already created debugfs folder and files. In order to fix that, instead of calling the lvts debugfs cleanup function upon failure, register a devm action that will take care of calling that upon failure or driver removal. Since devm was used, also delete the call to lvts_debugfs_exit() in the lvts_remove() callback, as now that's done automatically. Fixes: f5f633b18234 ("thermal/drivers/mediatek: Add the Low Voltage Thermal Sensor driver") Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20250402083852.20624-1-angelogioacchino.delregno@collabora.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 088481d91e6e29..c0be4ca55c7b84 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -213,6 +213,13 @@ static const struct debugfs_reg32 lvts_regs[] = { LVTS_DEBUG_FS_REGS(LVTS_CLKEN), }; +static void lvts_debugfs_exit(void *data) +{ + struct lvts_domain *lvts_td = data; + + debugfs_remove_recursive(lvts_td->dom_dentry); +} + static int lvts_debugfs_init(struct device *dev, struct lvts_domain *lvts_td) { struct debugfs_regset32 *regset; @@ -245,12 +252,7 @@ static int lvts_debugfs_init(struct device *dev, struct lvts_domain *lvts_td) debugfs_create_regset32("registers", 0400, dentry, regset); } - return 0; -} - -static void lvts_debugfs_exit(struct lvts_domain *lvts_td) -{ - debugfs_remove_recursive(lvts_td->dom_dentry); + return devm_add_action_or_reset(dev, lvts_debugfs_exit, lvts_td); } #else @@ -1374,8 +1376,6 @@ static void lvts_remove(struct platform_device *pdev) for (i = 0; i < lvts_td->num_lvts_ctrl; i++) lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], false); - - lvts_debugfs_exit(lvts_td); } static const struct lvts_ctrl_data mt7988_lvts_ap_data_ctrl[] = { From 6fcfec04d958a5d6111970ae88996045d945b680 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 5 May 2025 07:24:52 +0200 Subject: [PATCH 344/353] thermal/drivers/mediatek/lvts: Remove unused lvts_debugfs_exit When debugfs is disabled, the function has no reference any more: drivers/thermal/mediatek/lvts_thermal.c:266:13: error: 'lvts_debugfs_exit' defined but not used [-Werror=unused-function] 266 | static void lvts_debugfs_exit(struct lvts_domain *lvts_td) { } | ^~~~~~~~~~~~~~~~~ Fixes: ef280c17a840 ("thermal/drivers/mediatek/lvts: Fix debugfs unregister on failure") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250505052502.1812867-1-arnd@kernel.org Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index c0be4ca55c7b84..985925147ac068 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -263,8 +263,6 @@ static inline int lvts_debugfs_init(struct device *dev, return 0; } -static void lvts_debugfs_exit(struct lvts_domain *lvts_td) { } - #endif static int lvts_raw_to_temp(u32 raw_temp, int temp_factor) From 1e6393244dc15d57b6550f669b0dbe291e0c71e6 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 11 May 2025 20:49:54 +0200 Subject: [PATCH 345/353] dt-bindings: thermal: Add support for Airoha EN7581 thermal sensor Add support for Airoha EN7581 thermal sensor and monitor. This is a simple sensor for the CPU or SoC Package that provide thermal sensor and trip point for hot low and critical condition to fire interrupt and react on the abnormal state. Signed-off-by: Christian Marangi Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250511185003.3754495-1-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- .../thermal/airoha,en7581-thermal.yaml | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 Documentation/devicetree/bindings/thermal/airoha,en7581-thermal.yaml diff --git a/Documentation/devicetree/bindings/thermal/airoha,en7581-thermal.yaml b/Documentation/devicetree/bindings/thermal/airoha,en7581-thermal.yaml new file mode 100644 index 00000000000000..ca0242ef037880 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/airoha,en7581-thermal.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/airoha,en7581-thermal.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Airoha EN7581 Thermal Sensor and Monitor + +maintainers: + - Christian Marangi + +properties: + compatible: + const: airoha,en7581-thermal + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + airoha,chip-scu: + description: phandle to the chip SCU syscon + $ref: /schemas/types.yaml#/definitions/phandle + + '#thermal-sensor-cells': + const: 0 + +required: + - compatible + - reg + - interrupts + - airoha,chip-scu + +additionalProperties: false + +examples: + - | + #include + + thermal-sensor@1efbd800 { + compatible = "airoha,en7581-thermal"; + reg = <0x1efbd000 0xd5c>; + interrupts = ; + airoha,chip-scu = <&chip_scu>; + + #thermal-sensor-cells = <0>; + }; From d79ea9cf65a049775afeef7863be3a14867f16db Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 11 May 2025 20:49:55 +0200 Subject: [PATCH 346/353] thermal/drivers: Add support for Airoha EN7581 thermal sensor Add support for Airoha EN7581 thermal sensor. This provide support for reading the CPU or SoC Package sensor and to setup trip points for hot and critical condition. An interrupt is fired to react on this and doesn't require passive poll to read the temperature. The thermal regs provide a way to read the ADC value from an external register placed in the Chip SCU regs. Monitor will read this value and fire an interrupt if the trip condition configured is reached. The Thermal Trip and Interrupt logic is conceptually similar to Mediatek LVTS Thermal but differ in register mapping and actual function/bug workaround. The implementation only share some register names but from functionality observation it's very different and used only for the basic function of periodically poll the temp and trip the interrupt. Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20250511185003.3754495-2-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/Kconfig | 9 + drivers/thermal/Makefile | 1 + drivers/thermal/airoha_thermal.c | 489 +++++++++++++++++++++++++++++++ 3 files changed, 499 insertions(+) create mode 100644 drivers/thermal/airoha_thermal.c diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 510c2b82154699..a09c188b9ad113 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -327,6 +327,15 @@ config QORIQ_THERMAL cpufreq is used as the cooling device to throttle CPUs when the passive trip is crossed. +config AIROHA_THERMAL + tristate "Airoha thermal sensor driver" + depends on ARCH_AIROHA || COMPILE_TEST + depends on MFD_SYSCON + depends on OF + help + Enable this to plug the Airoha thermal sensor driver into the Linux + thermal framework. + config SPEAR_THERMAL tristate "SPEAr thermal sensor driver" depends on PLAT_SPEAR || COMPILE_TEST diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 9abf43a74f2bbd..d7718978db245f 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_K3_THERMAL) += k3_bandgap.o k3_j72xx_bandgap.o # platform thermal drivers obj-y += broadcom/ obj-$(CONFIG_THERMAL_MMIO) += thermal_mmio.o +obj-$(CONFIG_AIROHA_THERMAL) += airoha_thermal.o obj-$(CONFIG_SPEAR_THERMAL) += spear_thermal.o obj-$(CONFIG_SUN8I_THERMAL) += sun8i_thermal.o obj-$(CONFIG_ROCKCHIP_THERMAL) += rockchip_thermal.o diff --git a/drivers/thermal/airoha_thermal.c b/drivers/thermal/airoha_thermal.c new file mode 100644 index 00000000000000..45116cdaee653b --- /dev/null +++ b/drivers/thermal/airoha_thermal.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* SCU regs */ +#define EN7581_PLLRG_PROTECT 0x268 +#define EN7581_PWD_TADC 0x2ec +#define EN7581_MUX_TADC GENMASK(3, 1) +#define EN7581_DOUT_TADC 0x2f8 +#define EN7581_DOUT_TADC_MASK GENMASK(15, 0) + +/* PTP_THERMAL regs */ +#define EN7581_TEMPMONCTL0 0x800 +#define EN7581_SENSE3_EN BIT(3) +#define EN7581_SENSE2_EN BIT(2) +#define EN7581_SENSE1_EN BIT(1) +#define EN7581_SENSE0_EN BIT(0) +#define EN7581_TEMPMONCTL1 0x804 +/* period unit calculated in BUS clock * 256 scaling-up */ +#define EN7581_PERIOD_UNIT GENMASK(9, 0) +#define EN7581_TEMPMONCTL2 0x808 +#define EN7581_FILT_INTERVAL GENMASK(25, 16) +#define EN7581_SEN_INTERVAL GENMASK(9, 0) +#define EN7581_TEMPMONINT 0x80C +#define EN7581_STAGE3_INT_EN BIT(31) +#define EN7581_STAGE2_INT_EN BIT(30) +#define EN7581_STAGE1_INT_EN BIT(29) +#define EN7581_FILTER_INT_EN_3 BIT(28) +#define EN7581_IMMD_INT_EN3 BIT(27) +#define EN7581_NOHOTINTEN3 BIT(26) +#define EN7581_HOFSINTEN3 BIT(25) +#define EN7581_LOFSINTEN3 BIT(24) +#define EN7581_HINTEN3 BIT(23) +#define EN7581_CINTEN3 BIT(22) +#define EN7581_FILTER_INT_EN_2 BIT(21) +#define EN7581_FILTER_INT_EN_1 BIT(20) +#define EN7581_FILTER_INT_EN_0 BIT(19) +#define EN7581_IMMD_INT_EN2 BIT(18) +#define EN7581_IMMD_INT_EN1 BIT(17) +#define EN7581_IMMD_INT_EN0 BIT(16) +#define EN7581_TIME_OUT_INT_EN BIT(15) +#define EN7581_NOHOTINTEN2 BIT(14) +#define EN7581_HOFSINTEN2 BIT(13) +#define EN7581_LOFSINTEN2 BIT(12) +#define EN7581_HINTEN2 BIT(11) +#define EN7581_CINTEN2 BIT(10) +#define EN7581_NOHOTINTEN1 BIT(9) +#define EN7581_HOFSINTEN1 BIT(8) +#define EN7581_LOFSINTEN1 BIT(7) +#define EN7581_HINTEN1 BIT(6) +#define EN7581_CINTEN1 BIT(5) +#define EN7581_NOHOTINTEN0 BIT(4) +/* Similar to COLD and HOT also these seems to be swapped in documentation */ +#define EN7581_LOFSINTEN0 BIT(3) /* In documentation: BIT(2) */ +#define EN7581_HOFSINTEN0 BIT(2) /* In documentation: BIT(3) */ +/* It seems documentation have these swapped as the HW + * - Fire BIT(1) when lower than EN7581_COLD_THRE + * - Fire BIT(0) and BIT(5) when higher than EN7581_HOT2NORMAL_THRE or + * EN7581_HOT_THRE + */ +#define EN7581_CINTEN0 BIT(1) /* In documentation: BIT(0) */ +#define EN7581_HINTEN0 BIT(0) /* In documentation: BIT(1) */ +#define EN7581_TEMPMONINTSTS 0x810 +#define EN7581_STAGE3_INT_STAT BIT(31) +#define EN7581_STAGE2_INT_STAT BIT(30) +#define EN7581_STAGE1_INT_STAT BIT(29) +#define EN7581_FILTER_INT_STAT_3 BIT(28) +#define EN7581_IMMD_INT_STS3 BIT(27) +#define EN7581_NOHOTINTSTS3 BIT(26) +#define EN7581_HOFSINTSTS3 BIT(25) +#define EN7581_LOFSINTSTS3 BIT(24) +#define EN7581_HINTSTS3 BIT(23) +#define EN7581_CINTSTS3 BIT(22) +#define EN7581_FILTER_INT_STAT_2 BIT(21) +#define EN7581_FILTER_INT_STAT_1 BIT(20) +#define EN7581_FILTER_INT_STAT_0 BIT(19) +#define EN7581_IMMD_INT_STS2 BIT(18) +#define EN7581_IMMD_INT_STS1 BIT(17) +#define EN7581_IMMD_INT_STS0 BIT(16) +#define EN7581_TIME_OUT_INT_STAT BIT(15) +#define EN7581_NOHOTINTSTS2 BIT(14) +#define EN7581_HOFSINTSTS2 BIT(13) +#define EN7581_LOFSINTSTS2 BIT(12) +#define EN7581_HINTSTS2 BIT(11) +#define EN7581_CINTSTS2 BIT(10) +#define EN7581_NOHOTINTSTS1 BIT(9) +#define EN7581_HOFSINTSTS1 BIT(8) +#define EN7581_LOFSINTSTS1 BIT(7) +#define EN7581_HINTSTS1 BIT(6) +#define EN7581_CINTSTS1 BIT(5) +#define EN7581_NOHOTINTSTS0 BIT(4) +/* Similar to COLD and HOT also these seems to be swapped in documentation */ +#define EN7581_LOFSINTSTS0 BIT(3) /* In documentation: BIT(2) */ +#define EN7581_HOFSINTSTS0 BIT(2) /* In documentation: BIT(3) */ +/* It seems documentation have these swapped as the HW + * - Fire BIT(1) when lower than EN7581_COLD_THRE + * - Fire BIT(0) and BIT(5) when higher than EN7581_HOT2NORMAL_THRE or + * EN7581_HOT_THRE + * + * To clear things, we swap the define but we keep them documented here. + */ +#define EN7581_CINTSTS0 BIT(1) /* In documentation: BIT(0) */ +#define EN7581_HINTSTS0 BIT(0) /* In documentation: BIT(1)*/ +/* Monitor will take the bigger threshold between HOT2NORMAL and HOT + * and will fire both HOT2NORMAL and HOT interrupt when higher than the 2 + * + * It has also been observed that not setting HOT2NORMAL makes the monitor + * treat COLD threshold as HOT2NORMAL. + */ +#define EN7581_TEMPH2NTHRE 0x824 +/* It seems HOT2NORMAL is actually NORMAL2HOT */ +#define EN7581_HOT2NORMAL_THRE GENMASK(11, 0) +#define EN7581_TEMPHTHRE 0x828 +#define EN7581_HOT_THRE GENMASK(11, 0) +/* Monitor will use this as HOT2NORMAL (fire interrupt when lower than...)*/ +#define EN7581_TEMPCTHRE 0x82c +#define EN7581_COLD_THRE GENMASK(11, 0) +/* Also LOW and HIGH offset register are swapped */ +#define EN7581_TEMPOFFSETL 0x830 /* In documentation: 0x834 */ +#define EN7581_LOW_OFFSET GENMASK(11, 0) +#define EN7581_TEMPOFFSETH 0x834 /* In documentation: 0x830 */ +#define EN7581_HIGH_OFFSET GENMASK(11, 0) +#define EN7581_TEMPMSRCTL0 0x838 +#define EN7581_MSRCTL3 GENMASK(11, 9) +#define EN7581_MSRCTL2 GENMASK(8, 6) +#define EN7581_MSRCTL1 GENMASK(5, 3) +#define EN7581_MSRCTL0 GENMASK(2, 0) +#define EN7581_TEMPADCVALIDADDR 0x878 +#define EN7581_ADC_VALID_ADDR GENMASK(31, 0) +#define EN7581_TEMPADCVOLTADDR 0x87c +#define EN7581_ADC_VOLT_ADDR GENMASK(31, 0) +#define EN7581_TEMPRDCTRL 0x880 +/* + * NOTICE: AHB have this set to 0 by default. Means that + * the same addr is used for ADC volt and valid reading. + * In such case, VALID ADDR is used and volt addr is ignored. + */ +#define EN7581_RD_CTRL_DIFF BIT(0) +#define EN7581_TEMPADCVALIDMASK 0x884 +#define EN7581_ADV_RD_VALID_POLARITY BIT(5) +#define EN7581_ADV_RD_VALID_POS GENMASK(4, 0) +#define EN7581_TEMPADCVOLTAGESHIFT 0x888 +#define EN7581_ADC_VOLTAGE_SHIFT GENMASK(4, 0) +/* + * Same values for each CTL. + * Can operate in: + * - 1 sample + * - 2 sample and make average of them + * - 4,6,10,16 sample, drop max and min and make avgerage of them + */ +#define EN7581_MSRCTL_1SAMPLE 0x0 +#define EN7581_MSRCTL_AVG2SAMPLE 0x1 +#define EN7581_MSRCTL_4SAMPLE_MAX_MIX_AVG2 0x2 +#define EN7581_MSRCTL_6SAMPLE_MAX_MIX_AVG4 0x3 +#define EN7581_MSRCTL_10SAMPLE_MAX_MIX_AVG8 0x4 +#define EN7581_MSRCTL_18SAMPLE_MAX_MIX_AVG16 0x5 +#define EN7581_TEMPAHBPOLL 0x840 +#define EN7581_ADC_POLL_INTVL GENMASK(31, 0) +/* PTPSPARE0,2 reg are used to store efuse info for calibrated temp offset */ +#define EN7581_EFUSE_TEMP_OFFSET_REG 0xf20 /* PTPSPARE0 */ +#define EN7581_EFUSE_TEMP_OFFSET GENMASK(31, 16) +#define EN7581_PTPSPARE1 0xf24 /* PTPSPARE1 */ +#define EN7581_EFUSE_TEMP_CPU_SENSOR_REG 0xf28 /* PTPSPARE2 */ + +#define EN7581_SLOPE_X100_DIO_DEFAULT 5645 +#define EN7581_SLOPE_X100_DIO_AVS 5645 + +#define EN7581_INIT_TEMP_CPK_X10 300 +#define EN7581_INIT_TEMP_FTK_X10 620 +#define EN7581_INIT_TEMP_NONK_X10 550 + +#define EN7581_SCU_THERMAL_PROTECT_KEY 0x12 +#define EN7581_SCU_THERMAL_MUX_DIODE1 0x7 + +/* Convert temp to raw value as read from ADC ((((temp / 100) - init) * slope) / 1000) + offset */ +#define TEMP_TO_RAW(priv, temp) ((((((temp) / 100) - (priv)->init_temp) * \ + (priv)->default_slope) / 1000) + \ + (priv)->default_offset) + +/* Convert raw to temp ((((temp - offset) * 1000) / slope + init) * 100) */ +#define RAW_TO_TEMP(priv, raw) (((((raw) - (priv)->default_offset) * 1000) / \ + (priv)->default_slope + \ + (priv)->init_temp) * 100) + +#define AIROHA_MAX_SAMPLES 6 + +struct airoha_thermal_priv { + void __iomem *base; + struct regmap *chip_scu; + struct resource scu_adc_res; + + struct thermal_zone_device *tz; + int init_temp; + int default_slope; + int default_offset; +}; + +static int airoha_get_thermal_ADC(struct airoha_thermal_priv *priv) +{ + u32 val; + + regmap_read(priv->chip_scu, EN7581_DOUT_TADC, &val); + return FIELD_GET(EN7581_DOUT_TADC_MASK, val); +} + +static void airoha_init_thermal_ADC_mode(struct airoha_thermal_priv *priv) +{ + u32 adc_mux, pllrg; + + /* Save PLLRG current value */ + regmap_read(priv->chip_scu, EN7581_PLLRG_PROTECT, &pllrg); + + /* Give access to thermal regs */ + regmap_write(priv->chip_scu, EN7581_PLLRG_PROTECT, EN7581_SCU_THERMAL_PROTECT_KEY); + adc_mux = FIELD_PREP(EN7581_MUX_TADC, EN7581_SCU_THERMAL_MUX_DIODE1); + regmap_write(priv->chip_scu, EN7581_PWD_TADC, adc_mux); + + /* Restore PLLRG value on exit */ + regmap_write(priv->chip_scu, EN7581_PLLRG_PROTECT, pllrg); +} + +static int airoha_thermal_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct airoha_thermal_priv *priv = thermal_zone_device_priv(tz); + int min_value, max_value, avg_value, value; + int i; + + avg_value = 0; + min_value = INT_MAX; + max_value = INT_MIN; + + for (i = 0; i < AIROHA_MAX_SAMPLES; i++) { + value = airoha_get_thermal_ADC(priv); + min_value = min(value, min_value); + max_value = max(value, max_value); + avg_value += value; + } + + /* Drop min and max and average for the remaining sample */ + avg_value -= (min_value + max_value); + avg_value /= AIROHA_MAX_SAMPLES - 2; + + *temp = RAW_TO_TEMP(priv, avg_value); + return 0; +} + +static int airoha_thermal_set_trips(struct thermal_zone_device *tz, int low, + int high) +{ + struct airoha_thermal_priv *priv = thermal_zone_device_priv(tz); + bool enable_monitor = false; + + if (high != INT_MAX) { + /* Validate high and clamp it a supported value */ + high = clamp_t(int, high, RAW_TO_TEMP(priv, 0), + RAW_TO_TEMP(priv, FIELD_MAX(EN7581_DOUT_TADC_MASK))); + + /* We offset the high temp of 1°C to trigger correct event */ + writel(TEMP_TO_RAW(priv, high) >> 4, + priv->base + EN7581_TEMPOFFSETH); + + enable_monitor = true; + } + + if (low != -INT_MAX) { + /* Validate low and clamp it to a supported value */ + low = clamp_t(int, high, RAW_TO_TEMP(priv, 0), + RAW_TO_TEMP(priv, FIELD_MAX(EN7581_DOUT_TADC_MASK))); + + /* We offset the low temp of 1°C to trigger correct event */ + writel(TEMP_TO_RAW(priv, low) >> 4, + priv->base + EN7581_TEMPOFFSETL); + + enable_monitor = true; + } + + /* Enable sensor 0 monitor after trip are set */ + if (enable_monitor) + writel(EN7581_SENSE0_EN, priv->base + EN7581_TEMPMONCTL0); + + return 0; +} + +static const struct thermal_zone_device_ops thdev_ops = { + .get_temp = airoha_thermal_get_temp, + .set_trips = airoha_thermal_set_trips, +}; + +static irqreturn_t airoha_thermal_irq(int irq, void *data) +{ + struct airoha_thermal_priv *priv = data; + enum thermal_notify_event event; + bool update = false; + u32 status; + + status = readl(priv->base + EN7581_TEMPMONINTSTS); + switch (status & (EN7581_HOFSINTSTS0 | EN7581_LOFSINTSTS0)) { + case EN7581_HOFSINTSTS0: + event = THERMAL_TRIP_VIOLATED; + update = true; + break; + case EN7581_LOFSINTSTS0: + event = THERMAL_EVENT_UNSPECIFIED; + update = true; + break; + default: + /* Should be impossible as we enable only these Interrupt */ + break; + } + + /* Reset Interrupt */ + writel(status, priv->base + EN7581_TEMPMONINTSTS); + + if (update) + thermal_zone_device_update(priv->tz, event); + + return IRQ_HANDLED; +} + +static void airoha_thermal_setup_adc_val(struct device *dev, + struct airoha_thermal_priv *priv) +{ + u32 efuse_calib_info, cpu_sensor; + + /* Setup thermal sensor to ADC mode and setup the mux to DIODE1 */ + airoha_init_thermal_ADC_mode(priv); + /* sleep 10 ms for ADC to enable */ + usleep_range(10 * USEC_PER_MSEC, 11 * USEC_PER_MSEC); + + efuse_calib_info = readl(priv->base + EN7581_EFUSE_TEMP_OFFSET_REG); + if (efuse_calib_info) { + priv->default_offset = FIELD_GET(EN7581_EFUSE_TEMP_OFFSET, efuse_calib_info); + /* Different slope are applied if the sensor is used for CPU or for package */ + cpu_sensor = readl(priv->base + EN7581_EFUSE_TEMP_CPU_SENSOR_REG); + if (cpu_sensor) { + priv->default_slope = EN7581_SLOPE_X100_DIO_DEFAULT; + priv->init_temp = EN7581_INIT_TEMP_FTK_X10; + } else { + priv->default_slope = EN7581_SLOPE_X100_DIO_AVS; + priv->init_temp = EN7581_INIT_TEMP_CPK_X10; + } + } else { + priv->default_offset = airoha_get_thermal_ADC(priv); + priv->default_slope = EN7581_SLOPE_X100_DIO_DEFAULT; + priv->init_temp = EN7581_INIT_TEMP_NONK_X10; + dev_info(dev, "missing thermal calibrarion EFUSE, using non calibrated value\n"); + } +} + +static void airoha_thermal_setup_monitor(struct airoha_thermal_priv *priv) +{ + /* Set measure mode */ + writel(FIELD_PREP(EN7581_MSRCTL0, EN7581_MSRCTL_6SAMPLE_MAX_MIX_AVG4), + priv->base + EN7581_TEMPMSRCTL0); + + /* + * Configure ADC valid reading addr + * The AHB temp monitor system doesn't have direct access to the + * thermal sensor. It does instead work by providing all kind of + * address to configure how to access and setup an ADC for the + * sensor. EN7581 supports only one sensor hence the + * implementation is greatly simplified but the AHB supports + * up to 4 different sensor from the same ADC that can be + * switched by tuning the ADC mux or wiriting address. + * + * We set valid instead of volt as we don't enable valid/volt + * split reading and AHB read valid addr in such case. + */ + writel(priv->scu_adc_res.start + EN7581_DOUT_TADC, + priv->base + EN7581_TEMPADCVALIDADDR); + + /* + * Configure valid bit on a fake value of bit 16. The ADC outputs + * max of 2 bytes for voltage. + */ + writel(FIELD_PREP(EN7581_ADV_RD_VALID_POS, 16), + priv->base + EN7581_TEMPADCVALIDMASK); + + /* + * AHB supports max 12 bytes for ADC voltage. Shift the read + * value 4 bit to the right. Precision lost by this is minimal + * in the order of half a °C and is acceptable in the context + * of triggering interrupt in critical condition. + */ + writel(FIELD_PREP(EN7581_ADC_VOLTAGE_SHIFT, 4), + priv->base + EN7581_TEMPADCVOLTAGESHIFT); + + /* BUS clock is 300MHz counting unit is 3 * 68.64 * 256 = 52.715us */ + writel(FIELD_PREP(EN7581_PERIOD_UNIT, 3), + priv->base + EN7581_TEMPMONCTL1); + + /* + * filt interval is 1 * 52.715us = 52.715us, + * sen interval is 379 * 52.715us = 19.97ms + */ + writel(FIELD_PREP(EN7581_FILT_INTERVAL, 1) | + FIELD_PREP(EN7581_FILT_INTERVAL, 379), + priv->base + EN7581_TEMPMONCTL2); + + /* AHB poll is set to 146 * 68.64 = 10.02us */ + writel(FIELD_PREP(EN7581_ADC_POLL_INTVL, 146), + priv->base + EN7581_TEMPAHBPOLL); +} + +static int airoha_thermal_probe(struct platform_device *pdev) +{ + struct airoha_thermal_priv *priv; + struct device_node *chip_scu_np; + struct device *dev = &pdev->dev; + int irq, ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + chip_scu_np = of_parse_phandle(dev->of_node, "airoha,chip-scu", 0); + if (!chip_scu_np) + return -EINVAL; + + priv->chip_scu = syscon_node_to_regmap(chip_scu_np); + if (IS_ERR(priv->chip_scu)) + return PTR_ERR(priv->chip_scu); + + of_address_to_resource(chip_scu_np, 0, &priv->scu_adc_res); + of_node_put(chip_scu_np); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + airoha_thermal_irq, IRQF_ONESHOT, + pdev->name, priv); + if (ret) { + dev_err(dev, "Can't get interrupt working.\n"); + return ret; + } + + airoha_thermal_setup_monitor(priv); + airoha_thermal_setup_adc_val(dev, priv); + + /* register of thermal sensor and get info from DT */ + priv->tz = devm_thermal_of_zone_register(dev, 0, priv, &thdev_ops); + if (IS_ERR(priv->tz)) { + dev_err(dev, "register thermal zone sensor failed\n"); + return PTR_ERR(priv->tz); + } + + platform_set_drvdata(pdev, priv); + + /* Enable LOW and HIGH interrupt */ + writel(EN7581_HOFSINTEN0 | EN7581_LOFSINTEN0, + priv->base + EN7581_TEMPMONINT); + + return 0; +} + +static const struct of_device_id airoha_thermal_match[] = { + { .compatible = "airoha,en7581-thermal" }, + {}, +}; +MODULE_DEVICE_TABLE(of, airoha_thermal_match); + +static struct platform_driver airoha_thermal_driver = { + .driver = { + .name = "airoha-thermal", + .of_match_table = airoha_thermal_match, + }, + .probe = airoha_thermal_probe, +}; + +module_platform_driver(airoha_thermal_driver); + +MODULE_AUTHOR("Christian Marangi "); +MODULE_DESCRIPTION("Airoha thermal driver"); +MODULE_LICENSE("GPL"); From a204fad8bf6c34ea75984cce14b322f93fd1e197 Mon Sep 17 00:00:00 2001 From: Sricharan Ramabadhran Date: Fri, 28 Feb 2025 09:11:35 +0400 Subject: [PATCH 347/353] dt-bindings: thermal: qcom-tsens: Add ipq5018 compatible IPQ5018 has tsens v1.0 block with 5 sensors of which 4 are in use and 1 interrupt. Acked-by: Rob Herring Signed-off-by: Sricharan Ramabadhran Signed-off-by: George Moussalem Link: https://lore.kernel.org/r/DS7PR19MB88835BD1063C4D5451E14B0E9DCC2@DS7PR19MB8883.namprd19.prod.outlook.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index f9d8012c8cf513..0e653bbe988495 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -39,6 +39,7 @@ properties: - description: v1 of TSENS items: - enum: + - qcom,ipq5018-tsens - qcom,msm8937-tsens - qcom,msm8956-tsens - qcom,msm8976-tsens @@ -251,6 +252,7 @@ allOf: compatible: contains: enum: + - qcom,ipq5018-tsens - qcom,ipq8064-tsens - qcom,msm8960-tsens - qcom,tsens-v0_1 From ec23451f23b59066e3828f98401fdaf00d6349df Mon Sep 17 00:00:00 2001 From: George Moussalem Date: Fri, 28 Feb 2025 09:11:36 +0400 Subject: [PATCH 348/353] thermal/drivers/qcom/tsens: Update conditions to strictly evaluate for IP v2+ TSENS v2.0+ leverage features not available to prior versions such as updated interrupts init routine, masked interrupts, and watchdog. Currently, the checks in place evaluate whether the IP version is greater than v1 which invalidates when updates to v1 or v1 minor versions are implemented. As such, update the conditional statements to strictly evaluate whether the version is greater than or equal to v2 (inclusive). Signed-off-by: George Moussalem Reviewed-by: Dmitry Baryshkov Reviewed-by: Amit Kucheria Link: https://lore.kernel.org/r/DS7PR19MB8883434CAA053648E22AA8AC9DCC2@DS7PR19MB8883.namprd19.prod.outlook.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 1f5d4de017d9ae..43b388bcc7d6bb 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -447,7 +447,7 @@ static void tsens_set_interrupt(struct tsens_priv *priv, u32 hw_id, dev_dbg(priv->dev, "[%u] %s: %s -> %s\n", hw_id, __func__, irq_type ? ((irq_type == 1) ? "UP" : "CRITICAL") : "LOW", enable ? "en" : "dis"); - if (tsens_version(priv) > VER_1_X) + if (tsens_version(priv) >= VER_2_X) tsens_set_interrupt_v2(priv, hw_id, irq_type, enable); else tsens_set_interrupt_v1(priv, hw_id, irq_type, enable); @@ -499,7 +499,7 @@ static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id, ret = regmap_field_read(priv->rf[LOW_INT_CLEAR_0 + hw_id], &d->low_irq_clear); if (ret) return ret; - if (tsens_version(priv) > VER_1_X) { + if (tsens_version(priv) >= VER_2_X) { ret = regmap_field_read(priv->rf[UP_INT_MASK_0 + hw_id], &d->up_irq_mask); if (ret) return ret; @@ -543,7 +543,7 @@ static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id, static inline u32 masked_irq(u32 hw_id, u32 mask, enum tsens_ver ver) { - if (ver > VER_1_X) + if (ver >= VER_2_X) return mask & (1 << hw_id); /* v1, v0.1 don't have a irq mask register */ @@ -733,7 +733,7 @@ static int tsens_set_trips(struct thermal_zone_device *tz, int low, int high) static int tsens_enable_irq(struct tsens_priv *priv) { int ret; - int val = tsens_version(priv) > VER_1_X ? 7 : 1; + int val = tsens_version(priv) >= VER_2_X ? 7 : 1; ret = regmap_field_write(priv->rf[INT_EN], val); if (ret < 0) @@ -1040,7 +1040,7 @@ int __init init_common(struct tsens_priv *priv) } } - if (tsens_version(priv) > VER_1_X && ver_minor > 2) { + if (tsens_version(priv) >= VER_2_X && ver_minor > 2) { /* Watchdog is present only on v2.3+ */ priv->feat->has_watchdog = 1; for (i = WDOG_BARK_STATUS; i <= CC_MON_MASK; i++) { From 05c1ba5941864421e8b35113610fbc44361eb81a Mon Sep 17 00:00:00 2001 From: George Moussalem Date: Fri, 28 Feb 2025 09:11:37 +0400 Subject: [PATCH 349/353] thermal/drivers/qcom/tsens: Add support for tsens v1 without RPM Adding generic support for SoCs with tsens v1.0 IP with no RPM. Due to lack of RPM, tsens has to be reset and enabled in the driver init. SoCs can have support for more sensors than those which will actually be enabled. As such, init will only enable those explicitly added to the hw_ids array. Co-developed-by: Sricharan Ramabadhran Signed-off-by: Sricharan Ramabadhran Signed-off-by: George Moussalem Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/DS7PR19MB8883C5D7974C7735E23923769DCC2@DS7PR19MB8883.namprd19.prod.outlook.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens-v1.c | 48 +++++++++++++++++++++++++++++++++ drivers/thermal/qcom/tsens.c | 14 +++++++--- drivers/thermal/qcom/tsens.h | 1 + 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index 1a7874676f68e4..877b27274fd214 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -79,6 +79,17 @@ static struct tsens_features tsens_v1_feat = { .trip_max_temp = 120000, }; +static struct tsens_features tsens_v1_no_rpm_feat = { + .ver_major = VER_1_X_NO_RPM, + .crit_int = 0, + .combo_int = 0, + .adc = 1, + .srot_split = 1, + .max_sensors = 11, + .trip_min_temp = -40000, + .trip_max_temp = 120000, +}; + static const struct reg_field tsens_v1_regfields[MAX_REGFIELDS] = { /* ----- SROT ------ */ /* VERSION */ @@ -150,6 +161,43 @@ static int __init init_8956(struct tsens_priv *priv) { return init_common(priv); } +static int __init init_tsens_v1_no_rpm(struct tsens_priv *priv) +{ + int i, ret; + u32 mask = 0; + + ret = init_common(priv); + if (ret < 0) { + dev_err(priv->dev, "Init common failed %d\n", ret); + return ret; + } + + ret = regmap_field_write(priv->rf[TSENS_SW_RST], 1); + if (ret) { + dev_err(priv->dev, "Reset failed\n"); + return ret; + } + + for (i = 0; i < priv->num_sensors; i++) + mask |= BIT(priv->sensor[i].hw_id); + + ret = regmap_field_update_bits(priv->rf[SENSOR_EN], mask, mask); + if (ret) { + dev_err(priv->dev, "Sensor Enable failed\n"); + return ret; + } + + ret = regmap_field_write(priv->rf[TSENS_EN], 1); + if (ret) { + dev_err(priv->dev, "Enable failed\n"); + return ret; + } + + ret = regmap_field_write(priv->rf[TSENS_SW_RST], 0); + + return ret; +} + static const struct tsens_ops ops_generic_v1 = { .init = init_common, .calibrate = calibrate_v1, diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 43b388bcc7d6bb..2cda92f3d12574 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -975,10 +975,16 @@ int __init init_common(struct tsens_priv *priv) ret = regmap_field_read(priv->rf[TSENS_EN], &enabled); if (ret) goto err_put_device; - if (!enabled && (tsens_version(priv) != VER_2_X_NO_RPM)) { - dev_err(dev, "%s: device not enabled\n", __func__); - ret = -ENODEV; - goto err_put_device; + if (!enabled) { + switch (tsens_version(priv)) { + case VER_1_X_NO_RPM: + case VER_2_X_NO_RPM: + break; + default: + dev_err(dev, "%s: device not enabled\n", __func__); + ret = -ENODEV; + goto err_put_device; + } } priv->rf[SENSOR_EN] = devm_regmap_field_alloc(dev, priv->srot_map, diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index 336bc868fd7c31..e3cb611426c425 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -34,6 +34,7 @@ enum tsens_ver { VER_0 = 0, VER_0_1, VER_1_X, + VER_1_X_NO_RPM, VER_2_X, VER_2_X_NO_RPM, }; From 3325ac02965ce3aa4b5c472377f0c12b399b253f Mon Sep 17 00:00:00 2001 From: Sricharan Ramabadhran Date: Fri, 28 Feb 2025 09:11:38 +0400 Subject: [PATCH 350/353] thermal/drivers/qcom/tsens: Add support for IPQ5018 tsens IPQ5018 has tsens IP V1.0, 5 sensors of which 4 are in use and 1 interrupt. The IP does not have a RPM, hence use init routine for tsens v1.0 without RPM which does not early enable. Reviewed-by: Dmitry Baryshkov Signed-off-by: Sricharan Ramabadhran Signed-off-by: George Moussalem Link: https://lore.kernel.org/r/DS7PR19MB8883BD0E36C08DD1D03CE1CB9DCC2@DS7PR19MB8883.namprd19.prod.outlook.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens-v1.c | 14 ++++++++++++++ drivers/thermal/qcom/tsens.c | 3 +++ drivers/thermal/qcom/tsens.h | 3 +++ 3 files changed, 20 insertions(+) diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index 877b27274fd214..27360e70d62a94 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -242,3 +242,17 @@ struct tsens_plat_data data_8976 = { .feat = &tsens_v1_feat, .fields = tsens_v1_regfields, }; + +const struct tsens_ops ops_ipq5018 = { + .init = init_tsens_v1_no_rpm, + .calibrate = tsens_calibrate_common, + .get_temp = get_temp_tsens_valid, +}; + +const struct tsens_plat_data data_ipq5018 = { + .num_sensors = 5, + .ops = &ops_ipq5018, + .hw_ids = (unsigned int []){0, 1, 2, 3, 4}, + .feat = &tsens_v1_no_rpm_feat, + .fields = tsens_v1_regfields, +}; diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 2cda92f3d12574..a2422ebee81691 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -1108,6 +1108,9 @@ static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume); static const struct of_device_id tsens_table[] = { { + .compatible = "qcom,ipq5018-tsens", + .data = &data_ipq5018, + }, { .compatible = "qcom,ipq5332-tsens", .data = &data_ipq5332, }, { diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index e3cb611426c425..2a7afa4c899b9e 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -652,6 +652,9 @@ extern struct tsens_plat_data data_8226, data_8909, data_8916, data_8939, data_8 /* TSENS v1 targets */ extern struct tsens_plat_data data_tsens_v1, data_8937, data_8976, data_8956; +/* TSENS v1 with no RPM targets */ +extern const struct tsens_plat_data data_ipq5018; + /* TSENS v2 targets */ extern struct tsens_plat_data data_8996, data_ipq8074, data_tsens_v2; extern const struct tsens_plat_data data_ipq5332, data_ipq5424; From f59fb377d7675092f4fb2d242e40eb37b0eb65c9 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 14 May 2025 23:39:12 +0200 Subject: [PATCH 351/353] thermal/drivers/airoha: Fix spelling mistake Fix various spelling mistake in airoha_thermal_setup_monitor() and define. Reported-by: Alok Tiwari Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20250514213919.2321490-1-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/airoha_thermal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/airoha_thermal.c b/drivers/thermal/airoha_thermal.c index 45116cdaee653b..9a7a702a17dee2 100644 --- a/drivers/thermal/airoha_thermal.c +++ b/drivers/thermal/airoha_thermal.c @@ -155,7 +155,7 @@ * Can operate in: * - 1 sample * - 2 sample and make average of them - * - 4,6,10,16 sample, drop max and min and make avgerage of them + * - 4,6,10,16 sample, drop max and min and make average of them */ #define EN7581_MSRCTL_1SAMPLE 0x0 #define EN7581_MSRCTL_AVG2SAMPLE 0x1 @@ -365,12 +365,12 @@ static void airoha_thermal_setup_monitor(struct airoha_thermal_priv *priv) /* * Configure ADC valid reading addr * The AHB temp monitor system doesn't have direct access to the - * thermal sensor. It does instead work by providing all kind of - * address to configure how to access and setup an ADC for the + * thermal sensor. It does instead work by providing various + * addresses to configure how to access and setup an ADC for the * sensor. EN7581 supports only one sensor hence the * implementation is greatly simplified but the AHB supports - * up to 4 different sensor from the same ADC that can be - * switched by tuning the ADC mux or wiriting address. + * up to 4 different sensors from the same ADC that can be + * switched by tuning the ADC mux or writing address. * * We set valid instead of volt as we don't enable valid/volt * split reading and AHB read valid addr in such case. From e02ba486e73ecd48ca4fa4a1616483d2b54bde23 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 19 May 2025 15:09:01 +0800 Subject: [PATCH 352/353] thermal: intel: x86_pkg_temp_thermal: Fix bogus trip temperature The tj_max value obtained from the Intel TCC library are in Celsius, whereas the thermal subsystem operates in milli-Celsius. This discrepancy leads to incorrect trip temperature calculations. Fix bogus trip temperature by converting tj_max to milli-Celsius Unit. Fixes: 8ef0ca4a177d ("Merge back other thermal control material for 6.3.") Signed-off-by: Zhang Rui Reported-by: zhang ning Closes: https://lore.kernel.org/all/TY2PR01MB3786EF0FE24353026293F5ACCD97A@TY2PR01MB3786.jpnprd01.prod.outlook.com/ Tested-by: zhang ning Cc: 6.3+ # 6.3+ Link: https://patch.msgid.link/20250519070901.1031233-1-rui.zhang@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/x86_pkg_temp_thermal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c index 496abf8e55e0d5..2841d14914b710 100644 --- a/drivers/thermal/intel/x86_pkg_temp_thermal.c +++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c @@ -329,6 +329,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) tj_max = intel_tcc_get_tjmax(cpu); if (tj_max < 0) return tj_max; + tj_max *= 1000; zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL); if (!zonedev) From ae3701e7fb9eea353be23ccd4eca689f30b36f41 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 20 May 2025 20:55:18 +0800 Subject: [PATCH 353/353] [GitHub Actions] install libdw-dev libdw-dev is required for the new kernel version. Signed-off-by: Bard Liao --- .github/workflows/buildtest.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/buildtest.yml b/.github/workflows/buildtest.yml index f7f59bddadbe54..372952b53187fc 100644 --- a/.github/workflows/buildtest.yml +++ b/.github/workflows/buildtest.yml @@ -138,6 +138,11 @@ jobs: sudo apt update sudo apt install -y uuid-dev + - name: Install libdw-dev + run: | + sudo apt update + sudo apt install -y libdw-dev + - name: build start run: | export ARCH=x86_64 KCFLAGS="-Wall -Werror"