Skip to content

Commit 2370c94

Browse files
committed
powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV
jira LE-1907 Rebuild_History Non-Buildable kernel-5.14.0-427.37.1.el9_4 commit-author Gaurav Batra <gbatra@linux.vnet.ibm.com> commit 09a3c1e When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to NULL pointer exception: Kernel attempted to read user page (0) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000000 Faulting instruction address: 0xc000000020847ad4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ ctrliq#12 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c REGS: c000000029162ca0 TRAP: 0300 Not tainted (6.4.0-Test102+) MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 48288244 XER: 00000008 CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ... 
NIP _find_next_zero_bit+0x24/0x110 LR bitmap_find_next_zero_area_off+0x5c/0xe0 Call Trace: dev_printk_emit+0x38/0x48 (unreliable) iommu_area_alloc+0xc4/0x180 iommu_range_alloc+0x1e8/0x580 iommu_alloc+0x60/0x130 iommu_alloc_coherent+0x158/0x2b0 dma_iommu_alloc_coherent+0x3c/0x50 dma_alloc_attrs+0x170/0x1f0 mlx5_cmd_init+0xc0/0x760 [mlx5_core] mlx5_function_setup+0xf0/0x510 [mlx5_core] mlx5_init_one+0x84/0x210 [mlx5_core] probe_one+0x118/0x2c0 [mlx5_core] local_pci_probe+0x68/0x110 pci_call_probe+0x68/0x200 pci_device_probe+0xbc/0x1a0 really_probe+0x104/0x540 __driver_probe_device+0xb4/0x230 driver_probe_device+0x54/0x130 __driver_attach+0x158/0x2b0 bus_for_each_dev+0xa8/0x130 driver_attach+0x34/0x50 bus_add_driver+0x16c/0x300 driver_register+0xa4/0x1b0 __pci_register_driver+0x68/0x80 mlx5_init+0xb8/0x100 [mlx5_core] do_one_initcall+0x60/0x300 do_init_module+0x7c/0x2b0 At the time of LPAR dump, before kexec hands over control to kdump kernel, DDWs (Dynamic DMA Windows) are scanned and added to the FDT. For the SR-IOV case, default DMA window "ibm,dma-window" is removed from the FDT and DDW added, for the device. Now, kexec hands over control to the kdump kernel. When the kdump kernel initializes, PCI busses are scanned and IOMMU group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV case, there is no "ibm,dma-window". The original commit: b1fc44e, fixes the path where memory is pre-mapped (direct mapped) to the DDW. When TCEs are direct mapped, there is no need to initialize IOMMU tables. iommu_table_setparms_lpar() only considers "ibm,dma-window" property when initializing IOMMU table. In the scenario where TCEs are dynamically allocated for SR-IOV, newly created IOMMU table is not initialized. Later, when the device driver tries to enter TCEs for the SR-IOV device, NULL pointer exception is thrown from iommu_area_alloc(). The fix is to initialize the IOMMU table with DDW property stored in the FDT. There are 2 points to remember: 1. 
For the dedicated adapter, kdump kernel would encounter both default and DDW in FDT. In this case, DDW property is used to initialize the IOMMU table. 2. A DDW could be direct or dynamic mapped. kdump kernel would initialize IOMMU table and mark the existing DDW as "dynamic". This works fine since, at the time of table initialization, iommu_table_clear() makes some space in the DDW, for some predefined number of TCEs which are needed for kdump to succeed. Fixes: b1fc44e ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window") Signed-off-by: Gaurav Batra <gbatra@linux.vnet.ibm.com> Reviewed-by: Brian King <brking@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://msgid.link/20240125203017.61014-1-gbatra@linux.ibm.com (cherry picked from commit 09a3c1e) Signed-off-by: Jonathan Maple <jmaple@ciq.com>
1 parent ffb9f06 commit 2370c94

File tree

1 file changed

+105
-51
lines changed
  • arch/powerpc/platforms/pseries

1 file changed

+105
-51
lines changed

arch/powerpc/platforms/pseries/iommu.c

Lines changed: 105 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -570,29 +570,6 @@ static void iommu_table_setparms(struct pci_controller *phb,
570570

571571
struct iommu_table_ops iommu_table_lpar_multi_ops;
572572

573-
/*
574-
* iommu_table_setparms_lpar
575-
*
576-
* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
577-
*/
578-
static void iommu_table_setparms_lpar(struct pci_controller *phb,
579-
struct device_node *dn,
580-
struct iommu_table *tbl,
581-
struct iommu_table_group *table_group,
582-
const __be32 *dma_window)
583-
{
584-
unsigned long offset, size, liobn;
585-
586-
of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
587-
588-
iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
589-
&iommu_table_lpar_multi_ops);
590-
591-
592-
table_group->tce32_start = offset;
593-
table_group->tce32_size = size;
594-
}
595-
596573
struct iommu_table_ops iommu_table_pseries_ops = {
597574
.set = tce_build_pSeries,
598575
.clear = tce_free_pSeries,
@@ -721,44 +698,92 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
721698
* dynamic 64bit DMA window, walking up the device tree.
722699
*/
723700
static struct device_node *pci_dma_find(struct device_node *dn,
724-
const __be32 **dma_window)
701+
struct dynamic_dma_window_prop *prop)
725702
{
726-
const __be32 *dw = NULL;
703+
const __be32 *default_prop = NULL;
704+
const __be32 *ddw_prop = NULL;
705+
struct device_node *rdn = NULL;
706+
bool default_win = false, ddw_win = false;
727707

728708
for ( ; dn && PCI_DN(dn); dn = dn->parent) {
729-
dw = of_get_property(dn, "ibm,dma-window", NULL);
730-
if (dw) {
731-
if (dma_window)
732-
*dma_window = dw;
733-
return dn;
709+
default_prop = of_get_property(dn, "ibm,dma-window", NULL);
710+
if (default_prop) {
711+
rdn = dn;
712+
default_win = true;
713+
}
714+
ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
715+
if (ddw_prop) {
716+
rdn = dn;
717+
ddw_win = true;
718+
break;
719+
}
720+
ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
721+
if (ddw_prop) {
722+
rdn = dn;
723+
ddw_win = true;
724+
break;
734725
}
735-
dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
736-
if (dw)
737-
return dn;
738-
dw = of_get_property(dn, DMA64_PROPNAME, NULL);
739-
if (dw)
740-
return dn;
726+
727+
/* At least found default window, which is the case for normal boot */
728+
if (default_win)
729+
break;
741730
}
742731

743-
return NULL;
732+
/* For PCI devices there will always be a DMA window, either on the device
733+
* or parent bus
734+
*/
735+
WARN_ON(!(default_win | ddw_win));
736+
737+
/* caller doesn't want to get DMA window property */
738+
if (!prop)
739+
return rdn;
740+
741+
/* parse DMA window property. During normal system boot, only default
742+
* DMA window is passed in OF. But, for kdump, a dedicated adapter might
743+
* have both default and DDW in FDT. In this scenario, DDW takes precedence
744+
* over default window.
745+
*/
746+
if (ddw_win) {
747+
struct dynamic_dma_window_prop *p;
748+
749+
p = (struct dynamic_dma_window_prop *)ddw_prop;
750+
prop->liobn = p->liobn;
751+
prop->dma_base = p->dma_base;
752+
prop->tce_shift = p->tce_shift;
753+
prop->window_shift = p->window_shift;
754+
} else if (default_win) {
755+
unsigned long offset, size, liobn;
756+
757+
of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
758+
759+
prop->liobn = cpu_to_be32((u32)liobn);
760+
prop->dma_base = cpu_to_be64(offset);
761+
prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);
762+
prop->window_shift = cpu_to_be32(order_base_2(size));
763+
}
764+
765+
return rdn;
744766
}
745767

746768
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
747769
{
748770
struct iommu_table *tbl;
749771
struct device_node *dn, *pdn;
750772
struct pci_dn *ppci;
751-
const __be32 *dma_window = NULL;
773+
struct dynamic_dma_window_prop prop;
752774

753775
dn = pci_bus_to_OF_node(bus);
754776

755777
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
756778
dn);
757779

758-
pdn = pci_dma_find(dn, &dma_window);
780+
pdn = pci_dma_find(dn, &prop);
759781

760-
if (dma_window == NULL)
761-
pr_debug(" no ibm,dma-window property !\n");
782+
/* In PPC architecture, there will always be DMA window on bus or one of the
783+
* parent bus. During reboot, there will be ibm,dma-window property to
784+
* define DMA window. For kdump, there will at least be default window or DDW
785+
* or both.
786+
*/
762787

763788
ppci = PCI_DN(pdn);
764789

@@ -768,13 +793,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
768793
if (!ppci->table_group) {
769794
ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
770795
tbl = ppci->table_group->tables[0];
771-
if (dma_window) {
772-
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
773-
ppci->table_group, dma_window);
774796

775-
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
776-
panic("Failed to initialize iommu table");
777-
}
797+
iommu_table_setparms_common(tbl, ppci->phb->bus->number,
798+
be32_to_cpu(prop.liobn),
799+
be64_to_cpu(prop.dma_base),
800+
1ULL << be32_to_cpu(prop.window_shift),
801+
be32_to_cpu(prop.tce_shift), NULL,
802+
&iommu_table_lpar_multi_ops);
803+
804+
/* Only for normal boot with default window. Doesn't matter even
805+
* if we set these with DDW which is 64bit during kdump, since
806+
* these will not be used during kdump.
807+
*/
808+
ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
809+
ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
810+
811+
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
812+
panic("Failed to initialize iommu table");
813+
778814
iommu_register_group(ppci->table_group,
779815
pci_domain_nr(bus), 0);
780816
pr_debug(" created table: %p\n", ppci->table_group);
@@ -965,6 +1001,12 @@ static void find_existing_ddw_windows_named(const char *name)
9651001
continue;
9661002
}
9671003

1004+
/* If at the time of system initialization, there are DDWs in OF,
1005+
* it means this is during kexec. DDW could be direct or dynamic.
1006+
* We will just mark DDWs as "dynamic" since this is kdump path,
1007+
* no need to worry about performance. ddw_list_new_entry() will
1008+
* set window->direct = false.
1009+
*/
9681010
window = ddw_list_new_entry(pdn, dma64);
9691011
if (!window) {
9701012
of_node_put(pdn);
@@ -1532,8 +1574,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15321574
{
15331575
struct device_node *pdn, *dn;
15341576
struct iommu_table *tbl;
1535-
const __be32 *dma_window = NULL;
15361577
struct pci_dn *pci;
1578+
struct dynamic_dma_window_prop prop;
15371579

15381580
pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
15391581

@@ -1546,7 +1588,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15461588
dn = pci_device_to_OF_node(dev);
15471589
pr_debug(" node is %pOF\n", dn);
15481590

1549-
pdn = pci_dma_find(dn, &dma_window);
1591+
pdn = pci_dma_find(dn, &prop);
15501592
if (!pdn || !PCI_DN(pdn)) {
15511593
printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
15521594
"no DMA window found for pci dev=%s dn=%pOF\n",
@@ -1559,8 +1601,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15591601
if (!pci->table_group) {
15601602
pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
15611603
tbl = pci->table_group->tables[0];
1562-
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1563-
pci->table_group, dma_window);
1604+
1605+
iommu_table_setparms_common(tbl, pci->phb->bus->number,
1606+
be32_to_cpu(prop.liobn),
1607+
be64_to_cpu(prop.dma_base),
1608+
1ULL << be32_to_cpu(prop.window_shift),
1609+
be32_to_cpu(prop.tce_shift), NULL,
1610+
&iommu_table_lpar_multi_ops);
1611+
1612+
/* Only for normal boot with default window. Doesn't matter even
1613+
* if we set these with DDW which is 64bit during kdump, since
1614+
* these will not be used during kdump.
1615+
*/
1616+
pci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
1617+
pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
15641618

15651619
iommu_init_table(tbl, pci->phb->node, 0, 0);
15661620
iommu_register_group(pci->table_group,

0 commit comments

Comments
 (0)