diff --git a/sys/powerpc/powernv/opal.h b/sys/powerpc/powernv/opal.h --- a/sys/powerpc/powernv/opal.h +++ b/sys/powerpc/powernv/opal.h @@ -203,6 +203,8 @@ #define OPAL_XIVE_EQ_ALWAYS_NOTIFY 0x00000002 #define OPAL_XIVE_EQ_ESCALATE 0x00000004 +#define OPAL_IODA_INVALID_PE 0xffffffff + struct opal_msg { uint32_t msg_type; uint32_t reserved; diff --git a/sys/powerpc/powernv/opal_iommu.h b/sys/powerpc/powernv/opal_iommu.h new file mode 100644 --- /dev/null +++ b/sys/powerpc/powernv/opal_iommu.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025-2026 Raptor Engineering, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _POWERNV_IODA_IOMMU_H_ +#define _POWERNV_IODA_IOMMU_H_ + +#include +#include + +int powernv_iommu_set_dma_tag(device_t dev, device_t child, bus_dma_tag_t tag); +int powernv_iommu_map(device_t dev, bus_dma_segment_t *segs, int *nsegs, + bus_addr_t min, bus_addr_t max, bus_size_t alignment, bus_addr_t boundary, + void *cookie); +int powernv_iommu_unmap(device_t dev, bus_dma_segment_t *segs, int nsegs, + void *cookie); + +#endif diff --git a/sys/powerpc/powernv/opal_pci.h b/sys/powerpc/powernv/opal_pci.h new file mode 100644 --- /dev/null +++ b/sys/powerpc/powernv/opal_pci.h @@ -0,0 +1,181 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025-2026 Raptor Engineering, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _POWERNV_OPAL_PCI_H_ +#define _POWERNV_OPAL_PCI_H_ + +#include +#include + +#include + +/* + * IODA firmware and hardware both expect a 64k page size, even if the kernel + * notionally uses a lower page size (e.g. 4k) for other operations. + */ +#define POWERNV_FW_PAGE_SHIFT 16 +#define POWERNV_INVALID_DEVFN 0xffffffff + +/* + * The content of the TVT itself is read by hardware (cached into the PHB); + * all other fields are used by the kernel. Note that the TVT contents + * are interpreted differently based on how the PHB is set up; this is why + * we need to keep a local struct for each PHB with the associated configuration + * values! + */ +struct powernv_ioda_tvt { + uint64_t tce_size; + uint64_t tce_tbl_size; + uint64_t tvt_entry_count; + + int page_shift; + + /* contiguous memory read directly by hardware after TCE invalidation */ + uint64_t *tvt_base; +}; + +/* Delayed Resource Configuration (DRC) list */ +typedef struct opalpci_phb_drc_list { + device_t bus; + device_t child; + int type; + int rid; + rman_res_t start; + rman_res_t end; + rman_res_t count; + u_int flags; +} opalpci_phb_drc_list_t; + +struct phb_drc_list_entry +{ + LIST_ENTRY(phb_drc_list_entry) next_entry; + opalpci_phb_drc_list_t list; +}; + +/* + * NOTE + * The "master" PE flag is a convention with no hardware meaning + * It indicates that basic device control (32-bit BARs, MSI, etc.) has been + * routed through that PE. 
"Slave" PEs are used only to enable additional + * 64-bit BARs on the device that won't fit inside the master PE for that + * specific device. + */ +typedef struct opalpci_phb_pe_map { + device_t pci_bus_device; + uint32_t pci_bus_start_devfn; + uint32_t pci_bus_end_devfn; + uint64_t phb_pe; + int is_master_pe; + uint64_t master_pe; +} opalpci_phb_pe_map_t; + +struct phb_pe_map_entry +{ + LIST_ENTRY(phb_pe_map_entry) next_entry; + opalpci_phb_pe_map_t mapping; +}; + +typedef struct opalpci_phb_pe_data { + uint32_t pci_bus_start_devfn; + uint32_t pci_bus_end_devfn; + uint8_t opal_bus_filter; + uint8_t opal_dev_filter; + uint8_t opal_func_filter; + struct powernv_ioda_tvt tvt; /* TVT table for IOMMU */ + int device_count; + int dma_setup_finished; + int msi_activated; + uint8_t *active_32_bit_mmio_segments; + uint64_t phb_pe; +} opalpci_phb_pe_data_t; + +struct phb_pe_data_entry +{ + LIST_ENTRY(phb_pe_data_entry) next_entry; + opalpci_phb_pe_data_t mapping; +}; + +/* Define the shared soft context for all OPAL PHBs */ +struct opalpci_softc { + struct ofw_pci_softc ofw_sc; + uint64_t phb_id; + vmem_t *msi_vmem; + int msi_base; /* Base XIVE number */ + int base_msi_irq; /* Base IRQ assigned by FreeBSD to this PIC */ + int pe_setup_finalized; /* 0 if PE setup has not yet been run */ + struct resource *r_reg; + + uint64_t *tce; /* TCE table for 1:1 mapping */ + + uint8_t root_bus_id; + uint64_t root_bus_pe; + cell_t max_pe_count; + cell_t reserved_pe_id; /* Index of the reserved PE */ + uint64_t m32_pci_base; + uint64_t m32_window_base; + uint64_t m32_window_size; + uint64_t m32_segment_size; + int m64_window_present; + int m64_window_bar_idx; + uint64_t m64_pci_base; + uint64_t m64_window_base; + uint64_t m64_window_size; + uint64_t m64_segment_size; + + /* linked list storing PCI device to PE mappings */ + LIST_HEAD(phb_drc_list, phb_drc_list_entry) pe_drc_list; + LIST_HEAD(phb_pe_map_list, phb_pe_map_entry) pe_map_list; + LIST_HEAD(phb_pe_data_list, phb_pe_data_entry) 
pe_data_list; +}; + +struct phb_pe_data_entry * opalpci_add_phb_pe_data_entry(device_t phb, + uint64_t pe); +int opalpci_remove_phb_pe_data_entry(device_t phb, uint64_t pe); +struct phb_pe_data_entry * opalpci_get_phb_pe_data_entry(device_t phb, + uint64_t pe); + +struct phb_pe_map_entry * opalpci_add_phb_pe_map_entry(device_t phb, + device_t dev, uint64_t pe, int is_master_pe); +int opalpci_remove_phb_pe_map_entry(device_t phb, device_t dev, uint64_t pe); +uint64_t opalpci_find_phb_pe_for_device(device_t phb, device_t dev); +uint64_t opalpci_find_phb_pe_for_devfn(device_t phb, uint32_t devfn); +uint64_t opalpci_find_unused_phb_pe(device_t phb); +uint64_t opalpci_phb_get_slave_pe_count_for_device(device_t phb, device_t dev, + uint64_t pe); + +int opalpc_ioda2_configure_dma_bypass(device_t phb, uint64_t pe, int enable); + +int opalpci_phb_is_pci_memory_window_64_bit(device_t phb, + rman_res_t window_start); +uint64_t opalpci_phb_get_m64_pe_for_addr(device_t phb, rman_res_t addr); + +int opalpci_ioda2_set_iommu_window(device_t phb, uint64_t pe, + int window_number, struct powernv_ioda_tvt * tvt); +int opalpci_ioda2_clear_iommu_window(device_t phb, uint64_t pe, + int window_number, struct powernv_ioda_tvt * tvt); + +#endif /* _POWERNV_OPAL_PCI_H_ */ \ No newline at end of file diff --git a/sys/powerpc/powernv/opal_pci.c b/sys/powerpc/powernv/opal_pci.c --- a/sys/powerpc/powernv/opal_pci.c +++ b/sys/powerpc/powernv/opal_pci.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2015-2016 Nathan Whitehorn * Copyright (c) 2017-2018 Semihalf + * Copyright (c) 2025-2026 Raptor Engineering, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -44,6 +45,7 @@ #include #include +#include #include #include @@ -56,12 +58,15 @@ #include "pic_if.h" #include "iommu_if.h" #include "opal.h" +#include "opal_pci.h" +#include "opal_iommu.h" #define OPAL_PCI_TCE_MAX_ENTRIES (1024*1024UL) #define OPAL_PCI_TCE_DEFAULT_SEG_SIZE (16*1024*1024UL) #define OPAL_PCI_TCE_R (1UL << 0) #define OPAL_PCI_TCE_W (1UL << 1) #define PHB3_TCE_KILL_INVAL_ALL (1UL << 63) +#define PHB3_TCE_KILL_INVAL_PE (1UL << 62) /* * Device interface. @@ -86,6 +91,13 @@ int irq); static int opalpci_map_msi(device_t dev, device_t child, int irq, uint64_t *addr, uint32_t *data); +static struct resource * opalpci_alloc_resource(device_t bus, device_t child, + int type, int rid, + rman_res_t start, rman_res_t end, rman_res_t count, + u_int flags); +static void opalpci_finalize_pe_setup(device_t dev); +static int opalpci_release_resource(device_t bus, device_t child, + struct resource *res); static int opalpci_route_interrupt(device_t bus, device_t dev, int pin); /* @@ -148,27 +160,29 @@ DEVMETHOD(pic_eoi, opalpic_pic_eoi), /* Bus interface */ + DEVMETHOD(bus_alloc_resource, opalpci_alloc_resource), + DEVMETHOD(bus_release_resource, opalpci_release_resource), + DEVMETHOD(bus_new_pass, opalpci_finalize_pe_setup), DEVMETHOD(bus_get_dma_tag, opalpci_get_dma_tag), DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), - DEVMETHOD_END -}; +#ifdef IOMMU + /* IOMMU functions */ + DEVMETHOD(iommu_map, powernv_iommu_map), + DEVMETHOD(iommu_unmap, powernv_iommu_unmap), +#endif -struct opalpci_softc { - struct ofw_pci_softc ofw_sc; - uint64_t phb_id; - vmem_t *msi_vmem; - int msi_base; /* Base XIVE number */ - int base_msi_irq; /* Base IRQ assigned by FreeBSD to this PIC */ - uint64_t *tce; /* TCE table for 1:1 mapping */ - struct resource *r_reg; + DEVMETHOD_END }; DEFINE_CLASS_1(pcib, opalpci_driver, opalpci_methods, sizeof(struct opalpci_softc), 
ofw_pcib_driver); EARLY_DRIVER_MODULE(opalpci, ofwbus, opalpci_driver, 0, 0, BUS_PASS_BUS); +/* define a dedicated memory pool */ +MALLOC_DEFINE(M_OPALPCI_PHB, "opal-phb", "Internal data for PowerNV PHBs"); + static int opalpci_probe(device_t dev) { @@ -190,6 +204,7 @@ return (BUS_PROBE_GENERIC); } +#if 0 static void pci_phb3_tce_invalidate_entire(struct opalpci_softc *sc) { @@ -198,6 +213,17 @@ bus_write_8(sc->r_reg, 0x210, PHB3_TCE_KILL_INVAL_ALL); mb(); } +#endif + +static void +pci_phb3_tce_invalidate_pe(struct opalpci_softc *sc, uint64_t pe) +{ + uint64_t dword = PHB3_TCE_KILL_INVAL_PE | (pe & 0xff); + + mb(); + bus_write_8(sc->r_reg, 0x210, dword); + mb(); +} /* Simple function to round to a power of 2 */ static uint64_t @@ -233,23 +259,553 @@ return (1ULL << sizes[count - 1]); } +struct phb_pe_data_entry * +opalpci_add_phb_pe_data_entry(device_t phb, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + + sc = device_get_softc(phb); + + LIST_FOREACH(pe_data_entry, &sc->pe_data_list, next_entry) { + if (pe_data_entry->mapping.phb_pe == pe) + KASSERT(0, + ("PHB[%jd] PE %02lx already allocated", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + } + + pe_data_entry = malloc(sizeof(struct phb_pe_data_entry), M_OPALPCI_PHB, + M_WAITOK | M_ZERO); + if (!pe_data_entry) + return NULL; + + pe_data_entry->mapping.pci_bus_start_devfn = POWERNV_INVALID_DEVFN; + pe_data_entry->mapping.pci_bus_end_devfn = POWERNV_INVALID_DEVFN; + pe_data_entry->mapping.opal_bus_filter = OPAL_PCI_BUS_ANY; + pe_data_entry->mapping.opal_dev_filter = OPAL_IGNORE_RID_DEVICE_NUMBER; + pe_data_entry->mapping.opal_func_filter = OPAL_IGNORE_RID_FUNC_NUMBER; + pe_data_entry->mapping.tvt.tvt_base = NULL; + pe_data_entry->mapping.device_count = 0; + pe_data_entry->mapping.dma_setup_finished = 0; + pe_data_entry->mapping.msi_activated = 0; + pe_data_entry->mapping.phb_pe = pe; + pe_data_entry->mapping.active_32_bit_mmio_segments = malloc( + 
(sizeof(pe_data_entry->mapping.active_32_bit_mmio_segments[0]) * + sc->max_pe_count), + M_OPALPCI_PHB, M_WAITOK | M_ZERO); + LIST_INSERT_HEAD(&sc->pe_data_list, pe_data_entry, next_entry); + + return pe_data_entry; +} + +int +opalpci_remove_phb_pe_data_entry(device_t phb, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + + sc = device_get_softc(phb); + + LIST_FOREACH(pe_data_entry, &sc->pe_data_list, next_entry) { + if (pe_data_entry->mapping.phb_pe == pe) + break; + } + + KASSERT(pe_data_entry != NULL, + ("PHB[%jd] PE %02lx not found", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + + free(pe_data_entry->mapping.tvt.tvt_base, M_DEVBUF); + free(pe_data_entry->mapping.active_32_bit_mmio_segments, M_OPALPCI_PHB); + pe_data_entry->mapping.tvt.tvt_base = NULL; + + LIST_REMOVE(pe_data_entry, next_entry); + free(pe_data_entry, M_OPALPCI_PHB); + + return 0; +} + +struct phb_pe_data_entry * +opalpci_get_phb_pe_data_entry(device_t phb, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + + sc = device_get_softc(phb); + + LIST_FOREACH(pe_data_entry, &sc->pe_data_list, next_entry) { + if (pe_data_entry->mapping.phb_pe == pe) + return pe_data_entry; + } + + return NULL; +} + +struct phb_pe_map_entry * +opalpci_add_phb_pe_map_entry(device_t phb, device_t dev, uint64_t pe, + int is_master_pe) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry * pe_map_entry; + struct phb_pe_data_entry * pe_data_entry; + + sc = device_get_softc(phb); + + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (!pe_data_entry) { + pe_data_entry = opalpci_add_phb_pe_data_entry(phb, pe); + + if (!pe_data_entry) { + KASSERT(pe_data_entry != NULL, + ("PHB[%jd] Unable to allocate data structures for PE %02lx", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + } + } + + pe_data_entry->mapping.device_count++; + + LIST_FOREACH(pe_map_entry, &sc->pe_map_list, next_entry) { + if ((pe_map_entry->mapping.pci_bus_device == dev) && + 
(pe_map_entry->mapping.phb_pe == pe)) + KASSERT(0, + ("PHB[%jd] PE %02lx already assigned to device", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + } + + pe_map_entry = malloc(sizeof(struct phb_pe_map_entry), M_OPALPCI_PHB, + M_WAITOK | M_ZERO); + if (!pe_map_entry) + return NULL; + + pe_map_entry->mapping.pci_bus_device = dev; + pe_map_entry->mapping.pci_bus_start_devfn = POWERNV_INVALID_DEVFN; + pe_map_entry->mapping.pci_bus_end_devfn = POWERNV_INVALID_DEVFN; + pe_map_entry->mapping.phb_pe = pe; + pe_map_entry->mapping.is_master_pe = is_master_pe; + pe_map_entry->mapping.master_pe = OPAL_IODA_INVALID_PE; + LIST_INSERT_HEAD(&sc->pe_map_list, pe_map_entry, next_entry); + + return pe_map_entry; +} + +int +opalpci_remove_phb_pe_map_entry(device_t phb, device_t dev, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry *pe_map_entry, *pe_map_entry_tmp; + struct phb_pe_data_entry *pe_data_entry; + int device_removed = 0; + + sc = device_get_softc(phb); + + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (!pe_data_entry) { + KASSERT(pe_data_entry != NULL, + ("PHB[%jd] Missing data structures for PE %02lx", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + } + + LIST_FOREACH_SAFE(pe_map_entry, &sc->pe_map_list, next_entry, + pe_map_entry_tmp) { + if ((pe_map_entry->mapping.pci_bus_device == dev) && + (pe_map_entry->mapping.phb_pe == pe)) { + if (pe_data_entry->mapping.device_count > 0) + pe_data_entry->mapping.device_count--; + + LIST_REMOVE(pe_map_entry, next_entry); + free(pe_map_entry, M_OPALPCI_PHB); + + device_removed = 1; + } + } + + KASSERT(device_removed, + ("PHB[%jd] PE %02lx device mapping not found", + (uintmax_t)sc->phb_id, (uintmax_t)pe)); + + if (pe_data_entry->mapping.device_count == 0) { + opalpci_remove_phb_pe_data_entry(phb, pe); + } + + return 0; +} + +uint64_t +opalpci_find_phb_pe_for_device(device_t phb, device_t dev) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry * pe_map_entry; + + sc = device_get_softc(phb); + + 
LIST_FOREACH(pe_map_entry, &sc->pe_map_list, next_entry) { + if (pe_map_entry->mapping.pci_bus_device == dev) + return pe_map_entry->mapping.phb_pe; + } + + return OPAL_IODA_INVALID_PE; +} + +uint64_t +opalpci_find_phb_pe_for_devfn(device_t phb, uint32_t devfn) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry * pe_map_entry; + + sc = device_get_softc(phb); + + LIST_FOREACH(pe_map_entry, &sc->pe_map_list, next_entry) { + if ((pe_map_entry->mapping.pci_bus_start_devfn <= devfn) + && (devfn <= pe_map_entry->mapping.pci_bus_end_devfn)) + return pe_map_entry->mapping.phb_pe; + } + + return OPAL_IODA_INVALID_PE; +} + +uint64_t +opalpci_find_unused_phb_pe(device_t phb) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry * pe_map_entry; + int64_t pe; + uint64_t unused_pe = OPAL_IODA_INVALID_PE; + + sc = device_get_softc(phb); + + /* Scan backward as PEs are traditionally allocated from high to low */ + for (pe = sc->max_pe_count-1; pe >= 0; pe--) { + LIST_FOREACH(pe_map_entry, &sc->pe_map_list, next_entry) { + if (pe_map_entry->mapping.phb_pe == pe) + break; + } + + if (pe_map_entry == NULL) { + unused_pe = pe; + break; + } + } + + return unused_pe; +} + +uint64_t +opalpci_phb_get_slave_pe_count_for_device(device_t phb, device_t dev, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_map_entry * pe_map_entry; + uint64_t slave_count = 0; + + sc = device_get_softc(phb); + + LIST_FOREACH(pe_map_entry, &sc->pe_map_list, next_entry) { + if ((pe_map_entry->mapping.pci_bus_device == dev) && + (pe_map_entry->mapping.phb_pe == pe) && + (!pe_map_entry->mapping.is_master_pe)) + slave_count++; + } + + return slave_count; +} + +static int +opalpci_initialize_ioda2_mmio(device_t dev) +{ + struct opalpci_softc *sc; + int err; + + sc = device_get_softc(dev); + + /* Set up the default 64-bit memory window */ + err = opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, sc->phb_id, + OPAL_M64_WINDOW_TYPE, sc->m64_window_bar_idx, + sc->m64_window_base, 0, + sc->m64_window_size); + if 
(err != 0) { + device_printf(dev, + "64-bit memory window setup failed: %d\n", err); + return (ENXIO); + } + + /* Enable the default 64-bit memory window + * + * Note that we set this up in segmented mode, which basically + * splits the 64-bit MMIO window into segments, + * with each segment assigned in hardware to each PE#, in numerical + * order. + * + * This means that the upper address bits of each 64-bit device + * BAR effectively select the associated PE# that we can use for + * that particular device and BAR combination... + */ + err = opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, + OPAL_M64_WINDOW_TYPE, sc->m64_window_bar_idx, + OPAL_ENABLE_M64_SPLIT); + if (err != 0) { + device_printf(dev, + "64-bit memory window enable failed: %d\n", err); + return (ENXIO); + } + + if (bootverbose) + device_printf(dev, + "Enabled segmented 64-bit memory window #%d at 0x%lx " + "size 0x%lx segment size 0x%lx\n", + sc->m64_window_bar_idx, sc->m64_window_base, + sc->m64_window_size, sc->m64_segment_size); + + return 0; +} + +static int +opalpc_ioda2_create_tvt(device_t phb, uint64_t pe, int tve_num, + uint64_t window_size, int page_shift, int levels) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + + sc = device_get_softc(phb); + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (pe_data_entry == NULL) + panic("Failed to locate data structures for PHB %jd PE %02lx\n", + (uintmax_t)sc->phb_id, pe); + + pe_data_entry->mapping.tvt.tce_size = max_tce_size(phb); + pe_data_entry->mapping.tvt.tvt_entry_count = + round_pow2(window_size / pe_data_entry->mapping.tvt.tce_size); + pe_data_entry->mapping.tvt.tce_tbl_size = + MAX(pe_data_entry->mapping.tvt.tvt_entry_count * sizeof(uint64_t), 4096); + pe_data_entry->mapping.tvt.page_shift = page_shift; + if (pe_data_entry->mapping.tvt.tvt_entry_count > OPAL_PCI_TCE_MAX_ENTRIES) + panic("PowerNV supports only %jdGB of memory space\n", + (uintmax_t)((OPAL_PCI_TCE_MAX_ENTRIES * 
pe_data_entry->mapping.tvt.tce_size) >> 30)); + + pe_data_entry->mapping.tvt.tvt_base = contigmalloc( + pe_data_entry->mapping.tvt.tce_tbl_size, + M_DEVBUF, M_NOWAIT | M_ZERO, 0, + BUS_SPACE_MAXADDR, pe_data_entry->mapping.tvt.tce_tbl_size, 0); + + if (pe_data_entry->mapping.tvt.tvt_base == NULL) + panic("Failed to allocate TVT for PHB %jd PE %02lx\n", + (uintmax_t)sc->phb_id, pe); + + return 0; +} + +int +opalpc_ioda2_configure_dma_bypass(device_t phb, uint64_t pe, int enable) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + int err; + uint64_t maxmem; + + sc = device_get_softc(phb); + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (pe_data_entry == NULL) + panic("Failed to locate data structures for PHB %jd PE %02lx\n", + (uintmax_t)sc->phb_id, pe); + + /* + * The window number for the 64-bit bypass is always 1 by convention, + * as this selects TVE #1 + */ + if (enable) { + device_printf(phb, "Enabling 64-bit DMA bypass mode\n"); + maxmem = roundup2(powerpc_ptob(Maxmem), + pe_data_entry->mapping.tvt.tce_size); + err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, sc->phb_id, + pe, (pe << 1) + 1, + (1UL << 59), maxmem); + } + else { + device_printf(phb, "Disabling 64-bit DMA bypass mode\n"); + err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, sc->phb_id, + pe, (pe << 1) + 1, + (1UL << 59), 0); + } + if (err != 0) { + device_printf(phb, "64-bit DMA bypass mapping failed: %d\n", err); + return (ENXIO); + } + + return 0; +} + +int +opalpci_ioda2_set_iommu_window(device_t phb, uint64_t pe, int window_number, + struct powernv_ioda_tvt * tvt) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + int err; + + sc = device_get_softc(phb); + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (pe_data_entry == NULL) + panic("Failed to locate data structures for PHB %jd PE %02lx\n", + (uintmax_t)sc->phb_id, pe); + + if (bootverbose) + device_printf(phb, "Mapping PE# %lx window #%d %#jx-%#jx " + "TCE 
segment size %lx\n", + pe, window_number, + (uintmax_t)(0), + (uintmax_t)(tvt->tce_tbl_size), + (uintmax_t)(tvt->tce_size)); + + err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, sc->phb_id, + pe, (pe << 1) + window_number, + 1, pmap_kextract((uint64_t)&tvt->tvt_base[0]), + tvt->tce_tbl_size, tvt->tce_size); + if (err != 0) { + device_printf(phb, "DMA IOMMU mapping failed: %d\n", err); + return (ENXIO); + } + + /* + * Invalidate all previous TCE entries for this PE. + */ + if (ofw_bus_is_compatible(phb, "power8-pciex")) { + pci_phb3_tce_invalidate_pe(sc, pe); + } + else { + err = opal_call(OPAL_PCI_TCE_KILL, sc->phb_id, OPAL_PCI_TCE_KILL_PE, + pe, 0, 0, 0); + if (err != 0) { + device_printf(phb, "PE# %lx TCE invalidation failed: %d\n", + pe, err); + return (ENXIO); + } + } + + return 0; +} + +int +opalpci_ioda2_clear_iommu_window(device_t phb, uint64_t pe, int window_number, + struct powernv_ioda_tvt * tvt) +{ + struct opalpci_softc *sc; + int err; + + sc = device_get_softc(phb); + + /* + * Disable IOMMU for PE - deactivate all mappings + */ + if (bootverbose) + device_printf(phb, "Unmapping PE# %lx window #%d for DMA\n", + pe, window_number); + + err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, sc->phb_id, + pe, (pe << 1) + window_number, 0, 0, 0, 0); + if (err != 0) { + device_printf(phb, "DMA IOMMU unmapping failed: %d\n", err); + return (ENXIO); + } + + /* + * Invalidate all previous TCE entries for this PE. 
+ */ + if (ofw_bus_is_compatible(phb, "power8-pciex")) { + pci_phb3_tce_invalidate_pe(sc, pe); + } + else { + err = opal_call(OPAL_PCI_TCE_KILL, sc->phb_id, OPAL_PCI_TCE_KILL_PE, + pe, 0, 0, 0); + if (err != 0) { + device_printf(phb, "PE# %lx TCE invalidation failed: %d\n", + pe, err); + return (ENXIO); + } + } + + /* + * Clear kernel mappings + */ + memset(tvt->tvt_base, 0, tvt->tce_tbl_size); + + return 0; +} + +static int +opalpci_ioda2_set_default_dma_window(device_t phb, uint64_t pe) +{ + struct opalpci_softc *sc; + struct phb_pe_data_entry * pe_data_entry; + int i, err; + rman_res_t start, end; + int index_start, index_end; + + sc = device_get_softc(phb); + pe_data_entry = opalpci_get_phb_pe_data_entry(phb, pe); + if (pe_data_entry == NULL) + panic("Failed to locate data structures for PHB %jd PE %02lx\n", + (uintmax_t)sc->phb_id, pe); + + /* Don't rerun setup if DMA has already been configured on this PE */ + if (pe_data_entry->mapping.dma_setup_finished) + return 0; + + /* Set up default IOMMU window using all available physical memory */ + start = 0; + end = powerpc_ptob(Maxmem); + + /* + * Map everything within the default IOMMU window 1:1, using the + * preconfigured TVT segment size + */ + index_start = start / pe_data_entry->mapping.tvt.tce_size; + index_end = end / pe_data_entry->mapping.tvt.tce_size; + if (index_start > pe_data_entry->mapping.tvt.tvt_entry_count) + index_start = pe_data_entry->mapping.tvt.tvt_entry_count; + if (index_end > pe_data_entry->mapping.tvt.tvt_entry_count) + index_end = pe_data_entry->mapping.tvt.tvt_entry_count; + for (i = index_start; i < index_end; i++) { + pe_data_entry->mapping.tvt.tvt_base[i] = + htobe64((i * pe_data_entry->mapping.tvt.tce_size) | + OPAL_PCI_TCE_R | OPAL_PCI_TCE_W); + } + + /* Activate the default IOMMU window */ + err = opalpci_ioda2_set_iommu_window(phb, pe, 0, + &pe_data_entry->mapping.tvt); + if (err != 0) + return err; + + /* Enable 64-bit DMA bypass */ + err = 
opalpc_ioda2_configure_dma_bypass(phb, pe, 1); + if (err != 0) + return (ENXIO); + + pe_data_entry->mapping.dma_setup_finished = 1; + + return 0; +} + static int opalpci_attach(device_t dev) { struct opalpci_softc *sc; - cell_t id[2], m64ranges[2], m64window[6], npe; + cell_t id[2], m64ranges[2], m64window[6]; phandle_t node; int i, err; - uint64_t maxmem; - uint64_t entries; - uint64_t tce_size; - uint64_t tce_tbl_size; - int m64bar; int rid; sc = device_get_softc(dev); node = ofw_bus_get_node(dev); + /* allocate initial delayed resource allocation linked list */ + LIST_INIT(&sc->pe_drc_list); + + /* allocate initial PE data linked list */ + LIST_INIT(&sc->pe_data_list); + + /* allocate initial device to PE map linked list */ + LIST_INIT(&sc->pe_map_list); + switch (OF_getproplen(node, "ibm,opal-phbid")) { case 8: OF_getencprop(node, "ibm,opal-phbid", id, 8); @@ -265,8 +821,47 @@ return (ENXIO); } + /* + * Retrieve reserved PE number from device tree + * Use PE #0 if reserved PE number not present in DT + */ + if (OF_getencprop(node, "ibm,opal-reserved-pe", + &sc->reserved_pe_id, 4) != 4) + sc->reserved_pe_id = 0; + + /* Retrieve maximum PE count from device tree + * Use a minimum of 1 if PE count not present in DT + */ + if (OF_getencprop(node, "ibm,opal-num-pes", &sc->max_pe_count, 4) != 4) + sc->max_pe_count = 1; + if (bootverbose) - device_printf(dev, "OPAL ID %#lx\n", sc->phb_id); + device_printf(dev, "OPAL ID %#lx reserved PE# %x\n", + sc->phb_id, sc->reserved_pe_id); + + /* + * Retrieve 64-bit memory window configuration data from device tree, + * and use the last 64-bit BAR as the 64-bit memory window + */ + if (OF_getencprop(node, "ibm,opal-available-m64-ranges", + m64ranges, sizeof(m64ranges)) == sizeof(m64ranges)) { + sc->m64_window_bar_idx = m64ranges[0] + m64ranges[1] - 1; + m64ranges[1]--; + } + else { + sc->m64_window_bar_idx = 0; + } + + sc->m64_window_present = 0; + if (OF_getencprop(node, "ibm,opal-m64-window", + m64window, sizeof(m64window)) == 
sizeof(m64window)) { + sc->m64_pci_base = ((uint64_t)m64window[0] << 32) | m64window[1]; + sc->m64_window_base = ((uint64_t)m64window[2] << 32) | m64window[3]; + sc->m64_window_size = ((uint64_t)m64window[4] << 32) | m64window[5]; + sc->m64_segment_size = sc->m64_window_size / sc->max_pe_count; + + sc->m64_window_present = 1; + } rid = 0; sc->r_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, @@ -277,7 +872,6 @@ return (ENXIO); } -#if 0 /* * Reset PCI IODA table */ @@ -287,6 +881,8 @@ device_printf(dev, "IODA table reset failed: %d\n", err); return (ENXIO); } + +#if 0 err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PHB_COMPLETE, 1); if (err < 0) { @@ -315,103 +911,55 @@ #endif /* - * Map all devices on the bus to partitionable endpoint one until - * such time as we start wanting to do things like bhyve. + * Root bus number is fixed at 0 on PHB3/PHB4 systems + * (and possibly others) */ - err = opal_call(OPAL_PCI_SET_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, - 0, OPAL_PCI_BUS_ANY, OPAL_IGNORE_RID_DEVICE_NUMBER, - OPAL_IGNORE_RID_FUNC_NUMBER, OPAL_MAP_PE); - if (err != 0) { - device_printf(dev, "PE mapping failed: %d\n", err); + sc->root_bus_id = 0; + + /* Mark firmware-reserved PE as allocated */ + if (opalpci_add_phb_pe_map_entry(dev, dev, sc->reserved_pe_id, 1) == + NULL) { + device_printf(dev, "PE map update failed: %d\n", err); return (ENXIO); } /* - * Turn on MMIO, mapped to PE 1 + * Select PE number for root bus + * The root bus is special in that it does not consume 64-bit memory + * resources, and by convention it should use a PE number adjacent to + * the reserved PE number. 
*/ - if (OF_getencprop(node, "ibm,opal-num-pes", &npe, 4) != 4) - npe = 1; - for (i = 0; i < npe; i++) { - err = opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, - OPAL_PCI_DEFAULT_PE, OPAL_M32_WINDOW_TYPE, 0, i); - if (err != 0) - device_printf(dev, "MMIO %d map failed: %d\n", i, err); + if (sc->reserved_pe_id == 0) { + sc->root_bus_pe = 1; } - - if (OF_getencprop(node, "ibm,opal-available-m64-ranges", - m64ranges, sizeof(m64ranges)) == sizeof(m64ranges)) - m64bar = m64ranges[0]; - else - m64bar = 0; - - /* XXX: multiple M64 windows? */ - if (OF_getencprop(node, "ibm,opal-m64-window", - m64window, sizeof(m64window)) == sizeof(m64window)) { - opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, - OPAL_M64_WINDOW_TYPE, m64bar, 0); - opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, sc->phb_id, - OPAL_M64_WINDOW_TYPE, m64bar /* index */, - ((uint64_t)m64window[2] << 32) | m64window[3], 0, - ((uint64_t)m64window[4] << 32) | m64window[5]); - opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, - OPAL_PCI_DEFAULT_PE, OPAL_M64_WINDOW_TYPE, - m64bar /* index */, 0); - opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, - OPAL_M64_WINDOW_TYPE, m64bar, OPAL_ENABLE_M64_NON_SPLIT); + else if (sc->reserved_pe_id == (sc->max_pe_count - 1)) { + sc->root_bus_pe = sc->max_pe_count - 2; } - - /* - * Enable IOMMU for PE1 - map everything 1:1 using - * segments of max_tce_size size - */ - tce_size = max_tce_size(dev); - maxmem = roundup2(powerpc_ptob(Maxmem), tce_size); - entries = round_pow2(maxmem / tce_size); - tce_tbl_size = MAX(entries * sizeof(uint64_t), 4096); - if (entries > OPAL_PCI_TCE_MAX_ENTRIES) - panic("POWERNV supports only %jdGB of memory space\n", - (uintmax_t)((OPAL_PCI_TCE_MAX_ENTRIES * tce_size) >> 30)); - if (bootverbose) - device_printf(dev, "Mapping 0-%#jx for DMA\n", (uintmax_t)maxmem); - sc->tce = contigmalloc(tce_tbl_size, - M_DEVBUF, M_NOWAIT | M_ZERO, 0, - BUS_SPACE_MAXADDR, tce_tbl_size, 0); - if (sc->tce == NULL) - panic("Failed to allocate TCE memory for PHB %jd\n", - 
(uintmax_t)sc->phb_id); - - for (i = 0; i < entries; i++) - sc->tce[i] = htobe64((i * tce_size) | OPAL_PCI_TCE_R | OPAL_PCI_TCE_W); - - /* Map TCE for every PE. It seems necessary for Power8 */ - for (i = 0; i < npe; i++) { - err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, sc->phb_id, - i, (i << 1), - 1, pmap_kextract((uint64_t)&sc->tce[0]), - tce_tbl_size, tce_size); - if (err != 0) { - device_printf(dev, "DMA IOMMU mapping failed: %d\n", err); + else { + /* + * If the reserved PE is not located in one of the expected + * locations, fall back to allocating a random new PE index for + * the root bus. + */ + sc->root_bus_pe = opalpci_find_unused_phb_pe(dev); + if (sc->root_bus_pe == OPAL_IODA_INVALID_PE) { + device_printf(dev, + "Failed to allocate reserved PE for root bus\n"); return (ENXIO); } + } + if (opalpci_add_phb_pe_map_entry(dev, dev, sc->root_bus_pe, 1) == + NULL) { + device_printf(dev, "PE map update failed: %d\n", err); + return (ENXIO); + } - err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, sc->phb_id, - i, (i << 1) + 1, - (1UL << 59), maxmem); - if (err != 0) { - device_printf(dev, "DMA 64b bypass mapping failed: %d\n", err); - return (ENXIO); + if (sc->m64_window_present) { + if (opalpci_initialize_ioda2_mmio(dev) != 0) { + sc->m64_window_present = 0; } } - /* - * Invalidate all previous TCE entries. - */ - if (ofw_bus_is_compatible(dev, "power8-pciex")) - pci_phb3_tce_invalidate_entire(sc); - else - opal_call(OPAL_PCI_TCE_KILL, sc->phb_id, OPAL_PCI_TCE_KILL_ALL, - OPAL_PCI_DEFAULT_PE, 0, 0, 0); - /* * Get MSI properties */ @@ -434,29 +982,6 @@ msi_ranges[1], msi_ranges[0]); } - /* Create the parent DMA tag */ - /* - * Constrain it to POWER8 PHB (ioda2) for now. It seems to mess up on - * POWER9 systems. 
- */
- if (ofw_bus_is_compatible(dev, "ibm,ioda2-phb")) {
- err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
- 1, 0, /* alignment, bounds */
- OPAL_PCI_BUS_SPACE_LOWADDR_32BIT, /* lowaddr */
- BUS_SPACE_MAXADDR_32BIT, /* highaddr */
- NULL, NULL, /* filter, filterarg */
- BUS_SPACE_MAXSIZE, /* maxsize */
- BUS_SPACE_UNRESTRICTED, /* nsegments */
- BUS_SPACE_MAXSIZE, /* maxsegsize */
- 0, /* flags */
- NULL, NULL, /* lockfunc, lockarg */
- &sc->ofw_sc.sc_dmat);
- if (err != 0) {
- device_printf(dev, "Failed to create DMA tag\n");
- return (err);
- }
- }
-
 /*
 * General OFW PCI attach
 */
@@ -464,18 +989,35 @@
 if (err != 0)
 return (err);
+ /* Find 32-bit non-prefetchable memory window */
+ for (i = 0; i < sc->ofw_sc.sc_nrange; i++) {
+ struct ofw_pci_range *rp;
+
+ rp = sc->ofw_sc.sc_range + i;
+
+ if (sc->ofw_sc.sc_range_mask & ((uint64_t)1 << i))
+ continue;
+
+ if ((rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) ==
+ OFW_PCI_PHYS_HI_SPACE_MEM32) {
+ sc->m32_pci_base = rp->pci;
+ sc->m32_window_base = rp->host;
+ sc->m32_window_size = rp->size;
+ sc->m32_segment_size =
+ sc->m32_window_size / sc->max_pe_count;
+ }
+ }
+
 /*
 * Unfreeze non-config-space PCI operations. Let this fail silently
 * if e.g. there is no current freeze.
*/ - opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, + opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, sc->reserved_pe_id, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); - /* - * OPAL stores 64-bit BARs in a special property rather than "ranges" - */ - if (OF_getencprop(node, "ibm,opal-m64-window", - m64window, sizeof(m64window)) == sizeof(m64window)) { + + /* Inform the PCI core about the 64-bit memory window on the PHB */ + if (sc->m64_window_present) { struct ofw_pci_range *rp; sc->ofw_sc.sc_nrange++; @@ -485,13 +1027,33 @@ rp = &sc->ofw_sc.sc_range[sc->ofw_sc.sc_nrange-1]; rp->pci_hi = OFW_PCI_PHYS_HI_SPACE_MEM64 | OFW_PCI_PHYS_HI_PREFETCHABLE; - rp->pci = ((uint64_t)m64window[0] << 32) | m64window[1]; - rp->host = ((uint64_t)m64window[2] << 32) | m64window[3]; - rp->size = ((uint64_t)m64window[4] << 32) | m64window[5]; - rman_manage_region(&sc->ofw_sc.sc_mem_rman, rp->pci, + rp->pci = sc->m64_pci_base; + rp->host = sc->m64_window_base; + rp->size = sc->m64_window_size; + + /* Exclude the memory segments for the two reserved PEs */ + if (sc->reserved_pe_id == 0) { + rp->host += (2 * sc->m64_segment_size); + rp->size -= (2 * sc->m64_segment_size); + } + else if (sc->reserved_pe_id == (sc->max_pe_count - 1)) { + rp->size -= (2 * sc->m64_segment_size); + } + else { + device_printf(dev, + "Unable to adjust 64-bit memory window to exclude " + "reserved PE# %x\n", + sc->reserved_pe_id); + } + + /* All 64-bit regions are prefetchable */ + sc->ofw_sc.sc_have_pmem = 1; + rman_manage_region(&sc->ofw_sc.sc_pmem_rman, rp->pci, rp->pci + rp->size - 1); } + sc->pe_setup_finalized = 0; + return (ofw_pcib_attach(dev)); } @@ -542,12 +1104,12 @@ (word == ((1UL << (8 * width)) - 1))) { if (error != OPAL_HARDWARE) { opal_call(OPAL_PCI_EEH_FREEZE_STATUS, sc->phb_id, - OPAL_PCI_DEFAULT_PE, vtophys(&eeh_state), + sc->reserved_pe_id, vtophys(&eeh_state), vtophys(&err_type), NULL); err_type = be16toh(err_type); /* XXX unused */ if (eeh_state != OPAL_EEH_STOPPED_NOT_FROZEN) 
opal_call(OPAL_PCI_EEH_FREEZE_CLEAR,
- sc->phb_id, OPAL_PCI_DEFAULT_PE,
+ sc->phb_id, sc->reserved_pe_id,
 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
 }
 if (error != OPAL_SUCCESS)
@@ -591,16 +1153,657 @@
 */
 if (error != OPAL_HARDWARE) {
 opal_call(OPAL_PCI_EEH_FREEZE_CLEAR,
- sc->phb_id, OPAL_PCI_DEFAULT_PE,
+ sc->phb_id, sc->reserved_pe_id,
 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
 }
 }
 }
+int
+opalpci_phb_is_pci_memory_window_64_bit(device_t phb, rman_res_t window_start)
+{
+ struct opalpci_softc *sc;
+
+ sc = device_get_softc(phb);
+
+ if (!sc->m64_window_present)
+ return 0;
+
+ if ((window_start >= sc->m64_window_base)
+ && (window_start < (sc->m64_window_base + sc->m64_window_size)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Find the associated 64-bit memory window segment number (i.e PE#) for
+ * the given physical device address. This is intended to be used when
+ * the 64-bit memory window is configured in M64_SPLIT mode.
+ */
+
+uint64_t
+opalpci_phb_get_m64_pe_for_addr(device_t phb, rman_res_t addr)
+{
+ struct opalpci_softc *sc;
+
+ sc = device_get_softc(phb);
+
+ if (!sc->m64_window_present)
+ return OPAL_IODA_INVALID_PE;
+
+ /* Degenerate cases -- segment index is always zero here */
+ if ((sc->m64_segment_size == 0) ||
+ (sc->m64_segment_size == sc->m64_window_size))
+ return 0;
+
+ return (addr - sc->m64_window_base) / sc->m64_segment_size;
+}
+
 static int
-opalpci_route_interrupt(device_t bus, device_t dev, int pin)
+opalpci_add_slave_pe_to_master_pe(device_t phb, device_t child,
+ uint64_t master_pe, uint64_t desired_slave_pe_number)
 {
+ struct opalpci_softc *sc;
+ struct phb_pe_data_entry * master_pe_data_entry;
+ struct phb_pe_data_entry * slave_pe_data_entry;
+ struct phb_pe_map_entry * slave_pe_map_entry;
+ int err;
+
+ sc = device_get_softc(phb);
+
+ /* Retrieve pointer to PE data structure */
+ master_pe_data_entry = opalpci_get_phb_pe_data_entry(phb, master_pe);
+ if (!master_pe_data_entry) {
+ device_printf(phb,
+ "Unable to locate data structures for PE 
%02lx\n", + (uintmax_t)master_pe); + return (ENODEV); + } + + slave_pe_data_entry = opalpci_get_phb_pe_data_entry(phb, + desired_slave_pe_number); + if (slave_pe_data_entry) { + device_printf(phb, + "PE %02lx already exists\n", + (uintmax_t)desired_slave_pe_number); + return (ENODEV); + } + + /* Assign slave PE to device */ + err = opal_call(OPAL_PCI_SET_PE, sc->phb_id, desired_slave_pe_number, + master_pe_data_entry->mapping.pci_bus_start_devfn, + master_pe_data_entry->mapping.opal_bus_filter, + master_pe_data_entry->mapping.opal_dev_filter, + master_pe_data_entry->mapping.opal_func_filter, OPAL_MAP_PE); + if (err == 0) { + device_printf(phb, "Mapped PE# %lx to bus\n", + desired_slave_pe_number); + } + else { + device_printf(phb, "PE mapping failed: %d\n", err); + return (ENXIO); + } + /* Mark slave PE as allocated */ + slave_pe_map_entry = opalpci_add_phb_pe_map_entry(phb, child, + desired_slave_pe_number, 0); + if (slave_pe_map_entry == NULL) { + device_printf(phb, "PE map update failed: %d\n", err); + return (ENXIO); + } + slave_pe_map_entry->mapping.master_pe = master_pe; + + /* Retrieve pointer to PE data structure */ + slave_pe_data_entry = opalpci_get_phb_pe_data_entry(phb, + desired_slave_pe_number); + if (!slave_pe_data_entry) { + device_printf(phb, + "Unable to locate data structures for PE %02lx\n", + (uintmax_t)desired_slave_pe_number); + return (ENODEV); + } + + /* Update the PE to device map with the allocated bus range */ + slave_pe_data_entry->mapping.pci_bus_start_devfn = + master_pe_data_entry->mapping.pci_bus_start_devfn; + slave_pe_data_entry->mapping.pci_bus_end_devfn = + master_pe_data_entry->mapping.pci_bus_end_devfn; + + /* Update the PE to device map with the OPAL filter settings */ + slave_pe_data_entry->mapping.opal_bus_filter = + master_pe_data_entry->mapping.opal_bus_filter; + slave_pe_data_entry->mapping.opal_dev_filter = + master_pe_data_entry->mapping.opal_dev_filter; + slave_pe_data_entry->mapping.opal_func_filter = + 
master_pe_data_entry->mapping.opal_func_filter; + + /* Initialize IOMMU structures */ + opalpc_ioda2_create_tvt(phb, desired_slave_pe_number, 0, + roundup2(powerpc_ptob(Maxmem), max_tce_size(phb)), + POWERNV_FW_PAGE_SHIFT, 1); + + /* + * Unfreeze non-config-space PCI operations. Let this fail silently + * if e.g. there is no current freeze. + */ + opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, + desired_slave_pe_number, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + return 0; +} + +static int +opalpci_associate_pe_to_bus(device_t bus, device_t child, uint64_t pe, + rman_res_t start, rman_res_t end) +{ + struct ofw_pci_softc *sc; + struct opalpci_softc *phb_sc; + uint8_t opal_bus_filter; + uint32_t opal_resource_id; + int err; + struct phb_pe_map_entry * pe_map_entry; + struct phb_pe_data_entry * pe_data_entry; + rman_res_t count = (end - start) + 1; + + sc = device_get_softc(bus); + phb_sc = device_get_softc(sc->sc_dev); + + /* Assemble PCI resource ID for bus (devfn always == 0x0 for busses) */ + opal_resource_id = start << 8; + + /* Determine required bus filter mode */ + switch (count) { + case 1: + opal_bus_filter = OPAL_PCI_BUS_ALL; + break; + case 2: + opal_bus_filter = OPAL_PCI_BUS_7BITS; + break; + case 4: + opal_bus_filter = OPAL_PCI_BUS_6BITS; + break; + case 8: + opal_bus_filter = OPAL_PCI_BUS_5BITS; + break; + case 16: + opal_bus_filter = OPAL_PCI_BUS_4BITS; + break; + case 32: + opal_bus_filter = OPAL_PCI_BUS_3BITS; + break; + default: + device_printf(sc->sc_dev, + "Invalid subordinate bus count %ld, defaulting " + "to exact bus match\n", count); + opal_bus_filter = OPAL_PCI_BUS_ALL; + } + + if (bootverbose) { + if (count == 1) + device_printf(sc->sc_dev, + "Assigning subordinate bus 0x%lx\n", + start); + else + device_printf(sc->sc_dev, + "Assigning subordinate bus range [0x%lx-0x%lx]\n", + start, end); + } + + err = opal_call(OPAL_PCI_SET_PE, phb_sc->phb_id, pe, + opal_resource_id, opal_bus_filter, OPAL_IGNORE_RID_DEVICE_NUMBER, + 
OPAL_IGNORE_RID_FUNC_NUMBER, OPAL_MAP_PE); + if (err == 0) { + device_printf(sc->sc_dev, "Mapped PE# %lx to bus\n", pe); + } + else { + device_printf(sc->sc_dev, "PE mapping failed: %d\n", err); + return (ENXIO); + } + + /* + * The root bus PE is always allocated; only add a new entry for a + * subordinate bus or device + */ + if (pe != phb_sc->root_bus_pe) { + pe_map_entry = opalpci_add_phb_pe_map_entry(sc->sc_dev, child, + pe, 1); + if (pe_map_entry == NULL) { + device_printf(sc->sc_dev, "PE map update failed: %d\n", + err); + return (ENXIO); + } + + pe_data_entry = opalpci_get_phb_pe_data_entry(sc->sc_dev, pe); + if (pe_data_entry == NULL) { + device_printf(sc->sc_dev, "PE data update failed: %d\n", + err); + return (ENXIO); + } + + /* Update the PE to device map with the allocated bus range */ + pe_map_entry->mapping.pci_bus_start_devfn = start << 8; + pe_map_entry->mapping.pci_bus_end_devfn = end << 8; + + /* Update the PE data structure with the OPAL filter settings */ + pe_data_entry->mapping.pci_bus_start_devfn = start << 8; + pe_data_entry->mapping.pci_bus_end_devfn = end << 8; + pe_data_entry->mapping.opal_bus_filter = opal_bus_filter; + pe_data_entry->mapping.opal_dev_filter = + OPAL_IGNORE_RID_DEVICE_NUMBER; + pe_data_entry->mapping.opal_func_filter = + OPAL_IGNORE_RID_FUNC_NUMBER; + } + + /* Initialize IOMMU structures */ + opalpc_ioda2_create_tvt(sc->sc_dev, pe, 0, roundup2(powerpc_ptob(Maxmem), + max_tce_size(sc->sc_dev)), POWERNV_FW_PAGE_SHIFT, 1); + + if (pe != phb_sc->root_bus_pe) { + /* Set up default 32-bit IOMMU mapping for PE */ + err = opalpci_ioda2_set_default_dma_window(sc->sc_dev, pe); + if (err != 0) + return (err); + } + + /* + * Unfreeze non-config-space PCI operations. Let this fail silently + * if e.g. there is no current freeze. 
+ */ + opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, phb_sc->phb_id, pe, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + return 0; +} + +static void +opalpci_setup_pci_device_resources(device_t bus, device_t child, int type, + int rid, rman_res_t start, rman_res_t end, rman_res_t count) +{ + struct ofw_pci_softc *sc; + struct opalpci_softc *phb_sc; + struct phb_pe_data_entry *pe_data_entry; + struct phb_drc_list_entry *phb_drc_entry; + uint64_t pe; + uint64_t start_pe; + uint64_t base_pe; + uint8_t child_has_64_bit_bar; + + sc = device_get_softc(bus); + phb_sc = device_get_softc(sc->sc_dev); + + if (type == PCI_RES_BUS) { + if (start == phb_sc->root_bus_id) { + pe = phb_sc->root_bus_pe; + } + else { + pe = opalpci_find_unused_phb_pe(sc->sc_dev); + if (pe == OPAL_IODA_INVALID_PE) { + device_printf(sc->sc_dev, + "No unused PE available\n"); + return; + } + + /* Select a correct base PE# if any 64-bit memory + * address(es) are in use + */ + child_has_64_bit_bar = 0; + LIST_FOREACH(phb_drc_entry, &phb_sc->pe_drc_list, + next_entry) { + if ((phb_drc_entry->list.child == child) && + (phb_drc_entry->list.type == SYS_RES_MEMORY) + && opalpci_phb_is_pci_memory_window_64_bit( + sc->sc_dev, phb_drc_entry->list.start)) { + start_pe = OPAL_IODA_INVALID_PE; + base_pe = OPAL_IODA_INVALID_PE; + child_has_64_bit_bar = 1; + start_pe = + opalpci_phb_get_m64_pe_for_addr( + sc->sc_dev, + phb_drc_entry->list.start); + if (opalpci_get_phb_pe_data_entry( + sc->sc_dev, start_pe)) { + /* PE is already reserved, skip... 
*/ + continue; + } + /* + * Find the lowest candidate PE# to use + * as the base PE# for the bus + */ + if (start_pe < base_pe) { + base_pe = start_pe; + } + } + + if (child_has_64_bit_bar && + (base_pe != pe) && + (base_pe != OPAL_IODA_INVALID_PE)) { + device_printf(sc->sc_dev, + "Overriding PE# %lx to PE# %lx due to " + "64-bit segmented memory constraint\n", + pe, base_pe); + pe = base_pe; + } + } + } + } + else { + pe = opalpci_find_phb_pe_for_device(sc->sc_dev, child); + } + + if (pe == OPAL_IODA_INVALID_PE) { + device_printf(sc->sc_dev, + "Unable to locate PE for device, aborting resource config\n"); + return; + } + + switch (type) { + case PCI_RES_BUS: + opalpci_associate_pe_to_bus(bus, child, pe, start, end); + break; + case SYS_RES_IOPORT: + device_printf(sc->sc_dev, + "No I/O port support, ignoring device I/O resource\n"); + break; + case SYS_RES_MEMORY: + phb_sc = device_get_softc(sc->sc_dev); + if (phb_sc && + opalpci_phb_is_pci_memory_window_64_bit(sc->sc_dev, start)) { + /* Retrieve pointer to PE data structures */ + pe_data_entry = + opalpci_get_phb_pe_data_entry(sc->sc_dev, pe); + if (!pe_data_entry) + KASSERT(pe_data_entry != NULL, + ("PHB[%jd] Unable to locate data structures " + "for PE %02lx", + (uintmax_t)phb_sc->phb_id, (uintmax_t)pe)); + + /* Determine the start PE# for the 64-bit memory range */ + /* + * TODO: Add slave PEs to create larger aggregate window + * if required range exceeds m64_window_size + */ + start_pe = opalpci_phb_get_m64_pe_for_addr(sc->sc_dev,start); + + /* + * Remap the master PE for this device if remap is + * allowed and the desired PE is not already allocated + * as a slave PE for this device + */ + if ((start_pe != pe) && + !opalpci_get_phb_pe_data_entry(sc->sc_dev, start_pe)) { + device_printf(sc->sc_dev, + "Adding slave PE# %lx to PE# %lx due to 64-bit " + "segmented memory constraint\n", + start_pe, pe); + if (opalpci_add_slave_pe_to_master_pe( + sc->sc_dev, child, pe, start_pe)) { + device_printf(sc->sc_dev, + 
"Unable to allocate slave PE# %lx, " + "64-bit memory accesses will fail\n", + start_pe); + } + } + } + break; + } +} + +static void +opalpci_activate_pci_device_resources(device_t bus, device_t child, int type, + int rid, rman_res_t start, rman_res_t end, rman_res_t count) +{ + struct ofw_pci_softc *sc; + struct opalpci_softc *phb_sc; + struct phb_pe_data_entry * pe_data_entry; + int i, err; + uint64_t pe; + uint64_t window_start; + uint64_t window_index; + + sc = device_get_softc(bus); + phb_sc = device_get_softc(sc->sc_dev); + + /* Busses are handled in setup above */ + if (type == PCI_RES_BUS) + return; + + pe = opalpci_find_phb_pe_for_device(sc->sc_dev, child); + if (pe == OPAL_IODA_INVALID_PE) { + device_printf(sc->sc_dev, + "Unable to locate PE for device, aborting resource " + "activation\n"); + return; + } + + switch (type) { + case SYS_RES_IOPORT: + device_printf(sc->sc_dev, + "No I/O port support, ignoring device I/O resource\n"); + break; + case SYS_RES_MEMORY: + phb_sc = device_get_softc(sc->sc_dev); + if (phb_sc && + !opalpci_phb_is_pci_memory_window_64_bit(sc->sc_dev, start)) { + /* Retrieve pointer to PE data structures */ + pe_data_entry = + opalpci_get_phb_pe_data_entry(sc->sc_dev, pe); + if (!pe_data_entry) + KASSERT(pe_data_entry != NULL, + ("PHB[%jd] Unable to locate data structures " + "for PE %02lx", + (uintmax_t)phb_sc->phb_id, (uintmax_t)pe)); + + /* + * Turn on 32-bit MMIO, mapped to PE + */ + window_start = start - phb_sc->m32_pci_base; + window_index = window_start / phb_sc->m32_segment_size; + + for (i = window_index; i < phb_sc->max_pe_count; i++) { + err = opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, + phb_sc->phb_id, pe, OPAL_M32_WINDOW_TYPE, + 0, i); + if (err == 0) + pe_data_entry->mapping. 
+ active_32_bit_mmio_segments[i] = 1; + else + device_printf(sc->sc_dev, + "PE# %lx MMIO %d map failed: %d\n", + pe, i, err); + + /* + * Stop assigning segments once the end of the + * desired 32-bit window is reached + */ + window_start += phb_sc->m32_segment_size; + if (window_start > (end - phb_sc->m32_pci_base)) { + break; + } + } + + if (bootverbose) + device_printf(sc->sc_dev, + "Enabled 32-bit memory window on PE# %lx " + "at 0x%lx size 0x%lx\n", + pe, start, + end - start); + } + break; + } +} + +static void +opalpci_finalize_pe_setup(device_t dev) +{ + struct opalpci_softc *sc; + struct phb_drc_list_entry *phb_drc_entry, *phb_drc_entry_tmp; + + sc = device_get_softc(dev); + bus_generic_new_pass(dev); + + if (sc->pe_setup_finalized) + return; + + /* Assign all bus PEs first... */ + LIST_FOREACH_SAFE(phb_drc_entry, &sc->pe_drc_list, next_entry, + phb_drc_entry_tmp) { + if (phb_drc_entry->list.type != PCI_RES_BUS) + continue; + + opalpci_setup_pci_device_resources(phb_drc_entry->list.bus, + phb_drc_entry->list.child, + phb_drc_entry->list.type, + phb_drc_entry->list.rid, + phb_drc_entry->list.start, + phb_drc_entry->list.end, + phb_drc_entry->list.count); + + opalpci_activate_pci_device_resources(phb_drc_entry->list.bus, + phb_drc_entry->list.child, + phb_drc_entry->list.type, + phb_drc_entry->list.rid, + phb_drc_entry->list.start, + phb_drc_entry->list.end, + phb_drc_entry->list.count); + + /* Remove the processed entry from the list */ + LIST_REMOVE(phb_drc_entry, next_entry); + free(phb_drc_entry, M_OPALPCI_PHB); + } + + /* .. 
then configure device resource windows */ + LIST_FOREACH_SAFE(phb_drc_entry, &sc->pe_drc_list, next_entry, + phb_drc_entry_tmp) { + if (phb_drc_entry->list.type == PCI_RES_BUS) + continue; + + opalpci_setup_pci_device_resources(phb_drc_entry->list.bus, + phb_drc_entry->list.child, + phb_drc_entry->list.type, + phb_drc_entry->list.rid, + phb_drc_entry->list.start, + phb_drc_entry->list.end, + phb_drc_entry->list.count); + + opalpci_activate_pci_device_resources(phb_drc_entry->list.bus, + phb_drc_entry->list.child, + phb_drc_entry->list.type, + phb_drc_entry->list.rid, + phb_drc_entry->list.start, + phb_drc_entry->list.end, + phb_drc_entry->list.count); + + /* Remove the processed entry from the list */ + LIST_REMOVE(phb_drc_entry, next_entry); + free(phb_drc_entry, M_OPALPCI_PHB); + } + + sc->pe_setup_finalized = 1; +} + +/* We need to hook the bus allocation and release functions in order to + * assign and release PHB PEs for each subordinate bus. + * + * Overall, this use the logic from OFW PCI bus driver, but also adds relevant + * resources to the Delayed Resource Configuration list for later setup + */ +static struct resource * +opalpci_alloc_resource(device_t bus, device_t child, int type, int rid, + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) +{ + struct ofw_pci_softc *sc; + struct opalpci_softc *phb_sc; + struct phb_drc_list_entry *phb_drc_entry; + + sc = device_get_softc(bus); + + phb_sc = device_get_softc(sc->sc_dev); + + if (phb_sc && + ((type == PCI_RES_BUS) || + (type == SYS_RES_MEMORY) || + (type == SYS_RES_IOPORT))) { + phb_drc_entry = malloc(sizeof(struct phb_drc_list_entry), + M_OPALPCI_PHB, M_WAITOK | M_ZERO); + if (!phb_drc_entry) { + device_printf(sc->sc_dev, + "Unable to allocate DRC memory\n"); + return (NULL); + } + + phb_drc_entry->list.bus = bus; + phb_drc_entry->list.child = child; + phb_drc_entry->list.type = type; + phb_drc_entry->list.rid = rid; + phb_drc_entry->list.start = start; + phb_drc_entry->list.end = end; + 
phb_drc_entry->list.count = count; + phb_drc_entry->list.flags = flags; + + LIST_INSERT_HEAD(&phb_sc->pe_drc_list, phb_drc_entry, next_entry); + } + + switch (type) { + case PCI_RES_BUS: + return (pci_domain_alloc_bus(sc->sc_pci_domain, child, rid, + start, end, count, flags)); + case SYS_RES_MEMORY: + case SYS_RES_IOPORT: + return (bus_generic_rman_alloc_resource(bus, child, type, rid, + start, end, count, flags)); + default: + return (bus_generic_alloc_resource(bus, child, type, rid, + start, end, count, flags)); + } +} + +/* We need to hook the bus allocation and release functions in order to + * assign and release PHB PEs for each subordinate bus. + * + * Overall, this use the logic from OFW PCI bus driver, with some additions + * for PCI_RES_BUS types. + */ +static int +opalpci_release_resource(device_t bus, device_t child, struct resource *res) +{ + struct ofw_pci_softc *sc; + struct opalpci_softc *phb_sc; + int type; + + type = rman_get_type(res); + sc = device_get_softc(bus); + + phb_sc = device_get_softc(sc->sc_dev); + + if (phb_sc && + ((type == PCI_RES_BUS) || + (type == SYS_RES_MEMORY) || + (type == SYS_RES_IOPORT))) { + if (!phb_sc->pe_setup_finalized) { + device_printf(sc->sc_dev, + "Unable to release resource, initial PE activation " + "still pending on PHB\n"); + return (EBUSY); + } + } + + sc = device_get_softc(bus); + switch (rman_get_type(res)) { + case PCI_RES_BUS: + return (pci_domain_release_bus(sc->sc_pci_domain, child, res)); + case SYS_RES_MEMORY: + case SYS_RES_IOPORT: + return (bus_generic_rman_release_resource(bus, child, res)); + default: + return (bus_generic_release_resource(bus, child, res)); + } +} + + +static int +opalpci_route_interrupt(device_t bus, device_t dev, int pin) +{ return (pin); } @@ -661,26 +1864,41 @@ { struct opalpci_softc *sc; struct pci_devinfo *dinfo; + struct phb_pe_data_entry * pe_data_entry; int err, xive; + uint32_t devfn; + uint64_t pe; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); + 
devfn = ((pci_get_slot(child) & 0x1f) << 3) |
+ (pci_get_function(child) & 0x7);
+ pe = opalpci_find_phb_pe_for_devfn(dev, (devfn & 0xff) << 8);
+
+ pe_data_entry = opalpci_get_phb_pe_data_entry(dev, pe);
+ if (!pe_data_entry)
+ KASSERT(pe_data_entry != NULL,
+ ("PHB[%jd] Unable to allocate data structures for PE %02jx",
+ (uintmax_t)sc->phb_id, (uintmax_t)pe));
+
+ pe_data_entry->mapping.msi_activated = 1;
+
 xive = irq - sc->base_msi_irq - sc->msi_base;
- opal_call(OPAL_PCI_SET_XIVE_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive);
+ opal_call(OPAL_PCI_SET_XIVE_PE, sc->phb_id, pe, xive);
 dinfo = device_get_ivars(child);
 if (dinfo->cfg.msi.msi_alloc > 0 &&
 (dinfo->cfg.msi.msi_ctrl & PCIM_MSICTRL_64BIT) == 0) {
 uint32_t msi32;
 err = opal_call(OPAL_GET_MSI_32, sc->phb_id,
- OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(&msi32),
+ pe, xive, 1, vtophys(&msi32),
 vtophys(data));
 *addr = be32toh(msi32);
 } else {
 err = opal_call(OPAL_GET_MSI_64, sc->phb_id,
- OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(addr), vtophys(data));
+ pe, xive, 1, vtophys(addr), vtophys(data));
 *addr = be64toh(*addr);
 }
 *data = be32toh(*data);
@@ -714,7 +1932,40 @@
 opalpci_get_dma_tag(device_t dev, device_t child)
 {
 struct opalpci_softc *sc;
+ int err;
 
 sc = device_get_softc(dev);
+
+ /* DMA tag already exists, return existing tag */
+ if (sc->ofw_sc.sc_dmat != NULL)
+ return (sc->ofw_sc.sc_dmat);
+
+ /* Create the parent DMA tag */
+
+ /*
+ * Constrain it to POWER8 PHB3 (ioda2) and POWER9 PHB4 (ioda3),
+ * treating both the same way
+ */
+ if (ofw_bus_is_compatible(dev, "ibm,ioda2-phb") ||
+ ofw_bus_is_compatible(dev, "ibm,ioda3-phb")) {
+
+ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
+ 1, 0, /* alignment, bounds */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ BUS_SPACE_MAXSIZE, /* maxsize */
+ BUS_SPACE_UNRESTRICTED, /* nsegments */
+ BUS_SPACE_MAXSIZE, /* maxsegsize */
+ 0, /* flags */
+ NULL, NULL, /* lockfunc, lockarg */
+ 
&sc->ofw_sc.sc_dmat); + + if (err != 0) { + device_printf(dev, "Failed to create DMA tag\n"); + return (NULL); + } + } + return (sc->ofw_sc.sc_dmat); -} +} \ No newline at end of file