| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation |
| 4 | * |
| 5 | * Rewrite, cleanup: |
| 6 | * |
| 7 | * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation |
| 8 | * Copyright (C) 2006 Olof Johansson <olof@lixom.net> |
| 9 | * |
| 10 | * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR. |
| 11 | */ |
| 12 | |
| 13 | #include <linux/init.h> |
| 14 | #include <linux/types.h> |
| 15 | #include <linux/slab.h> |
| 16 | #include <linux/mm.h> |
| 17 | #include <linux/memblock.h> |
| 18 | #include <linux/spinlock.h> |
| 19 | #include <linux/string.h> |
| 20 | #include <linux/pci.h> |
| 21 | #include <linux/dma-mapping.h> |
| 22 | #include <linux/crash_dump.h> |
| 23 | #include <linux/memory.h> |
| 24 | #include <linux/vmalloc.h> |
| 25 | #include <linux/of.h> |
| 26 | #include <linux/of_address.h> |
| 27 | #include <linux/iommu.h> |
| 28 | #include <linux/rculist.h> |
| 29 | #include <asm/io.h> |
| 30 | #include <asm/prom.h> |
| 31 | #include <asm/rtas.h> |
| 32 | #include <asm/iommu.h> |
| 33 | #include <asm/pci-bridge.h> |
| 34 | #include <asm/machdep.h> |
| 35 | #include <asm/firmware.h> |
| 36 | #include <asm/tce.h> |
| 37 | #include <asm/ppc-pci.h> |
| 38 | #include <asm/udbg.h> |
| 39 | #include <asm/mmzone.h> |
| 40 | #include <asm/plpar_wrappers.h> |
| 41 | |
| 42 | #include "pseries.h" |
| 43 | |
/*
 * Indices into a ddw_avail[] array (see __remove_dma_window(), which uses
 * the entry as the token passed to rtas_call()).  DDW_APPLICABLE_SIZE is
 * one past the last index, i.e. the number of entries.
 */
enum {
	DDW_QUERY_PE_DMA_WIN	= 0,
	DDW_CREATE_PE_DMA_WIN	= 1,
	DDW_REMOVE_PE_DMA_WIN	= 2,

	DDW_APPLICABLE_SIZE	= 3
};
| 51 | |
/*
 * NOTE(review): presumably indices into the DDW extensions array; the users
 * are not visible in this part of the file - confirm.
 */
enum {
	DDW_EXT_SIZE		= 0,
	DDW_EXT_RESET_DMA_WIN	= 1,
	DDW_EXT_QUERY_OUT_SIZE	= 2,
};
| 57 | |
| 58 | static struct iommu_table *iommu_pseries_alloc_table(int node) |
| 59 | { |
| 60 | struct iommu_table *tbl; |
| 61 | |
| 62 | tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); |
| 63 | if (!tbl) |
| 64 | return NULL; |
| 65 | |
| 66 | INIT_LIST_HEAD_RCU(&tbl->it_group_list); |
| 67 | kref_init(&tbl->it_kref); |
| 68 | return tbl; |
| 69 | } |
| 70 | |
| 71 | #ifdef CONFIG_IOMMU_API |
| 72 | static struct iommu_table_group_ops spapr_tce_table_group_ops; |
| 73 | #endif |
| 74 | |
| 75 | static struct iommu_table_group *iommu_pseries_alloc_group(int node) |
| 76 | { |
| 77 | struct iommu_table_group *table_group; |
| 78 | |
| 79 | table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); |
| 80 | if (!table_group) |
| 81 | return NULL; |
| 82 | |
| 83 | #ifdef CONFIG_IOMMU_API |
| 84 | table_group->ops = &spapr_tce_table_group_ops; |
| 85 | table_group->pgsizes = SZ_4K; |
| 86 | #endif |
| 87 | |
| 88 | table_group->tables[0] = iommu_pseries_alloc_table(node); |
| 89 | if (table_group->tables[0]) |
| 90 | return table_group; |
| 91 | |
| 92 | kfree(table_group); |
| 93 | return NULL; |
| 94 | } |
| 95 | |
| 96 | static void iommu_pseries_free_group(struct iommu_table_group *table_group, |
| 97 | const char *node_name) |
| 98 | { |
| 99 | if (!table_group) |
| 100 | return; |
| 101 | |
| 102 | #ifdef CONFIG_IOMMU_API |
| 103 | if (table_group->group) { |
| 104 | iommu_group_put(table_group->group); |
| 105 | BUG_ON(table_group->group); |
| 106 | } |
| 107 | #endif |
| 108 | |
| 109 | /* Default DMA window table is at index 0, while DDW at 1. SR-IOV |
| 110 | * adapters only have table on index 0(if not direct mapped). |
| 111 | */ |
| 112 | if (table_group->tables[0]) |
| 113 | iommu_tce_table_put(table_group->tables[0]); |
| 114 | |
| 115 | if (table_group->tables[1]) |
| 116 | iommu_tce_table_put(table_group->tables[1]); |
| 117 | |
| 118 | kfree(table_group); |
| 119 | } |
| 120 | |
| 121 | static int tce_build_pSeries(struct iommu_table *tbl, long index, |
| 122 | long npages, unsigned long uaddr, |
| 123 | enum dma_data_direction direction, |
| 124 | unsigned long attrs) |
| 125 | { |
| 126 | u64 proto_tce; |
| 127 | __be64 *tcep; |
| 128 | u64 rpn; |
| 129 | const unsigned long tceshift = tbl->it_page_shift; |
| 130 | const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); |
| 131 | |
| 132 | proto_tce = TCE_PCI_READ; // Read allowed |
| 133 | |
| 134 | if (direction != DMA_TO_DEVICE) |
| 135 | proto_tce |= TCE_PCI_WRITE; |
| 136 | |
| 137 | tcep = ((__be64 *)tbl->it_base) + index; |
| 138 | |
| 139 | while (npages--) { |
| 140 | /* can't move this out since we might cross MEMBLOCK boundary */ |
| 141 | rpn = __pa(uaddr) >> tceshift; |
| 142 | *tcep = cpu_to_be64(proto_tce | rpn << tceshift); |
| 143 | |
| 144 | uaddr += pagesize; |
| 145 | tcep++; |
| 146 | } |
| 147 | return 0; |
| 148 | } |
| 149 | |
| 150 | |
| 151 | static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages) |
| 152 | { |
| 153 | __be64 *tcep; |
| 154 | |
| 155 | tcep = ((__be64 *)tbl->it_base) + index; |
| 156 | |
| 157 | while (npages--) |
| 158 | *(tcep++) = 0; |
| 159 | } |
| 160 | |
| 161 | static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) |
| 162 | { |
| 163 | __be64 *tcep; |
| 164 | |
| 165 | tcep = ((__be64 *)tbl->it_base) + index; |
| 166 | |
| 167 | return be64_to_cpu(*tcep); |
| 168 | } |
| 169 | |
#ifdef CONFIG_IOMMU_API
/*
 * Allocate the zeroed userspace view (tbl->it_userspace) for @tbl: one
 * element per table entry, with the total size rounded up to PAGE_SIZE.
 *
 * Returns 0 on success, -EPERM if the table uses indirect levels, or
 * -ENOMEM if the allocation fails.  Warns if a view is already present.
 */
static long pseries_tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
	unsigned long bytes;
	unsigned long *view;

	bytes = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);

	if (tbl->it_indirect_levels)	/* Impossible */
		return -EPERM;

	WARN_ON(tbl->it_userspace);

	view = vzalloc(bytes);
	if (!view)
		return -ENOMEM;

	tbl->it_userspace = (__be64 *)view;
	return 0;
}
#endif
| 190 | |
| 191 | static void tce_iommu_userspace_view_free(struct iommu_table *tbl) |
| 192 | { |
| 193 | vfree(tbl->it_userspace); |
| 194 | tbl->it_userspace = NULL; |
| 195 | } |
| 196 | |
| 197 | static void tce_free_pSeries(struct iommu_table *tbl) |
| 198 | { |
| 199 | if (!tbl->it_userspace) |
| 200 | tce_iommu_userspace_view_free(tbl); |
| 201 | } |
| 202 | |
| 203 | static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); |
| 204 | static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); |
| 205 | |
| 206 | static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, |
| 207 | long npages, unsigned long uaddr, |
| 208 | enum dma_data_direction direction, |
| 209 | unsigned long attrs) |
| 210 | { |
| 211 | u64 rc = 0; |
| 212 | u64 proto_tce, tce; |
| 213 | u64 rpn; |
| 214 | int ret = 0; |
| 215 | long tcenum_start = tcenum, npages_start = npages; |
| 216 | |
| 217 | rpn = __pa(uaddr) >> tceshift; |
| 218 | proto_tce = TCE_PCI_READ; |
| 219 | if (direction != DMA_TO_DEVICE) |
| 220 | proto_tce |= TCE_PCI_WRITE; |
| 221 | |
| 222 | while (npages--) { |
| 223 | tce = proto_tce | rpn << tceshift; |
| 224 | rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); |
| 225 | |
| 226 | if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { |
| 227 | ret = (int)rc; |
| 228 | tce_free_pSeriesLP(liobn, tcenum_start, tceshift, |
| 229 | (npages_start - (npages + 1))); |
| 230 | break; |
| 231 | } |
| 232 | |
| 233 | if (rc && printk_ratelimit()) { |
| 234 | printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); |
| 235 | printk("\tindex = 0x%llx\n", (u64)liobn); |
| 236 | printk("\ttcenum = 0x%llx\n", (u64)tcenum); |
| 237 | printk("\ttce val = 0x%llx\n", tce ); |
| 238 | dump_stack(); |
| 239 | } |
| 240 | |
| 241 | tcenum++; |
| 242 | rpn++; |
| 243 | } |
| 244 | return ret; |
| 245 | } |
| 246 | |
| 247 | static DEFINE_PER_CPU(__be64 *, tce_page); |
| 248 | |
| 249 | static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, |
| 250 | long npages, unsigned long uaddr, |
| 251 | enum dma_data_direction direction, |
| 252 | unsigned long attrs) |
| 253 | { |
| 254 | u64 rc = 0; |
| 255 | u64 proto_tce; |
| 256 | __be64 *tcep; |
| 257 | u64 rpn; |
| 258 | long l, limit; |
| 259 | long tcenum_start = tcenum, npages_start = npages; |
| 260 | int ret = 0; |
| 261 | unsigned long flags; |
| 262 | const unsigned long tceshift = tbl->it_page_shift; |
| 263 | |
| 264 | if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { |
| 265 | return tce_build_pSeriesLP(tbl->it_index, tcenum, |
| 266 | tceshift, npages, uaddr, |
| 267 | direction, attrs); |
| 268 | } |
| 269 | |
| 270 | local_irq_save(flags); /* to protect tcep and the page behind it */ |
| 271 | |
| 272 | tcep = __this_cpu_read(tce_page); |
| 273 | |
| 274 | /* This is safe to do since interrupts are off when we're called |
| 275 | * from iommu_alloc{,_sg}() |
| 276 | */ |
| 277 | if (!tcep) { |
| 278 | tcep = (__be64 *)__get_free_page(GFP_ATOMIC); |
| 279 | /* If allocation fails, fall back to the loop implementation */ |
| 280 | if (!tcep) { |
| 281 | local_irq_restore(flags); |
| 282 | return tce_build_pSeriesLP(tbl->it_index, tcenum, |
| 283 | tceshift, |
| 284 | npages, uaddr, direction, attrs); |
| 285 | } |
| 286 | __this_cpu_write(tce_page, tcep); |
| 287 | } |
| 288 | |
| 289 | rpn = __pa(uaddr) >> tceshift; |
| 290 | proto_tce = TCE_PCI_READ; |
| 291 | if (direction != DMA_TO_DEVICE) |
| 292 | proto_tce |= TCE_PCI_WRITE; |
| 293 | |
| 294 | /* We can map max one pageful of TCEs at a time */ |
| 295 | do { |
| 296 | /* |
| 297 | * Set up the page with TCE data, looping through and setting |
| 298 | * the values. |
| 299 | */ |
| 300 | limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE); |
| 301 | |
| 302 | for (l = 0; l < limit; l++) { |
| 303 | tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); |
| 304 | rpn++; |
| 305 | } |
| 306 | |
| 307 | rc = plpar_tce_put_indirect((u64)tbl->it_index, |
| 308 | (u64)tcenum << tceshift, |
| 309 | (u64)__pa(tcep), |
| 310 | limit); |
| 311 | |
| 312 | npages -= limit; |
| 313 | tcenum += limit; |
| 314 | } while (npages > 0 && !rc); |
| 315 | |
| 316 | local_irq_restore(flags); |
| 317 | |
| 318 | if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { |
| 319 | ret = (int)rc; |
| 320 | tce_freemulti_pSeriesLP(tbl, tcenum_start, |
| 321 | (npages_start - (npages + limit))); |
| 322 | return ret; |
| 323 | } |
| 324 | |
| 325 | if (rc && printk_ratelimit()) { |
| 326 | printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); |
| 327 | printk("\tindex = 0x%llx\n", (u64)tbl->it_index); |
| 328 | printk("\tnpages = 0x%llx\n", (u64)npages); |
| 329 | printk("\ttce[0] val = 0x%llx\n", tcep[0]); |
| 330 | dump_stack(); |
| 331 | } |
| 332 | return ret; |
| 333 | } |
| 334 | |
| 335 | static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, |
| 336 | long npages) |
| 337 | { |
| 338 | u64 rc; |
| 339 | |
| 340 | while (npages--) { |
| 341 | rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); |
| 342 | |
| 343 | if (rc && printk_ratelimit()) { |
| 344 | printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); |
| 345 | printk("\tindex = 0x%llx\n", (u64)liobn); |
| 346 | printk("\ttcenum = 0x%llx\n", (u64)tcenum); |
| 347 | dump_stack(); |
| 348 | } |
| 349 | |
| 350 | tcenum++; |
| 351 | } |
| 352 | } |
| 353 | |
| 354 | |
| 355 | static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) |
| 356 | { |
| 357 | u64 rc; |
| 358 | long rpages = npages; |
| 359 | unsigned long limit; |
| 360 | |
| 361 | if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) |
| 362 | return tce_free_pSeriesLP(tbl->it_index, tcenum, |
| 363 | tbl->it_page_shift, npages); |
| 364 | |
| 365 | do { |
| 366 | limit = min_t(unsigned long, rpages, 512); |
| 367 | |
| 368 | rc = plpar_tce_stuff((u64)tbl->it_index, |
| 369 | (u64)tcenum << tbl->it_page_shift, 0, limit); |
| 370 | |
| 371 | rpages -= limit; |
| 372 | tcenum += limit; |
| 373 | } while (rpages > 0 && !rc); |
| 374 | |
| 375 | if (rc && printk_ratelimit()) { |
| 376 | printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); |
| 377 | printk("\trc = %lld\n", rc); |
| 378 | printk("\tindex = 0x%llx\n", (u64)tbl->it_index); |
| 379 | printk("\tnpages = 0x%llx\n", (u64)npages); |
| 380 | dump_stack(); |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) |
| 385 | { |
| 386 | u64 rc; |
| 387 | unsigned long tce_ret; |
| 388 | |
| 389 | rc = plpar_tce_get((u64)tbl->it_index, |
| 390 | (u64)tcenum << tbl->it_page_shift, &tce_ret); |
| 391 | |
| 392 | if (rc && printk_ratelimit()) { |
| 393 | printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); |
| 394 | printk("\tindex = 0x%llx\n", (u64)tbl->it_index); |
| 395 | printk("\ttcenum = 0x%llx\n", (u64)tcenum); |
| 396 | dump_stack(); |
| 397 | } |
| 398 | |
| 399 | return tce_ret; |
| 400 | } |
| 401 | |
/*
 * Description of one DMA window.  This is compatible with cells for the
 * device tree property, so every field is stored big-endian and is
 * converted with be32_to_cpu()/be64_to_cpu() where this file reads it.
 * Do not reorder or resize the fields.
 */
struct dynamic_dma_window_prop {
	__be32 liobn; /* tce table number */
	__be64 dma_base; /* address hi,lo */
	__be32 tce_shift; /* ilog2(tce_page_size) */
	__be32 window_shift; /* ilog2(tce_window_size) */
};
| 409 | |
/*
 * NOTE(review): the users of this struct are outside the visible part of
 * the file; presumably one instance per DMA window, linked on dma_win_list.
 * The per-field notes below are inferred from the names - confirm.
 */
struct dma_win {
	struct device_node *device;	/* presumably the node the window belongs to */
	const struct dynamic_dma_window_prop *prop;	/* presumably the window description */
	bool direct;	/* presumably true for a direct-mapped window - confirm */
	struct list_head list;	/* presumably the link into dma_win_list */
};
| 416 | |
/* Dynamic DMA Window support */

/*
 * NOTE(review): presumably holds the values returned by the firmware call
 * selected by DDW_QUERY_PE_DMA_WIN; the code filling it in is not visible
 * here - confirm field meanings and units against that caller.
 */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};
| 424 | |
/*
 * NOTE(review): presumably holds the values returned by the firmware call
 * selected by DDW_CREATE_PE_DMA_WIN (new window's LIOBN and its address
 * split into high/low 32-bit halves); the caller is not visible - confirm.
 */
struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};
| 430 | |
/*
 * File-wide list head.
 * NOTE(review): presumably a list of struct dma_win linked through their
 * 'list' member; the code that adds entries is not visible here - confirm.
 */
static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);
| 436 | |
| 437 | static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, |
| 438 | unsigned long num_pfn, const void *arg) |
| 439 | { |
| 440 | const struct dynamic_dma_window_prop *maprange = arg; |
| 441 | int rc; |
| 442 | u64 tce_size, num_tce, dma_offset, next; |
| 443 | u32 tce_shift; |
| 444 | long limit; |
| 445 | |
| 446 | tce_shift = be32_to_cpu(maprange->tce_shift); |
| 447 | tce_size = 1ULL << tce_shift; |
| 448 | next = start_pfn << PAGE_SHIFT; |
| 449 | num_tce = num_pfn << PAGE_SHIFT; |
| 450 | |
| 451 | /* round back to the beginning of the tce page size */ |
| 452 | num_tce += next & (tce_size - 1); |
| 453 | next &= ~(tce_size - 1); |
| 454 | |
| 455 | /* covert to number of tces */ |
| 456 | num_tce |= tce_size - 1; |
| 457 | num_tce >>= tce_shift; |
| 458 | |
| 459 | do { |
| 460 | /* |
| 461 | * Set up the page with TCE data, looping through and setting |
| 462 | * the values. |
| 463 | */ |
| 464 | limit = min_t(long, num_tce, 512); |
| 465 | dma_offset = next + be64_to_cpu(maprange->dma_base); |
| 466 | |
| 467 | rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), |
| 468 | dma_offset, |
| 469 | 0, limit); |
| 470 | next += limit * tce_size; |
| 471 | num_tce -= limit; |
| 472 | } while (num_tce > 0 && !rc); |
| 473 | |
| 474 | return rc; |
| 475 | } |
| 476 | |
| 477 | static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, |
| 478 | unsigned long num_pfn, const void *arg) |
| 479 | { |
| 480 | const struct dynamic_dma_window_prop *maprange = arg; |
| 481 | u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn; |
| 482 | __be64 *tcep; |
| 483 | u32 tce_shift; |
| 484 | u64 rc = 0; |
| 485 | long l, limit; |
| 486 | |
| 487 | if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { |
| 488 | unsigned long tceshift = be32_to_cpu(maprange->tce_shift); |
| 489 | unsigned long dmastart = (start_pfn << PAGE_SHIFT) + |
| 490 | be64_to_cpu(maprange->dma_base); |
| 491 | unsigned long tcenum = dmastart >> tceshift; |
| 492 | unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift; |
| 493 | void *uaddr = __va(start_pfn << PAGE_SHIFT); |
| 494 | |
| 495 | return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn), |
| 496 | tcenum, tceshift, npages, (unsigned long) uaddr, |
| 497 | DMA_BIDIRECTIONAL, 0); |
| 498 | } |
| 499 | |
| 500 | local_irq_disable(); /* to protect tcep and the page behind it */ |
| 501 | tcep = __this_cpu_read(tce_page); |
| 502 | |
| 503 | if (!tcep) { |
| 504 | tcep = (__be64 *)__get_free_page(GFP_ATOMIC); |
| 505 | if (!tcep) { |
| 506 | local_irq_enable(); |
| 507 | return -ENOMEM; |
| 508 | } |
| 509 | __this_cpu_write(tce_page, tcep); |
| 510 | } |
| 511 | |
| 512 | proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; |
| 513 | |
| 514 | liobn = (u64)be32_to_cpu(maprange->liobn); |
| 515 | tce_shift = be32_to_cpu(maprange->tce_shift); |
| 516 | tce_size = 1ULL << tce_shift; |
| 517 | next = start_pfn << PAGE_SHIFT; |
| 518 | num_tce = num_pfn << PAGE_SHIFT; |
| 519 | |
| 520 | /* round back to the beginning of the tce page size */ |
| 521 | num_tce += next & (tce_size - 1); |
| 522 | next &= ~(tce_size - 1); |
| 523 | |
| 524 | /* covert to number of tces */ |
| 525 | num_tce |= tce_size - 1; |
| 526 | num_tce >>= tce_shift; |
| 527 | |
| 528 | /* We can map max one pageful of TCEs at a time */ |
| 529 | do { |
| 530 | /* |
| 531 | * Set up the page with TCE data, looping through and setting |
| 532 | * the values. |
| 533 | */ |
| 534 | limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE); |
| 535 | dma_offset = next + be64_to_cpu(maprange->dma_base); |
| 536 | |
| 537 | for (l = 0; l < limit; l++) { |
| 538 | tcep[l] = cpu_to_be64(proto_tce | next); |
| 539 | next += tce_size; |
| 540 | } |
| 541 | |
| 542 | rc = plpar_tce_put_indirect(liobn, |
| 543 | dma_offset, |
| 544 | (u64)__pa(tcep), |
| 545 | limit); |
| 546 | |
| 547 | num_tce -= limit; |
| 548 | } while (num_tce > 0 && !rc); |
| 549 | |
| 550 | /* error cleanup: caller will clear whole range */ |
| 551 | |
| 552 | local_irq_enable(); |
| 553 | return rc; |
| 554 | } |
| 555 | |
/*
 * Adapter with a non-const @arg parameter that forwards unchanged to
 * tce_setrange_multi_pSeriesLP().
 */
static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	const void *maprange = arg;

	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, maprange);
}
| 561 | |
| 562 | static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, |
| 563 | unsigned long liobn, unsigned long win_addr, |
| 564 | unsigned long window_size, unsigned long page_shift, |
| 565 | void *base, struct iommu_table_ops *table_ops) |
| 566 | { |
| 567 | tbl->it_busno = busno; |
| 568 | tbl->it_index = liobn; |
| 569 | tbl->it_offset = win_addr >> page_shift; |
| 570 | tbl->it_size = window_size >> page_shift; |
| 571 | tbl->it_page_shift = page_shift; |
| 572 | tbl->it_base = (unsigned long)base; |
| 573 | tbl->it_blocksize = 16; |
| 574 | tbl->it_type = TCE_PCI; |
| 575 | tbl->it_ops = table_ops; |
| 576 | } |
| 577 | |
| 578 | struct iommu_table_ops iommu_table_pseries_ops; |
| 579 | |
| 580 | static void iommu_table_setparms(struct pci_controller *phb, |
| 581 | struct device_node *dn, |
| 582 | struct iommu_table *tbl) |
| 583 | { |
| 584 | struct device_node *node; |
| 585 | const unsigned long *basep; |
| 586 | const u32 *sizep; |
| 587 | |
| 588 | /* Test if we are going over 2GB of DMA space */ |
| 589 | if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { |
| 590 | udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); |
| 591 | panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); |
| 592 | } |
| 593 | |
| 594 | node = phb->dn; |
| 595 | basep = of_get_property(node, "linux,tce-base", NULL); |
| 596 | sizep = of_get_property(node, "linux,tce-size", NULL); |
| 597 | if (basep == NULL || sizep == NULL) { |
| 598 | printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has " |
| 599 | "missing tce entries !\n", dn); |
| 600 | return; |
| 601 | } |
| 602 | |
| 603 | iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, |
| 604 | phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, |
| 605 | __va(*basep), &iommu_table_pseries_ops); |
| 606 | |
| 607 | if (!is_kdump_kernel()) |
| 608 | memset((void *)tbl->it_base, 0, *sizep); |
| 609 | |
| 610 | phb->dma_window_base_cur += phb->dma_window_size; |
| 611 | } |
| 612 | |
| 613 | struct iommu_table_ops iommu_table_lpar_multi_ops; |
| 614 | |
| 615 | struct iommu_table_ops iommu_table_pseries_ops = { |
| 616 | .set = tce_build_pSeries, |
| 617 | .clear = tce_clear_pSeries, |
| 618 | .get = tce_get_pseries |
| 619 | }; |
| 620 | |
| 621 | static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) |
| 622 | { |
| 623 | struct device_node *dn; |
| 624 | struct iommu_table *tbl; |
| 625 | struct device_node *isa_dn, *isa_dn_orig; |
| 626 | struct device_node *tmp; |
| 627 | struct pci_dn *pci; |
| 628 | int children; |
| 629 | |
| 630 | dn = pci_bus_to_OF_node(bus); |
| 631 | |
| 632 | pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn); |
| 633 | |
| 634 | if (bus->self) { |
| 635 | /* This is not a root bus, any setup will be done for the |
| 636 | * device-side of the bridge in iommu_dev_setup_pSeries(). |
| 637 | */ |
| 638 | return; |
| 639 | } |
| 640 | pci = PCI_DN(dn); |
| 641 | |
| 642 | /* Check if the ISA bus on the system is under |
| 643 | * this PHB. |
| 644 | */ |
| 645 | isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa"); |
| 646 | |
| 647 | while (isa_dn && isa_dn != dn) |
| 648 | isa_dn = isa_dn->parent; |
| 649 | |
| 650 | of_node_put(isa_dn_orig); |
| 651 | |
| 652 | /* Count number of direct PCI children of the PHB. */ |
| 653 | for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling) |
| 654 | children++; |
| 655 | |
| 656 | pr_debug("Children: %d\n", children); |
| 657 | |
| 658 | /* Calculate amount of DMA window per slot. Each window must be |
| 659 | * a power of two (due to pci_alloc_consistent requirements). |
| 660 | * |
| 661 | * Keep 256MB aside for PHBs with ISA. |
| 662 | */ |
| 663 | |
| 664 | if (!isa_dn) { |
| 665 | /* No ISA/IDE - just set window size and return */ |
| 666 | pci->phb->dma_window_size = 0x80000000ul; /* To be divided */ |
| 667 | |
| 668 | while (pci->phb->dma_window_size * children > 0x80000000ul) |
| 669 | pci->phb->dma_window_size >>= 1; |
| 670 | pr_debug("No ISA/IDE, window size is 0x%llx\n", |
| 671 | pci->phb->dma_window_size); |
| 672 | pci->phb->dma_window_base_cur = 0; |
| 673 | |
| 674 | return; |
| 675 | } |
| 676 | |
| 677 | /* If we have ISA, then we probably have an IDE |
| 678 | * controller too. Allocate a 128MB table but |
| 679 | * skip the first 128MB to avoid stepping on ISA |
| 680 | * space. |
| 681 | */ |
| 682 | pci->phb->dma_window_size = 0x8000000ul; |
| 683 | pci->phb->dma_window_base_cur = 0x8000000ul; |
| 684 | |
| 685 | pci->table_group = iommu_pseries_alloc_group(pci->phb->node); |
| 686 | tbl = pci->table_group->tables[0]; |
| 687 | |
| 688 | iommu_table_setparms(pci->phb, dn, tbl); |
| 689 | |
| 690 | if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) |
| 691 | panic("Failed to initialize iommu table"); |
| 692 | |
| 693 | /* Divide the rest (1.75GB) among the children */ |
| 694 | pci->phb->dma_window_size = 0x80000000ul; |
| 695 | while (pci->phb->dma_window_size * children > 0x70000000ul) |
| 696 | pci->phb->dma_window_size >>= 1; |
| 697 | |
| 698 | pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); |
| 699 | } |
| 700 | |
| 701 | #ifdef CONFIG_IOMMU_API |
| 702 | static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned |
| 703 | long *tce, enum dma_data_direction *direction) |
| 704 | { |
| 705 | long rc; |
| 706 | unsigned long ioba = (unsigned long) index << tbl->it_page_shift; |
| 707 | unsigned long flags, oldtce = 0; |
| 708 | u64 proto_tce = iommu_direction_to_tce_perm(*direction); |
| 709 | unsigned long newtce = *tce | proto_tce; |
| 710 | |
| 711 | spin_lock_irqsave(&tbl->large_pool.lock, flags); |
| 712 | |
| 713 | rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce); |
| 714 | if (!rc) |
| 715 | rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce); |
| 716 | |
| 717 | if (!rc) { |
| 718 | *direction = iommu_tce_direction(oldtce); |
| 719 | *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); |
| 720 | } |
| 721 | |
| 722 | spin_unlock_irqrestore(&tbl->large_pool.lock, flags); |
| 723 | |
| 724 | return rc; |
| 725 | } |
| 726 | |
| 727 | static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index, |
| 728 | bool __always_unused alloc) |
| 729 | { |
| 730 | return tbl->it_userspace ? &tbl->it_userspace[index - tbl->it_offset] : NULL; |
| 731 | } |
| 732 | #endif |
| 733 | |
| 734 | struct iommu_table_ops iommu_table_lpar_multi_ops = { |
| 735 | .set = tce_buildmulti_pSeriesLP, |
| 736 | #ifdef CONFIG_IOMMU_API |
| 737 | .xchg_no_kill = tce_exchange_pseries, |
| 738 | .useraddrptr = tce_useraddr_pSeriesLP, |
| 739 | #endif |
| 740 | .clear = tce_freemulti_pSeriesLP, |
| 741 | .get = tce_get_pSeriesLP, |
| 742 | .free = tce_free_pSeries |
| 743 | }; |
| 744 | |
#ifdef CONFIG_IOMMU_API
/*
 * When the DMA window properties might have been removed,
 * the parent node has the table_group setup on it.
 *
 * Walk up from the device node of @dev and return the first node whose
 * pci_dn points at @table_group.  Returns NULL if the walk reaches a node
 * without a pci_dn, or the top of the tree, before finding one.
 */
static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev,
					struct iommu_table_group *table_group)
{
	struct device_node *node;

	for (node = pci_device_to_OF_node(dev); node; node = node->parent) {
		struct pci_dn *pdn = PCI_DN(node);

		if (!pdn)
			break;

		if (pdn->table_group == table_group)
			return node;
	}

	return NULL;
}
#endif
| 766 | |
| 767 | /* |
| 768 | * Find nearest ibm,dma-window (default DMA window) or direct DMA window or |
| 769 | * dynamic 64bit DMA window, walking up the device tree. |
| 770 | */ |
| 771 | static struct device_node *pci_dma_find(struct device_node *dn, |
| 772 | struct dynamic_dma_window_prop *prop) |
| 773 | { |
| 774 | const __be32 *default_prop = NULL; |
| 775 | const __be32 *ddw_prop = NULL; |
| 776 | struct device_node *rdn = NULL; |
| 777 | bool default_win = false, ddw_win = false; |
| 778 | |
| 779 | for ( ; dn && PCI_DN(dn); dn = dn->parent) { |
| 780 | default_prop = of_get_property(dn, "ibm,dma-window", NULL); |
| 781 | if (default_prop) { |
| 782 | rdn = dn; |
| 783 | default_win = true; |
| 784 | } |
| 785 | ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); |
| 786 | if (ddw_prop) { |
| 787 | rdn = dn; |
| 788 | ddw_win = true; |
| 789 | break; |
| 790 | } |
| 791 | ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); |
| 792 | if (ddw_prop) { |
| 793 | rdn = dn; |
| 794 | ddw_win = true; |
| 795 | break; |
| 796 | } |
| 797 | |
| 798 | /* At least found default window, which is the case for normal boot */ |
| 799 | if (default_win) |
| 800 | break; |
| 801 | } |
| 802 | |
| 803 | /* For PCI devices there will always be a DMA window, either on the device |
| 804 | * or parent bus |
| 805 | */ |
| 806 | WARN_ON(!(default_win | ddw_win)); |
| 807 | |
| 808 | /* caller doesn't want to get DMA window property */ |
| 809 | if (!prop) |
| 810 | return rdn; |
| 811 | |
| 812 | /* parse DMA window property. During normal system boot, only default |
| 813 | * DMA window is passed in OF. But, for kdump, a dedicated adapter might |
| 814 | * have both default and DDW in FDT. In this scenario, DDW takes precedence |
| 815 | * over default window. |
| 816 | */ |
| 817 | if (ddw_win) { |
| 818 | struct dynamic_dma_window_prop *p; |
| 819 | |
| 820 | p = (struct dynamic_dma_window_prop *)ddw_prop; |
| 821 | prop->liobn = p->liobn; |
| 822 | prop->dma_base = p->dma_base; |
| 823 | prop->tce_shift = p->tce_shift; |
| 824 | prop->window_shift = p->window_shift; |
| 825 | } else if (default_win) { |
| 826 | unsigned long offset, size, liobn; |
| 827 | |
| 828 | of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); |
| 829 | |
| 830 | prop->liobn = cpu_to_be32((u32)liobn); |
| 831 | prop->dma_base = cpu_to_be64(offset); |
| 832 | prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); |
| 833 | prop->window_shift = cpu_to_be32(order_base_2(size)); |
| 834 | } |
| 835 | |
| 836 | return rdn; |
| 837 | } |
| 838 | |
| 839 | static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) |
| 840 | { |
| 841 | struct iommu_table *tbl; |
| 842 | struct device_node *dn, *pdn; |
| 843 | struct pci_dn *ppci; |
| 844 | struct dynamic_dma_window_prop prop; |
| 845 | |
| 846 | dn = pci_bus_to_OF_node(bus); |
| 847 | |
| 848 | pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", |
| 849 | dn); |
| 850 | |
| 851 | pdn = pci_dma_find(dn, &prop); |
| 852 | |
| 853 | /* In PPC architecture, there will always be DMA window on bus or one of the |
| 854 | * parent bus. During reboot, there will be ibm,dma-window property to |
| 855 | * define DMA window. For kdump, there will at least be default window or DDW |
| 856 | * or both. |
| 857 | * There is an exception to the above. In case the PE goes into frozen |
| 858 | * state, firmware may not provide ibm,dma-window property at the time |
| 859 | * of LPAR boot up. |
| 860 | */ |
| 861 | |
| 862 | if (!pdn) { |
| 863 | pr_debug(" no ibm,dma-window property !\n"); |
| 864 | return; |
| 865 | } |
| 866 | |
| 867 | ppci = PCI_DN(pdn); |
| 868 | |
| 869 | pr_debug(" parent is %pOF, iommu_table: 0x%p\n", |
| 870 | pdn, ppci->table_group); |
| 871 | |
| 872 | if (!ppci->table_group) { |
| 873 | ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); |
| 874 | tbl = ppci->table_group->tables[0]; |
| 875 | |
| 876 | iommu_table_setparms_common(tbl, ppci->phb->bus->number, |
| 877 | be32_to_cpu(prop.liobn), |
| 878 | be64_to_cpu(prop.dma_base), |
| 879 | 1ULL << be32_to_cpu(prop.window_shift), |
| 880 | be32_to_cpu(prop.tce_shift), NULL, |
| 881 | &iommu_table_lpar_multi_ops); |
| 882 | |
| 883 | if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) |
| 884 | panic("Failed to initialize iommu table"); |
| 885 | |
| 886 | iommu_register_group(ppci->table_group, |
| 887 | pci_domain_nr(bus), 0); |
| 888 | pr_debug(" created table: %p\n", ppci->table_group); |
| 889 | } |
| 890 | } |
| 891 | |
| 892 | |
| 893 | static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) |
| 894 | { |
| 895 | struct device_node *dn; |
| 896 | struct iommu_table *tbl; |
| 897 | |
| 898 | pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev)); |
| 899 | |
| 900 | dn = dev->dev.of_node; |
| 901 | |
| 902 | /* If we're the direct child of a root bus, then we need to allocate |
| 903 | * an iommu table ourselves. The bus setup code should have setup |
| 904 | * the window sizes already. |
| 905 | */ |
| 906 | if (!dev->bus->self) { |
| 907 | struct pci_controller *phb = PCI_DN(dn)->phb; |
| 908 | |
| 909 | pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); |
| 910 | PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); |
| 911 | tbl = PCI_DN(dn)->table_group->tables[0]; |
| 912 | iommu_table_setparms(phb, dn, tbl); |
| 913 | |
| 914 | if (!iommu_init_table(tbl, phb->node, 0, 0)) |
| 915 | panic("Failed to initialize iommu table"); |
| 916 | |
| 917 | set_iommu_table_base(&dev->dev, tbl); |
| 918 | return; |
| 919 | } |
| 920 | |
| 921 | /* If this device is further down the bus tree, search upwards until |
| 922 | * an already allocated iommu table is found and use that. |
| 923 | */ |
| 924 | |
| 925 | while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) |
| 926 | dn = dn->parent; |
| 927 | |
| 928 | if (dn && PCI_DN(dn)) |
| 929 | set_iommu_table_base(&dev->dev, |
| 930 | PCI_DN(dn)->table_group->tables[0]); |
| 931 | else |
| 932 | printk(KERN_WARNING "iommu: Device %s has no iommu table\n", |
| 933 | pci_name(dev)); |
| 934 | } |
| 935 | |
| 936 | static int __read_mostly disable_ddw; |
| 937 | |
| 938 | static int __init disable_ddw_setup(char *str) |
| 939 | { |
| 940 | disable_ddw = 1; |
| 941 | printk(KERN_INFO "ppc iommu: disabling ddw.\n"); |
| 942 | |
| 943 | return 0; |
| 944 | } |
| 945 | |
| 946 | early_param("disable_ddw", disable_ddw_setup); |
| 947 | |
| 948 | static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) |
| 949 | { |
| 950 | int ret; |
| 951 | |
| 952 | ret = tce_clearrange_multi_pSeriesLP(0, |
| 953 | 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); |
| 954 | if (ret) |
| 955 | pr_warn("%pOF failed to clear tces in window.\n", |
| 956 | np); |
| 957 | else |
| 958 | pr_debug("%pOF successfully cleared tces in window.\n", |
| 959 | np); |
| 960 | } |
| 961 | |
| 962 | /* |
| 963 | * Call only if DMA window is clean. |
| 964 | */ |
| 965 | static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) |
| 966 | { |
| 967 | int ret; |
| 968 | |
| 969 | ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); |
| 970 | if (ret) |
| 971 | pr_warn("%pOF: failed to remove DMA window: rtas returned " |
| 972 | "%d to ibm,remove-pe-dma-window(%x) %llx\n", |
| 973 | np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); |
| 974 | else |
| 975 | pr_debug("%pOF: successfully removed DMA window: rtas returned " |
| 976 | "%d to ibm,remove-pe-dma-window(%x) %llx\n", |
| 977 | np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); |
| 978 | } |
| 979 | |
| 980 | static void remove_dma_window(struct device_node *np, u32 *ddw_avail, |
| 981 | struct property *win, bool cleanup) |
| 982 | { |
| 983 | struct dynamic_dma_window_prop *dwp; |
| 984 | u64 liobn; |
| 985 | |
| 986 | dwp = win->value; |
| 987 | liobn = (u64)be32_to_cpu(dwp->liobn); |
| 988 | |
| 989 | if (cleanup) |
| 990 | clean_dma_window(np, dwp); |
| 991 | __remove_dma_window(np, ddw_avail, liobn); |
| 992 | } |
| 993 | |
| 994 | static void copy_property(struct device_node *pdn, const char *from, const char *to) |
| 995 | { |
| 996 | struct property *src, *dst; |
| 997 | |
| 998 | src = of_find_property(pdn, from, NULL); |
| 999 | if (!src) |
| 1000 | return; |
| 1001 | |
| 1002 | dst = kzalloc(sizeof(*dst), GFP_KERNEL); |
| 1003 | if (!dst) |
| 1004 | return; |
| 1005 | |
| 1006 | dst->name = kstrdup(to, GFP_KERNEL); |
| 1007 | dst->value = kmemdup(src->value, src->length, GFP_KERNEL); |
| 1008 | dst->length = src->length; |
| 1009 | if (!dst->name || !dst->value) |
| 1010 | return; |
| 1011 | |
| 1012 | if (of_add_property(pdn, dst)) { |
| 1013 | pr_err("Unable to add DMA window property for %pOF", pdn); |
| 1014 | goto free_prop; |
| 1015 | } |
| 1016 | |
| 1017 | return; |
| 1018 | |
| 1019 | free_prop: |
| 1020 | kfree(dst->name); |
| 1021 | kfree(dst->value); |
| 1022 | kfree(dst); |
| 1023 | } |
| 1024 | |
| 1025 | static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name, |
| 1026 | bool cleanup) |
| 1027 | { |
| 1028 | struct property *win; |
| 1029 | u32 ddw_avail[DDW_APPLICABLE_SIZE]; |
| 1030 | int ret = 0; |
| 1031 | |
| 1032 | win = of_find_property(np, win_name, NULL); |
| 1033 | if (!win) |
| 1034 | return -EINVAL; |
| 1035 | |
| 1036 | ret = of_property_read_u32_array(np, "ibm,ddw-applicable", |
| 1037 | &ddw_avail[0], DDW_APPLICABLE_SIZE); |
| 1038 | if (ret) |
| 1039 | return 0; |
| 1040 | |
| 1041 | if (win->length >= sizeof(struct dynamic_dma_window_prop)) |
| 1042 | remove_dma_window(np, ddw_avail, win, cleanup); |
| 1043 | |
| 1044 | if (!remove_prop) |
| 1045 | return 0; |
| 1046 | |
| 1047 | /* Default window property if removed is lost as reset-pe doesn't restore it. |
| 1048 | * Though FDT has a copy of it, the DLPAR hotplugged devices will not have a |
| 1049 | * node on FDT until next reboot. So, back it up. |
| 1050 | */ |
| 1051 | if ((strcmp(win_name, "ibm,dma-window") == 0) && |
| 1052 | !of_find_property(np, "ibm,dma-window-saved", NULL)) |
| 1053 | copy_property(np, win_name, "ibm,dma-window-saved"); |
| 1054 | |
| 1055 | ret = of_remove_property(np, win); |
| 1056 | if (ret) |
| 1057 | pr_warn("%pOF: failed to remove DMA window property: %d\n", |
| 1058 | np, ret); |
| 1059 | return 0; |
| 1060 | } |
| 1061 | |
| 1062 | static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift, |
| 1063 | bool *direct_mapping) |
| 1064 | { |
| 1065 | struct dma_win *window; |
| 1066 | const struct dynamic_dma_window_prop *dma64; |
| 1067 | bool found = false; |
| 1068 | |
| 1069 | spin_lock(&dma_win_list_lock); |
| 1070 | /* check if we already created a window and dupe that config if so */ |
| 1071 | list_for_each_entry(window, &dma_win_list, list) { |
| 1072 | if (window->device == pdn) { |
| 1073 | dma64 = window->prop; |
| 1074 | *dma_addr = be64_to_cpu(dma64->dma_base); |
| 1075 | *window_shift = be32_to_cpu(dma64->window_shift); |
| 1076 | *direct_mapping = window->direct; |
| 1077 | found = true; |
| 1078 | break; |
| 1079 | } |
| 1080 | } |
| 1081 | spin_unlock(&dma_win_list_lock); |
| 1082 | |
| 1083 | return found; |
| 1084 | } |
| 1085 | |
| 1086 | static struct dma_win *ddw_list_new_entry(struct device_node *pdn, |
| 1087 | const struct dynamic_dma_window_prop *dma64) |
| 1088 | { |
| 1089 | struct dma_win *window; |
| 1090 | |
| 1091 | window = kzalloc(sizeof(*window), GFP_KERNEL); |
| 1092 | if (!window) |
| 1093 | return NULL; |
| 1094 | |
| 1095 | window->device = pdn; |
| 1096 | window->prop = dma64; |
| 1097 | window->direct = false; |
| 1098 | |
| 1099 | return window; |
| 1100 | } |
| 1101 | |
| 1102 | static void find_existing_ddw_windows_named(const char *name) |
| 1103 | { |
| 1104 | int len; |
| 1105 | struct device_node *pdn; |
| 1106 | struct dma_win *window; |
| 1107 | const struct dynamic_dma_window_prop *dma64; |
| 1108 | |
| 1109 | for_each_node_with_property(pdn, name) { |
| 1110 | dma64 = of_get_property(pdn, name, &len); |
| 1111 | if (!dma64 || len < sizeof(*dma64)) { |
| 1112 | remove_dma_window_named(pdn, true, name, true); |
| 1113 | continue; |
| 1114 | } |
| 1115 | |
| 1116 | /* If at the time of system initialization, there are DDWs in OF, |
| 1117 | * it means this is during kexec. DDW could be direct or dynamic. |
| 1118 | * We will just mark DDWs as "dynamic" since this is kdump path, |
| 1119 | * no need to worry about perforance. ddw_list_new_entry() will |
| 1120 | * set window->direct = false. |
| 1121 | */ |
| 1122 | window = ddw_list_new_entry(pdn, dma64); |
| 1123 | if (!window) { |
| 1124 | of_node_put(pdn); |
| 1125 | break; |
| 1126 | } |
| 1127 | |
| 1128 | spin_lock(&dma_win_list_lock); |
| 1129 | list_add(&window->list, &dma_win_list); |
| 1130 | spin_unlock(&dma_win_list_lock); |
| 1131 | } |
| 1132 | } |
| 1133 | |
| 1134 | static int find_existing_ddw_windows(void) |
| 1135 | { |
| 1136 | if (!firmware_has_feature(FW_FEATURE_LPAR)) |
| 1137 | return 0; |
| 1138 | |
| 1139 | find_existing_ddw_windows_named(DIRECT64_PROPNAME); |
| 1140 | find_existing_ddw_windows_named(DMA64_PROPNAME); |
| 1141 | |
| 1142 | return 0; |
| 1143 | } |
| 1144 | machine_arch_initcall(pseries, find_existing_ddw_windows); |
| 1145 | |
| 1146 | /** |
| 1147 | * ddw_read_ext - Get the value of an DDW extension |
| 1148 | * @np: device node from which the extension value is to be read. |
| 1149 | * @extnum: index number of the extension. |
| 1150 | * @value: pointer to return value, modified when extension is available. |
| 1151 | * |
| 1152 | * Checks if "ibm,ddw-extensions" exists for this node, and get the value |
| 1153 | * on index 'extnum'. |
| 1154 | * It can be used only to check if a property exists, passing value == NULL. |
| 1155 | * |
| 1156 | * Returns: |
| 1157 | * 0 if extension successfully read |
| 1158 | * -EINVAL if the "ibm,ddw-extensions" does not exist, |
| 1159 | * -ENODATA if "ibm,ddw-extensions" does not have a value, and |
| 1160 | * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension. |
| 1161 | */ |
| 1162 | static inline int ddw_read_ext(const struct device_node *np, int extnum, |
| 1163 | u32 *value) |
| 1164 | { |
| 1165 | static const char propname[] = "ibm,ddw-extensions"; |
| 1166 | u32 count; |
| 1167 | int ret; |
| 1168 | |
| 1169 | ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count); |
| 1170 | if (ret) |
| 1171 | return ret; |
| 1172 | |
| 1173 | if (count < extnum) |
| 1174 | return -EOVERFLOW; |
| 1175 | |
| 1176 | if (!value) |
| 1177 | value = &count; |
| 1178 | |
| 1179 | return of_property_read_u32_index(np, propname, extnum, value); |
| 1180 | } |
| 1181 | |
| 1182 | static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, |
| 1183 | struct ddw_query_response *query, |
| 1184 | struct device_node *parent) |
| 1185 | { |
| 1186 | struct device_node *dn; |
| 1187 | struct pci_dn *pdn; |
| 1188 | u32 cfg_addr, ext_query, query_out[5]; |
| 1189 | u64 buid; |
| 1190 | int ret, out_sz; |
| 1191 | |
| 1192 | /* |
| 1193 | * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many |
| 1194 | * output parameters ibm,query-pe-dma-windows will have, ranging from |
| 1195 | * 5 to 6. |
| 1196 | */ |
| 1197 | ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query); |
| 1198 | if (!ret && ext_query == 1) |
| 1199 | out_sz = 6; |
| 1200 | else |
| 1201 | out_sz = 5; |
| 1202 | |
| 1203 | /* |
| 1204 | * Get the config address and phb buid of the PE window. |
| 1205 | * Rely on eeh to retrieve this for us. |
| 1206 | * Retrieve them from the pci device, not the node with the |
| 1207 | * dma-window property |
| 1208 | */ |
| 1209 | dn = pci_device_to_OF_node(dev); |
| 1210 | pdn = PCI_DN(dn); |
| 1211 | buid = pdn->phb->buid; |
| 1212 | cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); |
| 1213 | |
| 1214 | ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out, |
| 1215 | cfg_addr, BUID_HI(buid), BUID_LO(buid)); |
| 1216 | |
| 1217 | switch (out_sz) { |
| 1218 | case 5: |
| 1219 | query->windows_available = query_out[0]; |
| 1220 | query->largest_available_block = query_out[1]; |
| 1221 | query->page_size = query_out[2]; |
| 1222 | query->migration_capable = query_out[3]; |
| 1223 | break; |
| 1224 | case 6: |
| 1225 | query->windows_available = query_out[0]; |
| 1226 | query->largest_available_block = ((u64)query_out[1] << 32) | |
| 1227 | query_out[2]; |
| 1228 | query->page_size = query_out[3]; |
| 1229 | query->migration_capable = query_out[4]; |
| 1230 | break; |
| 1231 | } |
| 1232 | |
| 1233 | dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n", |
| 1234 | ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid), |
| 1235 | BUID_LO(buid), ret, query->largest_available_block, |
| 1236 | query->page_size, query->windows_available); |
| 1237 | |
| 1238 | return ret; |
| 1239 | } |
| 1240 | |
| 1241 | static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, |
| 1242 | struct ddw_create_response *create, int page_shift, |
| 1243 | int window_shift) |
| 1244 | { |
| 1245 | struct device_node *dn; |
| 1246 | struct pci_dn *pdn; |
| 1247 | u32 cfg_addr; |
| 1248 | u64 buid; |
| 1249 | int ret; |
| 1250 | |
| 1251 | /* |
| 1252 | * Get the config address and phb buid of the PE window. |
| 1253 | * Rely on eeh to retrieve this for us. |
| 1254 | * Retrieve them from the pci device, not the node with the |
| 1255 | * dma-window property |
| 1256 | */ |
| 1257 | dn = pci_device_to_OF_node(dev); |
| 1258 | pdn = PCI_DN(dn); |
| 1259 | buid = pdn->phb->buid; |
| 1260 | cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); |
| 1261 | |
| 1262 | do { |
| 1263 | /* extra outputs are LIOBN and dma-addr (hi, lo) */ |
| 1264 | ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4, |
| 1265 | (u32 *)create, cfg_addr, BUID_HI(buid), |
| 1266 | BUID_LO(buid), page_shift, window_shift); |
| 1267 | } while (rtas_busy_delay(ret)); |
| 1268 | dev_info(&dev->dev, |
| 1269 | "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " |
| 1270 | "(liobn = 0x%x starting addr = %x %x)\n", |
| 1271 | ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid), |
| 1272 | BUID_LO(buid), page_shift, window_shift, ret, create->liobn, |
| 1273 | create->addr_hi, create->addr_lo); |
| 1274 | |
| 1275 | return ret; |
| 1276 | } |
| 1277 | |
/*
 * Records a PE device node for which enable_ddw() took its failure path, so
 * that a later call for the same node returns early instead of touching the
 * DMA window again.
 */
struct failed_ddw_pdn {
	struct device_node *pdn;	/* node for which DDW setup failed */
	struct list_head list;		/* link in failed_ddw_pdn_list */
};

/* Walked and extended by enable_ddw() while it holds dma_win_init_mutex. */
static LIST_HEAD(failed_ddw_pdn_list);
| 1284 | |
| 1285 | static phys_addr_t ddw_memory_hotplug_max(void) |
| 1286 | { |
| 1287 | resource_size_t max_addr = memory_hotplug_max(); |
| 1288 | struct device_node *memory; |
| 1289 | |
| 1290 | for_each_node_by_type(memory, "memory") { |
| 1291 | struct resource res; |
| 1292 | |
| 1293 | if (of_address_to_resource(memory, 0, &res)) |
| 1294 | continue; |
| 1295 | |
| 1296 | max_addr = max_t(resource_size_t, max_addr, res.end + 1); |
| 1297 | } |
| 1298 | |
| 1299 | return max_addr; |
| 1300 | } |
| 1301 | |
| 1302 | /* |
| 1303 | * Platforms supporting the DDW option starting with LoPAR level 2.7 implement |
| 1304 | * ibm,ddw-extensions, which carries the rtas token for |
| 1305 | * ibm,reset-pe-dma-windows. |
| 1306 | * That rtas-call can be used to restore the default DMA window for the device. |
| 1307 | */ |
| 1308 | static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn) |
| 1309 | { |
| 1310 | int ret; |
| 1311 | u32 cfg_addr, reset_dma_win; |
| 1312 | u64 buid; |
| 1313 | struct device_node *dn; |
| 1314 | struct pci_dn *pdn; |
| 1315 | |
| 1316 | ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win); |
| 1317 | if (ret) |
| 1318 | return; |
| 1319 | |
| 1320 | dn = pci_device_to_OF_node(dev); |
| 1321 | pdn = PCI_DN(dn); |
| 1322 | buid = pdn->phb->buid; |
| 1323 | cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8); |
| 1324 | |
| 1325 | ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid), |
| 1326 | BUID_LO(buid)); |
| 1327 | if (ret) |
| 1328 | dev_info(&dev->dev, |
| 1329 | "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ", |
| 1330 | reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid), |
| 1331 | ret); |
| 1332 | } |
| 1333 | |
| 1334 | /* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */ |
| 1335 | static int iommu_get_page_shift(u32 query_page_size) |
| 1336 | { |
| 1337 | /* Supported IO page-sizes according to LoPAR, note that 2M is out of order */ |
| 1338 | const int shift[] = { |
| 1339 | __builtin_ctzll(SZ_4K), __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M), |
| 1340 | __builtin_ctzll(SZ_32M), __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M), |
| 1341 | __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M) |
| 1342 | }; |
| 1343 | |
| 1344 | int i = ARRAY_SIZE(shift) - 1; |
| 1345 | int ret = 0; |
| 1346 | |
| 1347 | /* |
| 1348 | * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field: |
| 1349 | * - bit 31 means 4k pages are supported, |
| 1350 | * - bit 30 means 64k pages are supported, and so on. |
| 1351 | * Larger pagesizes map more memory with the same amount of TCEs, so start probing them. |
| 1352 | */ |
| 1353 | for (; i >= 0 ; i--) { |
| 1354 | if (query_page_size & (1 << i)) |
| 1355 | ret = max(ret, shift[i]); |
| 1356 | } |
| 1357 | |
| 1358 | return ret; |
| 1359 | } |
| 1360 | |
| 1361 | static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr, |
| 1362 | u32 page_shift, u32 window_shift) |
| 1363 | { |
| 1364 | struct dynamic_dma_window_prop *ddwprop; |
| 1365 | struct property *win64; |
| 1366 | |
| 1367 | win64 = kzalloc(sizeof(*win64), GFP_KERNEL); |
| 1368 | if (!win64) |
| 1369 | return NULL; |
| 1370 | |
| 1371 | win64->name = kstrdup(propname, GFP_KERNEL); |
| 1372 | ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL); |
| 1373 | win64->value = ddwprop; |
| 1374 | win64->length = sizeof(*ddwprop); |
| 1375 | if (!win64->name || !win64->value) { |
| 1376 | kfree(win64->name); |
| 1377 | kfree(win64->value); |
| 1378 | kfree(win64); |
| 1379 | return NULL; |
| 1380 | } |
| 1381 | |
| 1382 | ddwprop->liobn = cpu_to_be32(liobn); |
| 1383 | ddwprop->dma_base = cpu_to_be64(dma_addr); |
| 1384 | ddwprop->tce_shift = cpu_to_be32(page_shift); |
| 1385 | ddwprop->window_shift = cpu_to_be32(window_shift); |
| 1386 | |
| 1387 | return win64; |
| 1388 | } |
| 1389 | |
/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * Outline of the steps taken below, all under dma_win_init_mutex:
 *  1. Reuse a window already recorded for @pdn in dma_win_list, or give up
 *     right away if @pdn is on failed_ddw_pdn_list.
 *  2. Query the PE. When no window is available, remove the default DMA
 *     window (only if its iommu table is unused and the reset extension is
 *     present) and query again.
 *  3. Choose the IO page shift and the window size, and decide between
 *     direct mapping, dynamic mapping, or a window split between both.
 *  4. Create the window, publish it as a property of @pdn, pre-map RAM for
 *     direct mapping and/or set up a new iommu table for dynamic mapping,
 *     then add the window to dma_win_list.
 * A failure unwinds through the labels at the end of the function and tries
 * to record @pdn on failed_ddw_pdn_list.
 *
 * returns true if can map all pages (direct mapping), false otherwise.
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr, dynamic_offset = 0;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool dynamic_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;

	/* Only the presence of an "ibm,pmemory" node matters; the reference is dropped right away. */
	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	/*
	 * Reuse path: on a hit, dma_offset, len (the window shift) and
	 * direct_mapping are taken from the recorded window.
	 */
	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
		goto out_unlock;

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * ibm,remove-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition. Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	/* dn (the device's own node) is only used in log messages further down. */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		/* value == NULL: only checks that the reset extension exists */
		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		/*
		 * Only the firmware window is removed here; the
		 * "ibm,dma-window" property is removed near the end, once
		 * the new window is fully set up.
		 */
		remove_dma_window(pdn, ddw_avail, default_win, true);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (query.largest_available_block >=
		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		/* Too small for a linear map: size the window to what is available. */
		len = order_base_2(query.largest_available_block << page_shift);

		dynamic_mapping = true;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));

		/* DDW is big enough to direct map RAM. If there is vPMEM, check
		 * if enough space is left in DDW where we can dynamically
		 * allocate TCEs for vPMEM. For now, this Hybrid sharing of DDW
		 * is only for SR-IOV devices.
		 */
		if (default_win_removed && pmem_present && !direct_mapping) {
			/* DDW is big enough to be split */
			if ((query.largest_available_block << page_shift) >=
			    MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) {
				direct_mapping = true;

				/* offset of the Dynamic part of DDW */
				dynamic_offset = 1ULL << max_ram_len;
			}

			/* DDW will at least have dynamic allocation */
			dynamic_mapping = true;

			/* create max size DDW possible */
			len = order_base_2(query.largest_available_block
							<< page_shift);
		}
	}

	/* Even if the DDW is split into both direct mapped RAM and dynamically
	 * mapped vPMEM, the DDW property in OF will be marked as Direct.
	 */
	win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	/* The list entry points at the property value; it is linked into dma_win_list only at the very end. */
	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	window->direct = direct_mapping;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set */
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
		if (default_win_removed) {
			/* The default window is gone: drop its table and detach it from the device. */
			iommu_tce_table_put(pci->table_group->tables[0]);
			pci->table_group->tables[0] = NULL;
			set_iommu_table_base(&dev->dev, NULL);
		}
	}

	if (dynamic_mapping) {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;
		u64 dynamic_addr, dynamic_len;

		/*
		 * start/end of the first 32-bit MMIO resource of the PHB are
		 * handed to iommu_init_table() below; they stay 0 when no
		 * such resource is found.
		 */
		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		/* If the DDW is split between directly mapped RAM and Dynamic
		 * mapped for TCES, offset into the DDW where the dynamic part
		 * begins.
		 */
		dynamic_addr = win_addr + dynamic_offset;
		dynamic_len = (1UL << len) - dynamic_offset;
		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,
					    dynamic_addr, dynamic_len, page_shift, NULL,
					    &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node, start, end);

		/* Slot 0 when the default window was removed, slot 1 when it is kept. */
		pci->table_group->tables[default_win_removed ? 0 : 1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		/* default_win is valid here because default_win_removed == true */
		if (!of_find_property(pdn, "ibm,dma-window-saved", NULL))
			copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved");
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

	/*
	 * Error unwinding: each label undoes one setup step and falls
	 * through to the next one.
	 */
out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed)
		reset_dma_window(dev, pdn);

	/*
	 * Remember the failure so later calls for this node bail out early.
	 * NOTE(review): direct_mapping is not cleared on this path, so a
	 * failure after it was set to true (e.g. create_ddw() failing) still
	 * makes the function return true - confirm this is intended.
	 */
	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/* If we have persistent memory and the window size is not big enough
	 * to directly map both RAM and vPMEM, then we need to set DMA limit.
	 */
	if (pmem_present && direct_mapping && len != MAX_PHYSMEM_BITS)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset +
					 (1ULL << max_ram_len);

	return direct_mapping;
}
| 1712 | |
| 1713 | static __u64 query_page_size_to_mask(u32 query_page_size) |
| 1714 | { |
| 1715 | const long shift[] = { |
| 1716 | (SZ_4K), (SZ_64K), (SZ_16M), |
| 1717 | (SZ_32M), (SZ_64M), (SZ_128M), |
| 1718 | (SZ_256M), (SZ_16G), (SZ_2M) |
| 1719 | }; |
| 1720 | int i, ret = 0; |
| 1721 | |
| 1722 | for (i = 0; i < ARRAY_SIZE(shift); i++) { |
| 1723 | if (query_page_size & (1 << i)) |
| 1724 | ret |= shift[i]; |
| 1725 | } |
| 1726 | |
| 1727 | return ret; |
| 1728 | } |
| 1729 | |
| 1730 | static void spapr_tce_init_table_group(struct pci_dev *pdev, |
| 1731 | struct device_node *pdn, |
| 1732 | struct dynamic_dma_window_prop prop) |
| 1733 | { |
| 1734 | struct iommu_table_group *table_group = PCI_DN(pdn)->table_group; |
| 1735 | u32 ddw_avail[DDW_APPLICABLE_SIZE]; |
| 1736 | |
| 1737 | struct ddw_query_response query; |
| 1738 | int ret; |
| 1739 | |
| 1740 | /* Only for normal boot with default window. Doesn't matter during |
| 1741 | * kdump, since these will not be used during kdump. |
| 1742 | */ |
| 1743 | if (is_kdump_kernel()) |
| 1744 | return; |
| 1745 | |
| 1746 | if (table_group->max_dynamic_windows_supported != 0) |
| 1747 | return; /* already initialized */ |
| 1748 | |
| 1749 | table_group->tce32_start = be64_to_cpu(prop.dma_base); |
| 1750 | table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); |
| 1751 | |
| 1752 | if (!of_find_property(pdn, "ibm,dma-window", NULL)) |
| 1753 | dev_err(&pdev->dev, "default dma window missing!\n"); |
| 1754 | |
| 1755 | ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", |
| 1756 | &ddw_avail[0], DDW_APPLICABLE_SIZE); |
| 1757 | if (ret) { |
| 1758 | table_group->max_dynamic_windows_supported = -1; |
| 1759 | return; |
| 1760 | } |
| 1761 | |
| 1762 | ret = query_ddw(pdev, ddw_avail, &query, pdn); |
| 1763 | if (ret) { |
| 1764 | dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__); |
| 1765 | table_group->max_dynamic_windows_supported = -1; |
| 1766 | return; |
| 1767 | } |
| 1768 | |
| 1769 | if (query.windows_available == 0) |
| 1770 | table_group->max_dynamic_windows_supported = 1; |
| 1771 | else |
| 1772 | table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES; |
| 1773 | |
| 1774 | table_group->max_levels = 1; |
| 1775 | table_group->pgsizes |= query_page_size_to_mask(query.page_size); |
| 1776 | } |
| 1777 | |
| 1778 | static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) |
| 1779 | { |
| 1780 | struct device_node *pdn, *dn; |
| 1781 | struct iommu_table *tbl; |
| 1782 | struct pci_dn *pci; |
| 1783 | struct dynamic_dma_window_prop prop; |
| 1784 | |
| 1785 | pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); |
| 1786 | |
| 1787 | /* dev setup for LPAR is a little tricky, since the device tree might |
| 1788 | * contain the dma-window properties per-device and not necessarily |
| 1789 | * for the bus. So we need to search upwards in the tree until we |
| 1790 | * either hit a dma-window property, OR find a parent with a table |
| 1791 | * already allocated. |
| 1792 | */ |
| 1793 | dn = pci_device_to_OF_node(dev); |
| 1794 | pr_debug(" node is %pOF\n", dn); |
| 1795 | |
| 1796 | pdn = pci_dma_find(dn, &prop); |
| 1797 | if (!pdn || !PCI_DN(pdn)) { |
| 1798 | printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " |
| 1799 | "no DMA window found for pci dev=%s dn=%pOF\n", |
| 1800 | pci_name(dev), dn); |
| 1801 | return; |
| 1802 | } |
| 1803 | pr_debug(" parent is %pOF\n", pdn); |
| 1804 | |
| 1805 | pci = PCI_DN(pdn); |
| 1806 | if (!pci->table_group) { |
| 1807 | pci->table_group = iommu_pseries_alloc_group(pci->phb->node); |
| 1808 | tbl = pci->table_group->tables[0]; |
| 1809 | |
| 1810 | iommu_table_setparms_common(tbl, pci->phb->bus->number, |
| 1811 | be32_to_cpu(prop.liobn), |
| 1812 | be64_to_cpu(prop.dma_base), |
| 1813 | 1ULL << be32_to_cpu(prop.window_shift), |
| 1814 | be32_to_cpu(prop.tce_shift), NULL, |
| 1815 | &iommu_table_lpar_multi_ops); |
| 1816 | |
| 1817 | iommu_init_table(tbl, pci->phb->node, 0, 0); |
| 1818 | iommu_register_group(pci->table_group, |
| 1819 | pci_domain_nr(pci->phb->bus), 0); |
| 1820 | pr_debug(" created table: %p\n", pci->table_group); |
| 1821 | } else { |
| 1822 | pr_debug(" found DMA window, table: %p\n", pci->table_group); |
| 1823 | } |
| 1824 | |
| 1825 | spapr_tce_init_table_group(dev, pdn, prop); |
| 1826 | |
| 1827 | set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); |
| 1828 | iommu_add_device(pci->table_group, &dev->dev); |
| 1829 | } |
| 1830 | |
| 1831 | static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) |
| 1832 | { |
| 1833 | struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; |
| 1834 | |
| 1835 | /* only attempt to use a new window if 64-bit DMA is requested */ |
| 1836 | if (dma_mask < DMA_BIT_MASK(64)) |
| 1837 | return false; |
| 1838 | |
| 1839 | dev_dbg(&pdev->dev, "node is %pOF\n", dn); |
| 1840 | |
| 1841 | /* |
| 1842 | * the device tree might contain the dma-window properties |
| 1843 | * per-device and not necessarily for the bus. So we need to |
| 1844 | * search upwards in the tree until we either hit a dma-window |
| 1845 | * property, OR find a parent with a table already allocated. |
| 1846 | */ |
| 1847 | pdn = pci_dma_find(dn, NULL); |
| 1848 | if (pdn && PCI_DN(pdn)) |
| 1849 | return enable_ddw(pdev, pdn); |
| 1850 | |
| 1851 | return false; |
| 1852 | } |
| 1853 | |
| 1854 | #ifdef CONFIG_IOMMU_API |
| 1855 | /* |
| 1856 | * A simple iommu_table_group_ops which only allows reusing the existing |
| 1857 | * iommu_table. This handles VFIO for POWER7 or the nested KVM. |
| 1858 | * The ops does not allow creating windows and only allows reusing the existing |
| 1859 | * one if it matches table_group->tce32_start/tce32_size/page_shift. |
| 1860 | */ |
/*
 * Compute the table size for a window of @window_size bytes mapped with
 * pages of (1 << @page_shift) bytes.
 *
 * Returns ~0U when more than one level is requested, otherwise
 * @window_size >> (@page_shift - 3), i.e. 8 units per page of the window.
 */
static unsigned long spapr_tce_get_table_size(__u32 page_shift,
					      __u64 window_size, __u32 levels)
{
	/* Only single-level tables are handled here */
	if (levels > 1)
		return ~0U;

	return window_size >> (page_shift - 3);
}
| 1871 | |
| 1872 | static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group) |
| 1873 | { |
| 1874 | struct pci_dev *pdev = NULL; |
| 1875 | int ret; |
| 1876 | |
| 1877 | /* No IOMMU group ? */ |
| 1878 | if (!group) |
| 1879 | return NULL; |
| 1880 | |
| 1881 | ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table); |
| 1882 | if (!ret || !pdev) |
| 1883 | return NULL; |
| 1884 | return pdev; |
| 1885 | } |
| 1886 | |
/*
 * Restore the default DMA window of @pdn for @pdev: calls reset_dma_window()
 * and then copy_property() with "ibm,dma-window-saved" and "ibm,dma-window".
 * NOTE(review): presumably the second call re-creates "ibm,dma-window" from
 * the saved copy - confirm against copy_property()'s argument order.
 */
static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn)
{
	reset_dma_window(pdev, pdn);
	copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window");
}
| 1892 | |
| 1893 | static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn) |
| 1894 | { |
| 1895 | struct pci_dn *pci = PCI_DN(pdn); |
| 1896 | struct dma_win *window; |
| 1897 | bool direct_mapping; |
| 1898 | int len; |
| 1899 | |
| 1900 | if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) { |
| 1901 | remove_dma_window_named(pdn, true, direct_mapping ? |
| 1902 | DIRECT64_PROPNAME : DMA64_PROPNAME, true); |
| 1903 | if (!direct_mapping) { |
| 1904 | WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]); |
| 1905 | |
| 1906 | if (pci->table_group->tables[1]) { |
| 1907 | iommu_tce_table_put(pci->table_group->tables[1]); |
| 1908 | pci->table_group->tables[1] = NULL; |
| 1909 | } else if (pci->table_group->tables[0]) { |
| 1910 | /* Default window was removed and only the DDW exists */ |
| 1911 | iommu_tce_table_put(pci->table_group->tables[0]); |
| 1912 | pci->table_group->tables[0] = NULL; |
| 1913 | } |
| 1914 | } |
| 1915 | spin_lock(&dma_win_list_lock); |
| 1916 | list_for_each_entry(window, &dma_win_list, list) { |
| 1917 | if (window->device == pdn) { |
| 1918 | list_del(&window->list); |
| 1919 | kfree(window); |
| 1920 | break; |
| 1921 | } |
| 1922 | } |
| 1923 | spin_unlock(&dma_win_list_lock); |
| 1924 | } |
| 1925 | |
| 1926 | return 0; |
| 1927 | } |
| 1928 | |
| 1929 | static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group, |
| 1930 | struct device *dev) |
| 1931 | { |
| 1932 | struct pci_dev *pdev = to_pci_dev(dev); |
| 1933 | const __be32 *default_prop; |
| 1934 | long liobn, offset, size; |
| 1935 | struct device_node *pdn; |
| 1936 | struct iommu_table *tbl; |
| 1937 | struct pci_dn *pci; |
| 1938 | |
| 1939 | pdn = pci_dma_find_parent_node(pdev, table_group); |
| 1940 | if (!pdn || !PCI_DN(pdn)) { |
| 1941 | dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn); |
| 1942 | return -1; |
| 1943 | } |
| 1944 | pci = PCI_DN(pdn); |
| 1945 | |
| 1946 | /* The default window is restored if not present already on removal of DDW. |
| 1947 | * However, if used by VFIO SPAPR sub driver, the user's order of removal of |
| 1948 | * windows might have been different to not leading to auto restoration, |
| 1949 | * suppose the DDW was removed first followed by the default one. |
| 1950 | * So, restore the default window with reset-pe-dma call explicitly. |
| 1951 | */ |
| 1952 | restore_default_dma_window(pdev, pdn); |
| 1953 | |
| 1954 | default_prop = of_get_property(pdn, "ibm,dma-window", NULL); |
| 1955 | of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); |
| 1956 | tbl = iommu_pseries_alloc_table(pci->phb->node); |
| 1957 | if (!tbl) { |
| 1958 | dev_err(&pdev->dev, "couldn't create new IOMMU table\n"); |
| 1959 | return -1; |
| 1960 | } |
| 1961 | |
| 1962 | iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset, |
| 1963 | size, IOMMU_PAGE_SHIFT_4K, NULL, |
| 1964 | &iommu_table_lpar_multi_ops); |
| 1965 | iommu_init_table(tbl, pci->phb->node, 0, 0); |
| 1966 | |
| 1967 | pci->table_group->tables[0] = tbl; |
| 1968 | set_iommu_table_base(&pdev->dev, tbl); |
| 1969 | |
| 1970 | return 0; |
| 1971 | } |
| 1972 | |
| 1973 | static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift, |
| 1974 | __u64 window_size) |
| 1975 | { |
| 1976 | if ((window_size <= table_group->tce32_size) && |
| 1977 | (page_shift == IOMMU_PAGE_SHIFT_4K)) |
| 1978 | return true; |
| 1979 | |
| 1980 | return false; |
| 1981 | } |
| 1982 | |
| 1983 | static long spapr_tce_create_table(struct iommu_table_group *table_group, int num, |
| 1984 | __u32 page_shift, __u64 window_size, __u32 levels, |
| 1985 | struct iommu_table **ptbl) |
| 1986 | { |
| 1987 | struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); |
| 1988 | u32 ddw_avail[DDW_APPLICABLE_SIZE]; |
| 1989 | struct ddw_create_response create; |
| 1990 | unsigned long liobn, offset, size; |
| 1991 | unsigned long start = 0, end = 0; |
| 1992 | struct ddw_query_response query; |
| 1993 | const __be32 *default_prop; |
| 1994 | struct failed_ddw_pdn *fpdn; |
| 1995 | unsigned int window_shift; |
| 1996 | struct device_node *pdn; |
| 1997 | struct iommu_table *tbl; |
| 1998 | struct dma_win *window; |
| 1999 | struct property *win64; |
| 2000 | struct pci_dn *pci; |
| 2001 | u64 win_addr; |
| 2002 | int len, i; |
| 2003 | long ret; |
| 2004 | |
| 2005 | if (!is_power_of_2(window_size) || levels > 1) |
| 2006 | return -EINVAL; |
| 2007 | |
| 2008 | window_shift = order_base_2(window_size); |
| 2009 | |
| 2010 | mutex_lock(&dma_win_init_mutex); |
| 2011 | |
| 2012 | ret = -ENODEV; |
| 2013 | |
| 2014 | pdn = pci_dma_find_parent_node(pdev, table_group); |
| 2015 | if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ |
| 2016 | dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); |
| 2017 | goto out_failed; |
| 2018 | } |
| 2019 | pci = PCI_DN(pdn); |
| 2020 | |
| 2021 | /* If the enable DDW failed for the pdn, dont retry! */ |
| 2022 | list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { |
| 2023 | if (fpdn->pdn == pdn) { |
| 2024 | dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn); |
| 2025 | goto out_unlock; |
| 2026 | } |
| 2027 | } |
| 2028 | |
| 2029 | tbl = iommu_pseries_alloc_table(pci->phb->node); |
| 2030 | if (!tbl) { |
| 2031 | dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n"); |
| 2032 | goto out_unlock; |
| 2033 | } |
| 2034 | |
| 2035 | if (num == 0) { |
| 2036 | bool direct_mapping; |
| 2037 | /* The request is not for default window? Ensure there is no DDW window already */ |
| 2038 | if (!is_default_window_request(table_group, page_shift, window_size)) { |
| 2039 | if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, |
| 2040 | &direct_mapping)) { |
| 2041 | dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn); |
| 2042 | ret = -EPERM; |
| 2043 | goto out_unlock; |
| 2044 | } |
| 2045 | } else { |
| 2046 | /* Request is for Default window, ensure there is no DDW if there is a |
| 2047 | * need to reset. reset-pe otherwise removes the DDW also |
| 2048 | */ |
| 2049 | default_prop = of_get_property(pdn, "ibm,dma-window", NULL); |
| 2050 | if (!default_prop) { |
| 2051 | if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, |
| 2052 | &direct_mapping)) { |
| 2053 | dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. Preventing the attempt as that would destroy the 64-bit window", |
| 2054 | pdn); |
| 2055 | ret = -EPERM; |
| 2056 | goto out_unlock; |
| 2057 | } |
| 2058 | |
| 2059 | restore_default_dma_window(pdev, pdn); |
| 2060 | |
| 2061 | default_prop = of_get_property(pdn, "ibm,dma-window", NULL); |
| 2062 | of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); |
| 2063 | /* Limit the default window size to window_size */ |
| 2064 | iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, |
| 2065 | offset, 1UL << window_shift, |
| 2066 | IOMMU_PAGE_SHIFT_4K, NULL, |
| 2067 | &iommu_table_lpar_multi_ops); |
| 2068 | iommu_init_table(tbl, pci->phb->node, start, end); |
| 2069 | |
| 2070 | table_group->tables[0] = tbl; |
| 2071 | |
| 2072 | mutex_unlock(&dma_win_init_mutex); |
| 2073 | |
| 2074 | goto exit; |
| 2075 | } |
| 2076 | } |
| 2077 | } |
| 2078 | |
| 2079 | ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", |
| 2080 | &ddw_avail[0], DDW_APPLICABLE_SIZE); |
| 2081 | if (ret) { |
| 2082 | dev_info(&pdev->dev, "ibm,ddw-applicable not found\n"); |
| 2083 | goto out_failed; |
| 2084 | } |
| 2085 | ret = -ENODEV; |
| 2086 | |
| 2087 | pr_err("%s: Calling query %pOF\n", __func__, pdn); |
| 2088 | ret = query_ddw(pdev, ddw_avail, &query, pdn); |
| 2089 | if (ret) |
| 2090 | goto out_failed; |
| 2091 | ret = -ENODEV; |
| 2092 | |
| 2093 | len = window_shift; |
| 2094 | if (query.largest_available_block < (1ULL << (len - page_shift))) { |
| 2095 | dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n", |
| 2096 | 1ULL << len, query.largest_available_block, |
| 2097 | 1ULL << page_shift); |
| 2098 | ret = -EINVAL; /* Retry with smaller window size */ |
| 2099 | goto out_unlock; |
| 2100 | } |
| 2101 | |
| 2102 | if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) { |
| 2103 | pr_err("%s: Create ddw failed %pOF\n", __func__, pdn); |
| 2104 | goto out_failed; |
| 2105 | } |
| 2106 | |
| 2107 | win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; |
| 2108 | win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len); |
| 2109 | if (!win64) |
| 2110 | goto remove_window; |
| 2111 | |
| 2112 | ret = of_add_property(pdn, win64); |
| 2113 | if (ret) { |
| 2114 | dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret); |
| 2115 | goto free_property; |
| 2116 | } |
| 2117 | ret = -ENODEV; |
| 2118 | |
| 2119 | window = ddw_list_new_entry(pdn, win64->value); |
| 2120 | if (!window) |
| 2121 | goto remove_property; |
| 2122 | |
| 2123 | window->direct = false; |
| 2124 | |
| 2125 | for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { |
| 2126 | const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; |
| 2127 | |
| 2128 | /* Look for MMIO32 */ |
| 2129 | if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) { |
| 2130 | start = pci->phb->mem_resources[i].start; |
| 2131 | end = pci->phb->mem_resources[i].end; |
| 2132 | break; |
| 2133 | } |
| 2134 | } |
| 2135 | |
| 2136 | /* New table for using DDW instead of the default DMA window */ |
| 2137 | iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr, |
| 2138 | 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); |
| 2139 | iommu_init_table(tbl, pci->phb->node, start, end); |
| 2140 | |
| 2141 | pci->table_group->tables[num] = tbl; |
| 2142 | set_iommu_table_base(&pdev->dev, tbl); |
| 2143 | pdev->dev.archdata.dma_offset = win_addr; |
| 2144 | |
| 2145 | spin_lock(&dma_win_list_lock); |
| 2146 | list_add(&window->list, &dma_win_list); |
| 2147 | spin_unlock(&dma_win_list_lock); |
| 2148 | |
| 2149 | mutex_unlock(&dma_win_init_mutex); |
| 2150 | |
| 2151 | goto exit; |
| 2152 | |
| 2153 | remove_property: |
| 2154 | of_remove_property(pdn, win64); |
| 2155 | free_property: |
| 2156 | kfree(win64->name); |
| 2157 | kfree(win64->value); |
| 2158 | kfree(win64); |
| 2159 | remove_window: |
| 2160 | __remove_dma_window(pdn, ddw_avail, create.liobn); |
| 2161 | |
| 2162 | out_failed: |
| 2163 | fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); |
| 2164 | if (!fpdn) |
| 2165 | goto out_unlock; |
| 2166 | fpdn->pdn = pdn; |
| 2167 | list_add(&fpdn->list, &failed_ddw_pdn_list); |
| 2168 | |
| 2169 | out_unlock: |
| 2170 | mutex_unlock(&dma_win_init_mutex); |
| 2171 | |
| 2172 | return ret; |
| 2173 | exit: |
| 2174 | /* Allocate the userspace view */ |
| 2175 | pseries_tce_iommu_userspace_view_alloc(tbl); |
| 2176 | tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels); |
| 2177 | |
| 2178 | *ptbl = iommu_tce_table_get(tbl); |
| 2179 | |
| 2180 | return 0; |
| 2181 | } |
| 2182 | |
| 2183 | static bool is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl) |
| 2184 | { |
| 2185 | if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) && |
| 2186 | (tbl->it_page_shift == IOMMU_PAGE_SHIFT_4K)) |
| 2187 | return true; |
| 2188 | |
| 2189 | return false; |
| 2190 | } |
| 2191 | |
| 2192 | static long spapr_tce_set_window(struct iommu_table_group *table_group, |
| 2193 | int num, struct iommu_table *tbl) |
| 2194 | { |
| 2195 | return tbl == table_group->tables[num] ? 0 : -EPERM; |
| 2196 | } |
| 2197 | |
| 2198 | static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num) |
| 2199 | { |
| 2200 | struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); |
| 2201 | struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; |
| 2202 | struct iommu_table *tbl = table_group->tables[num]; |
| 2203 | struct failed_ddw_pdn *fpdn; |
| 2204 | struct dma_win *window; |
| 2205 | const char *win_name; |
| 2206 | int ret = -ENODEV; |
| 2207 | |
| 2208 | mutex_lock(&dma_win_init_mutex); |
| 2209 | |
| 2210 | if ((num == 0) && is_default_window_table(table_group, tbl)) |
| 2211 | win_name = "ibm,dma-window"; |
| 2212 | else |
| 2213 | win_name = DMA64_PROPNAME; |
| 2214 | |
| 2215 | pdn = pci_dma_find(dn, NULL); |
| 2216 | if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ |
| 2217 | dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); |
| 2218 | goto out_failed; |
| 2219 | } |
| 2220 | |
| 2221 | /* Dont clear the TCEs, User should have done it */ |
| 2222 | if (remove_dma_window_named(pdn, true, win_name, false)) { |
| 2223 | pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn); |
| 2224 | goto out_failed; /* Could not remove it either! */ |
| 2225 | } |
| 2226 | |
| 2227 | if (strcmp(win_name, DMA64_PROPNAME) == 0) { |
| 2228 | spin_lock(&dma_win_list_lock); |
| 2229 | list_for_each_entry(window, &dma_win_list, list) { |
| 2230 | if (window->device == pdn) { |
| 2231 | list_del(&window->list); |
| 2232 | kfree(window); |
| 2233 | break; |
| 2234 | } |
| 2235 | } |
| 2236 | spin_unlock(&dma_win_list_lock); |
| 2237 | } |
| 2238 | |
| 2239 | iommu_tce_table_put(table_group->tables[num]); |
| 2240 | table_group->tables[num] = NULL; |
| 2241 | |
| 2242 | ret = 0; |
| 2243 | |
| 2244 | goto out_unlock; |
| 2245 | |
| 2246 | out_failed: |
| 2247 | fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); |
| 2248 | if (!fpdn) |
| 2249 | goto out_unlock; |
| 2250 | fpdn->pdn = pdn; |
| 2251 | list_add(&fpdn->list, &failed_ddw_pdn_list); |
| 2252 | |
| 2253 | out_unlock: |
| 2254 | mutex_unlock(&dma_win_init_mutex); |
| 2255 | |
| 2256 | return ret; |
| 2257 | } |
| 2258 | |
| 2259 | static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev) |
| 2260 | { |
| 2261 | struct iommu_table *tbl = table_group->tables[0]; |
| 2262 | struct pci_dev *pdev = to_pci_dev(dev); |
| 2263 | struct device_node *dn = pci_device_to_OF_node(pdev); |
| 2264 | struct device_node *pdn; |
| 2265 | |
| 2266 | /* SRIOV VFs using direct map by the host driver OR multifunction devices |
| 2267 | * where the ownership was taken on the attempt by the first function |
| 2268 | */ |
| 2269 | if (!tbl && (table_group->max_dynamic_windows_supported != 1)) |
| 2270 | return 0; |
| 2271 | |
| 2272 | mutex_lock(&dma_win_init_mutex); |
| 2273 | |
| 2274 | pdn = pci_dma_find(dn, NULL); |
| 2275 | if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ |
| 2276 | dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); |
| 2277 | mutex_unlock(&dma_win_init_mutex); |
| 2278 | return -1; |
| 2279 | } |
| 2280 | |
| 2281 | /* |
| 2282 | * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table |
| 2283 | * if there are any. In case of direct map, the entries will be left over, which |
| 2284 | * is fine for PEs with 2 DMA windows where the second window is created with create-pe |
| 2285 | * at which point the table is cleared. However, on VFs having only one DMA window, the |
| 2286 | * default window would end up seeing the entries left over from the direct map done |
| 2287 | * on the second window. So, remove the ddw explicitly so that clean_dma_window() |
| 2288 | * cleans up the entries if any. |
| 2289 | */ |
| 2290 | if (remove_dynamic_dma_windows(pdev, pdn)) { |
| 2291 | dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn); |
| 2292 | mutex_unlock(&dma_win_init_mutex); |
| 2293 | return -1; |
| 2294 | } |
| 2295 | |
| 2296 | /* The table_group->tables[0] is not null now, it must be the default window |
| 2297 | * Remove it, let the userspace create it as it needs. |
| 2298 | */ |
| 2299 | if (table_group->tables[0]) { |
| 2300 | remove_dma_window_named(pdn, true, "ibm,dma-window", true); |
| 2301 | iommu_tce_table_put(tbl); |
| 2302 | table_group->tables[0] = NULL; |
| 2303 | } |
| 2304 | set_iommu_table_base(dev, NULL); |
| 2305 | |
| 2306 | mutex_unlock(&dma_win_init_mutex); |
| 2307 | |
| 2308 | return 0; |
| 2309 | } |
| 2310 | |
| 2311 | static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev) |
| 2312 | { |
| 2313 | struct iommu_table *tbl = table_group->tables[0]; |
| 2314 | |
| 2315 | if (tbl) { /* Default window already restored */ |
| 2316 | return; |
| 2317 | } |
| 2318 | |
| 2319 | mutex_lock(&dma_win_init_mutex); |
| 2320 | |
| 2321 | /* Restore the default window */ |
| 2322 | pseries_setup_default_iommu_config(table_group, dev); |
| 2323 | |
| 2324 | mutex_unlock(&dma_win_init_mutex); |
| 2325 | |
| 2326 | return; |
| 2327 | } |
| 2328 | |
/*
 * Table group callbacks implemented in this file: table sizing, window
 * creation/assignment/removal and DMA ownership hand-over.
 */
static struct iommu_table_group_ops spapr_tce_table_group_ops = {
	.get_table_size = spapr_tce_get_table_size,
	.create_table = spapr_tce_create_table,
	.set_window = spapr_tce_set_window,
	.unset_window = spapr_tce_unset_window,
	.take_ownership = spapr_tce_take_ownership,
	.release_ownership = spapr_tce_release_ownership,
};
| 2337 | #endif |
| 2338 | |
| 2339 | static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, |
| 2340 | void *data) |
| 2341 | { |
| 2342 | struct dma_win *window; |
| 2343 | struct memory_notify *arg = data; |
| 2344 | int ret = 0; |
| 2345 | |
| 2346 | switch (action) { |
| 2347 | case MEM_GOING_ONLINE: |
| 2348 | spin_lock(&dma_win_list_lock); |
| 2349 | list_for_each_entry(window, &dma_win_list, list) { |
| 2350 | if (window->direct) { |
| 2351 | ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, |
| 2352 | arg->nr_pages, window->prop); |
| 2353 | } |
| 2354 | /* XXX log error */ |
| 2355 | } |
| 2356 | spin_unlock(&dma_win_list_lock); |
| 2357 | break; |
| 2358 | case MEM_CANCEL_ONLINE: |
| 2359 | case MEM_OFFLINE: |
| 2360 | spin_lock(&dma_win_list_lock); |
| 2361 | list_for_each_entry(window, &dma_win_list, list) { |
| 2362 | if (window->direct) { |
| 2363 | ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, |
| 2364 | arg->nr_pages, window->prop); |
| 2365 | } |
| 2366 | /* XXX log error */ |
| 2367 | } |
| 2368 | spin_unlock(&dma_win_list_lock); |
| 2369 | break; |
| 2370 | default: |
| 2371 | break; |
| 2372 | } |
| 2373 | if (ret && action != MEM_CANCEL_ONLINE) |
| 2374 | return NOTIFY_BAD; |
| 2375 | |
| 2376 | return NOTIFY_OK; |
| 2377 | } |
| 2378 | |
/* Notifier block registered with register_memory_notifier() at early init */
static struct notifier_block iommu_mem_nb = {
	.notifier_call = iommu_mem_notifier,
};
| 2382 | |
| 2383 | static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data) |
| 2384 | { |
| 2385 | int err = NOTIFY_OK; |
| 2386 | struct of_reconfig_data *rd = data; |
| 2387 | struct device_node *np = rd->dn; |
| 2388 | struct pci_dn *pci = PCI_DN(np); |
| 2389 | struct dma_win *window; |
| 2390 | |
| 2391 | switch (action) { |
| 2392 | case OF_RECONFIG_DETACH_NODE: |
| 2393 | /* |
| 2394 | * Removing the property will invoke the reconfig |
| 2395 | * notifier again, which causes dead-lock on the |
| 2396 | * read-write semaphore of the notifier chain. So |
| 2397 | * we have to remove the property when releasing |
| 2398 | * the device node. |
| 2399 | */ |
| 2400 | if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true)) |
| 2401 | remove_dma_window_named(np, false, DMA64_PROPNAME, true); |
| 2402 | |
| 2403 | if (pci && pci->table_group) |
| 2404 | iommu_pseries_free_group(pci->table_group, |
| 2405 | np->full_name); |
| 2406 | |
| 2407 | spin_lock(&dma_win_list_lock); |
| 2408 | list_for_each_entry(window, &dma_win_list, list) { |
| 2409 | if (window->device == np) { |
| 2410 | list_del(&window->list); |
| 2411 | kfree(window); |
| 2412 | break; |
| 2413 | } |
| 2414 | } |
| 2415 | spin_unlock(&dma_win_list_lock); |
| 2416 | break; |
| 2417 | default: |
| 2418 | err = NOTIFY_DONE; |
| 2419 | break; |
| 2420 | } |
| 2421 | return err; |
| 2422 | } |
| 2423 | |
/* Notifier block registered with of_reconfig_notifier_register() at early init */
static struct notifier_block iommu_reconfig_nb = {
	.notifier_call = iommu_reconfig_notifier,
};
| 2427 | |
| 2428 | /* These are called very early. */ |
| 2429 | void __init iommu_init_early_pSeries(void) |
| 2430 | { |
| 2431 | if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) |
| 2432 | return; |
| 2433 | |
| 2434 | if (firmware_has_feature(FW_FEATURE_LPAR)) { |
| 2435 | pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; |
| 2436 | pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; |
| 2437 | if (!disable_ddw) |
| 2438 | pseries_pci_controller_ops.iommu_bypass_supported = |
| 2439 | iommu_bypass_supported_pSeriesLP; |
| 2440 | } else { |
| 2441 | pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; |
| 2442 | pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; |
| 2443 | } |
| 2444 | |
| 2445 | |
| 2446 | of_reconfig_notifier_register(&iommu_reconfig_nb); |
| 2447 | register_memory_notifier(&iommu_mem_nb); |
| 2448 | |
| 2449 | set_pci_dma_ops(&dma_iommu_ops); |
| 2450 | } |
| 2451 | |
| 2452 | static int __init disable_multitce(char *str) |
| 2453 | { |
| 2454 | if (strcmp(str, "off") == 0 && |
| 2455 | firmware_has_feature(FW_FEATURE_LPAR) && |
| 2456 | (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || |
| 2457 | firmware_has_feature(FW_FEATURE_STUFF_TCE))) { |
| 2458 | printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); |
| 2459 | powerpc_firmware_features &= |
| 2460 | ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); |
| 2461 | } |
| 2462 | return 1; |
| 2463 | } |
| 2464 | |
| 2465 | __setup("multitce=", disable_multitce); |
| 2466 | |
| 2467 | #ifdef CONFIG_SPAPR_TCE_IOMMU |
| 2468 | struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, |
| 2469 | struct pci_dev *pdev) |
| 2470 | { |
| 2471 | struct device_node *pdn, *dn = pdev->dev.of_node; |
| 2472 | struct iommu_group *grp; |
| 2473 | struct pci_dn *pci; |
| 2474 | |
| 2475 | pdn = pci_dma_find(dn, NULL); |
| 2476 | if (!pdn || !PCI_DN(pdn)) |
| 2477 | return ERR_PTR(-ENODEV); |
| 2478 | |
| 2479 | pci = PCI_DN(pdn); |
| 2480 | if (!pci->table_group) |
| 2481 | return ERR_PTR(-ENODEV); |
| 2482 | |
| 2483 | grp = pci->table_group->group; |
| 2484 | if (!grp) |
| 2485 | return ERR_PTR(-ENODEV); |
| 2486 | |
| 2487 | return iommu_group_ref_get(grp); |
| 2488 | } |
| 2489 | #endif |