/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/pagemap.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

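/*
 * Set up the common parts of an ODP umem. For a normal (non-implicit)
 * umem this computes a page-aligned [start, end) range with explicit
 * overflow checks, allocates the page/DMA bookkeeping arrays, and
 * registers an mmu interval notifier over the range.
 *
 * Worked example (illustrative numbers, not from the code): with
 * address = 0x12345, length = 0x1000 and page_shift = 12, start rounds
 * down to 0x12000 and end rounds up to 0x14000, giving pages = 2. The
 * second overflow check catches ALIGN() wrapping past the top of the
 * address space, since a wrapped 'end' would be smaller than one page.
 */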
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                                   const struct mmu_interval_notifier_ops *ops)
{
        int ret;

        umem_odp->umem.is_odp = 1;
        mutex_init(&umem_odp->umem_mutex);

        if (!umem_odp->is_implicit_odp) {
                size_t page_size = 1UL << umem_odp->page_shift;
                unsigned long start;
                unsigned long end;
                size_t pages;

                start = ALIGN_DOWN(umem_odp->umem.address, page_size);
                if (check_add_overflow(umem_odp->umem.address,
                                       (unsigned long)umem_odp->umem.length,
                                       &end))
                        return -EOVERFLOW;
                end = ALIGN(end, page_size);
                if (unlikely(end < page_size))
                        return -EOVERFLOW;

                pages = (end - start) >> umem_odp->page_shift;
                if (!pages)
                        return -EINVAL;

                umem_odp->page_list = kvcalloc(
                        pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
                if (!umem_odp->page_list)
                        return -ENOMEM;

                umem_odp->dma_list = kvcalloc(
                        pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
                if (!umem_odp->dma_list) {
                        ret = -ENOMEM;
                        goto out_page_list;
                }

                ret = mmu_interval_notifier_insert(&umem_odp->notifier,
                                                   umem_odp->umem.owning_mm,
                                                   start, end - start, ops);
                if (ret)
                        goto out_dma_list;
        }

        return 0;

out_dma_list:
        kvfree(umem_odp->dma_list);
out_page_list:
        kvfree(umem_odp->page_list);
        return ret;
}

/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference to help the driver create
 * child umems.
 *
 * @udata: udata from the syscall being used to create the umem
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
                                               int access)
{
        struct ib_ucontext *context =
                container_of(udata, struct uverbs_attr_bundle, driver_udata)
                        ->context;
        struct ib_umem *umem;
        struct ib_umem_odp *umem_odp;
        int ret;

        if (access & IB_ACCESS_HUGETLB)
                return ERR_PTR(-EINVAL);

        if (!context)
                return ERR_PTR(-EIO);

        umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
        if (!umem_odp)
                return ERR_PTR(-ENOMEM);
        umem = &umem_odp->umem;
        umem->ibdev = context->device;
        umem->writable = ib_access_writable(access);
        umem->owning_mm = current->mm;
        umem_odp->is_implicit_odp = 1;
        umem_odp->page_shift = PAGE_SHIFT;

        umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
        ret = ib_init_umem_odp(umem_odp, NULL);
        if (ret) {
                put_pid(umem_odp->tgid);
                kfree(umem_odp);
                return ERR_PTR(ret);
        }
        return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval notifier ops used to invalidate the child umem
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
                        size_t size,
                        const struct mmu_interval_notifier_ops *ops)
{
        /*
         * Caller must ensure that root cannot be freed during the call to
         * ib_umem_odp_alloc_child().
         */
        struct ib_umem_odp *odp_data;
        struct ib_umem *umem;
        int ret;

        if (WARN_ON(!root->is_implicit_odp))
                return ERR_PTR(-EINVAL);

        odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
        if (!odp_data)
                return ERR_PTR(-ENOMEM);
        umem = &odp_data->umem;
        umem->ibdev = root->umem.ibdev;
        umem->length = size;
        umem->address = addr;
        umem->writable = root->umem.writable;
        umem->owning_mm = root->umem.owning_mm;
        odp_data->page_shift = PAGE_SHIFT;
        odp_data->notifier.ops = ops;

        odp_data->tgid = get_pid(root->tgid);
        ret = ib_init_umem_odp(odp_data, ops);
        if (ret) {
                put_pid(odp_data->tgid);
                kfree(odp_data);
                return ERR_PTR(ret);
        }
        return odp_data;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);

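/*
 * Illustrative sketch (not part of this file): a driver implementing
 * implicit ODP might pair the two allocators above roughly like this,
 * where my_ops, fault_addr and fault_len are hypothetical placeholders:
 *
 *	struct ib_umem_odp *root, *child;
 *
 *	root = ib_umem_odp_alloc_implicit(udata, access);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *	...
 *	child = ib_umem_odp_alloc_child(root, fault_addr, fault_len,
 *					&my_ops);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 */
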
/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @udata: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval notifier ops used to invalidate the umem
 *
 * The driver should use this when the access flags indicate ODP memory. It
 * avoids pinning; instead it stores the mm for future page fault handling in
 * conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
                                    size_t size, int access,
                                    const struct mmu_interval_notifier_ops *ops)
{
        struct ib_umem_odp *umem_odp;
        struct ib_ucontext *context;
        struct mm_struct *mm;
        int ret;

        if (!udata)
                return ERR_PTR(-EIO);

        context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
                          ->context;
        if (!context)
                return ERR_PTR(-EIO);

        if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
                return ERR_PTR(-EINVAL);

        umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
        if (!umem_odp)
                return ERR_PTR(-ENOMEM);

        umem_odp->umem.ibdev = context->device;
        umem_odp->umem.length = size;
        umem_odp->umem.address = addr;
        umem_odp->umem.writable = ib_access_writable(access);
        umem_odp->umem.owning_mm = mm = current->mm;
        umem_odp->notifier.ops = ops;

        umem_odp->page_shift = PAGE_SHIFT;
        if (access & IB_ACCESS_HUGETLB) {
                struct vm_area_struct *vma;
                struct hstate *h;

                down_read(&mm->mmap_sem);
                vma = find_vma(mm, ib_umem_start(umem_odp));
                if (!vma || !is_vm_hugetlb_page(vma)) {
                        up_read(&mm->mmap_sem);
                        ret = -EINVAL;
                        goto err_free;
                }
                h = hstate_vma(vma);
                umem_odp->page_shift = huge_page_shift(h);
                up_read(&mm->mmap_sem);
        }

        umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
        ret = ib_init_umem_odp(umem_odp, ops);
        if (ret)
                goto err_put_pid;
        return umem_odp;

err_put_pid:
        put_pid(umem_odp->tgid);
err_free:
        kfree(umem_odp);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

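/*
 * Illustrative sketch (not part of this file): registering an ODP MR
 * from a driver's reg_user_mr path might look roughly like this, with
 * my_ops again a hypothetical mmu_interval_notifier_ops:
 *
 *	struct ib_umem_odp *odp;
 *
 *	odp = ib_umem_odp_get(udata, start, length, access, &my_ops);
 *	if (IS_ERR(odp))
 *		return ERR_CAST(odp);
 *	...
 *	// on teardown:
 *	ib_umem_odp_release(odp);
 */
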
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
        /*
         * Ensure that no more pages are mapped in the umem.
         *
         * It is the driver's responsibility to ensure, before calling us,
         * that the hardware will not attempt to access the MR any more.
         */
        if (!umem_odp->is_implicit_odp) {
                mutex_lock(&umem_odp->umem_mutex);
                ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
                                            ib_umem_end(umem_odp));
                mutex_unlock(&umem_odp->umem_mutex);
                mmu_interval_notifier_remove(&umem_odp->notifier);
                kvfree(umem_odp->dma_list);
                kvfree(umem_odp->page_list);
                put_pid(umem_odp->tgid);
        }
        kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);

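/*
 * Note on the dma_list encoding used below: each dma_list entry stores
 * the page's DMA address OR'ed with per-page access flags such as
 * ODP_WRITE_ALLOWED_BIT, and ODP_DMA_ADDR_MASK strips the flag bits to
 * recover the plain address again (both defined in rdma/ib_umem_odp.h).
 */
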
/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem_odp: the umem to insert the page to.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               The sequence number is obtained from
 *               mmu_interval_read_begin() on umem_odp->notifier.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_user_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
                struct ib_umem_odp *umem_odp,
                unsigned int page_index,
                struct page *page,
                u64 access_mask,
                unsigned long current_seq)
{
        struct ib_device *dev = umem_odp->umem.ibdev;
        dma_addr_t dma_addr;
        int ret = 0;

        if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
                ret = -EAGAIN;
                goto out;
        }
        if (!(umem_odp->dma_list[page_index])) {
                dma_addr =
                        ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
                                        DMA_BIDIRECTIONAL);
                if (ib_dma_mapping_error(dev, dma_addr)) {
                        ret = -EFAULT;
                        goto out;
                }
                umem_odp->dma_list[page_index] = dma_addr | access_mask;
                umem_odp->page_list[page_index] = page;
                umem_odp->npages++;
        } else if (umem_odp->page_list[page_index] == page) {
                umem_odp->dma_list[page_index] |= access_mask;
        } else {
                /*
                 * This is a race where we could have done:
                 *
                 *         CPU0                             CPU1
                 *   get_user_pages()
                 *                                       invalidate()
                 *                                       page_fault()
                 *   mutex_lock(umem_mutex)
                 *    page from GUP != page in ODP
                 *
                 * It should be prevented by the retry test above as reading
                 * the seq number should be reliable under the
                 * umem_mutex. Thus something is really not working right if
                 * things get here.
                 */
                WARN(true,
                     "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
                     umem_odp->page_list[page_index], page);
                ret = -EAGAIN;
        }

out:
        put_user_page(page);
        return ret;
}

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem_odp->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being
 * terminated and mm was already destroyed.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual pages mapped is returned in
 *        the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is obtained from
 *               mmu_interval_read_begin() before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                              u64 bcnt, u64 access_mask,
                              unsigned long current_seq)
{
        struct task_struct *owning_process = NULL;
        struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
        struct page **local_page_list = NULL;
        u64 page_mask, off;
        int j, k, ret = 0, start_idx, npages = 0;
        unsigned int flags = 0, page_shift;
        phys_addr_t p = 0;

        if (access_mask == 0)
                return -EINVAL;

        if (user_virt < ib_umem_start(umem_odp) ||
            user_virt + bcnt > ib_umem_end(umem_odp))
                return -EFAULT;

        local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
        if (!local_page_list)
                return -ENOMEM;

        page_shift = umem_odp->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        off = user_virt & (~page_mask);
        user_virt = user_virt & page_mask;
        bcnt += off; /* Charge for the first page offset as well. */

        /*
         * owning_process is allowed to be NULL; this means the mm is somehow
         * living beyond the lifetime of the originating process. Presumably
         * mmget_not_zero will fail in this case.
         */
        owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
        if (!owning_process || !mmget_not_zero(owning_mm)) {
                ret = -EINVAL;
                goto out_put_task;
        }

        if (access_mask & ODP_WRITE_ALLOWED_BIT)
                flags |= FOLL_WRITE;

        start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
        k = start_idx;

        while (bcnt > 0) {
                const size_t gup_num_pages = min_t(size_t,
                                (bcnt + BIT(page_shift) - 1) >> page_shift,
                                PAGE_SIZE / sizeof(struct page *));

                down_read(&owning_mm->mmap_sem);
                /*
                 * Note: this might result in redundant page getting. We can
                 * avoid this by checking dma_list to be 0 before calling
                 * get_user_pages. However, this makes the code much more
                 * complex (and doesn't gain us much performance in most use
                 * cases).
                 */
                npages = get_user_pages_remote(owning_process, owning_mm,
                                user_virt, gup_num_pages,
                                flags, local_page_list, NULL, NULL);
                up_read(&owning_mm->mmap_sem);

                if (npages < 0) {
                        if (npages != -EAGAIN)
                                pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
                        else
                                pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
                        break;
                }

                bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
                mutex_lock(&umem_odp->umem_mutex);
                for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
                        if (user_virt & ~page_mask) {
                                p += PAGE_SIZE;
                                if (page_to_phys(local_page_list[j]) != p) {
                                        ret = -EFAULT;
                                        break;
                                }
                                put_user_page(local_page_list[j]);
                                continue;
                        }

                        ret = ib_umem_odp_map_dma_single_page(
                                        umem_odp, k, local_page_list[j],
                                        access_mask, current_seq);
                        if (ret < 0) {
                                if (ret != -EAGAIN)
                                        pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
                                else
                                        pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
                                break;
                        }

                        p = page_to_phys(local_page_list[j]);
                        k++;
                }
                mutex_unlock(&umem_odp->umem_mutex);

                if (ret < 0) {
                        /*
                         * Release pages, remembering that the first page
                         * to hit an error was already released by
                         * ib_umem_odp_map_dma_single_page().
                         */
                        if (npages - (j + 1) > 0)
                                put_user_pages(&local_page_list[j + 1],
                                               npages - (j + 1));
                        break;
                }
        }

        if (ret >= 0) {
                if (npages < 0 && k == start_idx)
                        ret = npages;
                else
                        ret = k - start_idx;
        }

        mmput(owning_mm);
out_put_task:
        if (owning_process)
                put_task_struct(owning_process);
        free_page((unsigned long)local_page_list);
        return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

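/*
 * Illustrative sketch (not part of this file): a driver page-fault
 * handler typically wraps ib_umem_odp_map_dma_pages() in the standard
 * mmu interval notifier retry pattern, roughly:
 *
 *	unsigned long seq;
 *	int npages;
 *
 *	seq = mmu_interval_read_begin(&odp->notifier);
 *	npages = ib_umem_odp_map_dma_pages(odp, fault_va, fault_len,
 *					   access_mask, seq);
 *	if (npages < 0)
 *		return npages;
 *	mutex_lock(&odp->umem_mutex);
 *	if (!mmu_interval_read_retry(&odp->notifier, seq)) {
 *		// update device page tables from odp->dma_list here
 *	}
 *	mutex_unlock(&odp->umem_mutex);
 *
 * fault_va and fault_len are placeholder names.
 */
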
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 u64 bound)
{
        int idx;
        u64 addr;
        struct ib_device *dev = umem_odp->umem.ibdev;

        lockdep_assert_held(&umem_odp->umem_mutex);

        virt = max_t(u64, virt, ib_umem_start(umem_odp));
        bound = min_t(u64, bound, ib_umem_end(umem_odp));
        /*
         * Note that during the run of this function the umem_mutex is held
         * (asserted above), preventing any racing faults from completing.
         * We might be racing with other invalidations, so we must make sure
         * we free each page only once.
         */
        for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
                idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
                if (umem_odp->page_list[idx]) {
                        struct page *page = umem_odp->page_list[idx];
                        dma_addr_t dma = umem_odp->dma_list[idx];
                        dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

                        WARN_ON(!dma_addr);

                        ib_dma_unmap_page(dev, dma_addr,
                                          BIT(umem_odp->page_shift),
                                          DMA_BIDIRECTIONAL);
                        if (dma & ODP_WRITE_ALLOWED_BIT) {
                                struct page *head_page = compound_head(page);
                                /*
                                 * set_page_dirty prefers being called with
                                 * the page lock. However, MMU notifiers are
                                 * called sometimes with and sometimes without
                                 * the lock. We rely on the umem_mutex instead
                                 * to prevent other mmu notifiers from
                                 * continuing and allowing the page mapping to
                                 * be removed.
                                 */
                                set_page_dirty(head_page);
                        }
                        umem_odp->page_list[idx] = NULL;
                        umem_odp->dma_list[idx] = 0;
                        umem_odp->npages--;
                }
        }
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
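
/*
 * Illustrative sketch (not part of this file): the invalidate callback
 * in a driver's mmu_interval_notifier_ops usually ends up here, e.g.:
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *mni,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		struct ib_umem_odp *odp =
 *			container_of(mni, struct ib_umem_odp, notifier);
 *
 *		if (!mmu_notifier_range_blockable(range))
 *			return false;
 *		mutex_lock(&odp->umem_mutex);
 *		mmu_interval_set_seq(mni, cur_seq);
 *		// tear down device mappings for the range, then:
 *		ib_umem_odp_unmap_dma_pages(odp, range->start, range->end);
 *		mutex_unlock(&odp->umem_mutex);
 *		return true;
 *	}
 *
 * my_invalidate is a placeholder name; the exact teardown of device
 * page tables is driver specific.
 */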