Commit | Line | Data |
---|---|---|
8ada2c1c SR |
1 | /* |
2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | |
3 | * | |
4 | * This software is available to you under a choice of one of two | |
5 | * licenses. You may choose to be licensed under the terms of the GNU | |
6 | * General Public License (GPL) Version 2, available from the file | |
7 | * COPYING in the main directory of this source tree, or the | |
8 | * OpenIB.org BSD license below: | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or | |
11 | * without modification, are permitted provided that the following | |
12 | * conditions are met: | |
13 | * | |
14 | * - Redistributions of source code must retain the above | |
15 | * copyright notice, this list of conditions and the following | |
16 | * disclaimer. | |
17 | * | |
18 | * - Redistributions in binary form must reproduce the above | |
19 | * copyright notice, this list of conditions and the following | |
20 | * disclaimer in the documentation and/or other materials | |
21 | * provided with the distribution. | |
22 | * | |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
30 | * SOFTWARE. | |
31 | */ | |
32 | ||
33 | #include <linux/types.h> | |
34 | #include <linux/sched.h> | |
6e84f315 | 35 | #include <linux/sched/mm.h> |
0881e7bd | 36 | #include <linux/sched/task.h> |
8ada2c1c SR |
37 | #include <linux/pid.h> |
38 | #include <linux/slab.h> | |
39 | #include <linux/export.h> | |
40 | #include <linux/vmalloc.h> | |
0008b84e | 41 | #include <linux/hugetlb.h> |
fec99ede | 42 | #include <linux/interval_tree_generic.h> |
75a3e6a3 | 43 | #include <linux/pagemap.h> |
8ada2c1c SR |
44 | |
45 | #include <rdma/ib_verbs.h> | |
46 | #include <rdma/ib_umem.h> | |
47 | #include <rdma/ib_umem_odp.h> | |
48 | ||
fec99ede LR |
49 | /* |
50 | * The per_mm->umem_tree interval tree keeps track of memory regions for
51 | * which the HW device requested to receive a notification when the
52 | * related memory mapping is changed.
53 | *
54 | * per_mm->umem_rwsem protects the interval tree.
55 | */ | |
56 | ||
57 | static u64 node_start(struct umem_odp_node *n) | |
58 | { | |
59 | struct ib_umem_odp *umem_odp = | |
60 | container_of(n, struct ib_umem_odp, interval_tree); | |
61 | ||
d2183c6f | 62 | return ib_umem_start(umem_odp); |
fec99ede LR |
63 | } |
64 | ||
65 | /* Note that the representation of the intervals in the interval tree | |
66 | * considers the ending point as contained in the interval, while the | |
67 | * function ib_umem_end returns the first address which is not contained | |
68 | * in the umem. | |
69 | */ | |
70 | static u64 node_last(struct umem_odp_node *n) | |
71 | { | |
72 | struct ib_umem_odp *umem_odp = | |
73 | container_of(n, struct ib_umem_odp, interval_tree); | |
74 | ||
d2183c6f | 75 | return ib_umem_end(umem_odp) - 1; |
fec99ede LR |
76 | } |
77 | ||
78 | INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, | |
79 | node_start, node_last, static, rbt_ib_umem) | |
80 | ||
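/*
 * Worked example (editorial note, not part of the file): an ODP umem that
 * covers two 4 KiB pages starting at virtual address 0x10000 has
 *
 *	ib_umem_start(umem_odp) == 0x10000
 *	ib_umem_end(umem_odp)   == 0x12000	(first address NOT in the umem)
 *	node_start()            == 0x10000
 *	node_last()             == 0x11fff	(last address contained)
 *
 * so an invalidation of the range [0x12000, 0x13000) does not match this
 * node, while an invalidation of [0x11000, 0x12000) does.
 */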
b5231b01 | 81 | static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) |
882214e2 | 82 | { |
b5231b01 | 83 | mutex_lock(&umem_odp->umem_mutex); |
ca748c39 JG |
84 | if (umem_odp->notifiers_count++ == 0) |
85 | /* | |
86 | * Initialize the completion object for waiting on | |
87 | * notifiers. Since notifiers_count is zero, no one should be
88 | * waiting right now. | |
89 | */ | |
90 | reinit_completion(&umem_odp->notifier_completion); | |
b5231b01 | 91 | mutex_unlock(&umem_odp->umem_mutex); |
882214e2 HE |
92 | } |
93 | ||
b5231b01 | 94 | static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) |
882214e2 | 95 | { |
b5231b01 | 96 | mutex_lock(&umem_odp->umem_mutex); |
ca748c39 JG |
97 | /* |
98 | * Incrementing the sequence number notifies the QP page-fault handler that
99 | * the page that is going to be mapped in the spte could have been freed.
100 | */ | |
101 | ++umem_odp->notifiers_seq; | |
102 | if (--umem_odp->notifiers_count == 0) | |
103 | complete_all(&umem_odp->notifier_completion); | |
b5231b01 | 104 | mutex_unlock(&umem_odp->umem_mutex); |
882214e2 HE |
105 | } |
106 | ||
b5231b01 JG |
107 | static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, |
108 | u64 start, u64 end, void *cookie) | |
109 | { | |
882214e2 HE |
110 | /* |
111 | * Increase the number of notifiers running, to | |
112 | * prevent any further fault handling on this MR. | |
113 | */ | |
b5231b01 JG |
114 | ib_umem_notifier_start_account(umem_odp); |
115 | umem_odp->dying = 1; | |
882214e2 HE |
116 | /* Make sure the fact that the umem is dying is visible before we release
117 | * all pending page faults. */ | |
118 | smp_wmb(); | |
b5231b01 | 119 | complete_all(&umem_odp->notifier_completion); |
d2183c6f JG |
120 | umem_odp->umem.context->invalidate_range( |
121 | umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); | |
882214e2 HE |
122 | return 0; |
123 | } | |
124 | ||
125 | static void ib_umem_notifier_release(struct mmu_notifier *mn, | |
126 | struct mm_struct *mm) | |
127 | { | |
c9990ab3 JG |
128 | struct ib_ucontext_per_mm *per_mm = |
129 | container_of(mn, struct ib_ucontext_per_mm, mn); | |
882214e2 | 130 | |
c9990ab3 | 131 | down_read(&per_mm->umem_rwsem); |
be7a57b4 JG |
132 | if (per_mm->active) |
133 | rbt_ib_umem_for_each_in_range( | |
134 | &per_mm->umem_tree, 0, ULLONG_MAX, | |
135 | ib_umem_notifier_release_trampoline, true, NULL); | |
c9990ab3 | 136 | up_read(&per_mm->umem_rwsem); |
882214e2 HE |
137 | } |
138 | ||
b5231b01 JG |
139 | static int invalidate_range_start_trampoline(struct ib_umem_odp *item, |
140 | u64 start, u64 end, void *cookie) | |
882214e2 HE |
141 | { |
142 | ib_umem_notifier_start_account(item); | |
41b4deea | 143 | item->umem.context->invalidate_range(item, start, end); |
882214e2 HE |
144 | return 0; |
145 | } | |
146 | ||
93065ac7 | 147 | static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, |
5d6527a7 | 148 | const struct mmu_notifier_range *range) |
882214e2 | 149 | { |
c9990ab3 JG |
150 | struct ib_ucontext_per_mm *per_mm = |
151 | container_of(mn, struct ib_ucontext_per_mm, mn); | |
7608bf40 | 152 | int rc; |
93065ac7 | 153 | |
dfcd6660 | 154 | if (mmu_notifier_range_blockable(range)) |
c9990ab3 JG |
155 | down_read(&per_mm->umem_rwsem); |
156 | else if (!down_read_trylock(&per_mm->umem_rwsem)) | |
93065ac7 | 157 | return -EAGAIN; |
882214e2 | 158 | |
be7a57b4 JG |
159 | if (!per_mm->active) { |
160 | up_read(&per_mm->umem_rwsem); | |
161 | /* | |
162 | * At this point active is permanently set and visible to this | |
163 | * CPU without a lock; that fact is relied on to skip the unlock
164 | * in range_end. | |
165 | */ | |
166 | return 0; | |
167 | } | |
168 | ||
7608bf40 JG |
169 | rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, |
170 | range->end, | |
171 | invalidate_range_start_trampoline, | |
172 | mmu_notifier_range_blockable(range), | |
173 | NULL); | |
174 | if (rc) | |
175 | up_read(&per_mm->umem_rwsem); | |
176 | return rc; | |
882214e2 HE |
177 | } |
178 | ||
b5231b01 | 179 | static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, |
882214e2 HE |
180 | u64 end, void *cookie) |
181 | { | |
182 | ib_umem_notifier_end_account(item); | |
183 | return 0; | |
184 | } | |
185 | ||
186 | static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, | |
5d6527a7 | 187 | const struct mmu_notifier_range *range) |
882214e2 | 188 | { |
c9990ab3 JG |
189 | struct ib_ucontext_per_mm *per_mm = |
190 | container_of(mn, struct ib_ucontext_per_mm, mn); | |
882214e2 | 191 | |
be7a57b4 | 192 | if (unlikely(!per_mm->active)) |
882214e2 HE |
193 | return; |
194 | ||
5d6527a7 JG |
195 | rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, |
196 | range->end, | |
93065ac7 | 197 | invalidate_range_end_trampoline, true, NULL); |
c9990ab3 | 198 | up_read(&per_mm->umem_rwsem); |
882214e2 HE |
199 | } |
200 | ||
46e741f4 | 201 | static const struct mmu_notifier_ops ib_umem_notifiers = { |
882214e2 | 202 | .release = ib_umem_notifier_release, |
882214e2 HE |
203 | .invalidate_range_start = ib_umem_notifier_invalidate_range_start, |
204 | .invalidate_range_end = ib_umem_notifier_invalidate_range_end, | |
205 | }; | |
206 | ||
f27a0d50 JG |
207 | static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) |
208 | { | |
209 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | |
f27a0d50 JG |
210 | |
211 | down_write(&per_mm->umem_rwsem); | |
d2183c6f | 212 | if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) |
f27a0d50 JG |
213 | rbt_ib_umem_insert(&umem_odp->interval_tree, |
214 | &per_mm->umem_tree); | |
f27a0d50 JG |
215 | up_write(&per_mm->umem_rwsem); |
216 | } | |
217 | ||
218 | static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) | |
219 | { | |
220 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | |
f27a0d50 JG |
221 | |
222 | down_write(&per_mm->umem_rwsem); | |
d2183c6f | 223 | if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) |
f27a0d50 JG |
224 | rbt_ib_umem_remove(&umem_odp->interval_tree, |
225 | &per_mm->umem_tree); | |
ca748c39 | 226 | complete_all(&umem_odp->notifier_completion); |
f27a0d50 JG |
227 | |
228 | up_write(&per_mm->umem_rwsem); | |
229 | } | |
230 | ||
231 | static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, | |
232 | struct mm_struct *mm) | |
d07d1d70 | 233 | { |
c9990ab3 | 234 | struct ib_ucontext_per_mm *per_mm; |
f27a0d50 JG |
235 | int ret; |
236 | ||
237 | per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); | |
238 | if (!per_mm) | |
239 | return ERR_PTR(-ENOMEM); | |
240 | ||
241 | per_mm->context = ctx; | |
242 | per_mm->mm = mm; | |
243 | per_mm->umem_tree = RB_ROOT_CACHED; | |
244 | init_rwsem(&per_mm->umem_rwsem); | |
4ae27444 | 245 | per_mm->active = true; |
f27a0d50 JG |
246 | |
247 | rcu_read_lock(); | |
248 | per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); | |
249 | rcu_read_unlock(); | |
250 | ||
251 | WARN_ON(mm != current->mm); | |
252 | ||
253 | per_mm->mn.ops = &ib_umem_notifiers; | |
254 | ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); | |
255 | if (ret) { | |
256 | dev_err(&ctx->device->dev, | |
257 | "Failed to register mmu_notifier %d\n", ret); | |
258 | goto out_pid; | |
259 | } | |
260 | ||
261 | list_add(&per_mm->ucontext_list, &ctx->per_mm_list); | |
262 | return per_mm; | |
263 | ||
264 | out_pid: | |
265 | put_pid(per_mm->tgid); | |
266 | kfree(per_mm); | |
267 | return ERR_PTR(ret); | |
268 | } | |
269 | ||
270 | static int get_per_mm(struct ib_umem_odp *umem_odp) | |
271 | { | |
272 | struct ib_ucontext *ctx = umem_odp->umem.context; | |
273 | struct ib_ucontext_per_mm *per_mm; | |
274 | ||
275 | /* | |
276 | * Generally speaking we expect only one or two per_mm in this list, | |
277 | * so no reason to optimize this search today. | |
278 | */ | |
279 | mutex_lock(&ctx->per_mm_list_lock); | |
280 | list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { | |
281 | if (per_mm->mm == umem_odp->umem.owning_mm) | |
282 | goto found; | |
283 | } | |
284 | ||
285 | per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); | |
286 | if (IS_ERR(per_mm)) { | |
287 | mutex_unlock(&ctx->per_mm_list_lock); | |
288 | return PTR_ERR(per_mm); | |
289 | } | |
290 | ||
291 | found: | |
292 | umem_odp->per_mm = per_mm; | |
293 | per_mm->odp_mrs_count++; | |
294 | mutex_unlock(&ctx->per_mm_list_lock); | |
295 | ||
296 | return 0; | |
297 | } | |
298 | ||
56ac9dd9 JG |
299 | static void free_per_mm(struct rcu_head *rcu) |
300 | { | |
301 | kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); | |
302 | } | |
303 | ||
f3738591 | 304 | static void put_per_mm(struct ib_umem_odp *umem_odp) |
f27a0d50 JG |
305 | { |
306 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | |
307 | struct ib_ucontext *ctx = umem_odp->umem.context; | |
308 | bool need_free; | |
309 | ||
310 | mutex_lock(&ctx->per_mm_list_lock); | |
311 | umem_odp->per_mm = NULL; | |
312 | per_mm->odp_mrs_count--; | |
313 | need_free = per_mm->odp_mrs_count == 0; | |
314 | if (need_free) | |
315 | list_del(&per_mm->ucontext_list); | |
316 | mutex_unlock(&ctx->per_mm_list_lock); | |
317 | ||
318 | if (!need_free) | |
319 | return; | |
320 | ||
be7a57b4 JG |
321 | /* |
322 | * NOTE! mmu_notifier_unregister() can happen between a start/end | |
323 | * callback, resulting in a start without a matching end, and thus an
324 | * unbalanced lock. This doesn't really matter to us since we are about to
325 | * kfree the memory that holds the lock; however, LOCKDEP doesn't like this.
326 | */ | |
327 | down_write(&per_mm->umem_rwsem); | |
328 | per_mm->active = false; | |
329 | up_write(&per_mm->umem_rwsem); | |
330 | ||
56ac9dd9 JG |
331 | WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); |
332 | mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); | |
f27a0d50 | 333 | put_pid(per_mm->tgid); |
56ac9dd9 | 334 | mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); |
f27a0d50 JG |
335 | } |
336 | ||
da6a496a | 337 | struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, |
f27a0d50 JG |
338 | unsigned long addr, size_t size) |
339 | { | |
da6a496a | 340 | struct ib_ucontext_per_mm *per_mm = root->per_mm; |
f27a0d50 | 341 | struct ib_ucontext *ctx = per_mm->context; |
d07d1d70 | 342 | struct ib_umem_odp *odp_data; |
41b4deea | 343 | struct ib_umem *umem; |
d07d1d70 AK |
344 | int pages = size >> PAGE_SHIFT; |
345 | int ret; | |
346 | ||
41b4deea JG |
347 | odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); |
348 | if (!odp_data) | |
d07d1d70 | 349 | return ERR_PTR(-ENOMEM); |
41b4deea | 350 | umem = &odp_data->umem; |
f27a0d50 | 351 | umem->context = ctx; |
3e7e1193 AK |
352 | umem->length = size; |
353 | umem->address = addr; | |
d2183c6f | 354 | odp_data->page_shift = PAGE_SHIFT; |
da6a496a | 355 | umem->writable = root->umem.writable; |
597ecc5a | 356 | umem->is_odp = 1; |
f27a0d50 | 357 | odp_data->per_mm = per_mm; |
a2093dd3 AK |
358 | umem->owning_mm = per_mm->mm; |
359 | mmgrab(umem->owning_mm); | |
d07d1d70 | 360 | |
d07d1d70 AK |
361 | mutex_init(&odp_data->umem_mutex); |
362 | init_completion(&odp_data->notifier_completion); | |
363 | ||
fad953ce KC |
364 | odp_data->page_list = |
365 | vzalloc(array_size(pages, sizeof(*odp_data->page_list))); | |
d07d1d70 AK |
366 | if (!odp_data->page_list) { |
367 | ret = -ENOMEM; | |
368 | goto out_odp_data; | |
369 | } | |
370 | ||
fad953ce KC |
371 | odp_data->dma_list = |
372 | vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); | |
d07d1d70 AK |
373 | if (!odp_data->dma_list) { |
374 | ret = -ENOMEM; | |
375 | goto out_page_list; | |
376 | } | |
377 | ||
f27a0d50 JG |
378 | /* |
379 | * Caller must ensure that the umem_odp that the per_mm came from | |
380 | * cannot be freed during the call to ib_alloc_odp_umem. | |
381 | */ | |
382 | mutex_lock(&ctx->per_mm_list_lock); | |
c9990ab3 | 383 | per_mm->odp_mrs_count++; |
f27a0d50 JG |
384 | mutex_unlock(&ctx->per_mm_list_lock); |
385 | add_umem_to_per_mm(odp_data); | |
d07d1d70 | 386 | |
b5231b01 | 387 | return odp_data; |
d07d1d70 AK |
388 | |
389 | out_page_list: | |
390 | vfree(odp_data->page_list); | |
391 | out_odp_data: | |
a2093dd3 | 392 | mmdrop(umem->owning_mm); |
d07d1d70 | 393 | kfree(odp_data); |
d07d1d70 AK |
394 | return ERR_PTR(ret); |
395 | } | |
396 | EXPORT_SYMBOL(ib_alloc_odp_umem); | |
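/*
 * Illustrative sketch (editorial addition, not part of the file): how a
 * driver that implements implicit ODP might carve a child umem out of a
 * root MR's address space when a page fault arrives.  CHILD_MTT_SIZE and
 * get_child_umem() are hypothetical names; only ib_alloc_odp_umem() is the
 * real API.  A minimal sketch, assuming the caller keeps the root alive for
 * the duration of the call, as required by the function above.
 */
static struct ib_umem_odp *get_child_umem(struct ib_umem_odp *root,
					  u64 fault_addr)
{
	/* Round the faulting address down to a child-sized boundary. */
	u64 start = ALIGN_DOWN(fault_addr, CHILD_MTT_SIZE);

	return ib_alloc_odp_umem(root, start, CHILD_MTT_SIZE);
}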
397 | ||
41b4deea | 398 | int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) |
8ada2c1c | 399 | { |
41b4deea | 400 | struct ib_umem *umem = &umem_odp->umem; |
f27a0d50 JG |
401 | /* |
402 | * NOTE: This must be called in a process context where umem->owning_mm
403 | * == current->mm | |
404 | */ | |
405 | struct mm_struct *mm = umem->owning_mm; | |
8ada2c1c | 406 | int ret_val; |
8ada2c1c | 407 | |
d2183c6f | 408 | umem_odp->page_shift = PAGE_SHIFT; |
0008b84e AK |
409 | if (access & IB_ACCESS_HUGETLB) { |
410 | struct vm_area_struct *vma; | |
411 | struct hstate *h; | |
412 | ||
79bb5b7e | 413 | down_read(&mm->mmap_sem); |
d2183c6f | 414 | vma = find_vma(mm, ib_umem_start(umem_odp)); |
79bb5b7e LR |
415 | if (!vma || !is_vm_hugetlb_page(vma)) { |
416 | up_read(&mm->mmap_sem); | |
0008b84e | 417 | return -EINVAL; |
79bb5b7e | 418 | } |
0008b84e | 419 | h = hstate_vma(vma); |
d2183c6f | 420 | umem_odp->page_shift = huge_page_shift(h); |
79bb5b7e | 421 | up_read(&mm->mmap_sem); |
0008b84e AK |
422 | } |
423 | ||
41b4deea | 424 | mutex_init(&umem_odp->umem_mutex); |
8ada2c1c | 425 | |
41b4deea | 426 | init_completion(&umem_odp->notifier_completion); |
882214e2 | 427 | |
d2183c6f | 428 | if (ib_umem_odp_num_pages(umem_odp)) { |
41b4deea JG |
429 | umem_odp->page_list = |
430 | vzalloc(array_size(sizeof(*umem_odp->page_list), | |
d2183c6f | 431 | ib_umem_odp_num_pages(umem_odp))); |
f27a0d50 JG |
432 | if (!umem_odp->page_list) |
433 | return -ENOMEM; | |
8ada2c1c | 434 | |
41b4deea JG |
435 | umem_odp->dma_list = |
436 | vzalloc(array_size(sizeof(*umem_odp->dma_list), | |
d2183c6f | 437 | ib_umem_odp_num_pages(umem_odp))); |
41b4deea | 438 | if (!umem_odp->dma_list) { |
d07d1d70 AK |
439 | ret_val = -ENOMEM; |
440 | goto out_page_list; | |
441 | } | |
8ada2c1c SR |
442 | } |
443 | ||
f27a0d50 JG |
444 | ret_val = get_per_mm(umem_odp); |
445 | if (ret_val) | |
446 | goto out_dma_list; | |
447 | add_umem_to_per_mm(umem_odp); | |
882214e2 | 448 | |
8ada2c1c SR |
449 | return 0; |
450 | ||
f27a0d50 | 451 | out_dma_list: |
41b4deea | 452 | vfree(umem_odp->dma_list); |
8ada2c1c | 453 | out_page_list: |
41b4deea | 454 | vfree(umem_odp->page_list); |
8ada2c1c SR |
455 | return ret_val; |
456 | } | |
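/*
 * Illustrative sketch (editorial addition, not part of the file): in this
 * tree the usual caller of ib_umem_odp_get() is ib_umem_get() itself, which,
 * for a registration made with IB_ACCESS_ON_DEMAND, hands the freshly
 * allocated umem over roughly like this (condensed; error handling and the
 * surrounding setup live in umem.c):
 *
 *	if (access & IB_ACCESS_ON_DEMAND) {
 *		ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
 *		if (ret)
 *			goto out_free;
 *		return umem;
 *	}
 */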
457 | ||
b5231b01 | 458 | void ib_umem_odp_release(struct ib_umem_odp *umem_odp) |
8ada2c1c SR |
459 | { |
460 | /* | |
461 | * Ensure that no more pages are mapped in the umem. | |
462 | * | |
463 | * It is the driver's responsibility to ensure, before calling us, | |
464 | * that the hardware will not attempt to access the MR any more. | |
465 | */ | |
d2183c6f JG |
466 | ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), |
467 | ib_umem_end(umem_odp)); | |
8ada2c1c | 468 | |
f27a0d50 JG |
469 | remove_umem_from_per_mm(umem_odp); |
470 | put_per_mm(umem_odp); | |
b5231b01 JG |
471 | vfree(umem_odp->dma_list); |
472 | vfree(umem_odp->page_list); | |
8ada2c1c SR |
473 | } |
474 | ||
475 | /* | |
476 | * Map for DMA and insert a single page into the on-demand paging page tables. | |
477 | * | |
478 | * @umem_odp: the umem to insert the page into.
479 | * @page_index: index in the umem to add the page to. | |
480 | * @page: the page struct to map and add. | |
481 | * @access_mask: access permissions needed for this page. | |
482 | * @current_seq: sequence number for synchronization with invalidations. | |
483 | * The sequence number is taken from
b5231b01 | 484 | * umem_odp->notifiers_seq. |
8ada2c1c | 485 | * |
882214e2 HE |
486 | * The function returns -EFAULT if the DMA mapping operation fails. It returns |
487 | * -EAGAIN if a concurrent invalidation prevents us from updating the page. | |
8ada2c1c | 488 | * |
ea996974 | 489 | * The page is released via put_user_page even if the operation failed. For |
8ada2c1c SR |
490 | * on-demand pinning, the page is released whenever it isn't stored in the |
491 | * umem. | |
492 | */ | |
493 | static int ib_umem_odp_map_dma_single_page( | |
b5231b01 | 494 | struct ib_umem_odp *umem_odp, |
8ada2c1c SR |
495 | int page_index, |
496 | struct page *page, | |
497 | u64 access_mask, | |
498 | unsigned long current_seq) | |
499 | { | |
d2183c6f JG |
500 | struct ib_ucontext *context = umem_odp->umem.context; |
501 | struct ib_device *dev = context->device; | |
8ada2c1c | 502 | dma_addr_t dma_addr; |
882214e2 | 503 | int remove_existing_mapping = 0; |
8ada2c1c SR |
504 | int ret = 0; |
505 | ||
882214e2 HE |
506 | /* |
507 | * Note: we avoid writing if seq is different from the initial seq, to | |
508 | * handle the case of a racing notifier. This check also allows us to bail
509 | * early if we have a notifier running in parallel with us. | |
510 | */ | |
b5231b01 | 511 | if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { |
882214e2 HE |
512 | ret = -EAGAIN; |
513 | goto out; | |
514 | } | |
b5231b01 | 515 | if (!(umem_odp->dma_list[page_index])) { |
d2183c6f JG |
516 | dma_addr = |
517 | ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), | |
518 | DMA_BIDIRECTIONAL); | |
8ada2c1c SR |
519 | if (ib_dma_mapping_error(dev, dma_addr)) { |
520 | ret = -EFAULT; | |
521 | goto out; | |
522 | } | |
b5231b01 JG |
523 | umem_odp->dma_list[page_index] = dma_addr | access_mask; |
524 | umem_odp->page_list[page_index] = page; | |
d10bcf94 | 525 | umem_odp->npages++; |
b5231b01 JG |
526 | } else if (umem_odp->page_list[page_index] == page) { |
527 | umem_odp->dma_list[page_index] |= access_mask; | |
8ada2c1c SR |
528 | } else { |
529 | pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", | |
b5231b01 | 530 | umem_odp->page_list[page_index], page); |
882214e2 HE |
531 | /* Better remove the mapping now, to prevent any further |
532 | * damage. */ | |
533 | remove_existing_mapping = 1; | |
8ada2c1c SR |
534 | } |
535 | ||
536 | out: | |
ea996974 | 537 | put_user_page(page); |
8ada2c1c | 538 | |
4ae27444 | 539 | if (remove_existing_mapping) { |
605728e6 | 540 | ib_umem_notifier_start_account(umem_odp); |
d2183c6f | 541 | context->invalidate_range( |
b5231b01 | 542 | umem_odp, |
d2183c6f JG |
543 | ib_umem_start(umem_odp) + |
544 | (page_index << umem_odp->page_shift), | |
545 | ib_umem_start(umem_odp) + | |
546 | ((page_index + 1) << umem_odp->page_shift)); | |
605728e6 | 547 | ib_umem_notifier_end_account(umem_odp); |
882214e2 HE |
548 | ret = -EAGAIN; |
549 | } | |
550 | ||
8ada2c1c SR |
551 | return ret; |
552 | } | |
553 | ||
554 | /** | |
555 | * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. | |
556 | * | |
557 | * Pins the range of pages passed in the argument, and maps them to | |
558 | * DMA addresses. The DMA addresses of the mapped pages are updated in
b5231b01 | 559 | * umem_odp->dma_list. |
8ada2c1c SR |
560 | * |
561 | * Returns the number of pages mapped on success, or a negative error code
562 | * on failure.
882214e2 HE |
563 | * An -EAGAIN error code is returned when a concurrent mmu notifier prevents |
564 | * the function from completing its task. | |
d9d0674c AK |
565 | * An -ENOENT error code indicates that the userspace process is being
566 | * terminated and the mm was already destroyed.
b5231b01 | 567 | * @umem_odp: the umem to map and pin |
8ada2c1c SR |
568 | * @user_virt: the address from which we need to map. |
569 | * @bcnt: the minimal number of bytes to pin and map. The mapping might be | |
570 | * bigger due to alignment, and may also be smaller in case of an error | |
571 | * pinning or mapping a page. The actual number of pages mapped is returned in
572 | * the return value. | |
573 | * @access_mask: bit mask of the requested access permissions for the given | |
574 | * range. | |
575 | * @current_seq: the MMU notifiers sequence value for synchronization with
576 | * invalidations. The sequence number is read from
b5231b01 | 577 | * umem_odp->notifiers_seq before calling this function |
8ada2c1c | 578 | */ |
b5231b01 JG |
579 | int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, |
580 | u64 bcnt, u64 access_mask, | |
581 | unsigned long current_seq) | |
8ada2c1c SR |
582 | { |
583 | struct task_struct *owning_process = NULL; | |
f27a0d50 | 584 | struct mm_struct *owning_mm = umem_odp->umem.owning_mm; |
8ada2c1c | 585 | struct page **local_page_list = NULL; |
403cd12e | 586 | u64 page_mask, off; |
d2183c6f JG |
587 | int j, k, ret = 0, start_idx, npages = 0; |
588 | unsigned int flags = 0, page_shift; | |
403cd12e | 589 | phys_addr_t p = 0; |
8ada2c1c SR |
590 | |
591 | if (access_mask == 0) | |
592 | return -EINVAL; | |
593 | ||
d2183c6f JG |
594 | if (user_virt < ib_umem_start(umem_odp) || |
595 | user_virt + bcnt > ib_umem_end(umem_odp)) | |
8ada2c1c SR |
596 | return -EFAULT; |
597 | ||
598 | local_page_list = (struct page **)__get_free_page(GFP_KERNEL); | |
599 | if (!local_page_list) | |
600 | return -ENOMEM; | |
601 | ||
d2183c6f | 602 | page_shift = umem_odp->page_shift; |
403cd12e AK |
603 | page_mask = ~(BIT(page_shift) - 1); |
604 | off = user_virt & (~page_mask); | |
605 | user_virt = user_virt & page_mask; | |
8ada2c1c SR |
606 | bcnt += off; /* Charge for the first page offset as well. */ |
607 | ||
f27a0d50 JG |
608 | /* |
609 | * owning_process is allowed to be NULL; this means the mm is somehow
610 | * living beyond the lifetime of the originating process. Presumably
611 | * mmget_not_zero will fail in this case. | |
612 | */ | |
613 | owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); | |
4438ee3f | 614 | if (!owning_process || !mmget_not_zero(owning_mm)) { |
8ada2c1c | 615 | ret = -EINVAL; |
8ada2c1c SR |
616 | goto out_put_task; |
617 | } | |
618 | ||
9beae1ea LS |
619 | if (access_mask & ODP_WRITE_ALLOWED_BIT) |
620 | flags |= FOLL_WRITE; | |
621 | ||
d2183c6f | 622 | start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; |
8ada2c1c SR |
623 | k = start_idx; |
624 | ||
625 | while (bcnt > 0) { | |
403cd12e AK |
626 | const size_t gup_num_pages = min_t(size_t, |
627 | (bcnt + BIT(page_shift) - 1) >> page_shift, | |
628 | PAGE_SIZE / sizeof(struct page *)); | |
8ada2c1c SR |
629 | |
630 | down_read(&owning_mm->mmap_sem); | |
631 | /* | |
632 | * Note: this might result in redundant page getting. We can
633 | * avoid this by checking dma_list to be 0 before calling
634 | * get_user_pages. However, this makes the code much more
635 | * complex (and doesn't gain us much performance in most use | |
636 | * cases). | |
637 | */ | |
1e987790 DH |
638 | npages = get_user_pages_remote(owning_process, owning_mm, |
639 | user_virt, gup_num_pages, | |
5b56d49f | 640 | flags, local_page_list, NULL, NULL); |
8ada2c1c SR |
641 | up_read(&owning_mm->mmap_sem); |
642 | ||
b02394aa MS |
643 | if (npages < 0) { |
644 | if (npages != -EAGAIN) | |
645 | pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); | |
646 | else | |
647 | pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); | |
8ada2c1c | 648 | break; |
b02394aa | 649 | } |
8ada2c1c SR |
650 | |
651 | bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); | |
b5231b01 | 652 | mutex_lock(&umem_odp->umem_mutex); |
403cd12e AK |
653 | for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { |
654 | if (user_virt & ~page_mask) { | |
655 | p += PAGE_SIZE; | |
656 | if (page_to_phys(local_page_list[j]) != p) { | |
657 | ret = -EFAULT; | |
658 | break; | |
659 | } | |
ea996974 | 660 | put_user_page(local_page_list[j]); |
403cd12e AK |
661 | continue; |
662 | } | |
663 | ||
8ada2c1c | 664 | ret = ib_umem_odp_map_dma_single_page( |
b5231b01 | 665 | umem_odp, k, local_page_list[j], |
403cd12e | 666 | access_mask, current_seq); |
b02394aa MS |
667 | if (ret < 0) { |
668 | if (ret != -EAGAIN) | |
669 | pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); | |
670 | else | |
671 | pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); | |
8ada2c1c | 672 | break; |
b02394aa | 673 | } |
403cd12e AK |
674 | |
675 | p = page_to_phys(local_page_list[j]); | |
8ada2c1c SR |
676 | k++; |
677 | } | |
b5231b01 | 678 | mutex_unlock(&umem_odp->umem_mutex); |
8ada2c1c SR |
679 | |
680 | if (ret < 0) { | |
75a3e6a3 | 681 | /* |
0c507d8f JH |
682 | * Release pages, remembering that the first page |
683 | * to hit an error was already released by | |
684 | * ib_umem_odp_map_dma_single_page(). | |
75a3e6a3 | 685 | */ |
0c507d8f | 686 | if (npages - (j + 1) > 0) |
ea996974 JH |
687 | put_user_pages(&local_page_list[j+1], |
688 | npages - (j + 1)); | |
8ada2c1c SR |
689 | break; |
690 | } | |
691 | } | |
692 | ||
693 | if (ret >= 0) { | |
694 | if (npages < 0 && k == start_idx) | |
695 | ret = npages; | |
696 | else | |
697 | ret = k - start_idx; | |
698 | } | |
699 | ||
700 | mmput(owning_mm); | |
701 | out_put_task: | |
f27a0d50 JG |
702 | if (owning_process) |
703 | put_task_struct(owning_process); | |
8ada2c1c SR |
704 | free_page((unsigned long)local_page_list); |
705 | return ret; | |
706 | } | |
707 | EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); | |
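/*
 * Illustrative sketch (editorial addition, not part of the file): the
 * calling convention described in the kernel-doc above.  A driver fault
 * handler samples notifiers_seq, maps the pages, and only installs the
 * translations in its own page tables if no invalidation ran in between,
 * which ib_umem_mmu_notifier_retry() checks under umem_mutex.  The
 * my_device_install_pages() helper is hypothetical; the rest is the real
 * API exported by this file and ib_umem_odp.h.
 */
static int my_fault_handler(struct ib_umem_odp *umem_odp, u64 io_virt,
			    size_t bcnt, u64 access_mask)
{
	unsigned long current_seq;
	int npages;

	current_seq = READ_ONCE(umem_odp->notifiers_seq);
	/* Make sure the sequence number is read before the pages are faulted. */
	smp_rmb();

	npages = ib_umem_odp_map_dma_pages(umem_odp, io_virt, bcnt,
					   access_mask, current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem_odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(umem_odp, current_seq))
		my_device_install_pages(umem_odp, io_virt, npages);
	mutex_unlock(&umem_odp->umem_mutex);

	return npages;
}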
708 | ||
b5231b01 | 709 | void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, |
8ada2c1c SR |
710 | u64 bound) |
711 | { | |
712 | int idx; | |
713 | u64 addr; | |
d2183c6f | 714 | struct ib_device *dev = umem_odp->umem.context->device; |
8ada2c1c | 715 | |
d2183c6f JG |
716 | virt = max_t(u64, virt, ib_umem_start(umem_odp)); |
717 | bound = min_t(u64, bound, ib_umem_end(umem_odp)); | |
882214e2 HE |
718 | /* Note that during the run of this function, the |
719 | * notifiers_count of the MR is > 0, preventing any racing | |
720 | * faults from completing. We might be racing with other
721 | * invalidations, so we must make sure we free each page only | |
722 | * once. */ | |
b5231b01 | 723 | mutex_lock(&umem_odp->umem_mutex); |
d2183c6f JG |
724 | for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { |
725 | idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; | |
b5231b01 JG |
726 | if (umem_odp->page_list[idx]) { |
727 | struct page *page = umem_odp->page_list[idx]; | |
728 | dma_addr_t dma = umem_odp->dma_list[idx]; | |
8ada2c1c SR |
729 | dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; |
730 | ||
731 | WARN_ON(!dma_addr); | |
732 | ||
dd82e668 JG |
733 | ib_dma_unmap_page(dev, dma_addr, |
734 | BIT(umem_odp->page_shift), | |
8ada2c1c | 735 | DMA_BIDIRECTIONAL); |
325ad061 GS |
736 | if (dma & ODP_WRITE_ALLOWED_BIT) { |
737 | struct page *head_page = compound_head(page); | |
882214e2 HE |
738 | /* |
739 | * set_page_dirty prefers being called with | |
740 | * the page lock. However, MMU notifiers are | |
741 | * called sometimes with and sometimes without | |
742 | * the lock. We rely on the umem_mutex instead | |
743 | * to prevent other mmu notifiers from | |
744 | * continuing and allowing the page mapping to | |
745 | * be removed. | |
746 | */ | |
747 | set_page_dirty(head_page); | |
325ad061 | 748 | } |
b5231b01 JG |
749 | umem_odp->page_list[idx] = NULL; |
750 | umem_odp->dma_list[idx] = 0; | |
d10bcf94 | 751 | umem_odp->npages--; |
8ada2c1c | 752 | } |
8ada2c1c | 753 | } |
b5231b01 | 754 | mutex_unlock(&umem_odp->umem_mutex); |
8ada2c1c SR |
755 | } |
756 | EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); | |
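/*
 * Illustrative sketch (editorial addition, not part of the file): a minimal
 * invalidate_range callback for a hypothetical driver.  The device-specific
 * my_zap_device_ptes() stands in for whatever removes the stale translations
 * from the HW page tables; once the device can no longer use them, the
 * umem-side bookkeeping is undone with ib_umem_odp_unmap_dma_pages().
 */
static void my_invalidate_range(struct ib_umem_odp *umem_odp,
				unsigned long start, unsigned long end)
{
	/* Stop the device from using the affected translations first. */
	my_zap_device_ptes(umem_odp, start, end);

	/* Then unpin, mark dirty as needed, and DMA-unmap the pages. */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
}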
fec99ede LR |
757 | |
758 | /* @last is not a part of the interval. See comment for function | |
759 | * node_last. | |
760 | */ | |
761 | int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, | |
762 | u64 start, u64 last, | |
763 | umem_call_back cb, | |
93065ac7 | 764 | bool blockable, |
fec99ede LR |
765 | void *cookie) |
766 | { | |
767 | int ret_val = 0; | |
768 | struct umem_odp_node *node, *next; | |
769 | struct ib_umem_odp *umem; | |
770 | ||
771 | if (unlikely(start == last)) | |
772 | return ret_val; | |
773 | ||
774 | for (node = rbt_ib_umem_iter_first(root, start, last - 1); | |
775 | node; node = next) { | |
93065ac7 MH |
776 | /* TODO move the blockable decision up to the callback */ |
777 | if (!blockable) | |
778 | return -EAGAIN; | |
fec99ede LR |
779 | next = rbt_ib_umem_iter_next(node, start, last - 1); |
780 | umem = container_of(node, struct ib_umem_odp, interval_tree); | |
b5231b01 | 781 | ret_val = cb(umem, start, last, cookie) || ret_val; |
fec99ede LR |
782 | } |
783 | ||
784 | return ret_val; | |
785 | } | |
786 | EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); | |
787 | ||
788 | struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, | |
789 | u64 addr, u64 length) | |
790 | { | |
791 | struct umem_odp_node *node; | |
792 | ||
793 | node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); | |
794 | if (node) | |
795 | return container_of(node, struct ib_umem_odp, interval_tree); | |
796 | return NULL; | |
797 | ||
798 | } | |
799 | EXPORT_SYMBOL(rbt_ib_umem_lookup); |