/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <asm/uaccess.h>

#include <linux/fs.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_lite.h>
#include "llite_internal.h"
#include <linux/lustre_compat25.h>

struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int *type);

static struct vm_operations_struct ll_file_vm_ops;

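/**
 * Fill in the ldlm extent lock policy for \a count bytes of a mapped file
 * starting at user address \a addr: as computed by the masking arithmetic
 * below, the extent start is the file offset backing \a addr rounded down
 * to a page boundary, and the extent end is the last byte of the range
 * rounded up to the end of its page.
 */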
void policy_from_vma(ldlm_policy_data_t *policy,
                     struct vm_area_struct *vma, unsigned long addr,
                     size_t count)
{
        policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
                                 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
        policy->l_extent.end = (policy->l_extent.start + count - 1) |
                               ~CFS_PAGE_MASK;
}

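/**
 * Find the first vma in [addr, addr + count) that is a shared (VM_SHARED)
 * mapping of a Lustre file, i.e. one whose vm_ops is ll_file_vm_ops;
 * return NULL if there is none. The caller must hold mmap_sem.
 */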
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
                               size_t count)
{
        struct vm_area_struct *vma, *ret = NULL;

        /* mmap_sem must have been held by caller. */
        LASSERT(!down_write_trylock(&mm->mmap_sem));

        for (vma = find_vma(mm, addr);
             vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
                if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
                    vma->vm_flags & VM_SHARED) {
                        ret = vma;
                        break;
                }
        }
        return ret;
}

/**
 * API-independent part of page fault initialization.
 * \param vma - virtual memory area the fault occurred in
 * \param env_ret - corresponding lu_env for processing
 * \param nest - nesting level
 * \param index - page index corresponding to the fault
 * \param ra_flags - vma readahead flags
 *
 * \return allocated and initialized cl_io for the fault operation
 * \retval -EINVAL if the env cannot be allocated
 * \return other error codes from cl_io_init
 */
struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
                               struct lu_env **env_ret,
                               struct cl_env_nest *nest,
                               pgoff_t index, unsigned long *ra_flags)
{
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct cl_io *io;
        struct cl_fault_io *fio;
        struct lu_env *env;
        int rc;

        *env_ret = NULL;
        if (ll_file_nolock(file))
                return ERR_PTR(-EOPNOTSUPP);

        /*
         * A page fault can happen when Lustre IO is already active for the
         * current thread, e.g., when doing read/write against a user-level
         * buffer mapped from a Lustre file. To avoid stomping on the
         * existing context, force the allocation of a nested one.
         */
        env = cl_env_nested_get(nest);
        if (IS_ERR(env))
                return ERR_PTR(-EINVAL);

        *env_ret = env;

        io = ccc_env_thread_io(env);
        io->ci_obj = ll_i2info(inode)->lli_clob;
        LASSERT(io->ci_obj != NULL);

        fio = &io->u.ci_fault;
        fio->ft_index = index;
        fio->ft_executable = vma->vm_flags & VM_EXEC;

        /*
         * Disable VM_SEQ_READ and use VM_RAND_READ to make sure that
         * the kernel will not read other pages not covered by ldlm in
         * filemap_nopage. We do our own readahead in ll_readpage.
         */
        if (ra_flags != NULL)
                *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
        vma->vm_flags &= ~VM_SEQ_READ;
        vma->vm_flags |= VM_RAND_READ;

        CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
               fio->ft_index, fio->ft_executable);

        rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
        if (rc == 0) {
                struct ccc_io *cio = ccc_env_io(env);
                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

                LASSERT(cio->cui_cl.cis_io == io);

                /* mmap lock must be MANDATORY because it has to cache
                 * pages. */
                io->ci_lockreq = CILR_MANDATORY;
                cio->cui_fd = fd;
        } else {
                LASSERT(rc < 0);
                cl_io_fini(env, io);
                cl_env_nested_put(nest, env);
                io = ERR_PTR(rc);
        }

        return io;
}

/* Shared code of the page_mkwrite() method for the RHEL5 and RHEL6 kernel
 * APIs. */
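/*
 * Summary (from the code below): drives a CIT_FAULT cl_io with
 * ft_mkwrite/ft_writable set to make \a vmpage writable, taking
 * lli_trunc_sem to exclude a concurrent truncate. On success the page is
 * returned locked and LLIF_DATA_MODIFIED is set on the inode; if the page
 * was cleaned while unlocked, *retry is set and -EAGAIN is returned so
 * the caller retries.
 */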
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
                            bool *retry)
{
        struct lu_env *env;
        struct cl_io *io;
        struct vvp_io *vio;
        struct cl_env_nest nest;
        int result;
        sigset_t set;
        struct inode *inode;
        struct ll_inode_info *lli;

        LASSERT(vmpage != NULL);

        io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
        if (IS_ERR(io))
                GOTO(out, result = PTR_ERR(io));

        result = io->ci_result;
        if (result < 0)
                GOTO(out_io, result);

        io->u.ci_fault.ft_mkwrite = 1;
        io->u.ci_fault.ft_writable = 1;

        vio = vvp_env_io(env);
        vio->u.fault.ft_vma = vma;
        vio->u.fault.ft_vmpage = vmpage;

        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

        /* We grab lli_trunc_sem to exclude a concurrent truncate;
         * otherwise we could add dirty pages into the osc cache
         * while a truncate is in progress. */
        inode = ccc_object_inode(io->ci_obj);
        lli = ll_i2info(inode);
        down_read(&lli->lli_trunc_sem);

        result = cl_io_loop(env, io);

        up_read(&lli->lli_trunc_sem);

        cfs_restore_sigs(set);

        if (result == 0) {
                struct inode *inode = vma->vm_file->f_dentry->d_inode;
                struct ll_inode_info *lli = ll_i2info(inode);

                lock_page(vmpage);
                if (vmpage->mapping == NULL) {
                        unlock_page(vmpage);

                        /* The page was truncated and the lock was cancelled;
                         * return -ENODATA so that VM_FAULT_NOPAGE will be
                         * returned to handle_mm_fault(). */
                        if (result == 0)
                                result = -ENODATA;
                } else if (!PageDirty(vmpage)) {
                        /* Race: the page was cleaned by ptlrpcd after it was
                         * unlocked. It has to be added to the dirty cache
                         * again, otherwise this soon-to-be-dirty page won't
                         * consume any grant; worse, if the page is being
                         * transferred, dirtying it now would break the RPC
                         * checksum.
                         */
                        unlock_page(vmpage);

                        CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
                               vmpage, vmpage->index);

                        *retry = true;
                        result = -EAGAIN;
                }

                if (result == 0) {
                        spin_lock(&lli->lli_lock);
                        lli->lli_flags |= LLIF_DATA_MODIFIED;
                        spin_unlock(&lli->lli_lock);
                }
        }

out_io:
        cl_io_fini(env, io);
        cl_env_nested_put(&nest, env);
out:
        CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
        LASSERT(ergo(result == 0, PageLocked(vmpage)));

        return result;
}

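/* Translate a cl_io result code into the VM_FAULT_* code expected by the
 * VM fault handlers. */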
static inline int to_fault_error(int result)
{
        switch (result) {
        case 0:
                result = VM_FAULT_LOCKED;
                break;
        case -EFAULT:
                result = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                result = VM_FAULT_OOM;
                break;
        default:
                result = VM_FAULT_SIGBUS;
                break;
        }
        return result;
}

/**
 * Lustre implementation of the vm_operations_struct::fault() method,
 * called by the VM to serve a page fault (for both kernel and user space
 * addresses).
 *
 * \param vma - virtual memory area related to the page fault
 * \param vmf - structure describing the fault type and address
 *
 * \return allocated and filled-in, locked page for the faulting address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM if there is not enough memory to allocate a new page
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct lu_env *env;
        struct cl_io *io;
        struct vvp_io *vio = NULL;
        struct page *vmpage;
        unsigned long ra_flags;
        struct cl_env_nest nest;
        int result;
        int fault_ret = 0;

        io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
        if (IS_ERR(io))
                return to_fault_error(PTR_ERR(io));

        result = io->ci_result;
        if (result == 0) {
                vio = vvp_env_io(env);
                vio->u.fault.ft_vma = vma;
                vio->u.fault.ft_vmpage = NULL;
                vio->u.fault.fault.ft_vmf = vmf;

                result = cl_io_loop(env, io);

                fault_ret = vio->u.fault.fault.ft_flags;
                vmpage = vio->u.fault.ft_vmpage;
                if (result != 0 && vmpage != NULL) {
                        page_cache_release(vmpage);
                        vmf->page = NULL;
                }
        }
        cl_io_fini(env, io);
        cl_env_nested_put(&nest, env);

        vma->vm_flags |= ra_flags;
        if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
                fault_ret |= to_fault_error(result);

        CDEBUG(D_MMAP, "%s fault %d/%d\n",
               current->comm, fault_ret, result);
        return fault_ret;
}

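/**
 * Entry point for the vm_operations_struct::fault() method: blocks all
 * signals except SIGKILL and SIGTERM for the duration of the fault, and
 * restarts ll_fault0() if the page was truncated while being faulted in.
 */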
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        int count = 0;
        bool printed = false;
        int result;
        sigset_t set;

        /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite,
         * so the process can be killed by the admin but other signals do
         * not cause a segfault. */
        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

restart:
        result = ll_fault0(vma, vmf);
        LASSERT(!(result & VM_FAULT_LOCKED));
        if (result == 0) {
                struct page *vmpage = vmf->page;

                /* check if this page has been truncated */
                lock_page(vmpage);
                if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
                        unlock_page(vmpage);
                        page_cache_release(vmpage);
                        vmf->page = NULL;

                        if (!printed && ++count > 16) {
                                CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
                                      current->comm);
                                printed = true;
                        }

                        goto restart;
                }

                result = VM_FAULT_LOCKED;
        }
        cfs_restore_sigs(set);
        return result;
}

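/**
 * Entry point for the vm_operations_struct::page_mkwrite() method: calls
 * ll_page_mkwrite0() until it stops asking for a retry, then maps its
 * result onto the VM_FAULT_* codes expected by the VM.
 */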
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        int count = 0;
        bool printed = false;
        bool retry;
        int result;

        do {
                retry = false;
                result = ll_page_mkwrite0(vma, vmf->page, &retry);

                if (!printed && ++count > 16) {
                        CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n",
                              current->comm, vmf->pgoff,
                              vma->vm_file->f_dentry->d_inode->i_ino);
                        printed = true;
                }
        } while (retry);

        switch (result) {
        case 0:
                LASSERT(PageLocked(vmf->page));
                result = VM_FAULT_LOCKED;
                break;
        case -ENODATA:
        case -EFAULT:
                result = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                result = VM_FAULT_OOM;
                break;
        case -EAGAIN:
                result = VM_FAULT_RETRY;
                break;
        default:
                result = VM_FAULT_SIGBUS;
                break;
        }

        return result;
}

/**
 * To avoid cancelling the locks that cover a mmapped region under lock
 * cache pressure, we track the number of mapped vmas in
 * ccc_object::cob_mmap_cnt.
 */
static void ll_vm_open(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ccc_object *vob = cl_inode2ccc(inode);

        LASSERT(vma->vm_file);
        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
        atomic_inc(&vob->cob_mmap_cnt);
}

/**
 * Dual to ll_vm_open().
 */
static void ll_vm_close(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ccc_object *vob = cl_inode2ccc(inode);

        LASSERT(vma->vm_file);
        atomic_dec(&vob->cob_mmap_cnt);
        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
}

/* return the user space pointer that maps to a file offset via a vma */
static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
{
        return vma->vm_start + (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
}

/* Unmap the user-space ptes covering the byte range [first, last] of this
 * mapping so that later accesses refault. Returns 0 if the mapping was
 * mapped, -ENOENT otherwise. (The original XXX note here asked for a
 * comment on __free_pte -> dirty pages and nopage's reference passing to
 * the pte.) */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
        int rc = -ENOENT;

        LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
        if (mapping_mapped(mapping)) {
                rc = 0;
                unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
                                    last - first + 1, 0);
        }

        return rc;
}

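/* VM operations installed on Lustre file mappings by ll_file_mmap(). */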
static struct vm_operations_struct ll_file_vm_ops = {
        .fault          = ll_fault,
        .page_mkwrite   = ll_page_mkwrite,
        .open           = ll_vm_open,
        .close          = ll_vm_close,
};

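/**
 * Lustre implementation of the mmap() file operation: refuses to map
 * "nolock" files, delegates to generic_file_mmap(), installs
 * ll_file_vm_ops, and glimpses the file size so the mapping starts with
 * up-to-date size and mtime.
 */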
int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_dentry->d_inode;
        int rc;

        if (ll_file_nolock(file))
                return -EOPNOTSUPP;

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
        rc = generic_file_mmap(file, vma);
        if (rc == 0) {
                vma->vm_ops = &ll_file_vm_ops;
                vma->vm_ops->open(vma);
                /* update the inode's size and mtime */
                rc = ll_glimpse_size(inode);
        }

        return rc;
}