Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2
[linux-block.git] / mm / madvise.c
CommitLineData
1da177e4
LT
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */
7
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
1da177e4 18
0a27a14a
NP
/*
 * Decide which flavour of mmap_sem the madvise() caller has to take.
 *
 * Behaviours that end up modifying vma->vm_flags need mmap_sem held for
 * writing; behaviours that only walk the vmas get away with holding it
 * for reading.
 *
 * Returns 0 when the read lock is sufficient, 1 when the write lock is
 * required.
 */
static int madvise_need_mmap_write(int behavior)
{
	/*
	 * Only the behaviours listed here are known to be read-only.
	 * Anything else, including values added in the future, falls
	 * through to the safe answer below.
	 */
	if (behavior == MADV_REMOVE ||
	    behavior == MADV_WILLNEED ||
	    behavior == MADV_DONTNEED)
		return 0;

	return 1;
}
36
1da177e4
LT
37/*
38 * We can potentially split a vm area into separate
39 * areas, each area with its own behavior.
40 */
05b74384
PM
41static long madvise_behavior(struct vm_area_struct * vma,
42 struct vm_area_struct **prev,
43 unsigned long start, unsigned long end, int behavior)
1da177e4
LT
44{
45 struct mm_struct * mm = vma->vm_mm;
46 int error = 0;
05b74384 47 pgoff_t pgoff;
3866ea90 48 unsigned long new_flags = vma->vm_flags;
e798c6e8
PM
49
50 switch (behavior) {
f8225661
MT
51 case MADV_NORMAL:
52 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
53 break;
e798c6e8 54 case MADV_SEQUENTIAL:
f8225661 55 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e8
PM
56 break;
57 case MADV_RANDOM:
f8225661 58 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e8 59 break;
f8225661
MT
60 case MADV_DONTFORK:
61 new_flags |= VM_DONTCOPY;
62 break;
63 case MADV_DOFORK:
3866ea90
HD
64 if (vma->vm_flags & VM_IO) {
65 error = -EINVAL;
66 goto out;
67 }
f8225661 68 new_flags &= ~VM_DONTCOPY;
e798c6e8 69 break;
accb61fe
JB
70 case MADV_DONTDUMP:
71 new_flags |= VM_NODUMP;
72 break;
73 case MADV_DODUMP:
74 new_flags &= ~VM_NODUMP;
75 break;
f8af4da3
HD
76 case MADV_MERGEABLE:
77 case MADV_UNMERGEABLE:
78 error = ksm_madvise(vma, start, end, behavior, &new_flags);
79 if (error)
80 goto out;
81 break;
0af4e98b 82 case MADV_HUGEPAGE:
a664b2d8 83 case MADV_NOHUGEPAGE:
60ab3244 84 error = hugepage_madvise(vma, &new_flags, behavior);
0af4e98b
AA
85 if (error)
86 goto out;
87 break;
e798c6e8
PM
88 }
89
05b74384
PM
90 if (new_flags == vma->vm_flags) {
91 *prev = vma;
836d5ffd 92 goto out;
05b74384
PM
93 }
94
95 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
96 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
97 vma->vm_file, pgoff, vma_policy(vma));
98 if (*prev) {
99 vma = *prev;
100 goto success;
101 }
102
103 *prev = vma;
1da177e4
LT
104
105 if (start != vma->vm_start) {
106 error = split_vma(mm, vma, start, 1);
107 if (error)
108 goto out;
109 }
110
111 if (end != vma->vm_end) {
112 error = split_vma(mm, vma, end, 0);
113 if (error)
114 goto out;
115 }
116
836d5ffd 117success:
1da177e4
LT
118 /*
119 * vm_flags is protected by the mmap_sem held in write mode.
120 */
e798c6e8 121 vma->vm_flags = new_flags;
1da177e4
LT
122
123out:
124 if (error == -ENOMEM)
125 error = -EAGAIN;
126 return error;
127}
128
129/*
130 * Schedule all required I/O operations. Do not wait for completion.
131 */
132static long madvise_willneed(struct vm_area_struct * vma,
05b74384 133 struct vm_area_struct ** prev,
1da177e4
LT
134 unsigned long start, unsigned long end)
135{
136 struct file *file = vma->vm_file;
137
1bef4003
S
138 if (!file)
139 return -EBADF;
140
70688e4d 141 if (file->f_mapping->a_ops->get_xip_mem) {
fe77ba6f
CO
142 /* no bad return value, but ignore advice */
143 return 0;
144 }
145
05b74384 146 *prev = vma;
1da177e4
LT
147 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
148 if (end > vma->vm_end)
149 end = vma->vm_end;
150 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
151
f7e839dd 152 force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4
LT
153 return 0;
154}
155
156/*
157 * Application no longer needs these pages. If the pages are dirty,
158 * it's OK to just throw them away. The app will be more careful about
159 * data it wants to keep. Be sure to free swap resources too. The
7e6cbea3 160 * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4
LT
161 * these pages later if no one else has touched them in the meantime,
162 * although we could add these pages to a global reuse list for
7e6cbea3 163 * shrink_active_list to pick up before reclaiming other pages.
1da177e4
LT
164 *
165 * NB: This interface discards data rather than pushes it out to swap,
166 * as some implementations do. This has performance implications for
167 * applications like large transactional databases which want to discard
168 * pages in anonymous maps after committing to backing store the data
169 * that was kept in them. There is no reason to write this data out to
170 * the swap area if the application is discarding it.
171 *
172 * An interface that causes the system to free clean pages and flush
173 * dirty pages is already available as msync(MS_INVALIDATE).
174 */
175static long madvise_dontneed(struct vm_area_struct * vma,
05b74384 176 struct vm_area_struct ** prev,
1da177e4
LT
177 unsigned long start, unsigned long end)
178{
05b74384 179 *prev = vma;
6aab341e 180 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4
LT
181 return -EINVAL;
182
183 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
184 struct zap_details details = {
185 .nonlinear_vma = vma,
186 .last_index = ULONG_MAX,
187 };
188 zap_page_range(vma, start, end - start, &details);
189 } else
190 zap_page_range(vma, start, end - start, NULL);
191 return 0;
192}
193
f6b3ec23
BP
194/*
195 * Application wants to free up the pages and associated backing store.
196 * This is effectively punching a hole into the middle of a file.
197 *
198 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
199 * Other filesystems return -ENOSYS.
200 */
201static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d 202 struct vm_area_struct **prev,
f6b3ec23
BP
203 unsigned long start, unsigned long end)
204{
3f31d075 205 loff_t offset;
90ed52eb 206 int error;
f6b3ec23 207
90ed52eb 208 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
00e9fa2d 209
f6b3ec23
BP
210 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
211 return -EINVAL;
212
213 if (!vma->vm_file || !vma->vm_file->f_mapping
214 || !vma->vm_file->f_mapping->host) {
215 return -EINVAL;
216 }
217
69cf0fac
HD
218 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
219 return -EACCES;
220
f6b3ec23
BP
221 offset = (loff_t)(start - vma->vm_start)
222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52eb 223
3f31d075 224 /* filesystem's fallocate may need to take i_mutex */
0a27a14a 225 up_read(&current->mm->mmap_sem);
3f31d075
HD
226 error = do_fallocate(vma->vm_file,
227 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
228 offset, end - start);
0a27a14a 229 down_read(&current->mm->mmap_sem);
90ed52eb 230 return error;
f6b3ec23
BP
231}
232
9893e49d
AK
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 *
 * Walks [start, end) one page at a time, pins each page and either
 * soft-offlines it (MADV_SOFT_OFFLINE) or injects a memory failure for
 * it (any other @bhv the caller routes here).
 *
 * Returns -EPERM without CAP_SYS_ADMIN, the get_user_pages_fast() result
 * when a page cannot be pinned, the soft_offline_page() error for the
 * first page that fails to soft-offline, and 0 otherwise.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	for (; start < end; start += PAGE_SIZE) {
		struct page *p;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &p);
		if (ret != 1)
			return ret;
		if (bhv == MADV_SOFT_OFFLINE) {
			printk(KERN_INFO "Soft offlining page %lx at %lx\n",
				page_to_pfn(p), start);
			ret = soft_offline_page(p, MF_COUNT_INCREASED);
			/*
			 * Hand the failure back to the caller.  There is a
			 * single "ret" in this function on purpose: a second,
			 * shadowed one would let this error be replaced by 0.
			 */
			if (ret)
				return ret;
			continue;
		}
		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
		       page_to_pfn(p), start);
		/* Ignore return value for now */
		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
	}
	return 0;
}
#endif
264
165cd402 265static long
266madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
267 unsigned long start, unsigned long end, int behavior)
1da177e4 268{
1da177e4 269 switch (behavior) {
f6b3ec23 270 case MADV_REMOVE:
3866ea90 271 return madvise_remove(vma, prev, start, end);
1da177e4 272 case MADV_WILLNEED:
3866ea90 273 return madvise_willneed(vma, prev, start, end);
1da177e4 274 case MADV_DONTNEED:
3866ea90 275 return madvise_dontneed(vma, prev, start, end);
1da177e4 276 default:
3866ea90 277 return madvise_behavior(vma, prev, start, end, behavior);
1da177e4 278 }
1da177e4
LT
279}
280
75927af8
NP
/*
 * Tell whether @behavior is an advice value this file handles.
 *
 * The KSM and transparent-hugepage values are only accepted when the
 * corresponding CONFIG option is enabled.
 *
 * Returns 1 for a supported value, 0 otherwise.
 */
static int
madvise_behavior_valid(int behavior)
{
	int valid = 0;

	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
		valid = 1;
		break;

	default:
		break;
	}

	return valid;
}
3866ea90 309
1da177e4
LT
310/*
311 * The madvise(2) system call.
312 *
313 * Applications can use madvise() to advise the kernel how it should
314 * handle paging I/O in this VM area. The idea is to help the kernel
315 * use appropriate read-ahead and caching techniques. The information
316 * provided is advisory only, and can be safely disregarded by the
317 * kernel without affecting the correct operation of the application.
318 *
319 * behavior values:
320 * MADV_NORMAL - the default behavior is to read clusters. This
321 * results in some read-ahead and read-behind.
322 * MADV_RANDOM - the system should read the minimum amount of data
323 * on any access, since it is unlikely that the appli-
324 * cation will need more than what it asks for.
325 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
326 * once, so they can be aggressively read ahead, and
327 * can be freed soon after they are accessed.
328 * MADV_WILLNEED - the application is notifying the system to read
329 * some pages ahead.
330 * MADV_DONTNEED - the application is finished with the given range,
331 * so the kernel can free resources associated with it.
f6b3ec23
BP
332 * MADV_REMOVE - the application wants to free up the given range of
333 * pages and associated backing store.
3866ea90
HD
334 * MADV_DONTFORK - omit this area from child's address space when forking:
335 * typically, to avoid COWing pages pinned by get_user_pages().
336 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
f8af4da3
HD
337 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
338 * this area with pages of identical content from other such areas.
339 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1da177e4
LT
340 *
341 * return values:
342 * zero - success
343 * -EINVAL - start + len < 0, start is not page-aligned,
344 * "behavior" is not a valid value, or application
345 * is attempting to release locked or shared pages.
346 * -ENOMEM - addresses in the specified range are not currently
347 * mapped, or are outside the AS of the process.
348 * -EIO - an I/O error occurred while paging in data.
349 * -EBADF - map exists, but area maps something that isn't a file.
350 * -EAGAIN - a kernel resource was temporarily unavailable.
351 */
3480b257 352SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1da177e4 353{
05b74384
PM
354 unsigned long end, tmp;
355 struct vm_area_struct * vma, *prev;
1da177e4
LT
356 int unmapped_error = 0;
357 int error = -EINVAL;
f7977793 358 int write;
1da177e4
LT
359 size_t len;
360
9893e49d 361#ifdef CONFIG_MEMORY_FAILURE
afcf938e
AK
362 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
363 return madvise_hwpoison(behavior, start, start+len_in);
9893e49d 364#endif
75927af8
NP
365 if (!madvise_behavior_valid(behavior))
366 return error;
367
f7977793
JB
368 write = madvise_need_mmap_write(behavior);
369 if (write)
0a27a14a
NP
370 down_write(&current->mm->mmap_sem);
371 else
372 down_read(&current->mm->mmap_sem);
1da177e4
LT
373
374 if (start & ~PAGE_MASK)
375 goto out;
376 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
377
378 /* Check to see whether len was rounded up from small -ve to zero */
379 if (len_in && !len)
380 goto out;
381
382 end = start + len;
383 if (end < start)
384 goto out;
385
386 error = 0;
387 if (end == start)
388 goto out;
389
390 /*
391 * If the interval [start,end) covers some unmapped address
392 * ranges, just ignore them, but return -ENOMEM at the end.
05b74384 393 * - different from the way of handling in mlock etc.
1da177e4 394 */
05b74384 395 vma = find_vma_prev(current->mm, start, &prev);
836d5ffd
HD
396 if (vma && start > vma->vm_start)
397 prev = vma;
398
1da177e4
LT
399 for (;;) {
400 /* Still start < end. */
401 error = -ENOMEM;
402 if (!vma)
403 goto out;
404
05b74384 405 /* Here start < (end|vma->vm_end). */
1da177e4
LT
406 if (start < vma->vm_start) {
407 unmapped_error = -ENOMEM;
408 start = vma->vm_start;
05b74384
PM
409 if (start >= end)
410 goto out;
1da177e4
LT
411 }
412
05b74384
PM
413 /* Here vma->vm_start <= start < (end|vma->vm_end) */
414 tmp = vma->vm_end;
415 if (end < tmp)
416 tmp = end;
1da177e4 417
05b74384
PM
418 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
419 error = madvise_vma(vma, &prev, start, tmp, behavior);
1da177e4
LT
420 if (error)
421 goto out;
05b74384 422 start = tmp;
90ed52eb 423 if (prev && start < prev->vm_end)
05b74384
PM
424 start = prev->vm_end;
425 error = unmapped_error;
426 if (start >= end)
427 goto out;
90ed52eb
HD
428 if (prev)
429 vma = prev->vm_next;
430 else /* madvise_remove dropped mmap_sem */
431 vma = find_vma(current->mm, start);
1da177e4 432 }
1da177e4 433out:
f7977793 434 if (write)
0a27a14a
NP
435 up_write(&current->mm->mmap_sem);
436 else
437 up_read(&current->mm->mmap_sem);
438
1da177e4
LT
439 return error;
440}