1 /*
2  *      linux/mm/madvise.c
3  *
4  * Copyright (C) 1999  Linus Torvalds
5  * Copyright (C) 2002  Christoph Hellwig
6  */
7
8 #include <linux/mman.h>
9 #include <linux/pagemap.h>
10 #include <linux/syscalls.h>
11 #include <linux/mempolicy.h>
12 #include <linux/page-isolation.h>
13 #include <linux/userfaultfd_k.h>
14 #include <linux/hugetlb.h>
15 #include <linux/falloc.h>
16 #include <linux/sched.h>
17 #include <linux/ksm.h>
18 #include <linux/fs.h>
19 #include <linux/file.h>
20 #include <linux/blkdev.h>
21 #include <linux/backing-dev.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24 #include <linux/mmu_notifier.h>
25
26 #include <asm/tlb.h>
27
28 #include "internal.h"
29
30 /*
31  * Any behaviour which results in changes to the vma->vm_flags needs to
32  * take mmap_sem for writing. Others, which simply traverse vmas, need
33  * to only take it for reading.
34  */
35 static int madvise_need_mmap_write(int behavior)
36 {
37         switch (behavior) {
38         case MADV_REMOVE:
39         case MADV_WILLNEED:
40         case MADV_DONTNEED:
41         case MADV_FREE:
42                 return 0;
43         default:
44                 /* be safe, default to 1. list exceptions explicitly */
45                 return 1;
46         }
47 }
48
49 /*
50  * We can potentially split a vm area into separate
51  * areas, each area with its own behavior.
52  */
53 static long madvise_behavior(struct vm_area_struct *vma,
54                      struct vm_area_struct **prev,
55                      unsigned long start, unsigned long end, int behavior)
56 {
57         struct mm_struct *mm = vma->vm_mm;
58         int error = 0;
59         pgoff_t pgoff;
60         unsigned long new_flags = vma->vm_flags;
61
62         switch (behavior) {
63         case MADV_NORMAL:
64                 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
65                 break;
66         case MADV_SEQUENTIAL:
67                 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
68                 break;
69         case MADV_RANDOM:
70                 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
71                 break;
72         case MADV_DONTFORK:
73                 new_flags |= VM_DONTCOPY;
74                 break;
75         case MADV_DOFORK:
76                 if (vma->vm_flags & VM_IO) {
77                         error = -EINVAL;
78                         goto out;
79                 }
80                 new_flags &= ~VM_DONTCOPY;
81                 break;
82         case MADV_DONTDUMP:
83                 new_flags |= VM_DONTDUMP;
84                 break;
85         case MADV_DODUMP:
86                 if (new_flags & VM_SPECIAL) {
87                         error = -EINVAL;
88                         goto out;
89                 }
90                 new_flags &= ~VM_DONTDUMP;
91                 break;
92         case MADV_MERGEABLE:
93         case MADV_UNMERGEABLE:
94                 error = ksm_madvise(vma, start, end, behavior, &new_flags);
95                 if (error)
96                         goto out;
97                 break;
98         case MADV_HUGEPAGE:
99         case MADV_NOHUGEPAGE:
100                 error = hugepage_madvise(vma, &new_flags, behavior);
101                 if (error)
102                         goto out;
103                 break;
104         }
105
106         if (new_flags == vma->vm_flags) {
107                 *prev = vma;
108                 goto out;
109         }
110
111         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
112         *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
113                           vma->vm_file, pgoff, vma_policy(vma),
114                           vma->vm_userfaultfd_ctx);
115         if (*prev) {
116                 vma = *prev;
117                 goto success;
118         }
119
120         *prev = vma;
121
122         if (start != vma->vm_start) {
123                 error = split_vma(mm, vma, start, 1);
124                 if (error)
125                         goto out;
126         }
127
128         if (end != vma->vm_end) {
129                 error = split_vma(mm, vma, end, 0);
130                 if (error)
131                         goto out;
132         }
133
134 success:
135         /*
136          * vm_flags is protected by the mmap_sem held in write mode.
137          */
138         vma->vm_flags = new_flags;
139
140 out:
141         if (error == -ENOMEM)
142                 error = -EAGAIN;
143         return error;
144 }
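
/*
 * Illustrative userspace sketch (not part of this file), assuming a Linux
 * system with procfs: advising only the middle page of a three-page
 * anonymous mapping changes vm_flags for that sub-range, so the
 * split_vma() path above runs and the one mapping shows up as three
 * separate VMAs in /proc/self/maps.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Change vm_flags for the middle page only: the VMA must split. */
	if (madvise(p + page, page, MADV_DONTFORK)) {
		perror("madvise");
		return 1;
	}

	/* The single mapping is now three VMAs; inspect the split by eye. */
	system("cat /proc/self/maps");

	munmap(p, 3 * page);
	return 0;
}
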
145
146 #ifdef CONFIG_SWAP
147 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
148         unsigned long end, struct mm_walk *walk)
149 {
150         pte_t *orig_pte;
151         struct vm_area_struct *vma = walk->private;
152         unsigned long index;
153
154         if (pmd_none_or_trans_huge_or_clear_bad(pmd))
155                 return 0;
156
157         for (index = start; index != end; index += PAGE_SIZE) {
158                 pte_t pte;
159                 swp_entry_t entry;
160                 struct page *page;
161                 spinlock_t *ptl;
162
163                 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
164                 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
165                 pte_unmap_unlock(orig_pte, ptl);
166
167                 if (pte_present(pte) || pte_none(pte))
168                         continue;
169                 entry = pte_to_swp_entry(pte);
170                 if (unlikely(non_swap_entry(entry)))
171                         continue;
172
173                 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
174                                                                 vma, index);
175                 if (page)
176                         put_page(page);
177         }
178
179         return 0;
180 }
181
182 static void force_swapin_readahead(struct vm_area_struct *vma,
183                 unsigned long start, unsigned long end)
184 {
185         struct mm_walk walk = {
186                 .mm = vma->vm_mm,
187                 .pmd_entry = swapin_walk_pmd_entry,
188                 .private = vma,
189         };
190
191         walk_page_range(start, end, &walk);
192
193         lru_add_drain();        /* Push any new pages onto the LRU now */
194 }
195
196 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
197                 unsigned long start, unsigned long end,
198                 struct address_space *mapping)
199 {
200         pgoff_t index;
201         struct page *page;
202         swp_entry_t swap;
203
204         for (; start < end; start += PAGE_SIZE) {
205                 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
206
207                 page = find_get_entry(mapping, index);
208                 if (!radix_tree_exceptional_entry(page)) {
209                         if (page)
210                                 put_page(page);
211                         continue;
212                 }
213                 swap = radix_to_swp_entry(page);
214                 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
215                                                                 NULL, 0);
216                 if (page)
217                         put_page(page);
218         }
219
220         lru_add_drain();        /* Push any new pages onto the LRU now */
221 }
222 #endif          /* CONFIG_SWAP */
223
224 /*
225  * Schedule all required I/O operations.  Do not wait for completion.
226  */
227 static long madvise_willneed(struct vm_area_struct *vma,
228                              struct vm_area_struct **prev,
229                              unsigned long start, unsigned long end)
230 {
231         struct file *file = vma->vm_file;
232
233 #ifdef CONFIG_SWAP
234         if (!file) {
235                 *prev = vma;
236                 force_swapin_readahead(vma, start, end);
237                 return 0;
238         }
239
240         if (shmem_mapping(file->f_mapping)) {
241                 *prev = vma;
242                 force_shm_swapin_readahead(vma, start, end,
243                                         file->f_mapping);
244                 return 0;
245         }
246 #else
247         if (!file)
248                 return -EBADF;
249 #endif
250
251         if (IS_DAX(file_inode(file))) {
252                 /* no bad return value, but ignore advice */
253                 return 0;
254         }
255
256         *prev = vma;
257         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
258         if (end > vma->vm_end)
259                 end = vma->vm_end;
260         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
261
262         force_page_cache_readahead(file->f_mapping, file, start, end - start);
263         return 0;
264 }
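
/*
 * Illustrative userspace sketch (not part of this file): MADV_WILLNEED on
 * a file-backed mapping asks the kernel to start readahead now, so a later
 * sequential scan mostly hits the page cache.  The file path is only an
 * example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static long scan(const char *path)
{
	struct stat st;
	int fd = open(path, O_RDONLY);
	char *p;
	long sum = 0, i;

	if (fd < 0 || fstat(fd, &st) < 0)
		return -1;

	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return -1;

	/* Kick off asynchronous readahead; the call does not block. */
	if (madvise(p, st.st_size, MADV_WILLNEED))
		perror("madvise(MADV_WILLNEED)");

	/* ... do other work here while the I/O is in flight ... */

	for (i = 0; i < st.st_size; i++)	/* faults now mostly hit cache */
		sum += p[i];

	munmap(p, st.st_size);
	close(fd);
	return sum;
}

int main(void)
{
	printf("%ld\n", scan("/var/tmp/example.dat"));
	return 0;
}
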
265
266 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
267                                 unsigned long end, struct mm_walk *walk)
268
269 {
270         struct mmu_gather *tlb = walk->private;
271         struct mm_struct *mm = tlb->mm;
272         struct vm_area_struct *vma = walk->vma;
273         spinlock_t *ptl;
274         pte_t *orig_pte, *pte, ptent;
275         struct page *page;
276         int nr_swap = 0;
277         unsigned long next;
278
279         next = pmd_addr_end(addr, end);
280         if (pmd_trans_huge(*pmd))
281                 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
282                         goto next;
283
284         if (pmd_trans_unstable(pmd))
285                 return 0;
286
287         tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
288         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
289         arch_enter_lazy_mmu_mode();
290         for (; addr != end; pte++, addr += PAGE_SIZE) {
291                 ptent = *pte;
292
293                 if (pte_none(ptent))
294                         continue;
295                 /*
296                  * If the pte has a swp_entry, just clear the page table
297                  * entry to prevent swap-in, which is more expensive than
298                  * (page allocation + zeroing).
299                  */
300                 if (!pte_present(ptent)) {
301                         swp_entry_t entry;
302
303                         entry = pte_to_swp_entry(ptent);
304                         if (non_swap_entry(entry))
305                                 continue;
306                         nr_swap--;
307                         free_swap_and_cache(entry);
308                         pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
309                         continue;
310                 }
311
312                 page = vm_normal_page(vma, addr, ptent);
313                 if (!page)
314                         continue;
315
316                 /*
317                  * If the pmd isn't transhuge but the page is THP and
318                  * is mapped by only this process, split it and
319                  * deactivate all of its pages.
320                  */
321                 if (PageTransCompound(page)) {
322                         if (page_mapcount(page) != 1)
323                                 goto out;
324                         get_page(page);
325                         if (!trylock_page(page)) {
326                                 put_page(page);
327                                 goto out;
328                         }
329                         pte_unmap_unlock(orig_pte, ptl);
330                         if (split_huge_page(page)) {
331                                 unlock_page(page);
332                                 put_page(page);
333                                 pte_offset_map_lock(mm, pmd, addr, &ptl);
334                                 goto out;
335                         }
336                         unlock_page(page);
337                         put_page(page);
338                         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
339                         pte--;
340                         addr -= PAGE_SIZE;
341                         continue;
342                 }
343
344                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
345
346                 if (PageSwapCache(page) || PageDirty(page)) {
347                         if (!trylock_page(page))
348                                 continue;
349                         /*
350                          * If the page is shared with others, we can't
351                          * clear its PG_dirty bit.
352                          */
353                         if (page_mapcount(page) != 1) {
354                                 unlock_page(page);
355                                 continue;
356                         }
357
358                         if (PageSwapCache(page) && !try_to_free_swap(page)) {
359                                 unlock_page(page);
360                                 continue;
361                         }
362
363                         ClearPageDirty(page);
364                         unlock_page(page);
365                 }
366
367                 if (pte_young(ptent) || pte_dirty(ptent)) {
368                         /*
369                          * Some architectures (e.g. PPC) don't update the
370                          * TLB in set_pte_at() and tlb_remove_tlb_entry(),
371                          * so for portability, re-install the pte as
372                          * old|clean after clearing it.
373                          */
374                         ptent = ptep_get_and_clear_full(mm, addr, pte,
375                                                         tlb->fullmm);
376
377                         ptent = pte_mkold(ptent);
378                         ptent = pte_mkclean(ptent);
379                         set_pte_at(mm, addr, pte, ptent);
380                         if (PageActive(page))
381                                 deactivate_page(page);
382                         tlb_remove_tlb_entry(tlb, pte, addr);
383                 }
384         }
385 out:
386         if (nr_swap) {
387                 if (current->mm == mm)
388                         sync_mm_rss(mm);
389
390                 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
391         }
392         arch_leave_lazy_mmu_mode();
393         pte_unmap_unlock(orig_pte, ptl);
394         cond_resched();
395 next:
396         return 0;
397 }
398
399 static void madvise_free_page_range(struct mmu_gather *tlb,
400                              struct vm_area_struct *vma,
401                              unsigned long addr, unsigned long end)
402 {
403         struct mm_walk free_walk = {
404                 .pmd_entry = madvise_free_pte_range,
405                 .mm = vma->vm_mm,
406                 .private = tlb,
407         };
408
409         tlb_start_vma(tlb, vma);
410         walk_page_range(addr, end, &free_walk);
411         tlb_end_vma(tlb, vma);
412 }
413
414 static int madvise_free_single_vma(struct vm_area_struct *vma,
415                         unsigned long start_addr, unsigned long end_addr)
416 {
417         unsigned long start, end;
418         struct mm_struct *mm = vma->vm_mm;
419         struct mmu_gather tlb;
420
421         if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
422                 return -EINVAL;
423
424         /* MADV_FREE only works for anonymous vmas at the moment */
425         if (!vma_is_anonymous(vma))
426                 return -EINVAL;
427
428         start = max(vma->vm_start, start_addr);
429         if (start >= vma->vm_end)
430                 return -EINVAL;
431         end = min(vma->vm_end, end_addr);
432         if (end <= vma->vm_start)
433                 return -EINVAL;
434
435         lru_add_drain();
436         tlb_gather_mmu(&tlb, mm, start, end);
437         update_hiwater_rss(mm);
438
439         mmu_notifier_invalidate_range_start(mm, start, end);
440         madvise_free_page_range(&tlb, vma, start, end);
441         mmu_notifier_invalidate_range_end(mm, start, end);
442         tlb_finish_mmu(&tlb, start, end);
443
444         return 0;
445 }
446
447 static long madvise_free(struct vm_area_struct *vma,
448                              struct vm_area_struct **prev,
449                              unsigned long start, unsigned long end)
450 {
451         *prev = vma;
452         return madvise_free_single_vma(vma, start, end);
453 }
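
/*
 * Illustrative userspace sketch (not part of this file): a cache or
 * allocator can mark a retired anonymous buffer with MADV_FREE.  The pages
 * stay mapped and usable; the kernel discards them only under memory
 * pressure, and touching them again simply reuses (or re-zeroes) the
 * memory.  Assumes headers new enough to define MADV_FREE (kernel 4.5+).
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define BUF_PAGES 256

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = (size_t)page * BUF_PAGES;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0xab, len);		/* buffer in use: pages are dirty */

	/* Done with the buffer: let the kernel reclaim it lazily. */
#ifdef MADV_FREE
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");
#else
	madvise(buf, len, MADV_DONTNEED);	/* fallback on old headers */
#endif

	/*
	 * Reusing the buffer later is always safe: pages that were not
	 * reclaimed keep their old contents, reclaimed ones read back as
	 * zero and become regular anonymous pages again on write.
	 */
	buf[0] = 1;

	munmap(buf, len);
	return 0;
}
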
454
455 /*
456  * Application no longer needs these pages.  If the pages are dirty,
457  * it's OK to just throw them away.  The app will be more careful about
458  * data it wants to keep.  Be sure to free swap resources too.  The
459  * zap_page_range call sets things up for shrink_active_list to actually free
460  * these pages later if no one else has touched them in the meantime,
461  * although we could add these pages to a global reuse list for
462  * shrink_active_list to pick up before reclaiming other pages.
463  *
464  * NB: This interface discards data rather than pushes it out to swap,
465  * as some implementations do.  This has performance implications for
466  * applications like large transactional databases which want to discard
467  * pages in anonymous maps after committing to backing store the data
468  * that was kept in them.  There is no reason to write this data out to
469  * the swap area if the application is discarding it.
470  *
471  * An interface that causes the system to free clean pages and flush
472  * dirty pages is already available as msync(MS_INVALIDATE).
473  */
474 static long madvise_dontneed(struct vm_area_struct *vma,
475                              struct vm_area_struct **prev,
476                              unsigned long start, unsigned long end)
477 {
478         *prev = vma;
479         if (!can_madv_dontneed_vma(vma))
480                 return -EINVAL;
481
482         userfaultfd_remove(vma, prev, start, end);
483         zap_page_range(vma, start, end - start);
484         return 0;
485 }
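
/*
 * Illustrative userspace sketch (not part of this file): MADV_DONTNEED on
 * private anonymous memory throws the contents away immediately; the
 * mapping itself stays valid and subsequent reads fault in fresh
 * zero-filled pages.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 'x', page);
	assert(p[0] == 'x');

	/* Discard the data; RSS drops, the VMA remains. */
	if (madvise(p, page, MADV_DONTNEED)) {
		perror("madvise(MADV_DONTNEED)");
		return 1;
	}

	assert(p[0] == 0);	/* reads now see a zero page */

	munmap(p, page);
	return 0;
}
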
486
487 /*
488  * Application wants to free up the pages and associated backing store.
489  * This is effectively punching a hole into the middle of a file.
490  */
491 static long madvise_remove(struct vm_area_struct *vma,
492                                 struct vm_area_struct **prev,
493                                 unsigned long start, unsigned long end)
494 {
495         loff_t offset;
496         int error;
497         struct file *f;
498
499         *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
500
501         if (vma->vm_flags & VM_LOCKED)
502                 return -EINVAL;
503
504         f = vma->vm_file;
505
506         if (!f || !f->f_mapping || !f->f_mapping->host) {
507                 return -EINVAL;
508         }
509
510         if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
511                 return -EACCES;
512
513         offset = (loff_t)(start - vma->vm_start)
514                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
515
516         /*
517          * Filesystem's fallocate may need to take i_mutex.  We need to
518          * explicitly grab a reference because the vma (and hence the
519          * vma's reference to the file) can go away as soon as we drop
520          * mmap_sem.
521          */
522         get_file(f);
523         userfaultfd_remove(vma, prev, start, end);
524         up_read(&current->mm->mmap_sem);
525         error = vfs_fallocate(f,
526                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
527                                 offset, end - start);
528         fput(f);
529         down_read(&current->mm->mmap_sem);
530         return error;
531 }
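
/*
 * Illustrative userspace sketch (not part of this file): MADV_REMOVE on a
 * shared, writable file mapping punches a hole in the backing file, just
 * like the vfs_fallocate() call above.  It needs a filesystem that
 * supports hole punching (tmpfs is used here); the path is only an example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/dev/shm/madv_remove_demo", O_RDWR | O_CREAT, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 4 * page)) {
		perror("open/ftruncate");
		return 1;
	}

	p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 'x', 4 * page);

	/* Free the pages *and* the backing store for the middle two pages. */
	if (madvise(p + page, 2 * page, MADV_REMOVE))
		perror("madvise(MADV_REMOVE)");

	/* The hole reads back as zeroes through the mapping and the file. */
	printf("p[page] = %d\n", p[page]);

	munmap(p, 4 * page);
	close(fd);
	unlink("/dev/shm/madv_remove_demo");
	return 0;
}
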
532
533 #ifdef CONFIG_MEMORY_FAILURE
534 /*
535  * Error injection support for memory error handling.
536  */
537 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
538 {
539         struct page *p;
540         if (!capable(CAP_SYS_ADMIN))
541                 return -EPERM;
542         for (; start < end; start += PAGE_SIZE <<
543                                 compound_order(compound_head(p))) {
544                 int ret;
545
546                 ret = get_user_pages_fast(start, 1, 0, &p);
547                 if (ret != 1)
548                         return ret;
549
550                 if (PageHWPoison(p)) {
551                         put_page(p);
552                         continue;
553                 }
554                 if (bhv == MADV_SOFT_OFFLINE) {
555                         pr_info("Soft offlining page %#lx at %#lx\n",
556                                 page_to_pfn(p), start);
557                         ret = soft_offline_page(p, MF_COUNT_INCREASED);
558                         if (ret)
559                                 return ret;
560                         continue;
561                 }
562                 pr_info("Injecting memory failure for page %#lx at %#lx\n",
563                        page_to_pfn(p), start);
564                 ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
565                 if (ret)
566                         return ret;
567         }
568         return 0;
569 }
570 #endif
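
/*
 * Illustrative userspace sketch (not part of this file): MADV_HWPOISON is a
 * testing aid that treats the page backing an address as if it had suffered
 * an unrecoverable memory error.  It requires CONFIG_MEMORY_FAILURE and
 * CAP_SYS_ADMIN; the exact outcome depends on the memory-failure handling
 * mode, but the process is expected to get SIGBUS when it touches the
 * poisoned page.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure a page is actually mapped */

	if (madvise(p, page, MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");  /* EPERM if unprivileged */
		return 1;
	}

	p[0] = 2;	/* expected to raise SIGBUS */
	return 0;
}
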
571
572 static long
573 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
574                 unsigned long start, unsigned long end, int behavior)
575 {
576         switch (behavior) {
577         case MADV_REMOVE:
578                 return madvise_remove(vma, prev, start, end);
579         case MADV_WILLNEED:
580                 return madvise_willneed(vma, prev, start, end);
581         case MADV_FREE:
582                 /*
583                  * XXX: In this implementation, MADV_FREE works like
584                  * MADV_DONTNEED on a swapless system or when swap is full.
585                  */
586                 if (get_nr_swap_pages() > 0)
587                         return madvise_free(vma, prev, start, end);
588                 /* fall through */
589         case MADV_DONTNEED:
590                 return madvise_dontneed(vma, prev, start, end);
591         default:
592                 return madvise_behavior(vma, prev, start, end, behavior);
593         }
594 }
595
596 static bool
597 madvise_behavior_valid(int behavior)
598 {
599         switch (behavior) {
600         case MADV_DOFORK:
601         case MADV_DONTFORK:
602         case MADV_NORMAL:
603         case MADV_SEQUENTIAL:
604         case MADV_RANDOM:
605         case MADV_REMOVE:
606         case MADV_WILLNEED:
607         case MADV_DONTNEED:
608         case MADV_FREE:
609 #ifdef CONFIG_KSM
610         case MADV_MERGEABLE:
611         case MADV_UNMERGEABLE:
612 #endif
613 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
614         case MADV_HUGEPAGE:
615         case MADV_NOHUGEPAGE:
616 #endif
617         case MADV_DONTDUMP:
618         case MADV_DODUMP:
619                 return true;
620
621         default:
622                 return false;
623         }
624 }
625
626 /*
627  * The madvise(2) system call.
628  *
629  * Applications can use madvise() to advise the kernel how it should
630  * handle paging I/O in this VM area.  The idea is to help the kernel
631  * use appropriate read-ahead and caching techniques.  The information
632  * provided is advisory only, and can be safely disregarded by the
633  * kernel without affecting the correct operation of the application.
634  *
635  * behavior values:
636  *  MADV_NORMAL - the default behavior is to read clusters.  This
637  *              results in some read-ahead and read-behind.
638  *  MADV_RANDOM - the system should read the minimum amount of data
639  *              on any access, since it is unlikely that the appli-
640  *              cation will need more than what it asks for.
641  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
642  *              once, so they can be aggressively read ahead, and
643  *              can be freed soon after they are accessed.
644  *  MADV_WILLNEED - the application is notifying the system to read
645  *              some pages ahead.
646  *  MADV_DONTNEED - the application is finished with the given range,
647  *              so the kernel can free resources associated with it.
648  *  MADV_FREE - the application marks pages in the given range as lazy free,
649  *              where actual purges are postponed until memory pressure happens.
650  *  MADV_REMOVE - the application wants to free up the given range of
651  *              pages and associated backing store.
652  *  MADV_DONTFORK - omit this area from child's address space when forking:
653  *              typically, to avoid COWing pages pinned by get_user_pages().
654  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
655  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
656  *              were corrupted by unrecoverable hardware memory failure.
657  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
658  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
659  *              this area with pages of identical content from other such areas.
660  *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
661  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
662  *              huge pages in the future. Existing pages might be coalesced and
663  *              new pages might be allocated as THP.
664  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
665  *              transparent huge pages so the existing pages will not be
666  *              coalesced into THP and new pages will not be allocated as THP.
667  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
668  *              from being included in its core dump.
669  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
670  *
671  * return values:
672  *  zero    - success
673  *  -EINVAL - start + len < 0, start is not page-aligned,
674  *              "behavior" is not a valid value, or application
675  *              is attempting to release locked or shared pages.
676  *  -ENOMEM - addresses in the specified range are not currently
677  *              mapped, or are outside the AS of the process.
678  *  -EIO    - an I/O error occurred while paging in data.
679  *  -EBADF  - map exists, but area maps something that isn't a file.
680  *  -EAGAIN - a kernel resource was temporarily unavailable.
681  */
682 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
683 {
684         unsigned long end, tmp;
685         struct vm_area_struct *vma, *prev;
686         int unmapped_error = 0;
687         int error = -EINVAL;
688         int write;
689         size_t len;
690         struct blk_plug plug;
691
692 #ifdef CONFIG_MEMORY_FAILURE
693         if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
694                 return madvise_hwpoison(behavior, start, start+len_in);
695 #endif
696         if (!madvise_behavior_valid(behavior))
697                 return error;
698
699         if (start & ~PAGE_MASK)
700                 return error;
701         len = (len_in + ~PAGE_MASK) & PAGE_MASK;
702
703         /* Check whether len was rounded up from a small negative value to zero */
704         if (len_in && !len)
705                 return error;
706
707         end = start + len;
708         if (end < start)
709                 return error;
710
711         error = 0;
712         if (end == start)
713                 return error;
714
715         write = madvise_need_mmap_write(behavior);
716         if (write) {
717                 if (down_write_killable(&current->mm->mmap_sem))
718                         return -EINTR;
719         } else {
720                 down_read(&current->mm->mmap_sem);
721         }
722
723         /*
724          * If the interval [start,end) covers some unmapped address
725          * ranges, just ignore them, but return -ENOMEM at the end.
726          * - unlike the way mlock etc. handle this.
727          */
728         vma = find_vma_prev(current->mm, start, &prev);
729         if (vma && start > vma->vm_start)
730                 prev = vma;
731
732         blk_start_plug(&plug);
733         for (;;) {
734                 /* Still start < end. */
735                 error = -ENOMEM;
736                 if (!vma)
737                         goto out;
738
739                 /* Here start < (end|vma->vm_end). */
740                 if (start < vma->vm_start) {
741                         unmapped_error = -ENOMEM;
742                         start = vma->vm_start;
743                         if (start >= end)
744                                 goto out;
745                 }
746
747                 /* Here vma->vm_start <= start < (end|vma->vm_end) */
748                 tmp = vma->vm_end;
749                 if (end < tmp)
750                         tmp = end;
751
752                 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
753                 error = madvise_vma(vma, &prev, start, tmp, behavior);
754                 if (error)
755                         goto out;
756                 start = tmp;
757                 if (prev && start < prev->vm_end)
758                         start = prev->vm_end;
759                 error = unmapped_error;
760                 if (start >= end)
761                         goto out;
762                 if (prev)
763                         vma = prev->vm_next;
764                 else    /* madvise_remove dropped mmap_sem */
765                         vma = find_vma(current->mm, start);
766         }
767 out:
768         blk_finish_plug(&plug);
769         if (write)
770                 up_write(&current->mm->mmap_sem);
771         else
772                 up_read(&current->mm->mmap_sem);
773
774         return error;
775 }
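
/*
 * Illustrative userspace sketch (not part of this file): the start address
 * passed to madvise() must be page aligned, the length is rounded up to a
 * page multiple, and advice over a range containing unmapped holes is still
 * applied to the mapped parts but reports ENOMEM, as described above.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Access-pattern hints are purely advisory and always cheap. */
	if (madvise(p, 4 * page, MADV_SEQUENTIAL))
		perror("MADV_SEQUENTIAL");

	/* Misaligned start: rejected with EINVAL before any VMA is touched. */
	errno = 0;
	madvise(p + 1, page, MADV_NORMAL);
	printf("misaligned start -> %s\n", strerror(errno));

	/*
	 * Unmap a page in the middle, then advise across the hole: the
	 * mapped parts are processed but the call returns ENOMEM.
	 */
	munmap(p + page, page);
	errno = 0;
	madvise(p, 4 * page, MADV_DONTNEED);
	printf("range with a hole -> %s\n", strerror(errno));

	munmap(p, 4 * page);	/* drops whatever is still mapped */
	return 0;
}
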