kernel/events/uprobes.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * User-space Probes (UProbes)
4  *
5  * Copyright (C) IBM Corporation, 2008-2012
6  * Authors:
7  *      Srikar Dronamraju
8  *      Jim Keniston
9  * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
10  */
11
12 #include <linux/kernel.h>
13 #include <linux/highmem.h>
14 #include <linux/pagemap.h>      /* read_mapping_page */
15 #include <linux/slab.h>
16 #include <linux/sched.h>
17 #include <linux/sched/mm.h>
18 #include <linux/export.h>
19 #include <linux/rmap.h>         /* anon_vma_prepare */
20 #include <linux/mmu_notifier.h>
21 #include <linux/swap.h>         /* folio_free_swap */
22 #include <linux/ptrace.h>       /* user_enable_single_step */
23 #include <linux/kdebug.h>       /* notifier mechanism */
24 #include <linux/percpu-rwsem.h>
25 #include <linux/task_work.h>
26 #include <linux/shmem_fs.h>
27 #include <linux/khugepaged.h>
28 #include <linux/rcupdate_trace.h>
29 #include <linux/workqueue.h>
30 #include <linux/srcu.h>
31 #include <linux/oom.h>          /* check_stable_address_space */
32 #include <linux/pagewalk.h>
33
34 #include <linux/uprobes.h>
35
36 #define UINSNS_PER_PAGE                 (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
37 #define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
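/*
 * For example, an architecture that defines UPROBE_XOL_SLOT_BYTES as 128
 * (an illustrative value, not taken from this file) gets 4096 / 128 = 32
 * execute-out-of-line slots per 4K page.
 */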
38
39 static struct rb_root uprobes_tree = RB_ROOT;
40 /*
41  * Allows us to skip uprobe_mmap() if there are no uprobe events active
42  * at this time. Perhaps a fine-grained per-inode count would be better?
43  */
44 #define no_uprobe_events()      RB_EMPTY_ROOT(&uprobes_tree)
45
46 static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
47 static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
48
49 #define UPROBES_HASH_SZ 13
50 /* serialize uprobe->pending_list */
51 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
52 #define uprobes_mmap_hash(v)    (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
53
54 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
55
56 /* Covers return_instance's uprobe lifetime. */
57 DEFINE_STATIC_SRCU(uretprobes_srcu);
58
59 /* Have a copy of original instruction */
60 #define UPROBE_COPY_INSN        0
61
62 struct uprobe {
63         struct rb_node          rb_node;        /* node in the rb tree */
64         refcount_t              ref;
65         struct rw_semaphore     register_rwsem;
66         struct rw_semaphore     consumer_rwsem;
67         struct list_head        pending_list;
68         struct list_head        consumers;
69         struct inode            *inode;         /* Also hold a ref to inode */
70         union {
71                 struct rcu_head         rcu;
72                 struct work_struct      work;
73         };
74         loff_t                  offset;
75         loff_t                  ref_ctr_offset;
76         unsigned long           flags;          /* "unsigned long" so bitops work */
77
78         /*
79          * The generic code assumes that it has two members of unknown type
80          * owned by the arch-specific code:
81          *
82          *      insn -  copy_insn() saves the original instruction here for
83          *              arch_uprobe_analyze_insn().
84          *
85          *      ixol -  potentially modified instruction to execute out of
86          *              line, copied to xol_area by xol_get_insn_slot().
87          */
88         struct arch_uprobe      arch;
89 };
90
91 struct delayed_uprobe {
92         struct list_head list;
93         struct uprobe *uprobe;
94         struct mm_struct *mm;
95 };
96
97 static DEFINE_MUTEX(delayed_uprobe_lock);
98 static LIST_HEAD(delayed_uprobe_list);
99
100 /*
101  * Execute out of line area: anonymous executable mapping installed
102  * by the probed task to execute a copy of the original instruction that
103  * was replaced by set_swbp().
104  *
105  * On a breakpoint hit, the thread contends for a slot. It frees the
106  * slot after singlestep. Currently a fixed number of slots are
107  * allocated.
108  */
109 struct xol_area {
110         wait_queue_head_t               wq;             /* if all slots are busy */
111         unsigned long                   *bitmap;        /* 0 = free slot */
112
113         struct page                     *page;
114         /*
115          * We keep the vma's vm_start rather than a pointer to the vma
116          * itself.  The probed process or a naughty kernel module could make
117          * the vma go away, and we must handle that reasonably gracefully.
118          */
119         unsigned long                   vaddr;          /* Page(s) of instruction slots */
120 };
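/*
 * The slot allocator (not shown in this excerpt) is expected to hand out
 * addresses of the form vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES, with a set
 * bit in @bitmap marking the corresponding slot as busy.
 */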
121
122 static void uprobe_warn(struct task_struct *t, const char *msg)
123 {
124         pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
125 }
126
127 /*
128  * valid_vma: Verify if the specified vma is an executable vma.
129  * Relax restrictions while unregistering: vm_flags might have
130  * changed after the breakpoint was inserted.
131  *      - is_register: indicates if we are in register context.
132  *      - Return true if @vma is a file-backed, executable,
133  *        non-shared vma.
134  */
135 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
136 {
137         vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
138
139         if (is_register)
140                 flags |= VM_WRITE;
141
142         return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
143 }
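/*
 * Roughly speaking: a PROT_READ|PROT_EXEC private file mapping is valid both
 * when registering and when unregistering, while a PROT_READ|PROT_WRITE|PROT_EXEC
 * private mapping is only accepted on unregister, because VM_WRITE is added
 * to the rejection mask in the register case.
 */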
144
145 static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
146 {
147         return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
148 }
149
150 static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
151 {
152         return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
153 }
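/*
 * Worked example of the two conversions above: for a vma with
 * vm_start == 0x400000 and vm_pgoff == 2 (i.e. the mapping starts at file
 * offset 0x2000 with 4K pages), file offset 0x2010 maps to virtual address
 * 0x400000 + 0x2010 - 0x2000 == 0x400010, and vice versa.
 */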
154
155 /**
156  * is_swbp_insn - check if instruction is a breakpoint instruction.
157  * @insn: instruction to be checked.
158  * Default implementation of is_swbp_insn.
159  * Returns true if @insn is a breakpoint instruction.
160  */
161 bool __weak is_swbp_insn(uprobe_opcode_t *insn)
162 {
163         return *insn == UPROBE_SWBP_INSN;
164 }
165
166 /**
167  * is_trap_insn - check if instruction is a trap instruction.
168  * @insn: instruction to be checked.
169  * Default implementation of is_trap_insn.
170  * Returns true if @insn is a trap instruction.
171  *
172  * This function is needed for the case where an architecture has multiple
173  * trap instructions (like powerpc).
174  */
175 bool __weak is_trap_insn(uprobe_opcode_t *insn)
176 {
177         return is_swbp_insn(insn);
178 }
179
180 static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
181 {
182         void *kaddr = kmap_atomic(page);
183         memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
184         kunmap_atomic(kaddr);
185 }
186
187 static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
188 {
189         void *kaddr = kmap_atomic(page);
190         memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
191         kunmap_atomic(kaddr);
192 }
193
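/*
 * Returns 1 if the opcode at @vaddr still needs to be (re)written, 0 if the
 * page already holds the desired state (breakpoint already installed when
 * registering, or already restored when unregistering), so the caller can
 * skip the update.
 */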
194 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
195 {
196         uprobe_opcode_t old_opcode;
197         bool is_swbp;
198
199         /*
200          * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
201          * We do not check if it is any other 'trap variant' which could
202          * be conditional trap instruction such as the one powerpc supports.
203          *
204          * The logic is that we do not care if the underlying instruction
205          * is a trap variant; a uprobe always wins over any other (gdb)
206          * breakpoint.
207          */
208         copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
209         is_swbp = is_swbp_insn(&old_opcode);
210
211         if (is_swbp_insn(new_opcode)) {
212                 if (is_swbp)            /* register: already installed? */
213                         return 0;
214         } else {
215                 if (!is_swbp)           /* unregister: was it changed by us? */
216                         return 0;
217         }
218
219         return 1;
220 }
221
222 static struct delayed_uprobe *
223 delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
224 {
225         struct delayed_uprobe *du;
226
227         list_for_each_entry(du, &delayed_uprobe_list, list)
228                 if (du->uprobe == uprobe && du->mm == mm)
229                         return du;
230         return NULL;
231 }
232
233 static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
234 {
235         struct delayed_uprobe *du;
236
237         if (delayed_uprobe_check(uprobe, mm))
238                 return 0;
239
240         du  = kzalloc(sizeof(*du), GFP_KERNEL);
241         if (!du)
242                 return -ENOMEM;
243
244         du->uprobe = uprobe;
245         du->mm = mm;
246         list_add(&du->list, &delayed_uprobe_list);
247         return 0;
248 }
249
250 static void delayed_uprobe_delete(struct delayed_uprobe *du)
251 {
252         if (WARN_ON(!du))
253                 return;
254         list_del(&du->list);
255         kfree(du);
256 }
257
258 static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
259 {
260         struct list_head *pos, *q;
261         struct delayed_uprobe *du;
262
263         if (!uprobe && !mm)
264                 return;
265
266         list_for_each_safe(pos, q, &delayed_uprobe_list) {
267                 du = list_entry(pos, struct delayed_uprobe, list);
268
269                 if (uprobe && du->uprobe != uprobe)
270                         continue;
271                 if (mm && du->mm != mm)
272                         continue;
273
274                 delayed_uprobe_delete(du);
275         }
276 }
277
278 static bool valid_ref_ctr_vma(struct uprobe *uprobe,
279                               struct vm_area_struct *vma)
280 {
281         unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
282
283         return uprobe->ref_ctr_offset &&
284                 vma->vm_file &&
285                 file_inode(vma->vm_file) == uprobe->inode &&
286                 (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
287                 vma->vm_start <= vaddr &&
288                 vma->vm_end > vaddr;
289 }
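/*
 * The reference counter referred to throughout this file is typically an SDT
 * marker's "semaphore": a short in the traced process's data segment that the
 * process checks to decide whether the more expensive probe argument setup is
 * needed. Uprobes increments it while a probe is installed and decrements it
 * on removal.
 */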
290
291 static struct vm_area_struct *
292 find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
293 {
294         VMA_ITERATOR(vmi, mm, 0);
295         struct vm_area_struct *tmp;
296
297         for_each_vma(vmi, tmp)
298                 if (valid_ref_ctr_vma(uprobe, tmp))
299                         return tmp;
300
301         return NULL;
302 }
303
304 static int
305 __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
306 {
307         void *kaddr;
308         struct page *page;
309         int ret;
310         short *ptr;
311
312         if (!vaddr || !d)
313                 return -EINVAL;
314
315         ret = get_user_pages_remote(mm, vaddr, 1,
316                                     FOLL_WRITE, &page, NULL);
317         if (unlikely(ret <= 0)) {
318                 /*
319                  * We are asking for 1 page. If get_user_pages_remote() fails,
320                  * it may return 0; in that case we have to return an error.
321                  */
322                 return ret == 0 ? -EBUSY : ret;
323         }
324
325         kaddr = kmap_atomic(page);
326         ptr = kaddr + (vaddr & ~PAGE_MASK);
327
328         if (unlikely(*ptr + d < 0)) {
329                 pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
330                         "curr val: %d, delta: %d\n", vaddr, *ptr, d);
331                 ret = -EINVAL;
332                 goto out;
333         }
334
335         *ptr += d;
336         ret = 0;
337 out:
338         kunmap_atomic(kaddr);
339         put_page(page);
340         return ret;
341 }
342
343 static void update_ref_ctr_warn(struct uprobe *uprobe,
344                                 struct mm_struct *mm, short d)
345 {
346         pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
347                 "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
348                 d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
349                 (unsigned long long) uprobe->offset,
350                 (unsigned long long) uprobe->ref_ctr_offset, mm);
351 }
352
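/*
 * Adjust the reference counter for @uprobe in @mm by @d (+1 or -1). If the
 * vma holding the counter is not mapped yet when incrementing, the increment
 * is queued on delayed_uprobe_list and applied later, once the vma shows up
 * (see delayed_ref_ctr_inc() called from uprobe_mmap()).
 */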
353 static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
354                           short d)
355 {
356         struct vm_area_struct *rc_vma;
357         unsigned long rc_vaddr;
358         int ret = 0;
359
360         rc_vma = find_ref_ctr_vma(uprobe, mm);
361
362         if (rc_vma) {
363                 rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
364                 ret = __update_ref_ctr(mm, rc_vaddr, d);
365                 if (ret)
366                         update_ref_ctr_warn(uprobe, mm, d);
367
368                 if (d > 0)
369                         return ret;
370         }
371
372         mutex_lock(&delayed_uprobe_lock);
373         if (d > 0)
374                 ret = delayed_uprobe_add(uprobe, mm);
375         else
376                 delayed_uprobe_remove(uprobe, mm);
377         mutex_unlock(&delayed_uprobe_lock);
378
379         return ret;
380 }
381
382 static bool orig_page_is_identical(struct vm_area_struct *vma,
383                 unsigned long vaddr, struct page *page, bool *pmd_mappable)
384 {
385         const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
386         struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
387                                                     index);
388         struct page *orig_page;
389         bool identical;
390
391         if (IS_ERR(orig_folio))
392                 return false;
393         orig_page = folio_file_page(orig_folio, index);
394
395         *pmd_mappable = folio_test_pmd_mappable(orig_folio);
396         identical = folio_test_uptodate(orig_folio) &&
397                     pages_identical(page, orig_page);
398         folio_put(orig_folio);
399         return identical;
400 }
401
402 static int __uprobe_write_opcode(struct vm_area_struct *vma,
403                 struct folio_walk *fw, struct folio *folio,
404                 unsigned long opcode_vaddr, uprobe_opcode_t opcode)
405 {
406         const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
407         const bool is_register = !!is_swbp_insn(&opcode);
408         bool pmd_mappable;
409
410         /* For now, we'll only handle PTE-mapped folios. */
411         if (fw->level != FW_LEVEL_PTE)
412                 return -EFAULT;
413
414         /*
415          * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
416          * but the VMA might not be writable.
417          */
418         if (!pte_write(fw->pte)) {
419                 if (!PageAnonExclusive(fw->page))
420                         return -EFAULT;
421                 if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
422                         return -EFAULT;
423                 /* SOFTDIRTY is handled via pte_mkdirty() below. */
424         }
425
426         /*
427          * We'll temporarily unmap the page and flush the TLB, such that we can
428          * modify the page atomically.
429          */
430         flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
431         fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
432         copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
433
434         /*
435          * When unregistering, we may only zap a PTE if uffd is disabled and
436          * there are no unexpected folio references ...
437          */
438         if (is_register || userfaultfd_missing(vma) ||
439             (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
440              folio_test_swapcache(folio) * folio_nr_pages(folio)))
441                 goto remap;
442
443         /*
444          * ... and the mapped page is identical to the original page that
445          * would get faulted in on next access.
446          */
447         if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
448                 goto remap;
449
450         dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
451         folio_remove_rmap_pte(folio, fw->page, vma);
452         if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
453              folio_trylock(folio)) {
454                 folio_free_swap(folio);
455                 folio_unlock(folio);
456         }
457         folio_put(folio);
458
459         return pmd_mappable;
460 remap:
461         /*
462          * Make sure that our copy_to_page() changes become visible before the
463          * set_pte_at() write.
464          */
465         smp_wmb();
466         /* We modified the page. Make sure to mark the PTE dirty. */
467         set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
468         return 0;
469 }
470
471 /*
472  * NOTE:
473  * Expect the breakpoint instruction to be the smallest size instruction for
474  * the architecture. If an arch has variable length instructions and the
475  * breakpoint instruction is not the smallest length instruction
476  * supported by that architecture, then we need to modify is_trap_at_addr and
477  * uprobe_write_opcode accordingly. This would never be a problem for archs
478  * that have fixed length instructions.
479  *
480  * uprobe_write_opcode - write the opcode at a given virtual address.
481  * @auprobe: arch specific probepoint information.
482  * @vma: the probed virtual memory area.
483  * @opcode_vaddr: the virtual address to store the opcode.
484  * @opcode: opcode to be written at @opcode_vaddr.
485  *
486  * Called with mm->mmap_lock held for read or write.
487  * Return 0 (success) or a negative errno.
488  */
489 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
490                 const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
491 {
492         const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
493         struct mm_struct *mm = vma->vm_mm;
494         struct uprobe *uprobe;
495         int ret, is_register, ref_ctr_updated = 0;
496         unsigned int gup_flags = FOLL_FORCE;
497         struct mmu_notifier_range range;
498         struct folio_walk fw;
499         struct folio *folio;
500         struct page *page;
501
502         is_register = is_swbp_insn(&opcode);
503         uprobe = container_of(auprobe, struct uprobe, arch);
504
505         if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
506                 return -EINVAL;
507
508         /*
509          * When registering, we have to break COW to get an exclusive anonymous
510          * page that we can safely modify. Use FOLL_WRITE to trigger a write
511          * fault if required. When unregistering, we might be lucky and the
512          * anon page is already gone. So defer write faults until really
513          * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
514          * cannot deal with PMDs yet.
515          */
516         if (is_register)
517                 gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
518
519 retry:
520         ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
521         if (ret <= 0)
522                 goto out;
523         folio = page_folio(page);
524
525         ret = verify_opcode(page, opcode_vaddr, &opcode);
526         if (ret <= 0) {
527                 folio_put(folio);
528                 goto out;
529         }
530
531         /* We are going to replace the instruction, so update ref_ctr. */
532         if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
533                 ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
534                 if (ret) {
535                         folio_put(folio);
536                         goto out;
537                 }
538
539                 ref_ctr_updated = 1;
540         }
541
542         ret = 0;
543         if (unlikely(!folio_test_anon(folio))) {
544                 VM_WARN_ON_ONCE(is_register);
545                 folio_put(folio);
546                 goto out;
547         }
548
549         if (!is_register) {
550                 /*
551                  * In the common case, we'll be able to zap the page when
552                  * unregistering. So trigger MMU notifiers now, as we won't
553                  * be able to do it under PTL.
554                  */
555                 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
556                                         vaddr, vaddr + PAGE_SIZE);
557                 mmu_notifier_invalidate_range_start(&range);
558         }
559
560         ret = -EAGAIN;
561         /* Walk the page tables again, to perform the actual update. */
562         if (folio_walk_start(&fw, vma, vaddr, 0)) {
563                 if (fw.page == page)
564                         ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
565                 folio_walk_end(&fw, vma);
566         }
567
568         if (!is_register)
569                 mmu_notifier_invalidate_range_end(&range);
570
571         folio_put(folio);
572         switch (ret) {
573         case -EFAULT:
574                 gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
575                 fallthrough;
576         case -EAGAIN:
577                 goto retry;
578         default:
579                 break;
580         }
581
582 out:
583         /* Revert the reference counter if the instruction update failed. */
584         if (ret < 0 && is_register && ref_ctr_updated)
585                 update_ref_ctr(uprobe, mm, -1);
586
587         /* try collapse pmd for compound page */
588         if (ret > 0)
589                 collapse_pte_mapped_thp(mm, vaddr, false);
590
591         return ret < 0 ? ret : 0;
592 }
593
594 /**
595  * set_swbp - store breakpoint at a given address.
596  * @auprobe: arch specific probepoint information.
597  * @vma: the probed virtual memory area.
598  * @vaddr: the virtual address to insert the opcode.
599  *
600  * For the mm owning @vma, store the breakpoint instruction at @vaddr.
601  * Return 0 (success) or a negative errno.
602  */
603 int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
604                 unsigned long vaddr)
605 {
606         return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
607 }
608
609 /**
610  * set_orig_insn - Restore the original instruction.
611  * @vma: the probed virtual memory area.
612  * @auprobe: arch specific probepoint information.
613  * @vaddr: the virtual address to insert the opcode.
614  *
615  * For the mm owning @vma, restore the original instruction at @vaddr.
616  * Return 0 (success) or a negative errno.
617  */
618 int __weak set_orig_insn(struct arch_uprobe *auprobe,
619                 struct vm_area_struct *vma, unsigned long vaddr)
620 {
621         return uprobe_write_opcode(auprobe, vma, vaddr,
622                         *(uprobe_opcode_t *)&auprobe->insn);
623 }
624
625 /* uprobe should have guaranteed positive refcount */
626 static struct uprobe *get_uprobe(struct uprobe *uprobe)
627 {
628         refcount_inc(&uprobe->ref);
629         return uprobe;
630 }
631
632 /*
633  * uprobe should have guaranteed lifetime, which can be either of:
634  *   - caller already has refcount taken (and wants an extra one);
635  *   - uprobe is RCU protected and won't be freed until after grace period;
636  *   - we are holding uprobes_treelock (for read or write, doesn't matter).
637  */
638 static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
639 {
640         if (refcount_inc_not_zero(&uprobe->ref))
641                 return uprobe;
642         return NULL;
643 }
644
645 static inline bool uprobe_is_active(struct uprobe *uprobe)
646 {
647         return !RB_EMPTY_NODE(&uprobe->rb_node);
648 }
649
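/*
 * Uprobe teardown is staged: put_uprobe() schedules uprobe_free_deferred(),
 * which unlinks the uprobe from uprobes_tree and delayed_uprobe_list, then
 * chains call_srcu(uretprobes_srcu) -> call_rcu_tasks_trace() -> kfree() so
 * that both SRCU and RCU-tasks-trace readers are done before the memory
 * goes away.
 */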
650 static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
651 {
652         struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
653
654         kfree(uprobe);
655 }
656
657 static void uprobe_free_srcu(struct rcu_head *rcu)
658 {
659         struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
660
661         call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
662 }
663
664 static void uprobe_free_deferred(struct work_struct *work)
665 {
666         struct uprobe *uprobe = container_of(work, struct uprobe, work);
667
668         write_lock(&uprobes_treelock);
669
670         if (uprobe_is_active(uprobe)) {
671                 write_seqcount_begin(&uprobes_seqcount);
672                 rb_erase(&uprobe->rb_node, &uprobes_tree);
673                 write_seqcount_end(&uprobes_seqcount);
674         }
675
676         write_unlock(&uprobes_treelock);
677
678         /*
679          * If the application munmap()s the executable vma before
680          * uprobe_unregister() gets called, we don't get a chance to remove
681          * the uprobe from delayed_uprobe_list via remove_breakpoint(). Do it here.
682          */
683         mutex_lock(&delayed_uprobe_lock);
684         delayed_uprobe_remove(uprobe, NULL);
685         mutex_unlock(&delayed_uprobe_lock);
686
687         /* start srcu -> rcu_tasks_trace -> kfree chain */
688         call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
689 }
690
691 static void put_uprobe(struct uprobe *uprobe)
692 {
693         if (!refcount_dec_and_test(&uprobe->ref))
694                 return;
695
696         INIT_WORK(&uprobe->work, uprobe_free_deferred);
697         schedule_work(&uprobe->work);
698 }
699
700 /* Initialize hprobe as SRCU-protected "leased" uprobe */
701 static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
702 {
703         WARN_ON(!uprobe);
704         hprobe->state = HPROBE_LEASED;
705         hprobe->uprobe = uprobe;
706         hprobe->srcu_idx = srcu_idx;
707 }
708
709 /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
710 static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
711 {
712         hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
713         hprobe->uprobe = uprobe;
714         hprobe->srcu_idx = -1;
715 }
716
717 /*
718  * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
719  * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
720  * used only once for a given hprobe.
721  *
722  * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
723  * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
724  * is appropriate.
725  */
726 static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
727 {
728         *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
729         switch (*hstate) {
730         case HPROBE_LEASED:
731         case HPROBE_STABLE:
732                 return hprobe->uprobe;
733         case HPROBE_GONE:       /* uprobe is NULL, no SRCU */
734         case HPROBE_CONSUMED:   /* uprobe was finalized already, do nothing */
735                 return NULL;
736         default:
737                 WARN(1, "hprobe invalid state %d", *hstate);
738                 return NULL;
739         }
740 }
741
742 /*
743  * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
744  * hprobe_finalize() can only be used from current context after
745  * hprobe_consume() call (which determines uprobe and hstate value).
746  */
747 static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
748 {
749         switch (hstate) {
750         case HPROBE_LEASED:
751                 __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
752                 break;
753         case HPROBE_STABLE:
754                 put_uprobe(hprobe->uprobe);
755                 break;
756         case HPROBE_GONE:
757         case HPROBE_CONSUMED:
758                 break;
759         default:
760                 WARN(1, "hprobe invalid state %d", hstate);
761                 break;
762         }
763 }
764
765 /*
766  * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
767  * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
768  * them can win the race to perform SRCU unlocking. Whoever wins must perform
769  * SRCU unlock.
770  *
771  * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
772  * to begin with or we failed to bump its refcount and it's going away.
773  *
774  * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
775  * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
776  * an extra refcount for caller to assume and use. Otherwise, it's not
777  * guaranteed that returned uprobe has a positive refcount, so caller has to
778  * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
779  * SRCU lock region. See dup_utask().
780  */
781 static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
782 {
783         enum hprobe_state hstate;
784
785         /*
786          * Caller should guarantee that return_instance is not going to be
787          * freed from under us. This can be achieved either through holding
788          * rcu_read_lock() or by owning return_instance in the first place.
789          *
790          * Underlying uprobe is itself protected from reuse by SRCU, so ensure
791          * SRCU lock is held properly.
792          */
793         lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
794
795         hstate = READ_ONCE(hprobe->state);
796         switch (hstate) {
797         case HPROBE_STABLE:
798                 /* uprobe has positive refcount, bump refcount, if necessary */
799                 return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
800         case HPROBE_GONE:
801                 /*
802                  * SRCU was unlocked earlier and we didn't manage to take
803                  * uprobe refcnt, so it's effectively NULL
804                  */
805                 return NULL;
806         case HPROBE_CONSUMED:
807                 /*
808                  * uprobe was consumed, so it's effectively NULL as far as
809                  * uretprobe processing logic is concerned
810                  */
811                 return NULL;
812         case HPROBE_LEASED: {
813                 struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
814                 /*
815                  * Try to switch hprobe state, guarding against
816                  * hprobe_consume() or another hprobe_expire() racing with us.
817                  * Note, if we failed to get uprobe refcount, we use special
818                  * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
819                  * be used as it will be freed after SRCU is unlocked.
820                  */
821                 if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
822                         /* We won the race, we are the ones to unlock SRCU */
823                         __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
824                         return get ? get_uprobe(uprobe) : uprobe;
825                 }
826
827                 /*
828                  * We lost the race, undo refcount bump (if it ever happened),
829                  * unless caller would like an extra refcount anyways.
830                  */
831                 if (uprobe && !get)
832                         put_uprobe(uprobe);
833                 /*
834                  * Even if hprobe_consume() or another hprobe_expire() wins
835                  * the state update race and unlocks SRCU from under us, we
836          * still have a guarantee that the underlying uprobe won't be
837                  * freed due to ongoing caller's SRCU lock region, so we can
838                  * return it regardless. Also, if `get` was true, we also have
839                  * an extra ref for the caller to own. This is used in dup_utask().
840                  */
841                 return uprobe;
842         }
843         default:
844                 WARN(1, "unknown hprobe state %d", hstate);
845                 return NULL;
846         }
847 }
848
849 static __always_inline
850 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
851                const struct uprobe *r)
852 {
853         if (l_inode < r->inode)
854                 return -1;
855
856         if (l_inode > r->inode)
857                 return 1;
858
859         if (l_offset < r->offset)
860                 return -1;
861
862         if (l_offset > r->offset)
863                 return 1;
864
865         return 0;
866 }
867
868 #define __node_2_uprobe(node) \
869         rb_entry((node), struct uprobe, rb_node)
870
871 struct __uprobe_key {
872         struct inode *inode;
873         loff_t offset;
874 };
875
876 static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
877 {
878         const struct __uprobe_key *a = key;
879         return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
880 }
881
882 static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
883 {
884         struct uprobe *u = __node_2_uprobe(a);
885         return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
886 }
887
888 /*
889  * Assumes being inside RCU protected region.
890  * No refcount is taken on returned uprobe.
891  */
892 static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
893 {
894         struct __uprobe_key key = {
895                 .inode = inode,
896                 .offset = offset,
897         };
898         struct rb_node *node;
899         unsigned int seq;
900
901         lockdep_assert(rcu_read_lock_trace_held());
902
903         do {
904                 seq = read_seqcount_begin(&uprobes_seqcount);
905                 node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
906                 /*
907                  * Lockless RB-tree lookups can result only in false negatives.
908                  * If the element is found, it is correct and can be returned
909                  * under RCU protection. If we find nothing, we need to
910                  * validate that seqcount didn't change. If it did, we have to
911                  * try again as we might have missed the element (false
912                  * negative). If seqcount is unchanged, search truly failed.
913                  */
914                 if (node)
915                         return __node_2_uprobe(node);
916         } while (read_seqcount_retry(&uprobes_seqcount, seq));
917
918         return NULL;
919 }
920
921 /*
922  * Attempt to insert a new uprobe into uprobes_tree.
923  *
924  * If uprobe already exists (for given inode+offset), we just increment
925  * refcount of previously existing uprobe.
926  *
927  * If not, a provided new instance of uprobe is inserted into the tree (with
928  * assumed initial refcount == 1).
929  *
930  * In any case, we return a uprobe instance that ends up being in uprobes_tree.
931  * Caller has to clean up new uprobe instance, if it ended up not being
932  * inserted into the tree.
933  *
934  * We assume that uprobes_treelock is held for writing.
935  */
936 static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
937 {
938         struct rb_node *node;
939 again:
940         node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
941         if (node) {
942                 struct uprobe *u = __node_2_uprobe(node);
943
944                 if (!try_get_uprobe(u)) {
945                         rb_erase(node, &uprobes_tree);
946                         RB_CLEAR_NODE(&u->rb_node);
947                         goto again;
948                 }
949
950                 return u;
951         }
952
953         return uprobe;
954 }
955
956 /*
957  * Acquire uprobes_treelock and insert uprobe into uprobes_tree
958  * (or reuse existing one, see __insert_uprobe() comments above).
959  */
960 static struct uprobe *insert_uprobe(struct uprobe *uprobe)
961 {
962         struct uprobe *u;
963
964         write_lock(&uprobes_treelock);
965         write_seqcount_begin(&uprobes_seqcount);
966         u = __insert_uprobe(uprobe);
967         write_seqcount_end(&uprobes_seqcount);
968         write_unlock(&uprobes_treelock);
969
970         return u;
971 }
972
973 static void
974 ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
975 {
976         pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
977                 "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
978                 uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
979                 (unsigned long long) cur_uprobe->ref_ctr_offset,
980                 (unsigned long long) uprobe->ref_ctr_offset);
981 }
982
983 static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
984                                    loff_t ref_ctr_offset)
985 {
986         struct uprobe *uprobe, *cur_uprobe;
987
988         uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
989         if (!uprobe)
990                 return ERR_PTR(-ENOMEM);
991
992         uprobe->inode = inode;
993         uprobe->offset = offset;
994         uprobe->ref_ctr_offset = ref_ctr_offset;
995         INIT_LIST_HEAD(&uprobe->consumers);
996         init_rwsem(&uprobe->register_rwsem);
997         init_rwsem(&uprobe->consumer_rwsem);
998         RB_CLEAR_NODE(&uprobe->rb_node);
999         refcount_set(&uprobe->ref, 1);
1000
1001         /* add to uprobes_tree, sorted on inode:offset */
1002         cur_uprobe = insert_uprobe(uprobe);
1003         /* a uprobe exists for this inode:offset combination */
1004         if (cur_uprobe != uprobe) {
1005                 if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
1006                         ref_ctr_mismatch_warn(cur_uprobe, uprobe);
1007                         put_uprobe(cur_uprobe);
1008                         kfree(uprobe);
1009                         return ERR_PTR(-EINVAL);
1010                 }
1011                 kfree(uprobe);
1012                 uprobe = cur_uprobe;
1013         }
1014
1015         return uprobe;
1016 }
1017
1018 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
1019 {
1020         static atomic64_t id;
1021
1022         down_write(&uprobe->consumer_rwsem);
1023         list_add_rcu(&uc->cons_node, &uprobe->consumers);
1024         uc->id = (__u64) atomic64_inc_return(&id);
1025         up_write(&uprobe->consumer_rwsem);
1026 }
1027
1028 /*
1029  * For uprobe @uprobe, delete the consumer @uc.
1030  * Should never be called with consumer that's not part of @uprobe->consumers.
1031  */
1032 static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
1033 {
1034         down_write(&uprobe->consumer_rwsem);
1035         list_del_rcu(&uc->cons_node);
1036         up_write(&uprobe->consumer_rwsem);
1037 }
1038
1039 static int __copy_insn(struct address_space *mapping, struct file *filp,
1040                         void *insn, int nbytes, loff_t offset)
1041 {
1042         struct page *page;
1043         /*
1044          * Ensure that the page that has the original instruction is populated
1045          * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
1046          * see uprobe_register().
1047          */
1048         if (mapping->a_ops->read_folio)
1049                 page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
1050         else
1051                 page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
1052         if (IS_ERR(page))
1053                 return PTR_ERR(page);
1054
1055         copy_from_page(page, offset, insn, nbytes);
1056         put_page(page);
1057
1058         return 0;
1059 }
1060
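/*
 * copy_insn() below fills uprobe->arch.insn from the file's page cache; the
 * loop copies in chunks so that a copy which would cross a page boundary is
 * still handled correctly, and it stops early at EOF.
 */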
1061 static int copy_insn(struct uprobe *uprobe, struct file *filp)
1062 {
1063         struct address_space *mapping = uprobe->inode->i_mapping;
1064         loff_t offs = uprobe->offset;
1065         void *insn = &uprobe->arch.insn;
1066         int size = sizeof(uprobe->arch.insn);
1067         int len, err = -EIO;
1068
1069         /* Copy only available bytes, -EIO if nothing was read */
1070         do {
1071                 if (offs >= i_size_read(uprobe->inode))
1072                         break;
1073
1074                 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
1075                 err = __copy_insn(mapping, filp, insn, len, offs);
1076                 if (err)
1077                         break;
1078
1079                 insn += len;
1080                 offs += len;
1081                 size -= len;
1082         } while (size);
1083
1084         return err;
1085 }
1086
1087 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
1088                                 struct mm_struct *mm, unsigned long vaddr)
1089 {
1090         int ret = 0;
1091
1092         if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1093                 return ret;
1094
1095         /* TODO: move this into _register, until then we abuse this sem. */
1096         down_write(&uprobe->consumer_rwsem);
1097         if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1098                 goto out;
1099
1100         ret = copy_insn(uprobe, file);
1101         if (ret)
1102                 goto out;
1103
1104         ret = -ENOTSUPP;
1105         if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
1106                 goto out;
1107
1108         ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
1109         if (ret)
1110                 goto out;
1111
1112         smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
1113         set_bit(UPROBE_COPY_INSN, &uprobe->flags);
1114
1115  out:
1116         up_write(&uprobe->consumer_rwsem);
1117
1118         return ret;
1119 }
1120
1121 static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
1122 {
1123         return !uc->filter || uc->filter(uc, mm);
1124 }
1125
1126 static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
1127 {
1128         struct uprobe_consumer *uc;
1129         bool ret = false;
1130
1131         down_read(&uprobe->consumer_rwsem);
1132         list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1133                 ret = consumer_filter(uc, mm);
1134                 if (ret)
1135                         break;
1136         }
1137         up_read(&uprobe->consumer_rwsem);
1138
1139         return ret;
1140 }
1141
1142 static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
1143                 unsigned long vaddr)
1144 {
1145         struct mm_struct *mm = vma->vm_mm;
1146         bool first_uprobe;
1147         int ret;
1148
1149         ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
1150         if (ret)
1151                 return ret;
1152
1153         /*
1154          * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier();
1155          * the task can hit this breakpoint right after uprobe_write_opcode().
1156          */
1157         first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
1158         if (first_uprobe)
1159                 set_bit(MMF_HAS_UPROBES, &mm->flags);
1160
1161         ret = set_swbp(&uprobe->arch, vma, vaddr);
1162         if (!ret)
1163                 clear_bit(MMF_RECALC_UPROBES, &mm->flags);
1164         else if (first_uprobe)
1165                 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1166
1167         return ret;
1168 }
1169
1170 static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
1171                 unsigned long vaddr)
1172 {
1173         struct mm_struct *mm = vma->vm_mm;
1174
1175         set_bit(MMF_RECALC_UPROBES, &mm->flags);
1176         return set_orig_insn(&uprobe->arch, vma, vaddr);
1177 }
1178
1179 struct map_info {
1180         struct map_info *next;
1181         struct mm_struct *mm;
1182         unsigned long vaddr;
1183 };
1184
1185 static inline struct map_info *free_map_info(struct map_info *info)
1186 {
1187         struct map_info *next = info->next;
1188         kfree(info);
1189         return next;
1190 }
1191
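/*
 * build_map_info() gathers (mm, vaddr) pairs for every vma currently mapping
 * the probed file range. Allocation under i_mmap_rwsem uses GFP_NOWAIT only;
 * any vmas that could not get a map_info are counted in 'more', the lock is
 * dropped, the shortfall is allocated with GFP_KERNEL, and the walk is
 * retried until no allocation failures remain.
 */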
1192 static struct map_info *
1193 build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
1194 {
1195         unsigned long pgoff = offset >> PAGE_SHIFT;
1196         struct vm_area_struct *vma;
1197         struct map_info *curr = NULL;
1198         struct map_info *prev = NULL;
1199         struct map_info *info;
1200         int more = 0;
1201
1202  again:
1203         i_mmap_lock_read(mapping);
1204         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1205                 if (!valid_vma(vma, is_register))
1206                         continue;
1207
1208                 if (!prev && !more) {
1209                         /*
1210                          * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
1211                          * reclaim. This is optimistic, no harm done if it fails.
1212                          */
1213                         prev = kmalloc(sizeof(struct map_info),
1214                                         GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
1215                         if (prev)
1216                                 prev->next = NULL;
1217                 }
1218                 if (!prev) {
1219                         more++;
1220                         continue;
1221                 }
1222
1223                 if (!mmget_not_zero(vma->vm_mm))
1224                         continue;
1225
1226                 info = prev;
1227                 prev = prev->next;
1228                 info->next = curr;
1229                 curr = info;
1230
1231                 info->mm = vma->vm_mm;
1232                 info->vaddr = offset_to_vaddr(vma, offset);
1233         }
1234         i_mmap_unlock_read(mapping);
1235
1236         if (!more)
1237                 goto out;
1238
1239         prev = curr;
1240         while (curr) {
1241                 mmput(curr->mm);
1242                 curr = curr->next;
1243         }
1244
1245         do {
1246                 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
1247                 if (!info) {
1248                         curr = ERR_PTR(-ENOMEM);
1249                         goto out;
1250                 }
1251                 info->next = prev;
1252                 prev = info;
1253         } while (--more);
1254
1255         goto again;
1256  out:
1257         while (prev)
1258                 prev = free_map_info(prev);
1259         return curr;
1260 }
1261
1262 static int
1263 register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
1264 {
1265         bool is_register = !!new;
1266         struct map_info *info;
1267         int err = 0;
1268
1269         percpu_down_write(&dup_mmap_sem);
1270         info = build_map_info(uprobe->inode->i_mapping,
1271                                         uprobe->offset, is_register);
1272         if (IS_ERR(info)) {
1273                 err = PTR_ERR(info);
1274                 goto out;
1275         }
1276
1277         while (info) {
1278                 struct mm_struct *mm = info->mm;
1279                 struct vm_area_struct *vma;
1280
1281                 if (err && is_register)
1282                         goto free;
1283                 /*
1284                  * We take mmap_lock for writing to avoid the race with
1285                  * find_active_uprobe_rcu() which takes mmap_lock for reading.
1286                  * Thus this install_breakpoint() can not make
1287                  * is_trap_at_addr() true right after find_uprobe_rcu()
1288                  * returns NULL in find_active_uprobe_rcu().
1289                  */
1290                 mmap_write_lock(mm);
1291                 if (check_stable_address_space(mm))
1292                         goto unlock;
1293
1294                 vma = find_vma(mm, info->vaddr);
1295                 if (!vma || !valid_vma(vma, is_register) ||
1296                     file_inode(vma->vm_file) != uprobe->inode)
1297                         goto unlock;
1298
1299                 if (vma->vm_start > info->vaddr ||
1300                     vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
1301                         goto unlock;
1302
1303                 if (is_register) {
1304                         /* consult only the "caller", new consumer. */
1305                         if (consumer_filter(new, mm))
1306                                 err = install_breakpoint(uprobe, vma, info->vaddr);
1307                 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
1308                         if (!filter_chain(uprobe, mm))
1309                                 err |= remove_breakpoint(uprobe, vma, info->vaddr);
1310                 }
1311
1312  unlock:
1313                 mmap_write_unlock(mm);
1314  free:
1315                 mmput(mm);
1316                 info = free_map_info(info);
1317         }
1318  out:
1319         percpu_up_write(&dup_mmap_sem);
1320         return err;
1321 }
1322
1323 /**
1324  * uprobe_unregister_nosync - unregister an already registered probe.
1325  * @uprobe: uprobe to remove
1326  * @uc: identify which probe if multiple probes are colocated.
1327  */
1328 void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
1329 {
1330         int err;
1331
1332         down_write(&uprobe->register_rwsem);
1333         consumer_del(uprobe, uc);
1334         err = register_for_each_vma(uprobe, NULL);
1335         up_write(&uprobe->register_rwsem);
1336
1337         /* TODO: can't unregister? schedule a worker thread */
1338         if (unlikely(err)) {
1339                 uprobe_warn(current, "unregister, leaking uprobe");
1340                 return;
1341         }
1342
1343         put_uprobe(uprobe);
1344 }
1345 EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
1346
1347 void uprobe_unregister_sync(void)
1348 {
1349         /*
1350          * Now that handler_chain() and handle_uretprobe_chain() iterate over
1351          * uprobe->consumers list under RCU protection without holding
1352          * uprobe->register_rwsem, we need to wait for RCU grace period to
1353          * make sure that we can't call into just unregistered
1354          * uprobe_consumer's callbacks anymore. If we don't do that, a fast and
1355          * unlucky enough caller can free the consumer's memory and cause
1356          * handler_chain() or handle_uretprobe_chain() to do a use-after-free.
1357          */
1358         synchronize_rcu_tasks_trace();
1359         synchronize_srcu(&uretprobes_srcu);
1360 }
1361 EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
1362
1363 /**
1364  * uprobe_register - register a probe
1365  * @inode: the file in which the probe has to be placed.
1366  * @offset: offset from the start of the file.
1367  * @ref_ctr_offset: offset of SDT marker / reference counter
1368  * @uc: information on how to handle the probe.
1369  *
1370  * Apart from the access refcount, uprobe_register() takes a creation
1371  * refcount (through alloc_uprobe) if and only if this @uprobe is getting
1372  * inserted into the rbtree (i.e. the first consumer for an @inode:@offset
1373  * tuple).  The creation refcount stops uprobe_unregister from freeing the
1374  * @uprobe even before the register operation is complete. Creation
1375  * refcount is released when the last @uc for the @uprobe
1376  * unregisters. Caller of uprobe_register() is required to keep @inode
1377  * (and the containing mount) referenced.
1378  *
1379  * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
1380  */
1381 struct uprobe *uprobe_register(struct inode *inode,
1382                                 loff_t offset, loff_t ref_ctr_offset,
1383                                 struct uprobe_consumer *uc)
1384 {
1385         struct uprobe *uprobe;
1386         int ret;
1387
1388         /* Uprobe must have at least one set consumer */
1389         if (!uc->handler && !uc->ret_handler)
1390                 return ERR_PTR(-EINVAL);
1391
1392         /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
1393         if (!inode->i_mapping->a_ops->read_folio &&
1394             !shmem_mapping(inode->i_mapping))
1395                 return ERR_PTR(-EIO);
1396         /* Racy, just to catch the obvious mistakes */
1397         if (offset > i_size_read(inode))
1398                 return ERR_PTR(-EINVAL);
1399
1400         /*
1401          * This ensures that copy_from_page(), copy_to_page() and
1402          * __update_ref_ctr() can't cross page boundary.
1403          */
1404         if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
1405                 return ERR_PTR(-EINVAL);
1406         if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
1407                 return ERR_PTR(-EINVAL);
1408
1409         uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
1410         if (IS_ERR(uprobe))
1411                 return uprobe;
1412
1413         down_write(&uprobe->register_rwsem);
1414         consumer_add(uprobe, uc);
1415         ret = register_for_each_vma(uprobe, uc);
1416         up_write(&uprobe->register_rwsem);
1417
1418         if (ret) {
1419                 uprobe_unregister_nosync(uprobe, uc);
1420                 /*
1421                  * Registration might have partially succeeded, so this consumer
1422                  * may already be getting called right at this time. We need to
1423                  * sync here. That's OK, this is an unlikely slow path.
1424                  */
1425                 uprobe_unregister_sync();
1426                 return ERR_PTR(ret);
1427         }
1428
1429         return uprobe;
1430 }
1431 EXPORT_SYMBOL_GPL(uprobe_register);
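/*
 * Illustrative lifecycle sketch (not taken from this file; my_consumer is a
 * hypothetical struct uprobe_consumer set up by the caller as declared in
 * <linux/uprobes.h>):
 *
 *	struct uprobe *u;
 *
 *	u = uprobe_register(inode, offset, ref_ctr_offset, &my_consumer);
 *	if (IS_ERR(u))
 *		return PTR_ERR(u);
 *	...
 *	uprobe_unregister_nosync(u, &my_consumer);
 *	uprobe_unregister_sync();
 *
 * Only after uprobe_unregister_sync() returns is it safe to free the memory
 * backing my_consumer (see the comment in uprobe_unregister_sync()).
 */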
1432
1433 /**
1434  * uprobe_apply - add or remove the breakpoints according to @uc->filter
1435  * @uprobe: uprobe which "owns" the breakpoint
1436  * @uc: consumer which wants to add more or remove some breakpoints
1437  * @add: add or remove the breakpoints
1438  * Return: 0 on success or negative error code.
1439  */
1440 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
1441 {
1442         struct uprobe_consumer *con;
1443         int ret = -ENOENT;
1444
1445         down_write(&uprobe->register_rwsem);
1446
1447         rcu_read_lock_trace();
1448         list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1449                 if (con == uc) {
1450                         ret = register_for_each_vma(uprobe, add ? uc : NULL);
1451                         break;
1452                 }
1453         }
1454         rcu_read_unlock_trace();
1455
1456         up_write(&uprobe->register_rwsem);
1457
1458         return ret;
1459 }
1460
1461 static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
1462 {
1463         VMA_ITERATOR(vmi, mm, 0);
1464         struct vm_area_struct *vma;
1465         int err = 0;
1466
1467         mmap_read_lock(mm);
1468         for_each_vma(vmi, vma) {
1469                 unsigned long vaddr;
1470                 loff_t offset;
1471
1472                 if (!valid_vma(vma, false) ||
1473                     file_inode(vma->vm_file) != uprobe->inode)
1474                         continue;
1475
1476                 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1477                 if (uprobe->offset <  offset ||
1478                     uprobe->offset >= offset + vma->vm_end - vma->vm_start)
1479                         continue;
1480
1481                 vaddr = offset_to_vaddr(vma, uprobe->offset);
1482                 err |= remove_breakpoint(uprobe, vma, vaddr);
1483         }
1484         mmap_read_unlock(mm);
1485
1486         return err;
1487 }
1488
1489 static struct rb_node *
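/*
 * Find some uprobe on @inode whose offset falls in [@min, @max]; the caller
 * holds uprobes_treelock. Note that the node returned is not necessarily the
 * leftmost match, which is why build_probe_list() below walks both rb_prev()
 * and rb_next() from it.
 */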
1490 find_node_in_range(struct inode *inode, loff_t min, loff_t max)
1491 {
1492         struct rb_node *n = uprobes_tree.rb_node;
1493
1494         while (n) {
1495                 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
1496
1497                 if (inode < u->inode) {
1498                         n = n->rb_left;
1499                 } else if (inode > u->inode) {
1500                         n = n->rb_right;
1501                 } else {
1502                         if (max < u->offset)
1503                                 n = n->rb_left;
1504                         else if (min > u->offset)
1505                                 n = n->rb_right;
1506                         else
1507                                 break;
1508                 }
1509         }
1510
1511         return n;
1512 }
1513
1514 /*
1515  * For a given range in vma, build a list of probes that need to be inserted.
1516  */
1517 static void build_probe_list(struct inode *inode,
1518                                 struct vm_area_struct *vma,
1519                                 unsigned long start, unsigned long end,
1520                                 struct list_head *head)
1521 {
1522         loff_t min, max;
1523         struct rb_node *n, *t;
1524         struct uprobe *u;
1525
1526         INIT_LIST_HEAD(head);
1527         min = vaddr_to_offset(vma, start);
1528         max = min + (end - start) - 1;
1529
1530         read_lock(&uprobes_treelock);
1531         n = find_node_in_range(inode, min, max);
1532         if (n) {
1533                 for (t = n; t; t = rb_prev(t)) {
1534                         u = rb_entry(t, struct uprobe, rb_node);
1535                         if (u->inode != inode || u->offset < min)
1536                                 break;
1537                         /* if uprobe went away, it's safe to ignore it */
1538                         if (try_get_uprobe(u))
1539                                 list_add(&u->pending_list, head);
1540                 }
1541                 for (t = n; (t = rb_next(t)); ) {
1542                         u = rb_entry(t, struct uprobe, rb_node);
1543                         if (u->inode != inode || u->offset > max)
1544                                 break;
1545                         /* if uprobe went away, it's safe to ignore it */
1546                         if (try_get_uprobe(u))
1547                                 list_add(&u->pending_list, head);
1548                 }
1549         }
1550         read_unlock(&uprobes_treelock);
1551 }
1552
1553 /* @vma contains reference counter, not the probed instruction. */
1554 static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
1555 {
1556         struct list_head *pos, *q;
1557         struct delayed_uprobe *du;
1558         unsigned long vaddr;
1559         int ret = 0, err = 0;
1560
1561         mutex_lock(&delayed_uprobe_lock);
1562         list_for_each_safe(pos, q, &delayed_uprobe_list) {
1563                 du = list_entry(pos, struct delayed_uprobe, list);
1564
1565                 if (du->mm != vma->vm_mm ||
1566                     !valid_ref_ctr_vma(du->uprobe, vma))
1567                         continue;
1568
1569                 vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
1570                 ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
1571                 if (ret) {
1572                         update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
1573                         if (!err)
1574                                 err = ret;
1575                 }
1576                 delayed_uprobe_delete(du);
1577         }
1578         mutex_unlock(&delayed_uprobe_lock);
1579         return err;
1580 }
1581
1582 /*
1583  * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
1584  *
1585  * Currently we ignore all errors and always return 0; the callers
1586  * can't handle the failure anyway.
1587  */
1588 int uprobe_mmap(struct vm_area_struct *vma)
1589 {
1590         struct list_head tmp_list;
1591         struct uprobe *uprobe, *u;
1592         struct inode *inode;
1593
1594         if (no_uprobe_events())
1595                 return 0;
1596
1597         if (vma->vm_file &&
1598             (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
1599             test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
1600                 delayed_ref_ctr_inc(vma);
1601
1602         if (!valid_vma(vma, true))
1603                 return 0;
1604
1605         inode = file_inode(vma->vm_file);
1606         if (!inode)
1607                 return 0;
1608
1609         mutex_lock(uprobes_mmap_hash(inode));
1610         build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1611         /*
1612          * We can race with uprobe_unregister(); this uprobe can already be
1613          * removed. But in this case filter_chain() must return false, since
1614          * all consumers have gone away.
1615          */
1616         list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1617                 if (!fatal_signal_pending(current) &&
1618                     filter_chain(uprobe, vma->vm_mm)) {
1619                         unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1620                         install_breakpoint(uprobe, vma, vaddr);
1621                 }
1622                 put_uprobe(uprobe);
1623         }
1624         mutex_unlock(uprobes_mmap_hash(inode));
1625
1626         return 0;
1627 }
1628
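     /*
      * Check whether any uprobe is registered against the file range that
      * [start, end) of @vma maps.
      */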
1629 static bool
1630 vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1631 {
1632         loff_t min, max;
1633         struct inode *inode;
1634         struct rb_node *n;
1635
1636         inode = file_inode(vma->vm_file);
1637
1638         min = vaddr_to_offset(vma, start);
1639         max = min + (end - start) - 1;
1640
1641         read_lock(&uprobes_treelock);
1642         n = find_node_in_range(inode, min, max);
1643         read_unlock(&uprobes_treelock);
1644
1645         return !!n;
1646 }
1647
1648 /*
1649  * Called in context of a munmap of a vma.
1650  */
1651 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1652 {
1653         if (no_uprobe_events() || !valid_vma(vma, false))
1654                 return;
1655
1656         if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1657                 return;
1658
1659         if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1660              test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1661                 return;
1662
1663         if (vma_has_uprobes(vma, start, end))
1664                 set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1665 }
1666
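     /*
      * Fault handler for the "[uprobes]" special mapping: always hand out the
      * single XOL page of this mm's xol_area.
      */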
1667 static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
1668                             struct vm_area_struct *vma, struct vm_fault *vmf)
1669 {
1670         struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
1671
1672         vmf->page = area->page;
1673         get_page(vmf->page);
1674         return 0;
1675 }
1676
1677 static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
1678 {
1679         return -EPERM;
1680 }
1681
1682 static const struct vm_special_mapping xol_mapping = {
1683         .name = "[uprobes]",
1684         .fault = xol_fault,
1685         .mremap = xol_mremap,
1686 };
1687
1688 /* Slot allocation for XOL */
1689 static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1690 {
1691         struct vm_area_struct *vma;
1692         int ret;
1693
1694         if (mmap_write_lock_killable(mm))
1695                 return -EINTR;
1696
1697         if (mm->uprobes_state.xol_area) {
1698                 ret = -EALREADY;
1699                 goto fail;
1700         }
1701
1702         if (!area->vaddr) {
1703                 /* Try to map as high as possible; this is only a hint. */
1704                 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1705                                                 PAGE_SIZE, 0, 0);
1706                 if (IS_ERR_VALUE(area->vaddr)) {
1707                         ret = area->vaddr;
1708                         goto fail;
1709                 }
1710         }
1711
1712         vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1713                                 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
1714                                 VM_SEALED_SYSMAP,
1715                                 &xol_mapping);
1716         if (IS_ERR(vma)) {
1717                 ret = PTR_ERR(vma);
1718                 goto fail;
1719         }
1720
1721         ret = 0;
1722         /* pairs with get_xol_area() */
1723         smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
1724  fail:
1725         mmap_write_unlock(mm);
1726
1727         return ret;
1728 }
1729
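     /*
      * Default uretprobe trampoline: a single breakpoint instruction.
      * Architectures may override this with their own trampoline.
      */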
1730 void * __weak arch_uprobe_trampoline(unsigned long *psize)
1731 {
1732         static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1733
1734         *psize = UPROBE_SWBP_INSN_SIZE;
1735         return &insn;
1736 }
1737
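     /*
      * Allocate an xol_area (slot bitmap plus one page) for current->mm, seed
      * slot 0 with the uretprobe trampoline and install the "[uprobes]"
      * special mapping. Returns NULL on failure, including the case where
      * another area already won the race in xol_add_vma().
      */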
1738 static struct xol_area *__create_xol_area(unsigned long vaddr)
1739 {
1740         struct mm_struct *mm = current->mm;
1741         unsigned long insns_size;
1742         struct xol_area *area;
1743         void *insns;
1744
1745         area = kzalloc(sizeof(*area), GFP_KERNEL);
1746         if (unlikely(!area))
1747                 goto out;
1748
1749         area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
1750                                GFP_KERNEL);
1751         if (!area->bitmap)
1752                 goto free_area;
1753
1754         area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
1755         if (!area->page)
1756                 goto free_bitmap;
1757
1758         area->vaddr = vaddr;
1759         init_waitqueue_head(&area->wq);
1760         /* Reserve the 1st slot for get_trampoline_vaddr() */
1761         set_bit(0, area->bitmap);
1762         insns = arch_uprobe_trampoline(&insns_size);
1763         arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
1764
1765         if (!xol_add_vma(mm, area))
1766                 return area;
1767
1768         __free_page(area->page);
1769  free_bitmap:
1770         kfree(area->bitmap);
1771  free_area:
1772         kfree(area);
1773  out:
1774         return NULL;
1775 }
1776
1777 /*
1778  * get_xol_area - Allocate process's xol_area if necessary.
1779  * This area will be used for storing instructions for execution out of line.
1780  *
1781  * Returns the allocated area or NULL.
1782  */
1783 static struct xol_area *get_xol_area(void)
1784 {
1785         struct mm_struct *mm = current->mm;
1786         struct xol_area *area;
1787
1788         if (!mm->uprobes_state.xol_area)
1789                 __create_xol_area(0);
1790
1791         /* Pairs with xol_add_vma() smp_store_release() */
1792         area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
1793         return area;
1794 }
1795
1796 /*
1797  * uprobe_clear_state - Free the area allocated for slots.
1798  */
1799 void uprobe_clear_state(struct mm_struct *mm)
1800 {
1801         struct xol_area *area = mm->uprobes_state.xol_area;
1802
1803         mutex_lock(&delayed_uprobe_lock);
1804         delayed_uprobe_remove(NULL, mm);
1805         mutex_unlock(&delayed_uprobe_lock);
1806
1807         if (!area)
1808                 return;
1809
1810         put_page(area->page);
1811         kfree(area->bitmap);
1812         kfree(area);
1813 }
1814
1815 void uprobe_start_dup_mmap(void)
1816 {
1817         percpu_down_read(&dup_mmap_sem);
1818 }
1819
1820 void uprobe_end_dup_mmap(void)
1821 {
1822         percpu_up_read(&dup_mmap_sem);
1823 }
1824
1825 void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1826 {
1827         if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1828                 set_bit(MMF_HAS_UPROBES, &newmm->flags);
1829                 /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1830                 set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1831         }
1832 }
1833
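     /*
      * Claim a free slot in the XOL page, or return UINSNS_PER_PAGE if all
      * slots are currently in use.
      */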
1834 static unsigned long xol_get_slot_nr(struct xol_area *area)
1835 {
1836         unsigned long slot_nr;
1837
1838         slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1839         if (slot_nr < UINSNS_PER_PAGE) {
1840                 if (!test_and_set_bit(slot_nr, area->bitmap))
1841                         return slot_nr;
1842         }
1843
1844         return UINSNS_PER_PAGE;
1845 }
1846
1847 /*
1848  * xol_get_insn_slot - allocate a slot for xol.
1849  */
1850 static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
1851 {
1852         struct xol_area *area = get_xol_area();
1853         unsigned long slot_nr;
1854
1855         if (!area)
1856                 return false;
1857
1858         wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
1859
1860         utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
1861         arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
1862                               &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1863         return true;
1864 }
1865
1866 /*
1867  * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
1868  */
1869 static void xol_free_insn_slot(struct uprobe_task *utask)
1870 {
1871         struct xol_area *area = current->mm->uprobes_state.xol_area;
1872         unsigned long offset = utask->xol_vaddr - area->vaddr;
1873         unsigned int slot_nr;
1874
1875         utask->xol_vaddr = 0;
1876         /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
1877         if (WARN_ON_ONCE(offset >= PAGE_SIZE))
1878                 return;
1879
1880         slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1881         clear_bit(slot_nr, area->bitmap);
1882         smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
1883         if (waitqueue_active(&area->wq))
1884                 wake_up(&area->wq);
1885 }
1886
1887 void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1888                                   void *src, unsigned long len)
1889 {
1890         /* Initialize the slot */
1891         copy_to_page(page, vaddr, src, len);
1892
1893          * We probably need flush_icache_user_page(), but it needs a vma.
1894          * This should work on most architectures by default. If an
1895          * architecture needs to do something different, it can define
1896          * its own version of the function.
1897          * its own version of the function.
1898          */
1899         flush_dcache_page(page);
1900 }
1901
1902 /**
1903  * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1904  * @regs: Reflects the saved state of the task after it has hit a breakpoint
1905  * instruction.
1906  * Return the address of the breakpoint instruction.
1907  */
1908 unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1909 {
1910         return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1911 }
1912
1913 unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1914 {
1915         struct uprobe_task *utask = current->utask;
1916
1917         if (unlikely(utask && utask->active_uprobe))
1918                 return utask->vaddr;
1919
1920         return instruction_pointer(regs);
1921 }
1922
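     /*
      * Per-task pool of return_instance objects that can be reused without
      * waiting for an RCU grace period; see free_ret_instance().
      */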
1923 static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
1924 {
1925         ri->cons_cnt = 0;
1926         ri->next = utask->ri_pool;
1927         utask->ri_pool = ri;
1928 }
1929
1930 static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
1931 {
1932         struct return_instance *ri = utask->ri_pool;
1933
1934         if (likely(ri))
1935                 utask->ri_pool = ri->next;
1936
1937         return ri;
1938 }
1939
1940 static void ri_free(struct return_instance *ri)
1941 {
1942         kfree(ri->extra_consumers);
1943         kfree_rcu(ri, rcu);
1944 }
1945
1946 static void free_ret_instance(struct uprobe_task *utask,
1947                               struct return_instance *ri, bool cleanup_hprobe)
1948 {
1949         unsigned seq;
1950
1951         if (cleanup_hprobe) {
1952                 enum hprobe_state hstate;
1953
1954                 (void)hprobe_consume(&ri->hprobe, &hstate);
1955                 hprobe_finalize(&ri->hprobe, hstate);
1956         }
1957
1958         /*
1959          * At this point return_instance is unlinked from utask's
1960          * return_instances list and this has become visible to ri_timer().
1961          * If seqcount now indicates that ri_timer's return instance
1962          * processing loop isn't active, we can return ri into the pool of
1963          * to-be-reused return instances for future uretprobes. If ri_timer()
1964                  * happens to be running right now, though, we fall back to safety and
1965                  * just perform RCU-delayed freeing of ri.
1966          * Admittedly, this is a rather simple use of seqcount, but it nicely
1967          * abstracts away all the necessary memory barriers, so we use
1968          * a well-supported kernel primitive here.
1969          */
1970         if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
1971                 /* immediate reuse of ri without RCU GP is OK */
1972                 ri_pool_push(utask, ri);
1973         } else {
1974                 /* we might be racing with ri_timer(), so play it safe */
1975                 ri_free(ri);
1976         }
1977 }
1978
1979 /*
1980  * Called with no locks held.
1981  * Called in context of an exiting or an exec-ing thread.
1982  */
1983 void uprobe_free_utask(struct task_struct *t)
1984 {
1985         struct uprobe_task *utask = t->utask;
1986         struct return_instance *ri, *ri_next;
1987
1988         if (!utask)
1989                 return;
1990
1991         t->utask = NULL;
1992         WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
1993
1994         timer_delete_sync(&utask->ri_timer);
1995
1996         ri = utask->return_instances;
1997         while (ri) {
1998                 ri_next = ri->next;
1999                 free_ret_instance(utask, ri, true /* cleanup_hprobe */);
2000                 ri = ri_next;
2001         }
2002
2003         /* free_ret_instance() above might add to ri_pool, so this loop should come last */
2004         ri = utask->ri_pool;
2005         while (ri) {
2006                 ri_next = ri->next;
2007                 ri_free(ri);
2008                 ri = ri_next;
2009         }
2010
2011         kfree(utask);
2012 }
2013
2014 #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
2015
2016 #define for_each_ret_instance_rcu(pos, head) \
2017         for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
2018
2019 static void ri_timer(struct timer_list *timer)
2020 {
2021         struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
2022         struct return_instance *ri;
2023
2024         /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
2025         guard(srcu)(&uretprobes_srcu);
2026         /* RCU protects return_instance from freeing. */
2027         guard(rcu)();
2028
2029         /*
2030          * See free_ret_instance() for notes on seqcount use.
2031          * We also employ raw API variants to avoid lockdep false-positive
2032          * warning complaining about enabled preemption. The timer can only be
2033          * invoked once for a uprobe_task. Therefore there can only be one
2034          * writer. The reader does not require an even sequence count to make
2035          * progress, so it is OK to remain preemptible on PREEMPT_RT.
2036          */
2037         raw_write_seqcount_begin(&utask->ri_seqcount);
2038
2039         for_each_ret_instance_rcu(ri, utask->return_instances)
2040                 hprobe_expire(&ri->hprobe, false);
2041
2042         raw_write_seqcount_end(&utask->ri_seqcount);
2043 }
2044
2045 static struct uprobe_task *alloc_utask(void)
2046 {
2047         struct uprobe_task *utask;
2048
2049         utask = kzalloc(sizeof(*utask), GFP_KERNEL);
2050         if (!utask)
2051                 return NULL;
2052
2053         timer_setup(&utask->ri_timer, ri_timer, 0);
2054         seqcount_init(&utask->ri_seqcount);
2055
2056         return utask;
2057 }
2058
2059 /*
2060  * Allocate a uprobe_task object for the task if necessary.
2061  * Called when the thread hits a breakpoint.
2062  *
2063  * Returns:
2064  * - pointer to new uprobe_task on success
2065  * - NULL otherwise
2066  */
2067 static struct uprobe_task *get_utask(void)
2068 {
2069         if (!current->utask)
2070                 current->utask = alloc_utask();
2071         return current->utask;
2072 }
2073
2074 static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
2075 {
2076         struct return_instance *ri;
2077
2078         ri = ri_pool_pop(utask);
2079         if (ri)
2080                 return ri;
2081
2082         ri = kzalloc(sizeof(*ri), GFP_KERNEL);
2083         if (!ri)
2084                 return ZERO_SIZE_PTR;
2085
2086         return ri;
2087 }
2088
2089 static struct return_instance *dup_return_instance(struct return_instance *old)
2090 {
2091         struct return_instance *ri;
2092
2093         ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
2094         if (!ri)
2095                 return NULL;
2096
2097         if (unlikely(old->cons_cnt > 1)) {
2098                 ri->extra_consumers = kmemdup(old->extra_consumers,
2099                                               sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
2100                                               GFP_KERNEL);
2101                 if (!ri->extra_consumers) {
2102                         kfree(ri);
2103                         return NULL;
2104                 }
2105         }
2106
2107         return ri;
2108 }
2109
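     /*
      * Give the forked task its own uprobe_task with a copy of the parent's
      * pending return_instances, so uretprobe handling and nesting keep
      * working in the child.
      */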
2110 static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
2111 {
2112         struct uprobe_task *n_utask;
2113         struct return_instance **p, *o, *n;
2114         struct uprobe *uprobe;
2115
2116         n_utask = alloc_utask();
2117         if (!n_utask)
2118                 return -ENOMEM;
2119         t->utask = n_utask;
2120
2121         /* protect uprobes from freeing, we'll need to try_get_uprobe() them */
2122         guard(srcu)(&uretprobes_srcu);
2123
2124         p = &n_utask->return_instances;
2125         for (o = o_utask->return_instances; o; o = o->next) {
2126                 n = dup_return_instance(o);
2127                 if (!n)
2128                         return -ENOMEM;
2129
2130                 /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
2131                 uprobe = hprobe_expire(&o->hprobe, true);
2132
2133                 /*
2134                  * The new utask will have a stable, properly refcounted uprobe or
2135                  * NULL. Even if we failed to get a refcounted uprobe, we still
2136                  * need to preserve the full set of return_instances for proper
2137                  * uretprobe handling and nesting in the forked task.
2138                  */
2139                 hprobe_init_stable(&n->hprobe, uprobe);
2140
2141                 n->next = NULL;
2142                 rcu_assign_pointer(*p, n);
2143                 p = &n->next;
2144
2145                 n_utask->depth++;
2146         }
2147
2148         return 0;
2149 }
2150
2151 static void dup_xol_work(struct callback_head *work)
2152 {
2153         if (current->flags & PF_EXITING)
2154                 return;
2155
2156         if (!__create_xol_area(current->utask->dup_xol_addr) &&
2157                         !fatal_signal_pending(current))
2158                 uprobe_warn(current, "dup xol area");
2159 }
2160
2161 /*
2162  * Called in context of a new clone/fork from copy_process.
2163  */
2164 void uprobe_copy_process(struct task_struct *t, unsigned long flags)
2165 {
2166         struct uprobe_task *utask = current->utask;
2167         struct mm_struct *mm = current->mm;
2168         struct xol_area *area;
2169
2170         t->utask = NULL;
2171
2172         if (!utask || !utask->return_instances)
2173                 return;
2174
2175         if (mm == t->mm && !(flags & CLONE_VFORK))
2176                 return;
2177
2178         if (dup_utask(t, utask))
2179                 return uprobe_warn(t, "dup ret instances");
2180
2181         /* The task can fork() after dup_xol_work() fails */
2182         area = mm->uprobes_state.xol_area;
2183         if (!area)
2184                 return uprobe_warn(t, "dup xol area");
2185
2186         if (mm == t->mm)
2187                 return;
2188
2189         t->utask->dup_xol_addr = area->vaddr;
2190         init_task_work(&t->utask->dup_xol_work, dup_xol_work);
2191         task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
2192 }
2193
2194 /*
2195  * The current area->vaddr notion assumes the trampoline address is
2196  * always equal to area->vaddr.
2197  *
2198  * Returns -1 in case the xol_area is not allocated.
2199  */
2200 unsigned long uprobe_get_trampoline_vaddr(void)
2201 {
2202         unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
2203         struct xol_area *area;
2204
2205         /* Pairs with xol_add_vma() smp_store_release() */
2206         area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
2207         if (area)
2208                 trampoline_vaddr = area->vaddr;
2209
2210         return trampoline_vaddr;
2211 }
2212
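     /*
      * Pop and free the return instances whose stack frames are no longer
      * alive (e.g. discarded by longjmp()), as reported by
      * arch_uretprobe_is_alive().
      */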
2213 static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
2214                                         struct pt_regs *regs)
2215 {
2216         struct return_instance *ri = utask->return_instances, *ri_next;
2217         enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
2218
2219         while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
2220                 ri_next = ri->next;
2221                 rcu_assign_pointer(utask->return_instances, ri_next);
2222                 utask->depth--;
2223
2224                 free_ret_instance(utask, ri, true /* cleanup_hprobe */);
2225                 ri = ri_next;
2226         }
2227 }
2228
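     /*
      * Hijack the return address of the probed function so it returns through
      * the uretprobe trampoline, and push @ri, recording the original return
      * address, onto utask->return_instances.
      */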
2229 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
2230                               struct return_instance *ri)
2231 {
2232         struct uprobe_task *utask = current->utask;
2233         unsigned long orig_ret_vaddr, trampoline_vaddr;
2234         bool chained;
2235         int srcu_idx;
2236
2237         if (!get_xol_area())
2238                 goto free;
2239
2240         if (utask->depth >= MAX_URETPROBE_DEPTH) {
2241                 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
2242                                 " nestedness limit pid/tgid=%d/%d\n",
2243                                 current->pid, current->tgid);
2244                 goto free;
2245         }
2246
2247         trampoline_vaddr = uprobe_get_trampoline_vaddr();
2248         orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
2249         if (orig_ret_vaddr == -1)
2250                 goto free;
2251
2252         /* drop the entries invalidated by longjmp() */
2253         chained = (orig_ret_vaddr == trampoline_vaddr);
2254         cleanup_return_instances(utask, chained, regs);
2255
2256         /*
2257          * We don't want to keep the trampoline address on the stack; rather, keep
2258          * the original return address of the first caller through all the
2259          * subsequent instances. This also makes breakpoint unwrapping easier.
2260          */
2261         if (chained) {
2262                 if (!utask->return_instances) {
2263                         /*
2264                          * This situation is not possible. Likely we have an
2265                          * attack from user-space.
2266                          */
2267                         uprobe_warn(current, "handle tail call");
2268                         goto free;
2269                 }
2270                 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
2271         }
2272
2273         /* __srcu_read_lock() because SRCU lock survives switch to user space */
2274         srcu_idx = __srcu_read_lock(&uretprobes_srcu);
2275
2276         ri->func = instruction_pointer(regs);
2277         ri->stack = user_stack_pointer(regs);
2278         ri->orig_ret_vaddr = orig_ret_vaddr;
2279         ri->chained = chained;
2280
2281         utask->depth++;
2282
2283         hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
2284         ri->next = utask->return_instances;
2285         rcu_assign_pointer(utask->return_instances, ri);
2286
2287         mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
2288
2289         return;
2290 free:
2291         ri_free(ri);
2292 }
2293
2294 /* Prepare to single-step probed instruction out of line. */
2295 static int
2296 pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
2297 {
2298         struct uprobe_task *utask = current->utask;
2299         int err;
2300
2301         if (!try_get_uprobe(uprobe))
2302                 return -EINVAL;
2303
2304         if (!xol_get_insn_slot(uprobe, utask)) {
2305                 err = -ENOMEM;
2306                 goto err_out;
2307         }
2308
2309         utask->vaddr = bp_vaddr;
2310         err = arch_uprobe_pre_xol(&uprobe->arch, regs);
2311         if (unlikely(err)) {
2312                 xol_free_insn_slot(utask);
2313                 goto err_out;
2314         }
2315
2316         utask->active_uprobe = uprobe;
2317         utask->state = UTASK_SSTEP;
2318         return 0;
2319 err_out:
2320         put_uprobe(uprobe);
2321         return err;
2322 }
2323
2324 /*
2325  * If we are singlestepping, then ensure this thread is not connected to
2326  * non-fatal signals until completion of the singlestep. When the xol insn
2327  * itself triggers the signal, restart the original insn even if the task is
2328  * already SIGKILL'ed (since the coredump should report the correct ip). This
2329  * is even more important if the task has a handler for SIGSEGV/etc: the
2330  * _same_ instruction should be repeated again after return from the signal
2331  * handler, and SSTEP can never finish in this case.
2332  */
2333 bool uprobe_deny_signal(void)
2334 {
2335         struct task_struct *t = current;
2336         struct uprobe_task *utask = t->utask;
2337
2338         if (likely(!utask || !utask->active_uprobe))
2339                 return false;
2340
2341         WARN_ON_ONCE(utask->state != UTASK_SSTEP);
2342
2343         if (task_sigpending(t)) {
2344                 utask->signal_denied = true;
2345                 clear_tsk_thread_flag(t, TIF_SIGPENDING);
2346
2347                 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
2348                         utask->state = UTASK_SSTEP_TRAPPED;
2349                         set_tsk_thread_flag(t, TIF_UPROBE);
2350                 }
2351         }
2352
2353         return true;
2354 }
2355
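     /*
      * Clear MMF_HAS_UPROBES if no vma of @mm still maps a registered uprobe.
      */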
2356 static void mmf_recalc_uprobes(struct mm_struct *mm)
2357 {
2358         VMA_ITERATOR(vmi, mm, 0);
2359         struct vm_area_struct *vma;
2360
2361         for_each_vma(vmi, vma) {
2362                 if (!valid_vma(vma, false))
2363                         continue;
2364                 /*
2365                  * This is not strictly accurate: we can race with
2366                  * uprobe_unregister() and see the already removed
2367                  * uprobe if delete_uprobe() was not yet called.
2368                  * Or this uprobe can be filtered out.
2369                  */
2370                 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
2371                         return;
2372         }
2373
2374         clear_bit(MMF_HAS_UPROBES, &mm->flags);
2375 }
2376
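     /*
      * Read the instruction at @vaddr, first via __get_user() and, failing
      * that, via get_user_pages(), and report whether it is a trap
      * instruction. Returns a negative error code if the memory cannot be
      * accessed.
      */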
2377 static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
2378 {
2379         struct page *page;
2380         uprobe_opcode_t opcode;
2381         int result;
2382
2383         if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
2384                 return -EINVAL;
2385
2386         pagefault_disable();
2387         result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
2388         pagefault_enable();
2389
2390         if (likely(result == 0))
2391                 goto out;
2392
2393         result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
2394         if (result < 0)
2395                 return result;
2396
2397         copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
2398         put_page(page);
2399  out:
2400         /* This needs to return true for any variant of the trap insn */
2401         return is_trap_insn(&opcode);
2402 }
2403
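     /*
      * Try to look up the uprobe for @bp_vaddr without taking mmap_lock,
      * validating the result against the mmap_lock sequence count. Returns
      * NULL if the speculative lookup cannot be trusted.
      */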
2404 static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
2405 {
2406         struct mm_struct *mm = current->mm;
2407         struct uprobe *uprobe = NULL;
2408         struct vm_area_struct *vma;
2409         struct file *vm_file;
2410         loff_t offset;
2411         unsigned int seq;
2412
2413         guard(rcu)();
2414
2415         if (!mmap_lock_speculate_try_begin(mm, &seq))
2416                 return NULL;
2417
2418         vma = vma_lookup(mm, bp_vaddr);
2419         if (!vma)
2420                 return NULL;
2421
2422         /*
2423          * vm_file memory can be reused for another instance of struct file,
2424          * but can't be freed from under us, so it's safe to read fields from
2425          * it, even if the values are some garbage values; ultimately
2426          * find_uprobe_rcu() + mmap_lock_speculate_retry() check will ensure
2427          * that whatever we speculatively found is correct.
2428          */
2429         vm_file = READ_ONCE(vma->vm_file);
2430         if (!vm_file)
2431                 return NULL;
2432
2433         offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
2434         uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
2435         if (!uprobe)
2436                 return NULL;
2437
2438         /* now double check that nothing about MM changed */
2439         if (mmap_lock_speculate_retry(mm, seq))
2440                 return NULL;
2441
2442         return uprobe;
2443 }
2444
2445 /* assumes being inside RCU protected region */
2446 static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
2447 {
2448         struct mm_struct *mm = current->mm;
2449         struct uprobe *uprobe = NULL;
2450         struct vm_area_struct *vma;
2451
2452         uprobe = find_active_uprobe_speculative(bp_vaddr);
2453         if (uprobe)
2454                 return uprobe;
2455
2456         mmap_read_lock(mm);
2457         vma = vma_lookup(mm, bp_vaddr);
2458         if (vma) {
2459                 if (vma->vm_file) {
2460                         struct inode *inode = file_inode(vma->vm_file);
2461                         loff_t offset = vaddr_to_offset(vma, bp_vaddr);
2462
2463                         uprobe = find_uprobe_rcu(inode, offset);
2464                 }
2465
2466                 if (!uprobe)
2467                         *is_swbp = is_trap_at_addr(mm, bp_vaddr);
2468         } else {
2469                 *is_swbp = -EFAULT;
2470         }
2471
2472         if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
2473                 mmf_recalc_uprobes(mm);
2474         mmap_read_unlock(mm);
2475
2476         return uprobe;
2477 }
2478
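     /*
      * Record a (consumer id, cookie) pair in @ri. The first consumer lives in
      * ri->consumer; later ones go into the krealloc'ed extra_consumers array.
      * On allocation failure @ri is freed and ZERO_SIZE_PTR is returned as a
      * sentinel.
      */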
2479 static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
2480 {
2481         struct return_consumer *ric;
2482
2483         if (unlikely(ri == ZERO_SIZE_PTR))
2484                 return ri;
2485
2486         if (unlikely(ri->cons_cnt > 0)) {
2487                 ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
2488                 if (!ric) {
2489                         ri_free(ri);
2490                         return ZERO_SIZE_PTR;
2491                 }
2492                 ri->extra_consumers = ric;
2493         }
2494
2495         ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
2496         ric->id = id;
2497         ric->cookie = cookie;
2498
2499         ri->cons_cnt++;
2500         return ri;
2501 }
2502
2503 static struct return_consumer *
2504 return_consumer_find(struct return_instance *ri, int *iter, int id)
2505 {
2506         struct return_consumer *ric;
2507         int idx;
2508
2509         for (idx = *iter; idx < ri->cons_cnt; idx++) {
2511                 ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
2512                 if (ric->id == id) {
2513                         *iter = idx + 1;
2514                         return ric;
2515                 }
2516         }
2517
2518         return NULL;
2519 }
2520
2521 static bool ignore_ret_handler(int rc)
2522 {
2523         return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
2524 }
2525
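     /*
      * Run the ->handler() of every consumer of @uprobe, collect the return
      * consumers into a return_instance for prepare_uretprobe(), and remove
      * the breakpoint from this mm if all handlers requested removal and no
      * remaining filter still wants it.
      */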
2526 static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
2527 {
2528         struct uprobe_consumer *uc;
2529         bool has_consumers = false, remove = true;
2530         struct return_instance *ri = NULL;
2531         struct uprobe_task *utask = current->utask;
2532
2533         utask->auprobe = &uprobe->arch;
2534
2535         list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
2536                 bool session = uc->handler && uc->ret_handler;
2537                 __u64 cookie = 0;
2538                 int rc = 0;
2539
2540                 if (uc->handler) {
2541                         rc = uc->handler(uc, regs, &cookie);
2542                         WARN(rc < 0 || rc > 2,
2543                                 "bad rc=0x%x from %ps()\n", rc, uc->handler);
2544                 }
2545
2546                 remove &= rc == UPROBE_HANDLER_REMOVE;
2547                 has_consumers = true;
2548
2549                 if (!uc->ret_handler || ignore_ret_handler(rc))
2550                         continue;
2551
2552                 if (!ri)
2553                         ri = alloc_return_instance(utask);
2554
2555                 if (session)
2556                         ri = push_consumer(ri, uc->id, cookie);
2557         }
2558         utask->auprobe = NULL;
2559
2560         if (!ZERO_OR_NULL_PTR(ri))
2561                 prepare_uretprobe(uprobe, regs, ri);
2562
2563         if (remove && has_consumers) {
2564                 down_read(&uprobe->register_rwsem);
2565
2566                 /* re-check that removal is still required, this time under lock */
2567                 if (!filter_chain(uprobe, current->mm)) {
2568                         WARN_ON(!uprobe_is_active(uprobe));
2569                         unapply_uprobe(uprobe, current->mm);
2570                 }
2571
2572                 up_read(&uprobe->register_rwsem);
2573         }
2574 }
2575
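     /*
      * Run the ->ret_handler() of each consumer; session consumers are only
      * invoked if a matching entry was recorded on function entry.
      */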
2576 static void
2577 handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
2578 {
2579         struct return_consumer *ric;
2580         struct uprobe_consumer *uc;
2581         int ric_idx = 0;
2582
2583         /* all consumers unsubscribed meanwhile */
2584         if (unlikely(!uprobe))
2585                 return;
2586
2587         rcu_read_lock_trace();
2588         list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
2589                 bool session = uc->handler && uc->ret_handler;
2590
2591                 if (uc->ret_handler) {
2592                         ric = return_consumer_find(ri, &ric_idx, uc->id);
2593                         if (!session || ric)
2594                                 uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
2595                 }
2596         }
2597         rcu_read_unlock_trace();
2598 }
2599
2600 static struct return_instance *find_next_ret_chain(struct return_instance *ri)
2601 {
2602         bool chained;
2603
2604         do {
2605                 chained = ri->chained;
2606                 ri = ri->next;  /* can't be NULL if chained */
2607         } while (chained);
2608
2609         return ri;
2610 }
2611
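     /*
      * Called when the task returns through the uretprobe trampoline: restore
      * the original return address, pop the pending return instances and run
      * their ret_handlers for the chains that are still alive.
      */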
2612 void uprobe_handle_trampoline(struct pt_regs *regs)
2613 {
2614         struct uprobe_task *utask;
2615         struct return_instance *ri, *ri_next, *next_chain;
2616         struct uprobe *uprobe;
2617         enum hprobe_state hstate;
2618         bool valid;
2619
2620         utask = current->utask;
2621         if (!utask)
2622                 goto sigill;
2623
2624         ri = utask->return_instances;
2625         if (!ri)
2626                 goto sigill;
2627
2628         do {
2629                 /*
2630                  * We should throw out the frames invalidated by longjmp().
2631                  * If this chain is valid, then the next one should be alive
2632                  * or NULL; the latter case means that nobody but ri->func
2633                  * could hit this trampoline on return. TODO: sigaltstack().
2634                  */
2635                 next_chain = find_next_ret_chain(ri);
2636                 valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
2637
2638                 instruction_pointer_set(regs, ri->orig_ret_vaddr);
2639                 do {
2640                         /* pop current instance from the stack of pending return instances,
2641                          * as it's not pending anymore: we just fixed up original
2642                          * instruction pointer in regs and are about to call handlers;
2643                          * this allows fixup_uretprobe_trampoline_entries() to properly fix up
2644                          * captured stack traces from uretprobe handlers, in which pending
2645                          * trampoline addresses on the stack are replaced with correct
2646                          * original return addresses
2647                          */
2648                         ri_next = ri->next;
2649                         rcu_assign_pointer(utask->return_instances, ri_next);
2650                         utask->depth--;
2651
2652                         uprobe = hprobe_consume(&ri->hprobe, &hstate);
2653                         if (valid)
2654                                 handle_uretprobe_chain(ri, uprobe, regs);
2655                         hprobe_finalize(&ri->hprobe, hstate);
2656
2657                         /* We already took care of hprobe, no need to waste more time on that. */
2658                         free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
2659                         ri = ri_next;
2660                 } while (ri != next_chain);
2661         } while (!valid);
2662
2663         return;
2664
2665 sigill:
2666         uprobe_warn(current, "handle uretprobe, sending SIGILL.");
2667         force_sig(SIGILL);
2668 }
2669
2670 bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
2671 {
2672         return false;
2673 }
2674
2675 bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
2676                                         struct pt_regs *regs)
2677 {
2678         return true;
2679 }
2680
2681 /*
2682  * Run handler and ask thread to singlestep.
2683  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
2684  */
2685 static void handle_swbp(struct pt_regs *regs)
2686 {
2687         struct uprobe *uprobe;
2688         unsigned long bp_vaddr;
2689         int is_swbp;
2690
2691         bp_vaddr = uprobe_get_swbp_addr(regs);
2692         if (bp_vaddr == uprobe_get_trampoline_vaddr())
2693                 return uprobe_handle_trampoline(regs);
2694
2695         rcu_read_lock_trace();
2696
2697         uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
2698         if (!uprobe) {
2699                 if (is_swbp > 0) {
2700                         /* No matching uprobe; signal SIGTRAP. */
2701                         force_sig(SIGTRAP);
2702                 } else {
2703                         /*
2704                          * Either we raced with uprobe_unregister() or we can't
2705                          * access this memory. The latter is only possible if
2706                          * another thread plays with our ->mm. In both cases
2707                          * we can simply restart. If this vma was unmapped we
2708                          * can pretend this insn was not executed yet and get
2709                          * the (correct) SIGSEGV after restart.
2710                          */
2711                         instruction_pointer_set(regs, bp_vaddr);
2712                 }
2713                 goto out;
2714         }
2715
2716         /* change it in advance for ->handler() and restart */
2717         instruction_pointer_set(regs, bp_vaddr);
2718
2719         /*
2720          * TODO: move copy_insn/etc into _register and remove this hack.
2721          * After we hit the bp, _unregister + _register can install the
2722          * new and not-yet-analyzed uprobe at the same address, restart.
2723          */
2724         if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
2725                 goto out;
2726
2727         /*
2728          * Pairs with the smp_wmb() in prepare_uprobe().
2729          *
2730          * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
2731          * we must also see the stores to &uprobe->arch performed by the
2732          * prepare_uprobe() call.
2733          */
2734         smp_rmb();
2735
2736         /* Tracing handlers use ->utask to communicate with fetch methods */
2737         if (!get_utask())
2738                 goto out;
2739
2740         if (arch_uprobe_ignore(&uprobe->arch, regs))
2741                 goto out;
2742
2743         handler_chain(uprobe, regs);
2744
2745         if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
2746                 goto out;
2747
2748         if (pre_ssout(uprobe, regs, bp_vaddr))
2749                 goto out;
2750
2751 out:
2752         /* arch_uprobe_skip_sstep() succeeded, or restart if we can't singlestep */
2753         rcu_read_unlock_trace();
2754 }
2755
2756 /*
2757  * Perform required fix-ups and disable singlestep.
2758  * Allow pending signals to take effect.
2759  */
2760 static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
2761 {
2762         struct uprobe *uprobe;
2763         int err = 0;
2764
2765         uprobe = utask->active_uprobe;
2766         if (utask->state == UTASK_SSTEP_ACK)
2767                 err = arch_uprobe_post_xol(&uprobe->arch, regs);
2768         else if (utask->state == UTASK_SSTEP_TRAPPED)
2769                 arch_uprobe_abort_xol(&uprobe->arch, regs);
2770         else
2771                 WARN_ON_ONCE(1);
2772
2773         put_uprobe(uprobe);
2774         utask->active_uprobe = NULL;
2775         utask->state = UTASK_RUNNING;
2776         xol_free_insn_slot(utask);
2777
2778         if (utask->signal_denied) {
2779                 set_thread_flag(TIF_SIGPENDING);
2780                 utask->signal_denied = false;
2781         }
2782
2783         if (unlikely(err)) {
2784                 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
2785                 force_sig(SIGILL);
2786         }
2787 }
2788
2789 /*
2790  * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
2791  * allows the thread to return from interrupt. After that handle_swbp()
2792  * sets utask->active_uprobe.
2793  *
2794  * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
2795  * and allows the thread to return from interrupt.
2796  *
2797  * While returning to userspace, thread notices the TIF_UPROBE flag and calls
2798  * uprobe_notify_resume().
2799  */
2800 void uprobe_notify_resume(struct pt_regs *regs)
2801 {
2802         struct uprobe_task *utask;
2803
2804         clear_thread_flag(TIF_UPROBE);
2805
2806         utask = current->utask;
2807         if (utask && utask->active_uprobe)
2808                 handle_singlestep(utask, regs);
2809         else
2810                 handle_swbp(regs);
2811 }
2812
2813 /*
2814  * uprobe_pre_sstep_notifier gets called from interrupt context as part of the
2815  * notifier mechanism. Set the TIF_UPROBE flag and indicate a breakpoint hit.
2816  */
2817 int uprobe_pre_sstep_notifier(struct pt_regs *regs)
2818 {
2819         if (!current->mm)
2820                 return 0;
2821
2822         if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
2823             (!current->utask || !current->utask->return_instances))
2824                 return 0;
2825
2826         set_thread_flag(TIF_UPROBE);
2827         return 1;
2828 }
2829
2830 /*
2831  * uprobe_post_sstep_notifier gets called in interrupt context as part of the
2832  * notifier mechanism. Set the TIF_UPROBE flag and indicate completion of singlestep.
2833  */
2834 int uprobe_post_sstep_notifier(struct pt_regs *regs)
2835 {
2836         struct uprobe_task *utask = current->utask;
2837
2838         if (!current->mm || !utask || !utask->active_uprobe)
2839                 /* task is currently not uprobed */
2840                 return 0;
2841
2842         utask->state = UTASK_SSTEP_ACK;
2843         set_thread_flag(TIF_UPROBE);
2844         return 1;
2845 }
2846
2847 static struct notifier_block uprobe_exception_nb = {
2848         .notifier_call          = arch_uprobe_exception_notify,
2849         .priority               = INT_MAX-1,    /* notified after kprobes, kgdb */
2850 };
2851
2852 void __init uprobes_init(void)
2853 {
2854         int i;
2855
2856         for (i = 0; i < UPROBES_HASH_SZ; i++)
2857                 mutex_init(&uprobes_mmap_mutex[i]);
2858
2859         BUG_ON(register_die_notifier(&uprobe_exception_nb));
2860 }