diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e4056547fbe6151ab79bfed1609446a2c99ee9e4..05d59f74fc887f1bcd781c550387d12e00d2cac6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ *     fd_wqh.lock
+ *             fault_pending_wqh.lock
+ *                     fault_wqh.lock
+ *             event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+       /* waitqueue head for the pending (i.e. not read) userfaults */
+       wait_queue_head_t fault_pending_wqh;
+       /* waitqueue head for the userfaults */
+       wait_queue_head_t fault_wqh;
+       /* waitqueue head for the pseudo fd to wakeup poll/read */
+       wait_queue_head_t fd_wqh;
+       /* waitqueue head for events */
+       wait_queue_head_t event_wqh;
+       /* a refile sequence protected by fault_pending_wqh lock */
+       seqcount_spinlock_t refile_seq;
+       /* pseudo fd refcounting */
+       refcount_t refcount;
+       /* userfaultfd syscall flags */
+       unsigned int flags;
+       /* features requested from userspace */
+       unsigned int features;
+       /* released */
+       bool released;
+       /*
+        * Prevents userfaultfd operations (fill/move/wp) from happening while
+        * some non-cooperative event(s) are taking place. Increments are done
+        * in write mode, whereas userfaultfd operations, which include
+        * reading mmap_changing, are done in read mode.
+        */
+       struct rw_semaphore map_changing_lock;
+       /* memory mappings are changing because of non-cooperative event */
+       atomic_t mmap_changing;
+       /* mm with one or more vmas attached to this userfaultfd_ctx */
+       struct mm_struct *mm;
+};
+
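To make the documented ordering concrete, here is a minimal sketch of the refile step that the locking comment and refile_seq describe: a pending fault is moved from fault_pending_wqh to fault_wqh with the outer lock taken IRQ-safe and the inner lock nested under it. The helper name is illustrative; the real logic lives in userfaultfd_ctx_read() in fs/userfaultfd.c.

#include <linux/wait.h>
#include <linux/list.h>
#include <linux/seqlock.h>
#include <linux/userfaultfd_k.h>

/*
 * Illustrative sketch, not the kernel's actual code: move one fault
 * from the pending queue to fault_wqh, following the documented lock
 * order. fault_pending_wqh.lock is the outer lock and is taken with
 * IRQs disabled; fault_wqh.lock nests inside it. The refile_seq write
 * section lets lockless readers detect a refile in progress (it is
 * associated with fault_pending_wqh.lock, which must be held here).
 */
static void uffd_refile_one(struct userfaultfd_ctx *ctx,
			    wait_queue_entry_t *wq)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);	/* outer, IRQs off */
	write_seqcount_begin(&ctx->refile_seq);
	spin_lock(&ctx->fault_wqh.lock);		/* nested inner lock */
	list_move(&wq->entry, &ctx->fault_wqh.head);
	spin_unlock(&ctx->fault_wqh.lock);
	write_seqcount_end(&ctx->refile_seq);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}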
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 /* A combined operation mode + behavior flags. */
@@ -74,31 +120,26 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
                                    unsigned long dst_addr, struct page *page,
                                    bool newly_allocated, uffd_flags_t flags);
 
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                                 unsigned long src_start, unsigned long len,
-                                atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+                                uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
                                     unsigned long dst_start,
-                                    unsigned long len,
-                                    atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-                                    unsigned long len, atomic_t *mmap_changing,
-                                    uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
-                                  unsigned long len, atomic_t *mmap_changing,
-                                  uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
-                              unsigned long start, unsigned long len,
-                              bool enable_wp, atomic_t *mmap_changing);
+                                    unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+                                    unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+                                  unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+                              unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
                          unsigned long start, unsigned long len, bool enable_wp);
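The dropped mmap_changing parameters reflect that the check now happens inside these helpers, under the ctx's map_changing_lock. Below is a hedged sketch of the read-side pattern the struct comment describes; the function name is illustrative and the real checks live in mm/userfaultfd.c.

#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/userfaultfd_k.h>

/*
 * Sketch of the read-mode pattern, not verbatim mm/userfaultfd.c code.
 * Non-cooperative events (fork, mremap, madvise, ...) bump
 * mmap_changing while holding the semaphore in write mode; operations
 * take it in read mode and back off with -EAGAIN if an event is in
 * flight.
 */
static ssize_t uffd_op_sketch(struct userfaultfd_ctx *ctx)
{
	ssize_t ret = -EAGAIN;

	down_read(&ctx->map_changing_lock);
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;	/* mappings are changing; retry later */

	/* ... do the fill/continue/poison/wp work on ctx->mm ... */
	ret = 0;

out_unlock:
	up_read(&ctx->map_changing_lock);
	return ret;
}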
 
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-                  unsigned long dst_start, unsigned long src_start,
-                  unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+                  unsigned long src_start, unsigned long len, __u64 flags);
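At the ioctl call sites the change looks roughly like this (paraphrased from fs/userfaultfd.c, not verbatim): the mm and mmap_changing arguments disappear because both are now reachable through ctx.

/* Before (paraphrased): mm and mmap_changing passed separately. */
ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
			uffdio_copy.len, &ctx->mmap_changing, flags);
ret = move_pages(ctx, ctx->mm, uffdio_move.dst, uffdio_move.src,
		 uffdio_move.len, uffdio_move.mode);

/* After: both are derived from ctx inside the helpers. */
ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
			uffdio_copy.len, flags);
ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
		 uffdio_move.len, uffdio_move.mode);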
 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
                        struct vm_area_struct *dst_vma,
                        struct vm_area_struct *src_vma,