include/linux/hmm.h

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Copyright 2013 Red Hat Inc.
   4  *
   5  * Authors: Jérôme Glisse <jglisse@redhat.com>
   6  */
   7 /*
   8  * Heterogeneous Memory Management (HMM)
   9  *
  10  * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and
  11  * what it is for. Here we focus on the HMM API description, with some
  12  * explanation of the underlying implementation.
  13  *
  14  * Short description: HMM provides a set of helpers to share a virtual address
  15  * space between CPU and a device, so that the device can access any valid
  16  * address of the process (while still obeying memory protection). HMM also
  17  * provides helpers to migrate process memory to device memory, and back. Each
  18  * set of functionality (address space mirroring, and migration to and from
  19  * device memory) can be used independently of the other.
  20  *
  21  *
  22  * HMM address space mirroring API:
  23  *
  24  * Use HMM address space mirroring if you want to mirror a range of the CPU page
  25  * table of a process into a device page table. Here, "mirror" means "keep
  26  * synchronized". Prerequisites: the device must provide the ability to write-
  27  * protect its page tables (at PAGE_SIZE granularity), and must be able to
  28  * recover from the resulting potential page faults.
  29  *
  30  * HMM guarantees that at any point in time, a given virtual address points to
  31  * either the same memory in both CPU and device page tables (that is: CPU and
  32  * device page tables each point to the same pages), or that one page table (CPU
  33  * or device) points to no entry, while the other still points to the old page
  34  * for the address. The latter case happens when the CPU page table update
  35  * happens first, and then the update is mirrored over to the device page table.
  36  * This does not cause any issue, because the CPU page table cannot start
  37  * pointing to a new page until the device page table is invalidated.
  38  *
  39  * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
  40  * updates to each device driver that has registered a mirror. It also provides
  41  * some API calls to help with taking a snapshot of the CPU page table, and to
  42  * synchronize with any updates that might happen concurrently.
  43  *
  44  *
  45  * HMM migration to and from device memory:
  46  *
  47  * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
  48  * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
  49  * of the device memory, and allows the device driver to manage its memory
  50  * using those struct pages. Having struct pages for device memory makes
  51  * migration easier. Because that memory is not addressable by the CPU it must
  52  * never be pinned to the device; in other words, any CPU page fault can always
  53  * cause the device memory to be migrated (copied/moved) back to regular memory.
  54  *
  55  * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
  56  * allows use of a device DMA engine to perform the copy operation between
  57  * regular system memory and device memory.
  58  */
  59 #ifndef LINUX_HMM_H
  60 #define LINUX_HMM_H
  61
  62 #include <linux/kconfig.h>
  63 #include <asm/pgtable.h>
  64
  65 #if IS_ENABLED(CONFIG_HMM)
  66
  67 #include <linux/device.h>
  68 #include <linux/migrate.h>
  69 #include <linux/memremap.h>
  70 #include <linux/completion.h>
  71 #include <linux/mmu_notifier.h>
  72
  73
  74 /*
  75  * struct hmm - HMM per mm struct
  76  *
  77  * @mm: mm struct this HMM struct is bound to
  78  * @lock: lock protecting ranges list
  79  * @ranges: list of range being snapshotted
  80  * @mirrors: list of mirrors for this mm
  81  * @mmu_notifier: mmu notifier to track updates to CPU page table
  82  * @mirrors_sem: read/write semaphore protecting the mirrors list
  83  * @wq: wait queue for user waiting on a range invalidation
  84  * @notifiers: count of active mmu notifiers
  85  * @dead: is the mm dead ?
  86  */
  87 struct hmm {
  88         struct mm_struct        *mm;
  89         struct kref             kref;
  90         struct mutex            lock;
  91         struct list_head        ranges;
  92         struct list_head        mirrors;
  93         struct mmu_notifier     mmu_notifier;
  94         struct rw_semaphore     mirrors_sem;
  95         wait_queue_head_t       wq;
  96         long                    notifiers;
  97         bool                    dead;
  98 };
  99
 100 /*
 101  * hmm_pfn_flag_e - HMM flag enums
 102  *
 103  * Flags:
 104  * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
 105  * HMM_PFN_WRITE: CPU page table has write permission set
 106  * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
 107  *
 108  * The driver provide a flags array, if driver valid bit for an entry is bit
 109  * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide
 110  * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
 111  * Same logic apply to all flags. This is same idea as vm_page_prot in vma
 112  * except that this is per device driver rather than per architecture.
 113  */
 114 enum hmm_pfn_flag_e {
 115         HMM_PFN_VALID = 0,
 116         HMM_PFN_WRITE,
 117         HMM_PFN_DEVICE_PRIVATE,
 118         HMM_PFN_FLAG_MAX
 119 };
 120
 121 /*
 122  * hmm_pfn_value_e - HMM pfn special value
 123  *
 124  * Flags:
 125  * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
 126  * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
 127  * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
 128  *      result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
 129  *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
 130  *      set and the pfn value is undefined.
 131  *
 132  * Driver provide entry value for none entry, error entry and special entry,
 133  * driver can alias (ie use same value for error and special for instance). It
 134  * should not alias none and error or special.
 135  *
 136  * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
 137  * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
 138  * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table
 139  * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
 140  */
 141 enum hmm_pfn_value_e {
 142         HMM_PFN_ERROR,
 143         HMM_PFN_NONE,
 144         HMM_PFN_SPECIAL,
 145         HMM_PFN_VALUE_MAX
 146 };
 147
 148 /*
 149  * struct hmm_range - track invalidation lock on virtual address range
 150  *
 151  * @hmm: the core HMM structure this range is active against
 152  * @vma: the vm area struct for the range
 153  * @list: all range lock are on a list
 154  * @start: range virtual start address (inclusive)
 155  * @end: range virtual end address (exclusive)
 156  * @pfns: array of pfns (big enough for the range)
 157  * @flags: pfn flags to match device driver page table
 158  * @values: pfn value for some special case (none, special, error, ...)
 159  * @default_flags: default flags for the range (write, read, ... see hmm doc)
 160  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
 161  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
 162  * @valid: pfns array did not change since it has been fill by an HMM function
 163  */
 164 struct hmm_range {
 165         struct hmm              *hmm;
 166         struct vm_area_struct   *vma;
 167         struct list_head        list;
 168         unsigned long           start;
 169         unsigned long           end;
 170         uint64_t                *pfns;
 171         const uint64_t          *flags;
 172         const uint64_t          *values;
 173         uint64_t                default_flags;
 174         uint64_t                pfn_flags_mask;
 175         uint8_t                 page_shift;
 176         uint8_t                 pfn_shift;
 177         bool                    valid;
 178 };
 179
 180 /*
 181  * hmm_range_page_shift() - return the page shift for the range
 182  * @range: range being queried
 183  * Returns: page shift (page size = 1 << page shift) for the range
 184  */
 185 static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
 186 {
 187         return range->page_shift;
 188 }
 189
 190 /*
 191  * hmm_range_page_size() - return the page size for the range
 192  * @range: range being queried
 193  * Returns: page size for the range in bytes
 194  */
 195 static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
 196 {
 197         return 1UL << hmm_range_page_shift(range);
 198 }
 199
 200 /*
 201  * hmm_range_wait_until_valid() - wait for range to be valid
 202  * @range: range affected by invalidation to wait on
 203  * @timeout: time out for wait in ms (ie abort wait after that period of time)
 204  * Returns: true if the range is valid, false otherwise.
 205  */
 206 static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 207                                               unsigned long timeout)
 208 {
 209         /* Check if mm is dead ? */
 210         if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
 211                 range->valid = false;
 212                 return false;
 213         }
 214         if (range->valid)
 215                 return true;
 216         wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
 217                            msecs_to_jiffies(timeout));
 218         /* Return current valid status just in case we get lucky */
 219         return range->valid;
 220 }
 221
 222 /*
 223  * hmm_range_valid() - test if a range is valid or not
 224  * @range: range
 225  * Returns: true if the range is valid, false otherwise.
 226  */
 227 static inline bool hmm_range_valid(struct hmm_range *range)
 228 {
 229         return range->valid;
 230 }
 231
 232 /*
 233  * hmm_device_entry_to_page() - return struct page pointed to by a device entry
 234  * @range: range use to decode device entry value
 235  * @entry: device entry value to get corresponding struct page from
 236  * Returns: struct page pointer if entry is a valid, NULL otherwise
 237  *
 238  * If the device entry is valid (ie valid flag set) then return the struct page
 239  * matching the entry value. Otherwise return NULL.
 240  */
 241 static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
 242                                                     uint64_t entry)
 243 {
 244         if (entry == range->values[HMM_PFN_NONE])
 245                 return NULL;
 246         if (entry == range->values[HMM_PFN_ERROR])
 247                 return NULL;
 248         if (entry == range->values[HMM_PFN_SPECIAL])
 249                 return NULL;
 250         if (!(entry & range->flags[HMM_PFN_VALID]))
 251                 return NULL;
 252         return pfn_to_page(entry >> range->pfn_shift);
 253 }
 254
 255 /*
 256  * hmm_device_entry_to_pfn() - return pfn value store in a device entry
 257  * @range: range use to decode device entry value
 258  * @entry: device entry to extract pfn from
 259  * Returns: pfn value if device entry is valid, -1UL otherwise
 260  */
 261 static inline unsigned long
 262 hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
 263 {
 264         if (pfn == range->values[HMM_PFN_NONE])
 265                 return -1UL;
 266         if (pfn == range->values[HMM_PFN_ERROR])
 267                 return -1UL;
 268         if (pfn == range->values[HMM_PFN_SPECIAL])
 269                 return -1UL;
 270         if (!(pfn & range->flags[HMM_PFN_VALID]))
 271                 return -1UL;
 272         return (pfn >> range->pfn_shift);
 273 }
 274
 275 /*
 276  * hmm_device_entry_from_page() - create a valid device entry for a page
 277  * @range: range use to encode HMM pfn value
 278  * @page: page for which to create the device entry
 279  * Returns: valid device entry for the page
 280  */
 281 static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
 282                                                   struct page *page)
 283 {
 284         return (page_to_pfn(page) << range->pfn_shift) |
 285                 range->flags[HMM_PFN_VALID];
 286 }
 287
 288 /*
 289  * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
 290  * @range: range use to encode HMM pfn value
 291  * @pfn: pfn value for which to create the device entry
 292  * Returns: valid device entry for the pfn
 293  */
 294 static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
 295                                                  unsigned long pfn)
 296 {
 297         return (pfn << range->pfn_shift) |
 298                 range->flags[HMM_PFN_VALID];
 299 }
 300
 301 /*
 302  * Old API:
 303  * hmm_pfn_to_page()
 304  * hmm_pfn_to_pfn()
 305  * hmm_pfn_from_page()
 306  * hmm_pfn_from_pfn()
 307  *
 308  * This is the OLD API, please use the new API. It is kept here to avoid
 309  * painful cross-tree merges, i.e. we convert things to the new API in stages.
 310  */
 311 static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
 312                                            uint64_t pfn)
 313 {
 314         return hmm_device_entry_to_page(range, pfn);
 315 }
 316
 317 static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
 318                                            uint64_t pfn)
 319 {
 320         return hmm_device_entry_to_pfn(range, pfn);
 321 }
 322
 323 static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
 324                                          struct page *page)
 325 {
 326         return hmm_device_entry_from_page(range, page);
 327 }
 328
 329 static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
 330                                         unsigned long pfn)
 331 {
 332         return hmm_device_entry_from_pfn(range, pfn);
 333 }
 334
 335
 336
 337 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 338 /*
 339  * Mirroring: how to synchronize device page table with CPU page table.
 340  *
 341  * A device driver that is participating in HMM mirroring must always
 342  * synchronize with CPU page table updates. For this, device drivers can either
 343  * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
 344  * drivers can decide to register one mirror per device per process, or just
 345  * one mirror per process for a group of devices. The pattern is:
 346  *
 347  *      int device_bind_address_space(..., struct mm_struct *mm, ...)
 348  *      {
 349  *          struct device_address_space *das;
 350  *
 351  *          // Device driver specific initialization, and allocation of das
 352  *          // which contains an hmm_mirror struct as one of its fields.
 353  *          ...
 354  *          das->mirror.ops = &device_mirror_ops;
 355  *          ret = hmm_mirror_register(&das->mirror, mm);
 356  *          if (ret) {
 357  *              // Cleanup on error
 358  *              return ret;
 359  *          }
 360  *
 361  *          // Other device driver specific initialization
 362  *          ...
 363  *      }
 364  *
 365  * Once an hmm_mirror is registered for an address space, the device driver
 366  * will get callbacks through sync_cpu_device_pagetables() operation (see
 367  * hmm_mirror_ops struct).
 368  *
 369  * Device driver must not free the struct containing the hmm_mirror struct
 370  * before calling hmm_mirror_unregister(). The expected usage is to do that when
 371  * the device driver is unbinding from an address space.
 372  *
 373  *
 374  *      void device_unbind_address_space(struct device_address_space *das)
 375  *      {
 376  *          // Device driver specific cleanup
 377  *          ...
 378  *
 379  *          hmm_mirror_unregister(&das->mirror);
 380  *
 381  *          // Other device driver specific cleanup, and now das can be freed
 382  *          ...
 383  *      }
 384  */
 385
 386 struct hmm_mirror;
 387
 388 /*
 389  * enum hmm_update_event - type of update
 390  * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
 391  */
 392 enum hmm_update_event {
 393         HMM_UPDATE_INVALIDATE,
 394 };
 395
 396 /*
 397  * struct hmm_update - HMM update informations for callback
 398  *
 399  * @start: virtual start address of the range to update
 400  * @end: virtual end address of the range to update
 401  * @event: event triggering the update (what is happening)
 402  * @blockable: can the callback block/sleep ?
 403  */
 404 struct hmm_update {
 405         unsigned long start;
 406         unsigned long end;
 407         enum hmm_update_event event;
 408         bool blockable;
 409 };
 410
 411 /*
 412  * struct hmm_mirror_ops - HMM mirror device operations callback
 413  *
 414  * @update: callback to update range on a device
 415  */
 416 struct hmm_mirror_ops {
 417         /* release() - release hmm_mirror
 418          *
 419          * @mirror: pointer to struct hmm_mirror
 420          *
 421          * This is called when the mm_struct is being released.
 422          * The callback should make sure no references to the mirror occur
 423          * after the callback returns.
 424          */
 425         void (*release)(struct hmm_mirror *mirror);
 426
 427         /* sync_cpu_device_pagetables() - synchronize page tables
 428          *
 429          * @mirror: pointer to struct hmm_mirror
 430          * @update: update informations (see struct hmm_update)
 431          * Returns: -EAGAIN if update.blockable false and callback need to
 432          *          block, 0 otherwise.
 433          *
 434          * This callback ultimately originates from mmu_notifiers when the CPU
 435          * page table is updated. The device driver must update its page table
 436          * in response to this callback. The update argument tells what action
 437          * to perform.
 438          *
 439          * The device driver must not return from this callback until the device
 440          * page tables are completely updated (TLBs flushed, etc); this is a
 441          * synchronous call.
 442          */
 443         int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
 444                                           const struct hmm_update *update);
 445 };
 446
 447 /*
 448  * struct hmm_mirror - mirror struct for a device driver
 449  *
 450  * @hmm: pointer to struct hmm (which is unique per mm_struct)
 451  * @ops: device driver callback for HMM mirror operations
 452  * @list: for list of mirrors of a given mm
 453  *
 454  * Each address space (mm_struct) being mirrored by a device must register one
 455  * instance of an hmm_mirror struct with HMM. HMM will track the list of all
 456  * mirrors for each mm_struct.
 457  */
 458 struct hmm_mirror {
 459         struct hmm                      *hmm;
 460         const struct hmm_mirror_ops     *ops;
 461         struct list_head                list;
 462 };
 463
 464 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 465 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 466
 467 /*
 468  * hmm_mirror_mm_is_alive() - test if mm is still alive
 469  * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
 470  * Returns: false if the mm is dead, true otherwise
 471  *
 472  * This is an optimization it will not accurately always return -EINVAL if the
 473  * mm is dead ie there can be false negative (process is being kill but HMM is
 474  * not yet inform of that). It is only intented to be use to optimize out case
 475  * where driver is about to do something time consuming and it would be better
 476  * to skip it if the mm is dead.
 477  */
 478 static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 479 {
 480         struct mm_struct *mm;
 481
 482         if (!mirror || !mirror->hmm)
 483                 return false;
 484         mm = READ_ONCE(mirror->hmm->mm);
 485         if (mirror->hmm->dead || !mm)
 486                 return false;
 487
 488         return true;
 489 }
 490
 491
 492 /*
 493  * Please see Documentation/vm/hmm.rst for how to use the range API.
 494  */
 495 int hmm_range_register(struct hmm_range *range,
 496                        struct mm_struct *mm,
 497                        unsigned long start,
 498                        unsigned long end,
 499                        unsigned page_shift);
 500 void hmm_range_unregister(struct hmm_range *range);
 501 long hmm_range_snapshot(struct hmm_range *range);
 502 long hmm_range_fault(struct hmm_range *range, bool block);
 503 long hmm_range_dma_map(struct hmm_range *range,
 504                        struct device *device,
 505                        dma_addr_t *daddrs,
 506                        bool block);
 507 long hmm_range_dma_unmap(struct hmm_range *range,
 508                          struct vm_area_struct *vma,
 509                          struct device *device,
 510                          dma_addr_t *daddrs,
 511                          bool dirty);
 512
 513 /*
 514  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
 515  *
 516  * When waiting for mmu notifiers we need some kind of timeout, otherwise we
 517  * could potentially wait forever. 1000ms, i.e. 1s, already sounds like a long
 518  * time to wait.
 519  */
 520 #define HMM_RANGE_DEFAULT_TIMEOUT 1000
 521
 522 /* This is a temporary helper to avoid merge conflict between trees. */
 523 static inline bool hmm_vma_range_done(struct hmm_range *range)
 524 {
 525         bool ret = hmm_range_valid(range);
 526
 527         hmm_range_unregister(range);
 528         return ret;
 529 }
 530
 531 /* This is a temporary helper to avoid merge conflict between trees. */
 532 static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 533 {
 534         long ret;
 535
 536         /*
 537          * With the old API the driver must set each individual entries with
 538          * the requested flags (valid, write, ...). So here we set the mask to
 539          * keep intact the entries provided by the driver and zero out the
 540          * default_flags.
 541          */
 542         range->default_flags = 0;
 543         range->pfn_flags_mask = -1UL;
 544
 545         ret = hmm_range_register(range, range->vma->vm_mm,
 546                                  range->start, range->end,
 547                                  PAGE_SHIFT);
 548         if (ret)
 549                 return (int)ret;
 550
 551         if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
 552                 /*
 553                  * The mmap_sem was taken by driver we release it here and
 554                  * returns -EAGAIN which correspond to mmap_sem have been
 555                  * drop in the old API.
 556                  */
 557                 up_read(&range->vma->vm_mm->mmap_sem);
 558                 return -EAGAIN;
 559         }
 560
 561         ret = hmm_range_fault(range, block);
 562         if (ret <= 0) {
 563                 if (ret == -EBUSY || !ret) {
 564                         /* Same as above  drop mmap_sem to match old API. */
 565                         up_read(&range->vma->vm_mm->mmap_sem);
 566                         ret = -EBUSY;
 567                 } else if (ret == -EAGAIN)
 568                         ret = -EBUSY;
 569                 hmm_range_unregister(range);
 570                 return ret;
 571         }
 572         return 0;
 573 }
 574
 575 /* Below are for HMM internal use only! Not to be used by device driver! */
 576 void hmm_mm_destroy(struct mm_struct *mm);
 577
 578 static inline void hmm_mm_init(struct mm_struct *mm)
 579 {
 580         mm->hmm = NULL;
 581 }
 582 #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 583 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 584 static inline void hmm_mm_init(struct mm_struct *mm) {}
 585 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 586
 587 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 588 struct hmm_devmem;
 589
 590 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
 591                                        unsigned long addr);
 592
 593 /*
 594  * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
 595  *
 596  * @free: call when refcount on page reach 1 and thus is no longer use
 597  * @fault: call when there is a page fault to unaddressable memory
 598  *
 599  * Both callback happens from page_free() and page_fault() callback of struct
 600  * dev_pagemap respectively. See include/linux/memremap.h for more details on
 601  * those.
 602  *
 603  * The hmm_devmem_ops callback are just here to provide a coherent and
 604  * uniq API to device driver and device driver should not register their
 605  * own page_free() or page_fault() but rely on the hmm_devmem_ops call-
 606  * back.
 607  */
 608 struct hmm_devmem_ops {
 609         /*
 610          * free() - free a device page
 611          * @devmem: device memory structure (see struct hmm_devmem)
 612          * @page: pointer to struct page being freed
 613          *
 614          * Call back occurs whenever a device page refcount reach 1 which
 615          * means that no one is holding any reference on the page anymore
 616          * (ZONE_DEVICE page have an elevated refcount of 1 as default so
 617          * that they are not release to the general page allocator).
 618          *
 619          * Note that callback has exclusive ownership of the page (as no
 620          * one is holding any reference).
 621          */
 622         void (*free)(struct hmm_devmem *devmem, struct page *page);
 623         /*
 624          * fault() - CPU page fault or get user page (GUP)
 625          * @devmem: device memory structure (see struct hmm_devmem)
 626          * @vma: virtual memory area containing the virtual address
 627          * @addr: virtual address that faulted or for which there is a GUP
 628          * @page: pointer to struct page backing virtual address (unreliable)
 629          * @flags: FAULT_FLAG_* (see include/linux/mm.h)
 630          * @pmdp: page middle directory
 631          * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
 632          *   on error
 633          *
 634          * The callback occurs whenever there is a CPU page fault or GUP on a
 635          * virtual address. This means that the device driver must migrate the
 636          * page back to regular memory (CPU accessible).
 637          *
 638          * The device driver is free to migrate more than one page from the
 639          * fault() callback as an optimization. However if device decide to
 640          * migrate more than one page it must always priotirize the faulting
 641          * address over the others.
 642          *
 643          * The struct page pointer is only given as an hint to allow quick
 644          * lookup of internal device driver data. A concurrent migration
 645          * might have already free that page and the virtual address might
 646          * not longer be back by it. So it should not be modified by the
 647          * callback.
 648          *
 649          * Note that mmap semaphore is held in read mode at least when this
 650          * callback occurs, hence the vma is valid upon callback entry.
 651          */
 652         vm_fault_t (*fault)(struct hmm_devmem *devmem,
 653                      struct vm_area_struct *vma,
 654                      unsigned long addr,
 655                      const struct page *page,
 656                      unsigned int flags,
 657                      pmd_t *pmdp);
 658 };
 659
 660 /*
 661  * struct hmm_devmem - track device memory
 662  *
 663  * @completion: completion object for device memory
 664  * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
 665  * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
 666  * @resource: IO resource reserved for this chunk of memory
 667  * @pagemap: device page map for that chunk
 668  * @device: device to bind resource to
 669  * @ops: memory operations callback
 670  * @ref: per CPU refcount
 671  * @page_fault: callback when CPU fault on an unaddressable device page
 672  *
 673  * This an helper structure for device drivers that do not wish to implement
 674  * the gory details related to hotplugging new memoy and allocating struct
 675  * pages.
 676  *
 677  * Device drivers can directly use ZONE_DEVICE memory on their own if they
 678  * wish to do so.
 679  *
 680  * The page_fault() callback must migrate page back, from device memory to
 681  * system memory, so that the CPU can access it. This might fail for various
 682  * reasons (device issues,  device have been unplugged, ...). When such error
 683  * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
 684  * set the CPU page table entry to "poisoned".
 685  *
 686  * Note that because memory cgroup charges are transferred to the device memory,
 687  * this should never fail due to memory restrictions. However, allocation
 688  * of a regular system page might still fail because we are out of memory. If
 689  * that happens, the page_fault() callback must return VM_FAULT_OOM.
 690  *
 691  * The page_fault() callback can also try to migrate back multiple pages in one
 692  * chunk, as an optimization. It must, however, prioritize the faulting address
 693  * over all the others.
 694  */
 695 typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
 696                                 unsigned long addr,
 697                                 const struct page *page,
 698                                 unsigned int flags,
 699                                 pmd_t *pmdp);
 700
 701 struct hmm_devmem {
 702         struct completion               completion;
 703         unsigned long                   pfn_first;
 704         unsigned long                   pfn_last;
 705         struct resource                 *resource;
 706         struct device                   *device;
 707         struct dev_pagemap              pagemap;
 708         const struct hmm_devmem_ops     *ops;
 709         struct percpu_ref               ref;
 710         dev_page_fault_t                page_fault;
 711 };
 712
 713 /*
 714  * To add (hotplug) device memory, HMM assumes that there is no real resource
 715  * that reserves a range in the physical address space (this is intended to be
 716  * used by unaddressable device memory). It will reserve a physical range big
 717  * enough and allocate struct page for it.
 718  *
 719  * The device driver can wrap the hmm_devmem struct inside a private device
 720  * driver struct.
 721  */
 722 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 723                                   struct device *device,
 724                                   unsigned long size);
 725 struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 726                                            struct device *device,
 727                                            struct resource *res);
 728
 729 /*
 730  * hmm_devmem_page_set_drvdata - set per-page driver data field
 731  *
 732  * @page: pointer to struct page
 733  * @data: driver data value to set
 734  *
 735  * Because page can not be on lru we have an unsigned long that driver can use
 736  * to store a per page field. This just a simple helper to do that.
 737  */
 738 static inline void hmm_devmem_page_set_drvdata(struct page *page,
 739                                                unsigned long data)
 740 {
 741         page->hmm_data = data;
 742 }
 743
 744 /*
 745  * hmm_devmem_page_get_drvdata - get per page driver data field
 746  *
 747  * @page: pointer to struct page
 748  * Return: driver data value
 749  */
 750 static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 751 {
 752         return page->hmm_data;
 753 }
 754
 755
 756 /*
 757  * struct hmm_device - fake device to hang device memory onto
 758  *
 759  * @device: device struct
 760  * @minor: device minor number
 761  */
 762 struct hmm_device {
 763         struct device           device;
 764         unsigned int            minor;
 765 };
 766
 767 /*
 768  * A device driver that wants to handle multiple devices memory through a
 769  * single fake device can use hmm_device to do so. This is purely a helper and
 770  * it is not strictly needed, in order to make use of any HMM functionality.
 771  */
 772 struct hmm_device *hmm_device_new(void *drvdata);
 773 void hmm_device_put(struct hmm_device *hmm_device);
 774 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 775 #else /* IS_ENABLED(CONFIG_HMM) */
 776 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 777 static inline void hmm_mm_init(struct mm_struct *mm) {}
 778 #endif /* IS_ENABLED(CONFIG_HMM) */
 779
 780 #endif /* LINUX_HMM_H */