Merge tag 'leds-for-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/j.anasz...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Sep 2019 01:40:42 +0000 (18:40 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Sep 2019 01:40:42 +0000 (18:40 -0700)
Pull LED updates from Jacek Anaszewski:
 "In this cycle we've finally managed to contribute the patch set
  sorting out LED naming issues. Besides that, there are many changes
  scattered among various LED class drivers and triggers.

  LED naming related improvements:

   - add new 'function' and 'color' fwnode properties and deprecate the
     'label' property, which has frequently been abused to convey
     vendor-specific names that have been available in sysfs anyway

   - introduce a set of standard LED_FUNCTION* definitions

   - introduce a set of standard LED_COLOR_ID* definitions

   - add a new {devm_}led_classdev_register_ext() API capable of
     composing the LED name automatically based on the properties
     available in the passed fwnode; the function is backwards
     compatible in the sense that it uses the 'label' data, if present
     in the fwnode, for creating the LED name (see the sketch after
     this list)

   - add tools/leds/get_led_device_info.sh script for retrieving LED
     vendor, product and bus names, if applicable; it also performs
     basic validation of an LED name

   - update the following drivers and their DT bindings to use the new LED
     registration API:

        - leds-an30259a, leds-gpio, leds-as3645a, leds-aat1290, leds-cr0014114,
          leds-lm3601x, leds-lm3692x, leds-lp8860, leds-lt3593, leds-sc27xx-blt
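
  As an illustration (a minimal sketch; the node name, variable names and
  surrounding probe code are hypothetical), a devicetree node carrying the
  new 'function' and 'color' properties can be paired with the extended
  registration call roughly like this:

      /* DT fragment, using the new macros from dt-bindings/leds/common.h */
      #include <dt-bindings/leds/common.h>

      led-0 {
              function = LED_FUNCTION_STATUS;
              color = <LED_COLOR_ID_GREEN>;
      };

      /* Driver probe path, error handling elided */
      struct led_init_data init_data = { .fwnode = child_fwnode };

      ret = devm_led_classdev_register_ext(dev, &led->cdev, &init_data);
      /*
       * The LED core composes the name from the properties above
       * (e.g. "green:status"), or falls back to the 'label' property
       * if the fwnode provides one.
       */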

  Other LED class improvements:

   - replace {devm_}led_classdev_register() macros with inlines

   - allow led_classdev_unregister() to be called unconditionally

   - switch to using fwnode instead of being stuck with the OF node

  LED triggers improvements:

   - led-triggers:
        - fix dereferencing of null pointer
        - fix a memory leak bug

   - ledtrig-gpio:
        - GPIO 0 is valid

  Drop superseded apu2/3 support from leds-apu, since for apu2+ a newer,
  more complete driver exists, based on a generic driver for the AMD
  SoC's GPIO controller, supporting LEDs as well as other devices:

   - drop profile field from priv data

   - drop iosize field from priv data

   - drop enum_apu_led_platform_types

   - drop superseded apu2/3 LED support

   - add pr_fmt prefix for better log output

   - fix error message on probing failure

  Other misc fixes and improvements to existing LED class drivers:

   - leds-ns2, leds-max77650:
        - add of_node_put() before return

   - leds-pwm, leds-is31fl32xx:
        - use struct_size() helper (see the sketch after this message)

   - leds-lm3697, leds-lm36274, leds-lm3532:
        - switch to use fwnode_property_count_uXX()

   - leds-lm3532:
        - fix brightness control for i2c mode
        - change the define for the fs current register
        - stability fixes for the driver
        - add full scale current configuration
        - add dt property for full scale current
        - avoid potentially unpaired regulator calls
        - move static keyword to the front of declarations
        - fix optional led-max-microamp prop error handling

   - leds-max77650:
        - add of_node_put() before return
        - add MODULE_ALIAS()
        - switch to fwnode property API

   - leds-as3645a:
        - fix misuse of strlcpy

   - leds-netxbig:
        - add of_node_put() in netxbig_leds_get_of_pdata()
        - remove legacy board-file support

   - leds-is31fl319x:
        - simplify getting the adapter of a client

   - leds-ti-lmu-common:
        - fix coccinelle issue
        - move static keyword to the front of declaration

   - leds-syscon:
        - use resource managed variant of device register

   - leds-ktd2692:
        - fix a typo in the name of a constant

   - leds-lp5562:
        - allow firmware files up to the maximum length

   - leds-an30259a:
        - fix typo

   - leds-pca953x:
        - include the right header"
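
For reference, the struct_size() helper mentioned above (from
<linux/overflow.h>) replaces open-coded allocation-size arithmetic with an
overflow-checked form; a minimal sketch of the pattern those drivers adopt
(structure and variable names hypothetical):

    struct foo_led_priv {
            int num_leds;
            struct foo_led leds[];  /* flexible array member */
    };

    /* was: devm_kzalloc(dev, sizeof(*priv) + count * sizeof(*priv->leds), ...) */
    priv = devm_kzalloc(dev, struct_size(priv, leds, count), GFP_KERNEL);
    if (!priv)
            return -ENOMEM;
    priv->num_leds = count;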

* tag 'leds-for-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/j.anaszewski/linux-leds: (72 commits)
  leds: lm3532: Fix optional led-max-microamp prop error handling
  led: triggers: Fix dereferencing of null pointer
  leds: ti-lmu-common: Move static keyword to the front of declaration
  leds: lm3532: Move static keyword to the front of declarations
  leds: trigger: gpio: GPIO 0 is valid
  leds: pwm: Use struct_size() helper
  leds: is31fl32xx: Use struct_size() helper
  leds: ti-lmu-common: Fix coccinelle issue in TI LMU
  leds: lm3532: Avoid potentially unpaired regulator calls
  leds: syscon: Use resource managed variant of device register
  leds: Replace {devm_}led_classdev_register() macros with inlines
  leds: Allow to call led_classdev_unregister() unconditionally
  leds: lm3532: Add full scale current configuration
  dt: lm3532: Add property for full scale current.
  leds: lm3532: Fixes for the driver for stability
  leds: lm3532: Change the define for the fs current register
  leds: lm3532: Fix brightness control for i2c mode
  leds: Switch to use fwnode instead of be stuck with OF one
  leds: max77650: Switch to fwnode property API
  led: triggers: Fix a memory leak bug
  ...

12 files changed:
drivers/base/core.c
drivers/base/platform.c
drivers/hwtracing/stm/core.c
drivers/i2c/i2c-core-acpi.c
drivers/infiniband/hw/hns/hns_roce_hw_v1.c
drivers/iommu/arm-smmu-v3.c
drivers/iommu/arm-smmu.c
drivers/regulator/of_regulator.c
drivers/s390/crypto/zcrypt_api.c
drivers/spi/spi.c
drivers/usb/core/devio.c
include/linux/device.h

diff --combined drivers/base/core.c
index f0dd8e38fee3925cf2448903a1645b2458c0a5d3,e22e29b3dc970ba73dbb7f8f9e423ed6a6910022..832d4eae501e41e3f11533211cc8a6dc5f6ae294
@@@ -68,11 -68,6 +68,11 @@@ void device_links_read_unlock(int idx
  {
        srcu_read_unlock(&device_links_srcu, idx);
  }
 +
 +int device_links_read_lock_held(void)
 +{
 +      return srcu_read_lock_held(&device_links_srcu);
 +}
  #else /* !CONFIG_SRCU */
  static DECLARE_RWSEM(device_links_lock);
  
@@@ -96,13 -91,6 +96,13 @@@ void device_links_read_unlock(int not_u
  {
        up_read(&device_links_lock);
  }
 +
 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
 +int device_links_read_lock_held(void)
 +{
 +      return lockdep_is_held(&device_links_lock);
 +}
 +#endif
  #endif /* !CONFIG_SRCU */
  
  /**
@@@ -1675,9 -1663,6 +1675,9 @@@ void device_initialize(struct device *d
        kobject_init(&dev->kobj, &device_ktype);
        INIT_LIST_HEAD(&dev->dma_pools);
        mutex_init(&dev->mutex);
 +#ifdef CONFIG_PROVE_LOCKING
 +      mutex_init(&dev->lockdep_mutex);
 +#endif
        lockdep_set_novalidate_class(&dev->mutex);
        spin_lock_init(&dev->devres_lock);
        INIT_LIST_HEAD(&dev->devres_head);
@@@ -1835,63 -1820,12 +1835,63 @@@ static inline struct kobject *get_glue_
   */
  static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
  {
 +      unsigned int ref;
 +
        /* see if we live in a "glue" directory */
        if (!live_in_glue_dir(glue_dir, dev))
                return;
  
        mutex_lock(&gdp_mutex);
 -      if (!kobject_has_children(glue_dir))
 +      /**
 +       * There is a race condition between removing glue directory
 +       * and adding a new device under the glue directory.
 +       *
 +       * CPU1:                                         CPU2:
 +       *
 +       * device_add()
 +       *   get_device_parent()
 +       *     class_dir_create_and_add()
 +       *       kobject_add_internal()
 +       *         create_dir()    // create glue_dir
 +       *
 +       *                                               device_add()
 +       *                                                 get_device_parent()
 +       *                                                   kobject_get() // get glue_dir
 +       *
 +       * device_del()
 +       *   cleanup_glue_dir()
 +       *     kobject_del(glue_dir)
 +       *
 +       *                                               kobject_add()
 +       *                                                 kobject_add_internal()
 +       *                                                   create_dir() // in glue_dir
 +       *                                                     sysfs_create_dir_ns()
 +       *                                                       kernfs_create_dir_ns(sd)
 +       *
 +       *       sysfs_remove_dir() // glue_dir->sd=NULL
 +       *       sysfs_put()        // free glue_dir->sd
 +       *
 +       *                                                         // sd is freed
 +       *                                                         kernfs_new_node(sd)
 +       *                                                           kernfs_get(glue_dir)
 +       *                                                           kernfs_add_one()
 +       *                                                           kernfs_put()
 +       *
 +       * Before CPU1 removes the last child device under the glue dir,
 +       * if CPU2 adds a new device under the glue dir, the glue_dir
 +       * kobject reference count is increased to 2 by kobject_get(k), and
 +       * CPU2 has already called kernfs_create_dir_ns(). Meanwhile, CPU1
 +       * calls sysfs_remove_dir() and sysfs_put(), so glue_dir->sd is freed.
 +       *
 +       * Then the CPU2 will see a stale "empty" but still potentially used
 +       * glue dir around in kernfs_new_node().
 +       *
 +       * In order to avoid this happening, we also should make sure that
 +       * kernfs_node for glue_dir is released in CPU1 only when refcount
 +       * for glue_dir kobj is 1.
 +       */
 +      ref = kref_read(&glue_dir->kref);
 +      if (!kobject_has_children(glue_dir) && !--ref)
                kobject_del(glue_dir);
        kobject_put(glue_dir);
        mutex_unlock(&gdp_mutex);
@@@ -2277,24 -2211,6 +2277,24 @@@ void put_device(struct device *dev
  }
  EXPORT_SYMBOL_GPL(put_device);
  
 +bool kill_device(struct device *dev)
 +{
 +      /*
 +       * Require the device lock and set the "dead" flag to guarantee that
 +       * the update behavior is consistent with the other bitfields near
 +       * it and that we cannot have an asynchronous probe routine trying
 +       * to run while we are tearing out the bus/class/sysfs from
 +       * underneath the device.
 +       */
 +      lockdep_assert_held(&dev->mutex);
 +
 +      if (dev->p->dead)
 +              return false;
 +      dev->p->dead = true;
 +      return true;
 +}
 +EXPORT_SYMBOL_GPL(kill_device);
 +
  /**
   * device_del - delete device from system.
   * @dev: device.
@@@ -2314,8 -2230,15 +2314,8 @@@ void device_del(struct device *dev
        struct kobject *glue_dir = NULL;
        struct class_interface *class_intf;
  
 -      /*
 -       * Hold the device lock and set the "dead" flag to guarantee that
 -       * the update behavior is consistent with the other bitfields near
 -       * it and that we cannot have an asynchronous probe routine trying
 -       * to run while we are tearing out the bus/class/sysfs from
 -       * underneath the device.
 -       */
        device_lock(dev);
 -      dev->p->dead = true;
 +      kill_device(dev);
        device_unlock(dev);
  
        /* Notify clients of device removal.  This call must come
@@@ -2944,13 -2867,6 +2944,6 @@@ struct device *device_create_with_group
  }
  EXPORT_SYMBOL_GPL(device_create_with_groups);
  
- static int __match_devt(struct device *dev, const void *data)
- {
-       const dev_t *devt = data;
-       return dev->devt == *devt;
- }
  /**
   * device_destroy - removes a device that was created with device_create()
   * @class: pointer to the struct class that this device was registered with
@@@ -2963,7 -2879,7 +2956,7 @@@ void device_destroy(struct class *class
  {
        struct device *dev;
  
-       dev = class_find_device(class, NULL, &devt, __match_devt);
+       dev = class_find_device_by_devt(class, devt);
        if (dev) {
                put_device(dev);
                device_unregister(dev);
@@@ -3434,8 -3350,38 +3427,38 @@@ void device_set_of_node_from_dev(struc
  }
  EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
  
+ int device_match_name(struct device *dev, const void *name)
+ {
+       return sysfs_streq(dev_name(dev), name);
+ }
+ EXPORT_SYMBOL_GPL(device_match_name);
  int device_match_of_node(struct device *dev, const void *np)
  {
        return dev->of_node == np;
  }
  EXPORT_SYMBOL_GPL(device_match_of_node);
+ int device_match_fwnode(struct device *dev, const void *fwnode)
+ {
+       return dev_fwnode(dev) == fwnode;
+ }
+ EXPORT_SYMBOL_GPL(device_match_fwnode);
+ int device_match_devt(struct device *dev, const void *pdevt)
+ {
+       return dev->devt == *(dev_t *)pdevt;
+ }
+ EXPORT_SYMBOL_GPL(device_match_devt);
+ int device_match_acpi_dev(struct device *dev, const void *adev)
+ {
+       return ACPI_COMPANION(dev) == adev;
+ }
+ EXPORT_SYMBOL(device_match_acpi_dev);
+ int device_match_any(struct device *dev, const void *unused)
+ {
+       return 1;
+ }
+ EXPORT_SYMBOL_GPL(device_match_any);
diff --combined drivers/base/platform.c
index ec974ba9c0c4ca93a3b1cfcaf541c6e4a4a3bd9c,a174ce5ea17ccbd9df1ad66da3cd5082ebb2fc3c..eb018378d60a7780e369aa94bb3cc37b7e480cac
@@@ -157,13 -157,8 +157,13 @@@ int platform_get_irq(struct platform_de
         * the device will only expose one IRQ, and this fallback
         * allows a common code path across either kind of resource.
         */
 -      if (num == 0 && has_acpi_companion(&dev->dev))
 -              return acpi_dev_gpio_irq_get(ACPI_COMPANION(&dev->dev), num);
 +      if (num == 0 && has_acpi_companion(&dev->dev)) {
 +              int ret = acpi_dev_gpio_irq_get(ACPI_COMPANION(&dev->dev), num);
 +
 +              /* Our callers expect -ENXIO for missing IRQs. */
 +              if (ret >= 0 || ret == -EPROBE_DEFER)
 +                      return ret;
 +      }
  
        return -ENXIO;
  #endif
@@@ -1202,6 -1197,20 +1202,20 @@@ struct bus_type platform_bus_type = 
  };
  EXPORT_SYMBOL_GPL(platform_bus_type);
  
+ /**
+  * platform_find_device_by_driver - Find a platform device with a given
+  * driver.
+  * @start: The device to start the search from.
+  * @drv: The device driver to look for.
+  */
+ struct device *platform_find_device_by_driver(struct device *start,
+                                             const struct device_driver *drv)
+ {
+       return bus_find_device(&platform_bus_type, start, drv,
+                              (void *)platform_match);
+ }
+ EXPORT_SYMBOL_GPL(platform_find_device_by_driver);
  int __init platform_bus_init(void)
  {
        int error;
diff --combined drivers/hwtracing/stm/core.c
index 181e7ff1ec4fc374e31968e3a89c0fc9592a6d73,2b6bd42632e8e306a616e132eb4de2cad48cfc96..603b83ac50852a81b99f5f28fc9a26df8c2a5cb8
@@@ -89,13 -89,6 +89,6 @@@ static struct class stm_class = 
        .dev_groups     = stm_groups,
  };
  
- static int stm_dev_match(struct device *dev, const void *data)
- {
-       const char *name = data;
-       return sysfs_streq(name, dev_name(dev));
- }
  /**
   * stm_find_device() - find stm device by name
   * @buf:      character buffer containing the name
@@@ -116,7 -109,7 +109,7 @@@ struct stm_device *stm_find_device(cons
        if (!stm_core_up)
                return NULL;
  
-       dev = class_find_device(&stm_class, NULL, buf, stm_dev_match);
+       dev = class_find_device_by_name(&stm_class, buf);
        if (!dev)
                return NULL;
  
@@@ -1276,6 -1269,7 +1269,6 @@@ int stm_source_register_device(struct d
  
  err:
        put_device(&src->dev);
 -      kfree(src);
  
        return err;
  }
diff --combined drivers/i2c/i2c-core-acpi.c
index 4dbbc9a35f6561fb918dbd663b5bd2e4cd64eee2,bc80aafb521faa89af5a15c6362c74f7deff30d2..bb6b39fe343a80f0b3ae072e17696c5b25f382a5
@@@ -344,27 -344,21 +344,10 @@@ u32 i2c_acpi_find_bus_speed(struct devi
  }
  EXPORT_SYMBOL_GPL(i2c_acpi_find_bus_speed);
  
--static int i2c_acpi_find_match_adapter(struct device *dev, const void *data)
--{
--      struct i2c_adapter *adapter = i2c_verify_adapter(dev);
--
--      if (!adapter)
--              return 0;
--
--      return ACPI_HANDLE(dev) == (acpi_handle)data;
--}
--
- static int i2c_acpi_find_match_device(struct device *dev, const void *data)
- {
-       return ACPI_COMPANION(dev) == data;
- }
--
  struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
  {
-       struct device *dev;
+       struct device *dev = bus_find_device_by_acpi_dev(&i2c_bus_type, handle);
  
-       dev = bus_find_device(&i2c_bus_type, NULL, handle,
-                             i2c_acpi_find_match_adapter);
        return dev ? i2c_verify_adapter(dev) : NULL;
  }
  EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle);
@@@ -373,8 -367,7 +356,7 @@@ static struct i2c_client *i2c_acpi_find
  {
        struct device *dev;
  
-       dev = bus_find_device(&i2c_bus_type, NULL, adev,
-                             i2c_acpi_find_match_device);
+       dev = bus_find_device_by_acpi_dev(&i2c_bus_type, adev);
        return dev ? i2c_verify_client(dev) : NULL;
  }
  
diff --combined drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index c07e387a07a38b88a02d175e6ddd889e13ef00bf,fa05e943038ae29122be8a182843409e33bd884a..141205e763144901912a8146b0b35db13a378e91
@@@ -750,10 -750,8 +750,10 @@@ static int hns_roce_v1_rsv_lp_qp(struc
        atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
  
        pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
 -      if (!pd)
 +      if (!pd) {
 +              ret = -ENOMEM;
                goto alloc_mem_failed;
 +      }
  
        pd->device  = ibdev;
        ret = hns_roce_alloc_pd(pd, NULL);
@@@ -4501,19 -4499,13 +4501,13 @@@ static const struct acpi_device_id hns_
  };
  MODULE_DEVICE_TABLE(acpi, hns_roce_acpi_match);
  
- static int hns_roce_node_match(struct device *dev, const void *fwnode)
- {
-       return dev->fwnode == fwnode;
- }
  static struct
  platform_device *hns_roce_find_pdev(struct fwnode_handle *fwnode)
  {
        struct device *dev;
  
        /* get the 'device' corresponding to the matching 'fwnode' */
-       dev = bus_find_device(&platform_bus_type, NULL,
-                             fwnode, hns_roce_node_match);
+       dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode);
        /* get the platform device */
        return dev ? to_platform_device(dev) : NULL;
  }
diff --combined drivers/iommu/arm-smmu-v3.c
index 4aa414843557651673c8c1f1f971d944a37f2647,6f0e13fa5e1a59a462ccb3b986f17382974730e8..8da93e730d6fd9f7eb6f1d9a2e8072232c7e3754
  #define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1
  #define ARM_SMMU_MEMATTR_OIWB         0xf
  
 -#define Q_IDX(q, p)                   ((p) & ((1 << (q)->max_n_shift) - 1))
 -#define Q_WRP(q, p)                   ((p) & (1 << (q)->max_n_shift))
 -#define Q_OVERFLOW_FLAG                       (1 << 31)
 -#define Q_OVF(q, p)                   ((p) & Q_OVERFLOW_FLAG)
 +#define Q_IDX(llq, p)                 ((p) & ((1 << (llq)->max_n_shift) - 1))
 +#define Q_WRP(llq, p)                 ((p) & (1 << (llq)->max_n_shift))
 +#define Q_OVERFLOW_FLAG                       (1U << 31)
 +#define Q_OVF(p)                      ((p) & Q_OVERFLOW_FLAG)
  #define Q_ENT(q, p)                   ((q)->base +                    \
 -                                       Q_IDX(q, p) * (q)->ent_dwords)
 +                                       Q_IDX(&((q)->llq), p) *        \
 +                                       (q)->ent_dwords)
  
  #define Q_BASE_RWA                    (1UL << 62)
  #define Q_BASE_ADDR_MASK              GENMASK_ULL(51, 5)
  #define CMDQ_ERR_CERROR_ABT_IDX               2
  #define CMDQ_ERR_CERROR_ATC_INV_IDX   3
  
 +#define CMDQ_PROD_OWNED_FLAG          Q_OVERFLOW_FLAG
 +
 +/*
 + * This is used to size the command queue and therefore must be at least
 + * BITS_PER_LONG so that the valid_map works correctly (it relies on the
 + * total number of queue entries being a multiple of BITS_PER_LONG).
 + */
 +#define CMDQ_BATCH_ENTRIES            BITS_PER_LONG
 +
  #define CMDQ_0_OP                     GENMASK_ULL(7, 0)
  #define CMDQ_0_SSV                    (1UL << 11)
  
  #define PRIQ_1_ADDR_MASK              GENMASK_ULL(63, 12)
  
  /* High-level queue structures */
 -#define ARM_SMMU_POLL_TIMEOUT_US      100
 -#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
 -#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
 +#define ARM_SMMU_POLL_TIMEOUT_US      1000000 /* 1s! */
 +#define ARM_SMMU_POLL_SPIN_COUNT      10
  
  #define MSI_IOVA_BASE                 0x8000000
  #define MSI_IOVA_LENGTH                       0x100000
@@@ -481,29 -472,13 +481,29 @@@ struct arm_smmu_cmdq_ent 
  
                #define CMDQ_OP_CMD_SYNC        0x46
                struct {
 -                      u32                     msidata;
                        u64                     msiaddr;
                } sync;
        };
  };
  
 +struct arm_smmu_ll_queue {
 +      union {
 +              u64                     val;
 +              struct {
 +                      u32             prod;
 +                      u32             cons;
 +              };
 +              struct {
 +                      atomic_t        prod;
 +                      atomic_t        cons;
 +              } atomic;
 +              u8                      __pad[SMP_CACHE_BYTES];
 +      } ____cacheline_aligned_in_smp;
 +      u32                             max_n_shift;
 +};
 +
  struct arm_smmu_queue {
 +      struct arm_smmu_ll_queue        llq;
        int                             irq; /* Wired interrupt */
  
        __le64                          *base;
        u64                             q_base;
  
        size_t                          ent_dwords;
 -      u32                             max_n_shift;
 -      u32                             prod;
 -      u32                             cons;
  
        u32 __iomem                     *prod_reg;
        u32 __iomem                     *cons_reg;
  };
  
 +struct arm_smmu_queue_poll {
 +      ktime_t                         timeout;
 +      unsigned int                    delay;
 +      unsigned int                    spin_cnt;
 +      bool                            wfe;
 +};
 +
  struct arm_smmu_cmdq {
        struct arm_smmu_queue           q;
 -      spinlock_t                      lock;
 +      atomic_long_t                   *valid_map;
 +      atomic_t                        owner_prod;
 +      atomic_t                        lock;
  };
  
  struct arm_smmu_evtq {
@@@ -607,6 -576,8 +607,6 @@@ struct arm_smmu_device 
  
        int                             gerr_irq;
        int                             combined_irq;
 -      u32                             sync_nr;
 -      u8                              prev_cmd_opcode;
  
        unsigned long                   ias; /* IPA */
        unsigned long                   oas; /* PA */
  
        struct arm_smmu_strtab_cfg      strtab_cfg;
  
 -      /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
 -      union {
 -              u32                     sync_count;
 -              u64                     padding;
 -      };
 -
        /* IOMMU core code handle */
        struct iommu_device             iommu;
  };
@@@ -637,7 -614,7 +637,7 @@@ struct arm_smmu_master 
        struct list_head                domain_head;
        u32                             *sids;
        unsigned int                    num_sids;
 -      bool                            ats_enabled             :1;
 +      bool                            ats_enabled;
  };
  
  /* SMMU private data for an IOMMU domain */
@@@ -654,7 -631,6 +654,7 @@@ struct arm_smmu_domain 
  
        struct io_pgtable_ops           *pgtbl_ops;
        bool                            non_strict;
 +      atomic_t                        nr_ats_masters;
  
        enum arm_smmu_domain_stage      stage;
        union {
@@@ -709,97 -685,85 +709,97 @@@ static void parse_driver_options(struc
  }
  
  /* Low-level queue manipulation functions */
 -static bool queue_full(struct arm_smmu_queue *q)
 +static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
 +{
 +      u32 space, prod, cons;
 +
 +      prod = Q_IDX(q, q->prod);
 +      cons = Q_IDX(q, q->cons);
 +
 +      if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
 +              space = (1 << q->max_n_shift) - (prod - cons);
 +      else
 +              space = cons - prod;
 +
 +      return space >= n;
 +}
 +
 +static bool queue_full(struct arm_smmu_ll_queue *q)
  {
        return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
               Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
  }
  
 -static bool queue_empty(struct arm_smmu_queue *q)
 +static bool queue_empty(struct arm_smmu_ll_queue *q)
  {
        return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
               Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
  }
  
 -static void queue_sync_cons(struct arm_smmu_queue *q)
 +static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
  {
 -      q->cons = readl_relaxed(q->cons_reg);
 +      return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
 +              (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
 +             ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
 +              (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
  }
  
 -static void queue_inc_cons(struct arm_smmu_queue *q)
 +static void queue_sync_cons_out(struct arm_smmu_queue *q)
  {
 -      u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
 -
 -      q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
 -
        /*
         * Ensure that all CPU accesses (reads and writes) to the queue
         * are complete before we update the cons pointer.
         */
        mb();
 -      writel_relaxed(q->cons, q->cons_reg);
 +      writel_relaxed(q->llq.cons, q->cons_reg);
 +}
 +
 +static void queue_inc_cons(struct arm_smmu_ll_queue *q)
 +{
 +      u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
 +      q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
  }
  
 -static int queue_sync_prod(struct arm_smmu_queue *q)
 +static int queue_sync_prod_in(struct arm_smmu_queue *q)
  {
        int ret = 0;
        u32 prod = readl_relaxed(q->prod_reg);
  
 -      if (Q_OVF(q, prod) != Q_OVF(q, q->prod))
 +      if (Q_OVF(prod) != Q_OVF(q->llq.prod))
                ret = -EOVERFLOW;
  
 -      q->prod = prod;
 +      q->llq.prod = prod;
        return ret;
  }
  
 -static void queue_inc_prod(struct arm_smmu_queue *q)
 +static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
  {
 -      u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
 -
 -      q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
 -      writel(q->prod, q->prod_reg);
 +      u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
 +      return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
  }
  
 -/*
 - * Wait for the SMMU to consume items. If sync is true, wait until the queue
 - * is empty. Otherwise, wait until there is at least one free slot.
 - */
 -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
 +static void queue_poll_init(struct arm_smmu_device *smmu,
 +                          struct arm_smmu_queue_poll *qp)
  {
 -      ktime_t timeout;
 -      unsigned int delay = 1, spin_cnt = 0;
 -
 -      /* Wait longer if it's a CMD_SYNC */
 -      timeout = ktime_add_us(ktime_get(), sync ?
 -                                          ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
 -                                          ARM_SMMU_POLL_TIMEOUT_US);
 +      qp->delay = 1;
 +      qp->spin_cnt = 0;
 +      qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 +      qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
 +}
  
 -      while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
 -              if (ktime_compare(ktime_get(), timeout) > 0)
 -                      return -ETIMEDOUT;
 +static int queue_poll(struct arm_smmu_queue_poll *qp)
 +{
 +      if (ktime_compare(ktime_get(), qp->timeout) > 0)
 +              return -ETIMEDOUT;
  
 -              if (wfe) {
 -                      wfe();
 -              } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
 -                      cpu_relax();
 -                      continue;
 -              } else {
 -                      udelay(delay);
 -                      delay *= 2;
 -                      spin_cnt = 0;
 -              }
 +      if (qp->wfe) {
 +              wfe();
 +      } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
 +              cpu_relax();
 +      } else {
 +              udelay(qp->delay);
 +              qp->delay *= 2;
 +              qp->spin_cnt = 0;
        }
  
        return 0;
@@@ -813,6 -777,16 +813,6 @@@ static void queue_write(__le64 *dst, u6
                *dst++ = cpu_to_le64(*src++);
  }
  
 -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
 -{
 -      if (queue_full(q))
 -              return -ENOSPC;
 -
 -      queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
 -      queue_inc_prod(q);
 -      return 0;
 -}
 -
  static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
  {
        int i;
  
  static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
  {
 -      if (queue_empty(q))
 +      if (queue_empty(&q->llq))
                return -EAGAIN;
  
 -      queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords);
 -      queue_inc_cons(q);
 +      queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
 +      queue_inc_cons(&q->llq);
 +      queue_sync_cons_out(q);
        return 0;
  }
  
@@@ -895,14 -868,20 +895,14 @@@ static int arm_smmu_cmdq_build_cmd(u64 
                cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
                break;
        case CMDQ_OP_CMD_SYNC:
 -              if (ent->sync.msiaddr)
 +              if (ent->sync.msiaddr) {
                        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
 -              else
 +                      cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
 +              } else {
                        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
 +              }
                cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
                cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
 -              /*
 -               * Commands are written little-endian, but we want the SMMU to
 -               * receive MSIData, and thus write it back to memory, in CPU
 -               * byte order, so big-endian needs an extra byteswap here.
 -               */
 -              cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
 -                                   cpu_to_le32(ent->sync.msidata));
 -              cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
                break;
        default:
                return -ENOENT;
        return 0;
  }
  
 +static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
 +                                       u32 prod)
 +{
 +      struct arm_smmu_queue *q = &smmu->cmdq.q;
 +      struct arm_smmu_cmdq_ent ent = {
 +              .opcode = CMDQ_OP_CMD_SYNC,
 +      };
 +
 +      /*
 +       * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
 +       * payload, so the write will zero the entire command on that platform.
 +       */
 +      if (smmu->features & ARM_SMMU_FEAT_MSI &&
 +          smmu->features & ARM_SMMU_FEAT_COHERENCY) {
 +              ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
 +                                 q->ent_dwords * 8;
 +      }
 +
 +      arm_smmu_cmdq_build_cmd(cmd, &ent);
 +}
 +
  static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
  {
        static const char *cerror_str[] = {
        queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
  }
  
 -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
 +/*
 + * Command queue locking.
 + * This is a form of bastardised rwlock with the following major changes:
 + *
 + * - The only LOCK routines are exclusive_trylock() and shared_lock().
 + *   Neither have barrier semantics, and instead provide only a control
 + *   dependency.
 + *
 + * - The UNLOCK routines are supplemented with shared_tryunlock(), which
 + *   fails if the caller appears to be the last lock holder (yes, this is
 + *   racy). All successful UNLOCK routines have RELEASE semantics.
 + */
 +static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
  {
 -      struct arm_smmu_queue *q = &smmu->cmdq.q;
 -      bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 +      int val;
 +
 +      /*
 +       * We can try to avoid the cmpxchg() loop by simply incrementing the
 +       * lock counter. When held in exclusive state, the lock counter is set
 +       * to INT_MIN so these increments won't hurt as the value will remain
 +       * negative.
 +       */
 +      if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
 +              return;
 +
 +      do {
 +              val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
 +      } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
 +}
 +
 +static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
 +{
 +      (void)atomic_dec_return_release(&cmdq->lock);
 +}
  
 -      smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
 +static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
 +{
 +      if (atomic_read(&cmdq->lock) == 1)
 +              return false;
  
 -      while (queue_insert_raw(q, cmd) == -ENOSPC) {
 -              if (queue_poll_cons(q, false, wfe))
 -                      dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 +      arm_smmu_cmdq_shared_unlock(cmdq);
 +      return true;
 +}
 +
 +#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)          \
 +({                                                                    \
 +      bool __ret;                                                     \
 +      local_irq_save(flags);                                          \
 +      __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN);       \
 +      if (!__ret)                                                     \
 +              local_irq_restore(flags);                               \
 +      __ret;                                                          \
 +})
 +
 +#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)                \
 +({                                                                    \
 +      atomic_set_release(&cmdq->lock, 0);                             \
 +      local_irq_restore(flags);                                       \
 +})
 +
 +
 +/*
 + * Command queue insertion.
 + * This is made fiddly by our attempts to achieve some sort of scalability
 + * since there is one queue shared amongst all of the CPUs in the system.  If
 + * you like mixed-size concurrency, dependency ordering and relaxed atomics,
 + * then you'll *love* this monstrosity.
 + *
 + * The basic idea is to split the queue up into ranges of commands that are
 + * owned by a given CPU; the owner may not have written all of the commands
 + * itself, but is responsible for advancing the hardware prod pointer when
 + * the time comes. The algorithm is roughly:
 + *
 + *    1. Allocate some space in the queue. At this point we also discover
 + *       whether the head of the queue is currently owned by another CPU,
 + *       or whether we are the owner.
 + *
 + *    2. Write our commands into our allocated slots in the queue.
 + *
 + *    3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
 + *
 + *    4. If we are an owner:
 + *            a. Wait for the previous owner to finish.
 + *            b. Mark the queue head as unowned, which tells us the range
 + *               that we are responsible for publishing.
 + *            c. Wait for all commands in our owned range to become valid.
 + *            d. Advance the hardware prod pointer.
 + *            e. Tell the next owner we've finished.
 + *
 + *    5. If we are inserting a CMD_SYNC (we may or may not have been an
 + *       owner), then we need to stick around until it has completed:
 + *            a. If we have MSIs, the SMMU can write back into the CMD_SYNC
 + *               to clear the first 4 bytes.
 + *            b. Otherwise, we spin waiting for the hardware cons pointer to
 + *               advance past our command.
 + *
 + * The devil is in the details, particularly the use of locking for handling
 + * SYNC completion and freeing up space in the queue before we think that it is
 + * full.
 + */
 +static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
 +                                             u32 sprod, u32 eprod, bool set)
 +{
 +      u32 swidx, sbidx, ewidx, ebidx;
 +      struct arm_smmu_ll_queue llq = {
 +              .max_n_shift    = cmdq->q.llq.max_n_shift,
 +              .prod           = sprod,
 +      };
 +
 +      ewidx = BIT_WORD(Q_IDX(&llq, eprod));
 +      ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
 +
 +      while (llq.prod != eprod) {
 +              unsigned long mask;
 +              atomic_long_t *ptr;
 +              u32 limit = BITS_PER_LONG;
 +
 +              swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
 +              sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
 +
 +              ptr = &cmdq->valid_map[swidx];
 +
 +              if ((swidx == ewidx) && (sbidx < ebidx))
 +                      limit = ebidx;
 +
 +              mask = GENMASK(limit - 1, sbidx);
 +
 +              /*
 +               * The valid bit is the inverse of the wrap bit. This means
 +               * that a zero-initialised queue is invalid and, after marking
 +               * all entries as valid, they become invalid again when we
 +               * wrap.
 +               */
 +              if (set) {
 +                      atomic_long_xor(mask, ptr);
 +              } else { /* Poll */
 +                      unsigned long valid;
 +
 +                      valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
 +                      atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
 +              }
 +
 +              llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
        }
  }
  
 -static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 -                                  struct arm_smmu_cmdq_ent *ent)
 +/* Mark all entries in the range [sprod, eprod) as valid */
 +static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
 +                                      u32 sprod, u32 eprod)
 +{
 +      __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
 +}
 +
 +/* Wait for all entries in the range [sprod, eprod) to become valid */
 +static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
 +                                       u32 sprod, u32 eprod)
 +{
 +      __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
 +}
 +
 +/* Wait for the command queue to become non-full */
 +static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
 +                                           struct arm_smmu_ll_queue *llq)
  {
 -      u64 cmd[CMDQ_ENT_DWORDS];
        unsigned long flags;
 +      struct arm_smmu_queue_poll qp;
 +      struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 +      int ret = 0;
  
 -      if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
 -              dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
 -                       ent->opcode);
 -              return;
 +      /*
 +       * Try to update our copy of cons by grabbing exclusive cmdq access. If
 +       * that fails, spin until somebody else updates it for us.
 +       */
 +      if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
 +              WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
 +              arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
 +              llq->val = READ_ONCE(cmdq->q.llq.val);
 +              return 0;
        }
  
 -      spin_lock_irqsave(&smmu->cmdq.lock, flags);
 -      arm_smmu_cmdq_insert_cmd(smmu, cmd);
 -      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 +      queue_poll_init(smmu, &qp);
 +      do {
 +              llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
 +              if (!queue_full(llq))
 +                      break;
 +
 +              ret = queue_poll(&qp);
 +      } while (!ret);
 +
 +      return ret;
  }
  
  /*
 - * The difference between val and sync_idx is bounded by the maximum size of
 - * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
 + * Wait until the SMMU signals a CMD_SYNC completion MSI.
 + * Must be called with the cmdq lock held in some capacity.
   */
 -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
 +static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
 +                                        struct arm_smmu_ll_queue *llq)
  {
 -      ktime_t timeout;
 -      u32 val;
 +      int ret = 0;
 +      struct arm_smmu_queue_poll qp;
 +      struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 +      u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
  
 -      timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
 -      val = smp_cond_load_acquire(&smmu->sync_count,
 -                                  (int)(VAL - sync_idx) >= 0 ||
 -                                  !ktime_before(ktime_get(), timeout));
 +      queue_poll_init(smmu, &qp);
  
 -      return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
 +      /*
 +       * The MSI won't generate an event, since it's being written back
 +       * into the command queue.
 +       */
 +      qp.wfe = false;
 +      smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
 +      llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
 +      return ret;
  }
  
 -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
 +/*
 + * Wait until the SMMU cons index passes llq->prod.
 + * Must be called with the cmdq lock held in some capacity.
 + */
 +static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 +                                             struct arm_smmu_ll_queue *llq)
  {
 -      u64 cmd[CMDQ_ENT_DWORDS];
 -      unsigned long flags;
 -      struct arm_smmu_cmdq_ent ent = {
 -              .opcode = CMDQ_OP_CMD_SYNC,
 -              .sync   = {
 -                      .msiaddr = virt_to_phys(&smmu->sync_count),
 -              },
 -      };
 +      struct arm_smmu_queue_poll qp;
 +      struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 +      u32 prod = llq->prod;
 +      int ret = 0;
 +
 +      queue_poll_init(smmu, &qp);
 +      llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
 +      do {
 +              if (queue_consumed(llq, prod))
 +                      break;
  
 -      spin_lock_irqsave(&smmu->cmdq.lock, flags);
 +              ret = queue_poll(&qp);
  
 -      /* Piggy-back on the previous command if it's a SYNC */
 -      if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
 -              ent.sync.msidata = smmu->sync_nr;
 -      } else {
 -              ent.sync.msidata = ++smmu->sync_nr;
 -              arm_smmu_cmdq_build_cmd(cmd, &ent);
 -              arm_smmu_cmdq_insert_cmd(smmu, cmd);
 -      }
 +              /*
 +               * This needs to be a readl() so that our subsequent call
 +               * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
 +               *
 +               * Specifically, we need to ensure that we observe all
 +               * shared_lock()s by other CMD_SYNCs that share our owner,
 +               * so that a failing call to tryunlock() means that we're
 +               * the last one out and therefore we can safely advance
 +               * cmdq->q.llq.cons. Roughly speaking:
 +               *
 +               * CPU 0                CPU1                    CPU2 (us)
 +               *
 +               * if (sync)
 +               *      shared_lock();
 +               *
 +               * dma_wmb();
 +               * set_valid_map();
 +               *
 +               *                      if (owner) {
 +               *                              poll_valid_map();
 +               *                              <control dependency>
 +               *                              writel(prod_reg);
 +               *
 +               *                                              readl(cons_reg);
 +               *                                              tryunlock();
 +               *
 +               * Requires us to see CPU 0's shared_lock() acquisition.
 +               */
 +              llq->cons = readl(cmdq->q.cons_reg);
 +      } while (!ret);
  
 -      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 +      return ret;
 +}
 +
 +static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
 +                                       struct arm_smmu_ll_queue *llq)
 +{
 +      if (smmu->features & ARM_SMMU_FEAT_MSI &&
 +          smmu->features & ARM_SMMU_FEAT_COHERENCY)
 +              return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
  
 -      return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
 +      return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
  }
  
 -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 +static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
 +                                      u32 prod, int n)
  {
 -      u64 cmd[CMDQ_ENT_DWORDS];
 +      int i;
 +      struct arm_smmu_ll_queue llq = {
 +              .max_n_shift    = cmdq->q.llq.max_n_shift,
 +              .prod           = prod,
 +      };
 +
 +      for (i = 0; i < n; ++i) {
 +              u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
 +
 +              prod = queue_inc_prod_n(&llq, i);
 +              queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
 +      }
 +}
 +
 +/*
 + * This is the actual insertion function, and provides the following
 + * ordering guarantees to callers:
 + *
 + * - There is a dma_wmb() before publishing any commands to the queue.
 + *   This can be relied upon to order prior writes to data structures
 + *   in memory (such as a CD or an STE) before the command.
 + *
 + * - On completion of a CMD_SYNC, there is a control dependency.
 + *   This can be relied upon to order subsequent writes to memory (e.g.
 + *   freeing an IOVA) after completion of the CMD_SYNC.
 + *
 + * - Command insertion is totally ordered, so if two CPUs each race to
 + *   insert their own list of commands then all of the commands from one
 + *   CPU will appear before any of the commands from the other CPU.
 + */
 +static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 +                                     u64 *cmds, int n, bool sync)
 +{
 +      u64 cmd_sync[CMDQ_ENT_DWORDS];
 +      u32 prod;
        unsigned long flags;
 -      bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 -      struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
 -      int ret;
 +      bool owner;
 +      struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 +      struct arm_smmu_ll_queue llq = {
 +              .max_n_shift = cmdq->q.llq.max_n_shift,
 +      }, head = llq;
 +      int ret = 0;
  
 -      arm_smmu_cmdq_build_cmd(cmd, &ent);
 +      /* 1. Allocate some space in the queue */
 +      local_irq_save(flags);
 +      llq.val = READ_ONCE(cmdq->q.llq.val);
 +      do {
 +              u64 old;
 +
 +              while (!queue_has_space(&llq, n + sync)) {
 +                      local_irq_restore(flags);
 +                      if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 +                              dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 +                      local_irq_save(flags);
 +              }
 +
 +              head.cons = llq.cons;
 +              head.prod = queue_inc_prod_n(&llq, n + sync) |
 +                                           CMDQ_PROD_OWNED_FLAG;
  
 -      spin_lock_irqsave(&smmu->cmdq.lock, flags);
 -      arm_smmu_cmdq_insert_cmd(smmu, cmd);
 -      ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
 -      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 +              old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
 +              if (old == llq.val)
 +                      break;
 +
 +              llq.val = old;
 +      } while (1);
 +      owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 +      head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 +      llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
 +
 +      /*
 +       * 2. Write our commands into the queue
 +       * Dependency ordering from the cmpxchg() loop above.
 +       */
 +      arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
 +      if (sync) {
 +              prod = queue_inc_prod_n(&llq, n);
 +              arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
 +              queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
 +
 +              /*
 +               * In order to determine completion of our CMD_SYNC, we must
 +               * ensure that the queue can't wrap twice without us noticing.
 +               * We achieve that by taking the cmdq lock as shared before
 +               * marking our slot as valid.
 +               */
 +              arm_smmu_cmdq_shared_lock(cmdq);
 +      }
 +
 +      /* 3. Mark our slots as valid, ensuring commands are visible first */
 +      dma_wmb();
 +      arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
 +
 +      /* 4. If we are the owner, take control of the SMMU hardware */
 +      if (owner) {
 +              /* a. Wait for previous owner to finish */
 +              atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
 +
 +              /* b. Stop gathering work by clearing the owned flag */
 +              prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
 +                                                 &cmdq->q.llq.atomic.prod);
 +              prod &= ~CMDQ_PROD_OWNED_FLAG;
 +
 +              /*
 +               * c. Wait for any gathered work to be written to the queue.
 +               * Note that we read our own entries so that we have the control
 +               * dependency required by (d).
 +               */
 +              arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
 +
 +              /*
 +               * d. Advance the hardware prod pointer
 +               * Control dependency ordering from the entries becoming valid.
 +               */
 +              writel_relaxed(prod, cmdq->q.prod_reg);
 +
 +              /*
 +               * e. Tell the next owner we're done
 +               * Make sure we've updated the hardware first, so that we don't
 +               * race to update prod and potentially move it backwards.
 +               */
 +              atomic_set_release(&cmdq->owner_prod, prod);
 +      }
 +
 +      /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
 +      if (sync) {
 +              llq.prod = queue_inc_prod_n(&llq, n);
 +              ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
 +              if (ret) {
 +                      dev_err_ratelimited(smmu->dev,
 +                                          "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
 +                                          llq.prod,
 +                                          readl_relaxed(cmdq->q.prod_reg),
 +                                          readl_relaxed(cmdq->q.cons_reg));
 +              }
 +
 +              /*
 +               * Try to unlock the cmdq lock. This will fail if we're the last
 +               * reader, in which case we can safely update cmdq->q.llq.cons
 +               */
 +              if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
 +                      WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
 +                      arm_smmu_cmdq_shared_unlock(cmdq);
 +              }
 +      }
  
 +      local_irq_restore(flags);
        return ret;
  }
  
 -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 +                                 struct arm_smmu_cmdq_ent *ent)
  {
 -      int ret;
 -      bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
 -                 (smmu->features & ARM_SMMU_FEAT_COHERENCY);
 +      u64 cmd[CMDQ_ENT_DWORDS];
  
 -      ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
 -                : __arm_smmu_cmdq_issue_sync(smmu);
 -      if (ret)
 -              dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
 -      return ret;
 +      if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
 +              dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
 +                       ent->opcode);
 +              return -EINVAL;
 +      }
 +
 +      return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
 +}
 +
 +static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 +{
 +      return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
  }
  
  /* Context descriptor manipulation functions */
@@@ -1575,8 -1186,8 +1575,8 @@@ static void arm_smmu_write_strtab_ent(s
                        ste_live = true;
                        break;
                case STRTAB_STE_0_CFG_ABORT:
 -                      if (disable_bypass)
 -                              break;
 +                      BUG_ON(!disable_bypass);
 +                      break;
                default:
                        BUG(); /* STE corruption */
                }
@@@ -1694,7 -1305,6 +1694,7 @@@ static irqreturn_t arm_smmu_evtq_thread
        int i;
        struct arm_smmu_device *smmu = dev;
        struct arm_smmu_queue *q = &smmu->evtq.q;
 +      struct arm_smmu_ll_queue *llq = &q->llq;
        u64 evt[EVTQ_ENT_DWORDS];
  
        do {
                 * Not much we can do on overflow, so scream and pretend we're
                 * trying harder.
                 */
 -              if (queue_sync_prod(q) == -EOVERFLOW)
 +              if (queue_sync_prod_in(q) == -EOVERFLOW)
                        dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
 -      } while (!queue_empty(q));
 +      } while (!queue_empty(llq));
  
        /* Sync our overflow flag, as we believe we're up to speed */
 -      q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
 +      llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
 +                  Q_IDX(llq, llq->cons);
        return IRQ_HANDLED;
  }
  
@@@ -1764,21 -1373,19 +1764,21 @@@ static irqreturn_t arm_smmu_priq_thread
  {
        struct arm_smmu_device *smmu = dev;
        struct arm_smmu_queue *q = &smmu->priq.q;
 +      struct arm_smmu_ll_queue *llq = &q->llq;
        u64 evt[PRIQ_ENT_DWORDS];
  
        do {
                while (!queue_remove_raw(q, evt))
                        arm_smmu_handle_ppr(smmu, evt);
  
 -              if (queue_sync_prod(q) == -EOVERFLOW)
 +              if (queue_sync_prod_in(q) == -EOVERFLOW)
                        dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
 -      } while (!queue_empty(q));
 +      } while (!queue_empty(llq));
  
        /* Sync our overflow flag, as we believe we're up to speed */
 -      q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
 -      writel(q->cons, q->cons_reg);
 +      llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
 +                    Q_IDX(llq, llq->cons);
 +      queue_sync_cons_out(q);
        return IRQ_HANDLED;
  }
  
@@@ -1927,23 -1534,6 +1927,23 @@@ static int arm_smmu_atc_inv_domain(stru
        if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
                return 0;
  
 +      /*
 +       * Ensure that we've completed prior invalidation of the main TLBs
 +       * before we read 'nr_ats_masters' in case of a concurrent call to
 +       * arm_smmu_enable_ats():
 +       *
 +       *      // unmap()                      // arm_smmu_enable_ats()
 +       *      TLBI+SYNC                       atomic_inc(&nr_ats_masters);
 +       *      smp_mb();                       [...]
 +       *      atomic_read(&nr_ats_masters);   pci_enable_ats() // writel()
 +       *
 +       * Ensures that we always see the incremented 'nr_ats_masters' count if
 +       * ATS was enabled at the PCI device before completion of the TLBI.
 +       */
 +      smp_mb();
 +      if (!atomic_read(&smmu_domain->nr_ats_masters))
 +              return 0;
 +
        arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
  
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
  }
  
  /* IO_PGTABLE API */
 -static void arm_smmu_tlb_sync(void *cookie)
 -{
 -      struct arm_smmu_domain *smmu_domain = cookie;
 -
 -      arm_smmu_cmdq_issue_sync(smmu_domain->smmu);
 -}
 -
  static void arm_smmu_tlb_inv_context(void *cookie)
  {
        struct arm_smmu_domain *smmu_domain = cookie;
        /*
         * NOTE: when io-pgtable is in non-strict mode, we may get here with
         * PTEs previously cleared by unmaps on the current CPU not yet visible
 -       * to the SMMU. We are relying on the DSB implicit in queue_inc_prod()
 -       * to guarantee those are observed before the TLBI. Do be careful, 007.
 +       * to the SMMU. We are relying on the dma_wmb() implicit during cmd
 +       * insertion to guarantee those are observed before the TLBI. Do be
 +       * careful, 007.
         */
        arm_smmu_cmdq_issue_cmd(smmu, &cmd);
        arm_smmu_cmdq_issue_sync(smmu);
 +      arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
  }
  
 -static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 -                                        size_t granule, bool leaf, void *cookie)
 +static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
 +                                 size_t granule, bool leaf,
 +                                 struct arm_smmu_domain *smmu_domain)
  {
 -      struct arm_smmu_domain *smmu_domain = cookie;
 +      u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
        struct arm_smmu_device *smmu = smmu_domain->smmu;
 +      unsigned long start = iova, end = iova + size;
 +      int i = 0;
        struct arm_smmu_cmdq_ent cmd = {
                .tlbi = {
                        .leaf   = leaf,
 -                      .addr   = iova,
                },
        };
  
 +      if (!size)
 +              return;
 +
        if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
                cmd.opcode      = CMDQ_OP_TLBI_NH_VA;
                cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
                cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
        }
  
 -      do {
 -              arm_smmu_cmdq_issue_cmd(smmu, &cmd);
 -              cmd.tlbi.addr += granule;
 -      } while (size -= granule);
 +      while (iova < end) {
 +              if (i == CMDQ_BATCH_ENTRIES) {
 +                      arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
 +                      i = 0;
 +              }
 +
 +              cmd.tlbi.addr = iova;
 +              arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
 +              iova += granule;
 +              i++;
 +      }
 +
 +      arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true);
 +
 +      /*
 +       * Unfortunately, this can't be leaf-only since we may have
 +       * zapped an entire table.
 +       */
 +      arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
 +}
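
[Editor's note] The range-invalidation loop above builds one TLBI command per granule and submits them in batches of CMDQ_BATCH_ENTRIES, with only the final batch carrying a sync. Below is a stand-alone user-space sketch of that batching pattern under simplified assumptions (one 64-bit value per command, a printf instead of queue insertion); it is not the driver implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH_ENTRIES 64        /* stand-in for CMDQ_BATCH_ENTRIES */

/* Stand-in for issuing a command list: print instead of touching hardware. */
static void issue_cmdlist(const uint64_t *cmds, int n, bool sync)
{
	printf("issue %d TLBI command(s)%s\n", n, sync ? " + CMD_SYNC" : "");
}

/* Walk [iova, iova + size) in 'granule' steps, batching one command per step. */
static void tlb_inv_range(uint64_t iova, uint64_t size, uint64_t granule)
{
	uint64_t cmds[BATCH_ENTRIES];
	uint64_t end = iova + size;
	int i = 0;

	if (!size)
		return;

	while (iova < end) {
		if (i == BATCH_ENTRIES) {
			issue_cmdlist(cmds, i, false);  /* full batch, no sync yet */
			i = 0;
		}
		cmds[i++] = iova;       /* a real command occupies several dwords */
		iova += granule;
	}
	issue_cmdlist(cmds, i, true);   /* final batch carries the sync */
}

int main(void)
{
	tlb_inv_range(0x100000, 0x42000, 0x1000);   /* 66 pages -> 64 + 2 */
	return 0;
}
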
 +
 +static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
 +                                       unsigned long iova, size_t granule,
 +                                       void *cookie)
 +{
 +      struct arm_smmu_domain *smmu_domain = cookie;
 +      struct iommu_domain *domain = &smmu_domain->domain;
 +
 +      iommu_iotlb_gather_add_page(domain, gather, iova, granule);
  }
  
 -static const struct iommu_gather_ops arm_smmu_gather_ops = {
 +static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
 +                                size_t granule, void *cookie)
 +{
 +      arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
 +}
 +
 +static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
 +                                size_t granule, void *cookie)
 +{
 +      arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
 +}
 +
 +static const struct iommu_flush_ops arm_smmu_flush_ops = {
        .tlb_flush_all  = arm_smmu_tlb_inv_context,
 -      .tlb_add_flush  = arm_smmu_tlb_inv_range_nosync,
 -      .tlb_sync       = arm_smmu_tlb_sync,
 +      .tlb_flush_walk = arm_smmu_tlb_inv_walk,
 +      .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
 +      .tlb_add_page   = arm_smmu_tlb_inv_page_nosync,
  };
  
  /* IOMMU API */
@@@ -2244,7 -1796,7 +2244,7 @@@ static int arm_smmu_domain_finalise(str
                .ias            = ias,
                .oas            = oas,
                .coherent_walk  = smmu->features & ARM_SMMU_FEAT_COHERENCY,
 -              .tlb            = &arm_smmu_gather_ops,
 +              .tlb            = &arm_smmu_flush_ops,
                .iommu_dev      = smmu->dev,
        };
  
@@@ -2311,65 -1863,44 +2311,65 @@@ static void arm_smmu_install_ste_for_de
        }
  }
  
 -static int arm_smmu_enable_ats(struct arm_smmu_master *master)
 +#ifdef CONFIG_PCI_ATS
 +static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
  {
 -      int ret;
 -      size_t stu;
        struct pci_dev *pdev;
        struct arm_smmu_device *smmu = master->smmu;
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);
  
        if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) ||
            !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled())
 -              return -ENXIO;
 +              return false;
  
        pdev = to_pci_dev(master->dev);
 -      if (pdev->untrusted)
 -              return -EPERM;
 +      return !pdev->untrusted && pdev->ats_cap;
 +}
 +#else
 +static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
 +{
 +      return false;
 +}
 +#endif
 +
 +static void arm_smmu_enable_ats(struct arm_smmu_master *master)
 +{
 +      size_t stu;
 +      struct pci_dev *pdev;
 +      struct arm_smmu_device *smmu = master->smmu;
 +      struct arm_smmu_domain *smmu_domain = master->domain;
 +
 +      /* Don't enable ATS at the endpoint if it's not enabled in the STE */
 +      if (!master->ats_enabled)
 +              return;
  
        /* Smallest Translation Unit: log2 of the smallest supported granule */
        stu = __ffs(smmu->pgsize_bitmap);
 +      pdev = to_pci_dev(master->dev);
  
 -      ret = pci_enable_ats(pdev, stu);
 -      if (ret)
 -              return ret;
 -
 -      master->ats_enabled = true;
 -      return 0;
 +      atomic_inc(&smmu_domain->nr_ats_masters);
 +      arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 +      if (pci_enable_ats(pdev, stu))
 +              dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
  }
  
  static void arm_smmu_disable_ats(struct arm_smmu_master *master)
  {
        struct arm_smmu_cmdq_ent cmd;
 +      struct arm_smmu_domain *smmu_domain = master->domain;
  
 -      if (!master->ats_enabled || !dev_is_pci(master->dev))
 +      if (!master->ats_enabled)
                return;
  
 +      pci_disable_ats(to_pci_dev(master->dev));
 +      /*
 +       * Ensure ATS is disabled at the endpoint before we issue the
 +       * ATC invalidation via the SMMU.
 +       */
 +      wmb();
        arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
        arm_smmu_atc_inv_master(master, &cmd);
 -      pci_disable_ats(to_pci_dev(master->dev));
 -      master->ats_enabled = false;
 +      atomic_dec(&smmu_domain->nr_ats_masters);
  }
  
  static void arm_smmu_detach_dev(struct arm_smmu_master *master)
        if (!smmu_domain)
                return;
  
 +      arm_smmu_disable_ats(master);
 +
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
        list_del(&master->domain_head);
        spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
  
        master->domain = NULL;
 +      master->ats_enabled = false;
        arm_smmu_install_ste_for_dev(master);
 -
 -      arm_smmu_disable_ats(master);
  }
  
  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
  
        master->domain = smmu_domain;
  
 -      spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 -      list_add(&master->domain_head, &smmu_domain->devices);
 -      spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 -
        if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
 -              arm_smmu_enable_ats(master);
 +              master->ats_enabled = arm_smmu_ats_supported(master);
  
        if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
                arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg);
  
        arm_smmu_install_ste_for_dev(master);
 +
 +      spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 +      list_add(&master->domain_head, &smmu_domain->devices);
 +      spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 +
 +      arm_smmu_enable_ats(master);
 +
  out_unlock:
        mutex_unlock(&smmu_domain->init_mutex);
        return ret;
@@@ -2458,16 -1985,21 +2458,16 @@@ static int arm_smmu_map(struct iommu_do
        return ops->map(ops, iova, paddr, size, prot);
  }
  
 -static size_t
 -arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 +static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 +                           size_t size, struct iommu_iotlb_gather *gather)
  {
 -      int ret;
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
        struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
  
        if (!ops)
                return 0;
  
 -      ret = ops->unmap(ops, iova, size);
 -      if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size))
 -              return 0;
 -
 -      return ret;
 +      return ops->unmap(ops, iova, size, gather);
  }
  
  static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
                arm_smmu_tlb_inv_context(smmu_domain);
  }
  
 -static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
 +static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
 +                              struct iommu_iotlb_gather *gather)
  {
 -      struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
 +      struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
  
 -      if (smmu)
 -              arm_smmu_cmdq_issue_sync(smmu);
 +      arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
 +                             gather->pgsize, true, smmu_domain);
  }
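
[Editor's note] The new unmap path no longer invalidates per call; instead the core passes an iommu_iotlb_gather and the driver flushes the accumulated range in iotlb_sync(). Here is a small user-space model of that gather-then-sync idea, with an invented 'struct gather' that only tracks a single contiguous range; the real API also handles discontiguous ranges.

#include <stdint.h>
#include <stdio.h>

/* A minimal model of the gather state: one pending IOVA range and its granule. */
struct gather {
	uint64_t start;
	uint64_t end;
	uint64_t pgsize;
};

static void gather_init(struct gather *g)
{
	g->start = UINT64_MAX;
	g->end = 0;
	g->pgsize = 0;
}

/* Models adding an unmapped page: widen the pending range. */
static void gather_add_page(struct gather *g, uint64_t iova, uint64_t pgsize)
{
	if (iova < g->start)
		g->start = iova;
	if (iova + pgsize > g->end)
		g->end = iova + pgsize;
	g->pgsize = pgsize;
}

/* Models the sync: one leaf invalidation covering the whole gathered range. */
static void iotlb_sync(struct gather *g)
{
	if (g->end <= g->start)
		return;
	printf("invalidate [0x%llx, 0x%llx) at granule 0x%llx\n",
	       (unsigned long long)g->start, (unsigned long long)g->end,
	       (unsigned long long)g->pgsize);
	gather_init(g);
}

int main(void)
{
	struct gather g;

	gather_init(&g);
	gather_add_page(&g, 0x2000, 0x1000);
	gather_add_page(&g, 0x3000, 0x1000);
	iotlb_sync(&g);         /* one range invalidation instead of one per page */
	return 0;
}
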
  
  static phys_addr_t
@@@ -2503,16 -2034,11 +2503,11 @@@ arm_smmu_iova_to_phys(struct iommu_doma
  
  static struct platform_driver arm_smmu_driver;
  
- static int arm_smmu_match_node(struct device *dev, const void *data)
- {
-       return dev->fwnode == data;
- }
  static
  struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
  {
-       struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL,
-                                               fwnode, arm_smmu_match_node);
+       struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
+                                                         fwnode);
        put_device(dev);
        return dev ? dev_get_drvdata(dev) : NULL;
  }
@@@ -2755,13 -2281,13 +2750,13 @@@ static int arm_smmu_init_one_queue(stru
        size_t qsz;
  
        do {
 -              qsz = ((1 << q->max_n_shift) * dwords) << 3;
 +              qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
                q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
                                              GFP_KERNEL);
                if (q->base || qsz < PAGE_SIZE)
                        break;
  
 -              q->max_n_shift--;
 +              q->llq.max_n_shift--;
        } while (1);
  
        if (!q->base) {
  
        if (!WARN_ON(q->base_dma & (qsz - 1))) {
                dev_info(smmu->dev, "allocated %u entries for %s\n",
 -                       1 << q->max_n_shift, name);
 +                       1 << q->llq.max_n_shift, name);
        }
  
        q->prod_reg     = arm_smmu_page1_fixup(prod_off, smmu);
  
        q->q_base  = Q_BASE_RWA;
        q->q_base |= q->base_dma & Q_BASE_ADDR_MASK;
 -      q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift);
 +      q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
  
 -      q->prod = q->cons = 0;
 +      q->llq.prod = q->llq.cons = 0;
        return 0;
  }
  
 +static void arm_smmu_cmdq_free_bitmap(void *data)
 +{
 +      unsigned long *bitmap = data;
 +      bitmap_free(bitmap);
 +}
 +
 +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 +{
 +      int ret = 0;
 +      struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 +      unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
 +      atomic_long_t *bitmap;
 +
 +      atomic_set(&cmdq->owner_prod, 0);
 +      atomic_set(&cmdq->lock, 0);
 +
 +      bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 +      if (!bitmap) {
 +              dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
 +              ret = -ENOMEM;
 +      } else {
 +              cmdq->valid_map = bitmap;
 +              devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
 +      }
 +
 +      return ret;
 +}
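
[Editor's note] arm_smmu_cmdq_init() above sizes a "valid" bitmap to one bit per command-queue slot (1 << max_n_shift entries) and arranges for it to be freed automatically. The sketch below models only the sizing arithmetic in user space; calloc() stands in for bitmap_zalloc() and the cleanup-action registration is reduced to a plain free().

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* One bit of state per command-queue slot, as in the init function above. */
struct cmdq_model {
	unsigned int max_n_shift;
	unsigned long *valid_map;
};

static int cmdq_model_init(struct cmdq_model *q, unsigned int max_n_shift)
{
	unsigned int nents = 1u << max_n_shift;
	unsigned int bits_per_long = sizeof(long) * CHAR_BIT;
	unsigned int nlongs = (nents + bits_per_long - 1) / bits_per_long;

	q->max_n_shift = max_n_shift;
	q->valid_map = calloc(nlongs, sizeof(long));    /* zeroed, like bitmap_zalloc() */
	return q->valid_map ? 0 : -1;
}

int main(void)
{
	struct cmdq_model q;

	if (cmdq_model_init(&q, 10))
		return 1;
	printf("allocated a %u-entry valid bitmap\n", 1u << q.max_n_shift);
	free(q.valid_map);      /* the driver defers this to a devm action instead */
	return 0;
}
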
 +
  static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
  {
        int ret;
  
        /* cmdq */
 -      spin_lock_init(&smmu->cmdq.lock);
        ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
                                      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
                                      "cmdq");
        if (ret)
                return ret;
  
 +      ret = arm_smmu_cmdq_init(smmu);
 +      if (ret)
 +              return ret;
 +
        /* evtq */
        ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
                                      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@@ -3208,8 -2703,8 +3203,8 @@@ static int arm_smmu_device_reset(struc
  
        /* Command queue */
        writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
 -      writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
 -      writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
 +      writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
 +      writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
  
        enables = CR0_CMDQEN;
        ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
  
        /* Event queue */
        writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
 -      writel_relaxed(smmu->evtq.q.prod,
 +      writel_relaxed(smmu->evtq.q.llq.prod,
                       arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu));
 -      writel_relaxed(smmu->evtq.q.cons,
 +      writel_relaxed(smmu->evtq.q.llq.cons,
                       arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu));
  
        enables |= CR0_EVTQEN;
        if (smmu->features & ARM_SMMU_FEAT_PRI) {
                writeq_relaxed(smmu->priq.q.q_base,
                               smmu->base + ARM_SMMU_PRIQ_BASE);
 -              writel_relaxed(smmu->priq.q.prod,
 +              writel_relaxed(smmu->priq.q.llq.prod,
                               arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu));
 -              writel_relaxed(smmu->priq.q.cons,
 +              writel_relaxed(smmu->priq.q.llq.cons,
                               arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu));
  
                enables |= CR0_PRIQEN;
@@@ -3409,24 -2904,18 +3404,24 @@@ static int arm_smmu_device_hw_probe(str
        }
  
        /* Queue sizes, capped to ensure natural alignment */
 -      smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
 -                                       FIELD_GET(IDR1_CMDQS, reg));
 -      if (!smmu->cmdq.q.max_n_shift) {
 -              /* Odd alignment restrictions on the base, so ignore for now */
 -              dev_err(smmu->dev, "unit-length command queue not supported\n");
 +      smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
 +                                           FIELD_GET(IDR1_CMDQS, reg));
 +      if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
 +              /*
 +               * We don't support splitting up batches, so one batch of
 +               * commands plus an extra sync needs to fit inside the command
 +               * queue. There's also no way we can handle the weird alignment
 +               * restrictions on the base pointer for a unit-length queue.
 +               */
 +              dev_err(smmu->dev, "command queue size <= %d entries not supported\n",
 +                      CMDQ_BATCH_ENTRIES);
                return -ENXIO;
        }
  
 -      smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
 -                                       FIELD_GET(IDR1_EVTQS, reg));
 -      smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
 -                                       FIELD_GET(IDR1_PRIQS, reg));
 +      smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
 +                                           FIELD_GET(IDR1_EVTQS, reg));
 +      smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
 +                                           FIELD_GET(IDR1_PRIQS, reg));
  
        /* SID/SSID sizes */
        smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg);
diff --combined drivers/iommu/arm-smmu.c
index 5b93c79371e98327ca20c9e7fb7a48db42f77a79,aa06498f291d4d8ebbed4cbbe193eef481208ea5..c3ef0cc8f7648b73339bed9bc4452ff131a7d01a
  
  #include <linux/acpi.h>
  #include <linux/acpi_iort.h>
 -#include <linux/atomic.h>
 +#include <linux/bitfield.h>
  #include <linux/delay.h>
  #include <linux/dma-iommu.h>
  #include <linux/dma-mapping.h>
  #include <linux/err.h>
  #include <linux/interrupt.h>
  #include <linux/io.h>
 -#include <linux/io-64-nonatomic-hi-lo.h>
 -#include <linux/io-pgtable.h>
 -#include <linux/iommu.h>
  #include <linux/iopoll.h>
  #include <linux/init.h>
  #include <linux/moduleparam.h>
  #include <linux/platform_device.h>
  #include <linux/pm_runtime.h>
  #include <linux/slab.h>
 -#include <linux/spinlock.h>
  
  #include <linux/amba/bus.h>
  #include <linux/fsl/mc.h>
  
 -#include "arm-smmu-regs.h"
 +#include "arm-smmu.h"
  
  /*
   * Apparently, some Qualcomm arm64 platforms which appear to expose their SMMU
   */
  #define QCOM_DUMMY_VAL -1
  
 -#define ARM_MMU500_ACTLR_CPRE         (1 << 1)
 -
 -#define ARM_MMU500_ACR_CACHE_LOCK     (1 << 26)
 -#define ARM_MMU500_ACR_S2CRB_TLBEN    (1 << 10)
 -#define ARM_MMU500_ACR_SMTNMB_TLBEN   (1 << 8)
 -
  #define TLB_LOOP_TIMEOUT              1000000 /* 1s! */
  #define TLB_SPIN_COUNT                        10
  
 -/* Maximum number of context banks per SMMU */
 -#define ARM_SMMU_MAX_CBS              128
 -
 -/* SMMU global address space */
 -#define ARM_SMMU_GR0(smmu)            ((smmu)->base)
 -#define ARM_SMMU_GR1(smmu)            ((smmu)->base + (1 << (smmu)->pgshift))
 -
 -/*
 - * SMMU global address space with conditional offset to access secure
 - * aliases of non-secure registers (e.g. nsCR0: 0x400, nsGFSR: 0x448,
 - * nsGFSYNR0: 0x450)
 - */
 -#define ARM_SMMU_GR0_NS(smmu)                                         \
 -      ((smmu)->base +                                                 \
 -              ((smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS)       \
 -                      ? 0x400 : 0))
 -
 -/*
 - * Some 64-bit registers only make sense to write atomically, but in such
 - * cases all the data relevant to AArch32 formats lies within the lower word,
 - * therefore this actually makes more sense than it might first appear.
 - */
 -#ifdef CONFIG_64BIT
 -#define smmu_write_atomic_lq          writeq_relaxed
 -#else
 -#define smmu_write_atomic_lq          writel_relaxed
 -#endif
 -
 -/* Translation context bank */
 -#define ARM_SMMU_CB(smmu, n)  ((smmu)->cb_base + ((n) << (smmu)->pgshift))
 -
  #define MSI_IOVA_BASE                 0x8000000
  #define MSI_IOVA_LENGTH                       0x100000
  
@@@ -72,6 -113,19 +72,6 @@@ module_param(disable_bypass, bool, S_IR
  MODULE_PARM_DESC(disable_bypass,
        "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
  
 -enum arm_smmu_arch_version {
 -      ARM_SMMU_V1,
 -      ARM_SMMU_V1_64K,
 -      ARM_SMMU_V2,
 -};
 -
 -enum arm_smmu_implementation {
 -      GENERIC_SMMU,
 -      ARM_MMU500,
 -      CAVIUM_SMMUV2,
 -      QCOM_SMMUV2,
 -};
 -
  struct arm_smmu_s2cr {
        struct iommu_group              *group;
        int                             count;
@@@ -109,8 -163,117 +109,8 @@@ struct arm_smmu_master_cfg 
  #define for_each_cfg_sme(fw, i, idx) \
        for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
  
 -struct arm_smmu_device {
 -      struct device                   *dev;
 -
 -      void __iomem                    *base;
 -      void __iomem                    *cb_base;
 -      unsigned long                   pgshift;
 -
 -#define ARM_SMMU_FEAT_COHERENT_WALK   (1 << 0)
 -#define ARM_SMMU_FEAT_STREAM_MATCH    (1 << 1)
 -#define ARM_SMMU_FEAT_TRANS_S1                (1 << 2)
 -#define ARM_SMMU_FEAT_TRANS_S2                (1 << 3)
 -#define ARM_SMMU_FEAT_TRANS_NESTED    (1 << 4)
 -#define ARM_SMMU_FEAT_TRANS_OPS               (1 << 5)
 -#define ARM_SMMU_FEAT_VMID16          (1 << 6)
 -#define ARM_SMMU_FEAT_FMT_AARCH64_4K  (1 << 7)
 -#define ARM_SMMU_FEAT_FMT_AARCH64_16K (1 << 8)
 -#define ARM_SMMU_FEAT_FMT_AARCH64_64K (1 << 9)
 -#define ARM_SMMU_FEAT_FMT_AARCH32_L   (1 << 10)
 -#define ARM_SMMU_FEAT_FMT_AARCH32_S   (1 << 11)
 -#define ARM_SMMU_FEAT_EXIDS           (1 << 12)
 -      u32                             features;
 -
 -#define ARM_SMMU_OPT_SECURE_CFG_ACCESS (1 << 0)
 -      u32                             options;
 -      enum arm_smmu_arch_version      version;
 -      enum arm_smmu_implementation    model;
 -
 -      u32                             num_context_banks;
 -      u32                             num_s2_context_banks;
 -      DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
 -      struct arm_smmu_cb              *cbs;
 -      atomic_t                        irptndx;
 -
 -      u32                             num_mapping_groups;
 -      u16                             streamid_mask;
 -      u16                             smr_mask_mask;
 -      struct arm_smmu_smr             *smrs;
 -      struct arm_smmu_s2cr            *s2crs;
 -      struct mutex                    stream_map_mutex;
 -
 -      unsigned long                   va_size;
 -      unsigned long                   ipa_size;
 -      unsigned long                   pa_size;
 -      unsigned long                   pgsize_bitmap;
 -
 -      u32                             num_global_irqs;
 -      u32                             num_context_irqs;
 -      unsigned int                    *irqs;
 -      struct clk_bulk_data            *clks;
 -      int                             num_clks;
 -
 -      u32                             cavium_id_base; /* Specific to Cavium */
 -
 -      spinlock_t                      global_sync_lock;
 -
 -      /* IOMMU core code handle */
 -      struct iommu_device             iommu;
 -};
 -
 -enum arm_smmu_context_fmt {
 -      ARM_SMMU_CTX_FMT_NONE,
 -      ARM_SMMU_CTX_FMT_AARCH64,
 -      ARM_SMMU_CTX_FMT_AARCH32_L,
 -      ARM_SMMU_CTX_FMT_AARCH32_S,
 -};
 -
 -struct arm_smmu_cfg {
 -      u8                              cbndx;
 -      u8                              irptndx;
 -      union {
 -              u16                     asid;
 -              u16                     vmid;
 -      };
 -      u32                             cbar;
 -      enum arm_smmu_context_fmt       fmt;
 -};
 -#define INVALID_IRPTNDX                       0xff
 -
 -enum arm_smmu_domain_stage {
 -      ARM_SMMU_DOMAIN_S1 = 0,
 -      ARM_SMMU_DOMAIN_S2,
 -      ARM_SMMU_DOMAIN_NESTED,
 -      ARM_SMMU_DOMAIN_BYPASS,
 -};
 -
 -struct arm_smmu_domain {
 -      struct arm_smmu_device          *smmu;
 -      struct io_pgtable_ops           *pgtbl_ops;
 -      const struct iommu_gather_ops   *tlb_ops;
 -      struct arm_smmu_cfg             cfg;
 -      enum arm_smmu_domain_stage      stage;
 -      bool                            non_strict;
 -      struct mutex                    init_mutex; /* Protects smmu pointer */
 -      spinlock_t                      cb_lock; /* Serialises ATS1* ops and TLB syncs */
 -      struct iommu_domain             domain;
 -};
 -
 -struct arm_smmu_option_prop {
 -      u32 opt;
 -      const char *prop;
 -};
 -
 -static atomic_t cavium_smmu_context_count = ATOMIC_INIT(0);
 -
  static bool using_legacy_binding, using_generic_binding;
  
 -static struct arm_smmu_option_prop arm_smmu_options[] = {
 -      { ARM_SMMU_OPT_SECURE_CFG_ACCESS, "calxeda,smmu-secure-config-access" },
 -      { 0, NULL},
 -};
 -
  static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu)
  {
        if (pm_runtime_enabled(smmu->dev))
@@@ -130,6 -293,20 +130,6 @@@ static struct arm_smmu_domain *to_smmu_
        return container_of(dom, struct arm_smmu_domain, domain);
  }
  
 -static void parse_driver_options(struct arm_smmu_device *smmu)
 -{
 -      int i = 0;
 -
 -      do {
 -              if (of_property_read_bool(smmu->dev->of_node,
 -                                              arm_smmu_options[i].prop)) {
 -                      smmu->options |= arm_smmu_options[i].opt;
 -                      dev_notice(smmu->dev, "option %s\n",
 -                              arm_smmu_options[i].prop);
 -              }
 -      } while (arm_smmu_options[++i].opt);
 -}
 -
  static struct device_node *dev_get_dev_node(struct device *dev)
  {
        if (dev_is_pci(dev)) {
@@@ -238,17 -415,15 +238,17 @@@ static void __arm_smmu_free_bitmap(unsi
  }
  
  /* Wait for any pending TLB invalidations to complete */
 -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
 -                              void __iomem *sync, void __iomem *status)
 +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
 +                              int sync, int status)
  {
        unsigned int spin_cnt, delay;
 +      u32 reg;
  
 -      writel_relaxed(QCOM_DUMMY_VAL, sync);
 +      arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
        for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
                for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
 -                      if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
 +                      reg = arm_smmu_readl(smmu, page, status);
 +                      if (!(reg & sTLBGSTATUS_GSACTIVE))
                                return;
                        cpu_relax();
                }
  
  static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
  {
 -      void __iomem *base = ARM_SMMU_GR0(smmu);
        unsigned long flags;
  
        spin_lock_irqsave(&smmu->global_sync_lock, flags);
 -      __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC,
 -                          base + ARM_SMMU_GR0_sTLBGSTATUS);
 +      __arm_smmu_tlb_sync(smmu, ARM_SMMU_GR0, ARM_SMMU_GR0_sTLBGSYNC,
 +                          ARM_SMMU_GR0_sTLBGSTATUS);
        spin_unlock_irqrestore(&smmu->global_sync_lock, flags);
  }
  
@@@ -272,11 -448,12 +272,11 @@@ static void arm_smmu_tlb_sync_context(v
  {
        struct arm_smmu_domain *smmu_domain = cookie;
        struct arm_smmu_device *smmu = smmu_domain->smmu;
 -      void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
        unsigned long flags;
  
        spin_lock_irqsave(&smmu_domain->cb_lock, flags);
 -      __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC,
 -                          base + ARM_SMMU_CB_TLBSTATUS);
 +      __arm_smmu_tlb_sync(smmu, ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx),
 +                          ARM_SMMU_CB_TLBSYNC, ARM_SMMU_CB_TLBSTATUS);
        spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
  }
  
@@@ -290,13 -467,14 +290,13 @@@ static void arm_smmu_tlb_sync_vmid(voi
  static void arm_smmu_tlb_inv_context_s1(void *cookie)
  {
        struct arm_smmu_domain *smmu_domain = cookie;
 -      struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 -      void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
 -
        /*
 -       * NOTE: this is not a relaxed write; it needs to guarantee that PTEs
 -       * cleared by the current CPU are visible to the SMMU before the TLBI.
 +       * The TLBI write may be relaxed, so ensure that PTEs cleared by the
 +       * current CPU are visible beforehand.
         */
 -      writel(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
 +      wmb();
 +      arm_smmu_cb_write(smmu_domain->smmu, smmu_domain->cfg.cbndx,
 +                        ARM_SMMU_CB_S1_TLBIASID, smmu_domain->cfg.asid);
        arm_smmu_tlb_sync_context(cookie);
  }
  
@@@ -304,143 -482,87 +304,143 @@@ static void arm_smmu_tlb_inv_context_s2
  {
        struct arm_smmu_domain *smmu_domain = cookie;
        struct arm_smmu_device *smmu = smmu_domain->smmu;
 -      void __iomem *base = ARM_SMMU_GR0(smmu);
  
 -      /* NOTE: see above */
 -      writel(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
 +      /* See above */
 +      wmb();
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
        arm_smmu_tlb_sync_global(smmu);
  }
  
 -static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 -                                        size_t granule, bool leaf, void *cookie)
 +static void arm_smmu_tlb_inv_range_s1(unsigned long iova, size_t size,
 +                                    size_t granule, bool leaf, void *cookie)
  {
        struct arm_smmu_domain *smmu_domain = cookie;
 +      struct arm_smmu_device *smmu = smmu_domain->smmu;
        struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 -      bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
 -      void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
 +      int reg, idx = cfg->cbndx;
  
 -      if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
 +      if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
                wmb();
  
 -      if (stage1) {
 -              reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 -
 -              if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
 -                      iova &= ~12UL;
 -                      iova |= cfg->asid;
 -                      do {
 -                              writel_relaxed(iova, reg);
 -                              iova += granule;
 -                      } while (size -= granule);
 -              } else {
 -                      iova >>= 12;
 -                      iova |= (u64)cfg->asid << 48;
 -                      do {
 -                              writeq_relaxed(iova, reg);
 -                              iova += granule >> 12;
 -                      } while (size -= granule);
 -              }
 +      reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 +
 +      if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
 +              iova = (iova >> 12) << 12;
 +              iova |= cfg->asid;
 +              do {
 +                      arm_smmu_cb_write(smmu, idx, reg, iova);
 +                      iova += granule;
 +              } while (size -= granule);
        } else {
 -              reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
 -                            ARM_SMMU_CB_S2_TLBIIPAS2;
                iova >>= 12;
 +              iova |= (u64)cfg->asid << 48;
                do {
 -                      smmu_write_atomic_lq(iova, reg);
 +                      arm_smmu_cb_writeq(smmu, idx, reg, iova);
                        iova += granule >> 12;
                } while (size -= granule);
        }
  }
  
 +static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size,
 +                                    size_t granule, bool leaf, void *cookie)
 +{
 +      struct arm_smmu_domain *smmu_domain = cookie;
 +      struct arm_smmu_device *smmu = smmu_domain->smmu;
 +      int reg, idx = smmu_domain->cfg.cbndx;
 +
 +      if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
 +              wmb();
 +
 +      reg = leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L : ARM_SMMU_CB_S2_TLBIIPAS2;
 +      iova >>= 12;
 +      do {
 +              if (smmu_domain->cfg.fmt == ARM_SMMU_CTX_FMT_AARCH64)
 +                      arm_smmu_cb_writeq(smmu, idx, reg, iova);
 +              else
 +                      arm_smmu_cb_write(smmu, idx, reg, iova);
 +              iova += granule >> 12;
 +      } while (size -= granule);
 +}
 +
  /*
   * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears
   * almost negligible, but the benefit of getting the first one in as far ahead
   * of the sync as possible is significant, hence we don't just make this a
 - * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think.
 + * no-op and set .tlb_sync to arm_smmu_tlb_inv_context_s2() as you might think.
   */
  static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size,
                                         size_t granule, bool leaf, void *cookie)
  {
        struct arm_smmu_domain *smmu_domain = cookie;
 -      void __iomem *base = ARM_SMMU_GR0(smmu_domain->smmu);
 +      struct arm_smmu_device *smmu = smmu_domain->smmu;
  
 -      if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
 +      if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
                wmb();
  
 -      writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
  }
  
 -static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = {
 -      .tlb_flush_all  = arm_smmu_tlb_inv_context_s1,
 -      .tlb_add_flush  = arm_smmu_tlb_inv_range_nosync,
 -      .tlb_sync       = arm_smmu_tlb_sync_context,
 +static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
 +                                size_t granule, void *cookie)
 +{
 +      struct arm_smmu_domain *smmu_domain = cookie;
 +      const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
 +
 +      ops->tlb_inv_range(iova, size, granule, false, cookie);
 +      ops->tlb_sync(cookie);
 +}
 +
 +static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
 +                                size_t granule, void *cookie)
 +{
 +      struct arm_smmu_domain *smmu_domain = cookie;
 +      const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
 +
 +      ops->tlb_inv_range(iova, size, granule, true, cookie);
 +      ops->tlb_sync(cookie);
 +}
 +
 +static void arm_smmu_tlb_add_page(struct iommu_iotlb_gather *gather,
 +                                unsigned long iova, size_t granule,
 +                                void *cookie)
 +{
 +      struct arm_smmu_domain *smmu_domain = cookie;
 +      const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops;
 +
 +      ops->tlb_inv_range(iova, granule, granule, true, cookie);
 +}
 +
 +static const struct arm_smmu_flush_ops arm_smmu_s1_tlb_ops = {
 +      .tlb = {
 +              .tlb_flush_all  = arm_smmu_tlb_inv_context_s1,
 +              .tlb_flush_walk = arm_smmu_tlb_inv_walk,
 +              .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
 +              .tlb_add_page   = arm_smmu_tlb_add_page,
 +      },
 +      .tlb_inv_range          = arm_smmu_tlb_inv_range_s1,
 +      .tlb_sync               = arm_smmu_tlb_sync_context,
  };
  
 -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = {
 -      .tlb_flush_all  = arm_smmu_tlb_inv_context_s2,
 -      .tlb_add_flush  = arm_smmu_tlb_inv_range_nosync,
 -      .tlb_sync       = arm_smmu_tlb_sync_context,
 +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = {
 +      .tlb = {
 +              .tlb_flush_all  = arm_smmu_tlb_inv_context_s2,
 +              .tlb_flush_walk = arm_smmu_tlb_inv_walk,
 +              .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
 +              .tlb_add_page   = arm_smmu_tlb_add_page,
 +      },
 +      .tlb_inv_range          = arm_smmu_tlb_inv_range_s2,
 +      .tlb_sync               = arm_smmu_tlb_sync_context,
  };
  
 -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = {
 -      .tlb_flush_all  = arm_smmu_tlb_inv_context_s2,
 -      .tlb_add_flush  = arm_smmu_tlb_inv_vmid_nosync,
 -      .tlb_sync       = arm_smmu_tlb_sync_vmid,
 +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = {
 +      .tlb = {
 +              .tlb_flush_all  = arm_smmu_tlb_inv_context_s2,
 +              .tlb_flush_walk = arm_smmu_tlb_inv_walk,
 +              .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
 +              .tlb_add_page   = arm_smmu_tlb_add_page,
 +      },
 +      .tlb_inv_range          = arm_smmu_tlb_inv_vmid_nosync,
 +      .tlb_sync               = arm_smmu_tlb_sync_vmid,
  };
  
  static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
        unsigned long iova;
        struct iommu_domain *domain = dev;
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 -      struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
        struct arm_smmu_device *smmu = smmu_domain->smmu;
 -      void __iomem *gr1_base = ARM_SMMU_GR1(smmu);
 -      void __iomem *cb_base;
 -
 -      cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 -      fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
 +      int idx = smmu_domain->cfg.cbndx;
  
 +      fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
        if (!(fsr & FSR_FAULT))
                return IRQ_NONE;
  
 -      fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
 -      iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR);
 -      cbfrsynra = readl_relaxed(gr1_base + ARM_SMMU_GR1_CBFRSYNRA(cfg->cbndx));
 +      fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
 +      iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
 +      cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
  
        dev_err_ratelimited(smmu->dev,
        "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
 -                          fsr, iova, fsynr, cbfrsynra, cfg->cbndx);
 +                          fsr, iova, fsynr, cbfrsynra, idx);
  
 -      writel(fsr, cb_base + ARM_SMMU_CB_FSR);
 +      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
        return IRQ_HANDLED;
  }
  
@@@ -472,11 -598,12 +472,11 @@@ static irqreturn_t arm_smmu_global_faul
  {
        u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
        struct arm_smmu_device *smmu = dev;
 -      void __iomem *gr0_base = ARM_SMMU_GR0_NS(smmu);
  
 -      gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
 -      gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
 -      gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
 -      gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);
 +      gfsr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
 +      gfsynr0 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR0);
 +      gfsynr1 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR1);
 +      gfsynr2 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR2);
  
        if (!gfsr)
                return IRQ_NONE;
                "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
                gfsr, gfsynr0, gfsynr1, gfsynr2);
  
 -      writel(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, gfsr);
        return IRQ_HANDLED;
  }
  
@@@ -500,16 -627,16 +500,16 @@@ static void arm_smmu_init_context_bank(
  
        cb->cfg = cfg;
  
 -      /* TTBCR */
 +      /* TCR */
        if (stage1) {
                if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
                        cb->tcr[0] = pgtbl_cfg->arm_v7s_cfg.tcr;
                } else {
                        cb->tcr[0] = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
                        cb->tcr[1] = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32;
 -                      cb->tcr[1] |= TTBCR2_SEP_UPSTREAM;
 +                      cb->tcr[1] |= FIELD_PREP(TCR2_SEP, TCR2_SEP_UPSTREAM);
                        if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
 -                              cb->tcr[1] |= TTBCR2_AS;
 +                              cb->tcr[1] |= TCR2_AS;
                }
        } else {
                cb->tcr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
                        cb->ttbr[1] = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
                } else {
                        cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
 -                      cb->ttbr[0] |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
 +                      cb->ttbr[0] |= FIELD_PREP(TTBRn_ASID, cfg->asid);
                        cb->ttbr[1] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
 -                      cb->ttbr[1] |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
 +                      cb->ttbr[1] |= FIELD_PREP(TTBRn_ASID, cfg->asid);
                }
        } else {
                cb->ttbr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
@@@ -548,71 -675,74 +548,71 @@@ static void arm_smmu_write_context_bank
        bool stage1;
        struct arm_smmu_cb *cb = &smmu->cbs[idx];
        struct arm_smmu_cfg *cfg = cb->cfg;
 -      void __iomem *cb_base, *gr1_base;
 -
 -      cb_base = ARM_SMMU_CB(smmu, idx);
  
        /* Unassigned context banks only need disabling */
        if (!cfg) {
 -              writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, 0);
                return;
        }
  
 -      gr1_base = ARM_SMMU_GR1(smmu);
        stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
  
        /* CBA2R */
        if (smmu->version > ARM_SMMU_V1) {
                if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
 -                      reg = CBA2R_RW64_64BIT;
 +                      reg = CBA2R_VA64;
                else
 -                      reg = CBA2R_RW64_32BIT;
 +                      reg = 0;
                /* 16-bit VMIDs live in CBA2R */
                if (smmu->features & ARM_SMMU_FEAT_VMID16)
 -                      reg |= cfg->vmid << CBA2R_VMID_SHIFT;
 +                      reg |= FIELD_PREP(CBA2R_VMID16, cfg->vmid);
  
 -              writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(idx));
 +              arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBA2R(idx), reg);
        }
  
        /* CBAR */
 -      reg = cfg->cbar;
 +      reg = FIELD_PREP(CBAR_TYPE, cfg->cbar);
        if (smmu->version < ARM_SMMU_V2)
 -              reg |= cfg->irptndx << CBAR_IRPTNDX_SHIFT;
 +              reg |= FIELD_PREP(CBAR_IRPTNDX, cfg->irptndx);
  
        /*
         * Use the weakest shareability/memory types, so they are
         * overridden by the ttbcr/pte.
         */
        if (stage1) {
 -              reg |= (CBAR_S1_BPSHCFG_NSH << CBAR_S1_BPSHCFG_SHIFT) |
 -                      (CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
 +              reg |= FIELD_PREP(CBAR_S1_BPSHCFG, CBAR_S1_BPSHCFG_NSH) |
 +                      FIELD_PREP(CBAR_S1_MEMATTR, CBAR_S1_MEMATTR_WB);
        } else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
                /* 8-bit VMIDs live in CBAR */
 -              reg |= cfg->vmid << CBAR_VMID_SHIFT;
 +              reg |= FIELD_PREP(CBAR_VMID, cfg->vmid);
        }
 -      writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(idx));
 +      arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(idx), reg);
  
        /*
 -       * TTBCR
 +       * TCR
         * We must write this before the TTBRs, since it determines the
         * access behaviour of some fields (in particular, ASID[15:8]).
         */
        if (stage1 && smmu->version > ARM_SMMU_V1)
 -              writel_relaxed(cb->tcr[1], cb_base + ARM_SMMU_CB_TTBCR2);
 -      writel_relaxed(cb->tcr[0], cb_base + ARM_SMMU_CB_TTBCR);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR2, cb->tcr[1]);
 +      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR, cb->tcr[0]);
  
        /* TTBRs */
        if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
 -              writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
 -              writel_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0);
 -              writel_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_CONTEXTIDR, cfg->asid);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR1, cb->ttbr[1]);
        } else {
 -              writeq_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0);
 +              arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
                if (stage1)
 -                      writeq_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1);
 +                      arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR1,
 +                                         cb->ttbr[1]);
        }
  
        /* MAIRs (stage-1 only) */
        if (stage1) {
 -              writel_relaxed(cb->mair[0], cb_base + ARM_SMMU_CB_S1_MAIR0);
 -              writel_relaxed(cb->mair[1], cb_base + ARM_SMMU_CB_S1_MAIR1);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR0, cb->mair[0]);
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR1, cb->mair[1]);
        }
  
        /* SCTLR */
        if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
                reg |= SCTLR_E;
  
 -      writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR);
 +      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
  }
  
  static int arm_smmu_init_domain_context(struct iommu_domain *domain,
                        ias = min(ias, 32UL);
                        oas = min(oas, 32UL);
                }
 -              smmu_domain->tlb_ops = &arm_smmu_s1_tlb_ops;
 +              smmu_domain->flush_ops = &arm_smmu_s1_tlb_ops;
                break;
        case ARM_SMMU_DOMAIN_NESTED:
                /*
                        oas = min(oas, 40UL);
                }
                if (smmu->version == ARM_SMMU_V2)
 -                      smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v2;
 +                      smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v2;
                else
 -                      smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v1;
 +                      smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v1;
                break;
        default:
                ret = -EINVAL;
        }
  
        if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
 -              cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
 +              cfg->vmid = cfg->cbndx + 1;
        else
 -              cfg->asid = cfg->cbndx + smmu->cavium_id_base;
 +              cfg->asid = cfg->cbndx;
 +
 +      smmu_domain->smmu = smmu;
 +      if (smmu->impl && smmu->impl->init_context) {
 +              ret = smmu->impl->init_context(smmu_domain);
 +              if (ret)
 +                      goto out_unlock;
 +      }
  
        pgtbl_cfg = (struct io_pgtable_cfg) {
                .pgsize_bitmap  = smmu->pgsize_bitmap,
                .ias            = ias,
                .oas            = oas,
                .coherent_walk  = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK,
 -              .tlb            = smmu_domain->tlb_ops,
 +              .tlb            = &smmu_domain->flush_ops->tlb,
                .iommu_dev      = smmu->dev,
        };
  
        if (smmu_domain->non_strict)
                pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
  
 -      smmu_domain->smmu = smmu;
        pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
        if (!pgtbl_ops) {
                ret = -ENOMEM;
@@@ -895,24 -1019,24 +895,24 @@@ static void arm_smmu_domain_free(struc
  static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx)
  {
        struct arm_smmu_smr *smr = smmu->smrs + idx;
 -      u32 reg = smr->id << SMR_ID_SHIFT | smr->mask << SMR_MASK_SHIFT;
 +      u32 reg = FIELD_PREP(SMR_ID, smr->id) | FIELD_PREP(SMR_MASK, smr->mask);
  
        if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid)
                reg |= SMR_VALID;
 -      writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_SMR(idx));
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(idx), reg);
  }
  
  static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
  {
        struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
 -      u32 reg = (s2cr->type & S2CR_TYPE_MASK) << S2CR_TYPE_SHIFT |
 -                (s2cr->cbndx & S2CR_CBNDX_MASK) << S2CR_CBNDX_SHIFT |
 -                (s2cr->privcfg & S2CR_PRIVCFG_MASK) << S2CR_PRIVCFG_SHIFT;
 +      u32 reg = FIELD_PREP(S2CR_TYPE, s2cr->type) |
 +                FIELD_PREP(S2CR_CBNDX, s2cr->cbndx) |
 +                FIELD_PREP(S2CR_PRIVCFG, s2cr->privcfg);
  
        if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs &&
            smmu->smrs[idx].valid)
                reg |= S2CR_EXIDVALID;
 -      writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_S2CR(idx));
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_S2CR(idx), reg);
  }
  
  static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx)
   */
  static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu)
  {
 -      void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
        u32 smr;
  
        if (!smmu->smrs)
         * bits are set, so check each one separately. We can reject
         * masters later if they try to claim IDs outside these masks.
         */
 -      smr = smmu->streamid_mask << SMR_ID_SHIFT;
 -      writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
 -      smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
 -      smmu->streamid_mask = smr >> SMR_ID_SHIFT;
 +      smr = FIELD_PREP(SMR_ID, smmu->streamid_mask);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(0), smr);
 +      smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(0));
 +      smmu->streamid_mask = FIELD_GET(SMR_ID, smr);
  
 -      smr = smmu->streamid_mask << SMR_MASK_SHIFT;
 -      writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
 -      smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
 -      smmu->smr_mask_mask = smr >> SMR_MASK_SHIFT;
 +      smr = FIELD_PREP(SMR_MASK, smmu->streamid_mask);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(0), smr);
 +      smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(0));
 +      smmu->smr_mask_mask = FIELD_GET(SMR_MASK, smr);
  }
  
  static int arm_smmu_find_sme(struct arm_smmu_device *smmu, u16 id, u16 mask)
@@@ -1015,8 -1140,8 +1015,8 @@@ static int arm_smmu_master_alloc_smes(s
        mutex_lock(&smmu->stream_map_mutex);
        /* Figure out a viable stream map entry allocation */
        for_each_cfg_sme(fwspec, i, idx) {
 -              u16 sid = fwspec->ids[i];
 -              u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT;
 +              u16 sid = FIELD_GET(SMR_ID, fwspec->ids[i]);
 +              u16 mask = FIELD_GET(SMR_MASK, fwspec->ids[i]);
  
                if (idx != INVALID_SMENDX) {
                        ret = -EEXIST;
@@@ -1176,7 -1301,7 +1176,7 @@@ static int arm_smmu_map(struct iommu_do
  }
  
  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 -                           size_t size)
 +                           size_t size, struct iommu_iotlb_gather *gather)
  {
        struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
        struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
                return 0;
  
        arm_smmu_rpm_get(smmu);
 -      ret = ops->unmap(ops, iova, size);
 +      ret = ops->unmap(ops, iova, size, gather);
        arm_smmu_rpm_put(smmu);
  
        return ret;
@@@ -1197,22 -1322,21 +1197,22 @@@ static void arm_smmu_flush_iotlb_all(st
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
        struct arm_smmu_device *smmu = smmu_domain->smmu;
  
 -      if (smmu_domain->tlb_ops) {
 +      if (smmu_domain->flush_ops) {
                arm_smmu_rpm_get(smmu);
 -              smmu_domain->tlb_ops->tlb_flush_all(smmu_domain);
 +              smmu_domain->flush_ops->tlb.tlb_flush_all(smmu_domain);
                arm_smmu_rpm_put(smmu);
        }
  }
  
 -static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
 +static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
 +                              struct iommu_iotlb_gather *gather)
  {
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
        struct arm_smmu_device *smmu = smmu_domain->smmu;
  
 -      if (smmu_domain->tlb_ops) {
 +      if (smmu_domain->flush_ops) {
                arm_smmu_rpm_get(smmu);
 -              smmu_domain->tlb_ops->tlb_sync(smmu_domain);
 +              smmu_domain->flush_ops->tlb_sync(smmu_domain);
                arm_smmu_rpm_put(smmu);
        }
  }
@@@ -1225,25 -1349,28 +1225,25 @@@ static phys_addr_t arm_smmu_iova_to_phy
        struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
        struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
        struct device *dev = smmu->dev;
 -      void __iomem *cb_base;
 +      void __iomem *reg;
        u32 tmp;
        u64 phys;
        unsigned long va, flags;
 -      int ret;
 +      int ret, idx = cfg->cbndx;
  
        ret = arm_smmu_rpm_get(smmu);
        if (ret < 0)
                return 0;
  
 -      cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 -
        spin_lock_irqsave(&smmu_domain->cb_lock, flags);
 -      /* ATS1 registers can only be written atomically */
        va = iova & ~0xfffUL;
 -      if (smmu->version == ARM_SMMU_V2)
 -              smmu_write_atomic_lq(va, cb_base + ARM_SMMU_CB_ATS1PR);
 -      else /* Register is only 32-bit in v1 */
 -              writel_relaxed(va, cb_base + ARM_SMMU_CB_ATS1PR);
 +      if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
 +              arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
 +      else
 +              arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
  
 -      if (readl_poll_timeout_atomic(cb_base + ARM_SMMU_CB_ATSR, tmp,
 -                                    !(tmp & ATSR_ACTIVE), 5, 50)) {
 +      reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR;
 +      if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ATSR_ACTIVE), 5, 50)) {
                spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
                dev_err(dev,
                        "iova to phys timed out on %pad. Falling back to software table walk.\n",
                return ops->iova_to_phys(ops, iova);
        }
  
 -      phys = readq_relaxed(cb_base + ARM_SMMU_CB_PAR);
 +      phys = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_PAR);
        spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
        if (phys & CB_PAR_F) {
                dev_err(dev, "translation fault!\n");
@@@ -1299,16 -1426,11 +1299,11 @@@ static bool arm_smmu_capable(enum iommu
        }
  }
  
- static int arm_smmu_match_node(struct device *dev, const void *data)
- {
-       return dev->fwnode == data;
- }
  static
  struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
  {
-       struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL,
-                                               fwnode, arm_smmu_match_node);
+       struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
+                                                         fwnode);
        put_device(dev);
        return dev ? dev_get_drvdata(dev) : NULL;
  }
@@@ -1339,8 -1461,8 +1334,8 @@@ static int arm_smmu_add_device(struct d
  
        ret = -EINVAL;
        for (i = 0; i < fwspec->num_ids; i++) {
 -              u16 sid = fwspec->ids[i];
 -              u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT;
 +              u16 sid = FIELD_GET(SMR_ID, fwspec->ids[i]);
 +              u16 mask = FIELD_GET(SMR_MASK, fwspec->ids[i]);
  
                if (sid & ~smmu->streamid_mask) {
                        dev_err(dev, "stream ID 0x%x out of range for SMMU (0x%x)\n",
@@@ -1521,12 -1643,12 +1516,12 @@@ static int arm_smmu_of_xlate(struct dev
        u32 mask, fwid = 0;
  
        if (args->args_count > 0)
 -              fwid |= (u16)args->args[0];
 +              fwid |= FIELD_PREP(SMR_ID, args->args[0]);
  
        if (args->args_count > 1)
 -              fwid |= (u16)args->args[1] << SMR_MASK_SHIFT;
 +              fwid |= FIELD_PREP(SMR_MASK, args->args[1]);
        else if (!of_property_read_u32(args->np, "stream-match-mask", &mask))
 -              fwid |= (u16)mask << SMR_MASK_SHIFT;
 +              fwid |= FIELD_PREP(SMR_MASK, mask);
  
        return iommu_fwspec_add_ids(dev, &fwid, 1);
  }
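
[Editor's note] The of_xlate change above packs the stream ID and the SMR mask into a single 32-bit fwspec ID with FIELD_PREP(), and the probe/attach paths unpack them with FIELD_GET(). The demo below writes the same packing out by hand in user space; the 16/16 bit split mirrors the SMR_ID/SMR_MASK layout but should be treated as illustrative rather than authoritative.

#include <stdint.h>
#include <stdio.h>

#define DEMO_SMR_ID_SHIFT	0
#define DEMO_SMR_ID_MASK	0xffffu
#define DEMO_SMR_MASK_SHIFT	16
#define DEMO_SMR_MASK_MASK	0xffffu

/* Pack stream ID and mask into one firmware ID word. */
static uint32_t pack_fwid(uint16_t sid, uint16_t mask)
{
	return ((uint32_t)sid << DEMO_SMR_ID_SHIFT) |
	       ((uint32_t)mask << DEMO_SMR_MASK_SHIFT);
}

int main(void)
{
	uint32_t fwid = pack_fwid(0x42, 0x7f00);
	uint16_t sid  = (fwid >> DEMO_SMR_ID_SHIFT) & DEMO_SMR_ID_MASK;
	uint16_t mask = (fwid >> DEMO_SMR_MASK_SHIFT) & DEMO_SMR_MASK_MASK;

	printf("fwid=0x%08x sid=0x%x mask=0x%x\n", fwid, sid, mask);
	return 0;
}
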
@@@ -1579,12 -1701,13 +1574,12 @@@ static struct iommu_ops arm_smmu_ops = 
  
  static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
  {
 -      void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
        int i;
 -      u32 reg, major;
 +      u32 reg;
  
        /* clear global FSR */
 -      reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
 -      writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
 +      reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, reg);
  
        /*
         * Reset stream mapping groups: Initial values mark all SMRn as
        for (i = 0; i < smmu->num_mapping_groups; ++i)
                arm_smmu_write_sme(smmu, i);
  
 -      if (smmu->model == ARM_MMU500) {
 -              /*
 -               * Before clearing ARM_MMU500_ACTLR_CPRE, need to
 -               * clear CACHE_LOCK bit of ACR first. And, CACHE_LOCK
 -               * bit is only present in MMU-500r2 onwards.
 -               */
 -              reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID7);
 -              major = (reg >> ID7_MAJOR_SHIFT) & ID7_MAJOR_MASK;
 -              reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sACR);
 -              if (major >= 2)
 -                      reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
 -              /*
 -               * Allow unmatched Stream IDs to allocate bypass
 -               * TLB entries for reduced latency.
 -               */
 -              reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
 -              writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sACR);
 -      }
 -
        /* Make sure all context banks are disabled and clear CB_FSR  */
        for (i = 0; i < smmu->num_context_banks; ++i) {
 -              void __iomem *cb_base = ARM_SMMU_CB(smmu, i);
 -
                arm_smmu_write_context_bank(smmu, i);
 -              writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
 -              /*
 -               * Disable MMU-500's not-particularly-beneficial next-page
 -               * prefetcher for the sake of errata #841119 and #826419.
 -               */
 -              if (smmu->model == ARM_MMU500) {
 -                      reg = readl_relaxed(cb_base + ARM_SMMU_CB_ACTLR);
 -                      reg &= ~ARM_MMU500_ACTLR_CPRE;
 -                      writel_relaxed(reg, cb_base + ARM_SMMU_CB_ACTLR);
 -              }
 +              arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, FSR_FAULT);
        }
  
        /* Invalidate the TLB, just in case */
 -      writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLH);
 -      writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLH, QCOM_DUMMY_VAL);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLNSNH, QCOM_DUMMY_VAL);
  
 -      reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
 +      reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sCR0);
  
        /* Enable fault reporting */
        reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
        reg &= ~sCR0_FB;
  
        /* Don't upgrade barriers */
 -      reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
 +      reg &= ~(sCR0_BSU);
  
        if (smmu->features & ARM_SMMU_FEAT_VMID16)
                reg |= sCR0_VMID16EN;
        if (smmu->features & ARM_SMMU_FEAT_EXIDS)
                reg |= sCR0_EXIDENABLE;
  
 +      if (smmu->impl && smmu->impl->reset)
 +              smmu->impl->reset(smmu);
 +
        /* Push the button */
        arm_smmu_tlb_sync_global(smmu);
 -      writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, reg);
  }
  
  static int arm_smmu_id_size_to_bits(int size)
  
  static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
  {
 -      unsigned long size;
 -      void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 +      unsigned int size;
        u32 id;
        bool cttw_reg, cttw_fw = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK;
        int i;
                        smmu->version == ARM_SMMU_V2 ? 2 : 1);
  
        /* ID0 */
 -      id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID0);
 +      id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID0);
  
        /* Restrict available stages based on module parameter */
        if (force_stage == 1)
                smmu->features |= ARM_SMMU_FEAT_EXIDS;
                size = 1 << 16;
        } else {
 -              size = 1 << ((id >> ID0_NUMSIDB_SHIFT) & ID0_NUMSIDB_MASK);
 +              size = 1 << FIELD_GET(ID0_NUMSIDB, id);
        }
        smmu->streamid_mask = size - 1;
        if (id & ID0_SMS) {
                smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH;
 -              size = (id >> ID0_NUMSMRG_SHIFT) & ID0_NUMSMRG_MASK;
 +              size = FIELD_GET(ID0_NUMSMRG, id);
                if (size == 0) {
                        dev_err(smmu->dev,
                                "stream-matching supported, but no SMRs present!\n");
                        return -ENOMEM;
  
                dev_notice(smmu->dev,
 -                         "\tstream matching with %lu register groups", size);
 +                         "\tstream matching with %u register groups", size);
        }
        /* s2cr->type == 0 means translation, so initialise explicitly */
        smmu->s2crs = devm_kmalloc_array(smmu->dev, size, sizeof(*smmu->s2crs),
        }
  
        /* ID1 */
 -      id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1);
 +      id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID1);
        smmu->pgshift = (id & ID1_PAGESIZE) ? 16 : 12;
  
        /* Check for size mismatch of SMMU address space from mapped region */
 -      size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
 -      size <<= smmu->pgshift;
 -      if (smmu->cb_base != gr0_base + size)
 +      size = 1 << (FIELD_GET(ID1_NUMPAGENDXB, id) + 1);
 +      if (smmu->numpage != 2 * size << smmu->pgshift)
                dev_warn(smmu->dev,
 -                      "SMMU address space size (0x%lx) differs from mapped region size (0x%tx)!\n",
 -                      size * 2, (smmu->cb_base - gr0_base) * 2);
 +                      "SMMU address space size (0x%x) differs from mapped region size (0x%x)!\n",
 +                      2 * size << smmu->pgshift, smmu->numpage);
 +      /* Now properly encode NUMPAGE to subsequently derive SMMU_CB_BASE */
 +      smmu->numpage = size;
  
 -      smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK;
 -      smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
 +      smmu->num_s2_context_banks = FIELD_GET(ID1_NUMS2CB, id);
 +      smmu->num_context_banks = FIELD_GET(ID1_NUMCB, id);
        if (smmu->num_s2_context_banks > smmu->num_context_banks) {
                dev_err(smmu->dev, "impossible number of S2 context banks!\n");
                return -ENODEV;
        }
        dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n",
                   smmu->num_context_banks, smmu->num_s2_context_banks);
 -      /*
 -       * Cavium CN88xx erratum #27704.
 -       * Ensure ASID and VMID allocation is unique across all SMMUs in
 -       * the system.
 -       */
 -      if (smmu->model == CAVIUM_SMMUV2) {
 -              smmu->cavium_id_base =
 -                      atomic_add_return(smmu->num_context_banks,
 -                                        &cavium_smmu_context_count);
 -              smmu->cavium_id_base -= smmu->num_context_banks;
 -              dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
 -      }
        smmu->cbs = devm_kcalloc(smmu->dev, smmu->num_context_banks,
                                 sizeof(*smmu->cbs), GFP_KERNEL);
        if (!smmu->cbs)
                return -ENOMEM;
  
        /* ID2 */
 -      id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID2);
 -      size = arm_smmu_id_size_to_bits((id >> ID2_IAS_SHIFT) & ID2_IAS_MASK);
 +      id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID2);
 +      size = arm_smmu_id_size_to_bits(FIELD_GET(ID2_IAS, id));
        smmu->ipa_size = size;
  
        /* The output mask is also applied for bypass */
 -      size = arm_smmu_id_size_to_bits((id >> ID2_OAS_SHIFT) & ID2_OAS_MASK);
 +      size = arm_smmu_id_size_to_bits(FIELD_GET(ID2_OAS, id));
        smmu->pa_size = size;
  
        if (id & ID2_VMID16)
                if (smmu->version == ARM_SMMU_V1_64K)
                        smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
        } else {
 -              size = (id >> ID2_UBS_SHIFT) & ID2_UBS_MASK;
 +              size = FIELD_GET(ID2_UBS, id);
                smmu->va_size = arm_smmu_id_size_to_bits(size);
                if (id & ID2_PTFS_4K)
                        smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_4K;
                dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
                           smmu->ipa_size, smmu->pa_size);
  
 +      if (smmu->impl && smmu->impl->cfg_probe)
 +              return smmu->impl->cfg_probe(smmu);
 +
        return 0;
  }
  
@@@ -1966,6 -2125,8 +1961,6 @@@ static int arm_smmu_device_dt_probe(str
        smmu->version = data->version;
        smmu->model = data->model;
  
 -      parse_driver_options(smmu);
 -
        legacy_binding = of_find_property(dev->of_node, "mmu-masters", NULL);
        if (legacy_binding && !using_generic_binding) {
                if (!using_legacy_binding)
@@@ -2028,20 -2189,12 +2023,20 @@@ static int arm_smmu_device_probe(struc
        if (err)
                return err;
  
 +      smmu = arm_smmu_impl_init(smmu);
 +      if (IS_ERR(smmu))
 +              return PTR_ERR(smmu);
 +
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        ioaddr = res->start;
        smmu->base = devm_ioremap_resource(dev, res);
        if (IS_ERR(smmu->base))
                return PTR_ERR(smmu->base);
 -      smmu->cb_base = smmu->base + resource_size(res) / 2;
 +      /*
 +       * The resource size should effectively match the value of SMMU_TOP;
 +       * stash that temporarily until we know PAGESIZE to validate it with.
 +       */
 +      smmu->numpage = resource_size(res);
  
        num_irqs = 0;
        while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
@@@ -2181,7 -2334,7 +2176,7 @@@ static void arm_smmu_device_shutdown(st
  
        arm_smmu_rpm_get(smmu);
        /* Turn the thing off */
 -      writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
 +      arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, sCR0_CLIENTPD);
        arm_smmu_rpm_put(smmu);
  
        if (pm_runtime_enabled(smmu->dev))
index 9112faa6a9a0e12830e8f931e83fcd56070e1ac5,20dcc9c03adc29d058bc7334f05f7f4e9e4665e1..afefb29ce1b0decf8f7ab2105be70ee9f5d59b7a
@@@ -416,10 -416,8 +416,10 @@@ device_node *regulator_of_get_init_node
                if (!name)
                        name = child->name;
  
 -              if (!strcmp(desc->of_match, name))
 +              if (!strcmp(desc->of_match, name)) {
 +                      of_node_put(search);
                        return of_node_get(child);
 +              }
        }
  
        of_node_put(search);
@@@ -462,16 -460,11 +462,11 @@@ error
        return NULL;
  }
  
- static int of_node_match(struct device *dev, const void *data)
- {
-       return dev->of_node == data;
- }
  struct regulator_dev *of_find_regulator_by_node(struct device_node *np)
  {
        struct device *dev;
  
-       dev = class_find_device(&regulator_class, NULL, np, of_node_match);
+       dev = class_find_device_by_of_node(&regulator_class, np);
  
        return dev ? dev_to_rdev(dev) : NULL;
  }
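of_find_regulator_by_node() now uses the generic class_find_device_by_of_node() helper. A hypothetical lookup on some other class follows the same shape (my_class, my_classdev and to_my_classdev() are illustrative names); the helper returns a referenced device that the caller eventually releases with put_device():

    static struct my_classdev *my_find_by_node(struct device_node *np)
    {
            struct device *dev = class_find_device_by_of_node(&my_class, np);

            /* NULL if nothing in the class matches np; otherwise a ref is held. */
            return dev ? to_my_classdev(dev) : NULL;
    }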
index 563801427fe4609aa53c79c75bee34a07b0450a7,150f6236c9bb4eeeabb2789215f35fa60e38b626..45bdb47f84c10b69572c77cbfa1b1f9ca16b1b5a
@@@ -35,7 -35,6 +35,7 @@@
  
  #include "zcrypt_msgtype6.h"
  #include "zcrypt_msgtype50.h"
 +#include "zcrypt_ccamisc.h"
  
  /*
   * Module description.
@@@ -134,18 -133,6 +134,6 @@@ struct zcdn_device 
  static int zcdn_create(const char *name);
  static int zcdn_destroy(const char *name);
  
- /* helper function, matches the name for find_zcdndev_by_name() */
- static int __match_zcdn_name(struct device *dev, const void *data)
- {
-       return strcmp(dev_name(dev), (const char *)data) == 0;
- }
- /* helper function, matches the devt value for find_zcdndev_by_devt() */
- static int __match_zcdn_devt(struct device *dev, const void *data)
- {
-       return dev->devt == *((dev_t *) data);
- }
  /*
   * Find zcdn device by name.
   * Returns reference to the zcdn device which needs to be released
   */
  static inline struct zcdn_device *find_zcdndev_by_name(const char *name)
  {
-       struct device *dev =
-               class_find_device(zcrypt_class, NULL,
-                                 (void *) name,
-                                 __match_zcdn_name);
+       struct device *dev = class_find_device_by_name(zcrypt_class, name);
  
        return dev ? to_zcdn_dev(dev) : NULL;
  }
   */
  static inline struct zcdn_device *find_zcdndev_by_devt(dev_t devt)
  {
-       struct device *dev =
-               class_find_device(zcrypt_class, NULL,
-                                 (void *) &devt,
-                                 __match_zcdn_devt);
+       struct device *dev = class_find_device_by_devt(zcrypt_class, devt);
  
        return dev ? to_zcdn_dev(dev) : NULL;
  }
@@@ -1161,34 -1142,6 +1143,34 @@@ void zcrypt_device_status_mask_ext(stru
  }
  EXPORT_SYMBOL(zcrypt_device_status_mask_ext);
  
 +int zcrypt_device_status_ext(int card, int queue,
 +                           struct zcrypt_device_status_ext *devstat)
 +{
 +      struct zcrypt_card *zc;
 +      struct zcrypt_queue *zq;
 +
 +      memset(devstat, 0, sizeof(*devstat));
 +
 +      spin_lock(&zcrypt_list_lock);
 +      for_each_zcrypt_card(zc) {
 +              for_each_zcrypt_queue(zq, zc) {
 +                      if (card == AP_QID_CARD(zq->queue->qid) &&
 +                          queue == AP_QID_QUEUE(zq->queue->qid)) {
 +                              devstat->hwtype = zc->card->ap_dev.device_type;
 +                              devstat->functions = zc->card->functions >> 26;
 +                              devstat->qid = zq->queue->qid;
 +                              devstat->online = zq->online ? 0x01 : 0x00;
 +                              spin_unlock(&zcrypt_list_lock);
 +                              return 0;
 +                      }
 +              }
 +      }
 +      spin_unlock(&zcrypt_list_lock);
 +
 +      return -ENODEV;
 +}
 +EXPORT_SYMBOL(zcrypt_device_status_ext);
 +
  static void zcrypt_status_mask(char status[], size_t max_adapters)
  {
        struct zcrypt_card *zc;
@@@ -1903,7 -1856,6 +1885,7 @@@ void __exit zcrypt_api_exit(void
        misc_deregister(&zcrypt_misc_device);
        zcrypt_msgtype6_exit();
        zcrypt_msgtype50_exit();
 +      zcrypt_ccamisc_exit();
        zcrypt_debug_exit();
  }
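zcrypt_device_status_ext() is newly exported so that other zcrypt code (presumably the CCA misc code hooked up above) can query a single (card, queue) pair. A rough usage sketch; the wrapper name is made up:

    static bool my_queue_is_online(int card, int queue)
    {
            struct zcrypt_device_status_ext devstat;

            /* Fills devstat and returns 0 if the (card, queue) pair exists. */
            if (zcrypt_device_status_ext(card, queue, &devstat))
                    return false;
            return devstat.online;
    }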
  
diff --combined drivers/spi/spi.c
index f8b4654a57d3950f9678e0b3df0de8ddbfe4bee5,c486a6f84c2c9f14ffc9d0aba0a0b7933a553129..f9502dbbb5c1e5a590289bf2818205f508d5b105
@@@ -1265,9 -1265,8 +1265,9 @@@ EXPORT_SYMBOL_GPL(spi_finalize_current_
   */
  static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
  {
 -      unsigned long flags;
 +      struct spi_message *msg;
        bool was_busy = false;
 +      unsigned long flags;
        int ret;
  
        /* Lock queue */
        }
  
        /* Extract head of queue */
 -      ctlr->cur_msg =
 -              list_first_entry(&ctlr->queue, struct spi_message, queue);
 +      msg = list_first_entry(&ctlr->queue, struct spi_message, queue);
 +      ctlr->cur_msg = msg;
  
 -      list_del_init(&ctlr->cur_msg->queue);
 +      list_del_init(&msg->queue);
        if (ctlr->busy)
                was_busy = true;
        else
                        if (ctlr->auto_runtime_pm)
                                pm_runtime_put(ctlr->dev.parent);
  
 -                      ctlr->cur_msg->status = ret;
 +                      msg->status = ret;
                        spi_finalize_current_message(ctlr);
  
                        mutex_unlock(&ctlr->io_mutex);
                }
        }
  
 -      trace_spi_message_start(ctlr->cur_msg);
 +      trace_spi_message_start(msg);
  
        if (ctlr->prepare_message) {
 -              ret = ctlr->prepare_message(ctlr, ctlr->cur_msg);
 +              ret = ctlr->prepare_message(ctlr, msg);
                if (ret) {
                        dev_err(&ctlr->dev, "failed to prepare message: %d\n",
                                ret);
 -                      ctlr->cur_msg->status = ret;
 +                      msg->status = ret;
                        spi_finalize_current_message(ctlr);
                        goto out;
                }
                ctlr->cur_msg_prepared = true;
        }
  
 -      ret = spi_map_msg(ctlr, ctlr->cur_msg);
 +      ret = spi_map_msg(ctlr, msg);
        if (ret) {
 -              ctlr->cur_msg->status = ret;
 +              msg->status = ret;
                spi_finalize_current_message(ctlr);
                goto out;
        }
  
 -      ret = ctlr->transfer_one_message(ctlr, ctlr->cur_msg);
 +      ret = ctlr->transfer_one_message(ctlr, msg);
        if (ret) {
                dev_err(&ctlr->dev,
                        "failed to transfer one message from queue\n");
@@@ -1435,7 -1434,7 +1435,7 @@@ static void spi_pump_messages(struct kt
   */
  static void spi_set_thread_rt(struct spi_controller *ctlr)
  {
 -      struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 +      struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
  
        dev_info(&ctlr->dev,
                "will run message pump with realtime priority\n");
@@@ -2106,8 -2105,8 +2106,8 @@@ static int match_true(struct device *de
        return 1;
  }
  
 -static ssize_t spi_slave_show(struct device *dev,
 -                            struct device_attribute *attr, char *buf)
 +static ssize_t slave_show(struct device *dev, struct device_attribute *attr,
 +                        char *buf)
  {
        struct spi_controller *ctlr = container_of(dev, struct spi_controller,
                                                   dev);
                       child ? to_spi_device(child)->modalias : NULL);
  }
  
 -static ssize_t spi_slave_store(struct device *dev,
 -                             struct device_attribute *attr, const char *buf,
 -                             size_t count)
 +static ssize_t slave_store(struct device *dev, struct device_attribute *attr,
 +                         const char *buf, size_t count)
  {
        struct spi_controller *ctlr = container_of(dev, struct spi_controller,
                                                   dev);
        return count;
  }
  
 -static DEVICE_ATTR(slave, 0644, spi_slave_show, spi_slave_store);
 +static DEVICE_ATTR_RW(slave);
  
  static struct attribute *spi_slave_attrs[] = {
        &dev_attr_slave.attr,
@@@ -2188,10 -2188,8 +2188,10 @@@ extern struct class spi_slave_class;  /
   * __spi_alloc_controller - allocate an SPI master or slave controller
   * @dev: the controller, possibly using the platform_bus
   * @size: how much zeroed driver-private data to allocate; the pointer to this
 - *    memory is in the driver_data field of the returned device,
 - *    accessible with spi_controller_get_devdata().
 + *    memory is in the driver_data field of the returned device, accessible
 + *    with spi_controller_get_devdata(); the memory is cacheline aligned;
 + *    drivers granting DMA access to portions of their private data need to
 + *    round up @size using ALIGN(size, dma_get_cache_alignment()).
   * @slave: flag indicating whether to allocate an SPI master (false) or SPI
   *    slave (true) controller
   * Context: can sleep
@@@ -2213,12 -2211,11 +2213,12 @@@ struct spi_controller *__spi_alloc_cont
                                              unsigned int size, bool slave)
  {
        struct spi_controller   *ctlr;
 +      size_t ctlr_size = ALIGN(sizeof(*ctlr), dma_get_cache_alignment());
  
        if (!dev)
                return NULL;
  
 -      ctlr = kzalloc(size + sizeof(*ctlr), GFP_KERNEL);
 +      ctlr = kzalloc(size + ctlr_size, GFP_KERNEL);
        if (!ctlr)
                return NULL;
  
                ctlr->dev.class = &spi_master_class;
        ctlr->dev.parent = dev;
        pm_suspend_ignore_children(&ctlr->dev, true);
 -      spi_controller_set_devdata(ctlr, &ctlr[1]);
 +      spi_controller_set_devdata(ctlr, (void *)ctlr + ctlr_size);
  
        return ctlr;
  }
  EXPORT_SYMBOL_GPL(__spi_alloc_controller);
  
  #ifdef CONFIG_OF
 -static int of_spi_register_master(struct spi_controller *ctlr)
 +static int of_spi_get_gpio_numbers(struct spi_controller *ctlr)
  {
        int nb, i, *cs;
        struct device_node *np = ctlr->dev.of_node;
        return 0;
  }
  #else
 -static int of_spi_register_master(struct spi_controller *ctlr)
 +static int of_spi_get_gpio_numbers(struct spi_controller *ctlr)
  {
        return 0;
  }
@@@ -2459,7 -2456,7 +2459,7 @@@ int spi_register_controller(struct spi_
                        ctlr->mode_bits |= SPI_CS_HIGH;
                } else {
                        /* Legacy code path for GPIOs from DT */
 -                      status = of_spi_register_master(ctlr);
 +                      status = of_spi_get_gpio_numbers(ctlr);
                        if (status)
                                return status;
                }
@@@ -3655,37 -3652,25 +3655,25 @@@ EXPORT_SYMBOL_GPL(spi_write_then_read)
  /*-------------------------------------------------------------------------*/
  
  #if IS_ENABLED(CONFIG_OF)
- static int __spi_of_device_match(struct device *dev, const void *data)
- {
-       return dev->of_node == data;
- }
  /* must call put_device() when done with returned spi_device device */
  struct spi_device *of_find_spi_device_by_node(struct device_node *node)
  {
-       struct device *dev = bus_find_device(&spi_bus_type, NULL, node,
-                                               __spi_of_device_match);
+       struct device *dev = bus_find_device_by_of_node(&spi_bus_type, node);
        return dev ? to_spi_device(dev) : NULL;
  }
  EXPORT_SYMBOL_GPL(of_find_spi_device_by_node);
  #endif /* IS_ENABLED(CONFIG_OF) */
  
  #if IS_ENABLED(CONFIG_OF_DYNAMIC)
- static int __spi_of_controller_match(struct device *dev, const void *data)
- {
-       return dev->of_node == data;
- }
  /* the spi controllers are not using spi_bus, so we find them another way */
  static struct spi_controller *of_find_spi_controller_by_node(struct device_node *node)
  {
        struct device *dev;
  
-       dev = class_find_device(&spi_master_class, NULL, node,
-                               __spi_of_controller_match);
+       dev = class_find_device_by_of_node(&spi_master_class, node);
        if (!dev && IS_ENABLED(CONFIG_SPI_SLAVE))
-               dev = class_find_device(&spi_slave_class, NULL, node,
-                                       __spi_of_controller_match);
+               dev = class_find_device_by_of_node(&spi_slave_class, node);
        if (!dev)
                return NULL;
  
@@@ -3756,11 -3741,6 +3744,6 @@@ static int spi_acpi_controller_match(st
        return ACPI_COMPANION(dev->parent) == data;
  }
  
- static int spi_acpi_device_match(struct device *dev, const void *data)
- {
-       return ACPI_COMPANION(dev) == data;
- }
  static struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev)
  {
        struct device *dev;
@@@ -3780,8 -3760,7 +3763,7 @@@ static struct spi_device *acpi_spi_find
  {
        struct device *dev;
  
-       dev = bus_find_device(&spi_bus_type, NULL, adev, spi_acpi_device_match);
+       dev = bus_find_device_by_acpi_dev(&spi_bus_type, adev);
        return dev ? to_spi_device(dev) : NULL;
  }
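The updated __spi_alloc_controller() kerneldoc asks drivers that grant DMA access to their private data to round the requested size up to a cacheline. A hypothetical controller probe (my_spi_priv and my_spi_probe are illustrative) might therefore do:

    struct my_spi_priv {
            /* ... controller state, including buffers handed to DMA ... */
            u8 rx_bounce[64];
    };

    static int my_spi_probe(struct platform_device *pdev)
    {
            struct spi_controller *ctlr;

            /* Round up so the trailing DMA-able bytes own their cachelines. */
            ctlr = spi_alloc_master(&pdev->dev,
                                    ALIGN(sizeof(struct my_spi_priv),
                                          dma_get_cache_alignment()));
            if (!ctlr)
                    return -ENOMEM;

            /* ... usual controller setup and spi_register_controller() ... */
            return 0;
    }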
  
diff --combined drivers/usb/core/devio.c
index 9063ede411ae73e9ceffe7430ad578f6c7283878,60268aee93a8439ba1e73922ab1d8149fc9b710c..e3696601e43bbc6cb4fc7605fe43d72d70915a46
@@@ -942,17 -942,11 +942,11 @@@ error
        return ret;
  }
  
- static int match_devt(struct device *dev, const void *data)
- {
-       return dev->devt == (dev_t)(unsigned long)(void *)data;
- }
  static struct usb_device *usbdev_lookup_by_devt(dev_t devt)
  {
        struct device *dev;
  
-       dev = bus_find_device(&usb_bus_type, NULL,
-                             (void *) (unsigned long) devt, match_devt);
+       dev = bus_find_device_by_devt(&usb_bus_type, devt);
        if (!dev)
                return NULL;
        return to_usb_device(dev);
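usbdev_lookup_by_devt() now passes the dev_t by value instead of laundering it through a void pointer. The same pattern on a hypothetical bus (my_bus_type, my_dev and to_my_dev() are made-up names):

    static struct my_dev *my_lookup_by_devt(dev_t devt)
    {
            /* Returns a referenced device or NULL; the caller put_device()s it later. */
            struct device *dev = bus_find_device_by_devt(&my_bus_type, devt);

            return dev ? to_my_dev(dev) : NULL;
    }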
@@@ -1812,6 -1806,8 +1806,6 @@@ static int proc_do_submiturb(struct usb
        return 0;
  
   error:
 -      if (as && as->usbm)
 -              dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count);
        kfree(isopkt);
        kfree(dr);
        if (as)
diff --combined include/linux/device.h
index 6717adee33f0199627c04c6e5b9363552e97a574,8ae3f4b472934270e99b3eff7c4d1a2fe7636639..8b9bffde0d8662ce93f01e34a589a6faae84a8b0
@@@ -164,16 -164,100 +164,100 @@@ void subsys_dev_iter_init(struct subsys
  struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter);
  void subsys_dev_iter_exit(struct subsys_dev_iter *iter);
  
+ int device_match_name(struct device *dev, const void *name);
  int device_match_of_node(struct device *dev, const void *np);
+ int device_match_fwnode(struct device *dev, const void *fwnode);
+ int device_match_devt(struct device *dev, const void *pdevt);
+ int device_match_acpi_dev(struct device *dev, const void *adev);
+ int device_match_any(struct device *dev, const void *unused);
  
  int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
                     int (*fn)(struct device *dev, void *data));
  struct device *bus_find_device(struct bus_type *bus, struct device *start,
                               const void *data,
                               int (*match)(struct device *dev, const void *data));
- struct device *bus_find_device_by_name(struct bus_type *bus,
-                                      struct device *start,
-                                      const char *name);
+ /**
+  * bus_find_device_by_name - device iterator for locating a particular device
+  * of a specific name.
+  * @bus: bus type
+  * @start: Device to begin with
+  * @name: name of the device to match
+  */
+ static inline struct device *bus_find_device_by_name(struct bus_type *bus,
+                                                    struct device *start,
+                                                    const char *name)
+ {
+       return bus_find_device(bus, start, name, device_match_name);
+ }
+ /**
+  * bus_find_device_by_of_node : device iterator for locating a particular device
+  * matching the of_node.
+  * @bus: bus type
+  * @np: of_node of the device to match.
+  */
+ static inline struct device *
+ bus_find_device_by_of_node(struct bus_type *bus, const struct device_node *np)
+ {
+       return bus_find_device(bus, NULL, np, device_match_of_node);
+ }
+ /**
+  * bus_find_device_by_fwnode : device iterator for locating a particular device
+  * matching the fwnode.
+  * @bus: bus type
+  * @fwnode: fwnode of the device to match.
+  */
+ static inline struct device *
+ bus_find_device_by_fwnode(struct bus_type *bus, const struct fwnode_handle *fwnode)
+ {
+       return bus_find_device(bus, NULL, fwnode, device_match_fwnode);
+ }
+ /**
+  * bus_find_device_by_devt : device iterator for locating a particular device
+  * matching the devt (device number).
+  * @bus: bus type
+  * @devt: devt of the device to match.
+  */
+ static inline struct device *bus_find_device_by_devt(struct bus_type *bus,
+                                                    dev_t devt)
+ {
+       return bus_find_device(bus, NULL, &devt, device_match_devt);
+ }
+ /**
+  * bus_find_next_device - Find the next device after a given device in a
+  * given bus.
+  * @bus: bus type
+  * @cur: device to begin the search with
+  */
+ static inline struct device *
+ bus_find_next_device(struct bus_type *bus, struct device *cur)
+ {
+       return bus_find_device(bus, cur, NULL, device_match_any);
+ }
+ #ifdef CONFIG_ACPI
+ struct acpi_device;
+ /**
+  * bus_find_device_by_acpi_dev : device iterator for locating a particular device
+  * matching the ACPI COMPANION device.
+  * @bus: bus type
+  * @adev: ACPI COMPANION device to match.
+  */
+ static inline struct device *
+ bus_find_device_by_acpi_dev(struct bus_type *bus, const struct acpi_device *adev)
+ {
+       return bus_find_device(bus, NULL, adev, device_match_acpi_dev);
+ }
+ #else
+ static inline struct device *
+ bus_find_device_by_acpi_dev(struct bus_type *bus, const void *adev)
+ {
+       return NULL;
+ }
+ #endif
  struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id,
                                        struct device *hint);
  int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,
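Besides the by-name/of_node/fwnode/devt/ACPI lookups, device_match_any() enables bus_find_next_device() for plain iteration. A rough sketch of walking a bus with it (purely illustrative; each call returns a referenced device, so the previous one is dropped before moving on):

    static void my_walk_bus(struct bus_type *bus)
    {
            struct device *dev = bus_find_next_device(bus, NULL);

            while (dev) {
                    struct device *next = bus_find_next_device(bus, dev);

                    /* ... inspect dev here ... */
                    put_device(dev);
                    dev = next;
            }
    }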
@@@ -342,6 -426,83 +426,83 @@@ struct device *driver_find_device(struc
                                  struct device *start, const void *data,
                                  int (*match)(struct device *dev, const void *data));
  
+ /**
+  * driver_find_device_by_name - device iterator for locating a particular device
+  * of a specific name.
+  * @drv: the driver we're iterating
+  * @name: name of the device to match
+  */
+ static inline struct device *driver_find_device_by_name(struct device_driver *drv,
+                                                       const char *name)
+ {
+       return driver_find_device(drv, NULL, name, device_match_name);
+ }
+ /**
+  * driver_find_device_by_of_node - device iterator for locating a particular device
+  * by of_node pointer.
+  * @drv: the driver we're iterating
+  * @np: of_node pointer to match.
+  */
+ static inline struct device *
+ driver_find_device_by_of_node(struct device_driver *drv,
+                             const struct device_node *np)
+ {
+       return driver_find_device(drv, NULL, np, device_match_of_node);
+ }
+ /**
+  * driver_find_device_by_fwnode - device iterator for locating a particular device
+  * by fwnode pointer.
+  * @drv: the driver we're iterating
+  * @fwnode: fwnode pointer to match.
+  */
+ static inline struct device *
+ driver_find_device_by_fwnode(struct device_driver *drv,
+                            const struct fwnode_handle *fwnode)
+ {
+       return driver_find_device(drv, NULL, fwnode, device_match_fwnode);
+ }
+ /**
+  * driver_find_device_by_devt - device iterator for locating a particular device
+  * by devt.
+  * @drv: the driver we're iterating
+  * @devt: devt of the device to match.
+  */
+ static inline struct device *driver_find_device_by_devt(struct device_driver *drv,
+                                                       dev_t devt)
+ {
+       return driver_find_device(drv, NULL, &devt, device_match_devt);
+ }
+ /**
+  * driver_find_next_device - Find the next device after @start for @drv.
+  * @drv: the driver we're iterating
+  * @start: device to begin the search with
+  */
+ static inline struct device *driver_find_next_device(struct device_driver *drv,
+                                                    struct device *start)
+ {
+       return driver_find_device(drv, start, NULL, device_match_any);
+ }
+ #ifdef CONFIG_ACPI
+ /**
+  * driver_find_device_by_acpi_dev : device iterator for locating a particular
+  * device matching the ACPI_COMPANION device.
+  * @drv: the driver we're iterating
+  * @adev: ACPI_COMPANION device to match.
+  */
+ static inline struct device *
+ driver_find_device_by_acpi_dev(struct device_driver *drv,
+                              const struct acpi_device *adev)
+ {
+       return driver_find_device(drv, NULL, adev, device_match_acpi_dev);
+ }
+ #else
+ static inline struct device *
+ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev)
+ {
+       return NULL;
+ }
+ #endif
  void driver_deferred_probe_add(struct device *dev);
  int driver_deferred_probe_check_state(struct device *dev);
  int driver_deferred_probe_check_state_continue(struct device *dev);
@@@ -471,6 -632,76 +632,76 @@@ extern struct device *class_find_device
                                        struct device *start, const void *data,
                                        int (*match)(struct device *, const void *));
  
+ /**
+  * class_find_device_by_name - device iterator for locating a particular device
+  * of a specific name.
+  * @class: class type
+  * @name: name of the device to match
+  */
+ static inline struct device *class_find_device_by_name(struct class *class,
+                                                      const char *name)
+ {
+       return class_find_device(class, NULL, name, device_match_name);
+ }
+ /**
+  * class_find_device_by_of_node : device iterator for locating a particular device
+  * matching the of_node.
+  * @class: class type
+  * @np: of_node of the device to match.
+  */
+ static inline struct device *
+ class_find_device_by_of_node(struct class *class, const struct device_node *np)
+ {
+       return class_find_device(class, NULL, np, device_match_of_node);
+ }
+ /**
+  * class_find_device_by_fwnode : device iterator for locating a particular device
+  * matching the fwnode.
+  * @class: class type
+  * @fwnode: fwnode of the device to match.
+  */
+ static inline struct device *
+ class_find_device_by_fwnode(struct class *class,
+                           const struct fwnode_handle *fwnode)
+ {
+       return class_find_device(class, NULL, fwnode, device_match_fwnode);
+ }
+ /**
+  * class_find_device_by_devt : device iterator for locating a particular device
+  * matching the devt (device number).
+  * @class: class type
+  * @devt: devt of the device to match.
+  */
+ static inline struct device *class_find_device_by_devt(struct class *class,
+                                                      dev_t devt)
+ {
+       return class_find_device(class, NULL, &devt, device_match_devt);
+ }
+ #ifdef CONFIG_ACPI
+ struct acpi_device;
+ /**
+  * class_find_device_by_acpi_dev : device iterator for locating a particular
+  * device matching the ACPI_COMPANION device.
+  * @class: class type
+  * @adev: ACPI_COMPANION device to match.
+  */
+ static inline struct device *
+ class_find_device_by_acpi_dev(struct class *class, const struct acpi_device *adev)
+ {
+       return class_find_device(class, NULL, adev, device_match_acpi_dev);
+ }
+ #else
+ static inline struct device *
+ class_find_device_by_acpi_dev(struct class *class, const void *adev)
+ {
+       return NULL;
+ }
+ #endif
  struct class_attribute {
        struct attribute attr;
        ssize_t (*show)(struct class *class, struct class_attribute *attr,
@@@ -915,8 -1146,6 +1146,8 @@@ struct dev_links_info 
   *            This identifies the device type and carries type-specific
   *            information.
   * @mutex:    Mutex to synchronize calls to its driver.
 + * @lockdep_mutex: An optional debug lock that a subsystem can use as a
 + *            peer lock to gain localized lockdep coverage of the device_lock.
   * @bus:      Type of bus device is on.
   * @driver:   Which driver has allocated this
   * @platform_data: Platform data specific to the device.
@@@ -1000,9 -1229,6 +1231,9 @@@ struct device 
                                           core doesn't touch it */
        void            *driver_data;   /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
 +#ifdef CONFIG_PROVE_LOCKING
 +      struct mutex            lockdep_mutex;
 +#endif
        struct mutex            mutex;  /* mutex to synchronize calls to
                                         * its driver.
                                         */
@@@ -1388,7 -1614,6 +1619,7 @@@ extern int (*platform_notify_remove)(st
   */
  extern struct device *get_device(struct device *dev);
  extern void put_device(struct device *dev);
 +extern bool kill_device(struct device *dev);
  
  #ifdef CONFIG_DEVTMPFS
  extern int devtmpfs_create_node(struct device *dev);