Merge tag 'devdax-for-5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 16 Mar 2019 20:05:32 +0000 (13:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 16 Mar 2019 20:05:32 +0000 (13:05 -0700)
Pull device-dax updates from Dan Williams:
 "New device-dax infrastructure to allow persistent memory and other
  "reserved" / performance differentiated memories, to be assigned to
  the core-mm as "System RAM".

  Some users want to use persistent memory as additional volatile
  memory. They are willing to cope with potential performance
  differences, for example between DRAM and 3D XPoint, and want to use
  typical Linux memory management APIs rather than a userspace memory
  allocator layered over an mmap() of a dax file. The administration
  model is to decide how much Persistent Memory (pmem) to use as System
  RAM, create a device-dax-mode namespace of that size, and then assign
  it to the core-mm. The rationale for device-dax is that it is a
  generic memory-mapping driver that can be layered over any "special
  purpose" memory, not just pmem. On subsequent boots udev rules can be
  used to restore the memory assignment.
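
  For illustration, the final assignment step can be driven through the
  dax bus sysfs files; a minimal userspace sketch, assuming a
  device-dax instance named "dax0.0" already exists and the 'kmem'
  driver is loaded (the device name, and using C rather than a shell
  command or udev rule, are illustrative choices, not part of this
  series):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int sysfs_write(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0) {
                    perror(path);
                    return -1;
            }
            n = write(fd, val, strlen(val));
            if (n < 0)
                    perror(path);
            close(fd);
            return n < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* Detach the range from the plain memory-mapping driver... */
            if (sysfs_write("/sys/bus/dax/drivers/device_dax/unbind",
                            "dax0.0"))
                    return 1;
            /*
             * ...and register it with 'kmem'; new_id auto-binds the
             * device and hotplugs its address range into the core-mm.
             * The resulting memory blocks may still need to be onlined,
             * typically by the distribution's udev rule.
             */
            return sysfs_write("/sys/bus/dax/drivers/kmem/new_id",
                            "dax0.0") ? 1 : 0;
    }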

  One implication of using pmem as RAM is that mlock() no longer keeps
  data off persistent media. For this reason it is recommended to enable
  NVDIMM Security (previously merged for 5.0) to encrypt pmem contents
  at rest. We considered making this recommendation an actively enforced
  requirement, but in the end decided to leave it as a distribution /
  administrator policy to allow for emulation and test environments that
  lack security-capable NVDIMMs.

  Summary:

   - Replace the /sys/class/dax device model with /sys/bus/dax, and
     include a compat driver so distributions can opt-in to the new ABI.

   - Allow for an alternative driver for the device-dax address-range.

   - Introduce the 'kmem' driver to hotplug / assign a device-dax
     address-range to the core-mm.

   - Arrange for the device-dax target-node to be onlined so that the
     newly added memory range can be uniquely referenced by NUMA APIs
     (see the sketch below)"
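
To make that last point concrete: once the range is online under its
own target node, ordinary processes can bind allocations to it with
the standard libnuma calls. A sketch, where the node number 1 is
hypothetical and depends on how the target-node was assigned:

    /* build with: cc sketch.c -lnuma */
    #include <numa.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            int node = 1;          /* hypothetical device-dax target-node */
            size_t sz = 1UL << 20;
            char *buf;

            if (numa_available() < 0 || node > numa_max_node()) {
                    fprintf(stderr, "node %d not available\n", node);
                    return 1;
            }
            /* allocate with a policy bound to the (formerly pmem) node */
            buf = numa_alloc_onnode(sz, node);
            if (!buf)
                    return 1;
            memset(buf, 0, sz);    /* fault the pages in on that node */
            numa_free(buf, sz);
            return 0;
    }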

NOTE! I'm not entirely happy with the whole "PMEM as RAM" model,
because we currently have special - and very annoying - rules in the
kernel about accessing PMEM only with the "MC safe" accessors: machine
checks inside the regular repeat-string copy functions can be fatal in
some (not described) circumstances.

And apparently the PMEM modules can cause such machine checks a lot
more often than regular RAM.  The argument is that this happens because
PMEM doesn't necessarily get scrubbed at boot the way RAM is, but
scrubbing support is planned to be added to the user-space tooling.
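
For context, the "MC safe" accessor in v5.1-era kernels is
memcpy_mcsafe(), which returns the number of bytes left uncopied when a
machine check interrupts the copy, rather than going down the fatal
fast-string path. A sketch of the consuming pattern (the wrapper below
is illustrative, not code from this pull):

    #include <linux/errno.h>
    #include <linux/string.h>   /* memcpy_mcsafe() */

    static int pmem_copy_from_media(void *dst, const void *pmem_addr,
                    size_t len)
    {
            /*
             * Unlike a plain memcpy() (rep movs), memcpy_mcsafe()
             * survives a machine check mid-copy and reports how many
             * bytes were left uncopied.
             */
            unsigned long rem = memcpy_mcsafe(dst, pmem_addr, len);

            return rem ? -EIO : 0;
    }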

Quoting Dan from another email:
 "The exposure can be reduced in the volatile-RAM case by scanning for
  and clearing errors before it is onlined as RAM. The userspace tooling
  for that can be in place before v5.1-final. There are also runtime
  notifications of errors via acpi_nfit_uc_error_notify() from
  background scrubbers on the DIMM devices. With that mechanism the
  kernel could proactively clear newly discovered poison in the volatile
  case, but that would be additional development more suitable for v5.2.

  I understand the concern, and the need to highlight this issue by
  tapping the brakes on feature development, but I don't see PMEM as RAM
  making the situation worse when the exposure is also there via DAX in
  the PMEM case. Volatile-RAM is arguably a safer use case since it's
  possible to repair pages where the persistent case needs active
  application coordination"

* tag 'devdax-for-5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  device-dax: "Hotplug" persistent memory for use like normal RAM
  mm/resource: Let walk_system_ram_range() search child resources
  mm/memory-hotplug: Allow memory resources to be children
  mm/resource: Move HMM pr_debug() deeper into resource code
  mm/resource: Return real error codes from walk failures
  device-dax: Add a 'modalias' attribute to DAX 'bus' devices
  device-dax: Add a 'target_node' attribute
  device-dax: Auto-bind device after successful new_id
  acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node
  device-dax: Add /sys/class/dax backwards compatibility
  device-dax: Add support for a dax override driver
  device-dax: Move resource pinning+mapping into the common driver
  device-dax: Introduce bus + driver model
  device-dax: Start defining a dax bus model
  device-dax: Remove multi-resource infrastructure
  device-dax: Kill dax_region base
  device-dax: Kill dax_region ida

arch/powerpc/platforms/pseries/papr_scm.c
drivers/acpi/nfit/core.c
drivers/acpi/numa.c
drivers/dax/super.c
drivers/nvdimm/nd.h
drivers/nvdimm/of_pmem.c
drivers/nvdimm/region_devs.c
include/linux/acpi.h
include/linux/libnvdimm.h
kernel/resource.c
mm/memory_hotplug.c

diff --combined arch/powerpc/platforms/pseries/papr_scm.c
index bba281b1fe1b0730f8a0d31fc469f324118a8933,8806ac8226275ffa8f9fe1707bdbf89d2345e6bc..96c53b23e58f9c843fea7fed17192c5e60d2a0fd
@@@ -43,7 -43,6 +43,7 @@@ static int drc_pmem_bind(struct papr_sc
  {
        unsigned long ret[PLPAR_HCALL_BUFSIZE];
        uint64_t rc, token;
 +      uint64_t saved = 0;
  
        /*
         * When the hypervisor cannot map all the requested memory in a single
@@@ -57,8 -56,6 +57,8 @@@
                rc = plpar_hcall(H_SCM_BIND_MEM, ret, p->drc_index, 0,
                                p->blocks, BIND_ANY_ADDR, token);
                token = ret[0];
 +              if (!saved)
 +                      saved = ret[1];
                cond_resched();
        } while (rc == H_BUSY);
  
@@@ -67,7 -64,7 +67,7 @@@
                return -ENXIO;
        }
  
 -      p->bound_addr = ret[1];
 +      p->bound_addr = saved;
  
        dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
  
@@@ -239,6 -236,7 +239,7 @@@ static int papr_scm_nvdimm_init(struct 
        memset(&ndr_desc, 0, sizeof(ndr_desc));
        ndr_desc.attr_groups = region_attr_groups;
        ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
+       ndr_desc.target_node = ndr_desc.numa_node;
        ndr_desc.res = &p->res;
        ndr_desc.of_node = p->dn;
        ndr_desc.provider_data = p;
diff --combined drivers/acpi/nfit/core.c
index df8979008dd4ec6496c1e5600ec856a300dec08f,475899974c7002816de86a9ef62d867043fdbd6c..5a389a4f4f652edda26c109baf5e595bf6325903
@@@ -26,6 -26,7 +26,6 @@@
  #include <acpi/nfit.h>
  #include "intel.h"
  #include "nfit.h"
 -#include "intel.h"
  
  /*
   * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
@@@ -55,10 -56,6 +55,10 @@@ static bool no_init_ars
  module_param(no_init_ars, bool, 0644);
  MODULE_PARM_DESC(no_init_ars, "Skip ARS run at nfit init time");
  
 +static bool force_labels;
 +module_param(force_labels, bool, 0444);
 +MODULE_PARM_DESC(force_labels, "Opt-in to labels despite missing methods");
 +
  LIST_HEAD(acpi_descs);
  DEFINE_MUTEX(acpi_desc_lock);
  
@@@ -81,6 -78,12 +81,6 @@@ const guid_t *to_nfit_uuid(enum nfit_uu
  }
  EXPORT_SYMBOL(to_nfit_uuid);
  
 -static struct acpi_nfit_desc *to_acpi_nfit_desc(
 -              struct nvdimm_bus_descriptor *nd_desc)
 -{
 -      return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
 -}
 -
  static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
  {
        struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
@@@ -413,40 -416,10 +413,40 @@@ static bool payload_dumpable(struct nvd
        return true;
  }
  
 +static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd,
 +              struct nd_cmd_pkg *call_pkg)
 +{
 +      if (call_pkg) {
 +              int i;
 +
 +              if (nfit_mem && nfit_mem->family != call_pkg->nd_family)
 +                      return -ENOTTY;
 +
 +              for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
 +                      if (call_pkg->nd_reserved2[i])
 +                              return -EINVAL;
 +              return call_pkg->nd_command;
 +      }
 +
 +      /* In the !call_pkg case, bus commands == bus functions */
 +      if (!nfit_mem)
 +              return cmd;
 +
 +      /* Linux ND commands == NVDIMM_FAMILY_INTEL function numbers */
 +      if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
 +              return cmd;
 +
 +      /*
 +       * Force function number validation to fail since 0 is never
 +       * published as a valid function in dsm_mask.
 +       */
 +      return 0;
 +}
 +
  int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
                unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
  {
 -      struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
 +      struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
        struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
        union acpi_object in_obj, in_buf, *out_obj;
        const struct nd_cmd_desc *desc = NULL;
        unsigned long cmd_mask, dsm_mask;
        u32 offset, fw_status = 0;
        acpi_handle handle;
 -      unsigned int func;
        const guid_t *guid;
 -      int rc, i;
 +      int func, rc, i;
  
        if (cmd_rc)
                *cmd_rc = -EINVAL;
 -      func = cmd;
 -      if (cmd == ND_CMD_CALL) {
 -              call_pkg = buf;
 -              func = call_pkg->nd_command;
  
 -              for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
 -                      if (call_pkg->nd_reserved2[i])
 -                              return -EINVAL;
 -      }
 +      if (cmd == ND_CMD_CALL)
 +              call_pkg = buf;
 +      func = cmd_to_func(nfit_mem, cmd, call_pkg);
 +      if (func < 0)
 +              return func;
  
        if (nvdimm) {
                struct acpi_device *adev = nfit_mem->adev;
  
                if (!adev)
                        return -ENOTTY;
 -              if (call_pkg && nfit_mem->family != call_pkg->nd_family)
 -                      return -ENOTTY;
  
                dimm_name = nvdimm_name(nvdimm);
                cmd_name = nvdimm_cmd_name(cmd);
  
                cmd_name = nvdimm_bus_cmd_name(cmd);
                cmd_mask = nd_desc->cmd_mask;
 -              dsm_mask = cmd_mask;
 -              if (cmd == ND_CMD_CALL)
 -                      dsm_mask = nd_desc->bus_dsm_mask;
 +              dsm_mask = nd_desc->bus_dsm_mask;
                desc = nd_cmd_bus_desc(cmd);
                guid = to_nfit_uuid(NFIT_DEV_BUS);
                handle = adev->handle;
        if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
                return -ENOTTY;
  
 -      if (!test_bit(cmd, &cmd_mask) || !test_bit(func, &dsm_mask))
 +      /*
 +       * Check for a valid command.  For ND_CMD_CALL, we also have to
 +       * make sure that the DSM function is supported.
 +       */
 +      if (cmd == ND_CMD_CALL && !test_bit(func, &dsm_mask))
 +              return -ENOTTY;
 +      else if (!test_bit(cmd, &cmd_mask))
                return -ENOTTY;
  
        in_obj.type = ACPI_TYPE_PACKAGE;
                return -EINVAL;
        }
  
 +      if (out_obj->type != ACPI_TYPE_BUFFER) {
 +              dev_dbg(dev, "%s unexpected output object type cmd: %s type: %d\n",
 +                              dimm_name, cmd_name, out_obj->type);
 +              rc = -EINVAL;
 +              goto out;
 +      }
 +
        if (call_pkg) {
                call_pkg->nd_fw_size = out_obj->buffer.length;
                memcpy(call_pkg->nd_payload + call_pkg->nd_size_in,
                return 0;
        }
  
 -      if (out_obj->package.type != ACPI_TYPE_BUFFER) {
 -              dev_dbg(dev, "%s unexpected output object type cmd: %s type: %d\n",
 -                              dimm_name, cmd_name, out_obj->type);
 -              rc = -EINVAL;
 -              goto out;
 -      }
 -
        dev_dbg(dev, "%s cmd: %s output length: %d\n", dimm_name,
                        cmd_name, out_obj->buffer.length);
        print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4,
@@@ -746,7 -721,6 +746,7 @@@ int nfit_get_smbios_id(u32 device_handl
        struct acpi_nfit_memory_map *memdev;
        struct acpi_nfit_desc *acpi_desc;
        struct nfit_mem *nfit_mem;
 +      u16 physical_id;
  
        mutex_lock(&acpi_desc_lock);
        list_for_each_entry(acpi_desc, &acpi_descs, list) {
                list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
                        memdev = __to_nfit_memdev(nfit_mem);
                        if (memdev->device_handle == device_handle) {
 +                              *flags = memdev->flags;
 +                              physical_id = memdev->physical_id;
                                mutex_unlock(&acpi_desc->init_mutex);
                                mutex_unlock(&acpi_desc_lock);
 -                              *flags = memdev->flags;
 -                              return memdev->physical_id;
 +                              return physical_id;
                        }
                }
                mutex_unlock(&acpi_desc->init_mutex);
@@@ -1323,30 -1296,19 +1323,30 @@@ static ssize_t scrub_show(struct devic
                struct device_attribute *attr, char *buf)
  {
        struct nvdimm_bus_descriptor *nd_desc;
 +      struct acpi_nfit_desc *acpi_desc;
        ssize_t rc = -ENXIO;
 +      bool busy;
  
        device_lock(dev);
        nd_desc = dev_get_drvdata(dev);
 -      if (nd_desc) {
 -              struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 +      if (!nd_desc) {
 +              device_unlock(dev);
 +              return rc;
 +      }
 +      acpi_desc = to_acpi_desc(nd_desc);
  
 -              mutex_lock(&acpi_desc->init_mutex);
 -              rc = sprintf(buf, "%d%s", acpi_desc->scrub_count,
 -                              acpi_desc->scrub_busy
 -                              && !acpi_desc->cancel ? "+\n" : "\n");
 -              mutex_unlock(&acpi_desc->init_mutex);
 +      mutex_lock(&acpi_desc->init_mutex);
 +      busy = test_bit(ARS_BUSY, &acpi_desc->scrub_flags)
 +              && !test_bit(ARS_CANCEL, &acpi_desc->scrub_flags);
 +      rc = sprintf(buf, "%d%s", acpi_desc->scrub_count, busy ? "+\n" : "\n");
 +      /* Allow an admin to poll the busy state at a higher rate */
 +      if (busy && capable(CAP_SYS_RAWIO) && !test_and_set_bit(ARS_POLL,
 +                              &acpi_desc->scrub_flags)) {
 +              acpi_desc->scrub_tmo = 1;
 +              mod_delayed_work(nfit_wq, &acpi_desc->dwork, HZ);
        }
 +
 +      mutex_unlock(&acpi_desc->init_mutex);
        device_unlock(dev);
        return rc;
  }
@@@ -1776,14 -1738,14 +1776,14 @@@ static bool acpi_nvdimm_has_method(stru
  
  __weak void nfit_intel_shutdown_status(struct nfit_mem *nfit_mem)
  {
 +      struct device *dev = &nfit_mem->adev->dev;
        struct nd_intel_smart smart = { 0 };
        union acpi_object in_buf = {
 -              .type = ACPI_TYPE_BUFFER,
 -              .buffer.pointer = (char *) &smart,
 -              .buffer.length = sizeof(smart),
 +              .buffer.type = ACPI_TYPE_BUFFER,
 +              .buffer.length = 0,
        };
        union acpi_object in_obj = {
 -              .type = ACPI_TYPE_PACKAGE,
 +              .package.type = ACPI_TYPE_PACKAGE,
                .package.count = 1,
                .package.elements = &in_buf,
        };
                return;
  
        out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj);
 -      if (!out_obj)
 +      if (!out_obj || out_obj->type != ACPI_TYPE_BUFFER
 +                      || out_obj->buffer.length < sizeof(smart)) {
 +              dev_dbg(dev->parent, "%s: failed to retrieve initial health\n",
 +                              dev_name(dev));
 +              ACPI_FREE(out_obj);
                return;
 +      }
 +      memcpy(&smart, out_obj->buffer.pointer, sizeof(smart));
 +      ACPI_FREE(out_obj);
  
        if (smart.flags & ND_INTEL_SMART_SHUTDOWN_VALID) {
                if (smart.shutdown_state)
                set_bit(NFIT_MEM_DIRTY_COUNT, &nfit_mem->flags);
                nfit_mem->dirty_shutdown = smart.shutdown_count;
        }
 -      ACPI_FREE(out_obj);
  }
  
  static void populate_shutdown_status(struct nfit_mem *nfit_mem)
@@@ -1884,17 -1840,9 +1884,17 @@@ static int acpi_nfit_add_dimm(struct ac
        dev_set_drvdata(&adev_dimm->dev, nfit_mem);
  
        /*
 -       * Until standardization materializes we need to consider 4
 -       * different command sets.  Note, that checking for function0 (bit0)
 -       * tells us if any commands are reachable through this GUID.
 +       * There are 4 "legacy" NVDIMM command sets
 +       * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before
 +       * an EFI working group was established to constrain this
 +       * proliferation. The nfit driver probes for the supported command
 +       * set by GUID. Note, if you're a platform developer looking to add
 +       * a new command set to this probe, consider using an existing set,
 +       * or otherwise seek approval to publish the command set at
 +       * http://www.uefi.org/RFIC_LIST.
 +       *
 +       * Note, that checking for function0 (bit0) tells us if any commands
 +       * are reachable through this GUID.
         */
        for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
                if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
                        dsm_mask &= ~(1 << 8);
        } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) {
                dsm_mask = 0xffffffff;
 +      } else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) {
 +              dsm_mask = 0x1f;
        } else {
                dev_dbg(dev, "unknown dimm command family\n");
                nfit_mem->family = -1;
                return 0;
        }
  
 +      /*
 +       * Function 0 is the command interrogation function, don't
 +       * export it to potential userspace use, and enable it to be
 +       * used as an error value in acpi_nfit_ctl().
 +       */
 +      dsm_mask &= ~1UL;
 +
        guid = to_nfit_uuid(nfit_mem->family);
        for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
                if (acpi_check_dsm(adev_dimm->handle, guid,
                | 1 << ND_CMD_SET_CONFIG_DATA;
        if (family == NVDIMM_FAMILY_INTEL
                        && (dsm_mask & label_mask) == label_mask)
 -              return 0;
 +              /* skip _LS{I,R,W} enabling */;
 +      else {
 +              if (acpi_nvdimm_has_method(adev_dimm, "_LSI")
 +                              && acpi_nvdimm_has_method(adev_dimm, "_LSR")) {
 +                      dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
 +                      set_bit(NFIT_MEM_LSR, &nfit_mem->flags);
 +              }
  
 -      if (acpi_nvdimm_has_method(adev_dimm, "_LSI")
 -                      && acpi_nvdimm_has_method(adev_dimm, "_LSR")) {
 -              dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
 -              set_bit(NFIT_MEM_LSR, &nfit_mem->flags);
 -      }
 +              if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)
 +                              && acpi_nvdimm_has_method(adev_dimm, "_LSW")) {
 +                      dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
 +                      set_bit(NFIT_MEM_LSW, &nfit_mem->flags);
 +              }
  
 -      if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)
 -                      && acpi_nvdimm_has_method(adev_dimm, "_LSW")) {
 -              dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
 -              set_bit(NFIT_MEM_LSW, &nfit_mem->flags);
 +              /*
 +               * Quirk read-only label configurations to preserve
 +               * access to label-less namespaces by default.
 +               */
 +              if (!test_bit(NFIT_MEM_LSW, &nfit_mem->flags)
 +                              && !force_labels) {
 +                      dev_dbg(dev, "%s: No _LSW, disable labels\n",
 +                                      dev_name(&adev_dimm->dev));
 +                      clear_bit(NFIT_MEM_LSR, &nfit_mem->flags);
 +              } else
 +                      dev_dbg(dev, "%s: Force enable labels\n",
 +                                      dev_name(&adev_dimm->dev));
        }
  
        populate_shutdown_status(nfit_mem);
@@@ -2074,10 -1999,6 +2074,10 @@@ static int acpi_nfit_register_dimms(str
                        cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
                }
  
 +              /* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
 +              if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
 +                      set_bit(NDD_NOBLK, &flags);
 +
                if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
                        set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
                        set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
                if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
                        continue;
  
 -              dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n",
 +              dev_err(acpi_desc->dev, "Error found in NVDIMM %s flags:%s%s%s%s%s\n",
                                nvdimm_name(nvdimm),
                  mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "",
                  mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"",
                if (!nvdimm)
                        continue;
  
 -              rc = nvdimm_security_setup_events(nvdimm);
 -              if (rc < 0)
 -                      dev_warn(acpi_desc->dev,
 -                              "security event setup failed: %d\n", rc);
 -
                nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit");
                if (nfit_kernfs)
                        nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs,
@@@ -2305,6 -2231,7 +2305,6 @@@ static int acpi_nfit_init_interleave_se
        nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
        if (!nd_set)
                return -ENOMEM;
 -      ndr_desc->nd_set = nd_set;
        guid_copy(&nd_set->type_guid, (guid_t *) spa->range_guid);
  
        info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL);
@@@ -2692,10 -2619,7 +2692,10 @@@ static int ars_start(struct acpi_nfit_d
  
        if (rc < 0)
                return rc;
 -      return cmd_rc;
 +      if (cmd_rc < 0)
 +              return cmd_rc;
 +      set_bit(ARS_VALID, &acpi_desc->scrub_flags);
 +      return 0;
  }
  
  static int ars_continue(struct acpi_nfit_desc *acpi_desc)
        struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
        struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status;
  
 -      memset(&ars_start, 0, sizeof(ars_start));
 -      ars_start.address = ars_status->restart_address;
 -      ars_start.length = ars_status->restart_length;
 -      ars_start.type = ars_status->type;
 -      ars_start.flags = acpi_desc->ars_start_flags;
 +      ars_start = (struct nd_cmd_ars_start) {
 +              .address = ars_status->restart_address,
 +              .length = ars_status->restart_length,
 +              .type = ars_status->type,
 +      };
        rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start,
                        sizeof(ars_start), &cmd_rc);
        if (rc < 0)
@@@ -2788,17 -2712,6 +2788,17 @@@ static int ars_status_process_records(s
         */
        if (ars_status->out_length < 44)
                return 0;
 +
 +      /*
 +       * Ignore potentially stale results that are only refreshed
 +       * after a start-ARS event.
 +       */
 +      if (!test_and_clear_bit(ARS_VALID, &acpi_desc->scrub_flags)) {
 +              dev_dbg(acpi_desc->dev, "skip %d stale records\n",
 +                              ars_status->num_records);
 +              return 0;
 +      }
 +
        for (i = 0; i < ars_status->num_records; i++) {
                /* only process full records */
                if (ars_status->out_length
@@@ -2956,11 -2869,15 +2956,15 @@@ static int acpi_nfit_register_region(st
        ndr_desc->res = &res;
        ndr_desc->provider_data = nfit_spa;
        ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
-       if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
+       if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) {
                ndr_desc->numa_node = acpi_map_pxm_to_online_node(
                                                spa->proximity_domain);
-       else
+               ndr_desc->target_node = acpi_map_pxm_to_node(
+                               spa->proximity_domain);
+       } else {
                ndr_desc->numa_node = NUMA_NO_NODE;
+               ndr_desc->target_node = NUMA_NO_NODE;
+       }
  
        /*
         * Persistence domain bits are hierarchical, if
@@@ -3069,16 -2986,14 +3073,16 @@@ static int ars_register(struct acpi_nfi
  {
        int rc;
  
 -      if (no_init_ars || test_bit(ARS_FAILED, &nfit_spa->ars_state))
 +      if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
                return acpi_nfit_register_region(acpi_desc, nfit_spa);
  
        set_bit(ARS_REQ_SHORT, &nfit_spa->ars_state);
 -      set_bit(ARS_REQ_LONG, &nfit_spa->ars_state);
 +      if (!no_init_ars)
 +              set_bit(ARS_REQ_LONG, &nfit_spa->ars_state);
  
        switch (acpi_nfit_query_poison(acpi_desc)) {
        case 0:
 +      case -ENOSPC:
        case -EAGAIN:
                rc = ars_start(acpi_desc, nfit_spa, ARS_REQ_SHORT);
                /* shouldn't happen, try again later */
                break;
        case -EBUSY:
        case -ENOMEM:
 -      case -ENOSPC:
                /*
                 * BIOS was using ARS, wait for it to complete (or
                 * resources to become available) and then perform our
@@@ -3137,7 -3053,7 +3141,7 @@@ static unsigned int __acpi_nfit_scrub(s
  
        lockdep_assert_held(&acpi_desc->init_mutex);
  
 -      if (acpi_desc->cancel)
 +      if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags))
                return 0;
  
        if (query_rc == -EBUSY) {
@@@ -3211,7 -3127,7 +3215,7 @@@ static void __sched_ars(struct acpi_nfi
  {
        lockdep_assert_held(&acpi_desc->init_mutex);
  
 -      acpi_desc->scrub_busy = 1;
 +      set_bit(ARS_BUSY, &acpi_desc->scrub_flags);
        /* note this should only be set from within the workqueue */
        if (tmo)
                acpi_desc->scrub_tmo = tmo;
@@@ -3227,7 -3143,7 +3231,7 @@@ static void notify_ars_done(struct acpi
  {
        lockdep_assert_held(&acpi_desc->init_mutex);
  
 -      acpi_desc->scrub_busy = 0;
 +      clear_bit(ARS_BUSY, &acpi_desc->scrub_flags);
        acpi_desc->scrub_count++;
        if (acpi_desc->scrub_count_state)
                sysfs_notify_dirent(acpi_desc->scrub_count_state);
@@@ -3248,7 -3164,6 +3252,7 @@@ static void acpi_nfit_scrub(struct work
        else
                notify_ars_done(acpi_desc);
        memset(acpi_desc->ars_status, 0, acpi_desc->max_ars);
 +      clear_bit(ARS_POLL, &acpi_desc->scrub_flags);
        mutex_unlock(&acpi_desc->init_mutex);
  }
  
@@@ -3283,7 -3198,6 +3287,7 @@@ static int acpi_nfit_register_regions(s
        struct nfit_spa *nfit_spa;
        int rc;
  
 +      set_bit(ARS_VALID, &acpi_desc->scrub_flags);
        list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
                switch (nfit_spa_type(nfit_spa->spa)) {
                case NFIT_SPA_VOLATILE:
@@@ -3457,7 -3371,7 +3461,7 @@@ EXPORT_SYMBOL_GPL(acpi_nfit_init)
  
  static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
  {
 -      struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
 +      struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
        struct device *dev = acpi_desc->dev;
  
        /* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */
  static int __acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
                struct nvdimm *nvdimm, unsigned int cmd)
  {
 -      struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
 +      struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
  
        if (nvdimm)
                return 0;
@@@ -3518,7 -3432,7 +3522,7 @@@ int acpi_nfit_ars_rescan(struct acpi_nf
        struct nfit_spa *nfit_spa;
  
        mutex_lock(&acpi_desc->init_mutex);
 -      if (acpi_desc->cancel) {
 +      if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags)) {
                mutex_unlock(&acpi_desc->init_mutex);
                return 0;
        }
@@@ -3597,7 -3511,7 +3601,7 @@@ void acpi_nfit_shutdown(void *data
        mutex_unlock(&acpi_desc_lock);
  
        mutex_lock(&acpi_desc->init_mutex);
 -      acpi_desc->cancel = 1;
 +      set_bit(ARS_CANCEL, &acpi_desc->scrub_flags);
        cancel_delayed_work_sync(&acpi_desc->dwork);
        mutex_unlock(&acpi_desc->init_mutex);
  
@@@ -3797,7 -3711,6 +3801,7 @@@ static __init int nfit_init(void
        guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
        guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
        guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]);
 +      guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]);
  
        nfit_wq = create_singlethread_workqueue("nfit");
        if (!nfit_wq)
diff --combined drivers/acpi/numa.c
index 7bbbf8256a41aa404c8b738756fc53f9d5c053d9,b9d86babb13ae4180f81cc97787b9cb9f54a124a..867f6e3f2b4f42fea98920d5726f24ce398547fc
@@@ -84,6 -84,7 +84,7 @@@ int acpi_map_pxm_to_node(int pxm
  
        return node;
  }
+ EXPORT_SYMBOL(acpi_map_pxm_to_node);
  
  /**
   * acpi_map_pxm_to_online_node - Map proximity ID to online node
@@@ -146,9 -147,9 +147,9 @@@ acpi_table_print_srat_entry(struct acpi
                {
                        struct acpi_srat_mem_affinity *p =
                            (struct acpi_srat_mem_affinity *)header;
 -                      pr_debug("SRAT Memory (0x%lx length 0x%lx) in proximity domain %d %s%s%s\n",
 -                               (unsigned long)p->base_address,
 -                               (unsigned long)p->length,
 +                      pr_debug("SRAT Memory (0x%llx length 0x%llx) in proximity domain %d %s%s%s\n",
 +                               (unsigned long long)p->base_address,
 +                               (unsigned long long)p->length,
                                 p->proximity_domain,
                                 (p->flags & ACPI_SRAT_MEM_ENABLED) ?
                                 "enabled" : "disabled",
diff --combined drivers/dax/super.c
index 0cb8c30ea27882ee14ae52254f8d5a379e95b0a5,ccb22d8db3a26dbc58158fd14f63a1e8a05337a7..0a339b85133e1e2ea621dd24f6f42b637e6dac7e
@@@ -22,6 -22,7 +22,7 @@@
  #include <linux/uio.h>
  #include <linux/dax.h>
  #include <linux/fs.h>
+ #include "dax-private.h"
  
  static dev_t dax_devt;
  DEFINE_STATIC_SRCU(dax_srcu);
@@@ -86,14 -87,12 +87,14 @@@ bool __bdev_dax_supported(struct block_
  {
        struct dax_device *dax_dev;
        bool dax_enabled = false;
 +      pgoff_t pgoff, pgoff_end;
        struct request_queue *q;
 -      pgoff_t pgoff;
 -      int err, id;
 -      pfn_t pfn;
 -      long len;
        char buf[BDEVNAME_SIZE];
 +      void *kaddr, *end_kaddr;
 +      pfn_t pfn, end_pfn;
 +      sector_t last_page;
 +      long len, len2;
 +      int err, id;
  
        if (blocksize != PAGE_SIZE) {
                pr_debug("%s: error: unsupported blocksize for dax\n",
                return false;
        }
  
 +      last_page = PFN_DOWN(i_size_read(bdev->bd_inode) - 1) * 8;
 +      err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
 +      if (err) {
 +              pr_debug("%s: error: unaligned partition for dax\n",
 +                              bdevname(bdev, buf));
 +              return false;
 +      }
 +
        dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
        if (!dax_dev) {
                pr_debug("%s: error: device does not support dax\n",
        }
  
        id = dax_read_lock();
 -      len = dax_direct_access(dax_dev, pgoff, 1, NULL, &pfn);
 +      len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 +      len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn);
        dax_read_unlock(id);
  
        put_dax(dax_dev);
  
 -      if (len < 1) {
 +      if (len < 1 || len2 < 1) {
                pr_debug("%s: error: dax access failed (%ld)\n",
 -                              bdevname(bdev, buf), len);
 +                              bdevname(bdev, buf), len < 1 ? len : len2);
                return false;
        }
  
                 */
                WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
                dax_enabled = true;
 -      } else if (pfn_t_devmap(pfn)) {
 -              struct dev_pagemap *pgmap;
 +      } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) {
 +              struct dev_pagemap *pgmap, *end_pgmap;
  
                pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
 -              if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
 +              end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL);
 +              if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX
 +                              && pfn_t_to_page(pfn)->pgmap == pgmap
 +                              && pfn_t_to_page(end_pfn)->pgmap == pgmap
 +                              && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr))
 +                              && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr)))
                        dax_enabled = true;
                put_dev_pagemap(pgmap);
 +              put_dev_pagemap(end_pgmap);
 +
        }
  
        if (!dax_enabled) {
@@@ -383,11 -366,15 +384,15 @@@ void kill_dax(struct dax_device *dax_de
        spin_lock(&dax_host_lock);
        hlist_del_init(&dax_dev->list);
        spin_unlock(&dax_host_lock);
-       dax_dev->private = NULL;
  }
  EXPORT_SYMBOL_GPL(kill_dax);
  
+ void run_dax(struct dax_device *dax_dev)
+ {
+       set_bit(DAXDEV_ALIVE, &dax_dev->flags);
+ }
+ EXPORT_SYMBOL_GPL(run_dax);
  static struct inode *dax_alloc_inode(struct super_block *sb)
  {
        struct dax_device *dax_dev;
@@@ -602,6 -589,8 +607,8 @@@ EXPORT_SYMBOL_GPL(dax_inode)
  
  void *dax_get_private(struct dax_device *dax_dev)
  {
+       if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags))
+               return NULL;
        return dax_dev->private;
  }
  EXPORT_SYMBOL_GPL(dax_get_private);
@@@ -615,7 -604,7 +622,7 @@@ static void init_once(void *_dax_dev
        inode_init_once(inode);
  }
  
- static int __dax_fs_init(void)
+ static int dax_fs_init(void)
  {
        int rc;
  
        return rc;
  }
  
- static void __dax_fs_exit(void)
+ static void dax_fs_exit(void)
  {
        kern_unmount(dax_mnt);
        unregister_filesystem(&dax_fs_type);
        kmem_cache_destroy(dax_cache);
  }
  
- static int __init dax_fs_init(void)
+ static int __init dax_core_init(void)
  {
        int rc;
  
-       rc = __dax_fs_init();
+       rc = dax_fs_init();
        if (rc)
                return rc;
  
        rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
        if (rc)
-               __dax_fs_exit();
-       return rc;
+               goto err_chrdev;
+       rc = dax_bus_init();
+       if (rc)
+               goto err_bus;
+       return 0;
+ err_bus:
+       unregister_chrdev_region(dax_devt, MINORMASK+1);
+ err_chrdev:
+       dax_fs_exit();
+       return 0;
  }
  
- static void __exit dax_fs_exit(void)
+ static void __exit dax_core_exit(void)
  {
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
-       __dax_fs_exit();
+       dax_fs_exit();
  }
  
  MODULE_AUTHOR("Intel Corporation");
  MODULE_LICENSE("GPL v2");
- subsys_initcall(dax_fs_init);
- module_exit(dax_fs_exit);
+ subsys_initcall(dax_core_init);
+ module_exit(dax_core_exit);
diff --combined drivers/nvdimm/nd.h
index 379bf4305e6159a2568940df7cb06f07f774b55c,0b3d7595b3cb8221c014614acb5231b09de686e2..a5ac3b240293b3567a6295b3e7488c35d4ab0bd2
@@@ -153,7 -153,7 +153,7 @@@ struct nd_region 
        u16 ndr_mappings;
        u64 ndr_size;
        u64 ndr_start;
-       int id, num_lanes, ro, numa_node;
+       int id, num_lanes, ro, numa_node, target_node;
        void *provider_data;
        struct kernfs_node *bb_state;
        struct badblocks bb;
@@@ -250,7 -250,6 +250,7 @@@ long nvdimm_clear_poison(struct device 
  void nvdimm_set_aliasing(struct device *dev);
  void nvdimm_set_locked(struct device *dev);
  void nvdimm_clear_locked(struct device *dev);
 +int nvdimm_security_setup_events(struct device *dev);
  #if IS_ENABLED(CONFIG_NVDIMM_KEYS)
  int nvdimm_security_unlock(struct device *dev);
  #else
diff --combined drivers/nvdimm/of_pmem.c
index 11b9821eba8594ce61688ffd92873d4df61d2583,ecaaa27438e2526ac19653599d7366e05d06125c..a0c8dcfa0bf923cc1e8a1b2ffacb74f2f801d681
@@@ -68,6 -68,7 +68,7 @@@ static int of_pmem_region_probe(struct 
                memset(&ndr_desc, 0, sizeof(ndr_desc));
                ndr_desc.attr_groups = region_attr_groups;
                ndr_desc.numa_node = dev_to_node(&pdev->dev);
+               ndr_desc.target_node = ndr_desc.numa_node;
                ndr_desc.res = &pdev->resource[i];
                ndr_desc.of_node = np;
                set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
@@@ -108,6 -109,7 +109,6 @@@ static struct platform_driver of_pmem_r
        .remove = of_pmem_region_remove,
        .driver = {
                .name = "of_pmem",
 -              .owner = THIS_MODULE,
                .of_match_table = of_pmem_region_match,
        },
  };
diff --combined drivers/nvdimm/region_devs.c
index 3b58baa44b5cf4b2622c0ab487a0eeebe08b6d89,caf2f3129ccdb65f7ed9ca9399427f34234b2c90..b4ef7d9ff22ebb517744566ea68eb677db052285
@@@ -1003,13 -1003,6 +1003,13 @@@ static struct nd_region *nd_region_crea
  
                if (test_bit(NDD_UNARMED, &nvdimm->flags))
                        ro = 1;
 +
 +              if (test_bit(NDD_NOBLK, &nvdimm->flags)
 +                              && dev_type == &nd_blk_device_type) {
 +                      dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not BLK capable\n",
 +                                      caller, dev_name(&nvdimm->dev), i);
 +                      return NULL;
 +              }
        }
  
        if (dev_type == &nd_blk_device_type) {
        nd_region->flags = ndr_desc->flags;
        nd_region->ro = ro;
        nd_region->numa_node = ndr_desc->numa_node;
+       nd_region->target_node = ndr_desc->target_node;
        ida_init(&nd_region->ns_ida);
        ida_init(&nd_region->btt_ida);
        ida_init(&nd_region->pfn_ida);
diff --combined include/linux/acpi.h
index 6ac47f5ea514ac72d8d9fa80b5a0b8cac369f7a1,eddf2736e5a61cedc9b533f47c7f3a87ed47cb8b..d5dcebd7aad334e635bfb12a2abc79e09b2323ba
@@@ -400,12 -400,17 +400,17 @@@ extern bool acpi_osi_is_win8(void)
  
  #ifdef CONFIG_ACPI_NUMA
  int acpi_map_pxm_to_online_node(int pxm);
+ int acpi_map_pxm_to_node(int pxm);
  int acpi_get_node(acpi_handle handle);
  #else
  static inline int acpi_map_pxm_to_online_node(int pxm)
  {
        return 0;
  }
+ static inline int acpi_map_pxm_to_node(int pxm)
+ {
+       return 0;
+ }
  static inline int acpi_get_node(acpi_handle handle)
  {
        return 0;
@@@ -953,6 -958,9 +958,6 @@@ acpi_handle_printk(const char *level, v
  #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG)
  __printf(3, 4)
  void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...);
 -#else
 -#define __acpi_handle_debug(descriptor, handle, fmt, ...)             \
 -      acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__);
  #endif
  
  /*
  #else
  #if defined(CONFIG_DYNAMIC_DEBUG)
  #define acpi_handle_debug(handle, fmt, ...)                           \
 -do {                                                                  \
 -      DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);                 \
 -      if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))          \
 -              __acpi_handle_debug(&descriptor, handle, pr_fmt(fmt),   \
 -                              ##__VA_ARGS__);                         \
 -} while (0)
 +      _dynamic_func_call(fmt, __acpi_handle_debug,                    \
 +                         handle, pr_fmt(fmt), ##__VA_ARGS__)
  #else
  #define acpi_handle_debug(handle, fmt, ...)                           \
  ({                                                                    \
@@@ -1007,13 -1019,6 +1012,13 @@@ struct acpi_gpio_mapping 
  
  /* Ignore IoRestriction field */
  #define ACPI_GPIO_QUIRK_NO_IO_RESTRICTION     BIT(0)
 +/*
 + * When ACPI GPIO mapping table is in use the index parameter inside it
 + * refers to the GPIO resource in _CRS method. That index has no
 + * distinction of actual type of the resource. When consumer wants to
 + * get GpioIo type explicitly, this quirk may be used.
 + */
 +#define ACPI_GPIO_QUIRK_ONLY_GPIOIO           BIT(1)
  
        unsigned int quirks;
  };
@@@ -1061,6 -1066,17 +1066,6 @@@ static inline int acpi_dev_gpio_irq_get
  }
  #endif
  
 -#if defined(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C)
 -bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 -                             struct acpi_resource_i2c_serialbus **i2c);
 -#else
 -static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 -                                           struct acpi_resource_i2c_serialbus **i2c)
 -{
 -      return false;
 -}
 -#endif
 -
  /* Device properties */
  
  #ifdef CONFIG_ACPI
diff --combined include/linux/libnvdimm.h
index 43348303cb4bfb6323e9db9fbc7da2eb349eca3f,56bc545ad3b25d512c78acd4a4869866c0385822..feb342d026f2e935a3e01515c2dc94852a83ef77
@@@ -42,8 -42,6 +42,8 @@@ enum 
        NDD_SECURITY_OVERWRITE = 3,
        /*  tracking whether or not there is a pending device reference */
        NDD_WORK_PENDING = 4,
 +      /* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
 +      NDD_NOBLK = 5,
  
        /* need to set a limit somewhere, but yes, this is likely overkill */
        ND_IOCTL_MAX_BUFLEN = SZ_4M,
@@@ -130,6 -128,7 +130,7 @@@ struct nd_region_desc 
        void *provider_data;
        int num_lanes;
        int numa_node;
+       int target_node;
        unsigned long flags;
        struct device_node *of_node;
  };
@@@ -162,7 -161,6 +163,7 @@@ static inline struct nd_blk_region_des
  }
  
  enum nvdimm_security_state {
 +      NVDIMM_SECURITY_ERROR = -1,
        NVDIMM_SECURITY_DISABLED,
        NVDIMM_SECURITY_UNLOCKED,
        NVDIMM_SECURITY_LOCKED,
@@@ -237,6 -235,7 +238,6 @@@ static inline struct nvdimm *nvdimm_cre
                        cmd_mask, num_flush, flush_wpq, NULL, NULL);
  }
  
 -int nvdimm_security_setup_events(struct nvdimm *nvdimm);
  const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
  const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
  u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
diff --combined kernel/resource.c
index e81b17b53fa53aa025ccb8f12c236158cb2e61b2,e7f9d2a5db2500eb1f05548717f22c5777d326a6..92190f62ebc53438b7da8fcd2845c7590f002e03
@@@ -382,7 -382,7 +382,7 @@@ static int __walk_iomem_res_desc(resour
                                 int (*func)(struct resource *, void *))
  {
        struct resource res;
-       int ret = -1;
+       int ret = -EINVAL;
  
        while (start < end &&
               !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
@@@ -448,10 -448,15 +448,13 @@@ int walk_mem_res(u64 start, u64 end, vo
                                     arg, func);
  }
  
 -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
 -
  /*
   * This function calls the @func callback against all memory ranges of type
   * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
   * It is to be used only for System RAM.
+  *
+  * This will find System RAM ranges that are children of top-level resources
+  * in addition to top-level System RAM resources.
   */
  int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long, void *))
        unsigned long flags;
        struct resource res;
        unsigned long pfn, end_pfn;
-       int ret = -1;
+       int ret = -EINVAL;
  
        start = (u64) start_pfn << PAGE_SHIFT;
        end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
        flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        while (start < end &&
               !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
-                                   true, &res)) {
+                                   false, &res)) {
                pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end_pfn = (res.end + 1) >> PAGE_SHIFT;
                if (end_pfn > pfn)
        return ret;
  }
  
 -#endif
 -
  static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
  {
        return 1;
@@@ -1128,6 -1135,15 +1131,15 @@@ struct resource * __request_region(stru
                conflict = __request_resource(parent, res);
                if (!conflict)
                        break;
+               /*
+                * mm/hmm.c reserves physical addresses which then
+                * become unavailable to other users.  Conflicts are
+                * not expected.  Warn to aid debugging if encountered.
+                */
+               if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+                       pr_warn("Unaddressable device %s %pR conflicts with %pR",
+                               conflict->name, conflict, res);
+               }
                if (conflict != parent) {
                        if (!(conflict->flags & IORESOURCE_BUSY)) {
                                parent = conflict;
diff --combined mm/memory_hotplug.c
index cd23c081924deed7774f98407748becf817c21b0,b37f3a5c48336fb12669faed8d19322a90522259..f767582af4f8c0f28102f2d77d8dc6a667ec3df5
@@@ -47,7 -47,7 +47,7 @@@
   * and restore_online_page_callback() for generic callback restore.
   */
  
 -static void generic_online_page(struct page *page);
 +static void generic_online_page(struct page *page, unsigned int order);
  
  static online_page_callback_t online_page_callback = generic_online_page;
  static DEFINE_MUTEX(online_page_callback_lock);
@@@ -96,33 -96,24 +96,29 @@@ void mem_hotplug_done(void
        cpus_read_unlock();
  }
  
 +u64 max_mem_size = U64_MAX;
 +
  /* add this memory to iomem resource */
  static struct resource *register_memory_resource(u64 start, u64 size)
  {
-       struct resource *res, *conflict;
+       struct resource *res;
+       unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       char *resource_name = "System RAM";
  
-       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
-       if (!res)
-               return ERR_PTR(-ENOMEM);
-       res->name = "System RAM";
-       res->start = start;
-       res->end = start + size - 1;
-       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-       conflict =  request_resource_conflict(&iomem_resource, res);
-       if (conflict) {
-               if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
-                       pr_debug("Device unaddressable memory block "
-                                "memory hotplug at %#010llx !\n",
-                                (unsigned long long)start);
-               }
-               pr_debug("System RAM resource %pR cannot be added\n", res);
-               kfree(res);
 +      if (start + size > max_mem_size)
 +              return ERR_PTR(-E2BIG);
 +
+       /*
+        * Request ownership of the new memory range.  This might be
+        * a child of an existing resource that was present but
+        * not marked as busy.
+        */
+       res = __request_region(&iomem_resource, start, size,
+                              resource_name, flags);
+       if (!res) {
+               pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+                               start, start + size);
                return ERR_PTR(-EEXIST);
        }
        return res;
@@@ -662,40 -653,26 +658,40 @@@ void __online_page_free(struct page *pa
  }
  EXPORT_SYMBOL_GPL(__online_page_free);
  
 -static void generic_online_page(struct page *page)
 +static void generic_online_page(struct page *page, unsigned int order)
 +{
 +      kernel_map_pages(page, 1 << order, 1);
 +      __free_pages_core(page, order);
 +      totalram_pages_add(1UL << order);
 +#ifdef CONFIG_HIGHMEM
 +      if (PageHighMem(page))
 +              totalhigh_pages_add(1UL << order);
 +#endif
 +}
 +
 +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
  {
 -      __online_page_set_limits(page);
 -      __online_page_increment_counters(page);
 -      __online_page_free(page);
 +      unsigned long end = start + nr_pages;
 +      int order, onlined_pages = 0;
 +
 +      while (start < end) {
 +              order = min(MAX_ORDER - 1,
 +                      get_order(PFN_PHYS(end) - PFN_PHYS(start)));
 +              (*online_page_callback)(pfn_to_page(start), order);
 +
 +              onlined_pages += (1UL << order);
 +              start += (1UL << order);
 +      }
 +      return onlined_pages;
  }
  
  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
                        void *arg)
  {
 -      unsigned long i;
        unsigned long onlined_pages = *(unsigned long *)arg;
 -      struct page *page;
  
        if (PageReserved(pfn_to_page(start_pfn)))
 -              for (i = 0; i < nr_pages; i++) {
 -                      page = pfn_to_page(start_pfn + i);
 -                      (*online_page_callback)(page);
 -                      onlined_pages++;
 -              }
 +              onlined_pages += online_pages_blocks(start_pfn, nr_pages);
  
        online_mem_sections(start_pfn, start_pfn + nr_pages);
  
@@@ -709,9 -686,9 +705,9 @@@ static void node_states_check_changes_o
  {
        int nid = zone_to_nid(zone);
  
 -      arg->status_change_nid = -1;
 -      arg->status_change_nid_normal = -1;
 -      arg->status_change_nid_high = -1;
 +      arg->status_change_nid = NUMA_NO_NODE;
 +      arg->status_change_nid_normal = NUMA_NO_NODE;
 +      arg->status_change_nid_high = NUMA_NO_NODE;
  
        if (!node_state(nid, N_MEMORY))
                arg->status_change_nid = nid;
@@@ -1208,13 -1185,11 +1204,13 @@@ static inline int pageblock_free(struc
        return PageBuddy(page) && page_order(page) >= pageblock_order;
  }
  
 -/* Return the start of the next active pageblock after a given page */
 -static struct page *next_active_pageblock(struct page *page)
 +/* Return the pfn of the start of the next active pageblock after a given pfn */
 +static unsigned long next_active_pageblock(unsigned long pfn)
  {
 +      struct page *page = pfn_to_page(pfn);
 +
        /* Ensure the starting page is pageblock-aligned */
 -      BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
 +      BUG_ON(pfn & (pageblock_nr_pages - 1));
  
        /* If the entire pageblock is free, move to the end of free page */
        if (pageblock_free(page)) {
                /* be careful. we don't have locks, page_order can be changed.*/
                order = page_order(page);
                if ((order < MAX_ORDER) && (order >= pageblock_order))
 -                      return page + (1 << order);
 +                      return pfn + (1 << order);
        }
  
 -      return page + pageblock_nr_pages;
 +      return pfn + pageblock_nr_pages;
  }
  
 -static bool is_pageblock_removable_nolock(struct page *page)
 +static bool is_pageblock_removable_nolock(unsigned long pfn)
  {
 +      struct page *page = pfn_to_page(pfn);
        struct zone *zone;
 -      unsigned long pfn;
  
        /*
         * We have to be careful here because we are iterating over memory
  /* Checks if this range of memory is likely to be hot-removable. */
  bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
  {
 -      struct page *page = pfn_to_page(start_pfn);
 -      struct page *end_page = page + nr_pages;
 +      unsigned long end_pfn, pfn;
 +
 +      end_pfn = min(start_pfn + nr_pages,
 +                      zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
  
        /* Check the starting page of each pageblock within the range */
 -      for (; page < end_page; page = next_active_pageblock(page)) {
 -              if (!is_pageblock_removable_nolock(page))
 +      for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
 +              if (!is_pageblock_removable_nolock(pfn))
                        return false;
                cond_resched();
        }
@@@ -1297,9 -1270,6 +1293,9 @@@ int test_pages_in_a_zone(unsigned long 
                                i++;
                        if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
                                continue;
 +                      /* Check if we got outside of the zone */
 +                      if (zone && !zone_spans_pfn(zone, pfn + i))
 +                              return 0;
                        page = pfn_to_page(pfn + i);
                        if (zone && page_zone(page) != zone)
                                return 0;
  static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
  {
        unsigned long pfn;
 -      struct page *page;
 +
        for (pfn = start; pfn < end; pfn++) {
 -              if (pfn_valid(pfn)) {
 -                      page = pfn_to_page(pfn);
 -                      if (PageLRU(page))
 -                              return pfn;
 -                      if (__PageMovable(page))
 -                              return pfn;
 -                      if (PageHuge(page)) {
 -                              if (hugepage_migration_supported(page_hstate(page)) &&
 -                                  page_huge_active(page))
 -                                      return pfn;
 -                              else
 -                                      pfn = round_up(pfn + 1,
 -                                              1 << compound_order(page)) - 1;
 -                      }
 -              }
 +              struct page *page, *head;
 +              unsigned long skip;
 +
 +              if (!pfn_valid(pfn))
 +                      continue;
 +              page = pfn_to_page(pfn);
 +              if (PageLRU(page))
 +                      return pfn;
 +              if (__PageMovable(page))
 +                      return pfn;
 +
 +              if (!PageHuge(page))
 +                      continue;
 +              head = compound_head(page);
 +              if (hugepage_migration_supported(page_hstate(head)) &&
 +                  page_huge_active(head))
 +                      return pfn;
 +              skip = (1 << compound_order(head)) - (page - head);
 +              pfn += skip - 1;
        }
        return 0;
  }
@@@ -1375,6 -1341,7 +1371,6 @@@ do_migrate_range(unsigned long start_pf
  {
        unsigned long pfn;
        struct page *page;
 -      int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);
  
  
                if (PageHuge(page)) {
                        struct page *head = compound_head(page);
 -                      pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
                        if (compound_order(head) > PFN_SECTION_SHIFT) {
                                ret = -EBUSY;
                                break;
                        }
 -                      isolate_huge_page(page, &source);
 +                      pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
 +                      isolate_huge_page(head, &source);
                        continue;
                } else if (PageTransHuge(page))
                        pfn = page_to_pfn(compound_head(page))
                else
                        ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
                if (!ret) { /* Success */
 -                      put_page(page);
                        list_add_tail(&page->lru, &source);
                        if (!__PageMovable(page))
                                inc_node_page_state(page, NR_ISOLATED_ANON +
                } else {
                        pr_warn("failed to isolate pfn %lx\n", pfn);
                        dump_page(page, "isolation failed");
 -                      put_page(page);
 -                      /* Because we don't have big zone->lock. we should
 -                         check this again here. */
 -                      if (page_count(page)) {
 -                              not_managed++;
 -                              ret = -EBUSY;
 -                              break;
 -                      }
                }
 +              put_page(page);
        }
        if (!list_empty(&source)) {
 -              if (not_managed) {
 -                      putback_movable_pages(&source);
 -                      goto out;
 -              }
 -
                /* Allocate a new page from the nearest neighbor node */
                ret = migrate_pages(&source, new_node_page, NULL, 0,
                                        MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                        putback_movable_pages(&source);
                }
        }
 -out:
 +
        return ret;
  }
  
@@@ -1516,9 -1496,9 +1512,9 @@@ static void node_states_check_changes_o
        unsigned long present_pages = 0;
        enum zone_type zt;
  
 -      arg->status_change_nid = -1;
 -      arg->status_change_nid_normal = -1;
 -      arg->status_change_nid_high = -1;
 +      arg->status_change_nid = NUMA_NO_NODE;
 +      arg->status_change_nid_normal = NUMA_NO_NODE;
 +      arg->status_change_nid_high = NUMA_NO_NODE;
  
        /*
         * Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@@ -1593,6 -1573,7 +1589,6 @@@ static int __ref __offline_pages(unsign
           we assume this for now. .*/
        if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
                                  &valid_end)) {
 -              mem_hotplug_done();
                ret = -EINVAL;
                reason = "multizone range";
                goto failed_removal;
                                       MIGRATE_MOVABLE,
                                       SKIP_HWPOISON | REPORT_FAILURE);
        if (ret) {
 -              mem_hotplug_done();
                reason = "failure to isolate range";
                goto failed_removal;
        }
  
                        cond_resched();
                        lru_add_drain_all();
 -                      drain_all_pages(zone);
  
                        pfn = scan_movable_pages(pfn, end_pfn);
                        if (pfn) {