[linux-2.6-block.git] / drivers / iommu / intel-svm.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <asm/page.h>

#include "intel-pasid.h"

/* Forward declaration: the handler is defined at the end of this file but is
 * passed to request_threaded_irq() in intel_svm_enable_prq(). */
static irqreturn_t prq_event_thread(int irq, void *d);

/*
 * Check that the IOMMU can walk the same first-level page-table formats the
 * CPU has enabled. Returns 0 when compatible, -EINVAL when the CPU uses 1GiB
 * pages or 5-level paging and the IOMMU capability register lacks the
 * matching support bit.
 */
int intel_svm_init(struct intel_iommu *iommu)
{
	/* CPU uses 1GiB pages: the IOMMU must be able to as well */
	if (cpu_feature_enabled(X86_FEATURE_GBPAGES)) {
		if (!cap_fl1gp_support(iommu->cap))
			return -EINVAL;
	}

	/* CPU uses 5-level paging: the IOMMU must be able to as well */
	if (cpu_feature_enabled(X86_FEATURE_LA57)) {
		if (!cap_5lp_support(iommu->cap))
			return -EINVAL;
	}

	return 0;
}

/* Page-allocation order of the page request queue buffer (passed to
 * alloc_pages()/free_pages() and OR-ed into the DMAR_PQA_REG value). */
#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	return 0;
}

/*
 * Undo intel_svm_enable_prq(): clear the queue registers, release the page
 * request interrupt if one is installed, and free the queue buffer.
 * Always returns 0.
 */
int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	int irq = iommu->pr_irq;

	/* Detach the queue from the hardware before freeing anything */
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (irq) {
		free_irq(irq, iommu);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
				       unsigned long address, unsigned long pages, int ih, int gl)
{
	struct qi_desc desc;

	if (pages == -1) {
		/* For global kernel pages we have to flush them in *all* PASIDs
		 * because that's the only option the hardware gives us. Despite
		 * the fact that they are actually only accessible through one. */
		if (gl)
			desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
					QI_EIOTLB_DID(sdev->did) |
					QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) |
					QI_EIOTLB_TYPE;
		else
			desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
					QI_EIOTLB_DID(sdev->did) |
					QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
					QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
				QI_EIOTLB_DID(sdev->did) |
				QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
				QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
				QI_EIOTLB_GL(gl) |
				QI_EIOTLB_IH(ih) |
				QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(&desc, svm->iommu);

	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
				QI_DEV_EIOTLB_SID(sdev->sid) |
				QI_DEV_EIOTLB_QDEP(sdev->qdep) |
				QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
					QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(&desc, svm->iommu);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih, int gl)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih, gl);
	rcu_read_unlock();
}

/*
 * .invalidate_range callback of intel_mmuops.
 * Pages have been freed at this point, so flush the range [start, end) on
 * every device bound to the svm that owns this notifier.
 */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	/* Byte length rounded up by PAGE_SIZE - 1, then shifted to a page count */
	unsigned long nr_pages = (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT;
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start, nr_pages, 0, 0);
}

/*
 * .release callback of intel_mmuops: the mm is going away, so revoke the
 * PASID on every bound device.
 */
static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *bound;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(bound, &svm->devs, list) {
		/* Drop the PASID entry first, then flush everything cached for it */
		intel_pasid_tear_down_entry(svm->iommu, bound->dev, svm->pasid);
		intel_flush_svm_range_dev(svm, bound, 0, -1, 0, !svm->mm);
	}
	rcu_read_unlock();
}

/* MMU notifier callbacks; intel_svm_bind_mm() installs them on svm->notifier
 * and registers the notifier when the binding has a user mm. */
static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

/* Held across the bodies of intel_svm_bind_mm(), intel_svm_unbind_mm() and
 * intel_svm_is_pasid_valid(). */
static DEFINE_MUTEX(pasid_mutex);
/* Every live struct intel_svm: added in intel_svm_bind_mm(), removed in
 * intel_svm_unbind_mm() when its last device goes away. */
static LIST_HEAD(global_svm_list);

int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
{
	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	struct mm_struct *mm = NULL;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap))
			return -EINVAL;
	} else if (pasid) {
		mm = get_task_mm(current);
		BUG_ON(!mm);
	}

	mutex_lock(&pasid_mutex);
	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			list_for_each_entry(sdev, &svm->devs, list) {
				if (dev == sdev->dev) {
					if (sdev->ops != ops) {
						ret = -EBUSY;
						goto out;
					}
					sdev->users++;
					goto success;
				}
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret || !pasid) {
		/* If they don't actually want to assign a PASID, this is
		 * just an enabling check/preparation. */
		kfree(sdev);
		goto out;
	}

	info = dev->archdata.iommu;
	if (!info || !info->pasid_supported) {
		kfree(sdev);
		goto out;
	}

	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
		ret = intel_pasid_alloc_id(svm,
					   !!cap_caching_mode(iommu->cap),
					   pasid_max - 1, GFP_KERNEL);
		if (ret < 0) {
			kfree(svm);
			kfree(sdev);
			goto out;
		}
		svm->pasid = ret;
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				intel_pasid_free_id(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
		spin_unlock(&iommu->lock);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			intel_pasid_free_id(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
	} else {
		/*
		 * Binding a new device with existing PASID, need to setup
		 * the PASID entry.
		 */
		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
						mm ? mm->pgd : init_mm.pgd,
						svm->pasid, FLPT_DEFAULT_DID,
						mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
		spin_unlock(&iommu->lock);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);

 success:
	*pasid = svm->pasid;
	ret = 0;
 out:
	mutex_unlock(&pasid_mutex);
	if (mm)
		mmput(mm);
	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_bind_mm);

int intel_svm_unbind_mm(struct device *dev, int pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	mutex_lock(&pasid_mutex);
	iommu = intel_svm_device_to_iommu(dev);
	if (!iommu)
		goto out;

	svm = intel_pasid_lookup_id(pasid);
	if (!svm)
		goto out;

	list_for_each_entry(sdev, &svm->devs, list) {
		if (dev == sdev->dev) {
			ret = 0;
			sdev->users--;
			if (!sdev->users) {
				list_del_rcu(&sdev->list);
				/* Flush the PASID cache and IOTLB for this device.
				 * Note that we do depend on the hardware *not* using
				 * the PASID any more. Just as we depend on other
				 * devices never using PASIDs that they have no right
				 * to use. We have a *shared* PASID table, because it's
				 * large and has to be physically contiguous. So it's
				 * hard to be as defensive as we might like. */
				intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
				intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
				kfree_rcu(sdev, rcu);

				if (list_empty(&svm->devs)) {
					intel_pasid_free_id(svm->pasid);
					if (svm->mm)
						mmu_notifier_unregister(&svm->notifier, svm->mm);

					list_del(&svm->list);

					/* We mandate that no page faults may be outstanding
					 * for the PASID when intel_svm_unbind_mm() is called.
					 * If that is not obeyed, subtle errors will happen.
					 * Let's make them less subtle... */
					memset(svm, 0x6b, sizeof(*svm));
					kfree(svm);
				}
			}
			break;
		}
	}
 out:
	mutex_unlock(&pasid_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);

/*
 * Report whether @pasid still refers to a usable address space.
 *
 * Returns -EINVAL if @dev has no IOMMU or the PASID is unknown, 1 if the
 * binding has no mm (init_mm is used) or its mm still has users, 0 if the
 * mm's user count has dropped to zero.
 */
int intel_svm_is_pasid_valid(struct device *dev, int pasid)
{
	struct intel_svm *svm;
	int ret = -EINVAL;

	mutex_lock(&pasid_mutex);
	if (!intel_svm_device_to_iommu(dev))
		goto out;

	svm = intel_pasid_lookup_id(pasid);
	if (!svm)
		goto out;

	/* A NULL mm means init_mm is used, which is always valid */
	ret = (!svm->mm || atomic_read(&svm->mm->mm_users) > 0) ? 1 : 0;

 out:
	mutex_unlock(&pasid_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);

/*
 * Page request queue descriptor.
 *
 * One entry of the ring at iommu->prq as read by prq_event_thread(). It is
 * four 64-bit words: two bit-field words (also reachable whole as qw_0 and
 * qw_1) followed by two words of private data.
 */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;		/* request carries a PASID; checked before the lookup */
			u64 priv_data_present:1;	/* priv_data[] is valid; forces a response */
			u64 rsvd:6;
			u64 rid:16;			/* matched against sdev->sid to find the device */
			u64 pasid:20;
			u64 exe_req:1;			/* execute access requested */
			u64 pm_req:1;			/* reported as the low bit of rwxp to fault_cb */
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;			/* read access requested */
			u64 wr_req:1;			/* write access requested */
			u64 lpig:1;			/* last page in group; forces a response */
			u64 prg_index:9;		/* echoed back in the page group response */
			u64 addr:52;			/* page number; shifted by VTD_PAGE_SHIFT to get the address */
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

/*
 * Mask applied to the queue head/tail byte offsets. Each struct page_req_dsc
 * is 32 bytes (four u64) and the head advances by sizeof(struct page_req_dsc),
 * so valid offsets are 32-byte aligned: the mask must clear the low five
 * bits, not just the low four.
 */
#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)

/*
 * Return true if the page request asks for any access type (execute, read
 * or write) that the VMA's vm_flags do not grant.
 */
static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long wanted;

	/* Translate the request bits into the corresponding VM_* flags */
	wanted  = req->exe_req ? VM_EXEC : 0;
	wanted |= req->rd_req ? VM_READ : 0;
	wanted |= req->wr_req ? VM_WRITE : 0;

	return !!(wanted & ~vma->vm_flags);
}

/*
 * Return true if @addr is unchanged by sign-extending it from bit
 * __VIRTUAL_MASK_SHIFT, i.e. all bits above that one equal it.
 */
static bool is_canonical_address(u64 addr)
{
	const int upper_bits = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long as_signed = (long) addr;
	/* Shift the upper bits out and back in again to sign-extend */
	long sign_extended = (as_signed << upper_bits) >> upper_bits;

	return sign_extended == as_signed;
}

/*
 * Threaded interrupt handler for the page request queue.
 *
 * @irq: interrupt number (unused)
 * @d:   the struct intel_iommu passed to request_threaded_irq()
 *
 * Walks the ring from the hardware head to the tail. For each request it
 * looks up the PASID, tries to resolve the fault with handle_mm_fault() on
 * the bound mm, calls the device's fault_cb if there is one, and sends a
 * page group response when the request requires one (LPIG or private data
 * present). Finally the head register is advanced to the tail.
 *
 * Returns IRQ_HANDLED if at least one request was processed, else IRQ_NONE.
 */
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct intel_svm_dev *sdev;
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		/* head is a byte offset into the ring */
		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = intel_pasid_lookup_id(req->pasid);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();

			if (!svm) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response.
		 * This must be checked before the mm reference is taken:
		 * the bad_req path does not call mmput(). */
		if (!is_canonical_address(address))
			goto bad_req;

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		down_read(&svm->mm->mmap_sem);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
	invalid:
		up_read(&svm->mm->mmap_sem);
		mmput(svm->mm);
	bad_req:
		/* Accounting for major/minor faults? */
		rcu_read_lock();
		list_for_each_entry_rcu(sdev, &svm->devs, list) {
			if (sdev->sid == req->rid)
				break;
		}
		/* Other devices can go away, but the drivers are not permitted
		 * to unbind while any page faults might be in flight. So it's
		 * OK to drop the 'lock' here now we have it. */
		rcu_read_unlock();

		/* Loop ran to completion: no device with a matching source id */
		if (WARN_ON(&sdev->list == &svm->devs))
			sdev = NULL;

		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
	no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);

			/* Zero the upper words first so that the private data
			 * copied in below is not overwritten. */
			resp.qw2 = 0;
			resp.qw3 = 0;
			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));

			/* Only submit when a response is actually required;
			 * otherwise resp.qw0/qw1 were never filled in. */
			qi_submit_sync(&resp, iommu);
		}

		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	return IRQ_RETVAL(handled);
}
Commit	Line	Data
2025cf9e	1	// SPDX-License-Identifier: GPL-2.0-only
8a94ade4 DW	2	/*
	3	* Copyright © 2015 Intel Corporation.
	4	*
8a94ade4 DW	5	* Authors: David Woodhouse <dwmw2@infradead.org>
	6	*/
	7
	8	#include <linux/intel-iommu.h>
2f26e0a9 DW	9	#include <linux/mmu_notifier.h>
2f26e0a9 DW	10	#include <linux/sched.h>
6e84f315	11	#include <linux/sched/mm.h>
2f26e0a9 DW	12	#include <linux/slab.h>
	13	#include <linux/intel-svm.h>
	14	#include <linux/rculist.h>
	15	#include <linux/pci.h>
	16	#include <linux/pci-ats.h>
a222a7f0 DW	17	#include <linux/dmar.h>
a222a7f0 DW	18	#include <linux/interrupt.h>
50a7ca3c	19	#include <linux/mm_types.h>
9d8c3af3	20	#include <asm/page.h>
a222a7f0	21
af395073 LB	22	#include "intel-pasid.h"
af395073 LB	23
a222a7f0	24	static irqreturn_t prq_event_thread(int irq, void *d);
2f26e0a9	25
d9737953	26	int intel_svm_init(struct intel_iommu *iommu)
8a94ade4	27	{
59103caa SM	28	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	29	!cap_fl1gp_support(iommu->cap))
	30	return -EINVAL;
	31
f1ac10c2 SM	32	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	33	!cap_5lp_support(iommu->cap))
	34	return -EINVAL;
	35
8a94ade4 DW	36	return 0;
8a94ade4 DW	37	}
2f26e0a9	38
a222a7f0 DW	39	#define PRQ_ORDER 0
	40
	41	int intel_svm_enable_prq(struct intel_iommu *iommu)
	42	{
	43	struct page *pages;
	44	int irq, ret;
	45
	46	pages = alloc_pages(GFP_KERNEL \| __GFP_ZERO, PRQ_ORDER);
	47	if (!pages) {
	48	pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
	49	iommu->name);
	50	return -ENOMEM;
	51	}
	52	iommu->prq = page_address(pages);
	53
	54	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	55	if (irq <= 0) {
	56	pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
	57	iommu->name);
	58	ret = -EINVAL;
	59	err:
	60	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	61	iommu->prq = NULL;
	62	return ret;
	63	}
	64	iommu->pr_irq = irq;
	65
	66	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
	67
	68	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
	69	iommu->prq_name, iommu);
	70	if (ret) {
	71	pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
	72	iommu->name);
	73	dmar_free_hwirq(irq);
72d54811	74	iommu->pr_irq = 0;
a222a7f0 DW	75	goto err;
	76	}
	77	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	78	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	79	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) \| PRQ_ORDER);
	80
	81	return 0;
	82	}
	83
	84	int intel_svm_finish_prq(struct intel_iommu *iommu)
	85	{
	86	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	87	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	88	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
	89
72d54811 JS	90	if (iommu->pr_irq) {
	91	free_irq(iommu->pr_irq, iommu);
	92	dmar_free_hwirq(iommu->pr_irq);
	93	iommu->pr_irq = 0;
	94	}
a222a7f0 DW	95
	96	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	97	iommu->prq = NULL;
	98
	99	return 0;
	100	}
	101
2f26e0a9	102	static void intel_flush_svm_range_dev (struct intel_svm svm, struct intel_svm_dev sdev,
5d52f482	103	unsigned long address, unsigned long pages, int ih, int gl)
2f26e0a9 DW	104	{
2f26e0a9 DW	105	struct qi_desc desc;
2f26e0a9	106
5d52f482	107	if (pages == -1) {
e0349921 DW	108	/* For global kernel pages we have to flush them in all PASIDs
	109	* because that's the only option the hardware gives us. Despite
	110	* the fact that they are actually only accessible through one. */
	111	if (gl)
5d308fc1 LB	112	desc.qw0 = QI_EIOTLB_PASID(svm->pasid) \|
	113	QI_EIOTLB_DID(sdev->did) \|
	114	QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) \|
	115	QI_EIOTLB_TYPE;
e0349921	116	else
5d308fc1 LB	117	desc.qw0 = QI_EIOTLB_PASID(svm->pasid) \|
	118	QI_EIOTLB_DID(sdev->did) \|
	119	QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) \|
	120	QI_EIOTLB_TYPE;
	121	desc.qw1 = 0;
2f26e0a9	122	} else {
5d52f482 DW	123	int mask = ilog2(__roundup_pow_of_two(pages));
5d52f482 DW	124
5d308fc1 LB	125	desc.qw0 = QI_EIOTLB_PASID(svm->pasid) \|
	126	QI_EIOTLB_DID(sdev->did) \|
	127	QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) \|
	128	QI_EIOTLB_TYPE;
	129	desc.qw1 = QI_EIOTLB_ADDR(address) \|
	130	QI_EIOTLB_GL(gl) \|
	131	QI_EIOTLB_IH(ih) \|
	132	QI_EIOTLB_AM(mask);
2f26e0a9	133	}
5d308fc1 LB	134	desc.qw2 = 0;
5d308fc1 LB	135	desc.qw3 = 0;
2f26e0a9 DW	136	qi_submit_sync(&desc, svm->iommu);
	137
	138	if (sdev->dev_iotlb) {
5d308fc1 LB	139	desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) \|
	140	QI_DEV_EIOTLB_SID(sdev->sid) \|
	141	QI_DEV_EIOTLB_QDEP(sdev->qdep) \|
	142	QI_DEIOTLB_TYPE;
5d52f482	143	if (pages == -1) {
5d308fc1 LB	144	desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) \|
5d308fc1 LB	145	QI_DEV_EIOTLB_SIZE;
5d52f482 DW	146	} else if (pages > 1) {
	147	/* The least significant zero bit indicates the size. So,
	148	* for example, an "address" value of 0x12345f000 will
	149	* flush from 0x123440000 to 0x12347ffff (256KiB). */
	150	unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
ed7158ba	151	unsigned long mask = __rounddown_pow_of_two(address ^ last);
5d52f482	152
5d308fc1 LB	153	desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) \|
5d308fc1 LB	154	(mask - 1)) \| QI_DEV_EIOTLB_SIZE;
2f26e0a9	155	} else {
5d308fc1	156	desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
2f26e0a9	157	}
5d308fc1 LB	158	desc.qw2 = 0;
5d308fc1 LB	159	desc.qw3 = 0;
2f26e0a9 DW	160	qi_submit_sync(&desc, svm->iommu);
	161	}
	162	}
	163
	164	static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
5d52f482	165	unsigned long pages, int ih, int gl)
2f26e0a9 DW	166	{
	167	struct intel_svm_dev *sdev;
	168
	169	rcu_read_lock();
	170	list_for_each_entry_rcu(sdev, &svm->devs, list)
e0349921	171	intel_flush_svm_range_dev(svm, sdev, address, pages, ih, gl);
2f26e0a9 DW	172	rcu_read_unlock();
	173	}
	174
2f26e0a9 DW	175	/* Pages have been freed at this point */
	176	static void intel_invalidate_range(struct mmu_notifier *mn,
	177	struct mm_struct *mm,
	178	unsigned long start, unsigned long end)
	179	{
	180	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	181
	182	intel_flush_svm_range(svm, start,
e0349921	183	(end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0, 0);
2f26e0a9 DW	184	}
2f26e0a9 DW	185
2f26e0a9 DW	186	static void intel_mm_release(struct mmu_notifier mn, struct mm_struct mm)
	187	{
	188	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
e57e58bd	189	struct intel_svm_dev *sdev;
2f26e0a9	190
e57e58bd DW	191	/* This might end up being called from exit_mmap(), before the page
	192	* tables are cleared. And __mmu_notifier_release() will delete us from
	193	* the list of notifiers so that our invalidate_range() callback doesn't
	194	* get called when the page tables are cleared. So we need to protect
	195	* against hardware accessing those page tables.
	196	*
	197	* We do it by clearing the entry in the PASID table and then flushing
	198	* the IOTLB and the PASID table caches. This might upset hardware;
	199	* perhaps we'll want to point the PASID to a dummy PGD (like the zero
	200	* page) so that we end up taking a fault that the hardware really
	201	* has to handle gracefully without affecting other processes.
	202	*/
e57e58bd DW	203	rcu_read_lock();
e57e58bd DW	204	list_for_each_entry_rcu(sdev, &svm->devs, list) {
1c4f88b7	205	intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid);
e57e58bd DW	206	intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
	207	}
	208	rcu_read_unlock();
2f26e0a9	209
2f26e0a9 DW	210	}
	211
	212	static const struct mmu_notifier_ops intel_mmuops = {
	213	.release = intel_mm_release,
2f26e0a9 DW	214	.invalidate_range = intel_invalidate_range,
	215	};
	216
	217	static DEFINE_MUTEX(pasid_mutex);
51261aac	218	static LIST_HEAD(global_svm_list);
2f26e0a9	219
0204a496	220	int intel_svm_bind_mm(struct device dev, int pasid, int flags, struct svm_dev_ops *ops)
2f26e0a9 DW	221	{
2f26e0a9 DW	222	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
d7cbc0f3	223	struct device_domain_info *info;
2f26e0a9 DW	224	struct intel_svm_dev *sdev;
2f26e0a9 DW	225	struct intel_svm *svm = NULL;
5cec7537	226	struct mm_struct *mm = NULL;
2f26e0a9 DW	227	int pasid_max;
	228	int ret;
	229
c56cba5d	230	if (!iommu \|\| dmar_disabled)
2f26e0a9 DW	231	return -EINVAL;
	232
	233	if (dev_is_pci(dev)) {
	234	pasid_max = pci_max_pasids(to_pci_dev(dev));
	235	if (pasid_max < 0)
	236	return -EINVAL;
	237	} else
	238	pasid_max = 1 << 20;
	239
bb37f7db	240	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
5cec7537 DW	241	if (!ecap_srs(iommu->ecap))
	242	return -EINVAL;
	243	} else if (pasid) {
	244	mm = get_task_mm(current);
	245	BUG_ON(!mm);
	246	}
	247
2f26e0a9	248	mutex_lock(&pasid_mutex);
569e4f77	249	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
51261aac	250	struct intel_svm *t;
2f26e0a9	251
51261aac LB	252	list_for_each_entry(t, &global_svm_list, list) {
51261aac LB	253	if (t->mm != mm \|\| (t->flags & SVM_FLAG_PRIVATE_PASID))
2f26e0a9 DW	254	continue;
2f26e0a9 DW	255
51261aac	256	svm = t;
2f26e0a9 DW	257	if (svm->pasid >= pasid_max) {
	258	dev_warn(dev,
	259	"Limited PASID width. Cannot use existing PASID %d\n",
	260	svm->pasid);
	261	ret = -ENOSPC;
	262	goto out;
	263	}
	264
	265	list_for_each_entry(sdev, &svm->devs, list) {
	266	if (dev == sdev->dev) {
0204a496 DW	267	if (sdev->ops != ops) {
	268	ret = -EBUSY;
	269	goto out;
	270	}
2f26e0a9 DW	271	sdev->users++;
	272	goto success;
	273	}
	274	}
	275
	276	break;
	277	}
	278	}
	279
	280	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	281	if (!sdev) {
	282	ret = -ENOMEM;
	283	goto out;
	284	}
	285	sdev->dev = dev;
	286
d7cbc0f3	287	ret = intel_iommu_enable_pasid(iommu, dev);
2f26e0a9 DW	288	if (ret \|\| !pasid) {
	289	/* If they don't actually want to assign a PASID, this is
	290	* just an enabling check/preparation. */
	291	kfree(sdev);
	292	goto out;
	293	}
d7cbc0f3 LB	294
	295	info = dev->archdata.iommu;
	296	if (!info \|\| !info->pasid_supported) {
	297	kfree(sdev);
	298	goto out;
	299	}
	300
	301	sdev->did = FLPT_DEFAULT_DID;
	302	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	303	if (info->ats_enabled) {
	304	sdev->dev_iotlb = 1;
	305	sdev->qdep = info->ats_qdep;
	306	if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
	307	sdev->qdep = 0;
	308	}
	309
2f26e0a9 DW	310	/* Finish the setup now we know we're keeping it */
2f26e0a9 DW	311	sdev->users = 1;
0204a496	312	sdev->ops = ops;
2f26e0a9 DW	313	init_rcu_head(&sdev->rcu);
	314
	315	if (!svm) {
	316	svm = kzalloc(sizeof(*svm), GFP_KERNEL);
	317	if (!svm) {
	318	ret = -ENOMEM;
	319	kfree(sdev);
	320	goto out;
	321	}
	322	svm->iommu = iommu;
	323
4774cc52 LB	324	if (pasid_max > intel_pasid_max_id)
4774cc52 LB	325	pasid_max = intel_pasid_max_id;
2f26e0a9	326
5a10ba27	327	/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
af395073 LB	328	ret = intel_pasid_alloc_id(svm,
	329	!!cap_caching_mode(iommu->cap),
	330	pasid_max - 1, GFP_KERNEL);
2f26e0a9 DW	331	if (ret < 0) {
2f26e0a9 DW	332	kfree(svm);
bbe4b3af	333	kfree(sdev);
2f26e0a9 DW	334	goto out;
	335	}
	336	svm->pasid = ret;
	337	svm->notifier.ops = &intel_mmuops;
5cec7537	338	svm->mm = mm;
569e4f77	339	svm->flags = flags;
2f26e0a9	340	INIT_LIST_HEAD_RCU(&svm->devs);
51261aac	341	INIT_LIST_HEAD(&svm->list);
2f26e0a9	342	ret = -ENOMEM;
5cec7537 DW	343	if (mm) {
	344	ret = mmu_notifier_register(&svm->notifier, mm);
	345	if (ret) {
af395073	346	intel_pasid_free_id(svm->pasid);
5cec7537 DW	347	kfree(svm);
	348	kfree(sdev);
	349	goto out;
	350	}
1c4f88b7	351	}
97140101	352
1c4f88b7 LB	353	spin_lock(&iommu->lock);
	354	ret = intel_pasid_setup_first_level(iommu, dev,
	355	mm ? mm->pgd : init_mm.pgd,
	356	svm->pasid, FLPT_DEFAULT_DID,
	357	mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
	358	spin_unlock(&iommu->lock);
	359	if (ret) {
	360	if (mm)
	361	mmu_notifier_unregister(&svm->notifier, mm);
	362	intel_pasid_free_id(svm->pasid);
	363	kfree(svm);
	364	kfree(sdev);
	365	goto out;
	366	}
51261aac LB	367
51261aac LB	368	list_add_tail(&svm->list, &global_svm_list);
d7af4d98 JP	369	} else {
	370	/*
	371	* Binding a new device with existing PASID, need to setup
	372	* the PASID entry.
	373	*/
	374	spin_lock(&iommu->lock);
	375	ret = intel_pasid_setup_first_level(iommu, dev,
	376	mm ? mm->pgd : init_mm.pgd,
	377	svm->pasid, FLPT_DEFAULT_DID,
	378	mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
	379	spin_unlock(&iommu->lock);
	380	if (ret) {
	381	kfree(sdev);
	382	goto out;
	383	}
2f26e0a9 DW	384	}
	385	list_add_rcu(&sdev->list, &svm->devs);
	386
	387	success:
	388	*pasid = svm->pasid;
	389	ret = 0;
	390	out:
	391	mutex_unlock(&pasid_mutex);
5cec7537 DW	392	if (mm)
5cec7537 DW	393	mmput(mm);
2f26e0a9 DW	394	return ret;
	395	}
	396	EXPORT_SYMBOL_GPL(intel_svm_bind_mm);
	397
	398	int intel_svm_unbind_mm(struct device *dev, int pasid)
	399	{
	400	struct intel_svm_dev *sdev;
	401	struct intel_iommu *iommu;
	402	struct intel_svm *svm;
	403	int ret = -EINVAL;
	404
	405	mutex_lock(&pasid_mutex);
	406	iommu = intel_svm_device_to_iommu(dev);
4774cc52	407	if (!iommu)
2f26e0a9 DW	408	goto out;
2f26e0a9 DW	409
af395073	410	svm = intel_pasid_lookup_id(pasid);
2f26e0a9 DW	411	if (!svm)
	412	goto out;
	413
	414	list_for_each_entry(sdev, &svm->devs, list) {
	415	if (dev == sdev->dev) {
	416	ret = 0;
	417	sdev->users--;
	418	if (!sdev->users) {
	419	list_del_rcu(&sdev->list);
	420	/* Flush the PASID cache and IOTLB for this device.
	421	* Note that we do depend on the hardware not using
	422	* the PASID any more. Just as we depend on other
	423	* devices never using PASIDs that they have no right
	424	* to use. We have a shared PASID table, because it's
	425	* large and has to be physically contiguous. So it's
	426	* hard to be as defensive as we might like. */
1c4f88b7	427	intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
e0349921	428	intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
2f26e0a9 DW	429	kfree_rcu(sdev, rcu);
	430
	431	if (list_empty(&svm->devs)) {
af395073	432	intel_pasid_free_id(svm->pasid);
5cec7537	433	if (svm->mm)
e57e58bd DW	434	mmu_notifier_unregister(&svm->notifier, svm->mm);
e57e58bd DW	435
51261aac LB	436	list_del(&svm->list);
51261aac LB	437
2f26e0a9 DW	438	/* We mandate that no page faults may be outstanding
	439	* for the PASID when intel_svm_unbind_mm() is called.
	440	* If that is not obeyed, subtle errors will happen.
	441	* Let's make them less subtle... */
	442	memset(svm, 0x6b, sizeof(*svm));
	443	kfree(svm);
	444	}
	445	}
	446	break;
	447	}
	448	}
	449	out:
	450	mutex_unlock(&pasid_mutex);
	451
	452	return ret;
	453	}
	454	EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);
a222a7f0	455
15060aba CT	456	int intel_svm_is_pasid_valid(struct device *dev, int pasid)
	457	{
	458	struct intel_iommu *iommu;
	459	struct intel_svm *svm;
	460	int ret = -EINVAL;
	461
	462	mutex_lock(&pasid_mutex);
	463	iommu = intel_svm_device_to_iommu(dev);
4774cc52	464	if (!iommu)
15060aba CT	465	goto out;
15060aba CT	466
af395073	467	svm = intel_pasid_lookup_id(pasid);
15060aba CT	468	if (!svm)
	469	goto out;
	470
	471	/* init_mm is used in this case */
	472	if (!svm->mm)
	473	ret = 1;
	474	else if (atomic_read(&svm->mm->mm_users) > 0)
	475	ret = 1;
	476	else
	477	ret = 0;
	478
	479	out:
	480	mutex_unlock(&pasid_mutex);
	481
	482	return ret;
	483	}
	484	EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);
	485
a222a7f0 DW	486	/* Page request queue descriptor */
a222a7f0 DW	487	struct page_req_dsc {
5b438f4b JP	488	union {
	489	struct {
	490	u64 type:8;
	491	u64 pasid_present:1;
	492	u64 priv_data_present:1;
	493	u64 rsvd:6;
	494	u64 rid:16;
	495	u64 pasid:20;
	496	u64 exe_req:1;
	497	u64 pm_req:1;
	498	u64 rsvd2:10;
	499	};
	500	u64 qw_0;
	501	};
	502	union {
	503	struct {
	504	u64 rd_req:1;
	505	u64 wr_req:1;
	506	u64 lpig:1;
	507	u64 prg_index:9;
	508	u64 addr:52;
	509	};
	510	u64 qw_1;
	511	};
	512	u64 priv_data[2];
a222a7f0 DW	513	};
	514
	515	#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
7f8312a3 JR	516
	517	static bool access_error(struct vm_area_struct vma, struct page_req_dsc req)
	518	{
	519	unsigned long requested = 0;
	520
	521	if (req->exe_req)
	522	requested \|= VM_EXEC;
	523
	524	if (req->rd_req)
	525	requested \|= VM_READ;
	526
	527	if (req->wr_req)
	528	requested \|= VM_WRITE;
	529
	530	return (requested & ~vma->vm_flags) != 0;
	531	}
	532
9d8c3af3 AR	533	static bool is_canonical_address(u64 addr)
	534	{
	535	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	536	long saddr = (long) addr;
	537
	538	return (((saddr << shift) >> shift) == saddr);
	539	}
	540
a222a7f0 DW	541	static irqreturn_t prq_event_thread(int irq, void *d)
	542	{
	543	struct intel_iommu *iommu = d;
	544	struct intel_svm *svm = NULL;
	545	int head, tail, handled = 0;
	546
46924008 DW	547	/* Clear PPR bit before reading head/tail registers, to
	548	* ensure that we get a new interrupt if needed. */
	549	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
	550
a222a7f0 DW	551	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	552	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	553	while (head != tail) {
0204a496	554	struct intel_svm_dev *sdev;
a222a7f0 DW	555	struct vm_area_struct *vma;
	556	struct page_req_dsc *req;
	557	struct qi_desc resp;
50a7ca3c SJ	558	int result;
50a7ca3c SJ	559	vm_fault_t ret;
a222a7f0 DW	560	u64 address;
	561
	562	handled = 1;
	563
	564	req = &iommu->prq[head / sizeof(*req)];
	565
	566	result = QI_RESP_FAILURE;
7f92a2e9	567	address = (u64)req->addr << VTD_PAGE_SHIFT;
a222a7f0 DW	568	if (!req->pasid_present) {
	569	pr_err("%s: Page request without PASID: %08llx %08llx\n",
	570	iommu->name, ((unsigned long long *)req)[0],
	571	((unsigned long long *)req)[1]);
19ed3e2d	572	goto no_pasid;
a222a7f0 DW	573	}
	574
	575	if (!svm \|\| svm->pasid != req->pasid) {
	576	rcu_read_lock();
af395073	577	svm = intel_pasid_lookup_id(req->pasid);
a222a7f0 DW	578	/* It can't go away, because the driver is not permitted
	579	* to unbind the mm while any page faults are outstanding.
	580	* So we only need RCU to protect the internal idr code. */
	581	rcu_read_unlock();
	582
	583	if (!svm) {
	584	pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
	585	iommu->name, req->pasid, ((unsigned long long *)req)[0],
	586	((unsigned long long *)req)[1]);
26322ab5	587	goto no_pasid;
a222a7f0 DW	588	}
	589	}
	590
	591	result = QI_RESP_INVALID;
5cec7537 DW	592	/* Since we're using init_mm.pgd directly, we should never take
	593	* any faults on kernel addresses. */
	594	if (!svm->mm)
	595	goto bad_req;
e57e58bd	596	/* If the mm is already defunct, don't handle faults. */
388f7934	597	if (!mmget_not_zero(svm->mm))
e57e58bd	598	goto bad_req;
9d8c3af3 AR	599
	600	/* If address is not canonical, return invalid response */
	601	if (!is_canonical_address(address))
	602	goto bad_req;
	603
a222a7f0 DW	604	down_read(&svm->mm->mmap_sem);
	605	vma = find_extend_vma(svm->mm, address);
	606	if (!vma \|\| address < vma->vm_start)
	607	goto invalid;
	608
7f8312a3 JR	609	if (access_error(vma, req))
	610	goto invalid;
	611
dcddffd4	612	ret = handle_mm_fault(vma, address,
a222a7f0 DW	613	req->wr_req ? FAULT_FLAG_WRITE : 0);
	614	if (ret & VM_FAULT_ERROR)
	615	goto invalid;
	616
	617	result = QI_RESP_SUCCESS;
	618	invalid:
	619	up_read(&svm->mm->mmap_sem);
e57e58bd	620	mmput(svm->mm);
a222a7f0 DW	621	bad_req:
a222a7f0 DW	622	/* Accounting for major/minor faults? */
0204a496 DW	623	rcu_read_lock();
0204a496 DW	624	list_for_each_entry_rcu(sdev, &svm->devs, list) {
5b438f4b	625	if (sdev->sid == req->rid)
0204a496 DW	626	break;
	627	}
	628	/* Other devices can go away, but the drivers are not permitted
	629	* to unbind while any page faults might be in flight. So it's
	630	* OK to drop the 'lock' here now we have it. */
	631	rcu_read_unlock();
	632
	633	if (WARN_ON(&sdev->list == &svm->devs))
	634	sdev = NULL;
	635
	636	if (sdev && sdev->ops && sdev->ops->fault_cb) {
	637	int rwxp = (req->rd_req << 3) \| (req->wr_req << 2) \|
5b438f4b JP	638	(req->exe_req << 1) \| (req->pm_req);
	639	sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
	640	req->priv_data, rwxp, result);
0204a496	641	}
26322ab5 DW	642	/* We get here in the error case where the PASID lookup failed,
	643	and these can be NULL. Do not use them below this point! */
	644	sdev = NULL;
	645	svm = NULL;
	646	no_pasid:
5b438f4b JP	647	if (req->lpig \|\| req->priv_data_present) {
	648	/*
	649	* Per VT-d spec. v3.0 ch7.7, system software must
	650	* respond with page group response if private data
	651	* is present (PDP) or last page in group (LPIG) bit
	652	* is set. This is an additional VT-d feature beyond
	653	* PCI ATS spec.
	654	*/
5d308fc1	655	resp.qw0 = QI_PGRP_PASID(req->pasid) \|
5b438f4b	656	QI_PGRP_DID(req->rid) \|
a222a7f0	657	QI_PGRP_PASID_P(req->pasid_present) \|
5b438f4b JP	658	QI_PGRP_PDP(req->pasid_present) \|
5b438f4b JP	659	QI_PGRP_RESP_CODE(result) \|
a222a7f0	660	QI_PGRP_RESP_TYPE;
5d308fc1	661	resp.qw1 = QI_PGRP_IDX(req->prg_index) \|
5b438f4b JP	662	QI_PGRP_LPIG(req->lpig);
	663
	664	if (req->priv_data_present)
	665	memcpy(&resp.qw2, req->priv_data,
	666	sizeof(req->priv_data));
a222a7f0	667	}
5d308fc1 LB	668	resp.qw2 = 0;
	669	resp.qw3 = 0;
	670	qi_submit_sync(&resp, iommu);
a222a7f0 DW	671
	672	head = (head + sizeof(*req)) & PRQ_RING_MASK;
	673	}
	674
	675	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
	676
	677	return IRQ_RETVAL(handled);
	678	}